/* sp.c
 *
 * Copyright (C) 2006-2018 wolfSSL Inc.
 *
 * This file is part of wolfSSL.
 *
 * wolfSSL is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * wolfSSL is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA
 */

/* Implementation by Sean Parkinson. */

#ifdef HAVE_CONFIG_H
    #include <config.h>
#endif

#include <wolfssl/wolfcrypt/settings.h>
#include <wolfssl/wolfcrypt/error-crypt.h>
#include <wolfssl/wolfcrypt/cpuid.h>
#ifdef NO_INLINE
    #include <wolfssl/wolfcrypt/misc.h>
#else
    #define WOLFSSL_MISC_INCLUDED
    #include <wolfcrypt/src/misc.c>
#endif

#if defined(WOLFSSL_HAVE_SP_RSA) || defined(WOLFSSL_HAVE_SP_DH) || \
                                    defined(WOLFSSL_HAVE_SP_ECC)

/* When RSA_LOW_MEM is requested, define SP_RSA_PRIVATE_EXP_D and make sure
 * WOLFSSL_SP_SMALL is defined (it is left alone if already set). Both macros
 * are set before sp.h is included below.
 * NOTE(review): presumably these select the lower-memory RSA private key path
 * and the smaller code variants - confirm against sp.h.
 */
#ifdef RSA_LOW_MEM
#define SP_RSA_PRIVATE_EXP_D

#ifndef WOLFSSL_SP_SMALL
#define WOLFSSL_SP_SMALL
#endif
#endif

#include <wolfssl/wolfcrypt/sp.h>

#ifdef WOLFSSL_SP_X86_64_ASM
#if defined(WOLFSSL_HAVE_SP_RSA) || defined(WOLFSSL_HAVE_SP_DH)
#ifndef WOLFSSL_SP_NO_2048
/* Read a big endian unsigned byte array into r.
 *
 * The last byte of a is the least significant one. Bytes are packed into
 * 64-bit digits of r starting at r[0]. Reading stops once max digits have
 * been filled; every digit of r above the data read is set to zero.
 *
 * r    A single precision integer.
 * max  Number of digits of r to fill.
 * a    Byte array.
 * n    Number of bytes in array to read.
 */
static void sp_2048_from_bin(sp_digit* r, int max, const byte* a, int n)
{
    int idx;
    int word = 0;
    int shift = 0;

    r[0] = 0;
    for (idx = n - 1; idx >= 0; idx--) {
        /* Place the byte at the current bit offset of the current digit. */
        r[word] |= ((sp_digit)a[idx]) << shift;

        if (shift < 56) {
            /* Still room in this digit for the next byte. */
            shift += 8;
            continue;
        }

        /* The digit is complete: keep it to 64 bits and move on. */
        r[word] &= 0xffffffffffffffffl;
        shift = 64 - shift;
        if (word + 1 >= max)
            break;
        /* Any bits of this byte that did not fit start the next digit. */
        word++;
        r[word] = a[idx] >> shift;
        shift = 8 - shift;
    }

    /* Clear the digits above the last one touched. */
    word++;
    while (word < max) {
        r[word] = 0;
        word++;
    }
}

/* Convert an mp_int to an array of sp_digit.
 *
 * At most max digits of r are written. Digits of a that do not fit in r are
 * dropped, and digits of r above the converted value are set to zero.
 *
 * r    A single precision integer.
 * max  Number of digits available in r.
 * a    A multi-precision integer.
 */
static void sp_2048_from_mp(sp_digit* r, int max, mp_int* a)
{
#if DIGIT_BIT == 64
    /* Digits have the same width: copy directly, but never more than r can
     * hold (the other branches stop at max as well). */
    int j;
    int used = (a->used < max) ? a->used : max;

    XMEMCPY(r, a->dp, sizeof(sp_digit) * used);

    for (j = used; j < max; j++)
        r[j] = 0;
#elif DIGIT_BIT > 64
    /* Each mp digit is wider than 64 bits and is split over several output
     * digits. s is the bit offset into the current output digit / mp digit. */
    int i, j = 0, s = 0;

    r[0] = 0;
    for (i = 0; i < a->used && j < max; i++) {
        r[j] |= a->dp[i] << s;
        r[j] &= 0xffffffffffffffffl;
        s = 64 - s;
        if (j + 1 >= max)
            break;
        r[++j] = a->dp[i] >> s;
        /* Emit any further whole 64-bit pieces of this mp digit. */
        while (s + 64 <= DIGIT_BIT) {
            s += 64;
            r[j] &= 0xffffffffffffffffl;
            if (j + 1 >= max)
                break;
            if (s < DIGIT_BIT)
                r[++j] = a->dp[i] >> s;
            else
                r[++j] = 0;
        }
        s = DIGIT_BIT - s;
    }

    for (j++; j < max; j++)
        r[j] = 0;
#else
    /* mp digits are narrower than 64 bits and are packed together. s is the
     * bit offset into the current output digit. */
    int i, j = 0, s = 0;

    r[0] = 0;
    for (i = 0; i < a->used && j < max; i++) {
        r[j] |= ((sp_digit)a->dp[i]) << s;
        if (s + DIGIT_BIT >= 64) {
            /* Output digit is full. */
            r[j] &= 0xffffffffffffffffl;
            if (j + 1 >= max)
                break;
            s = 64 - s;
            if (s == DIGIT_BIT) {
                /* mp digit ended exactly on the boundary: start empty. */
                r[++j] = 0;
                s = 0;
            }
            else {
                /* Carry the bits that did not fit into the next digit. */
                r[++j] = a->dp[i] >> s;
                s = DIGIT_BIT - s;
            }
        }
        else
            s += DIGIT_BIT;
    }

    for (j++; j < max; j++)
        r[j] = 0;
#endif
}

/* Write r as big endian to byte array.
 * Fixed length number of bytes written: 256
 *
 * The 32 digits of r are read least significant first; each 64-bit digit
 * supplies 8 bytes, filling a from its last byte towards its first.
 *
 * r  A single precision integer.
 * a  Byte array.
 */
static void sp_2048_to_bin(sp_digit* r, byte* a)
{
    int word;
    int bit;
    int pos = 2048 / 8 - 1;

    for (word = 0; word < 32; word++) {
        /* Low byte of the digit goes to the highest remaining index. */
        for (bit = 0; bit < 64; bit += 8) {
            a[pos] = (byte)(r[word] >> bit);
            pos--;
        }
    }
}

/* Multiply a and b into r. (r = a * b)
 *
 * r  A single precision integer.
 * a  A single precision integer.
 * b  A single precision integer.
 */
SP_NOINLINE static void sp_2048_mul_16(sp_digit* r, const sp_digit* a,
    const sp_digit* b)
{
    sp_digit tmp[16];

    __asm__ __volatile__ (
        "#  A[0] * B[0]\n\t"
        "movq	0(%[b]), %%rax\n\t"
        "mulq	0(%[a])\n\t"
        "xorq	%%r8, %%r8\n\t"
        "movq	%%rax, (%[tmp])\n\t"
        "movq	%%rdx, %%rcx\n\t"
        "#  A[0] * B[1]\n\t"
        "movq	8(%[b]), %%rax\n\t"
        "mulq	0(%[a])\n\t"
        "xorq	%%rbx, %%rbx\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[1] * B[0]\n\t"
        "movq	0(%[b]), %%rax\n\t"
        "mulq	8(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "movq	%%rcx, 8(%[tmp])\n\t"
        "#  A[0] * B[2]\n\t"
        "movq	16(%[b]), %%rax\n\t"
        "mulq	0(%[a])\n\t"
        "xorq	%%rcx, %%rcx\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[1] * B[1]\n\t"
        "movq	8(%[b]), %%rax\n\t"
        "mulq	8(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[2] * B[0]\n\t"
        "movq	0(%[b]), %%rax\n\t"
        "mulq	16(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "movq	%%r8, 16(%[tmp])\n\t"
        "#  A[0] * B[3]\n\t"
        "movq	24(%[b]), %%rax\n\t"
        "mulq	0(%[a])\n\t"
        "xorq	%%r8, %%r8\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[1] * B[2]\n\t"
        "movq	16(%[b]), %%rax\n\t"
        "mulq	8(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[2] * B[1]\n\t"
        "movq	8(%[b]), %%rax\n\t"
        "mulq	16(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[3] * B[0]\n\t"
        "movq	0(%[b]), %%rax\n\t"
        "mulq	24(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "movq	%%rbx, 24(%[tmp])\n\t"
        "#  A[0] * B[4]\n\t"
        "movq	32(%[b]), %%rax\n\t"
        "mulq	0(%[a])\n\t"
        "xorq	%%rbx, %%rbx\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[1] * B[3]\n\t"
        "movq	24(%[b]), %%rax\n\t"
        "mulq	8(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[2] * B[2]\n\t"
        "movq	16(%[b]), %%rax\n\t"
        "mulq	16(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[3] * B[1]\n\t"
        "movq	8(%[b]), %%rax\n\t"
        "mulq	24(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[4] * B[0]\n\t"
        "movq	0(%[b]), %%rax\n\t"
        "mulq	32(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "movq	%%rcx, 32(%[tmp])\n\t"
        "#  A[0] * B[5]\n\t"
        "movq	40(%[b]), %%rax\n\t"
        "mulq	0(%[a])\n\t"
        "xorq	%%rcx, %%rcx\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[1] * B[4]\n\t"
        "movq	32(%[b]), %%rax\n\t"
        "mulq	8(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[2] * B[3]\n\t"
        "movq	24(%[b]), %%rax\n\t"
        "mulq	16(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[3] * B[2]\n\t"
        "movq	16(%[b]), %%rax\n\t"
        "mulq	24(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[4] * B[1]\n\t"
        "movq	8(%[b]), %%rax\n\t"
        "mulq	32(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[5] * B[0]\n\t"
        "movq	0(%[b]), %%rax\n\t"
        "mulq	40(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "movq	%%r8, 40(%[tmp])\n\t"
        "#  A[0] * B[6]\n\t"
        "movq	48(%[b]), %%rax\n\t"
        "mulq	0(%[a])\n\t"
        "xorq	%%r8, %%r8\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[1] * B[5]\n\t"
        "movq	40(%[b]), %%rax\n\t"
        "mulq	8(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[2] * B[4]\n\t"
        "movq	32(%[b]), %%rax\n\t"
        "mulq	16(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[3] * B[3]\n\t"
        "movq	24(%[b]), %%rax\n\t"
        "mulq	24(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[4] * B[2]\n\t"
        "movq	16(%[b]), %%rax\n\t"
        "mulq	32(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[5] * B[1]\n\t"
        "movq	8(%[b]), %%rax\n\t"
        "mulq	40(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[6] * B[0]\n\t"
        "movq	0(%[b]), %%rax\n\t"
        "mulq	48(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "movq	%%rbx, 48(%[tmp])\n\t"
        "#  A[0] * B[7]\n\t"
        "movq	56(%[b]), %%rax\n\t"
        "mulq	0(%[a])\n\t"
        "xorq	%%rbx, %%rbx\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[1] * B[6]\n\t"
        "movq	48(%[b]), %%rax\n\t"
        "mulq	8(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[2] * B[5]\n\t"
        "movq	40(%[b]), %%rax\n\t"
        "mulq	16(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[3] * B[4]\n\t"
        "movq	32(%[b]), %%rax\n\t"
        "mulq	24(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[4] * B[3]\n\t"
        "movq	24(%[b]), %%rax\n\t"
        "mulq	32(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[5] * B[2]\n\t"
        "movq	16(%[b]), %%rax\n\t"
        "mulq	40(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[6] * B[1]\n\t"
        "movq	8(%[b]), %%rax\n\t"
        "mulq	48(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[7] * B[0]\n\t"
        "movq	0(%[b]), %%rax\n\t"
        "mulq	56(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "movq	%%rcx, 56(%[tmp])\n\t"
        "#  A[0] * B[8]\n\t"
        "movq	64(%[b]), %%rax\n\t"
        "mulq	0(%[a])\n\t"
        "xorq	%%rcx, %%rcx\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[1] * B[7]\n\t"
        "movq	56(%[b]), %%rax\n\t"
        "mulq	8(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[2] * B[6]\n\t"
        "movq	48(%[b]), %%rax\n\t"
        "mulq	16(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[3] * B[5]\n\t"
        "movq	40(%[b]), %%rax\n\t"
        "mulq	24(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[4] * B[4]\n\t"
        "movq	32(%[b]), %%rax\n\t"
        "mulq	32(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[5] * B[3]\n\t"
        "movq	24(%[b]), %%rax\n\t"
        "mulq	40(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[6] * B[2]\n\t"
        "movq	16(%[b]), %%rax\n\t"
        "mulq	48(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[7] * B[1]\n\t"
        "movq	8(%[b]), %%rax\n\t"
        "mulq	56(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[8] * B[0]\n\t"
        "movq	0(%[b]), %%rax\n\t"
        "mulq	64(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "movq	%%r8, 64(%[tmp])\n\t"
        "#  A[0] * B[9]\n\t"
        "movq	72(%[b]), %%rax\n\t"
        "mulq	0(%[a])\n\t"
        "xorq	%%r8, %%r8\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[1] * B[8]\n\t"
        "movq	64(%[b]), %%rax\n\t"
        "mulq	8(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[2] * B[7]\n\t"
        "movq	56(%[b]), %%rax\n\t"
        "mulq	16(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[3] * B[6]\n\t"
        "movq	48(%[b]), %%rax\n\t"
        "mulq	24(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[4] * B[5]\n\t"
        "movq	40(%[b]), %%rax\n\t"
        "mulq	32(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[5] * B[4]\n\t"
        "movq	32(%[b]), %%rax\n\t"
        "mulq	40(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[6] * B[3]\n\t"
        "movq	24(%[b]), %%rax\n\t"
        "mulq	48(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[7] * B[2]\n\t"
        "movq	16(%[b]), %%rax\n\t"
        "mulq	56(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[8] * B[1]\n\t"
        "movq	8(%[b]), %%rax\n\t"
        "mulq	64(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[9] * B[0]\n\t"
        "movq	0(%[b]), %%rax\n\t"
        "mulq	72(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "movq	%%rbx, 72(%[tmp])\n\t"
        "#  A[0] * B[10]\n\t"
        "movq	80(%[b]), %%rax\n\t"
        "mulq	0(%[a])\n\t"
        "xorq	%%rbx, %%rbx\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[1] * B[9]\n\t"
        "movq	72(%[b]), %%rax\n\t"
        "mulq	8(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[2] * B[8]\n\t"
        "movq	64(%[b]), %%rax\n\t"
        "mulq	16(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[3] * B[7]\n\t"
        "movq	56(%[b]), %%rax\n\t"
        "mulq	24(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[4] * B[6]\n\t"
        "movq	48(%[b]), %%rax\n\t"
        "mulq	32(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[5] * B[5]\n\t"
        "movq	40(%[b]), %%rax\n\t"
        "mulq	40(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[6] * B[4]\n\t"
        "movq	32(%[b]), %%rax\n\t"
        "mulq	48(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[7] * B[3]\n\t"
        "movq	24(%[b]), %%rax\n\t"
        "mulq	56(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[8] * B[2]\n\t"
        "movq	16(%[b]), %%rax\n\t"
        "mulq	64(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[9] * B[1]\n\t"
        "movq	8(%[b]), %%rax\n\t"
        "mulq	72(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[10] * B[0]\n\t"
        "movq	0(%[b]), %%rax\n\t"
        "mulq	80(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "movq	%%rcx, 80(%[tmp])\n\t"
        "#  A[0] * B[11]\n\t"
        "movq	88(%[b]), %%rax\n\t"
        "mulq	0(%[a])\n\t"
        "xorq	%%rcx, %%rcx\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[1] * B[10]\n\t"
        "movq	80(%[b]), %%rax\n\t"
        "mulq	8(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[2] * B[9]\n\t"
        "movq	72(%[b]), %%rax\n\t"
        "mulq	16(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[3] * B[8]\n\t"
        "movq	64(%[b]), %%rax\n\t"
        "mulq	24(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[4] * B[7]\n\t"
        "movq	56(%[b]), %%rax\n\t"
        "mulq	32(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[5] * B[6]\n\t"
        "movq	48(%[b]), %%rax\n\t"
        "mulq	40(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[6] * B[5]\n\t"
        "movq	40(%[b]), %%rax\n\t"
        "mulq	48(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[7] * B[4]\n\t"
        "movq	32(%[b]), %%rax\n\t"
        "mulq	56(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[8] * B[3]\n\t"
        "movq	24(%[b]), %%rax\n\t"
        "mulq	64(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[9] * B[2]\n\t"
        "movq	16(%[b]), %%rax\n\t"
        "mulq	72(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[10] * B[1]\n\t"
        "movq	8(%[b]), %%rax\n\t"
        "mulq	80(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[11] * B[0]\n\t"
        "movq	0(%[b]), %%rax\n\t"
        "mulq	88(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "movq	%%r8, 88(%[tmp])\n\t"
        "#  A[0] * B[12]\n\t"
        "movq	96(%[b]), %%rax\n\t"
        "mulq	0(%[a])\n\t"
        "xorq	%%r8, %%r8\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[1] * B[11]\n\t"
        "movq	88(%[b]), %%rax\n\t"
        "mulq	8(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[2] * B[10]\n\t"
        "movq	80(%[b]), %%rax\n\t"
        "mulq	16(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[3] * B[9]\n\t"
        "movq	72(%[b]), %%rax\n\t"
        "mulq	24(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[4] * B[8]\n\t"
        "movq	64(%[b]), %%rax\n\t"
        "mulq	32(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[5] * B[7]\n\t"
        "movq	56(%[b]), %%rax\n\t"
        "mulq	40(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[6] * B[6]\n\t"
        "movq	48(%[b]), %%rax\n\t"
        "mulq	48(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[7] * B[5]\n\t"
        "movq	40(%[b]), %%rax\n\t"
        "mulq	56(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[8] * B[4]\n\t"
        "movq	32(%[b]), %%rax\n\t"
        "mulq	64(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[9] * B[3]\n\t"
        "movq	24(%[b]), %%rax\n\t"
        "mulq	72(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[10] * B[2]\n\t"
        "movq	16(%[b]), %%rax\n\t"
        "mulq	80(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[11] * B[1]\n\t"
        "movq	8(%[b]), %%rax\n\t"
        "mulq	88(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[12] * B[0]\n\t"
        "movq	0(%[b]), %%rax\n\t"
        "mulq	96(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "movq	%%rbx, 96(%[tmp])\n\t"
        "#  A[0] * B[13]\n\t"
        "movq	104(%[b]), %%rax\n\t"
        "mulq	0(%[a])\n\t"
        "xorq	%%rbx, %%rbx\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[1] * B[12]\n\t"
        "movq	96(%[b]), %%rax\n\t"
        "mulq	8(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[2] * B[11]\n\t"
        "movq	88(%[b]), %%rax\n\t"
        "mulq	16(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[3] * B[10]\n\t"
        "movq	80(%[b]), %%rax\n\t"
        "mulq	24(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[4] * B[9]\n\t"
        "movq	72(%[b]), %%rax\n\t"
        "mulq	32(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[5] * B[8]\n\t"
        "movq	64(%[b]), %%rax\n\t"
        "mulq	40(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[6] * B[7]\n\t"
        "movq	56(%[b]), %%rax\n\t"
        "mulq	48(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[7] * B[6]\n\t"
        "movq	48(%[b]), %%rax\n\t"
        "mulq	56(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[8] * B[5]\n\t"
        "movq	40(%[b]), %%rax\n\t"
        "mulq	64(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[9] * B[4]\n\t"
        "movq	32(%[b]), %%rax\n\t"
        "mulq	72(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[10] * B[3]\n\t"
        "movq	24(%[b]), %%rax\n\t"
        "mulq	80(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[11] * B[2]\n\t"
        "movq	16(%[b]), %%rax\n\t"
        "mulq	88(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[12] * B[1]\n\t"
        "movq	8(%[b]), %%rax\n\t"
        "mulq	96(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[13] * B[0]\n\t"
        "movq	0(%[b]), %%rax\n\t"
        "mulq	104(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "movq	%%rcx, 104(%[tmp])\n\t"
        "#  A[0] * B[14]\n\t"
        "movq	112(%[b]), %%rax\n\t"
        "mulq	0(%[a])\n\t"
        "xorq	%%rcx, %%rcx\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[1] * B[13]\n\t"
        "movq	104(%[b]), %%rax\n\t"
        "mulq	8(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[2] * B[12]\n\t"
        "movq	96(%[b]), %%rax\n\t"
        "mulq	16(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[3] * B[11]\n\t"
        "movq	88(%[b]), %%rax\n\t"
        "mulq	24(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[4] * B[10]\n\t"
        "movq	80(%[b]), %%rax\n\t"
        "mulq	32(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[5] * B[9]\n\t"
        "movq	72(%[b]), %%rax\n\t"
        "mulq	40(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[6] * B[8]\n\t"
        "movq	64(%[b]), %%rax\n\t"
        "mulq	48(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[7] * B[7]\n\t"
        "movq	56(%[b]), %%rax\n\t"
        "mulq	56(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[8] * B[6]\n\t"
        "movq	48(%[b]), %%rax\n\t"
        "mulq	64(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[9] * B[5]\n\t"
        "movq	40(%[b]), %%rax\n\t"
        "mulq	72(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[10] * B[4]\n\t"
        "movq	32(%[b]), %%rax\n\t"
        "mulq	80(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[11] * B[3]\n\t"
        "movq	24(%[b]), %%rax\n\t"
        "mulq	88(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[12] * B[2]\n\t"
        "movq	16(%[b]), %%rax\n\t"
        "mulq	96(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[13] * B[1]\n\t"
        "movq	8(%[b]), %%rax\n\t"
        "mulq	104(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[14] * B[0]\n\t"
        "movq	0(%[b]), %%rax\n\t"
        "mulq	112(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "movq	%%r8, 112(%[tmp])\n\t"
        "#  A[0] * B[15]\n\t"
        "movq	120(%[b]), %%rax\n\t"
        "mulq	0(%[a])\n\t"
        "xorq	%%r8, %%r8\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[1] * B[14]\n\t"
        "movq	112(%[b]), %%rax\n\t"
        "mulq	8(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[2] * B[13]\n\t"
        "movq	104(%[b]), %%rax\n\t"
        "mulq	16(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[3] * B[12]\n\t"
        "movq	96(%[b]), %%rax\n\t"
        "mulq	24(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[4] * B[11]\n\t"
        "movq	88(%[b]), %%rax\n\t"
        "mulq	32(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[5] * B[10]\n\t"
        "movq	80(%[b]), %%rax\n\t"
        "mulq	40(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[6] * B[9]\n\t"
        "movq	72(%[b]), %%rax\n\t"
        "mulq	48(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[7] * B[8]\n\t"
        "movq	64(%[b]), %%rax\n\t"
        "mulq	56(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[8] * B[7]\n\t"
        "movq	56(%[b]), %%rax\n\t"
        "mulq	64(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[9] * B[6]\n\t"
        "movq	48(%[b]), %%rax\n\t"
        "mulq	72(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[10] * B[5]\n\t"
        "movq	40(%[b]), %%rax\n\t"
        "mulq	80(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[11] * B[4]\n\t"
        "movq	32(%[b]), %%rax\n\t"
        "mulq	88(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[12] * B[3]\n\t"
        "movq	24(%[b]), %%rax\n\t"
        "mulq	96(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[13] * B[2]\n\t"
        "movq	16(%[b]), %%rax\n\t"
        "mulq	104(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[14] * B[1]\n\t"
        "movq	8(%[b]), %%rax\n\t"
        "mulq	112(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[15] * B[0]\n\t"
        "movq	0(%[b]), %%rax\n\t"
        "mulq	120(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "movq	%%rbx, 120(%[tmp])\n\t"
        "#  A[1] * B[15]\n\t"
        "movq	120(%[b]), %%rax\n\t"
        "mulq	8(%[a])\n\t"
        "xorq	%%rbx, %%rbx\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[2] * B[14]\n\t"
        "movq	112(%[b]), %%rax\n\t"
        "mulq	16(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[3] * B[13]\n\t"
        "movq	104(%[b]), %%rax\n\t"
        "mulq	24(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[4] * B[12]\n\t"
        "movq	96(%[b]), %%rax\n\t"
        "mulq	32(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[5] * B[11]\n\t"
        "movq	88(%[b]), %%rax\n\t"
        "mulq	40(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[6] * B[10]\n\t"
        "movq	80(%[b]), %%rax\n\t"
        "mulq	48(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[7] * B[9]\n\t"
        "movq	72(%[b]), %%rax\n\t"
        "mulq	56(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[8] * B[8]\n\t"
        "movq	64(%[b]), %%rax\n\t"
        "mulq	64(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[9] * B[7]\n\t"
        "movq	56(%[b]), %%rax\n\t"
        "mulq	72(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[10] * B[6]\n\t"
        "movq	48(%[b]), %%rax\n\t"
        "mulq	80(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[11] * B[5]\n\t"
        "movq	40(%[b]), %%rax\n\t"
        "mulq	88(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[12] * B[4]\n\t"
        "movq	32(%[b]), %%rax\n\t"
        "mulq	96(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[13] * B[3]\n\t"
        "movq	24(%[b]), %%rax\n\t"
        "mulq	104(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[14] * B[2]\n\t"
        "movq	16(%[b]), %%rax\n\t"
        "mulq	112(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[15] * B[1]\n\t"
        "movq	8(%[b]), %%rax\n\t"
        "mulq	120(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "movq	%%rcx, 128(%[r])\n\t"
        "#  A[2] * B[15]\n\t"
        "movq	120(%[b]), %%rax\n\t"
        "mulq	16(%[a])\n\t"
        "xorq	%%rcx, %%rcx\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[3] * B[14]\n\t"
        "movq	112(%[b]), %%rax\n\t"
        "mulq	24(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[4] * B[13]\n\t"
        "movq	104(%[b]), %%rax\n\t"
        "mulq	32(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[5] * B[12]\n\t"
        "movq	96(%[b]), %%rax\n\t"
        "mulq	40(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[6] * B[11]\n\t"
        "movq	88(%[b]), %%rax\n\t"
        "mulq	48(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[7] * B[10]\n\t"
        "movq	80(%[b]), %%rax\n\t"
        "mulq	56(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[8] * B[9]\n\t"
        "movq	72(%[b]), %%rax\n\t"
        "mulq	64(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[9] * B[8]\n\t"
        "movq	64(%[b]), %%rax\n\t"
        "mulq	72(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[10] * B[7]\n\t"
        "movq	56(%[b]), %%rax\n\t"
        "mulq	80(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[11] * B[6]\n\t"
        "movq	48(%[b]), %%rax\n\t"
        "mulq	88(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[12] * B[5]\n\t"
        "movq	40(%[b]), %%rax\n\t"
        "mulq	96(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[13] * B[4]\n\t"
        "movq	32(%[b]), %%rax\n\t"
        "mulq	104(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[14] * B[3]\n\t"
        "movq	24(%[b]), %%rax\n\t"
        "mulq	112(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[15] * B[2]\n\t"
        "movq	16(%[b]), %%rax\n\t"
        "mulq	120(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "movq	%%r8, 136(%[r])\n\t"
        "#  A[3] * B[15]\n\t"
        "movq	120(%[b]), %%rax\n\t"
        "mulq	24(%[a])\n\t"
        "xorq	%%r8, %%r8\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[4] * B[14]\n\t"
        "movq	112(%[b]), %%rax\n\t"
        "mulq	32(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[5] * B[13]\n\t"
        "movq	104(%[b]), %%rax\n\t"
        "mulq	40(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[6] * B[12]\n\t"
        "movq	96(%[b]), %%rax\n\t"
        "mulq	48(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[7] * B[11]\n\t"
        "movq	88(%[b]), %%rax\n\t"
        "mulq	56(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[8] * B[10]\n\t"
        "movq	80(%[b]), %%rax\n\t"
        "mulq	64(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[9] * B[9]\n\t"
        "movq	72(%[b]), %%rax\n\t"
        "mulq	72(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[10] * B[8]\n\t"
        "movq	64(%[b]), %%rax\n\t"
        "mulq	80(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[11] * B[7]\n\t"
        "movq	56(%[b]), %%rax\n\t"
        "mulq	88(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[12] * B[6]\n\t"
        "movq	48(%[b]), %%rax\n\t"
        "mulq	96(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[13] * B[5]\n\t"
        "movq	40(%[b]), %%rax\n\t"
        "mulq	104(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[14] * B[4]\n\t"
        "movq	32(%[b]), %%rax\n\t"
        "mulq	112(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[15] * B[3]\n\t"
        "movq	24(%[b]), %%rax\n\t"
        "mulq	120(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "movq	%%rbx, 144(%[r])\n\t"
        "#  A[4] * B[15]\n\t"
        "movq	120(%[b]), %%rax\n\t"
        "mulq	32(%[a])\n\t"
        "xorq	%%rbx, %%rbx\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[5] * B[14]\n\t"
        "movq	112(%[b]), %%rax\n\t"
        "mulq	40(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[6] * B[13]\n\t"
        "movq	104(%[b]), %%rax\n\t"
        "mulq	48(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[7] * B[12]\n\t"
        "movq	96(%[b]), %%rax\n\t"
        "mulq	56(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[8] * B[11]\n\t"
        "movq	88(%[b]), %%rax\n\t"
        "mulq	64(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[9] * B[10]\n\t"
        "movq	80(%[b]), %%rax\n\t"
        "mulq	72(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[10] * B[9]\n\t"
        "movq	72(%[b]), %%rax\n\t"
        "mulq	80(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[11] * B[8]\n\t"
        "movq	64(%[b]), %%rax\n\t"
        "mulq	88(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[12] * B[7]\n\t"
        "movq	56(%[b]), %%rax\n\t"
        "mulq	96(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[13] * B[6]\n\t"
        "movq	48(%[b]), %%rax\n\t"
        "mulq	104(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[14] * B[5]\n\t"
        "movq	40(%[b]), %%rax\n\t"
        "mulq	112(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[15] * B[4]\n\t"
        "movq	32(%[b]), %%rax\n\t"
        "mulq	120(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "movq	%%rcx, 152(%[r])\n\t"
        "#  A[5] * B[15]\n\t"
        "movq	120(%[b]), %%rax\n\t"
        "mulq	40(%[a])\n\t"
        "xorq	%%rcx, %%rcx\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[6] * B[14]\n\t"
        "movq	112(%[b]), %%rax\n\t"
        "mulq	48(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[7] * B[13]\n\t"
        "movq	104(%[b]), %%rax\n\t"
        "mulq	56(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[8] * B[12]\n\t"
        "movq	96(%[b]), %%rax\n\t"
        "mulq	64(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[9] * B[11]\n\t"
        "movq	88(%[b]), %%rax\n\t"
        "mulq	72(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[10] * B[10]\n\t"
        "movq	80(%[b]), %%rax\n\t"
        "mulq	80(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[11] * B[9]\n\t"
        "movq	72(%[b]), %%rax\n\t"
        "mulq	88(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[12] * B[8]\n\t"
        "movq	64(%[b]), %%rax\n\t"
        "mulq	96(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[13] * B[7]\n\t"
        "movq	56(%[b]), %%rax\n\t"
        "mulq	104(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[14] * B[6]\n\t"
        "movq	48(%[b]), %%rax\n\t"
        "mulq	112(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[15] * B[5]\n\t"
        "movq	40(%[b]), %%rax\n\t"
        "mulq	120(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "movq	%%r8, 160(%[r])\n\t"
        "#  A[6] * B[15]\n\t"
        "movq	120(%[b]), %%rax\n\t"
        "mulq	48(%[a])\n\t"
        "xorq	%%r8, %%r8\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[7] * B[14]\n\t"
        "movq	112(%[b]), %%rax\n\t"
        "mulq	56(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[8] * B[13]\n\t"
        "movq	104(%[b]), %%rax\n\t"
        "mulq	64(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[9] * B[12]\n\t"
        "movq	96(%[b]), %%rax\n\t"
        "mulq	72(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[10] * B[11]\n\t"
        "movq	88(%[b]), %%rax\n\t"
        "mulq	80(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[11] * B[10]\n\t"
        "movq	80(%[b]), %%rax\n\t"
        "mulq	88(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[12] * B[9]\n\t"
        "movq	72(%[b]), %%rax\n\t"
        "mulq	96(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[13] * B[8]\n\t"
        "movq	64(%[b]), %%rax\n\t"
        "mulq	104(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[14] * B[7]\n\t"
        "movq	56(%[b]), %%rax\n\t"
        "mulq	112(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[15] * B[6]\n\t"
        "movq	48(%[b]), %%rax\n\t"
        "mulq	120(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "movq	%%rbx, 168(%[r])\n\t"
        "#  A[7] * B[15]\n\t"
        "movq	120(%[b]), %%rax\n\t"
        "mulq	56(%[a])\n\t"
        "xorq	%%rbx, %%rbx\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[8] * B[14]\n\t"
        "movq	112(%[b]), %%rax\n\t"
        "mulq	64(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[9] * B[13]\n\t"
        "movq	104(%[b]), %%rax\n\t"
        "mulq	72(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[10] * B[12]\n\t"
        "movq	96(%[b]), %%rax\n\t"
        "mulq	80(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[11] * B[11]\n\t"
        "movq	88(%[b]), %%rax\n\t"
        "mulq	88(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[12] * B[10]\n\t"
        "movq	80(%[b]), %%rax\n\t"
        "mulq	96(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[13] * B[9]\n\t"
        "movq	72(%[b]), %%rax\n\t"
        "mulq	104(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[14] * B[8]\n\t"
        "movq	64(%[b]), %%rax\n\t"
        "mulq	112(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[15] * B[7]\n\t"
        "movq	56(%[b]), %%rax\n\t"
        "mulq	120(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "movq	%%rcx, 176(%[r])\n\t"
        "#  A[8] * B[15]\n\t"
        "movq	120(%[b]), %%rax\n\t"
        "mulq	64(%[a])\n\t"
        "xorq	%%rcx, %%rcx\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[9] * B[14]\n\t"
        "movq	112(%[b]), %%rax\n\t"
        "mulq	72(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[10] * B[13]\n\t"
        "movq	104(%[b]), %%rax\n\t"
        "mulq	80(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[11] * B[12]\n\t"
        "movq	96(%[b]), %%rax\n\t"
        "mulq	88(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[12] * B[11]\n\t"
        "movq	88(%[b]), %%rax\n\t"
        "mulq	96(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[13] * B[10]\n\t"
        "movq	80(%[b]), %%rax\n\t"
        "mulq	104(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[14] * B[9]\n\t"
        "movq	72(%[b]), %%rax\n\t"
        "mulq	112(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[15] * B[8]\n\t"
        "movq	64(%[b]), %%rax\n\t"
        "mulq	120(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "movq	%%r8, 184(%[r])\n\t"
        "#  A[9] * B[15]\n\t"
        "movq	120(%[b]), %%rax\n\t"
        "mulq	72(%[a])\n\t"
        "xorq	%%r8, %%r8\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[10] * B[14]\n\t"
        "movq	112(%[b]), %%rax\n\t"
        "mulq	80(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[11] * B[13]\n\t"
        "movq	104(%[b]), %%rax\n\t"
        "mulq	88(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[12] * B[12]\n\t"
        "movq	96(%[b]), %%rax\n\t"
        "mulq	96(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[13] * B[11]\n\t"
        "movq	88(%[b]), %%rax\n\t"
        "mulq	104(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[14] * B[10]\n\t"
        "movq	80(%[b]), %%rax\n\t"
        "mulq	112(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[15] * B[9]\n\t"
        "movq	72(%[b]), %%rax\n\t"
        "mulq	120(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "movq	%%rbx, 192(%[r])\n\t"
        "#  A[10] * B[15]\n\t"
        "movq	120(%[b]), %%rax\n\t"
        "mulq	80(%[a])\n\t"
        "xorq	%%rbx, %%rbx\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[11] * B[14]\n\t"
        "movq	112(%[b]), %%rax\n\t"
        "mulq	88(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[12] * B[13]\n\t"
        "movq	104(%[b]), %%rax\n\t"
        "mulq	96(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[13] * B[12]\n\t"
        "movq	96(%[b]), %%rax\n\t"
        "mulq	104(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[14] * B[11]\n\t"
        "movq	88(%[b]), %%rax\n\t"
        "mulq	112(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[15] * B[10]\n\t"
        "movq	80(%[b]), %%rax\n\t"
        "mulq	120(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "movq	%%rcx, 200(%[r])\n\t"
        "#  A[11] * B[15]\n\t"
        "movq	120(%[b]), %%rax\n\t"
        "mulq	88(%[a])\n\t"
        "xorq	%%rcx, %%rcx\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[12] * B[14]\n\t"
        "movq	112(%[b]), %%rax\n\t"
        "mulq	96(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[13] * B[13]\n\t"
        "movq	104(%[b]), %%rax\n\t"
        "mulq	104(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[14] * B[12]\n\t"
        "movq	96(%[b]), %%rax\n\t"
        "mulq	112(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[15] * B[11]\n\t"
        "movq	88(%[b]), %%rax\n\t"
        "mulq	120(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "movq	%%r8, 208(%[r])\n\t"
        "#  A[12] * B[15]\n\t"
        "movq	120(%[b]), %%rax\n\t"
        "mulq	96(%[a])\n\t"
        "xorq	%%r8, %%r8\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[13] * B[14]\n\t"
        "movq	112(%[b]), %%rax\n\t"
        "mulq	104(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[14] * B[13]\n\t"
        "movq	104(%[b]), %%rax\n\t"
        "mulq	112(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[15] * B[12]\n\t"
        "movq	96(%[b]), %%rax\n\t"
        "mulq	120(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "movq	%%rbx, 216(%[r])\n\t"
        "#  A[13] * B[15]\n\t"
        "movq	120(%[b]), %%rax\n\t"
        "mulq	104(%[a])\n\t"
        "xorq	%%rbx, %%rbx\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[14] * B[14]\n\t"
        "movq	112(%[b]), %%rax\n\t"
        "mulq	112(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[15] * B[13]\n\t"
        "movq	104(%[b]), %%rax\n\t"
        "mulq	120(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "movq	%%rcx, 224(%[r])\n\t"
        "#  A[14] * B[15]\n\t"
        "movq	120(%[b]), %%rax\n\t"
        "mulq	112(%[a])\n\t"
        "xorq	%%rcx, %%rcx\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[15] * B[14]\n\t"
        "movq	112(%[b]), %%rax\n\t"
        "mulq	120(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "movq	%%r8, 232(%[r])\n\t"
        "#  A[15] * B[15]\n\t"
        "movq	120(%[b]), %%rax\n\t"
        "mulq	120(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "movq	%%rbx, 240(%[r])\n\t"
        "movq	%%rcx, 248(%[r])\n\t"
        :
        : [r] "r" (r), [a] "r" (a), [b] "r" (b), [tmp] "r" (tmp)
        : "memory", "rax", "rdx", "rbx", "rcx", "r8"
    );

    XMEMCPY(r, tmp, sizeof(tmp));
}

/* Square a and put result in r. (r = a * a)
 *
 * r  A single precision integer.
 * a  A single precision integer.
 */
SP_NOINLINE static void sp_2048_sqr_16(sp_digit* r, const sp_digit* a)
{
    sp_digit tmp[16];

    __asm__ __volatile__ (
        "#  A[0] * A[0]\n\t"
        "movq	0(%[a]), %%rax\n\t"
        "mulq	%%rax\n\t"
        "xorq	%%r9, %%r9\n\t"
        "movq	%%rax, (%[tmp])\n\t"
        "movq	%%rdx, %%r8\n\t"
        "#  A[0] * A[1]\n\t"
        "movq	8(%[a]), %%rax\n\t"
        "mulq	0(%[a])\n\t"
        "xorq	%%rcx, %%rcx\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%r9\n\t"
        "adcq	$0, %%rcx\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%r9\n\t"
        "adcq	$0, %%rcx\n\t"
        "movq	%%r8, 8(%[tmp])\n\t"
        "#  A[0] * A[2]\n\t"
        "movq	16(%[a]), %%rax\n\t"
        "mulq	0(%[a])\n\t"
        "xorq	%%r8, %%r8\n\t"
        "addq	%%rax, %%r9\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "addq	%%rax, %%r9\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[1] * A[1]\n\t"
        "movq	8(%[a]), %%rax\n\t"
        "mulq	%%rax\n\t"
        "addq	%%rax, %%r9\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "movq	%%r9, 16(%[tmp])\n\t"
        "#  A[0] * A[3]\n\t"
        "movq	24(%[a]), %%rax\n\t"
        "mulq	0(%[a])\n\t"
        "xorq	%%r9, %%r9\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%r9\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%r9\n\t"
        "#  A[1] * A[2]\n\t"
        "movq	16(%[a]), %%rax\n\t"
        "mulq	8(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%r9\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%r9\n\t"
        "movq	%%rcx, 24(%[tmp])\n\t"
        "#  A[0] * A[4]\n\t"
        "movq	32(%[a]), %%rax\n\t"
        "mulq	0(%[a])\n\t"
        "xorq	%%rcx, %%rcx\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%r9\n\t"
        "adcq	$0, %%rcx\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%r9\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[1] * A[3]\n\t"
        "movq	24(%[a]), %%rax\n\t"
        "mulq	8(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%r9\n\t"
        "adcq	$0, %%rcx\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%r9\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[2] * A[2]\n\t"
        "movq	16(%[a]), %%rax\n\t"
        "mulq	%%rax\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%r9\n\t"
        "adcq	$0, %%rcx\n\t"
        "movq	%%r8, 32(%[tmp])\n\t"
        "#  A[0] * A[5]\n\t"
        "movq	40(%[a]), %%rax\n\t"
        "mulq	0(%[a])\n\t"
        "xorq	%%r8, %%r8\n\t"
        "xorq	%%r12, %%r12\n\t"
        "movq	%%rax, %%r10\n\t"
        "movq	%%rdx, %%r11\n\t"
        "#  A[1] * A[4]\n\t"
        "movq	32(%[a]), %%rax\n\t"
        "mulq	8(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[2] * A[3]\n\t"
        "movq	24(%[a]), %%rax\n\t"
        "mulq	16(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "addq	%%r10, %%r10\n\t"
        "adcq	%%r11, %%r11\n\t"
        "adcq	%%r12, %%r12\n\t"
        "addq	%%r10, %%r9\n\t"
        "adcq	%%r11, %%rcx\n\t"
        "adcq	%%r12, %%r8\n\t"
        "movq	%%r9, 40(%[tmp])\n\t"
        "#  A[0] * A[6]\n\t"
        "movq	48(%[a]), %%rax\n\t"
        "mulq	0(%[a])\n\t"
        "xorq	%%r9, %%r9\n\t"
        "xorq	%%r12, %%r12\n\t"
        "movq	%%rax, %%r10\n\t"
        "movq	%%rdx, %%r11\n\t"
        "#  A[1] * A[5]\n\t"
        "movq	40(%[a]), %%rax\n\t"
        "mulq	8(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[2] * A[4]\n\t"
        "movq	32(%[a]), %%rax\n\t"
        "mulq	16(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[3] * A[3]\n\t"
        "movq	24(%[a]), %%rax\n\t"
        "mulq	%%rax\n\t"
        "addq	%%r10, %%r10\n\t"
        "adcq	%%r11, %%r11\n\t"
        "adcq	%%r12, %%r12\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "addq	%%r10, %%rcx\n\t"
        "adcq	%%r11, %%r8\n\t"
        "adcq	%%r12, %%r9\n\t"
        "movq	%%rcx, 48(%[tmp])\n\t"
        "#  A[0] * A[7]\n\t"
        "movq	56(%[a]), %%rax\n\t"
        "mulq	0(%[a])\n\t"
        "xorq	%%rcx, %%rcx\n\t"
        "xorq	%%r12, %%r12\n\t"
        "movq	%%rax, %%r10\n\t"
        "movq	%%rdx, %%r11\n\t"
        "#  A[1] * A[6]\n\t"
        "movq	48(%[a]), %%rax\n\t"
        "mulq	8(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[2] * A[5]\n\t"
        "movq	40(%[a]), %%rax\n\t"
        "mulq	16(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[3] * A[4]\n\t"
        "movq	32(%[a]), %%rax\n\t"
        "mulq	24(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "addq	%%r10, %%r10\n\t"
        "adcq	%%r11, %%r11\n\t"
        "adcq	%%r12, %%r12\n\t"
        "addq	%%r10, %%r8\n\t"
        "adcq	%%r11, %%r9\n\t"
        "adcq	%%r12, %%rcx\n\t"
        "movq	%%r8, 56(%[tmp])\n\t"
        "#  A[0] * A[8]\n\t"
        "movq	64(%[a]), %%rax\n\t"
        "mulq	0(%[a])\n\t"
        "xorq	%%r8, %%r8\n\t"
        "xorq	%%r12, %%r12\n\t"
        "movq	%%rax, %%r10\n\t"
        "movq	%%rdx, %%r11\n\t"
        "#  A[1] * A[7]\n\t"
        "movq	56(%[a]), %%rax\n\t"
        "mulq	8(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[2] * A[6]\n\t"
        "movq	48(%[a]), %%rax\n\t"
        "mulq	16(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[3] * A[5]\n\t"
        "movq	40(%[a]), %%rax\n\t"
        "mulq	24(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[4] * A[4]\n\t"
        "movq	32(%[a]), %%rax\n\t"
        "mulq	%%rax\n\t"
        "addq	%%r10, %%r10\n\t"
        "adcq	%%r11, %%r11\n\t"
        "adcq	%%r12, %%r12\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "addq	%%r10, %%r9\n\t"
        "adcq	%%r11, %%rcx\n\t"
        "adcq	%%r12, %%r8\n\t"
        "movq	%%r9, 64(%[tmp])\n\t"
        "#  A[0] * A[9]\n\t"
        "movq	72(%[a]), %%rax\n\t"
        "mulq	0(%[a])\n\t"
        "xorq	%%r9, %%r9\n\t"
        "xorq	%%r12, %%r12\n\t"
        "movq	%%rax, %%r10\n\t"
        "movq	%%rdx, %%r11\n\t"
        "#  A[1] * A[8]\n\t"
        "movq	64(%[a]), %%rax\n\t"
        "mulq	8(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[2] * A[7]\n\t"
        "movq	56(%[a]), %%rax\n\t"
        "mulq	16(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[3] * A[6]\n\t"
        "movq	48(%[a]), %%rax\n\t"
        "mulq	24(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[4] * A[5]\n\t"
        "movq	40(%[a]), %%rax\n\t"
        "mulq	32(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "addq	%%r10, %%r10\n\t"
        "adcq	%%r11, %%r11\n\t"
        "adcq	%%r12, %%r12\n\t"
        "addq	%%r10, %%rcx\n\t"
        "adcq	%%r11, %%r8\n\t"
        "adcq	%%r12, %%r9\n\t"
        "movq	%%rcx, 72(%[tmp])\n\t"
        "#  A[0] * A[10]\n\t"
        "movq	80(%[a]), %%rax\n\t"
        "mulq	0(%[a])\n\t"
        "xorq	%%rcx, %%rcx\n\t"
        "xorq	%%r12, %%r12\n\t"
        "movq	%%rax, %%r10\n\t"
        "movq	%%rdx, %%r11\n\t"
        "#  A[1] * A[9]\n\t"
        "movq	72(%[a]), %%rax\n\t"
        "mulq	8(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[2] * A[8]\n\t"
        "movq	64(%[a]), %%rax\n\t"
        "mulq	16(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[3] * A[7]\n\t"
        "movq	56(%[a]), %%rax\n\t"
        "mulq	24(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[4] * A[6]\n\t"
        "movq	48(%[a]), %%rax\n\t"
        "mulq	32(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[5] * A[5]\n\t"
        "movq	40(%[a]), %%rax\n\t"
        "mulq	%%rax\n\t"
        "addq	%%r10, %%r10\n\t"
        "adcq	%%r11, %%r11\n\t"
        "adcq	%%r12, %%r12\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "addq	%%r10, %%r8\n\t"
        "adcq	%%r11, %%r9\n\t"
        "adcq	%%r12, %%rcx\n\t"
        "movq	%%r8, 80(%[tmp])\n\t"
        "#  A[0] * A[11]\n\t"
        "movq	88(%[a]), %%rax\n\t"
        "mulq	0(%[a])\n\t"
        "xorq	%%r8, %%r8\n\t"
        "xorq	%%r12, %%r12\n\t"
        "movq	%%rax, %%r10\n\t"
        "movq	%%rdx, %%r11\n\t"
        "#  A[1] * A[10]\n\t"
        "movq	80(%[a]), %%rax\n\t"
        "mulq	8(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[2] * A[9]\n\t"
        "movq	72(%[a]), %%rax\n\t"
        "mulq	16(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[3] * A[8]\n\t"
        "movq	64(%[a]), %%rax\n\t"
        "mulq	24(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[4] * A[7]\n\t"
        "movq	56(%[a]), %%rax\n\t"
        "mulq	32(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[5] * A[6]\n\t"
        "movq	48(%[a]), %%rax\n\t"
        "mulq	40(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "addq	%%r10, %%r10\n\t"
        "adcq	%%r11, %%r11\n\t"
        "adcq	%%r12, %%r12\n\t"
        "addq	%%r10, %%r9\n\t"
        "adcq	%%r11, %%rcx\n\t"
        "adcq	%%r12, %%r8\n\t"
        "movq	%%r9, 88(%[tmp])\n\t"
        "#  A[0] * A[12]\n\t"
        "movq	96(%[a]), %%rax\n\t"
        "mulq	0(%[a])\n\t"
        "xorq	%%r9, %%r9\n\t"
        "xorq	%%r12, %%r12\n\t"
        "movq	%%rax, %%r10\n\t"
        "movq	%%rdx, %%r11\n\t"
        "#  A[1] * A[11]\n\t"
        "movq	88(%[a]), %%rax\n\t"
        "mulq	8(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[2] * A[10]\n\t"
        "movq	80(%[a]), %%rax\n\t"
        "mulq	16(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[3] * A[9]\n\t"
        "movq	72(%[a]), %%rax\n\t"
        "mulq	24(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[4] * A[8]\n\t"
        "movq	64(%[a]), %%rax\n\t"
        "mulq	32(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[5] * A[7]\n\t"
        "movq	56(%[a]), %%rax\n\t"
        "mulq	40(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[6] * A[6]\n\t"
        "movq	48(%[a]), %%rax\n\t"
        "mulq	%%rax\n\t"
        "addq	%%r10, %%r10\n\t"
        "adcq	%%r11, %%r11\n\t"
        "adcq	%%r12, %%r12\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "addq	%%r10, %%rcx\n\t"
        "adcq	%%r11, %%r8\n\t"
        "adcq	%%r12, %%r9\n\t"
        "movq	%%rcx, 96(%[tmp])\n\t"
        "#  A[0] * A[13]\n\t"
        "movq	104(%[a]), %%rax\n\t"
        "mulq	0(%[a])\n\t"
        "xorq	%%rcx, %%rcx\n\t"
        "xorq	%%r12, %%r12\n\t"
        "movq	%%rax, %%r10\n\t"
        "movq	%%rdx, %%r11\n\t"
        "#  A[1] * A[12]\n\t"
        "movq	96(%[a]), %%rax\n\t"
        "mulq	8(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[2] * A[11]\n\t"
        "movq	88(%[a]), %%rax\n\t"
        "mulq	16(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[3] * A[10]\n\t"
        "movq	80(%[a]), %%rax\n\t"
        "mulq	24(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[4] * A[9]\n\t"
        "movq	72(%[a]), %%rax\n\t"
        "mulq	32(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[5] * A[8]\n\t"
        "movq	64(%[a]), %%rax\n\t"
        "mulq	40(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[6] * A[7]\n\t"
        "movq	56(%[a]), %%rax\n\t"
        "mulq	48(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "addq	%%r10, %%r10\n\t"
        "adcq	%%r11, %%r11\n\t"
        "adcq	%%r12, %%r12\n\t"
        "addq	%%r10, %%r8\n\t"
        "adcq	%%r11, %%r9\n\t"
        "adcq	%%r12, %%rcx\n\t"
        "movq	%%r8, 104(%[tmp])\n\t"
        "#  A[0] * A[14]\n\t"
        "movq	112(%[a]), %%rax\n\t"
        "mulq	0(%[a])\n\t"
        "xorq	%%r8, %%r8\n\t"
        "xorq	%%r12, %%r12\n\t"
        "movq	%%rax, %%r10\n\t"
        "movq	%%rdx, %%r11\n\t"
        "#  A[1] * A[13]\n\t"
        "movq	104(%[a]), %%rax\n\t"
        "mulq	8(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[2] * A[12]\n\t"
        "movq	96(%[a]), %%rax\n\t"
        "mulq	16(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[3] * A[11]\n\t"
        "movq	88(%[a]), %%rax\n\t"
        "mulq	24(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[4] * A[10]\n\t"
        "movq	80(%[a]), %%rax\n\t"
        "mulq	32(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[5] * A[9]\n\t"
        "movq	72(%[a]), %%rax\n\t"
        "mulq	40(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[6] * A[8]\n\t"
        "movq	64(%[a]), %%rax\n\t"
        "mulq	48(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[7] * A[7]\n\t"
        "movq	56(%[a]), %%rax\n\t"
        "mulq	%%rax\n\t"
        "addq	%%r10, %%r10\n\t"
        "adcq	%%r11, %%r11\n\t"
        "adcq	%%r12, %%r12\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "addq	%%r10, %%r9\n\t"
        "adcq	%%r11, %%rcx\n\t"
        "adcq	%%r12, %%r8\n\t"
        "movq	%%r9, 112(%[tmp])\n\t"
        "#  A[0] * A[15]\n\t"
        "movq	120(%[a]), %%rax\n\t"
        "mulq	0(%[a])\n\t"
        "xorq	%%r9, %%r9\n\t"
        "xorq	%%r12, %%r12\n\t"
        "movq	%%rax, %%r10\n\t"
        "movq	%%rdx, %%r11\n\t"
        "#  A[1] * A[14]\n\t"
        "movq	112(%[a]), %%rax\n\t"
        "mulq	8(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[2] * A[13]\n\t"
        "movq	104(%[a]), %%rax\n\t"
        "mulq	16(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[3] * A[12]\n\t"
        "movq	96(%[a]), %%rax\n\t"
        "mulq	24(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[4] * A[11]\n\t"
        "movq	88(%[a]), %%rax\n\t"
        "mulq	32(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[5] * A[10]\n\t"
        "movq	80(%[a]), %%rax\n\t"
        "mulq	40(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[6] * A[9]\n\t"
        "movq	72(%[a]), %%rax\n\t"
        "mulq	48(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[7] * A[8]\n\t"
        "movq	64(%[a]), %%rax\n\t"
        "mulq	56(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "addq	%%r10, %%r10\n\t"
        "adcq	%%r11, %%r11\n\t"
        "adcq	%%r12, %%r12\n\t"
        "addq	%%r10, %%rcx\n\t"
        "adcq	%%r11, %%r8\n\t"
        "adcq	%%r12, %%r9\n\t"
        "movq	%%rcx, 120(%[tmp])\n\t"
        "#  A[1] * A[15]\n\t"
        "movq	120(%[a]), %%rax\n\t"
        "mulq	8(%[a])\n\t"
        "xorq	%%rcx, %%rcx\n\t"
        "xorq	%%r12, %%r12\n\t"
        "movq	%%rax, %%r10\n\t"
        "movq	%%rdx, %%r11\n\t"
        "#  A[2] * A[14]\n\t"
        "movq	112(%[a]), %%rax\n\t"
        "mulq	16(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[3] * A[13]\n\t"
        "movq	104(%[a]), %%rax\n\t"
        "mulq	24(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[4] * A[12]\n\t"
        "movq	96(%[a]), %%rax\n\t"
        "mulq	32(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[5] * A[11]\n\t"
        "movq	88(%[a]), %%rax\n\t"
        "mulq	40(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[6] * A[10]\n\t"
        "movq	80(%[a]), %%rax\n\t"
        "mulq	48(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[7] * A[9]\n\t"
        "movq	72(%[a]), %%rax\n\t"
        "mulq	56(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[8] * A[8]\n\t"
        "movq	64(%[a]), %%rax\n\t"
        "mulq	%%rax\n\t"
        "addq	%%r10, %%r10\n\t"
        "adcq	%%r11, %%r11\n\t"
        "adcq	%%r12, %%r12\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "addq	%%r10, %%r8\n\t"
        "adcq	%%r11, %%r9\n\t"
        "adcq	%%r12, %%rcx\n\t"
        "movq	%%r8, 128(%[r])\n\t"
        "#  A[2] * A[15]\n\t"
        "movq	120(%[a]), %%rax\n\t"
        "mulq	16(%[a])\n\t"
        "xorq	%%r8, %%r8\n\t"
        "xorq	%%r12, %%r12\n\t"
        "movq	%%rax, %%r10\n\t"
        "movq	%%rdx, %%r11\n\t"
        "#  A[3] * A[14]\n\t"
        "movq	112(%[a]), %%rax\n\t"
        "mulq	24(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[4] * A[13]\n\t"
        "movq	104(%[a]), %%rax\n\t"
        "mulq	32(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[5] * A[12]\n\t"
        "movq	96(%[a]), %%rax\n\t"
        "mulq	40(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[6] * A[11]\n\t"
        "movq	88(%[a]), %%rax\n\t"
        "mulq	48(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[7] * A[10]\n\t"
        "movq	80(%[a]), %%rax\n\t"
        "mulq	56(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[8] * A[9]\n\t"
        "movq	72(%[a]), %%rax\n\t"
        "mulq	64(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "addq	%%r10, %%r10\n\t"
        "adcq	%%r11, %%r11\n\t"
        "adcq	%%r12, %%r12\n\t"
        "addq	%%r10, %%r9\n\t"
        "adcq	%%r11, %%rcx\n\t"
        "adcq	%%r12, %%r8\n\t"
        "movq	%%r9, 136(%[r])\n\t"
        "#  A[3] * A[15]\n\t"
        "movq	120(%[a]), %%rax\n\t"
        "mulq	24(%[a])\n\t"
        "xorq	%%r9, %%r9\n\t"
        "xorq	%%r12, %%r12\n\t"
        "movq	%%rax, %%r10\n\t"
        "movq	%%rdx, %%r11\n\t"
        "#  A[4] * A[14]\n\t"
        "movq	112(%[a]), %%rax\n\t"
        "mulq	32(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[5] * A[13]\n\t"
        "movq	104(%[a]), %%rax\n\t"
        "mulq	40(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[6] * A[12]\n\t"
        "movq	96(%[a]), %%rax\n\t"
        "mulq	48(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[7] * A[11]\n\t"
        "movq	88(%[a]), %%rax\n\t"
        "mulq	56(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[8] * A[10]\n\t"
        "movq	80(%[a]), %%rax\n\t"
        "mulq	64(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[9] * A[9]\n\t"
        "movq	72(%[a]), %%rax\n\t"
        "mulq	%%rax\n\t"
        "addq	%%r10, %%r10\n\t"
        "adcq	%%r11, %%r11\n\t"
        "adcq	%%r12, %%r12\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "addq	%%r10, %%rcx\n\t"
        "adcq	%%r11, %%r8\n\t"
        "adcq	%%r12, %%r9\n\t"
        "movq	%%rcx, 144(%[r])\n\t"
        "#  A[4] * A[15]\n\t"
        "movq	120(%[a]), %%rax\n\t"
        "mulq	32(%[a])\n\t"
        "xorq	%%rcx, %%rcx\n\t"
        "xorq	%%r12, %%r12\n\t"
        "movq	%%rax, %%r10\n\t"
        "movq	%%rdx, %%r11\n\t"
        "#  A[5] * A[14]\n\t"
        "movq	112(%[a]), %%rax\n\t"
        "mulq	40(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[6] * A[13]\n\t"
        "movq	104(%[a]), %%rax\n\t"
        "mulq	48(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[7] * A[12]\n\t"
        "movq	96(%[a]), %%rax\n\t"
        "mulq	56(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[8] * A[11]\n\t"
        "movq	88(%[a]), %%rax\n\t"
        "mulq	64(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[9] * A[10]\n\t"
        "movq	80(%[a]), %%rax\n\t"
        "mulq	72(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "addq	%%r10, %%r10\n\t"
        "adcq	%%r11, %%r11\n\t"
        "adcq	%%r12, %%r12\n\t"
        "addq	%%r10, %%r8\n\t"
        "adcq	%%r11, %%r9\n\t"
        "adcq	%%r12, %%rcx\n\t"
        "movq	%%r8, 152(%[r])\n\t"
        "#  A[5] * A[15]\n\t"
        "movq	120(%[a]), %%rax\n\t"
        "mulq	40(%[a])\n\t"
        "xorq	%%r8, %%r8\n\t"
        "xorq	%%r12, %%r12\n\t"
        "movq	%%rax, %%r10\n\t"
        "movq	%%rdx, %%r11\n\t"
        "#  A[6] * A[14]\n\t"
        "movq	112(%[a]), %%rax\n\t"
        "mulq	48(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[7] * A[13]\n\t"
        "movq	104(%[a]), %%rax\n\t"
        "mulq	56(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[8] * A[12]\n\t"
        "movq	96(%[a]), %%rax\n\t"
        "mulq	64(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[9] * A[11]\n\t"
        "movq	88(%[a]), %%rax\n\t"
        "mulq	72(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[10] * A[10]\n\t"
        "movq	80(%[a]), %%rax\n\t"
        "mulq	%%rax\n\t"
        "addq	%%r10, %%r10\n\t"
        "adcq	%%r11, %%r11\n\t"
        "adcq	%%r12, %%r12\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "addq	%%r10, %%r9\n\t"
        "adcq	%%r11, %%rcx\n\t"
        "adcq	%%r12, %%r8\n\t"
        "movq	%%r9, 160(%[r])\n\t"
        "#  A[6] * A[15]\n\t"
        "movq	120(%[a]), %%rax\n\t"
        "mulq	48(%[a])\n\t"
        "xorq	%%r9, %%r9\n\t"
        "xorq	%%r12, %%r12\n\t"
        "movq	%%rax, %%r10\n\t"
        "movq	%%rdx, %%r11\n\t"
        "#  A[7] * A[14]\n\t"
        "movq	112(%[a]), %%rax\n\t"
        "mulq	56(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[8] * A[13]\n\t"
        "movq	104(%[a]), %%rax\n\t"
        "mulq	64(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[9] * A[12]\n\t"
        "movq	96(%[a]), %%rax\n\t"
        "mulq	72(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[10] * A[11]\n\t"
        "movq	88(%[a]), %%rax\n\t"
        "mulq	80(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "addq	%%r10, %%r10\n\t"
        "adcq	%%r11, %%r11\n\t"
        "adcq	%%r12, %%r12\n\t"
        "addq	%%r10, %%rcx\n\t"
        "adcq	%%r11, %%r8\n\t"
        "adcq	%%r12, %%r9\n\t"
        "movq	%%rcx, 168(%[r])\n\t"
        "#  A[7] * A[15]\n\t"
        "movq	120(%[a]), %%rax\n\t"
        "mulq	56(%[a])\n\t"
        "xorq	%%rcx, %%rcx\n\t"
        "xorq	%%r12, %%r12\n\t"
        "movq	%%rax, %%r10\n\t"
        "movq	%%rdx, %%r11\n\t"
        "#  A[8] * A[14]\n\t"
        "movq	112(%[a]), %%rax\n\t"
        "mulq	64(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[9] * A[13]\n\t"
        "movq	104(%[a]), %%rax\n\t"
        "mulq	72(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[10] * A[12]\n\t"
        "movq	96(%[a]), %%rax\n\t"
        "mulq	80(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[11] * A[11]\n\t"
        "movq	88(%[a]), %%rax\n\t"
        "mulq	%%rax\n\t"
        "addq	%%r10, %%r10\n\t"
        "adcq	%%r11, %%r11\n\t"
        "adcq	%%r12, %%r12\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "addq	%%r10, %%r8\n\t"
        "adcq	%%r11, %%r9\n\t"
        "adcq	%%r12, %%rcx\n\t"
        "movq	%%r8, 176(%[r])\n\t"
        "#  A[8] * A[15]\n\t"
        "movq	120(%[a]), %%rax\n\t"
        "mulq	64(%[a])\n\t"
        "xorq	%%r8, %%r8\n\t"
        "xorq	%%r12, %%r12\n\t"
        "movq	%%rax, %%r10\n\t"
        "movq	%%rdx, %%r11\n\t"
        "#  A[9] * A[14]\n\t"
        "movq	112(%[a]), %%rax\n\t"
        "mulq	72(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[10] * A[13]\n\t"
        "movq	104(%[a]), %%rax\n\t"
        "mulq	80(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[11] * A[12]\n\t"
        "movq	96(%[a]), %%rax\n\t"
        "mulq	88(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "addq	%%r10, %%r10\n\t"
        "adcq	%%r11, %%r11\n\t"
        "adcq	%%r12, %%r12\n\t"
        "addq	%%r10, %%r9\n\t"
        "adcq	%%r11, %%rcx\n\t"
        "adcq	%%r12, %%r8\n\t"
        "movq	%%r9, 184(%[r])\n\t"
        "#  A[9] * A[15]\n\t"
        "movq	120(%[a]), %%rax\n\t"
        "mulq	72(%[a])\n\t"
        "xorq	%%r9, %%r9\n\t"
        "xorq	%%r12, %%r12\n\t"
        "movq	%%rax, %%r10\n\t"
        "movq	%%rdx, %%r11\n\t"
        "#  A[10] * A[14]\n\t"
        "movq	112(%[a]), %%rax\n\t"
        "mulq	80(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[11] * A[13]\n\t"
        "movq	104(%[a]), %%rax\n\t"
        "mulq	88(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[12] * A[12]\n\t"
        "movq	96(%[a]), %%rax\n\t"
        "mulq	%%rax\n\t"
        "addq	%%r10, %%r10\n\t"
        "adcq	%%r11, %%r11\n\t"
        "adcq	%%r12, %%r12\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "addq	%%r10, %%rcx\n\t"
        "adcq	%%r11, %%r8\n\t"
        "adcq	%%r12, %%r9\n\t"
        "movq	%%rcx, 192(%[r])\n\t"
        "#  A[10] * A[15]\n\t"
        "movq	120(%[a]), %%rax\n\t"
        "mulq	80(%[a])\n\t"
        "xorq	%%rcx, %%rcx\n\t"
        "xorq	%%r12, %%r12\n\t"
        "movq	%%rax, %%r10\n\t"
        "movq	%%rdx, %%r11\n\t"
        "#  A[11] * A[14]\n\t"
        "movq	112(%[a]), %%rax\n\t"
        "mulq	88(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[12] * A[13]\n\t"
        "movq	104(%[a]), %%rax\n\t"
        "mulq	96(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "addq	%%r10, %%r10\n\t"
        "adcq	%%r11, %%r11\n\t"
        "adcq	%%r12, %%r12\n\t"
        "addq	%%r10, %%r8\n\t"
        "adcq	%%r11, %%r9\n\t"
        "adcq	%%r12, %%rcx\n\t"
        "movq	%%r8, 200(%[r])\n\t"
        "#  A[11] * A[15]\n\t"
        "movq	120(%[a]), %%rax\n\t"
        "mulq	88(%[a])\n\t"
        "xorq	%%r8, %%r8\n\t"
        "addq	%%rax, %%r9\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "addq	%%rax, %%r9\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[12] * A[14]\n\t"
        "movq	112(%[a]), %%rax\n\t"
        "mulq	96(%[a])\n\t"
        "addq	%%rax, %%r9\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "addq	%%rax, %%r9\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[13] * A[13]\n\t"
        "movq	104(%[a]), %%rax\n\t"
        "mulq	%%rax\n\t"
        "addq	%%rax, %%r9\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "movq	%%r9, 208(%[r])\n\t"
        "#  A[12] * A[15]\n\t"
        "movq	120(%[a]), %%rax\n\t"
        "mulq	96(%[a])\n\t"
        "xorq	%%r9, %%r9\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%r9\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%r9\n\t"
        "#  A[13] * A[14]\n\t"
        "movq	112(%[a]), %%rax\n\t"
        "mulq	104(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%r9\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%r9\n\t"
        "movq	%%rcx, 216(%[r])\n\t"
        "#  A[13] * A[15]\n\t"
        "movq	120(%[a]), %%rax\n\t"
        "mulq	104(%[a])\n\t"
        "xorq	%%rcx, %%rcx\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%r9\n\t"
        "adcq	$0, %%rcx\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%r9\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[14] * A[14]\n\t"
        "movq	112(%[a]), %%rax\n\t"
        "mulq	%%rax\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%r9\n\t"
        "adcq	$0, %%rcx\n\t"
        "movq	%%r8, 224(%[r])\n\t"
        "#  A[14] * A[15]\n\t"
        "movq	120(%[a]), %%rax\n\t"
        "mulq	112(%[a])\n\t"
        "xorq	%%r8, %%r8\n\t"
        "addq	%%rax, %%r9\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "addq	%%rax, %%r9\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "movq	%%r9, 232(%[r])\n\t"
        "#  A[15] * A[15]\n\t"
        "movq	120(%[a]), %%rax\n\t"
        "mulq	%%rax\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "movq	%%rcx, 240(%[r])\n\t"
        "movq	%%r8, 248(%[r])\n\t"
        :
        : [r] "r" (r), [a] "r" (a), [tmp] "r" (tmp)
        : "memory", "rax", "rdx", "rcx", "r8", "r9", "r10", "r11", "r12"
    );

    XMEMCPY(r, tmp, sizeof(tmp));
}

#ifdef HAVE_INTEL_AVX2
/* Multiply a and b into r. (r = a * b)
 *
 * r   Result of multiplication.
 * a   First number to multiply.
 * b   Second number to multiply.
 */
SP_NOINLINE static void sp_2048_mul_avx2_16(sp_digit* r, const sp_digit* a,
        const sp_digit* b)
{
    sp_digit tmp[2*16];

    __asm__ __volatile__ (
        "movq	0(%[a]), %%rdx\n\t"
        "xorq	%%r15, %%r15\n\t"
        "# A[0] * B[0]\n\t"
        "mulx	0(%[b]), %%r10, %%r11\n\t"
        "# A[0] * B[1]\n\t"
        "mulx	8(%[b]), %%r8, %%r12\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "# A[0] * B[2]\n\t"
        "mulx	16(%[b]), %%r8, %%r13\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "# A[0] * B[3]\n\t"
        "mulx	24(%[b]), %%r8, %%r14\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "movq	%%r10, 0(%[t])\n\t"
        "movq	%%r11, 8(%[t])\n\t"
        "movq	%%r12, 16(%[t])\n\t"
        "movq	%%r13, 24(%[t])\n\t"
        "# A[0] * B[4]\n\t"
        "mulx	32(%[b]), %%r8, %%rax\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "# A[0] * B[5]\n\t"
        "mulx	40(%[b]), %%r8, %%r10\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "# A[0] * B[6]\n\t"
        "mulx	48(%[b]), %%r8, %%r11\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "# A[0] * B[7]\n\t"
        "mulx	56(%[b]), %%r8, %%r12\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "movq	%%r14, 32(%[t])\n\t"
        "movq	%%rax, 40(%[t])\n\t"
        "movq	%%r10, 48(%[t])\n\t"
        "movq	%%r11, 56(%[t])\n\t"
        "# A[0] * B[8]\n\t"
        "mulx	64(%[b]), %%r8, %%r13\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "# A[0] * B[9]\n\t"
        "mulx	72(%[b]), %%r8, %%r14\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "# A[0] * B[10]\n\t"
        "mulx	80(%[b]), %%r8, %%rax\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "# A[0] * B[11]\n\t"
        "mulx	88(%[b]), %%r8, %%r10\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "movq	%%r12, 64(%[t])\n\t"
        "movq	%%r13, 72(%[t])\n\t"
        "movq	%%r14, 80(%[t])\n\t"
        "movq	%%rax, 88(%[t])\n\t"
        "# A[0] * B[12]\n\t"
        "mulx	96(%[b]), %%r8, %%r11\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "# A[0] * B[13]\n\t"
        "mulx	104(%[b]), %%r8, %%r12\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "# A[0] * B[14]\n\t"
        "mulx	112(%[b]), %%r8, %%r13\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "# A[0] * B[15]\n\t"
        "mulx	120(%[b]), %%r8, %%r14\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adcxq	%%r15, %%r14\n\t"
        "movq	%%r15, %%rcx\n\t"
        "adcxq	%%r15, %%rcx\n\t"
        "movq	%%r10, 96(%[t])\n\t"
        "movq	%%r11, 104(%[t])\n\t"
        "movq	%%r12, 112(%[t])\n\t"
        "movq	%%r13, 120(%[t])\n\t"
        "movq	%%r14, 128(%[t])\n\t"
        "movq	8(%[a]), %%rdx\n\t"
        "xorq	%%r15, %%r15\n\t"
        "movq	8(%[t]), %%r11\n\t"
        "movq	16(%[t]), %%r12\n\t"
        "movq	24(%[t]), %%r13\n\t"
        "movq	32(%[t]), %%r14\n\t"
        "movq	40(%[t]), %%rax\n\t"
        "# A[1] * B[0]\n\t"
        "mulx	0(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "adoxq	%%r9, %%r12\n\t"
        "# A[1] * B[1]\n\t"
        "mulx	8(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "# A[1] * B[2]\n\t"
        "mulx	16(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "# A[1] * B[3]\n\t"
        "mulx	24(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "movq	%%r11, 8(%[t])\n\t"
        "movq	%%r12, 16(%[t])\n\t"
        "movq	%%r13, 24(%[t])\n\t"
        "movq	%%r14, 32(%[t])\n\t"
        "movq	48(%[t]), %%r10\n\t"
        "movq	56(%[t]), %%r11\n\t"
        "movq	64(%[t]), %%r12\n\t"
        "movq	72(%[t]), %%r13\n\t"
        "# A[1] * B[4]\n\t"
        "mulx	32(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "# A[1] * B[5]\n\t"
        "mulx	40(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "adoxq	%%r9, %%r11\n\t"
        "# A[1] * B[6]\n\t"
        "mulx	48(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "adoxq	%%r9, %%r12\n\t"
        "# A[1] * B[7]\n\t"
        "mulx	56(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "movq	%%rax, 40(%[t])\n\t"
        "movq	%%r10, 48(%[t])\n\t"
        "movq	%%r11, 56(%[t])\n\t"
        "movq	%%r12, 64(%[t])\n\t"
        "movq	80(%[t]), %%r14\n\t"
        "movq	88(%[t]), %%rax\n\t"
        "movq	96(%[t]), %%r10\n\t"
        "movq	104(%[t]), %%r11\n\t"
        "# A[1] * B[8]\n\t"
        "mulx	64(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "# A[1] * B[9]\n\t"
        "mulx	72(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "# A[1] * B[10]\n\t"
        "mulx	80(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "# A[1] * B[11]\n\t"
        "mulx	88(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "adoxq	%%r9, %%r11\n\t"
        "movq	%%r13, 72(%[t])\n\t"
        "movq	%%r14, 80(%[t])\n\t"
        "movq	%%rax, 88(%[t])\n\t"
        "movq	%%r10, 96(%[t])\n\t"
        "movq	112(%[t]), %%r12\n\t"
        "movq	120(%[t]), %%r13\n\t"
        "movq	128(%[t]), %%r14\n\t"
        "# A[1] * B[12]\n\t"
        "mulx	96(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "adoxq	%%r9, %%r12\n\t"
        "# A[1] * B[13]\n\t"
        "mulx	104(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "# A[1] * B[14]\n\t"
        "mulx	112(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "# A[1] * B[15]\n\t"
        "mulx	120(%[b]), %%r8, %%r9\n\t"
        "movq	%%r15, %%rax\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "adcxq	%%rcx, %%rax\n\t"
        "movq	%%r15, %%rcx\n\t"
        "adoxq	%%r15, %%rcx\n\t"
        "adcxq	%%r15, %%rcx\n\t"
        "movq	%%r11, 104(%[t])\n\t"
        "movq	%%r12, 112(%[t])\n\t"
        "movq	%%r13, 120(%[t])\n\t"
        "movq	%%r14, 128(%[t])\n\t"
        "movq	%%rax, 136(%[t])\n\t"
        "movq	16(%[a]), %%rdx\n\t"
        "xorq	%%r15, %%r15\n\t"
        "movq	16(%[t]), %%r12\n\t"
        "movq	24(%[t]), %%r13\n\t"
        "movq	32(%[t]), %%r14\n\t"
        "movq	40(%[t]), %%rax\n\t"
        "movq	48(%[t]), %%r10\n\t"
        "# A[2] * B[0]\n\t"
        "mulx	0(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "# A[2] * B[1]\n\t"
        "mulx	8(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "# A[2] * B[2]\n\t"
        "mulx	16(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "# A[2] * B[3]\n\t"
        "mulx	24(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "movq	%%r12, 16(%[t])\n\t"
        "movq	%%r13, 24(%[t])\n\t"
        "movq	%%r14, 32(%[t])\n\t"
        "movq	%%rax, 40(%[t])\n\t"
        "movq	56(%[t]), %%r11\n\t"
        "movq	64(%[t]), %%r12\n\t"
        "movq	72(%[t]), %%r13\n\t"
        "movq	80(%[t]), %%r14\n\t"
        "# A[2] * B[4]\n\t"
        "mulx	32(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "adoxq	%%r9, %%r11\n\t"
        "# A[2] * B[5]\n\t"
        "mulx	40(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "adoxq	%%r9, %%r12\n\t"
        "# A[2] * B[6]\n\t"
        "mulx	48(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "# A[2] * B[7]\n\t"
        "mulx	56(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "movq	%%r10, 48(%[t])\n\t"
        "movq	%%r11, 56(%[t])\n\t"
        "movq	%%r12, 64(%[t])\n\t"
        "movq	%%r13, 72(%[t])\n\t"
        "movq	88(%[t]), %%rax\n\t"
        "movq	96(%[t]), %%r10\n\t"
        "movq	104(%[t]), %%r11\n\t"
        "movq	112(%[t]), %%r12\n\t"
        "# A[2] * B[8]\n\t"
        "mulx	64(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "# A[2] * B[9]\n\t"
        "mulx	72(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "# A[2] * B[10]\n\t"
        "mulx	80(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "adoxq	%%r9, %%r11\n\t"
        "# A[2] * B[11]\n\t"
        "mulx	88(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "adoxq	%%r9, %%r12\n\t"
        "movq	%%r14, 80(%[t])\n\t"
        "movq	%%rax, 88(%[t])\n\t"
        "movq	%%r10, 96(%[t])\n\t"
        "movq	%%r11, 104(%[t])\n\t"
        "movq	120(%[t]), %%r13\n\t"
        "movq	128(%[t]), %%r14\n\t"
        "movq	136(%[t]), %%rax\n\t"
        "# A[2] * B[12]\n\t"
        "mulx	96(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "# A[2] * B[13]\n\t"
        "mulx	104(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "# A[2] * B[14]\n\t"
        "mulx	112(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "# A[2] * B[15]\n\t"
        "mulx	120(%[b]), %%r8, %%r9\n\t"
        "movq	%%r15, %%r10\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "adcxq	%%rcx, %%r10\n\t"
        "movq	%%r15, %%rcx\n\t"
        "adoxq	%%r15, %%rcx\n\t"
        "adcxq	%%r15, %%rcx\n\t"
        "movq	%%r12, 112(%[t])\n\t"
        "movq	%%r13, 120(%[t])\n\t"
        "movq	%%r14, 128(%[t])\n\t"
        "movq	%%rax, 136(%[t])\n\t"
        "movq	%%r10, 144(%[t])\n\t"
        "movq	24(%[a]), %%rdx\n\t"
        "xorq	%%r15, %%r15\n\t"
        "movq	24(%[t]), %%r13\n\t"
        "movq	32(%[t]), %%r14\n\t"
        "movq	40(%[t]), %%rax\n\t"
        "movq	48(%[t]), %%r10\n\t"
        "movq	56(%[t]), %%r11\n\t"
        "# A[3] * B[0]\n\t"
        "mulx	0(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "# A[3] * B[1]\n\t"
        "mulx	8(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "# A[3] * B[2]\n\t"
        "mulx	16(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "# A[3] * B[3]\n\t"
        "mulx	24(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "adoxq	%%r9, %%r11\n\t"
        "movq	%%r13, 24(%[t])\n\t"
        "movq	%%r14, 32(%[t])\n\t"
        "movq	%%rax, 40(%[t])\n\t"
        "movq	%%r10, 48(%[t])\n\t"
        "movq	64(%[t]), %%r12\n\t"
        "movq	72(%[t]), %%r13\n\t"
        "movq	80(%[t]), %%r14\n\t"
        "movq	88(%[t]), %%rax\n\t"
        "# A[3] * B[4]\n\t"
        "mulx	32(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "adoxq	%%r9, %%r12\n\t"
        "# A[3] * B[5]\n\t"
        "mulx	40(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "# A[3] * B[6]\n\t"
        "mulx	48(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "# A[3] * B[7]\n\t"
        "mulx	56(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "movq	%%r11, 56(%[t])\n\t"
        "movq	%%r12, 64(%[t])\n\t"
        "movq	%%r13, 72(%[t])\n\t"
        "movq	%%r14, 80(%[t])\n\t"
        "movq	96(%[t]), %%r10\n\t"
        "movq	104(%[t]), %%r11\n\t"
        "movq	112(%[t]), %%r12\n\t"
        "movq	120(%[t]), %%r13\n\t"
        "# A[3] * B[8]\n\t"
        "mulx	64(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "# A[3] * B[9]\n\t"
        "mulx	72(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "adoxq	%%r9, %%r11\n\t"
        "# A[3] * B[10]\n\t"
        "mulx	80(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "adoxq	%%r9, %%r12\n\t"
        "# A[3] * B[11]\n\t"
        "mulx	88(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "movq	%%rax, 88(%[t])\n\t"
        "movq	%%r10, 96(%[t])\n\t"
        "movq	%%r11, 104(%[t])\n\t"
        "movq	%%r12, 112(%[t])\n\t"
        "movq	128(%[t]), %%r14\n\t"
        "movq	136(%[t]), %%rax\n\t"
        "movq	144(%[t]), %%r10\n\t"
        "# A[3] * B[12]\n\t"
        "mulx	96(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "# A[3] * B[13]\n\t"
        "mulx	104(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "# A[3] * B[14]\n\t"
        "mulx	112(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "# A[3] * B[15]\n\t"
        "mulx	120(%[b]), %%r8, %%r9\n\t"
        "movq	%%r15, %%r11\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "adoxq	%%r9, %%r11\n\t"
        "adcxq	%%rcx, %%r11\n\t"
        "movq	%%r15, %%rcx\n\t"
        "adoxq	%%r15, %%rcx\n\t"
        "adcxq	%%r15, %%rcx\n\t"
        "movq	%%r13, 120(%[t])\n\t"
        "movq	%%r14, 128(%[t])\n\t"
        "movq	%%rax, 136(%[t])\n\t"
        "movq	%%r10, 144(%[t])\n\t"
        "movq	%%r11, 152(%[t])\n\t"
        "movq	32(%[a]), %%rdx\n\t"
        "xorq	%%r15, %%r15\n\t"
        "movq	32(%[t]), %%r14\n\t"
        "movq	40(%[t]), %%rax\n\t"
        "movq	48(%[t]), %%r10\n\t"
        "movq	56(%[t]), %%r11\n\t"
        "movq	64(%[t]), %%r12\n\t"
        "# A[4] * B[0]\n\t"
        "mulx	0(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "# A[4] * B[1]\n\t"
        "mulx	8(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "# A[4] * B[2]\n\t"
        "mulx	16(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "adoxq	%%r9, %%r11\n\t"
        "# A[4] * B[3]\n\t"
        "mulx	24(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "adoxq	%%r9, %%r12\n\t"
        "movq	%%r14, 32(%[t])\n\t"
        "movq	%%rax, 40(%[t])\n\t"
        "movq	%%r10, 48(%[t])\n\t"
        "movq	%%r11, 56(%[t])\n\t"
        "movq	72(%[t]), %%r13\n\t"
        "movq	80(%[t]), %%r14\n\t"
        "movq	88(%[t]), %%rax\n\t"
        "movq	96(%[t]), %%r10\n\t"
        "# A[4] * B[4]\n\t"
        "mulx	32(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "# A[4] * B[5]\n\t"
        "mulx	40(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "# A[4] * B[6]\n\t"
        "mulx	48(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "# A[4] * B[7]\n\t"
        "mulx	56(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "movq	%%r12, 64(%[t])\n\t"
        "movq	%%r13, 72(%[t])\n\t"
        "movq	%%r14, 80(%[t])\n\t"
        "movq	%%rax, 88(%[t])\n\t"
        "movq	104(%[t]), %%r11\n\t"
        "movq	112(%[t]), %%r12\n\t"
        "movq	120(%[t]), %%r13\n\t"
        "movq	128(%[t]), %%r14\n\t"
        "# A[4] * B[8]\n\t"
        "mulx	64(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "adoxq	%%r9, %%r11\n\t"
        "# A[4] * B[9]\n\t"
        "mulx	72(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "adoxq	%%r9, %%r12\n\t"
        "# A[4] * B[10]\n\t"
        "mulx	80(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "# A[4] * B[11]\n\t"
        "mulx	88(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "movq	%%r10, 96(%[t])\n\t"
        "movq	%%r11, 104(%[t])\n\t"
        "movq	%%r12, 112(%[t])\n\t"
        "movq	%%r13, 120(%[t])\n\t"
        "movq	136(%[t]), %%rax\n\t"
        "movq	144(%[t]), %%r10\n\t"
        "movq	152(%[t]), %%r11\n\t"
        "# A[4] * B[12]\n\t"
        "mulx	96(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "# A[4] * B[13]\n\t"
        "mulx	104(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "# A[4] * B[14]\n\t"
        "mulx	112(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "adoxq	%%r9, %%r11\n\t"
        "# A[4] * B[15]\n\t"
        "mulx	120(%[b]), %%r8, %%r9\n\t"
        "movq	%%r15, %%r12\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "adoxq	%%r9, %%r12\n\t"
        "adcxq	%%rcx, %%r12\n\t"
        "movq	%%r15, %%rcx\n\t"
        "adoxq	%%r15, %%rcx\n\t"
        "adcxq	%%r15, %%rcx\n\t"
        "movq	%%r14, 128(%[t])\n\t"
        "movq	%%rax, 136(%[t])\n\t"
        "movq	%%r10, 144(%[t])\n\t"
        "movq	%%r11, 152(%[t])\n\t"
        "movq	%%r12, 160(%[t])\n\t"
        "movq	40(%[a]), %%rdx\n\t"
        "xorq	%%r15, %%r15\n\t"
        "movq	40(%[t]), %%rax\n\t"
        "movq	48(%[t]), %%r10\n\t"
        "movq	56(%[t]), %%r11\n\t"
        "movq	64(%[t]), %%r12\n\t"
        "movq	72(%[t]), %%r13\n\t"
        "# A[5] * B[0]\n\t"
        "mulx	0(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "# A[5] * B[1]\n\t"
        "mulx	8(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "adoxq	%%r9, %%r11\n\t"
        "# A[5] * B[2]\n\t"
        "mulx	16(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "adoxq	%%r9, %%r12\n\t"
        "# A[5] * B[3]\n\t"
        "mulx	24(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "movq	%%rax, 40(%[t])\n\t"
        "movq	%%r10, 48(%[t])\n\t"
        "movq	%%r11, 56(%[t])\n\t"
        "movq	%%r12, 64(%[t])\n\t"
        "movq	80(%[t]), %%r14\n\t"
        "movq	88(%[t]), %%rax\n\t"
        "movq	96(%[t]), %%r10\n\t"
        "movq	104(%[t]), %%r11\n\t"
        "# A[5] * B[4]\n\t"
        "mulx	32(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "# A[5] * B[5]\n\t"
        "mulx	40(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "# A[5] * B[6]\n\t"
        "mulx	48(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "# A[5] * B[7]\n\t"
        "mulx	56(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "adoxq	%%r9, %%r11\n\t"
        "movq	%%r13, 72(%[t])\n\t"
        "movq	%%r14, 80(%[t])\n\t"
        "movq	%%rax, 88(%[t])\n\t"
        "movq	%%r10, 96(%[t])\n\t"
        "movq	112(%[t]), %%r12\n\t"
        "movq	120(%[t]), %%r13\n\t"
        "movq	128(%[t]), %%r14\n\t"
        "movq	136(%[t]), %%rax\n\t"
        "# A[5] * B[8]\n\t"
        "mulx	64(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "adoxq	%%r9, %%r12\n\t"
        "# A[5] * B[9]\n\t"
        "mulx	72(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "# A[5] * B[10]\n\t"
        "mulx	80(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "# A[5] * B[11]\n\t"
        "mulx	88(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "movq	%%r11, 104(%[t])\n\t"
        "movq	%%r12, 112(%[t])\n\t"
        "movq	%%r13, 120(%[t])\n\t"
        "movq	%%r14, 128(%[t])\n\t"
        "movq	144(%[t]), %%r10\n\t"
        "movq	152(%[t]), %%r11\n\t"
        "movq	160(%[t]), %%r12\n\t"
        "# A[5] * B[12]\n\t"
        "mulx	96(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "# A[5] * B[13]\n\t"
        "mulx	104(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "adoxq	%%r9, %%r11\n\t"
        "# A[5] * B[14]\n\t"
        "mulx	112(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "adoxq	%%r9, %%r12\n\t"
        "# A[5] * B[15]\n\t"
        "mulx	120(%[b]), %%r8, %%r9\n\t"
        "movq	%%r15, %%r13\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "adcxq	%%rcx, %%r13\n\t"
        "movq	%%r15, %%rcx\n\t"
        "adoxq	%%r15, %%rcx\n\t"
        "adcxq	%%r15, %%rcx\n\t"
        "movq	%%rax, 136(%[t])\n\t"
        "movq	%%r10, 144(%[t])\n\t"
        "movq	%%r11, 152(%[t])\n\t"
        "movq	%%r12, 160(%[t])\n\t"
        "movq	%%r13, 168(%[t])\n\t"
        "movq	48(%[a]), %%rdx\n\t"
        "xorq	%%r15, %%r15\n\t"
        "movq	48(%[t]), %%r10\n\t"
        "movq	56(%[t]), %%r11\n\t"
        "movq	64(%[t]), %%r12\n\t"
        "movq	72(%[t]), %%r13\n\t"
        "movq	80(%[t]), %%r14\n\t"
        "# A[6] * B[0]\n\t"
        "mulx	0(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "adoxq	%%r9, %%r11\n\t"
        "# A[6] * B[1]\n\t"
        "mulx	8(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "adoxq	%%r9, %%r12\n\t"
        "# A[6] * B[2]\n\t"
        "mulx	16(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "# A[6] * B[3]\n\t"
        "mulx	24(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "movq	%%r10, 48(%[t])\n\t"
        "movq	%%r11, 56(%[t])\n\t"
        "movq	%%r12, 64(%[t])\n\t"
        "movq	%%r13, 72(%[t])\n\t"
        "movq	88(%[t]), %%rax\n\t"
        "movq	96(%[t]), %%r10\n\t"
        "movq	104(%[t]), %%r11\n\t"
        "movq	112(%[t]), %%r12\n\t"
        "# A[6] * B[4]\n\t"
        "mulx	32(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "# A[6] * B[5]\n\t"
        "mulx	40(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "# A[6] * B[6]\n\t"
        "mulx	48(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "adoxq	%%r9, %%r11\n\t"
        "# A[6] * B[7]\n\t"
        "mulx	56(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "adoxq	%%r9, %%r12\n\t"
        "movq	%%r14, 80(%[t])\n\t"
        "movq	%%rax, 88(%[t])\n\t"
        "movq	%%r10, 96(%[t])\n\t"
        "movq	%%r11, 104(%[t])\n\t"
        "movq	120(%[t]), %%r13\n\t"
        "movq	128(%[t]), %%r14\n\t"
        "movq	136(%[t]), %%rax\n\t"
        "movq	144(%[t]), %%r10\n\t"
        "# A[6] * B[8]\n\t"
        "mulx	64(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "# A[6] * B[9]\n\t"
        "mulx	72(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "# A[6] * B[10]\n\t"
        "mulx	80(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "# A[6] * B[11]\n\t"
        "mulx	88(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "movq	%%r12, 112(%[t])\n\t"
        "movq	%%r13, 120(%[t])\n\t"
        "movq	%%r14, 128(%[t])\n\t"
        "movq	%%rax, 136(%[t])\n\t"
        "movq	152(%[t]), %%r11\n\t"
        "movq	160(%[t]), %%r12\n\t"
        "movq	168(%[t]), %%r13\n\t"
        "# A[6] * B[12]\n\t"
        "mulx	96(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "adoxq	%%r9, %%r11\n\t"
        "# A[6] * B[13]\n\t"
        "mulx	104(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "adoxq	%%r9, %%r12\n\t"
        "# A[6] * B[14]\n\t"
        "mulx	112(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "# A[6] * B[15]\n\t"
        "mulx	120(%[b]), %%r8, %%r9\n\t"
        "movq	%%r15, %%r14\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "adcxq	%%rcx, %%r14\n\t"
        "movq	%%r15, %%rcx\n\t"
        "adoxq	%%r15, %%rcx\n\t"
        "adcxq	%%r15, %%rcx\n\t"
        "movq	%%r10, 144(%[t])\n\t"
        "movq	%%r11, 152(%[t])\n\t"
        "movq	%%r12, 160(%[t])\n\t"
        "movq	%%r13, 168(%[t])\n\t"
        "movq	%%r14, 176(%[t])\n\t"
        "movq	56(%[a]), %%rdx\n\t"
        "xorq	%%r15, %%r15\n\t"
        "movq	56(%[t]), %%r11\n\t"
        "movq	64(%[t]), %%r12\n\t"
        "movq	72(%[t]), %%r13\n\t"
        "movq	80(%[t]), %%r14\n\t"
        "movq	88(%[t]), %%rax\n\t"
        "# A[7] * B[0]\n\t"
        "mulx	0(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "adoxq	%%r9, %%r12\n\t"
        "# A[7] * B[1]\n\t"
        "mulx	8(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "# A[7] * B[2]\n\t"
        "mulx	16(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "# A[7] * B[3]\n\t"
        "mulx	24(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "movq	%%r11, 56(%[t])\n\t"
        "movq	%%r12, 64(%[t])\n\t"
        "movq	%%r13, 72(%[t])\n\t"
        "movq	%%r14, 80(%[t])\n\t"
        "movq	96(%[t]), %%r10\n\t"
        "movq	104(%[t]), %%r11\n\t"
        "movq	112(%[t]), %%r12\n\t"
        "movq	120(%[t]), %%r13\n\t"
        "# A[7] * B[4]\n\t"
        "mulx	32(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "# A[7] * B[5]\n\t"
        "mulx	40(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "adoxq	%%r9, %%r11\n\t"
        "# A[7] * B[6]\n\t"
        "mulx	48(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "adoxq	%%r9, %%r12\n\t"
        "# A[7] * B[7]\n\t"
        "mulx	56(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "movq	%%rax, 88(%[t])\n\t"
        "movq	%%r10, 96(%[t])\n\t"
        "movq	%%r11, 104(%[t])\n\t"
        "movq	%%r12, 112(%[t])\n\t"
        "movq	128(%[t]), %%r14\n\t"
        "movq	136(%[t]), %%rax\n\t"
        "movq	144(%[t]), %%r10\n\t"
        "movq	152(%[t]), %%r11\n\t"
        "# A[7] * B[8]\n\t"
        "mulx	64(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "# A[7] * B[9]\n\t"
        "mulx	72(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "# A[7] * B[10]\n\t"
        "mulx	80(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "# A[7] * B[11]\n\t"
        "mulx	88(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "adoxq	%%r9, %%r11\n\t"
        "movq	%%r13, 120(%[t])\n\t"
        "movq	%%r14, 128(%[t])\n\t"
        "movq	%%rax, 136(%[t])\n\t"
        "movq	%%r10, 144(%[t])\n\t"
        "movq	160(%[t]), %%r12\n\t"
        "movq	168(%[t]), %%r13\n\t"
        "movq	176(%[t]), %%r14\n\t"
        "# A[7] * B[12]\n\t"
        "mulx	96(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "adoxq	%%r9, %%r12\n\t"
        "# A[7] * B[13]\n\t"
        "mulx	104(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "# A[7] * B[14]\n\t"
        "mulx	112(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "# A[7] * B[15]\n\t"
        "mulx	120(%[b]), %%r8, %%r9\n\t"
        "movq	%%r15, %%rax\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "adcxq	%%rcx, %%rax\n\t"
        "movq	%%r15, %%rcx\n\t"
        "adoxq	%%r15, %%rcx\n\t"
        "adcxq	%%r15, %%rcx\n\t"
        "movq	%%r11, 152(%[t])\n\t"
        "movq	%%r12, 160(%[t])\n\t"
        "movq	%%r13, 168(%[t])\n\t"
        "movq	%%r14, 176(%[t])\n\t"
        "movq	%%rax, 184(%[t])\n\t"
        "movq	64(%[a]), %%rdx\n\t"
        "xorq	%%r15, %%r15\n\t"
        "movq	64(%[t]), %%r12\n\t"
        "movq	72(%[t]), %%r13\n\t"
        "movq	80(%[t]), %%r14\n\t"
        "movq	88(%[t]), %%rax\n\t"
        "movq	96(%[t]), %%r10\n\t"
        "# A[8] * B[0]\n\t"
        "mulx	0(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "# A[8] * B[1]\n\t"
        "mulx	8(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "# A[8] * B[2]\n\t"
        "mulx	16(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "# A[8] * B[3]\n\t"
        "mulx	24(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "movq	%%r12, 64(%[t])\n\t"
        "movq	%%r13, 72(%[t])\n\t"
        "movq	%%r14, 80(%[t])\n\t"
        "movq	%%rax, 88(%[t])\n\t"
        "movq	104(%[t]), %%r11\n\t"
        "movq	112(%[t]), %%r12\n\t"
        "movq	120(%[t]), %%r13\n\t"
        "movq	128(%[t]), %%r14\n\t"
        "# A[8] * B[4]\n\t"
        "mulx	32(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "adoxq	%%r9, %%r11\n\t"
        "# A[8] * B[5]\n\t"
        "mulx	40(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "adoxq	%%r9, %%r12\n\t"
        "# A[8] * B[6]\n\t"
        "mulx	48(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "# A[8] * B[7]\n\t"
        "mulx	56(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "movq	%%r10, 96(%[t])\n\t"
        "movq	%%r11, 104(%[t])\n\t"
        "movq	%%r12, 112(%[t])\n\t"
        "movq	%%r13, 120(%[t])\n\t"
        "movq	136(%[t]), %%rax\n\t"
        "movq	144(%[t]), %%r10\n\t"
        "movq	152(%[t]), %%r11\n\t"
        "movq	160(%[t]), %%r12\n\t"
        "# A[8] * B[8]\n\t"
        "mulx	64(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "# A[8] * B[9]\n\t"
        "mulx	72(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "# A[8] * B[10]\n\t"
        "mulx	80(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "adoxq	%%r9, %%r11\n\t"
        "# A[8] * B[11]\n\t"
        "mulx	88(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "adoxq	%%r9, %%r12\n\t"
        "movq	%%r14, 128(%[t])\n\t"
        "movq	%%rax, 136(%[t])\n\t"
        "movq	%%r10, 144(%[t])\n\t"
        "movq	%%r11, 152(%[t])\n\t"
        "movq	168(%[t]), %%r13\n\t"
        "movq	176(%[t]), %%r14\n\t"
        "movq	184(%[t]), %%rax\n\t"
        "# A[8] * B[12]\n\t"
        "mulx	96(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "# A[8] * B[13]\n\t"
        "mulx	104(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "# A[8] * B[14]\n\t"
        "mulx	112(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "# A[8] * B[15]\n\t"
        "mulx	120(%[b]), %%r8, %%r9\n\t"
        "movq	%%r15, %%r10\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "adcxq	%%rcx, %%r10\n\t"
        "movq	%%r15, %%rcx\n\t"
        "adoxq	%%r15, %%rcx\n\t"
        "adcxq	%%r15, %%rcx\n\t"
        "movq	%%r12, 160(%[t])\n\t"
        "movq	%%r13, 168(%[t])\n\t"
        "movq	%%r14, 176(%[t])\n\t"
        "movq	%%rax, 184(%[t])\n\t"
        "movq	%%r10, 192(%[t])\n\t"
        "movq	72(%[a]), %%rdx\n\t"
        "xorq	%%r15, %%r15\n\t"
        "movq	72(%[t]), %%r13\n\t"
        "movq	80(%[t]), %%r14\n\t"
        "movq	88(%[t]), %%rax\n\t"
        "movq	96(%[t]), %%r10\n\t"
        "movq	104(%[t]), %%r11\n\t"
        "# A[9] * B[0]\n\t"
        "mulx	0(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "# A[9] * B[1]\n\t"
        "mulx	8(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "# A[9] * B[2]\n\t"
        "mulx	16(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "# A[9] * B[3]\n\t"
        "mulx	24(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "adoxq	%%r9, %%r11\n\t"
        "movq	%%r13, 72(%[t])\n\t"
        "movq	%%r14, 80(%[t])\n\t"
        "movq	%%rax, 88(%[t])\n\t"
        "movq	%%r10, 96(%[t])\n\t"
        "movq	112(%[t]), %%r12\n\t"
        "movq	120(%[t]), %%r13\n\t"
        "movq	128(%[t]), %%r14\n\t"
        "movq	136(%[t]), %%rax\n\t"
        "# A[9] * B[4]\n\t"
        "mulx	32(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "adoxq	%%r9, %%r12\n\t"
        "# A[9] * B[5]\n\t"
        "mulx	40(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "# A[9] * B[6]\n\t"
        "mulx	48(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "# A[9] * B[7]\n\t"
        "mulx	56(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "movq	%%r11, 104(%[t])\n\t"
        "movq	%%r12, 112(%[t])\n\t"
        "movq	%%r13, 120(%[t])\n\t"
        "movq	%%r14, 128(%[t])\n\t"
        "movq	144(%[t]), %%r10\n\t"
        "movq	152(%[t]), %%r11\n\t"
        "movq	160(%[t]), %%r12\n\t"
        "movq	168(%[t]), %%r13\n\t"
        "# A[9] * B[8]\n\t"
        "mulx	64(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "# A[9] * B[9]\n\t"
        "mulx	72(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "adoxq	%%r9, %%r11\n\t"
        "# A[9] * B[10]\n\t"
        "mulx	80(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "adoxq	%%r9, %%r12\n\t"
        "# A[9] * B[11]\n\t"
        "mulx	88(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "movq	%%rax, 136(%[t])\n\t"
        "movq	%%r10, 144(%[t])\n\t"
        "movq	%%r11, 152(%[t])\n\t"
        "movq	%%r12, 160(%[t])\n\t"
        "movq	176(%[t]), %%r14\n\t"
        "movq	184(%[t]), %%rax\n\t"
        "movq	192(%[t]), %%r10\n\t"
        "# A[9] * B[12]\n\t"
        "mulx	96(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "# A[9] * B[13]\n\t"
        "mulx	104(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "# A[9] * B[14]\n\t"
        "mulx	112(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "# A[9] * B[15]\n\t"
        "mulx	120(%[b]), %%r8, %%r9\n\t"
        "movq	%%r15, %%r11\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "adoxq	%%r9, %%r11\n\t"
        "adcxq	%%rcx, %%r11\n\t"
        "movq	%%r15, %%rcx\n\t"
        "adoxq	%%r15, %%rcx\n\t"
        "adcxq	%%r15, %%rcx\n\t"
        "movq	%%r13, 168(%[t])\n\t"
        "movq	%%r14, 176(%[t])\n\t"
        "movq	%%rax, 184(%[t])\n\t"
        "movq	%%r10, 192(%[t])\n\t"
        "movq	%%r11, 200(%[t])\n\t"
        "movq	80(%[a]), %%rdx\n\t"
        "xorq	%%r15, %%r15\n\t"
        "movq	80(%[t]), %%r14\n\t"
        "movq	88(%[t]), %%rax\n\t"
        "movq	96(%[t]), %%r10\n\t"
        "movq	104(%[t]), %%r11\n\t"
        "movq	112(%[t]), %%r12\n\t"
        "# A[10] * B[0]\n\t"
        "mulx	0(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "# A[10] * B[1]\n\t"
        "mulx	8(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "# A[10] * B[2]\n\t"
        "mulx	16(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "adoxq	%%r9, %%r11\n\t"
        "# A[10] * B[3]\n\t"
        "mulx	24(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "adoxq	%%r9, %%r12\n\t"
        "movq	%%r14, 80(%[t])\n\t"
        "movq	%%rax, 88(%[t])\n\t"
        "movq	%%r10, 96(%[t])\n\t"
        "movq	%%r11, 104(%[t])\n\t"
        "movq	120(%[t]), %%r13\n\t"
        "movq	128(%[t]), %%r14\n\t"
        "movq	136(%[t]), %%rax\n\t"
        "movq	144(%[t]), %%r10\n\t"
        "# A[10] * B[4]\n\t"
        "mulx	32(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "# A[10] * B[5]\n\t"
        "mulx	40(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "# A[10] * B[6]\n\t"
        "mulx	48(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "# A[10] * B[7]\n\t"
        "mulx	56(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "movq	%%r12, 112(%[t])\n\t"
        "movq	%%r13, 120(%[t])\n\t"
        "movq	%%r14, 128(%[t])\n\t"
        "movq	%%rax, 136(%[t])\n\t"
        "movq	152(%[t]), %%r11\n\t"
        "movq	160(%[t]), %%r12\n\t"
        "movq	168(%[t]), %%r13\n\t"
        "movq	176(%[t]), %%r14\n\t"
        "# A[10] * B[8]\n\t"
        "mulx	64(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "adoxq	%%r9, %%r11\n\t"
        "# A[10] * B[9]\n\t"
        "mulx	72(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "adoxq	%%r9, %%r12\n\t"
        "# A[10] * B[10]\n\t"
        "mulx	80(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "# A[10] * B[11]\n\t"
        "mulx	88(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "movq	%%r10, 144(%[t])\n\t"
        "movq	%%r11, 152(%[t])\n\t"
        "movq	%%r12, 160(%[t])\n\t"
        "movq	%%r13, 168(%[t])\n\t"
        "movq	184(%[t]), %%rax\n\t"
        "movq	192(%[t]), %%r10\n\t"
        "movq	200(%[t]), %%r11\n\t"
        "# A[10] * B[12]\n\t"
        "mulx	96(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "# A[10] * B[13]\n\t"
        "mulx	104(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "# A[10] * B[14]\n\t"
        "mulx	112(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "adoxq	%%r9, %%r11\n\t"
        "# A[10] * B[15]\n\t"
        "mulx	120(%[b]), %%r8, %%r9\n\t"
        "movq	%%r15, %%r12\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "adoxq	%%r9, %%r12\n\t"
        "adcxq	%%rcx, %%r12\n\t"
        "movq	%%r15, %%rcx\n\t"
        "adoxq	%%r15, %%rcx\n\t"
        "adcxq	%%r15, %%rcx\n\t"
        "movq	%%r14, 176(%[t])\n\t"
        "movq	%%rax, 184(%[t])\n\t"
        "movq	%%r10, 192(%[t])\n\t"
        "movq	%%r11, 200(%[t])\n\t"
        "movq	%%r12, 208(%[t])\n\t"
        "movq	88(%[a]), %%rdx\n\t"
        "xorq	%%r15, %%r15\n\t"
        "movq	88(%[t]), %%rax\n\t"
        "movq	96(%[t]), %%r10\n\t"
        "movq	104(%[t]), %%r11\n\t"
        "movq	112(%[t]), %%r12\n\t"
        "movq	120(%[t]), %%r13\n\t"
        "# A[11] * B[0]\n\t"
        "mulx	0(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "# A[11] * B[1]\n\t"
        "mulx	8(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "adoxq	%%r9, %%r11\n\t"
        "# A[11] * B[2]\n\t"
        "mulx	16(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "adoxq	%%r9, %%r12\n\t"
        "# A[11] * B[3]\n\t"
        "mulx	24(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "movq	%%rax, 88(%[t])\n\t"
        "movq	%%r10, 96(%[t])\n\t"
        "movq	%%r11, 104(%[t])\n\t"
        "movq	%%r12, 112(%[t])\n\t"
        "movq	128(%[t]), %%r14\n\t"
        "movq	136(%[t]), %%rax\n\t"
        "movq	144(%[t]), %%r10\n\t"
        "movq	152(%[t]), %%r11\n\t"
        "# A[11] * B[4]\n\t"
        "mulx	32(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "# A[11] * B[5]\n\t"
        "mulx	40(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "# A[11] * B[6]\n\t"
        "mulx	48(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "# A[11] * B[7]\n\t"
        "mulx	56(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "adoxq	%%r9, %%r11\n\t"
        "movq	%%r13, 120(%[t])\n\t"
        "movq	%%r14, 128(%[t])\n\t"
        "movq	%%rax, 136(%[t])\n\t"
        "movq	%%r10, 144(%[t])\n\t"
        "movq	160(%[t]), %%r12\n\t"
        "movq	168(%[t]), %%r13\n\t"
        "movq	176(%[t]), %%r14\n\t"
        "movq	184(%[t]), %%rax\n\t"
        "# A[11] * B[8]\n\t"
        "mulx	64(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "adoxq	%%r9, %%r12\n\t"
        "# A[11] * B[9]\n\t"
        "mulx	72(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "# A[11] * B[10]\n\t"
        "mulx	80(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "# A[11] * B[11]\n\t"
        "mulx	88(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "movq	%%r11, 152(%[t])\n\t"
        "movq	%%r12, 160(%[t])\n\t"
        "movq	%%r13, 168(%[t])\n\t"
        "movq	%%r14, 176(%[t])\n\t"
        "movq	192(%[t]), %%r10\n\t"
        "movq	200(%[t]), %%r11\n\t"
        "movq	208(%[t]), %%r12\n\t"
        "# A[11] * B[12]\n\t"
        "mulx	96(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "# A[11] * B[13]\n\t"
        "mulx	104(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "adoxq	%%r9, %%r11\n\t"
        "# A[11] * B[14]\n\t"
        "mulx	112(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "adoxq	%%r9, %%r12\n\t"
        "# A[11] * B[15]\n\t"
        "mulx	120(%[b]), %%r8, %%r9\n\t"
        "movq	%%r15, %%r13\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "adcxq	%%rcx, %%r13\n\t"
        "movq	%%r15, %%rcx\n\t"
        "adoxq	%%r15, %%rcx\n\t"
        "adcxq	%%r15, %%rcx\n\t"
        "movq	%%rax, 184(%[t])\n\t"
        "movq	%%r10, 192(%[t])\n\t"
        "movq	%%r11, 200(%[t])\n\t"
        "movq	%%r12, 208(%[t])\n\t"
        "movq	%%r13, 216(%[t])\n\t"
        "movq	96(%[a]), %%rdx\n\t"
        "xorq	%%r15, %%r15\n\t"
        "movq	96(%[t]), %%r10\n\t"
        "movq	104(%[t]), %%r11\n\t"
        "movq	112(%[t]), %%r12\n\t"
        "movq	120(%[t]), %%r13\n\t"
        "movq	128(%[t]), %%r14\n\t"
        "# A[12] * B[0]\n\t"
        "mulx	0(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "adoxq	%%r9, %%r11\n\t"
        "# A[12] * B[1]\n\t"
        "mulx	8(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "adoxq	%%r9, %%r12\n\t"
        "# A[12] * B[2]\n\t"
        "mulx	16(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "# A[12] * B[3]\n\t"
        "mulx	24(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "movq	%%r10, 96(%[t])\n\t"
        "movq	%%r11, 104(%[t])\n\t"
        "movq	%%r12, 112(%[t])\n\t"
        "movq	%%r13, 120(%[t])\n\t"
        "movq	136(%[t]), %%rax\n\t"
        "movq	144(%[t]), %%r10\n\t"
        "movq	152(%[t]), %%r11\n\t"
        "movq	160(%[t]), %%r12\n\t"
        "# A[12] * B[4]\n\t"
        "mulx	32(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "# A[12] * B[5]\n\t"
        "mulx	40(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "# A[12] * B[6]\n\t"
        "mulx	48(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "adoxq	%%r9, %%r11\n\t"
        "# A[12] * B[7]\n\t"
        "mulx	56(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "adoxq	%%r9, %%r12\n\t"
        "movq	%%r14, 128(%[t])\n\t"
        "movq	%%rax, 136(%[t])\n\t"
        "movq	%%r10, 144(%[t])\n\t"
        "movq	%%r11, 152(%[t])\n\t"
        "movq	168(%[t]), %%r13\n\t"
        "movq	176(%[t]), %%r14\n\t"
        "movq	184(%[t]), %%rax\n\t"
        "movq	192(%[t]), %%r10\n\t"
        "# A[12] * B[8]\n\t"
        "mulx	64(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "# A[12] * B[9]\n\t"
        "mulx	72(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "# A[12] * B[10]\n\t"
        "mulx	80(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "# A[12] * B[11]\n\t"
        "mulx	88(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "movq	%%r12, 160(%[t])\n\t"
        "movq	%%r13, 168(%[t])\n\t"
        "movq	%%r14, 176(%[t])\n\t"
        "movq	%%rax, 184(%[t])\n\t"
        "movq	200(%[t]), %%r11\n\t"
        "movq	208(%[t]), %%r12\n\t"
        "movq	216(%[t]), %%r13\n\t"
        "# A[12] * B[12]\n\t"
        "mulx	96(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "adoxq	%%r9, %%r11\n\t"
        "# A[12] * B[13]\n\t"
        "mulx	104(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "adoxq	%%r9, %%r12\n\t"
        "# A[12] * B[14]\n\t"
        "mulx	112(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "# A[12] * B[15]\n\t"
        "mulx	120(%[b]), %%r8, %%r9\n\t"
        "movq	%%r15, %%r14\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "adcxq	%%rcx, %%r14\n\t"
        "movq	%%r15, %%rcx\n\t"
        "adoxq	%%r15, %%rcx\n\t"
        "adcxq	%%r15, %%rcx\n\t"
        "movq	%%r10, 192(%[t])\n\t"
        "movq	%%r11, 200(%[t])\n\t"
        "movq	%%r12, 208(%[t])\n\t"
        "movq	%%r13, 216(%[t])\n\t"
        "movq	%%r14, 224(%[t])\n\t"
        "movq	104(%[a]), %%rdx\n\t"
        "xorq	%%r15, %%r15\n\t"
        "movq	104(%[t]), %%r11\n\t"
        "movq	112(%[t]), %%r12\n\t"
        "movq	120(%[t]), %%r13\n\t"
        "movq	128(%[t]), %%r14\n\t"
        "movq	136(%[t]), %%rax\n\t"
        "# A[13] * B[0]\n\t"
        "mulx	0(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "adoxq	%%r9, %%r12\n\t"
        "# A[13] * B[1]\n\t"
        "mulx	8(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "# A[13] * B[2]\n\t"
        "mulx	16(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "# A[13] * B[3]\n\t"
        "mulx	24(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "movq	%%r11, 104(%[t])\n\t"
        "movq	%%r12, 112(%[t])\n\t"
        "movq	%%r13, 120(%[t])\n\t"
        "movq	%%r14, 128(%[t])\n\t"
        "movq	144(%[t]), %%r10\n\t"
        "movq	152(%[t]), %%r11\n\t"
        "movq	160(%[t]), %%r12\n\t"
        "movq	168(%[t]), %%r13\n\t"
        "# A[13] * B[4]\n\t"
        "mulx	32(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "# A[13] * B[5]\n\t"
        "mulx	40(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "adoxq	%%r9, %%r11\n\t"
        "# A[13] * B[6]\n\t"
        "mulx	48(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "adoxq	%%r9, %%r12\n\t"
        "# A[13] * B[7]\n\t"
        "mulx	56(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "movq	%%rax, 136(%[t])\n\t"
        "movq	%%r10, 144(%[t])\n\t"
        "movq	%%r11, 152(%[t])\n\t"
        "movq	%%r12, 160(%[t])\n\t"
        "movq	176(%[t]), %%r14\n\t"
        "movq	184(%[t]), %%rax\n\t"
        "movq	192(%[t]), %%r10\n\t"
        "movq	200(%[t]), %%r11\n\t"
        "# A[13] * B[8]\n\t"
        "mulx	64(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "# A[13] * B[9]\n\t"
        "mulx	72(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "# A[13] * B[10]\n\t"
        "mulx	80(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "# A[13] * B[11]\n\t"
        "mulx	88(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "adoxq	%%r9, %%r11\n\t"
        "movq	%%r13, 168(%[t])\n\t"
        "movq	%%r14, 176(%[t])\n\t"
        "movq	%%rax, 184(%[t])\n\t"
        "movq	%%r10, 192(%[t])\n\t"
        "movq	208(%[t]), %%r12\n\t"
        "movq	216(%[t]), %%r13\n\t"
        "movq	224(%[t]), %%r14\n\t"
        "# A[13] * B[12]\n\t"
        "mulx	96(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "adoxq	%%r9, %%r12\n\t"
        "# A[13] * B[13]\n\t"
        "mulx	104(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "# A[13] * B[14]\n\t"
        "mulx	112(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "# A[13] * B[15]\n\t"
        "mulx	120(%[b]), %%r8, %%r9\n\t"
        "movq	%%r15, %%rax\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "adcxq	%%rcx, %%rax\n\t"
        "movq	%%r15, %%rcx\n\t"
        "adoxq	%%r15, %%rcx\n\t"
        "adcxq	%%r15, %%rcx\n\t"
        "movq	%%r11, 200(%[t])\n\t"
        "movq	%%r12, 208(%[t])\n\t"
        "movq	%%r13, 216(%[t])\n\t"
        "movq	%%r14, 224(%[t])\n\t"
        "movq	%%rax, 232(%[t])\n\t"
        "movq	112(%[a]), %%rdx\n\t"
        "xorq	%%r15, %%r15\n\t"
        "movq	112(%[t]), %%r12\n\t"
        "movq	120(%[t]), %%r13\n\t"
        "movq	128(%[t]), %%r14\n\t"
        "movq	136(%[t]), %%rax\n\t"
        "movq	144(%[t]), %%r10\n\t"
        "# A[14] * B[0]\n\t"
        "mulx	0(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "# A[14] * B[1]\n\t"
        "mulx	8(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "# A[14] * B[2]\n\t"
        "mulx	16(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "# A[14] * B[3]\n\t"
        "mulx	24(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "movq	%%r12, 112(%[t])\n\t"
        "movq	%%r13, 120(%[t])\n\t"
        "movq	%%r14, 128(%[t])\n\t"
        "movq	%%rax, 136(%[t])\n\t"
        "movq	152(%[t]), %%r11\n\t"
        "movq	160(%[t]), %%r12\n\t"
        "movq	168(%[t]), %%r13\n\t"
        "movq	176(%[t]), %%r14\n\t"
        "# A[14] * B[4]\n\t"
        "mulx	32(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "adoxq	%%r9, %%r11\n\t"
        "# A[14] * B[5]\n\t"
        "mulx	40(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "adoxq	%%r9, %%r12\n\t"
        "# A[14] * B[6]\n\t"
        "mulx	48(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "# A[14] * B[7]\n\t"
        "mulx	56(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "movq	%%r10, 144(%[t])\n\t"
        "movq	%%r11, 152(%[t])\n\t"
        "movq	%%r12, 160(%[t])\n\t"
        "movq	%%r13, 168(%[t])\n\t"
        "movq	184(%[t]), %%rax\n\t"
        "movq	192(%[t]), %%r10\n\t"
        "movq	200(%[t]), %%r11\n\t"
        "movq	208(%[t]), %%r12\n\t"
        "# A[14] * B[8]\n\t"
        "mulx	64(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "# A[14] * B[9]\n\t"
        "mulx	72(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "# A[14] * B[10]\n\t"
        "mulx	80(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "adoxq	%%r9, %%r11\n\t"
        "# A[14] * B[11]\n\t"
        "mulx	88(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "adoxq	%%r9, %%r12\n\t"
        "movq	%%r14, 176(%[t])\n\t"
        "movq	%%rax, 184(%[t])\n\t"
        "movq	%%r10, 192(%[t])\n\t"
        "movq	%%r11, 200(%[t])\n\t"
        "movq	216(%[t]), %%r13\n\t"
        "movq	224(%[t]), %%r14\n\t"
        "movq	232(%[t]), %%rax\n\t"
        "# A[14] * B[12]\n\t"
        "mulx	96(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "# A[14] * B[13]\n\t"
        "mulx	104(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "# A[14] * B[14]\n\t"
        "mulx	112(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "# A[14] * B[15]\n\t"
        "mulx	120(%[b]), %%r8, %%r9\n\t"
        "movq	%%r15, %%r10\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "adcxq	%%rcx, %%r10\n\t"
        "movq	%%r15, %%rcx\n\t"
        "adoxq	%%r15, %%rcx\n\t"
        "adcxq	%%r15, %%rcx\n\t"
        "movq	%%r12, 208(%[t])\n\t"
        "movq	%%r13, 216(%[t])\n\t"
        "movq	%%r14, 224(%[t])\n\t"
        "movq	%%rax, 232(%[t])\n\t"
        "movq	%%r10, 240(%[t])\n\t"
        "movq	120(%[a]), %%rdx\n\t"
        "xorq	%%r15, %%r15\n\t"
        "movq	120(%[t]), %%r13\n\t"
        "movq	128(%[t]), %%r14\n\t"
        "movq	136(%[t]), %%rax\n\t"
        "movq	144(%[t]), %%r10\n\t"
        "movq	152(%[t]), %%r11\n\t"
        "# A[15] * B[0]\n\t"
        "mulx	0(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "# A[15] * B[1]\n\t"
        "mulx	8(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "# A[15] * B[2]\n\t"
        "mulx	16(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "# A[15] * B[3]\n\t"
        "mulx	24(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "adoxq	%%r9, %%r11\n\t"
        "movq	%%r13, 120(%[t])\n\t"
        "movq	%%r14, 128(%[t])\n\t"
        "movq	%%rax, 136(%[t])\n\t"
        "movq	%%r10, 144(%[t])\n\t"
        "movq	160(%[t]), %%r12\n\t"
        "movq	168(%[t]), %%r13\n\t"
        "movq	176(%[t]), %%r14\n\t"
        "movq	184(%[t]), %%rax\n\t"
        "# A[15] * B[4]\n\t"
        "mulx	32(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "adoxq	%%r9, %%r12\n\t"
        "# A[15] * B[5]\n\t"
        "mulx	40(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "# A[15] * B[6]\n\t"
        "mulx	48(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "# A[15] * B[7]\n\t"
        "mulx	56(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "movq	%%r11, 152(%[t])\n\t"
        "movq	%%r12, 160(%[t])\n\t"
        "movq	%%r13, 168(%[t])\n\t"
        "movq	%%r14, 176(%[t])\n\t"
        "movq	192(%[t]), %%r10\n\t"
        "movq	200(%[t]), %%r11\n\t"
        "movq	208(%[t]), %%r12\n\t"
        "movq	216(%[t]), %%r13\n\t"
        "# A[15] * B[8]\n\t"
        "mulx	64(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "# A[15] * B[9]\n\t"
        "mulx	72(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "adoxq	%%r9, %%r11\n\t"
        "# A[15] * B[10]\n\t"
        "mulx	80(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "adoxq	%%r9, %%r12\n\t"
        "# A[15] * B[11]\n\t"
        "mulx	88(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "movq	%%rax, 184(%[t])\n\t"
        "movq	%%r10, 192(%[t])\n\t"
        "movq	%%r11, 200(%[t])\n\t"
        "movq	%%r12, 208(%[t])\n\t"
        "movq	224(%[t]), %%r14\n\t"
        "movq	232(%[t]), %%rax\n\t"
        "movq	240(%[t]), %%r10\n\t"
        "# A[15] * B[12]\n\t"
        "mulx	96(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "# A[15] * B[13]\n\t"
        "mulx	104(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "# A[15] * B[14]\n\t"
        "mulx	112(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "# A[15] * B[15]\n\t"
        "mulx	120(%[b]), %%r8, %%r9\n\t"
        "movq	%%r15, %%r11\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "adoxq	%%r9, %%r11\n\t"
        "adcxq	%%rcx, %%r11\n\t"
        "movq	%%r13, 216(%[t])\n\t"
        "movq	%%r14, 224(%[t])\n\t"
        "movq	%%rax, 232(%[t])\n\t"
        "movq	%%r10, 240(%[t])\n\t"
        "movq	%%r11, 248(%[t])\n\t"
        :
        : [a] "r" (a), [b] "r" (b), [t] "r" (tmp)
        : "memory", "rax", "rdx", "rcx",
          "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15"
    );

    XMEMCPY(r, tmp, sizeof(tmp));
}
#endif /* HAVE_INTEL_AVX2 */

#ifdef HAVE_INTEL_AVX2
/* Square a and put the result in r. (r = a * a)
 *
 * r  A single precision integer that receives the result of the squaring.
 * a  A single precision integer to be squared.
 */
SP_NOINLINE static void sp_2048_sqr_avx2_16(sp_digit* r, const sp_digit* a)
{
    sp_digit tmp[32];

    __asm__ __volatile__ (
        "xorq	%%r8, %%r8\n\t"
        "# Diagonal 1\n\t"
        "xorq	%%r10, %%r10\n\t"
        "xorq	%%r11, %%r11\n\t"
        "xorq	%%r12, %%r12\n\t"
        "xorq	%%r13, %%r13\n\t"
        "xorq	%%r14, %%r14\n\t"
        "xorq	%%r15, %%r15\n\t"
        "# A[1] x A[0]\n\t"
        "movq	0(%[a]), %%rdx\n\t"
        "mulxq	8(%[a]), %%r10, %%r11\n\t"
        "# A[2] x A[0]\n\t"
        "mulxq	16(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r11\n\t"
        "adoxq	%%rcx, %%r12\n\t"
        "# A[3] x A[0]\n\t"
        "mulxq	24(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r12\n\t"
        "adoxq	%%rcx, %%r13\n\t"
        "# A[4] x A[0]\n\t"
        "mulxq	32(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r13\n\t"
        "adoxq	%%rcx, %%r14\n\t"
        "# A[5] x A[0]\n\t"
        "mulxq	40(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r14\n\t"
        "adoxq	%%rcx, %%r15\n\t"
        "movq	%%r10, 8(%[tmp])\n\t"
        "movq	%%r11, 16(%[tmp])\n\t"
        "movq	%%r12, 24(%[tmp])\n\t"
        "movq	%%r13, 32(%[tmp])\n\t"
        "movq	%%r14, 40(%[tmp])\n\t"
        "movq	%%r8, %%r10\n\t"
        "movq	%%r8, %%r11\n\t"
        "movq	%%r8, %%r12\n\t"
        "movq	%%r8, %%r13\n\t"
        "movq	%%r8, %%r14\n\t"
        "# A[6] x A[0]\n\t"
        "mulxq	48(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r15\n\t"
        "adoxq	%%rcx, %%r10\n\t"
        "# A[7] x A[0]\n\t"
        "mulxq	56(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r10\n\t"
        "adoxq	%%rcx, %%r11\n\t"
        "# A[8] x A[0]\n\t"
        "mulxq	64(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r11\n\t"
        "adoxq	%%rcx, %%r12\n\t"
        "# A[9] x A[0]\n\t"
        "mulxq	72(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r12\n\t"
        "adoxq	%%rcx, %%r13\n\t"
        "# A[10] x A[0]\n\t"
        "mulxq	80(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r13\n\t"
        "adoxq	%%rcx, %%r14\n\t"
        "movq	%%r15, 48(%[tmp])\n\t"
        "movq	%%r10, 56(%[tmp])\n\t"
        "movq	%%r11, 64(%[tmp])\n\t"
        "movq	%%r12, 72(%[tmp])\n\t"
        "movq	%%r13, 80(%[tmp])\n\t"
        "movq	%%r8, %%r15\n\t"
        "movq	%%r8, %%r10\n\t"
        "movq	%%r8, %%r11\n\t"
        "movq	%%r8, %%r12\n\t"
        "movq	%%r8, %%r13\n\t"
        "# A[11] x A[0]\n\t"
        "mulxq	88(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r14\n\t"
        "adoxq	%%rcx, %%r15\n\t"
        "# A[12] x A[0]\n\t"
        "mulxq	96(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r15\n\t"
        "adoxq	%%rcx, %%r10\n\t"
        "# A[13] x A[0]\n\t"
        "mulxq	104(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r10\n\t"
        "adoxq	%%rcx, %%r11\n\t"
        "# A[14] x A[0]\n\t"
        "mulxq	112(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r11\n\t"
        "adoxq	%%rcx, %%r12\n\t"
        "# A[15] x A[0]\n\t"
        "mulxq	120(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r12\n\t"
        "adoxq	%%rcx, %%r13\n\t"
        "movq	%%r14, 88(%[tmp])\n\t"
        "movq	%%r15, 96(%[tmp])\n\t"
        "movq	%%r10, 104(%[tmp])\n\t"
        "movq	%%r11, 112(%[tmp])\n\t"
        "movq	%%r12, 120(%[tmp])\n\t"
        "#  Carry\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "movq	%%r8, %%r9\n\t"
        "adcxq	%%r8, %%r9\n\t"
        "adoxq	%%r8, %%r9\n\t"
        "movq	%%r13, 128(%[tmp])\n\t"
        "xorq	%%r8, %%r8\n\t"
        "# Diagonal 2\n\t"
        "movq	24(%[tmp]), %%r13\n\t"
        "movq	32(%[tmp]), %%r14\n\t"
        "movq	40(%[tmp]), %%r15\n\t"
        "movq	48(%[tmp]), %%r10\n\t"
        "movq	56(%[tmp]), %%r11\n\t"
        "movq	64(%[tmp]), %%r12\n\t"
        "# A[2] x A[1]\n\t"
        "movq	8(%[a]), %%rdx\n\t"
        "mulxq	16(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r13\n\t"
        "adoxq	%%rcx, %%r14\n\t"
        "# A[3] x A[1]\n\t"
        "mulxq	24(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r14\n\t"
        "adoxq	%%rcx, %%r15\n\t"
        "# A[4] x A[1]\n\t"
        "mulxq	32(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r15\n\t"
        "adoxq	%%rcx, %%r10\n\t"
        "# A[5] x A[1]\n\t"
        "mulxq	40(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r10\n\t"
        "adoxq	%%rcx, %%r11\n\t"
        "# A[6] x A[1]\n\t"
        "mulxq	48(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r11\n\t"
        "adoxq	%%rcx, %%r12\n\t"
        "movq	%%r13, 24(%[tmp])\n\t"
        "movq	%%r14, 32(%[tmp])\n\t"
        "movq	%%r15, 40(%[tmp])\n\t"
        "movq	%%r10, 48(%[tmp])\n\t"
        "movq	%%r11, 56(%[tmp])\n\t"
        "movq	72(%[tmp]), %%r13\n\t"
        "movq	80(%[tmp]), %%r14\n\t"
        "movq	88(%[tmp]), %%r15\n\t"
        "movq	96(%[tmp]), %%r10\n\t"
        "movq	104(%[tmp]), %%r11\n\t"
        "# A[7] x A[1]\n\t"
        "mulxq	56(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r12\n\t"
        "adoxq	%%rcx, %%r13\n\t"
        "# A[8] x A[1]\n\t"
        "mulxq	64(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r13\n\t"
        "adoxq	%%rcx, %%r14\n\t"
        "# A[9] x A[1]\n\t"
        "mulxq	72(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r14\n\t"
        "adoxq	%%rcx, %%r15\n\t"
        "# A[10] x A[1]\n\t"
        "mulxq	80(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r15\n\t"
        "adoxq	%%rcx, %%r10\n\t"
        "# A[11] x A[1]\n\t"
        "mulxq	88(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r10\n\t"
        "adoxq	%%rcx, %%r11\n\t"
        "movq	%%r12, 64(%[tmp])\n\t"
        "movq	%%r13, 72(%[tmp])\n\t"
        "movq	%%r14, 80(%[tmp])\n\t"
        "movq	%%r15, 88(%[tmp])\n\t"
        "movq	%%r10, 96(%[tmp])\n\t"
        "movq	112(%[tmp]), %%r12\n\t"
        "movq	120(%[tmp]), %%r13\n\t"
        "movq	128(%[tmp]), %%r14\n\t"
        "movq	%%r8, %%r15\n\t"
        "movq	%%r8, %%r10\n\t"
        "# A[12] x A[1]\n\t"
        "mulxq	96(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r11\n\t"
        "adoxq	%%rcx, %%r12\n\t"
        "# A[13] x A[1]\n\t"
        "mulxq	104(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r12\n\t"
        "adoxq	%%rcx, %%r13\n\t"
        "# A[14] x A[1]\n\t"
        "mulxq	112(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r13\n\t"
        "adoxq	%%rcx, %%r14\n\t"
        "# A[15] x A[1]\n\t"
        "mulxq	120(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r14\n\t"
        "adoxq	%%rcx, %%r15\n\t"
        "# A[15] x A[2]\n\t"
        "movq	16(%[a]), %%rdx\n\t"
        "mulxq	120(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r15\n\t"
        "adoxq	%%rcx, %%r10\n\t"
        "movq	%%r11, 104(%[tmp])\n\t"
        "movq	%%r12, 112(%[tmp])\n\t"
        "movq	%%r13, 120(%[tmp])\n\t"
        "movq	%%r14, 128(%[tmp])\n\t"
        "movq	%%r15, 136(%[tmp])\n\t"
        "#  Carry\n\t"
        "adcxq	%%r9, %%r10\n\t"
        "movq	%%r8, %%r9\n\t"
        "adcxq	%%r8, %%r9\n\t"
        "adoxq	%%r8, %%r9\n\t"
        "movq	%%r10, 144(%[tmp])\n\t"
        "xorq	%%r8, %%r8\n\t"
        "# Diagonal 3\n\t"
        "movq	40(%[tmp]), %%r10\n\t"
        "movq	48(%[tmp]), %%r11\n\t"
        "movq	56(%[tmp]), %%r12\n\t"
        "movq	64(%[tmp]), %%r13\n\t"
        "movq	72(%[tmp]), %%r14\n\t"
        "movq	80(%[tmp]), %%r15\n\t"
        "# A[3] x A[2]\n\t"
        "mulxq	24(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r10\n\t"
        "adoxq	%%rcx, %%r11\n\t"
        "# A[4] x A[2]\n\t"
        "mulxq	32(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r11\n\t"
        "adoxq	%%rcx, %%r12\n\t"
        "# A[5] x A[2]\n\t"
        "mulxq	40(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r12\n\t"
        "adoxq	%%rcx, %%r13\n\t"
        "# A[6] x A[2]\n\t"
        "mulxq	48(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r13\n\t"
        "adoxq	%%rcx, %%r14\n\t"
        "# A[7] x A[2]\n\t"
        "mulxq	56(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r14\n\t"
        "adoxq	%%rcx, %%r15\n\t"
        "movq	%%r10, 40(%[tmp])\n\t"
        "movq	%%r11, 48(%[tmp])\n\t"
        "movq	%%r12, 56(%[tmp])\n\t"
        "movq	%%r13, 64(%[tmp])\n\t"
        "movq	%%r14, 72(%[tmp])\n\t"
        "movq	88(%[tmp]), %%r10\n\t"
        "movq	96(%[tmp]), %%r11\n\t"
        "movq	104(%[tmp]), %%r12\n\t"
        "movq	112(%[tmp]), %%r13\n\t"
        "movq	120(%[tmp]), %%r14\n\t"
        "# A[8] x A[2]\n\t"
        "mulxq	64(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r15\n\t"
        "adoxq	%%rcx, %%r10\n\t"
        "# A[9] x A[2]\n\t"
        "mulxq	72(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r10\n\t"
        "adoxq	%%rcx, %%r11\n\t"
        "# A[10] x A[2]\n\t"
        "mulxq	80(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r11\n\t"
        "adoxq	%%rcx, %%r12\n\t"
        "# A[11] x A[2]\n\t"
        "mulxq	88(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r12\n\t"
        "adoxq	%%rcx, %%r13\n\t"
        "# A[12] x A[2]\n\t"
        "mulxq	96(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r13\n\t"
        "adoxq	%%rcx, %%r14\n\t"
        "movq	%%r15, 80(%[tmp])\n\t"
        "movq	%%r10, 88(%[tmp])\n\t"
        "movq	%%r11, 96(%[tmp])\n\t"
        "movq	%%r12, 104(%[tmp])\n\t"
        "movq	%%r13, 112(%[tmp])\n\t"
        "movq	128(%[tmp]), %%r15\n\t"
        "movq	136(%[tmp]), %%r10\n\t"
        "movq	144(%[tmp]), %%r11\n\t"
        "movq	%%r8, %%r12\n\t"
        "movq	%%r8, %%r13\n\t"
        "# A[13] x A[2]\n\t"
        "mulxq	104(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r14\n\t"
        "adoxq	%%rcx, %%r15\n\t"
        "# A[14] x A[2]\n\t"
        "mulxq	112(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r15\n\t"
        "adoxq	%%rcx, %%r10\n\t"
        "# A[14] x A[3]\n\t"
        "movq	112(%[a]), %%rdx\n\t"
        "mulxq	24(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r10\n\t"
        "adoxq	%%rcx, %%r11\n\t"
        "# A[14] x A[4]\n\t"
        "mulxq	32(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r11\n\t"
        "adoxq	%%rcx, %%r12\n\t"
        "# A[14] x A[5]\n\t"
        "mulxq	40(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r12\n\t"
        "adoxq	%%rcx, %%r13\n\t"
        "movq	%%r14, 120(%[tmp])\n\t"
        "movq	%%r15, 128(%[tmp])\n\t"
        "movq	%%r10, 136(%[tmp])\n\t"
        "movq	%%r11, 144(%[tmp])\n\t"
        "movq	%%r12, 152(%[tmp])\n\t"
        "#  Carry\n\t"
        "adcxq	%%r9, %%r13\n\t"
        "movq	%%r8, %%r9\n\t"
        "adcxq	%%r8, %%r9\n\t"
        "adoxq	%%r8, %%r9\n\t"
        "movq	%%r13, 160(%[tmp])\n\t"
        "xorq	%%r8, %%r8\n\t"
        "# Diagonal 4\n\t"
        "movq	56(%[tmp]), %%r13\n\t"
        "movq	64(%[tmp]), %%r14\n\t"
        "movq	72(%[tmp]), %%r15\n\t"
        "movq	80(%[tmp]), %%r10\n\t"
        "movq	88(%[tmp]), %%r11\n\t"
        "movq	96(%[tmp]), %%r12\n\t"
        "# A[4] x A[3]\n\t"
        "movq	24(%[a]), %%rdx\n\t"
        "mulxq	32(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r13\n\t"
        "adoxq	%%rcx, %%r14\n\t"
        "# A[5] x A[3]\n\t"
        "mulxq	40(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r14\n\t"
        "adoxq	%%rcx, %%r15\n\t"
        "# A[6] x A[3]\n\t"
        "mulxq	48(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r15\n\t"
        "adoxq	%%rcx, %%r10\n\t"
        "# A[7] x A[3]\n\t"
        "mulxq	56(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r10\n\t"
        "adoxq	%%rcx, %%r11\n\t"
        "# A[8] x A[3]\n\t"
        "mulxq	64(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r11\n\t"
        "adoxq	%%rcx, %%r12\n\t"
        "movq	%%r13, 56(%[tmp])\n\t"
        "movq	%%r14, 64(%[tmp])\n\t"
        "movq	%%r15, 72(%[tmp])\n\t"
        "movq	%%r10, 80(%[tmp])\n\t"
        "movq	%%r11, 88(%[tmp])\n\t"
        "movq	104(%[tmp]), %%r13\n\t"
        "movq	112(%[tmp]), %%r14\n\t"
        "movq	120(%[tmp]), %%r15\n\t"
        "movq	128(%[tmp]), %%r10\n\t"
        "movq	136(%[tmp]), %%r11\n\t"
        "# A[9] x A[3]\n\t"
        "mulxq	72(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r12\n\t"
        "adoxq	%%rcx, %%r13\n\t"
        "# A[10] x A[3]\n\t"
        "mulxq	80(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r13\n\t"
        "adoxq	%%rcx, %%r14\n\t"
        "# A[11] x A[3]\n\t"
        "mulxq	88(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r14\n\t"
        "adoxq	%%rcx, %%r15\n\t"
        "# A[12] x A[3]\n\t"
        "mulxq	96(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r15\n\t"
        "adoxq	%%rcx, %%r10\n\t"
        "# A[13] x A[3]\n\t"
        "mulxq	104(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r10\n\t"
        "adoxq	%%rcx, %%r11\n\t"
        "movq	%%r12, 96(%[tmp])\n\t"
        "movq	%%r13, 104(%[tmp])\n\t"
        "movq	%%r14, 112(%[tmp])\n\t"
        "movq	%%r15, 120(%[tmp])\n\t"
        "movq	%%r10, 128(%[tmp])\n\t"
        "movq	144(%[tmp]), %%r12\n\t"
        "movq	152(%[tmp]), %%r13\n\t"
        "movq	160(%[tmp]), %%r14\n\t"
        "movq	%%r8, %%r15\n\t"
        "movq	%%r8, %%r10\n\t"
        "# A[13] x A[4]\n\t"
        "movq	104(%[a]), %%rdx\n\t"
        "mulxq	32(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r11\n\t"
        "adoxq	%%rcx, %%r12\n\t"
        "# A[13] x A[5]\n\t"
        "mulxq	40(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r12\n\t"
        "adoxq	%%rcx, %%r13\n\t"
        "# A[13] x A[6]\n\t"
        "mulxq	48(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r13\n\t"
        "adoxq	%%rcx, %%r14\n\t"
        "# A[13] x A[7]\n\t"
        "mulxq	56(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r14\n\t"
        "adoxq	%%rcx, %%r15\n\t"
        "# A[13] x A[8]\n\t"
        "mulxq	64(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r15\n\t"
        "adoxq	%%rcx, %%r10\n\t"
        "movq	%%r11, 136(%[tmp])\n\t"
        "movq	%%r12, 144(%[tmp])\n\t"
        "movq	%%r13, 152(%[tmp])\n\t"
        "movq	%%r14, 160(%[tmp])\n\t"
        "movq	%%r15, 168(%[tmp])\n\t"
        "#  Carry\n\t"
        "adcxq	%%r9, %%r10\n\t"
        "movq	%%r8, %%r9\n\t"
        "adcxq	%%r8, %%r9\n\t"
        "adoxq	%%r8, %%r9\n\t"
        "movq	%%r10, 176(%[tmp])\n\t"
        "xorq	%%r8, %%r8\n\t"
        "# Diagonal 5\n\t"
        "movq	72(%[tmp]), %%r10\n\t"
        "movq	80(%[tmp]), %%r11\n\t"
        "movq	88(%[tmp]), %%r12\n\t"
        "movq	96(%[tmp]), %%r13\n\t"
        "movq	104(%[tmp]), %%r14\n\t"
        "movq	112(%[tmp]), %%r15\n\t"
        "# A[5] x A[4]\n\t"
        "movq	32(%[a]), %%rdx\n\t"
        "mulxq	40(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r10\n\t"
        "adoxq	%%rcx, %%r11\n\t"
        "# A[6] x A[4]\n\t"
        "mulxq	48(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r11\n\t"
        "adoxq	%%rcx, %%r12\n\t"
        "# A[7] x A[4]\n\t"
        "mulxq	56(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r12\n\t"
        "adoxq	%%rcx, %%r13\n\t"
        "# A[8] x A[4]\n\t"
        "mulxq	64(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r13\n\t"
        "adoxq	%%rcx, %%r14\n\t"
        "# A[9] x A[4]\n\t"
        "mulxq	72(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r14\n\t"
        "adoxq	%%rcx, %%r15\n\t"
        "movq	%%r10, 72(%[tmp])\n\t"
        "movq	%%r11, 80(%[tmp])\n\t"
        "movq	%%r12, 88(%[tmp])\n\t"
        "movq	%%r13, 96(%[tmp])\n\t"
        "movq	%%r14, 104(%[tmp])\n\t"
        "movq	120(%[tmp]), %%r10\n\t"
        "movq	128(%[tmp]), %%r11\n\t"
        "movq	136(%[tmp]), %%r12\n\t"
        "movq	144(%[tmp]), %%r13\n\t"
        "movq	152(%[tmp]), %%r14\n\t"
        "# A[10] x A[4]\n\t"
        "mulxq	80(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r15\n\t"
        "adoxq	%%rcx, %%r10\n\t"
        "# A[11] x A[4]\n\t"
        "mulxq	88(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r10\n\t"
        "adoxq	%%rcx, %%r11\n\t"
        "# A[12] x A[4]\n\t"
        "mulxq	96(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r11\n\t"
        "adoxq	%%rcx, %%r12\n\t"
        "# A[12] x A[5]\n\t"
        "movq	96(%[a]), %%rdx\n\t"
        "mulxq	40(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r12\n\t"
        "adoxq	%%rcx, %%r13\n\t"
        "# A[12] x A[6]\n\t"
        "mulxq	48(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r13\n\t"
        "adoxq	%%rcx, %%r14\n\t"
        "movq	%%r15, 112(%[tmp])\n\t"
        "movq	%%r10, 120(%[tmp])\n\t"
        "movq	%%r11, 128(%[tmp])\n\t"
        "movq	%%r12, 136(%[tmp])\n\t"
        "movq	%%r13, 144(%[tmp])\n\t"
        "movq	160(%[tmp]), %%r15\n\t"
        "movq	168(%[tmp]), %%r10\n\t"
        "movq	176(%[tmp]), %%r11\n\t"
        "movq	%%r8, %%r12\n\t"
        "movq	%%r8, %%r13\n\t"
        "# A[12] x A[7]\n\t"
        "mulxq	56(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r14\n\t"
        "adoxq	%%rcx, %%r15\n\t"
        "# A[12] x A[8]\n\t"
        "mulxq	64(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r15\n\t"
        "adoxq	%%rcx, %%r10\n\t"
        "# A[12] x A[9]\n\t"
        "mulxq	72(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r10\n\t"
        "adoxq	%%rcx, %%r11\n\t"
        "# A[12] x A[10]\n\t"
        "mulxq	80(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r11\n\t"
        "adoxq	%%rcx, %%r12\n\t"
        "# A[12] x A[11]\n\t"
        "mulxq	88(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r12\n\t"
        "adoxq	%%rcx, %%r13\n\t"
        "movq	%%r14, 152(%[tmp])\n\t"
        "movq	%%r15, 160(%[tmp])\n\t"
        "movq	%%r10, 168(%[tmp])\n\t"
        "movq	%%r11, 176(%[tmp])\n\t"
        "movq	%%r12, 184(%[tmp])\n\t"
        "#  Carry\n\t"
        "adcxq	%%r9, %%r13\n\t"
        "movq	%%r8, %%r9\n\t"
        "adcxq	%%r8, %%r9\n\t"
        "adoxq	%%r8, %%r9\n\t"
        "movq	%%r13, 192(%[tmp])\n\t"
        "xorq	%%r8, %%r8\n\t"
        "# Diagonal 6\n\t"
        "movq	88(%[tmp]), %%r13\n\t"
        "movq	96(%[tmp]), %%r14\n\t"
        "movq	104(%[tmp]), %%r15\n\t"
        "movq	112(%[tmp]), %%r10\n\t"
        "movq	120(%[tmp]), %%r11\n\t"
        "movq	128(%[tmp]), %%r12\n\t"
        "# A[6] x A[5]\n\t"
        "movq	40(%[a]), %%rdx\n\t"
        "mulxq	48(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r13\n\t"
        "adoxq	%%rcx, %%r14\n\t"
        "# A[7] x A[5]\n\t"
        "mulxq	56(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r14\n\t"
        "adoxq	%%rcx, %%r15\n\t"
        "# A[8] x A[5]\n\t"
        "mulxq	64(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r15\n\t"
        "adoxq	%%rcx, %%r10\n\t"
        "# A[9] x A[5]\n\t"
        "mulxq	72(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r10\n\t"
        "adoxq	%%rcx, %%r11\n\t"
        "# A[10] x A[5]\n\t"
        "mulxq	80(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r11\n\t"
        "adoxq	%%rcx, %%r12\n\t"
        "movq	%%r13, 88(%[tmp])\n\t"
        "movq	%%r14, 96(%[tmp])\n\t"
        "movq	%%r15, 104(%[tmp])\n\t"
        "movq	%%r10, 112(%[tmp])\n\t"
        "movq	%%r11, 120(%[tmp])\n\t"
        "movq	136(%[tmp]), %%r13\n\t"
        "movq	144(%[tmp]), %%r14\n\t"
        "movq	152(%[tmp]), %%r15\n\t"
        "movq	160(%[tmp]), %%r10\n\t"
        "movq	168(%[tmp]), %%r11\n\t"
        "# A[11] x A[5]\n\t"
        "mulxq	88(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r12\n\t"
        "adoxq	%%rcx, %%r13\n\t"
        "# A[11] x A[6]\n\t"
        "movq	88(%[a]), %%rdx\n\t"
        "mulxq	48(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r13\n\t"
        "adoxq	%%rcx, %%r14\n\t"
        "# A[11] x A[7]\n\t"
        "mulxq	56(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r14\n\t"
        "adoxq	%%rcx, %%r15\n\t"
        "# A[11] x A[8]\n\t"
        "mulxq	64(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r15\n\t"
        "adoxq	%%rcx, %%r10\n\t"
        "# A[11] x A[9]\n\t"
        "mulxq	72(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r10\n\t"
        "adoxq	%%rcx, %%r11\n\t"
        "movq	%%r12, 128(%[tmp])\n\t"
        "movq	%%r13, 136(%[tmp])\n\t"
        "movq	%%r14, 144(%[tmp])\n\t"
        "movq	%%r15, 152(%[tmp])\n\t"
        "movq	%%r10, 160(%[tmp])\n\t"
        "movq	176(%[tmp]), %%r12\n\t"
        "movq	184(%[tmp]), %%r13\n\t"
        "movq	192(%[tmp]), %%r14\n\t"
        "movq	%%r8, %%r15\n\t"
        "movq	%%r8, %%r10\n\t"
        "# A[11] x A[10]\n\t"
        "mulxq	80(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r11\n\t"
        "adoxq	%%rcx, %%r12\n\t"
        "# A[13] x A[9]\n\t"
        "movq	104(%[a]), %%rdx\n\t"
        "mulxq	72(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r12\n\t"
        "adoxq	%%rcx, %%r13\n\t"
        "# A[13] x A[10]\n\t"
        "mulxq	80(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r13\n\t"
        "adoxq	%%rcx, %%r14\n\t"
        "# A[13] x A[11]\n\t"
        "mulxq	88(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r14\n\t"
        "adoxq	%%rcx, %%r15\n\t"
        "# A[13] x A[12]\n\t"
        "mulxq	96(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r15\n\t"
        "adoxq	%%rcx, %%r10\n\t"
        "movq	%%r11, 168(%[tmp])\n\t"
        "movq	%%r12, 176(%[tmp])\n\t"
        "movq	%%r13, 184(%[tmp])\n\t"
        "movq	%%r14, 192(%[tmp])\n\t"
        "movq	%%r15, 200(%[tmp])\n\t"
        "#  Carry\n\t"
        "adcxq	%%r9, %%r10\n\t"
        "movq	%%r8, %%r9\n\t"
        "adcxq	%%r8, %%r9\n\t"
        "adoxq	%%r8, %%r9\n\t"
        "movq	%%r10, 208(%[tmp])\n\t"
        "xorq	%%r8, %%r8\n\t"
        "# Diagonal 7\n\t"
        "movq	104(%[tmp]), %%r10\n\t"
        "movq	112(%[tmp]), %%r11\n\t"
        "movq	120(%[tmp]), %%r12\n\t"
        "movq	128(%[tmp]), %%r13\n\t"
        "movq	136(%[tmp]), %%r14\n\t"
        "movq	144(%[tmp]), %%r15\n\t"
        "# A[7] x A[6]\n\t"
        "movq	48(%[a]), %%rdx\n\t"
        "mulxq	56(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r10\n\t"
        "adoxq	%%rcx, %%r11\n\t"
        "# A[8] x A[6]\n\t"
        "mulxq	64(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r11\n\t"
        "adoxq	%%rcx, %%r12\n\t"
        "# A[9] x A[6]\n\t"
        "mulxq	72(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r12\n\t"
        "adoxq	%%rcx, %%r13\n\t"
        "# A[10] x A[6]\n\t"
        "mulxq	80(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r13\n\t"
        "adoxq	%%rcx, %%r14\n\t"
        "# A[10] x A[7]\n\t"
        "movq	80(%[a]), %%rdx\n\t"
        "mulxq	56(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r14\n\t"
        "adoxq	%%rcx, %%r15\n\t"
        "movq	%%r10, 104(%[tmp])\n\t"
        "movq	%%r11, 112(%[tmp])\n\t"
        "movq	%%r12, 120(%[tmp])\n\t"
        "movq	%%r13, 128(%[tmp])\n\t"
        "movq	%%r14, 136(%[tmp])\n\t"
        "movq	152(%[tmp]), %%r10\n\t"
        "movq	160(%[tmp]), %%r11\n\t"
        "movq	168(%[tmp]), %%r12\n\t"
        "movq	176(%[tmp]), %%r13\n\t"
        "movq	184(%[tmp]), %%r14\n\t"
        "# A[10] x A[8]\n\t"
        "mulxq	64(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r15\n\t"
        "adoxq	%%rcx, %%r10\n\t"
        "# A[10] x A[9]\n\t"
        "mulxq	72(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r10\n\t"
        "adoxq	%%rcx, %%r11\n\t"
        "# A[14] x A[6]\n\t"
        "movq	112(%[a]), %%rdx\n\t"
        "mulxq	48(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r11\n\t"
        "adoxq	%%rcx, %%r12\n\t"
        "# A[14] x A[7]\n\t"
        "mulxq	56(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r12\n\t"
        "adoxq	%%rcx, %%r13\n\t"
        "# A[14] x A[8]\n\t"
        "mulxq	64(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r13\n\t"
        "adoxq	%%rcx, %%r14\n\t"
        "movq	%%r15, 144(%[tmp])\n\t"
        "movq	%%r10, 152(%[tmp])\n\t"
        "movq	%%r11, 160(%[tmp])\n\t"
        "movq	%%r12, 168(%[tmp])\n\t"
        "movq	%%r13, 176(%[tmp])\n\t"
        "movq	192(%[tmp]), %%r15\n\t"
        "movq	200(%[tmp]), %%r10\n\t"
        "movq	208(%[tmp]), %%r11\n\t"
        "movq	%%r8, %%r12\n\t"
        "movq	%%r8, %%r13\n\t"
        "# A[14] x A[9]\n\t"
        "mulxq	72(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r14\n\t"
        "adoxq	%%rcx, %%r15\n\t"
        "# A[14] x A[10]\n\t"
        "mulxq	80(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r15\n\t"
        "adoxq	%%rcx, %%r10\n\t"
        "# A[14] x A[11]\n\t"
        "mulxq	88(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r10\n\t"
        "adoxq	%%rcx, %%r11\n\t"
        "# A[14] x A[12]\n\t"
        "mulxq	96(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r11\n\t"
        "adoxq	%%rcx, %%r12\n\t"
        "# A[14] x A[13]\n\t"
        "mulxq	104(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r12\n\t"
        "adoxq	%%rcx, %%r13\n\t"
        "movq	%%r14, 184(%[tmp])\n\t"
        "movq	%%r15, 192(%[tmp])\n\t"
        "movq	%%r10, 200(%[tmp])\n\t"
        "movq	%%r11, 208(%[tmp])\n\t"
        "movq	%%r12, 216(%[tmp])\n\t"
        "#  Carry\n\t"
        "adcxq	%%r9, %%r13\n\t"
        "movq	%%r8, %%r9\n\t"
        "adcxq	%%r8, %%r9\n\t"
        "adoxq	%%r8, %%r9\n\t"
        "movq	%%r13, 224(%[tmp])\n\t"
        "xorq	%%r8, %%r8\n\t"
        "# Diagonal 8\n\t"
        "movq	120(%[tmp]), %%r13\n\t"
        "movq	128(%[tmp]), %%r14\n\t"
        "movq	136(%[tmp]), %%r15\n\t"
        "movq	144(%[tmp]), %%r10\n\t"
        "movq	152(%[tmp]), %%r11\n\t"
        "movq	160(%[tmp]), %%r12\n\t"
        "# A[8] x A[7]\n\t"
        "movq	56(%[a]), %%rdx\n\t"
        "mulxq	64(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r13\n\t"
        "adoxq	%%rcx, %%r14\n\t"
        "# A[9] x A[7]\n\t"
        "mulxq	72(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r14\n\t"
        "adoxq	%%rcx, %%r15\n\t"
        "# A[9] x A[8]\n\t"
        "movq	64(%[a]), %%rdx\n\t"
        "mulxq	72(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r15\n\t"
        "adoxq	%%rcx, %%r10\n\t"
        "# A[15] x A[3]\n\t"
        "movq	120(%[a]), %%rdx\n\t"
        "mulxq	24(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r10\n\t"
        "adoxq	%%rcx, %%r11\n\t"
        "# A[15] x A[4]\n\t"
        "mulxq	32(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r11\n\t"
        "adoxq	%%rcx, %%r12\n\t"
        "movq	%%r13, 120(%[tmp])\n\t"
        "movq	%%r14, 128(%[tmp])\n\t"
        "movq	%%r15, 136(%[tmp])\n\t"
        "movq	%%r10, 144(%[tmp])\n\t"
        "movq	%%r11, 152(%[tmp])\n\t"
        "movq	168(%[tmp]), %%r13\n\t"
        "movq	176(%[tmp]), %%r14\n\t"
        "movq	184(%[tmp]), %%r15\n\t"
        "movq	192(%[tmp]), %%r10\n\t"
        "movq	200(%[tmp]), %%r11\n\t"
        "# A[15] x A[5]\n\t"
        "mulxq	40(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r12\n\t"
        "adoxq	%%rcx, %%r13\n\t"
        "# A[15] x A[6]\n\t"
        "mulxq	48(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r13\n\t"
        "adoxq	%%rcx, %%r14\n\t"
        "# A[15] x A[7]\n\t"
        "mulxq	56(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r14\n\t"
        "adoxq	%%rcx, %%r15\n\t"
        "# A[15] x A[8]\n\t"
        "mulxq	64(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r15\n\t"
        "adoxq	%%rcx, %%r10\n\t"
        "# A[15] x A[9]\n\t"
        "mulxq	72(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r10\n\t"
        "adoxq	%%rcx, %%r11\n\t"
        "movq	%%r12, 160(%[tmp])\n\t"
        "movq	%%r13, 168(%[tmp])\n\t"
        "movq	%%r14, 176(%[tmp])\n\t"
        "movq	%%r15, 184(%[tmp])\n\t"
        "movq	%%r10, 192(%[tmp])\n\t"
        "movq	208(%[tmp]), %%r12\n\t"
        "movq	216(%[tmp]), %%r13\n\t"
        "movq	224(%[tmp]), %%r14\n\t"
        "movq	%%r8, %%r15\n\t"
        "movq	%%r8, %%r10\n\t"
        "# A[15] x A[10]\n\t"
        "mulxq	80(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r11\n\t"
        "adoxq	%%rcx, %%r12\n\t"
        "# A[15] x A[11]\n\t"
        "mulxq	88(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r12\n\t"
        "adoxq	%%rcx, %%r13\n\t"
        "# A[15] x A[12]\n\t"
        "mulxq	96(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r13\n\t"
        "adoxq	%%rcx, %%r14\n\t"
        "# A[15] x A[13]\n\t"
        "mulxq	104(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r14\n\t"
        "adoxq	%%rcx, %%r15\n\t"
        "# A[15] x A[14]\n\t"
        "mulxq	112(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r15\n\t"
        "adoxq	%%rcx, %%r10\n\t"
        "movq	%%r11, 200(%[tmp])\n\t"
        "movq	%%r12, 208(%[tmp])\n\t"
        "movq	%%r13, 216(%[tmp])\n\t"
        "movq	%%r14, 224(%[tmp])\n\t"
        "movq	%%r15, 232(%[tmp])\n\t"
        "#  Carry\n\t"
        "adcxq	%%r9, %%r10\n\t"
        "movq	%%r8, %%r9\n\t"
        "adcxq	%%r8, %%r9\n\t"
        "adoxq	%%r8, %%r9\n\t"
        "movq	%%r10, 240(%[tmp])\n\t"
        "movq	%%r9, 248(%[tmp])\n\t"
        "xorq	%%r8, %%r8\n\t"
        "# Double and Add in A[i] x A[i]\n\t"
        "movq	8(%[tmp]), %%r11\n\t"
        "# A[0] x A[0]\n\t"
        "movq	0(%[a]), %%rdx\n\t"
        "mulxq	%%rdx, %%rax, %%rcx\n\t"
        "movq	%%rax, 0(%[tmp])\n\t"
        "adoxq	%%r11, %%r11\n\t"
        "adcxq	%%rcx, %%r11\n\t"
        "movq	%%r11, 8(%[tmp])\n\t"
        "movq	16(%[tmp]), %%r10\n\t"
        "movq	24(%[tmp]), %%r11\n\t"
        "# A[1] x A[1]\n\t"
        "movq	8(%[a]), %%rdx\n\t"
        "mulxq	%%rdx, %%rax, %%rcx\n\t"
        "adoxq	%%r10, %%r10\n\t"
        "adcxq	%%rax, %%r10\n\t"
        "adoxq	%%r11, %%r11\n\t"
        "adcxq	%%rcx, %%r11\n\t"
        "movq	%%r10, 16(%[tmp])\n\t"
        "movq	%%r11, 24(%[tmp])\n\t"
        "movq	32(%[tmp]), %%r10\n\t"
        "movq	40(%[tmp]), %%r11\n\t"
        "# A[2] x A[2]\n\t"
        "movq	16(%[a]), %%rdx\n\t"
        "mulxq	%%rdx, %%rax, %%rcx\n\t"
        "adoxq	%%r10, %%r10\n\t"
        "adcxq	%%rax, %%r10\n\t"
        "adoxq	%%r11, %%r11\n\t"
        "adcxq	%%rcx, %%r11\n\t"
        "movq	%%r10, 32(%[tmp])\n\t"
        "movq	%%r11, 40(%[tmp])\n\t"
        "movq	48(%[tmp]), %%r10\n\t"
        "movq	56(%[tmp]), %%r11\n\t"
        "# A[3] x A[3]\n\t"
        "movq	24(%[a]), %%rdx\n\t"
        "mulxq	%%rdx, %%rax, %%rcx\n\t"
        "adoxq	%%r10, %%r10\n\t"
        "adcxq	%%rax, %%r10\n\t"
        "adoxq	%%r11, %%r11\n\t"
        "adcxq	%%rcx, %%r11\n\t"
        "movq	%%r10, 48(%[tmp])\n\t"
        "movq	%%r11, 56(%[tmp])\n\t"
        "movq	64(%[tmp]), %%r10\n\t"
        "movq	72(%[tmp]), %%r11\n\t"
        "# A[4] x A[4]\n\t"
        "movq	32(%[a]), %%rdx\n\t"
        "mulxq	%%rdx, %%rax, %%rcx\n\t"
        "adoxq	%%r10, %%r10\n\t"
        "adcxq	%%rax, %%r10\n\t"
        "adoxq	%%r11, %%r11\n\t"
        "adcxq	%%rcx, %%r11\n\t"
        "movq	%%r10, 64(%[tmp])\n\t"
        "movq	%%r11, 72(%[tmp])\n\t"
        "movq	80(%[tmp]), %%r10\n\t"
        "movq	88(%[tmp]), %%r11\n\t"
        "# A[5] x A[5]\n\t"
        "movq	40(%[a]), %%rdx\n\t"
        "mulxq	%%rdx, %%rax, %%rcx\n\t"
        "adoxq	%%r10, %%r10\n\t"
        "adcxq	%%rax, %%r10\n\t"
        "adoxq	%%r11, %%r11\n\t"
        "adcxq	%%rcx, %%r11\n\t"
        "movq	%%r10, 80(%[tmp])\n\t"
        "movq	%%r11, 88(%[tmp])\n\t"
        "movq	96(%[tmp]), %%r10\n\t"
        "movq	104(%[tmp]), %%r11\n\t"
        "# A[6] x A[6]\n\t"
        "movq	48(%[a]), %%rdx\n\t"
        "mulxq	%%rdx, %%rax, %%rcx\n\t"
        "adoxq	%%r10, %%r10\n\t"
        "adcxq	%%rax, %%r10\n\t"
        "adoxq	%%r11, %%r11\n\t"
        "adcxq	%%rcx, %%r11\n\t"
        "movq	%%r10, 96(%[tmp])\n\t"
        "movq	%%r11, 104(%[tmp])\n\t"
        "movq	112(%[tmp]), %%r10\n\t"
        "movq	120(%[tmp]), %%r11\n\t"
        "# A[7] x A[7]\n\t"
        "movq	56(%[a]), %%rdx\n\t"
        "mulxq	%%rdx, %%rax, %%rcx\n\t"
        "adoxq	%%r10, %%r10\n\t"
        "adcxq	%%rax, %%r10\n\t"
        "adoxq	%%r11, %%r11\n\t"
        "adcxq	%%rcx, %%r11\n\t"
        "movq	%%r10, 112(%[tmp])\n\t"
        "movq	%%r11, 120(%[tmp])\n\t"
        "movq	128(%[tmp]), %%r10\n\t"
        "movq	136(%[tmp]), %%r11\n\t"
        "# A[8] x A[8]\n\t"
        "movq	64(%[a]), %%rdx\n\t"
        "mulxq	%%rdx, %%rax, %%rcx\n\t"
        "adoxq	%%r10, %%r10\n\t"
        "adcxq	%%rax, %%r10\n\t"
        "adoxq	%%r11, %%r11\n\t"
        "adcxq	%%rcx, %%r11\n\t"
        "movq	%%r10, 128(%[r])\n\t"
        "movq	%%r11, 136(%[r])\n\t"
        "movq	144(%[tmp]), %%r10\n\t"
        "movq	152(%[tmp]), %%r11\n\t"
        "# A[9] x A[9]\n\t"
        "movq	72(%[a]), %%rdx\n\t"
        "mulxq	%%rdx, %%rax, %%rcx\n\t"
        "adoxq	%%r10, %%r10\n\t"
        "adcxq	%%rax, %%r10\n\t"
        "adoxq	%%r11, %%r11\n\t"
        "adcxq	%%rcx, %%r11\n\t"
        "movq	%%r10, 144(%[r])\n\t"
        "movq	%%r11, 152(%[r])\n\t"
        "movq	160(%[tmp]), %%r10\n\t"
        "movq	168(%[tmp]), %%r11\n\t"
        "# A[10] x A[10]\n\t"
        "movq	80(%[a]), %%rdx\n\t"
        "mulxq	%%rdx, %%rax, %%rcx\n\t"
        "adoxq	%%r10, %%r10\n\t"
        "adcxq	%%rax, %%r10\n\t"
        "adoxq	%%r11, %%r11\n\t"
        "adcxq	%%rcx, %%r11\n\t"
        "movq	%%r10, 160(%[r])\n\t"
        "movq	%%r11, 168(%[r])\n\t"
        "movq	176(%[tmp]), %%r10\n\t"
        "movq	184(%[tmp]), %%r11\n\t"
        "# A[11] x A[11]\n\t"
        "movq	88(%[a]), %%rdx\n\t"
        "mulxq	%%rdx, %%rax, %%rcx\n\t"
        "adoxq	%%r10, %%r10\n\t"
        "adcxq	%%rax, %%r10\n\t"
        "adoxq	%%r11, %%r11\n\t"
        "adcxq	%%rcx, %%r11\n\t"
        "movq	%%r10, 176(%[r])\n\t"
        "movq	%%r11, 184(%[r])\n\t"
        "movq	192(%[tmp]), %%r10\n\t"
        "movq	200(%[tmp]), %%r11\n\t"
        "# A[12] x A[12]\n\t"
        "movq	96(%[a]), %%rdx\n\t"
        "mulxq	%%rdx, %%rax, %%rcx\n\t"
        "adoxq	%%r10, %%r10\n\t"
        "adcxq	%%rax, %%r10\n\t"
        "adoxq	%%r11, %%r11\n\t"
        "adcxq	%%rcx, %%r11\n\t"
        "movq	%%r10, 192(%[r])\n\t"
        "movq	%%r11, 200(%[r])\n\t"
        "movq	208(%[tmp]), %%r10\n\t"
        "movq	216(%[tmp]), %%r11\n\t"
        "# A[13] x A[13]\n\t"
        "movq	104(%[a]), %%rdx\n\t"
        "mulxq	%%rdx, %%rax, %%rcx\n\t"
        "adoxq	%%r10, %%r10\n\t"
        "adcxq	%%rax, %%r10\n\t"
        "adoxq	%%r11, %%r11\n\t"
        "adcxq	%%rcx, %%r11\n\t"
        "movq	%%r10, 208(%[r])\n\t"
        "movq	%%r11, 216(%[r])\n\t"
        "movq	224(%[tmp]), %%r10\n\t"
        "movq	232(%[tmp]), %%r11\n\t"
        "# A[14] x A[14]\n\t"
        "movq	112(%[a]), %%rdx\n\t"
        "mulxq	%%rdx, %%rax, %%rcx\n\t"
        "adoxq	%%r10, %%r10\n\t"
        "adcxq	%%rax, %%r10\n\t"
        "adoxq	%%r11, %%r11\n\t"
        "adcxq	%%rcx, %%r11\n\t"
        "movq	%%r10, 224(%[r])\n\t"
        "movq	%%r11, 232(%[r])\n\t"
        "movq	240(%[tmp]), %%r10\n\t"
        "movq	248(%[tmp]), %%r11\n\t"
        "# A[15] x A[15]\n\t"
        "movq	120(%[a]), %%rdx\n\t"
        "mulxq	%%rdx, %%rax, %%rcx\n\t"
        "adoxq	%%r10, %%r10\n\t"
        "adcxq	%%rax, %%r10\n\t"
        "adoxq	%%r11, %%r11\n\t"
        "adcxq	%%rcx, %%r11\n\t"
        "movq	%%r10, 240(%[r])\n\t"
        "movq	%%r11, 248(%[r])\n\t"
        :
        : [r] "r" (r), [a] "r" (a), [tmp] "r" (tmp)
        : "memory", "rax", "rdx", "rcx", "r8", "r9", "r10", "r11",
          "r12", "r13", "r14", "r15"
    );

    XMEMCPY(r, tmp, sizeof(tmp)/2);
}
#endif /* HAVE_INTEL_AVX2 */

/* Add b to a into r. (r = a + b)
 *
 * Adds two 16-digit numbers. Each 8-byte digit at byte offsets 0..120 of a
 * and b is loaded, added with the running carry and stored to the same
 * offset of r, lowest digit first.
 * Every digit is read before the matching digit of r is written, so r may
 * be the same array as a and/or b.
 *
 * r  A single precision integer: receives the 16-digit sum.
 * a  A single precision integer: first operand (16 digits are read).
 * b  A single precision integer: second operand (16 digits are read).
 *
 * returns the carry out of the top digit: 0 or 1.
 */
SP_NOINLINE static sp_digit sp_2048_add_16(sp_digit* r, const sp_digit* a,
        const sp_digit* b)
{
    sp_digit c = 0;

    __asm__ __volatile__ (
        /* Lowest digit: plain add starts the carry chain. */
        "movq	(%[a]), %%rax\n\t"
        "addq	(%[b]), %%rax\n\t"
        "movq	%%rax, (%[r])\n\t"
        /* Remaining digits: add-with-carry continues the chain. */
        "movq	8(%[a]), %%rax\n\t"
        "adcq	8(%[b]), %%rax\n\t"
        "movq	%%rax, 8(%[r])\n\t"
        "movq	16(%[a]), %%rax\n\t"
        "adcq	16(%[b]), %%rax\n\t"
        "movq	%%rax, 16(%[r])\n\t"
        "movq	24(%[a]), %%rax\n\t"
        "adcq	24(%[b]), %%rax\n\t"
        "movq	%%rax, 24(%[r])\n\t"
        "movq	32(%[a]), %%rax\n\t"
        "adcq	32(%[b]), %%rax\n\t"
        "movq	%%rax, 32(%[r])\n\t"
        "movq	40(%[a]), %%rax\n\t"
        "adcq	40(%[b]), %%rax\n\t"
        "movq	%%rax, 40(%[r])\n\t"
        "movq	48(%[a]), %%rax\n\t"
        "adcq	48(%[b]), %%rax\n\t"
        "movq	%%rax, 48(%[r])\n\t"
        "movq	56(%[a]), %%rax\n\t"
        "adcq	56(%[b]), %%rax\n\t"
        "movq	%%rax, 56(%[r])\n\t"
        "movq	64(%[a]), %%rax\n\t"
        "adcq	64(%[b]), %%rax\n\t"
        "movq	%%rax, 64(%[r])\n\t"
        "movq	72(%[a]), %%rax\n\t"
        "adcq	72(%[b]), %%rax\n\t"
        "movq	%%rax, 72(%[r])\n\t"
        "movq	80(%[a]), %%rax\n\t"
        "adcq	80(%[b]), %%rax\n\t"
        "movq	%%rax, 80(%[r])\n\t"
        "movq	88(%[a]), %%rax\n\t"
        "adcq	88(%[b]), %%rax\n\t"
        "movq	%%rax, 88(%[r])\n\t"
        "movq	96(%[a]), %%rax\n\t"
        "adcq	96(%[b]), %%rax\n\t"
        "movq	%%rax, 96(%[r])\n\t"
        "movq	104(%[a]), %%rax\n\t"
        "adcq	104(%[b]), %%rax\n\t"
        "movq	%%rax, 104(%[r])\n\t"
        "movq	112(%[a]), %%rax\n\t"
        "adcq	112(%[b]), %%rax\n\t"
        "movq	%%rax, 112(%[r])\n\t"
        "movq	120(%[a]), %%rax\n\t"
        "adcq	120(%[b]), %%rax\n\t"
        "movq	%%rax, 120(%[r])\n\t"
        /* c was 0 on entry, so this leaves just the final carry flag in c. */
        "adcq	$0, %[c]\n\t"
        : [c] "+r" (c)
        : [r] "r" (r), [a] "r" (a), [b] "r" (b)
        : "memory", "rax"
    );

    return c;
}

/* Sub b from a into a. (a -= b)
 *
 * Subtracts two 32-digit numbers. Each 8-byte digit at byte offsets 0..248
 * of b is subtracted, with the running borrow, from the digit at the same
 * offset of a and the difference is written back to a, lowest digit first.
 * Loads for the next digit are interleaved with the subtract/store of the
 * current one, alternating between two register pairs (r8/rdx and r9/rcx).
 *
 * a  A single precision integer and result (32 digits).
 * b  A single precision integer (32 digits are read).
 *
 * returns 0 when there is no borrow out of the top digit, otherwise a value
 * with all bits set (0 - 1).
 */
SP_NOINLINE static sp_digit sp_2048_sub_in_place_32(sp_digit* a,
    const sp_digit* b)
{
    sp_digit c = 0;

    __asm__ __volatile__ (
        "movq	0(%[a]), %%r8\n\t"
        "movq	8(%[a]), %%r9\n\t"
        "movq	0(%[b]), %%rdx\n\t"
        "movq	8(%[b]), %%rcx\n\t"
        /* Lowest digit: plain subtract starts the borrow chain. */
        "subq	%%rdx, %%r8\n\t"
        "movq	16(%[b]), %%rdx\n\t"
        "movq	%%r8, 0(%[a])\n\t"
        "movq	16(%[a]), %%r8\n\t"
        "sbbq	%%rcx, %%r9\n\t"
        "movq	24(%[b]), %%rcx\n\t"
        "movq	%%r9, 8(%[a])\n\t"
        "movq	24(%[a]), %%r9\n\t"
        "sbbq	%%rdx, %%r8\n\t"
        "movq	32(%[b]), %%rdx\n\t"
        "movq	%%r8, 16(%[a])\n\t"
        "movq	32(%[a]), %%r8\n\t"
        "sbbq	%%rcx, %%r9\n\t"
        "movq	40(%[b]), %%rcx\n\t"
        "movq	%%r9, 24(%[a])\n\t"
        "movq	40(%[a]), %%r9\n\t"
        "sbbq	%%rdx, %%r8\n\t"
        "movq	48(%[b]), %%rdx\n\t"
        "movq	%%r8, 32(%[a])\n\t"
        "movq	48(%[a]), %%r8\n\t"
        "sbbq	%%rcx, %%r9\n\t"
        "movq	56(%[b]), %%rcx\n\t"
        "movq	%%r9, 40(%[a])\n\t"
        "movq	56(%[a]), %%r9\n\t"
        "sbbq	%%rdx, %%r8\n\t"
        "movq	64(%[b]), %%rdx\n\t"
        "movq	%%r8, 48(%[a])\n\t"
        "movq	64(%[a]), %%r8\n\t"
        "sbbq	%%rcx, %%r9\n\t"
        "movq	72(%[b]), %%rcx\n\t"
        "movq	%%r9, 56(%[a])\n\t"
        "movq	72(%[a]), %%r9\n\t"
        "sbbq	%%rdx, %%r8\n\t"
        "movq	80(%[b]), %%rdx\n\t"
        "movq	%%r8, 64(%[a])\n\t"
        "movq	80(%[a]), %%r8\n\t"
        "sbbq	%%rcx, %%r9\n\t"
        "movq	88(%[b]), %%rcx\n\t"
        "movq	%%r9, 72(%[a])\n\t"
        "movq	88(%[a]), %%r9\n\t"
        "sbbq	%%rdx, %%r8\n\t"
        "movq	96(%[b]), %%rdx\n\t"
        "movq	%%r8, 80(%[a])\n\t"
        "movq	96(%[a]), %%r8\n\t"
        "sbbq	%%rcx, %%r9\n\t"
        "movq	104(%[b]), %%rcx\n\t"
        "movq	%%r9, 88(%[a])\n\t"
        "movq	104(%[a]), %%r9\n\t"
        "sbbq	%%rdx, %%r8\n\t"
        "movq	112(%[b]), %%rdx\n\t"
        "movq	%%r8, 96(%[a])\n\t"
        "movq	112(%[a]), %%r8\n\t"
        "sbbq	%%rcx, %%r9\n\t"
        "movq	120(%[b]), %%rcx\n\t"
        "movq	%%r9, 104(%[a])\n\t"
        "movq	120(%[a]), %%r9\n\t"
        "sbbq	%%rdx, %%r8\n\t"
        "movq	128(%[b]), %%rdx\n\t"
        "movq	%%r8, 112(%[a])\n\t"
        "movq	128(%[a]), %%r8\n\t"
        "sbbq	%%rcx, %%r9\n\t"
        "movq	136(%[b]), %%rcx\n\t"
        "movq	%%r9, 120(%[a])\n\t"
        "movq	136(%[a]), %%r9\n\t"
        "sbbq	%%rdx, %%r8\n\t"
        "movq	144(%[b]), %%rdx\n\t"
        "movq	%%r8, 128(%[a])\n\t"
        "movq	144(%[a]), %%r8\n\t"
        "sbbq	%%rcx, %%r9\n\t"
        "movq	152(%[b]), %%rcx\n\t"
        "movq	%%r9, 136(%[a])\n\t"
        "movq	152(%[a]), %%r9\n\t"
        "sbbq	%%rdx, %%r8\n\t"
        "movq	160(%[b]), %%rdx\n\t"
        "movq	%%r8, 144(%[a])\n\t"
        "movq	160(%[a]), %%r8\n\t"
        "sbbq	%%rcx, %%r9\n\t"
        "movq	168(%[b]), %%rcx\n\t"
        "movq	%%r9, 152(%[a])\n\t"
        "movq	168(%[a]), %%r9\n\t"
        "sbbq	%%rdx, %%r8\n\t"
        "movq	176(%[b]), %%rdx\n\t"
        "movq	%%r8, 160(%[a])\n\t"
        "movq	176(%[a]), %%r8\n\t"
        "sbbq	%%rcx, %%r9\n\t"
        "movq	184(%[b]), %%rcx\n\t"
        "movq	%%r9, 168(%[a])\n\t"
        "movq	184(%[a]), %%r9\n\t"
        "sbbq	%%rdx, %%r8\n\t"
        "movq	192(%[b]), %%rdx\n\t"
        "movq	%%r8, 176(%[a])\n\t"
        "movq	192(%[a]), %%r8\n\t"
        "sbbq	%%rcx, %%r9\n\t"
        "movq	200(%[b]), %%rcx\n\t"
        "movq	%%r9, 184(%[a])\n\t"
        "movq	200(%[a]), %%r9\n\t"
        "sbbq	%%rdx, %%r8\n\t"
        "movq	208(%[b]), %%rdx\n\t"
        "movq	%%r8, 192(%[a])\n\t"
        "movq	208(%[a]), %%r8\n\t"
        "sbbq	%%rcx, %%r9\n\t"
        "movq	216(%[b]), %%rcx\n\t"
        "movq	%%r9, 200(%[a])\n\t"
        "movq	216(%[a]), %%r9\n\t"
        "sbbq	%%rdx, %%r8\n\t"
        "movq	224(%[b]), %%rdx\n\t"
        "movq	%%r8, 208(%[a])\n\t"
        "movq	224(%[a]), %%r8\n\t"
        "sbbq	%%rcx, %%r9\n\t"
        "movq	232(%[b]), %%rcx\n\t"
        "movq	%%r9, 216(%[a])\n\t"
        "movq	232(%[a]), %%r9\n\t"
        "sbbq	%%rdx, %%r8\n\t"
        "movq	240(%[b]), %%rdx\n\t"
        "movq	%%r8, 224(%[a])\n\t"
        "movq	240(%[a]), %%r8\n\t"
        "sbbq	%%rcx, %%r9\n\t"
        "movq	248(%[b]), %%rcx\n\t"
        "movq	%%r9, 232(%[a])\n\t"
        "movq	248(%[a]), %%r9\n\t"
        "sbbq	%%rdx, %%r8\n\t"
        "movq	%%r8, 240(%[a])\n\t"
        "sbbq	%%rcx, %%r9\n\t"
        "movq	%%r9, 248(%[a])\n\t"
        /* c was 0 on entry: c = 0 - 0 - borrow, i.e. 0 or all bits set. */
        "sbbq	$0, %[c]\n\t"
        : [c] "+r" (c)
        : [a] "r" (a), [b] "r" (b)
        : "memory", "rdx", "rcx", "r8", "r9"
    );

    return c;
}

/* Add b to a into r. (r = a + b)
 *
 * Adds two 32-digit numbers. Each 8-byte digit at byte offsets 0..248 of a
 * and b is loaded, added with the running carry and stored to the same
 * offset of r, lowest digit first.
 * Every digit is read before the matching digit of r is written, so r may
 * be the same array as a and/or b.
 *
 * r  A single precision integer: receives the 32-digit sum.
 * a  A single precision integer: first operand (32 digits are read).
 * b  A single precision integer: second operand (32 digits are read).
 *
 * returns the carry out of the top digit: 0 or 1.
 */
SP_NOINLINE static sp_digit sp_2048_add_32(sp_digit* r, const sp_digit* a,
        const sp_digit* b)
{
    sp_digit c = 0;

    __asm__ __volatile__ (
        /* Lowest digit: plain add starts the carry chain. */
        "movq	(%[a]), %%rax\n\t"
        "addq	(%[b]), %%rax\n\t"
        "movq	%%rax, (%[r])\n\t"
        /* Remaining digits: add-with-carry continues the chain. */
        "movq	8(%[a]), %%rax\n\t"
        "adcq	8(%[b]), %%rax\n\t"
        "movq	%%rax, 8(%[r])\n\t"
        "movq	16(%[a]), %%rax\n\t"
        "adcq	16(%[b]), %%rax\n\t"
        "movq	%%rax, 16(%[r])\n\t"
        "movq	24(%[a]), %%rax\n\t"
        "adcq	24(%[b]), %%rax\n\t"
        "movq	%%rax, 24(%[r])\n\t"
        "movq	32(%[a]), %%rax\n\t"
        "adcq	32(%[b]), %%rax\n\t"
        "movq	%%rax, 32(%[r])\n\t"
        "movq	40(%[a]), %%rax\n\t"
        "adcq	40(%[b]), %%rax\n\t"
        "movq	%%rax, 40(%[r])\n\t"
        "movq	48(%[a]), %%rax\n\t"
        "adcq	48(%[b]), %%rax\n\t"
        "movq	%%rax, 48(%[r])\n\t"
        "movq	56(%[a]), %%rax\n\t"
        "adcq	56(%[b]), %%rax\n\t"
        "movq	%%rax, 56(%[r])\n\t"
        "movq	64(%[a]), %%rax\n\t"
        "adcq	64(%[b]), %%rax\n\t"
        "movq	%%rax, 64(%[r])\n\t"
        "movq	72(%[a]), %%rax\n\t"
        "adcq	72(%[b]), %%rax\n\t"
        "movq	%%rax, 72(%[r])\n\t"
        "movq	80(%[a]), %%rax\n\t"
        "adcq	80(%[b]), %%rax\n\t"
        "movq	%%rax, 80(%[r])\n\t"
        "movq	88(%[a]), %%rax\n\t"
        "adcq	88(%[b]), %%rax\n\t"
        "movq	%%rax, 88(%[r])\n\t"
        "movq	96(%[a]), %%rax\n\t"
        "adcq	96(%[b]), %%rax\n\t"
        "movq	%%rax, 96(%[r])\n\t"
        "movq	104(%[a]), %%rax\n\t"
        "adcq	104(%[b]), %%rax\n\t"
        "movq	%%rax, 104(%[r])\n\t"
        "movq	112(%[a]), %%rax\n\t"
        "adcq	112(%[b]), %%rax\n\t"
        "movq	%%rax, 112(%[r])\n\t"
        "movq	120(%[a]), %%rax\n\t"
        "adcq	120(%[b]), %%rax\n\t"
        "movq	%%rax, 120(%[r])\n\t"
        "movq	128(%[a]), %%rax\n\t"
        "adcq	128(%[b]), %%rax\n\t"
        "movq	%%rax, 128(%[r])\n\t"
        "movq	136(%[a]), %%rax\n\t"
        "adcq	136(%[b]), %%rax\n\t"
        "movq	%%rax, 136(%[r])\n\t"
        "movq	144(%[a]), %%rax\n\t"
        "adcq	144(%[b]), %%rax\n\t"
        "movq	%%rax, 144(%[r])\n\t"
        "movq	152(%[a]), %%rax\n\t"
        "adcq	152(%[b]), %%rax\n\t"
        "movq	%%rax, 152(%[r])\n\t"
        "movq	160(%[a]), %%rax\n\t"
        "adcq	160(%[b]), %%rax\n\t"
        "movq	%%rax, 160(%[r])\n\t"
        "movq	168(%[a]), %%rax\n\t"
        "adcq	168(%[b]), %%rax\n\t"
        "movq	%%rax, 168(%[r])\n\t"
        "movq	176(%[a]), %%rax\n\t"
        "adcq	176(%[b]), %%rax\n\t"
        "movq	%%rax, 176(%[r])\n\t"
        "movq	184(%[a]), %%rax\n\t"
        "adcq	184(%[b]), %%rax\n\t"
        "movq	%%rax, 184(%[r])\n\t"
        "movq	192(%[a]), %%rax\n\t"
        "adcq	192(%[b]), %%rax\n\t"
        "movq	%%rax, 192(%[r])\n\t"
        "movq	200(%[a]), %%rax\n\t"
        "adcq	200(%[b]), %%rax\n\t"
        "movq	%%rax, 200(%[r])\n\t"
        "movq	208(%[a]), %%rax\n\t"
        "adcq	208(%[b]), %%rax\n\t"
        "movq	%%rax, 208(%[r])\n\t"
        "movq	216(%[a]), %%rax\n\t"
        "adcq	216(%[b]), %%rax\n\t"
        "movq	%%rax, 216(%[r])\n\t"
        "movq	224(%[a]), %%rax\n\t"
        "adcq	224(%[b]), %%rax\n\t"
        "movq	%%rax, 224(%[r])\n\t"
        "movq	232(%[a]), %%rax\n\t"
        "adcq	232(%[b]), %%rax\n\t"
        "movq	%%rax, 232(%[r])\n\t"
        "movq	240(%[a]), %%rax\n\t"
        "adcq	240(%[b]), %%rax\n\t"
        "movq	%%rax, 240(%[r])\n\t"
        "movq	248(%[a]), %%rax\n\t"
        "adcq	248(%[b]), %%rax\n\t"
        "movq	%%rax, 248(%[r])\n\t"
        /* c was 0 on entry, so this leaves just the final carry flag in c. */
        "adcq	$0, %[c]\n\t"
        : [c] "+r" (c)
        : [r] "r" (r), [a] "r" (a), [b] "r" (b)
        : "memory", "rax"
    );

    return c;
}

/* AND m into each word of a and store in r.
 *
 * Exactly 16 digits are processed. Each digit of a is read before the
 * matching digit of r is written, so r and a may be the same array.
 *
 * r  A single precision integer: receives the 16 masked digits.
 * a  A single precision integer: the 16 digits to mask (only read).
 * m  Mask to AND against each digit.
 */
static void sp_2048_mask_16(sp_digit* r, const sp_digit* a, sp_digit m)
{
    int i;

#ifdef WOLFSSL_SP_SMALL
    /* Small build: one digit per iteration. */
    for (i = 0; i < 16; i++) {
        r[i] = a[i] & m;
    }
#else
    /* Default build: two iterations, eight digits each. */
    for (i = 0; i < 16; i += 8) {
        r[i+0] = a[i+0] & m;
        r[i+1] = a[i+1] & m;
        r[i+2] = a[i+2] & m;
        r[i+3] = a[i+3] & m;
        r[i+4] = a[i+4] & m;
        r[i+5] = a[i+5] & m;
        r[i+6] = a[i+6] & m;
        r[i+7] = a[i+7] & m;
    }
#endif
}

/* Multiply a and b into r. (r = a * b)
 *
 * The 32-digit operands are split into 16-digit halves and the result is
 * assembled from three 16-digit products:
 *   low  = a_lo * b_lo
 *   high = a_hi * b_hi
 *   mid  = (a_lo + a_hi) * (b_lo + b_hi) - high - low
 * r is only written after every read of a and b has completed.
 *
 * r  A single precision integer: receives the 64-digit product.
 * a  A single precision integer: 32 digits.
 * b  A single precision integer: 32 digits.
 */
SP_NOINLINE static void sp_2048_mul_32(sp_digit* r, const sp_digit* a,
        const sp_digit* b)
{
    sp_digit sum_a[16];
    sp_digit sum_b[16];
    sp_digit mid[32];
    sp_digit high[32];
    sp_digit carry_a;
    sp_digit carry_b;
    sp_digit top;
    sp_digit* r_mid = r + 16;
    sp_digit* r_high = r + 32;
    const sp_digit* a_high = a + 16;
    const sp_digit* b_high = b + 16;

    /* Half sums; each carry is the 17th digit of its sum. */
    carry_a = sp_2048_add_16(sum_a, a, a_high);
    carry_b = sp_2048_add_16(sum_b, b, b_high);
    /* Product of the two carry digits. */
    top = carry_a & carry_b;

    /* The three half-size products; the low one lands directly in r. */
    sp_2048_mul_16(mid, sum_a, sum_b);
    sp_2048_mul_16(high, a_high, b_high);
    sp_2048_mul_16(r, a, b);

    /* Cross terms of the carry digits: sum_a if carry_b is set, plus sum_b
     * if carry_a is set. */
    sp_2048_mask_16(r_high, sum_a, 0 - carry_b);
    sp_2048_mask_16(sum_b, sum_b, 0 - carry_a);
    top += sp_2048_add_16(r_high, r_high, sum_b);

    /* mid -= high + low; a borrow comes back as all bits set and so
     * decrements top. */
    top += sp_2048_sub_in_place_32(mid, high);
    top += sp_2048_sub_in_place_32(mid, r);

    /* Add the middle term in at digit 16 and place the carries above it. */
    top += sp_2048_add_32(r_mid, r_mid, mid);
    r[48] = top;
    XMEMSET(&r[49], 0, 15 * sizeof(sp_digit));

    /* Finally add the high product in at digit 32. */
    sp_2048_add_32(r_high, r_high, high);
}

/* Square a and put result in r. (r = a * a)
 *
 * The 32-digit operand is split into 16-digit halves and the result is
 * assembled from three 16-digit squares:
 *   low  = a_lo * a_lo
 *   high = a_hi * a_hi
 *   mid  = (a_lo + a_hi)^2 - high - low
 * r is only written after every read of a has completed.
 *
 * r  A single precision integer: receives the 64-digit square.
 * a  A single precision integer: 32 digits.
 */
SP_NOINLINE static void sp_2048_sqr_32(sp_digit* r, const sp_digit* a)
{
    sp_digit sum[16];
    sp_digit mid[32];
    sp_digit high[32];
    sp_digit top;
    sp_digit* r_mid = r + 16;
    sp_digit* r_high = r + 32;
    const sp_digit* a_high = a + 16;

    /* Half sum; the carry is the 17th digit of the sum. */
    top = sp_2048_add_16(sum, a, a_high);

    /* The three half-size squares; the low one lands directly in r. */
    sp_2048_sqr_16(mid, sum);
    sp_2048_sqr_16(high, a_high);
    sp_2048_sqr_16(r, a);

    /* Cross term of the carry digit: twice sum when the carry is set. */
    sp_2048_mask_16(r_high, sum, 0 - top);
    top += sp_2048_add_16(r_high, r_high, r_high);

    /* mid -= high + low; a borrow comes back as all bits set and so
     * decrements top. */
    top += sp_2048_sub_in_place_32(mid, high);
    top += sp_2048_sub_in_place_32(mid, r);

    /* Add the middle term in at digit 16 and place the carries above it. */
    top += sp_2048_add_32(r_mid, r_mid, mid);
    r[48] = top;
    XMEMSET(&r[49], 0, 15 * sizeof(sp_digit));

    /* Finally add the high square in at digit 32. */
    sp_2048_add_32(r_high, r_high, high);
}

#ifdef HAVE_INTEL_AVX2
/* Multiply a and b into r. (r = a * b)
 *
 * Same construction as the non-AVX2 32-digit multiply, but the three
 * 16-digit products are computed with sp_2048_mul_avx2_16:
 *   low  = a_lo * b_lo
 *   high = a_hi * b_hi
 *   mid  = (a_lo + a_hi) * (b_lo + b_hi) - high - low
 * r is only written after every read of a and b has completed.
 *
 * r  A single precision integer: receives the 64-digit product.
 * a  A single precision integer: 32 digits.
 * b  A single precision integer: 32 digits.
 */
SP_NOINLINE static void sp_2048_mul_avx2_32(sp_digit* r, const sp_digit* a,
        const sp_digit* b)
{
    sp_digit sum_a[16];
    sp_digit sum_b[16];
    sp_digit mid[32];
    sp_digit high[32];
    sp_digit carry_a;
    sp_digit carry_b;
    sp_digit top;
    sp_digit* r_mid = r + 16;
    sp_digit* r_high = r + 32;
    const sp_digit* a_high = a + 16;
    const sp_digit* b_high = b + 16;

    /* Half sums; each carry is the 17th digit of its sum. */
    carry_a = sp_2048_add_16(sum_a, a, a_high);
    carry_b = sp_2048_add_16(sum_b, b, b_high);
    /* Product of the two carry digits. */
    top = carry_a & carry_b;

    /* The three half-size products; the low one lands directly in r. */
    sp_2048_mul_avx2_16(mid, sum_a, sum_b);
    sp_2048_mul_avx2_16(high, a_high, b_high);
    sp_2048_mul_avx2_16(r, a, b);

    /* Cross terms of the carry digits: sum_a if carry_b is set, plus sum_b
     * if carry_a is set. */
    sp_2048_mask_16(r_high, sum_a, 0 - carry_b);
    sp_2048_mask_16(sum_b, sum_b, 0 - carry_a);
    top += sp_2048_add_16(r_high, r_high, sum_b);

    /* mid -= high + low; a borrow comes back as all bits set and so
     * decrements top. */
    top += sp_2048_sub_in_place_32(mid, high);
    top += sp_2048_sub_in_place_32(mid, r);

    /* Add the middle term in at digit 16 and place the carries above it. */
    top += sp_2048_add_32(r_mid, r_mid, mid);
    r[48] = top;
    XMEMSET(&r[49], 0, 15 * sizeof(sp_digit));

    /* Finally add the high product in at digit 32. */
    sp_2048_add_32(r_high, r_high, high);
}
#endif /* HAVE_INTEL_AVX2 */

#ifdef HAVE_INTEL_AVX2
/* Square a and put result in r. (r = a * a)
 *
 * Same construction as the non-AVX2 32-digit square, but the three
 * 16-digit squares are computed with sp_2048_sqr_avx2_16:
 *   low  = a_lo * a_lo
 *   high = a_hi * a_hi
 *   mid  = (a_lo + a_hi)^2 - high - low
 * r is only written after every read of a has completed.
 *
 * r  A single precision integer: receives the 64-digit square.
 * a  A single precision integer: 32 digits.
 */
SP_NOINLINE static void sp_2048_sqr_avx2_32(sp_digit* r, const sp_digit* a)
{
    sp_digit sum[16];
    sp_digit mid[32];
    sp_digit high[32];
    sp_digit top;
    sp_digit* r_mid = r + 16;
    sp_digit* r_high = r + 32;
    const sp_digit* a_high = a + 16;

    /* Half sum; the carry is the 17th digit of the sum. */
    top = sp_2048_add_16(sum, a, a_high);

    /* The three half-size squares; the low one lands directly in r. */
    sp_2048_sqr_avx2_16(mid, sum);
    sp_2048_sqr_avx2_16(high, a_high);
    sp_2048_sqr_avx2_16(r, a);

    /* Cross term of the carry digit: twice sum when the carry is set. */
    sp_2048_mask_16(r_high, sum, 0 - top);
    top += sp_2048_add_16(r_high, r_high, r_high);

    /* mid -= high + low; a borrow comes back as all bits set and so
     * decrements top. */
    top += sp_2048_sub_in_place_32(mid, high);
    top += sp_2048_sub_in_place_32(mid, r);

    /* Add the middle term in at digit 16 and place the carries above it. */
    top += sp_2048_add_32(r_mid, r_mid, mid);
    r[48] = top;
    XMEMSET(&r[49], 0, 15 * sizeof(sp_digit));

    /* Finally add the high square in at digit 32. */
    sp_2048_add_32(r_high, r_high, high);
}
#endif /* HAVE_INTEL_AVX2 */

#if !defined(SP_RSA_PRIVATE_EXP_D) && defined(WOLFSSL_HAVE_SP_RSA)
#endif /* !SP_RSA_PRIVATE_EXP_D && WOLFSSL_HAVE_SP_RSA */

/* Calculate the bottom digit of -1/a mod 2^n.
 *
 * Only the lowest digit of a is read. An inverse of that digit modulo 2**4 is
 * formed directly and then refined four times; every refinement doubles the
 * number of correct low bits (8, 16, 32 and finally 64).
 *
 * a    A single precision number.
 * rho  Bottom word of inverse (receives the negated inverse).
 */
static void sp_2048_mont_setup(sp_digit* a, sp_digit* rho)
{
    const sp_digit lo = a[0];
    sp_digit inv;
    int step;

    /* Here inv*lo == 1 mod 2**4. */
    inv = lo + (((lo + 2) & 4) << 1);

    /* After the passes inv*lo == 1 mod 2**8, 2**16, 2**32, 2**64 in turn. */
    for (step = 0; step < 4; step++)
        inv *= 2 - lo * inv;

    /* rho = -1/a mod 2**64 */
    *rho = -inv;
}

#if !defined(SP_RSA_PRIVATE_EXP_D) && defined(WOLFSSL_HAVE_SP_RSA)
/* Sub b from a into a. (a -= b)
 *
 * Both operands are 16 digits long. Loads and stores are interleaved with the
 * subtract-with-borrow chain; movq leaves the flags untouched, so the borrow
 * runs unbroken from digit 0 up to digit 15.
 *
 * a  A single precision integer and result.
 * b  A single precision integer.
 * returns 0 when there is no borrow out of the top digit and a digit with all
 * bits set (-1) when there is.
 */
SP_NOINLINE static sp_digit sp_2048_sub_in_place_16(sp_digit* a,
    const sp_digit* b)
{
    sp_digit c = 0;

    __asm__ __volatile__ (
        /* r8/r9 alternate as the current digit of a, rdx/rcx as that of b. */
        "movq	0(%[a]), %%r8\n\t"
        "movq	8(%[a]), %%r9\n\t"
        "movq	0(%[b]), %%rdx\n\t"
        "movq	8(%[b]), %%rcx\n\t"
        "subq	%%rdx, %%r8\n\t"
        "movq	16(%[b]), %%rdx\n\t"
        "movq	%%r8, 0(%[a])\n\t"
        "movq	16(%[a]), %%r8\n\t"
        "sbbq	%%rcx, %%r9\n\t"
        "movq	24(%[b]), %%rcx\n\t"
        "movq	%%r9, 8(%[a])\n\t"
        "movq	24(%[a]), %%r9\n\t"
        "sbbq	%%rdx, %%r8\n\t"
        "movq	32(%[b]), %%rdx\n\t"
        "movq	%%r8, 16(%[a])\n\t"
        "movq	32(%[a]), %%r8\n\t"
        "sbbq	%%rcx, %%r9\n\t"
        "movq	40(%[b]), %%rcx\n\t"
        "movq	%%r9, 24(%[a])\n\t"
        "movq	40(%[a]), %%r9\n\t"
        "sbbq	%%rdx, %%r8\n\t"
        "movq	48(%[b]), %%rdx\n\t"
        "movq	%%r8, 32(%[a])\n\t"
        "movq	48(%[a]), %%r8\n\t"
        "sbbq	%%rcx, %%r9\n\t"
        "movq	56(%[b]), %%rcx\n\t"
        "movq	%%r9, 40(%[a])\n\t"
        "movq	56(%[a]), %%r9\n\t"
        "sbbq	%%rdx, %%r8\n\t"
        "movq	64(%[b]), %%rdx\n\t"
        "movq	%%r8, 48(%[a])\n\t"
        "movq	64(%[a]), %%r8\n\t"
        "sbbq	%%rcx, %%r9\n\t"
        "movq	72(%[b]), %%rcx\n\t"
        "movq	%%r9, 56(%[a])\n\t"
        "movq	72(%[a]), %%r9\n\t"
        "sbbq	%%rdx, %%r8\n\t"
        "movq	80(%[b]), %%rdx\n\t"
        "movq	%%r8, 64(%[a])\n\t"
        "movq	80(%[a]), %%r8\n\t"
        "sbbq	%%rcx, %%r9\n\t"
        "movq	88(%[b]), %%rcx\n\t"
        "movq	%%r9, 72(%[a])\n\t"
        "movq	88(%[a]), %%r9\n\t"
        "sbbq	%%rdx, %%r8\n\t"
        "movq	96(%[b]), %%rdx\n\t"
        "movq	%%r8, 80(%[a])\n\t"
        "movq	96(%[a]), %%r8\n\t"
        "sbbq	%%rcx, %%r9\n\t"
        "movq	104(%[b]), %%rcx\n\t"
        "movq	%%r9, 88(%[a])\n\t"
        "movq	104(%[a]), %%r9\n\t"
        "sbbq	%%rdx, %%r8\n\t"
        "movq	112(%[b]), %%rdx\n\t"
        "movq	%%r8, 96(%[a])\n\t"
        "movq	112(%[a]), %%r8\n\t"
        "sbbq	%%rcx, %%r9\n\t"
        "movq	120(%[b]), %%rcx\n\t"
        "movq	%%r9, 104(%[a])\n\t"
        "movq	120(%[a]), %%r9\n\t"
        "sbbq	%%rdx, %%r8\n\t"
        "movq	%%r8, 112(%[a])\n\t"
        "sbbq	%%rcx, %%r9\n\t"
        "movq	%%r9, 120(%[a])\n\t"
        /* c = 0 - final borrow */
        "sbbq	$0, %[c]\n\t"
        : [c] "+r" (c)
        : [a] "r" (a), [b] "r" (b)
        : "memory", "rdx", "rcx", "r8", "r9"
    );

    return c;
}

/* Calculate the Montgomery normalizer: r = 2^n mod m, where n is the number
 * of bits in 16 digits (16 x 64 = 1024).
 * The modulus is expected to use all 1024 bits, so a single subtraction from
 * zero (which wraps around to 2^1024 - m) is enough.
 *
 * r  A single precision number (16 digits) receiving the result.
 * m  A single precision number (16 digits), the modulus.
 */
static void sp_2048_mont_norm_16(sp_digit* r, sp_digit* m)
{
    int i;

    /* r = 0 */
    for (i = 0; i < 16; i++)
        r[i] = 0;

    /* r = 0 - m; the borrow out of the top digit is not needed. */
    (void)sp_2048_sub_in_place_16(r, m);
}

/* Conditionally subtract b from a using the mask m. (r = a - (b & m))
 * m is -1 (all bits set) to subtract and 0 to leave a unchanged.
 *
 * Works in two passes over 16 digits: first every digit of b is ANDed with
 * the mask and stored in a local buffer, then that buffer is subtracted from
 * a with a borrow chain. The same instructions run for either mask value.
 *
 * r  A single precision number representing condition subtract result.
 * a  A single precision number to subtract from.
 * b  A single precision number to subtract.
 * m  Mask value to apply.
 * returns 0 when there is no borrow out of the top digit and a digit with all
 * bits set (-1) when there is.
 */
static sp_digit sp_2048_cond_sub_16(sp_digit* r, sp_digit* a, sp_digit* b,
        sp_digit m)
{
    sp_digit t[16];
    sp_digit c = 0;

    __asm__ __volatile__ (
        /* Pass 1: t = b & m, two digits at a time. */
        "movq	0(%[b]), %%rax\n\t"
        "movq	8(%[b]), %%rcx\n\t"
        "andq	%[m], %%rax\n\t"
        "andq	%[m], %%rcx\n\t"
        "movq	%%rax, 0(%[t])\n\t"
        "movq	%%rcx, 8(%[t])\n\t"
        "movq	16(%[b]), %%rax\n\t"
        "movq	24(%[b]), %%rcx\n\t"
        "andq	%[m], %%rax\n\t"
        "andq	%[m], %%rcx\n\t"
        "movq	%%rax, 16(%[t])\n\t"
        "movq	%%rcx, 24(%[t])\n\t"
        "movq	32(%[b]), %%rax\n\t"
        "movq	40(%[b]), %%rcx\n\t"
        "andq	%[m], %%rax\n\t"
        "andq	%[m], %%rcx\n\t"
        "movq	%%rax, 32(%[t])\n\t"
        "movq	%%rcx, 40(%[t])\n\t"
        "movq	48(%[b]), %%rax\n\t"
        "movq	56(%[b]), %%rcx\n\t"
        "andq	%[m], %%rax\n\t"
        "andq	%[m], %%rcx\n\t"
        "movq	%%rax, 48(%[t])\n\t"
        "movq	%%rcx, 56(%[t])\n\t"
        "movq	64(%[b]), %%rax\n\t"
        "movq	72(%[b]), %%rcx\n\t"
        "andq	%[m], %%rax\n\t"
        "andq	%[m], %%rcx\n\t"
        "movq	%%rax, 64(%[t])\n\t"
        "movq	%%rcx, 72(%[t])\n\t"
        "movq	80(%[b]), %%rax\n\t"
        "movq	88(%[b]), %%rcx\n\t"
        "andq	%[m], %%rax\n\t"
        "andq	%[m], %%rcx\n\t"
        "movq	%%rax, 80(%[t])\n\t"
        "movq	%%rcx, 88(%[t])\n\t"
        "movq	96(%[b]), %%rax\n\t"
        "movq	104(%[b]), %%rcx\n\t"
        "andq	%[m], %%rax\n\t"
        "andq	%[m], %%rcx\n\t"
        "movq	%%rax, 96(%[t])\n\t"
        "movq	%%rcx, 104(%[t])\n\t"
        "movq	112(%[b]), %%rax\n\t"
        "movq	120(%[b]), %%rcx\n\t"
        "andq	%[m], %%rax\n\t"
        "andq	%[m], %%rcx\n\t"
        "movq	%%rax, 112(%[t])\n\t"
        "movq	%%rcx, 120(%[t])\n\t"
        /* Pass 2: r = a - t with the borrow carried through all 16 digits. */
        "movq	(%[a]), %%rax\n\t"
        "movq	(%[t]), %%rdx\n\t"
        "subq	%%rdx,%%rax\n\t"
        "movq	8(%[a]), %%rcx\n\t"
        "movq	8(%[t]), %%rdx\n\t"
        "sbbq	%%rdx, %%rcx\n\t"
        "movq	%%rax, 0(%[r])\n\t"
        "movq	16(%[a]), %%rax\n\t"
        "movq	16(%[t]), %%rdx\n\t"
        "sbbq	%%rdx, %%rax\n\t"
        "movq	%%rcx, 8(%[r])\n\t"
        "movq	24(%[a]), %%rcx\n\t"
        "movq	24(%[t]), %%rdx\n\t"
        "sbbq	%%rdx, %%rcx\n\t"
        "movq	%%rax, 16(%[r])\n\t"
        "movq	32(%[a]), %%rax\n\t"
        "movq	32(%[t]), %%rdx\n\t"
        "sbbq	%%rdx, %%rax\n\t"
        "movq	%%rcx, 24(%[r])\n\t"
        "movq	40(%[a]), %%rcx\n\t"
        "movq	40(%[t]), %%rdx\n\t"
        "sbbq	%%rdx, %%rcx\n\t"
        "movq	%%rax, 32(%[r])\n\t"
        "movq	48(%[a]), %%rax\n\t"
        "movq	48(%[t]), %%rdx\n\t"
        "sbbq	%%rdx, %%rax\n\t"
        "movq	%%rcx, 40(%[r])\n\t"
        "movq	56(%[a]), %%rcx\n\t"
        "movq	56(%[t]), %%rdx\n\t"
        "sbbq	%%rdx, %%rcx\n\t"
        "movq	%%rax, 48(%[r])\n\t"
        "movq	64(%[a]), %%rax\n\t"
        "movq	64(%[t]), %%rdx\n\t"
        "sbbq	%%rdx, %%rax\n\t"
        "movq	%%rcx, 56(%[r])\n\t"
        "movq	72(%[a]), %%rcx\n\t"
        "movq	72(%[t]), %%rdx\n\t"
        "sbbq	%%rdx, %%rcx\n\t"
        "movq	%%rax, 64(%[r])\n\t"
        "movq	80(%[a]), %%rax\n\t"
        "movq	80(%[t]), %%rdx\n\t"
        "sbbq	%%rdx, %%rax\n\t"
        "movq	%%rcx, 72(%[r])\n\t"
        "movq	88(%[a]), %%rcx\n\t"
        "movq	88(%[t]), %%rdx\n\t"
        "sbbq	%%rdx, %%rcx\n\t"
        "movq	%%rax, 80(%[r])\n\t"
        "movq	96(%[a]), %%rax\n\t"
        "movq	96(%[t]), %%rdx\n\t"
        "sbbq	%%rdx, %%rax\n\t"
        "movq	%%rcx, 88(%[r])\n\t"
        "movq	104(%[a]), %%rcx\n\t"
        "movq	104(%[t]), %%rdx\n\t"
        "sbbq	%%rdx, %%rcx\n\t"
        "movq	%%rax, 96(%[r])\n\t"
        "movq	112(%[a]), %%rax\n\t"
        "movq	112(%[t]), %%rdx\n\t"
        "sbbq	%%rdx, %%rax\n\t"
        "movq	%%rcx, 104(%[r])\n\t"
        "movq	120(%[a]), %%rcx\n\t"
        "movq	120(%[t]), %%rdx\n\t"
        "sbbq	%%rdx, %%rcx\n\t"
        "movq	%%rax, 112(%[r])\n\t"
        "movq	%%rcx, 120(%[r])\n\t"
        /* c = 0 - final borrow */
        "sbbq	$0, %[c]\n\t"
        : [c] "+r" (c)
        : [r] "r" (r), [a] "r" (a), [b] "r" (b), [m] "r" (m), [t] "r" (t)
        : "memory", "rax", "rcx", "rdx"
    );

    return c;
}

/* Reduce a 32 digit number back to 16 digits (1024 bits) using Montgomery
 * reduction.
 *
 * The assembly loop runs 16 times (rcx counts bytes from 0 up to 128 in steps
 * of 8). On each pass:
 *   - mu = a[i] * mp (low 64 bits only),
 *   - mu * m is added into a[i] .. a[i+15], the carry goes into a[i+16],
 *   - ca keeps the carry out of a[i+16] and is fed into the next pass.
 * The pointer a itself is advanced by one digit per pass, so every offset in
 * the loop is relative to digit i and, once the loop is done, a addresses the
 * upper 16 digits of the caller's buffer.
 * r12 and r13 hold the two lowest live digits between passes instead of
 * storing them; they are written back after the loop.
 * Finally m is subtracted under a mask built from ca and the 16 digit result
 * is stored at the start of the caller's buffer (a - 16 after the advance).
 *
 * a   A single precision number (32 digits) to reduce in place; the result is
 *     left in the low 16 digits.
 * m   The single precision number representing the modulus (16 digits).
 * mp  The digit representing the negative inverse of m mod 2^n.
 */
SP_NOINLINE static void sp_2048_mont_reduce_16(sp_digit* a, sp_digit* m,
        sp_digit mp)
{
    sp_digit ca = 0;

    __asm__ __volatile__ (
        "# i = 0\n\t"
        "xorq	%%rcx, %%rcx\n\t"
        "movq	0(%[a]), %%r12\n\t"
        "movq	8(%[a]), %%r13\n\t"
        "\nL_mont_loop_16:\n\t"
        "# mu = a[i] * mp\n\t"
        "movq	%%r12, %%r10\n\t"
        "imulq	%[mp], %%r10\n\t"
        "# a[i+0] += m[0] * mu\n\t"
        "movq	%%r10, %%rax\n\t"
        "xorq	%%r9, %%r9\n\t"
        "mulq	0(%[m])\n\t"
        "addq	%%rax, %%r12\n\t"
        "adcq	%%rdx, %%r9\n\t"
        "# a[i+1] += m[1] * mu\n\t"
        "movq	%%r10, %%rax\n\t"
        "xorq	%%r8, %%r8\n\t"
        "mulq	8(%[m])\n\t"
        /* New a[i+1] is kept in r12: it is a[i] of the next pass. */
        "movq	%%r13, %%r12\n\t"
        "addq	%%rax, %%r12\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "addq	%%r9,  %%r12\n\t"
        "adcq	$0, %%r8\n\t"
        "# a[i+2] += m[2] * mu\n\t"
        "movq	%%r10, %%rax\n\t"
        "xorq	%%r9, %%r9\n\t"
        "mulq	16(%[m])\n\t"
        /* New a[i+2] is kept in r13: it is a[i+1] of the next pass. */
        "movq	16(%[a]), %%r13\n\t"
        "addq	%%rax, %%r13\n\t"
        "adcq	%%rdx, %%r9\n\t"
        "addq	%%r8,  %%r13\n\t"
        "adcq	$0, %%r9\n\t"
        "# a[i+3] += m[3] * mu\n\t"
        "movq	%%r10, %%rax\n\t"
        "xorq	%%r8, %%r8\n\t"
        "mulq	24(%[m])\n\t"
        "movq	24(%[a]), %%r11\n\t"
        "addq	%%rax, %%r11\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "addq	%%r9,  %%r11\n\t"
        "movq	%%r11, 24(%[a])\n\t"
        "adcq	$0, %%r8\n\t"
        "# a[i+4] += m[4] * mu\n\t"
        "movq	%%r10, %%rax\n\t"
        "xorq	%%r9, %%r9\n\t"
        "mulq	32(%[m])\n\t"
        "movq	32(%[a]), %%r11\n\t"
        "addq	%%rax, %%r11\n\t"
        "adcq	%%rdx, %%r9\n\t"
        "addq	%%r8,  %%r11\n\t"
        "movq	%%r11, 32(%[a])\n\t"
        "adcq	$0, %%r9\n\t"
        "# a[i+5] += m[5] * mu\n\t"
        "movq	%%r10, %%rax\n\t"
        "xorq	%%r8, %%r8\n\t"
        "mulq	40(%[m])\n\t"
        "movq	40(%[a]), %%r11\n\t"
        "addq	%%rax, %%r11\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "addq	%%r9,  %%r11\n\t"
        "movq	%%r11, 40(%[a])\n\t"
        "adcq	$0, %%r8\n\t"
        "# a[i+6] += m[6] * mu\n\t"
        "movq	%%r10, %%rax\n\t"
        "xorq	%%r9, %%r9\n\t"
        "mulq	48(%[m])\n\t"
        "movq	48(%[a]), %%r11\n\t"
        "addq	%%rax, %%r11\n\t"
        "adcq	%%rdx, %%r9\n\t"
        "addq	%%r8,  %%r11\n\t"
        "movq	%%r11, 48(%[a])\n\t"
        "adcq	$0, %%r9\n\t"
        "# a[i+7] += m[7] * mu\n\t"
        "movq	%%r10, %%rax\n\t"
        "xorq	%%r8, %%r8\n\t"
        "mulq	56(%[m])\n\t"
        "movq	56(%[a]), %%r11\n\t"
        "addq	%%rax, %%r11\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "addq	%%r9,  %%r11\n\t"
        "movq	%%r11, 56(%[a])\n\t"
        "adcq	$0, %%r8\n\t"
        "# a[i+8] += m[8] * mu\n\t"
        "movq	%%r10, %%rax\n\t"
        "xorq	%%r9, %%r9\n\t"
        "mulq	64(%[m])\n\t"
        "movq	64(%[a]), %%r11\n\t"
        "addq	%%rax, %%r11\n\t"
        "adcq	%%rdx, %%r9\n\t"
        "addq	%%r8,  %%r11\n\t"
        "movq	%%r11, 64(%[a])\n\t"
        "adcq	$0, %%r9\n\t"
        "# a[i+9] += m[9] * mu\n\t"
        "movq	%%r10, %%rax\n\t"
        "xorq	%%r8, %%r8\n\t"
        "mulq	72(%[m])\n\t"
        "movq	72(%[a]), %%r11\n\t"
        "addq	%%rax, %%r11\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "addq	%%r9,  %%r11\n\t"
        "movq	%%r11, 72(%[a])\n\t"
        "adcq	$0, %%r8\n\t"
        "# a[i+10] += m[10] * mu\n\t"
        "movq	%%r10, %%rax\n\t"
        "xorq	%%r9, %%r9\n\t"
        "mulq	80(%[m])\n\t"
        "movq	80(%[a]), %%r11\n\t"
        "addq	%%rax, %%r11\n\t"
        "adcq	%%rdx, %%r9\n\t"
        "addq	%%r8,  %%r11\n\t"
        "movq	%%r11, 80(%[a])\n\t"
        "adcq	$0, %%r9\n\t"
        "# a[i+11] += m[11] * mu\n\t"
        "movq	%%r10, %%rax\n\t"
        "xorq	%%r8, %%r8\n\t"
        "mulq	88(%[m])\n\t"
        "movq	88(%[a]), %%r11\n\t"
        "addq	%%rax, %%r11\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "addq	%%r9,  %%r11\n\t"
        "movq	%%r11, 88(%[a])\n\t"
        "adcq	$0, %%r8\n\t"
        "# a[i+12] += m[12] * mu\n\t"
        "movq	%%r10, %%rax\n\t"
        "xorq	%%r9, %%r9\n\t"
        "mulq	96(%[m])\n\t"
        "movq	96(%[a]), %%r11\n\t"
        "addq	%%rax, %%r11\n\t"
        "adcq	%%rdx, %%r9\n\t"
        "addq	%%r8,  %%r11\n\t"
        "movq	%%r11, 96(%[a])\n\t"
        "adcq	$0, %%r9\n\t"
        "# a[i+13] += m[13] * mu\n\t"
        "movq	%%r10, %%rax\n\t"
        "xorq	%%r8, %%r8\n\t"
        "mulq	104(%[m])\n\t"
        "movq	104(%[a]), %%r11\n\t"
        "addq	%%rax, %%r11\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "addq	%%r9,  %%r11\n\t"
        "movq	%%r11, 104(%[a])\n\t"
        "adcq	$0, %%r8\n\t"
        "# a[i+14] += m[14] * mu\n\t"
        "movq	%%r10, %%rax\n\t"
        "xorq	%%r9, %%r9\n\t"
        "mulq	112(%[m])\n\t"
        "movq	112(%[a]), %%r11\n\t"
        "addq	%%rax, %%r11\n\t"
        "adcq	%%rdx, %%r9\n\t"
        "addq	%%r8,  %%r11\n\t"
        "movq	%%r11, 112(%[a])\n\t"
        "adcq	$0, %%r9\n\t"
        "# a[i+15] += m[15] * mu\n\t"
        "movq	%%r10, %%rax\n\t"
        "mulq	120(%[m])\n\t"
        "movq	120(%[a]), %%r11\n\t"
        "addq	%%rax, %%r9\n\t"
        /* Fold in the carry left by the previous pass, then restart ca. */
        "adcq	%[ca], %%rdx\n\t"
        "movq	$0, %[ca]\n\t"
        "adcq	$0, %[ca]\n\t"
        "addq	%%r9, %%r11\n\t"
        "movq	%%r11, 120(%[a])\n\t"
        /* High word plus carry goes into a[i+16]; overflow is kept in ca. */
        "adcq	%%rdx, 128(%[a])\n\t"
        "adcq	$0, %[ca]\n\t"
        "# i += 1\n\t"
        "addq	$8, %[a]\n\t"
        "addq	$8, %%rcx\n\t"
        "cmpq	$128, %%rcx\n\t"
        "jl	L_mont_loop_16\n\t"
        /* Write back the two digits that were cached in registers. */
        "movq	%%r12, 0(%[a])\n\t"
        "movq	%%r13, 8(%[a])\n\t"
        : [ca] "+r" (ca), [a] "+r" (a)
        : [m] "r" (m), [mp] "r" (mp)
        : "memory", "rax", "rdx", "rcx", "r8", "r9", "r10", "r11",
          "r12", "r13"
    );

    /* a now points at the upper half; the result goes to the original start. */
    sp_2048_cond_sub_16(a - 16, a, m, (sp_digit)0 - ca);
}

/* Multiply two Montgomery form numbers mod the modulus.
 * (r = a * b mod m)
 *
 * The product is formed at double width in r and then reduced in place, so r
 * must have room for 32 digits even though the result occupies 16.
 *
 * r   Result of multiplication.
 * a   First number to multiply in Montgomery form.
 * b   Second number to multiply in Montgomery form.
 * m   Modulus.
 * mp  Montgomery multiplier.
 */
static void sp_2048_mont_mul_16(sp_digit* r, sp_digit* a, sp_digit* b,
        sp_digit* m, sp_digit mp)
{
    /* Double-width product. */
    sp_2048_mul_16(r, a, b);

    /* Montgomery reduce it back into the low 16 digits of r. */
    sp_2048_mont_reduce_16(r, m, mp);
}

/* Square a Montgomery form number mod the modulus. (r = a * a mod m)
 *
 * The square is formed at double width in r and then reduced in place, so r
 * must have room for 32 digits even though the result occupies 16.
 *
 * r   Result of squaring.
 * a   Number to square in Montgomery form.
 * m   Modulus.
 * mp  Montgomery multiplier.
 */
static void sp_2048_mont_sqr_16(sp_digit* r, sp_digit* a, sp_digit* m,
        sp_digit mp)
{
    /* Double-width square. */
    sp_2048_sqr_16(r, a);

    /* Montgomery reduce it back into the low 16 digits of r. */
    sp_2048_mont_reduce_16(r, m, mp);
}

/* Mul a by digit b into r. (r = a * b)
 *
 * a is 16 digits long and the product is 17 digits long: r[0] to r[16] are
 * written. rbx, rcx and r8 are used in rotation as the accumulator: each step
 * adds the low half of the new 128-bit product to the current register,
 * stores it, and adds the high half (plus carry) to the next one.
 *
 * r  A single precision integer (17 digits) receiving the product.
 * a  A single precision integer (16 digits).
 * b  A single precision digit.
 */
SP_NOINLINE static void sp_2048_mul_d_16(sp_digit* r, const sp_digit* a,
        const sp_digit b)
{
    __asm__ __volatile__ (
        "# A[0] * B\n\t"
        "movq	%[b], %%rax\n\t"
        "xorq	%%r8, %%r8\n\t"
        "mulq	(%[a])\n\t"
        "movq	%%rax, %%rbx\n\t"
        "movq	%%rdx, %%rcx\n\t"
        "movq	%%rbx, 0(%[r])\n\t"
        "# A[1] * B\n\t"
        "movq	%[b], %%rax\n\t"
        "xorq	%%rbx, %%rbx\n\t"
        "mulq	8(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "movq	%%rcx, 8(%[r])\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "# A[2] * B\n\t"
        "movq	%[b], %%rax\n\t"
        "xorq	%%rcx, %%rcx\n\t"
        "mulq	16(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "movq	%%r8, 16(%[r])\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "# A[3] * B\n\t"
        "movq	%[b], %%rax\n\t"
        "xorq	%%r8, %%r8\n\t"
        "mulq	24(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "movq	%%rbx, 24(%[r])\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "# A[4] * B\n\t"
        "movq	%[b], %%rax\n\t"
        "xorq	%%rbx, %%rbx\n\t"
        "mulq	32(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "movq	%%rcx, 32(%[r])\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "# A[5] * B\n\t"
        "movq	%[b], %%rax\n\t"
        "xorq	%%rcx, %%rcx\n\t"
        "mulq	40(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "movq	%%r8, 40(%[r])\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "# A[6] * B\n\t"
        "movq	%[b], %%rax\n\t"
        "xorq	%%r8, %%r8\n\t"
        "mulq	48(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "movq	%%rbx, 48(%[r])\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "# A[7] * B\n\t"
        "movq	%[b], %%rax\n\t"
        "xorq	%%rbx, %%rbx\n\t"
        "mulq	56(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "movq	%%rcx, 56(%[r])\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "# A[8] * B\n\t"
        "movq	%[b], %%rax\n\t"
        "xorq	%%rcx, %%rcx\n\t"
        "mulq	64(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "movq	%%r8, 64(%[r])\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "# A[9] * B\n\t"
        "movq	%[b], %%rax\n\t"
        "xorq	%%r8, %%r8\n\t"
        "mulq	72(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "movq	%%rbx, 72(%[r])\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "# A[10] * B\n\t"
        "movq	%[b], %%rax\n\t"
        "xorq	%%rbx, %%rbx\n\t"
        "mulq	80(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "movq	%%rcx, 80(%[r])\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "# A[11] * B\n\t"
        "movq	%[b], %%rax\n\t"
        "xorq	%%rcx, %%rcx\n\t"
        "mulq	88(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "movq	%%r8, 88(%[r])\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "# A[12] * B\n\t"
        "movq	%[b], %%rax\n\t"
        "xorq	%%r8, %%r8\n\t"
        "mulq	96(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "movq	%%rbx, 96(%[r])\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "# A[13] * B\n\t"
        "movq	%[b], %%rax\n\t"
        "xorq	%%rbx, %%rbx\n\t"
        "mulq	104(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "movq	%%rcx, 104(%[r])\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "# A[14] * B\n\t"
        "movq	%[b], %%rax\n\t"
        "xorq	%%rcx, %%rcx\n\t"
        "mulq	112(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "movq	%%r8, 112(%[r])\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "# A[15] * B\n\t"
        "movq	%[b], %%rax\n\t"
        "mulq	120(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        /* The last two accumulators are digits 15 and 16 of the product. */
        "movq	%%rbx, 120(%[r])\n\t"
        "movq	%%rcx, 128(%[r])\n\t"
        :
        : [r] "r" (r), [a] "r" (a), [b] "r" (b)
        : "memory", "rax", "rdx", "rbx", "rcx", "r8"
    );
}

#ifdef HAVE_INTEL_AVX2
/* Mul a by digit b into r. (r = a * b)
 *
 * Variant using mulx (BMI2) and adcx/adox (ADX); it must only be called on
 * CPUs that support both instruction sets.
 * a is 16 digits long and the product is 17 digits long: r[0] to r[16] are
 * written. b is loaded into rdx once as the implicit mulx operand. The low
 * half of each product is added with adcx (carry flag chain) and the high
 * half with adox (overflow flag chain), so the two chains do not disturb each
 * other. r10 is zero throughout and r8/r9 alternate as the accumulator.
 *
 * r  A single precision integer (17 digits) receiving the product.
 * a  A single precision integer (16 digits).
 * b  A single precision digit.
 */
SP_NOINLINE static void sp_2048_mul_d_avx2_16(sp_digit* r, const sp_digit* a,
        const sp_digit b)
{
    __asm__ __volatile__ (
        "# A[0] * B\n\t"
        "movq	%[b], %%rdx\n\t"
        /* Zeroes r10 and clears both the carry and the overflow flag. */
        "xorq	%%r10, %%r10\n\t"
        "mulxq	(%[a]), %%r8, %%r9\n\t"
        "movq	%%r8, 0(%[r])\n\t"
        "# A[1] * B\n\t"
        "mulxq	8(%[a]), %%rax, %%rcx\n\t"
        "movq	%%r10, %%r8\n\t"
        "adcxq	%%rax, %%r9\n\t"
        "movq	%%r9, 8(%[r])\n\t"
        "adoxq	%%rcx, %%r8\n\t"
        "# A[2] * B\n\t"
        "mulxq	16(%[a]), %%rax, %%rcx\n\t"
        "movq	%%r10, %%r9\n\t"
        "adcxq	%%rax, %%r8\n\t"
        "movq	%%r8, 16(%[r])\n\t"
        "adoxq	%%rcx, %%r9\n\t"
        "# A[3] * B\n\t"
        "mulxq	24(%[a]), %%rax, %%rcx\n\t"
        "movq	%%r10, %%r8\n\t"
        "adcxq	%%rax, %%r9\n\t"
        "movq	%%r9, 24(%[r])\n\t"
        "adoxq	%%rcx, %%r8\n\t"
        "# A[4] * B\n\t"
        "mulxq	32(%[a]), %%rax, %%rcx\n\t"
        "movq	%%r10, %%r9\n\t"
        "adcxq	%%rax, %%r8\n\t"
        "movq	%%r8, 32(%[r])\n\t"
        "adoxq	%%rcx, %%r9\n\t"
        "# A[5] * B\n\t"
        "mulxq	40(%[a]), %%rax, %%rcx\n\t"
        "movq	%%r10, %%r8\n\t"
        "adcxq	%%rax, %%r9\n\t"
        "movq	%%r9, 40(%[r])\n\t"
        "adoxq	%%rcx, %%r8\n\t"
        "# A[6] * B\n\t"
        "mulxq	48(%[a]), %%rax, %%rcx\n\t"
        "movq	%%r10, %%r9\n\t"
        "adcxq	%%rax, %%r8\n\t"
        "movq	%%r8, 48(%[r])\n\t"
        "adoxq	%%rcx, %%r9\n\t"
        "# A[7] * B\n\t"
        "mulxq	56(%[a]), %%rax, %%rcx\n\t"
        "movq	%%r10, %%r8\n\t"
        "adcxq	%%rax, %%r9\n\t"
        "movq	%%r9, 56(%[r])\n\t"
        "adoxq	%%rcx, %%r8\n\t"
        "# A[8] * B\n\t"
        "mulxq	64(%[a]), %%rax, %%rcx\n\t"
        "movq	%%r10, %%r9\n\t"
        "adcxq	%%rax, %%r8\n\t"
        "movq	%%r8, 64(%[r])\n\t"
        "adoxq	%%rcx, %%r9\n\t"
        "# A[9] * B\n\t"
        "mulxq	72(%[a]), %%rax, %%rcx\n\t"
        "movq	%%r10, %%r8\n\t"
        "adcxq	%%rax, %%r9\n\t"
        "movq	%%r9, 72(%[r])\n\t"
        "adoxq	%%rcx, %%r8\n\t"
        "# A[10] * B\n\t"
        "mulxq	80(%[a]), %%rax, %%rcx\n\t"
        "movq	%%r10, %%r9\n\t"
        "adcxq	%%rax, %%r8\n\t"
        "movq	%%r8, 80(%[r])\n\t"
        "adoxq	%%rcx, %%r9\n\t"
        "# A[11] * B\n\t"
        "mulxq	88(%[a]), %%rax, %%rcx\n\t"
        "movq	%%r10, %%r8\n\t"
        "adcxq	%%rax, %%r9\n\t"
        "movq	%%r9, 88(%[r])\n\t"
        "adoxq	%%rcx, %%r8\n\t"
        "# A[12] * B\n\t"
        "mulxq	96(%[a]), %%rax, %%rcx\n\t"
        "movq	%%r10, %%r9\n\t"
        "adcxq	%%rax, %%r8\n\t"
        "movq	%%r8, 96(%[r])\n\t"
        "adoxq	%%rcx, %%r9\n\t"
        "# A[13] * B\n\t"
        "mulxq	104(%[a]), %%rax, %%rcx\n\t"
        "movq	%%r10, %%r8\n\t"
        "adcxq	%%rax, %%r9\n\t"
        "movq	%%r9, 104(%[r])\n\t"
        "adoxq	%%rcx, %%r8\n\t"
        "# A[14] * B\n\t"
        "mulxq	112(%[a]), %%rax, %%rcx\n\t"
        "movq	%%r10, %%r9\n\t"
        "adcxq	%%rax, %%r8\n\t"
        "movq	%%r8, 112(%[r])\n\t"
        "adoxq	%%rcx, %%r9\n\t"
        "# A[15] * B\n\t"
        "mulxq	120(%[a]), %%rax, %%rcx\n\t"
        "movq	%%r10, %%r8\n\t"
        "adcxq	%%rax, %%r9\n\t"
        "adoxq	%%rcx, %%r8\n\t"
        /* Fold the last carry of the low-half chain into the top digit. */
        "adcxq	%%r10, %%r8\n\t"
        "movq	%%r9, 120(%[r])\n\t"
        "movq	%%r8, 128(%[r])\n\t"
        :
        : [r] "r" (r), [a] "r" (a), [b] "r" (b)
        : "memory", "rax", "rdx", "rcx", "r8", "r9", "r10"
    );
}
#endif /* HAVE_INTEL_AVX2 */

/* Divide the double width number (d1|d0) by the divisor. (d1|d0 / div)
 *
 * Uses a single divq: rdx:rax is divided by div, the quotient is returned and
 * the remainder is discarded.
 * NOTE(review): divq raises a divide error when the quotient does not fit in
 * 64 bits (d1 >= div) or when div is 0. Nothing in this function guards
 * against that - confirm that callers guarantee d1 < div.
 *
 * d1   The high order half of the number to divide.
 * d0   The low order half of the number to divide.
 * div  The divisor.
 * returns the result (quotient) of the division.
 */
static sp_digit div_2048_word_16(sp_digit d1, sp_digit d0, sp_digit div)
{
    sp_digit r;

    __asm__ __volatile__ (
        "movq	%[d0], %%rax\n\t"
        "movq	%[d1], %%rdx\n\t"
        "divq	%[div]\n\t"
        "movq	%%rax, %[r]\n\t"
        : [r] "=r" (r)
        : [d1] "r" (d1), [d0] "r" (d0), [div] "r" (div)
        : "rax", "rdx"
    );

    return r;
}

/* Compare a with b in constant time.
 *
 * All 16 digits are always visited, from the most significant down, with no
 * branches. rdx is a mask that starts as all ones; both digits are ANDed with
 * it before being subtracted. At the first pair that differs the result is
 * fixed (cmova selects 1 when a's digit is larger, cmovc selects the mask,
 * i.e. -1, when it is smaller) and the mask is cleared by cmovnz, so all
 * lower digits compare as equal and cannot change the result.
 * r starts as -1; the final xor with the mask turns it into 0 when no digit
 * differed (mask still all ones) and leaves it alone otherwise.
 *
 * The assembly reads a[] and b[] through the pointer operands, so "memory" is
 * listed as a clobber to make sure the compiler has written any pending
 * stores to those arrays before the comparison runs.
 *
 * a  A single precision integer (16 digits).
 * b  A single precision integer (16 digits).
 * return -1, 0 or 1 if a is less than, equal to or greater than b
 * respectively.
 */
static int64_t sp_2048_cmp_16(sp_digit* a, sp_digit* b)
{
    sp_digit r = -1;
    sp_digit one = 1;

    __asm__ __volatile__ (
        "xorq	%%rcx, %%rcx\n\t"
        "movq	$-1, %%rdx\n\t"
        "movq	120(%[a]), %%rbx\n\t"
        "movq	120(%[b]), %%r8\n\t"
        "andq	%%rdx, %%rbx\n\t"
        "andq	%%rdx, %%r8\n\t"
        "subq	%%r8, %%rbx\n\t"
        "cmova	%[one], %[r]\n\t"
        "cmovc	%%rdx, %[r]\n\t"
        "cmovnz	%%rcx, %%rdx\n\t"
        "movq	112(%[a]), %%rbx\n\t"
        "movq	112(%[b]), %%r8\n\t"
        "andq	%%rdx, %%rbx\n\t"
        "andq	%%rdx, %%r8\n\t"
        "subq	%%r8, %%rbx\n\t"
        "cmova	%[one], %[r]\n\t"
        "cmovc	%%rdx, %[r]\n\t"
        "cmovnz	%%rcx, %%rdx\n\t"
        "movq	104(%[a]), %%rbx\n\t"
        "movq	104(%[b]), %%r8\n\t"
        "andq	%%rdx, %%rbx\n\t"
        "andq	%%rdx, %%r8\n\t"
        "subq	%%r8, %%rbx\n\t"
        "cmova	%[one], %[r]\n\t"
        "cmovc	%%rdx, %[r]\n\t"
        "cmovnz	%%rcx, %%rdx\n\t"
        "movq	96(%[a]), %%rbx\n\t"
        "movq	96(%[b]), %%r8\n\t"
        "andq	%%rdx, %%rbx\n\t"
        "andq	%%rdx, %%r8\n\t"
        "subq	%%r8, %%rbx\n\t"
        "cmova	%[one], %[r]\n\t"
        "cmovc	%%rdx, %[r]\n\t"
        "cmovnz	%%rcx, %%rdx\n\t"
        "movq	88(%[a]), %%rbx\n\t"
        "movq	88(%[b]), %%r8\n\t"
        "andq	%%rdx, %%rbx\n\t"
        "andq	%%rdx, %%r8\n\t"
        "subq	%%r8, %%rbx\n\t"
        "cmova	%[one], %[r]\n\t"
        "cmovc	%%rdx, %[r]\n\t"
        "cmovnz	%%rcx, %%rdx\n\t"
        "movq	80(%[a]), %%rbx\n\t"
        "movq	80(%[b]), %%r8\n\t"
        "andq	%%rdx, %%rbx\n\t"
        "andq	%%rdx, %%r8\n\t"
        "subq	%%r8, %%rbx\n\t"
        "cmova	%[one], %[r]\n\t"
        "cmovc	%%rdx, %[r]\n\t"
        "cmovnz	%%rcx, %%rdx\n\t"
        "movq	72(%[a]), %%rbx\n\t"
        "movq	72(%[b]), %%r8\n\t"
        "andq	%%rdx, %%rbx\n\t"
        "andq	%%rdx, %%r8\n\t"
        "subq	%%r8, %%rbx\n\t"
        "cmova	%[one], %[r]\n\t"
        "cmovc	%%rdx, %[r]\n\t"
        "cmovnz	%%rcx, %%rdx\n\t"
        "movq	64(%[a]), %%rbx\n\t"
        "movq	64(%[b]), %%r8\n\t"
        "andq	%%rdx, %%rbx\n\t"
        "andq	%%rdx, %%r8\n\t"
        "subq	%%r8, %%rbx\n\t"
        "cmova	%[one], %[r]\n\t"
        "cmovc	%%rdx, %[r]\n\t"
        "cmovnz	%%rcx, %%rdx\n\t"
        "movq	56(%[a]), %%rbx\n\t"
        "movq	56(%[b]), %%r8\n\t"
        "andq	%%rdx, %%rbx\n\t"
        "andq	%%rdx, %%r8\n\t"
        "subq	%%r8, %%rbx\n\t"
        "cmova	%[one], %[r]\n\t"
        "cmovc	%%rdx, %[r]\n\t"
        "cmovnz	%%rcx, %%rdx\n\t"
        "movq	48(%[a]), %%rbx\n\t"
        "movq	48(%[b]), %%r8\n\t"
        "andq	%%rdx, %%rbx\n\t"
        "andq	%%rdx, %%r8\n\t"
        "subq	%%r8, %%rbx\n\t"
        "cmova	%[one], %[r]\n\t"
        "cmovc	%%rdx, %[r]\n\t"
        "cmovnz	%%rcx, %%rdx\n\t"
        "movq	40(%[a]), %%rbx\n\t"
        "movq	40(%[b]), %%r8\n\t"
        "andq	%%rdx, %%rbx\n\t"
        "andq	%%rdx, %%r8\n\t"
        "subq	%%r8, %%rbx\n\t"
        "cmova	%[one], %[r]\n\t"
        "cmovc	%%rdx, %[r]\n\t"
        "cmovnz	%%rcx, %%rdx\n\t"
        "movq	32(%[a]), %%rbx\n\t"
        "movq	32(%[b]), %%r8\n\t"
        "andq	%%rdx, %%rbx\n\t"
        "andq	%%rdx, %%r8\n\t"
        "subq	%%r8, %%rbx\n\t"
        "cmova	%[one], %[r]\n\t"
        "cmovc	%%rdx, %[r]\n\t"
        "cmovnz	%%rcx, %%rdx\n\t"
        "movq	24(%[a]), %%rbx\n\t"
        "movq	24(%[b]), %%r8\n\t"
        "andq	%%rdx, %%rbx\n\t"
        "andq	%%rdx, %%r8\n\t"
        "subq	%%r8, %%rbx\n\t"
        "cmova	%[one], %[r]\n\t"
        "cmovc	%%rdx, %[r]\n\t"
        "cmovnz	%%rcx, %%rdx\n\t"
        "movq	16(%[a]), %%rbx\n\t"
        "movq	16(%[b]), %%r8\n\t"
        "andq	%%rdx, %%rbx\n\t"
        "andq	%%rdx, %%r8\n\t"
        "subq	%%r8, %%rbx\n\t"
        "cmova	%[one], %[r]\n\t"
        "cmovc	%%rdx, %[r]\n\t"
        "cmovnz	%%rcx, %%rdx\n\t"
        "movq	8(%[a]), %%rbx\n\t"
        "movq	8(%[b]), %%r8\n\t"
        "andq	%%rdx, %%rbx\n\t"
        "andq	%%rdx, %%r8\n\t"
        "subq	%%r8, %%rbx\n\t"
        "cmova	%[one], %[r]\n\t"
        "cmovc	%%rdx, %[r]\n\t"
        "cmovnz	%%rcx, %%rdx\n\t"
        "movq	0(%[a]), %%rbx\n\t"
        "movq	0(%[b]), %%r8\n\t"
        "andq	%%rdx, %%rbx\n\t"
        "andq	%%rdx, %%r8\n\t"
        "subq	%%r8, %%rbx\n\t"
        "cmova	%[one], %[r]\n\t"
        "cmovc	%%rdx, %[r]\n\t"
        "cmovnz	%%rcx, %%rdx\n\t"
        /* Equal everywhere: mask is still -1, so -1 ^ -1 = 0. */
        "xorq	%%rdx, %[r]\n\t"
        : [r] "+r" (r)
        : [a] "r" (a), [b] "r" (b), [one] "r" (one)
        : "memory", "rax", "rdx", "rcx", "rbx", "r8"
    );

    return r;
}

/* Divide a by d and put the remainder into r. (m*d + r = a)
 * m is not calculated as it is not needed at this time.
 *
 * Schoolbook long division, one quotient digit per pass, working on a local
 * copy of a. Each pass estimates the quotient digit from the top two digits
 * of the running remainder and the top digit of d, subtracts estimate * d,
 * and then adds d back (up to twice, selected by mask rather than by branch)
 * when the subtraction went negative. After the last pass the remainder may
 * still be greater than or equal to d, so d is conditionally subtracted once
 * more.
 *
 * NOTE(review): div_2048_word_16 uses divq, which faults when the high word
 * passed in is not smaller than the divisor word. Nothing here prevents
 * t1[16 + i] == d[15] - confirm whether callers can produce that case.
 *
 * a  Number to be divided (32 digits).
 * d  Number to divide with (16 digits).
 * m  Multiplier result (unused, may be NULL).
 * r  Remainder from the division (16 digits).
 * returns MP_OKAY indicating success.
 */
static INLINE int sp_2048_div_16(sp_digit* a, sp_digit* d, sp_digit* m,
        sp_digit* r)
{
    sp_digit t1[32], t2[17];
    sp_digit div, r1;
    int i;
#ifdef HAVE_INTEL_AVX2
    word32 cpuid_flags = cpuid_get_flags();
#endif

    (void)m;

    div = d[15];
    XMEMCPY(t1, a, sizeof(*t1) * 2 * 16);
    for (i=15; i>=0; i--) {
        /* Estimate the quotient digit from the top two remainder digits. */
        r1 = div_2048_word_16(t1[16 + i], t1[16 + i - 1], div);

        /* t2 = d * estimate (17 digits). */
#ifdef HAVE_INTEL_AVX2
        if (IS_INTEL_BMI2(cpuid_flags) && IS_INTEL_ADX(cpuid_flags))
            sp_2048_mul_d_avx2_16(t2, d, r1);
        else
#endif
            sp_2048_mul_d_16(t2, d, r1);
        /* Subtract it from the remainder; the borrow and the top digit of
         * the product are applied to the remainder's top digit. */
        t1[16 + i] += sp_2048_sub_in_place_16(&t1[i], t2);
        t1[16 + i] -= t2[16];
        /* The top digit is now used as a mask: when it is non-zero the
         * estimate was too large and d is added back (at most twice). */
        sp_2048_mask_16(t2, d, t1[16 + i]);
        t1[16 + i] += sp_2048_add_16(&t1[i], &t1[i], t2);
        sp_2048_mask_16(t2, d, t1[16 + i]);
        t1[16 + i] += sp_2048_add_16(&t1[i], &t1[i], t2);
    }

    /* Final correction: subtract the divisor itself when remainder >= d.
     * (t2 only holds left-over masked scratch data at this point.) */
    r1 = sp_2048_cmp_16(t1, d) >= 0;
    sp_2048_cond_sub_16(r, t1, d, (sp_digit)0 - r1);

    return MP_OKAY;
}

/* Reduce a modulo m into r. (r = a mod m)
 *
 * Thin wrapper around the division routine: only the remainder is wanted, so
 * no quotient buffer is supplied.
 *
 * r  A single precision number that is the reduced result.
 * a  A single precision number that is to be reduced.
 * m  A single precision number that is the modulus to reduce with.
 * returns MP_OKAY indicating success.
 */
static INLINE int sp_2048_mod_16(sp_digit* r, sp_digit* a, sp_digit* m)
{
    int ret;

    ret = sp_2048_div_16(a, m, NULL, r);

    return ret;
}

/* Modular exponentiate a to the e mod m. (r = a^e mod m)
 *
 * r        A single precision number that is the result of the operation.
 * a        A single precision number being exponentiated.
 * e        A single precision number that is the exponent.
 * bits     The number of bits in the exponent.
 * m        A single precision number that is the modulus.
 * reduceA  When non-zero, a is first reduced modulo m before it is used.
 * returns 0 on success and MEMORY_E on dynamic memory allocation failure.
 */
static int sp_2048_mod_exp_16(sp_digit* r, sp_digit* a, sp_digit* e,
        int bits, sp_digit* m, int reduceA)
{
#ifndef WOLFSSL_SMALL_STACK
    sp_digit t[32][32];
#else
    sp_digit* t[32];
    sp_digit* td;
#endif
    sp_digit* norm;
    sp_digit mp = 1;
    sp_digit n;
    sp_digit mask;
    int i;
    int c, y;
    int err = MP_OKAY;

#ifdef WOLFSSL_SMALL_STACK
    td = (sp_digit*)XMALLOC(sizeof(sp_digit) * 32 * 32, NULL,
                            DYNAMIC_TYPE_TMP_BUFFER);
    if (td == NULL)
        err = MEMORY_E;

    if (err == MP_OKAY) {
        for (i=0; i<32; i++)
            t[i] = td + i * 32;
        norm = t[0];
    }
#else
    norm = t[0];
#endif

    if (err == MP_OKAY) {
        sp_2048_mont_setup(m, &mp);
        sp_2048_mont_norm_16(norm, m);

        XMEMSET(t[1], 0, sizeof(sp_digit) * 16);
        if (reduceA) {
            err = sp_2048_mod_16(t[1] + 16, a, m);
            if (err == MP_OKAY)
                err = sp_2048_mod_16(t[1], t[1], m);
        }
        else {
            XMEMCPY(t[1] + 16, a, sizeof(sp_digit) * 16);
            err = sp_2048_mod_16(t[1], t[1], m);
        }
    }

    if (err == MP_OKAY) {
        sp_2048_mont_sqr_16(t[ 2], t[ 1], m, mp);
        sp_2048_mont_mul_16(t[ 3], t[ 2], t[ 1], m, mp);
        sp_2048_mont_sqr_16(t[ 4], t[ 2], m, mp);
        sp_2048_mont_mul_16(t[ 5], t[ 3], t[ 2], m, mp);
        sp_2048_mont_sqr_16(t[ 6], t[ 3], m, mp);
        sp_2048_mont_mul_16(t[ 7], t[ 4], t[ 3], m, mp);
        sp_2048_mont_sqr_16(t[ 8], t[ 4], m, mp);
        sp_2048_mont_mul_16(t[ 9], t[ 5], t[ 4], m, mp);
        sp_2048_mont_sqr_16(t[10], t[ 5], m, mp);
        sp_2048_mont_mul_16(t[11], t[ 6], t[ 5], m, mp);
        sp_2048_mont_sqr_16(t[12], t[ 6], m, mp);
        sp_2048_mont_mul_16(t[13], t[ 7], t[ 6], m, mp);
        sp_2048_mont_sqr_16(t[14], t[ 7], m, mp);
        sp_2048_mont_mul_16(t[15], t[ 8], t[ 7], m, mp);
        sp_2048_mont_sqr_16(t[16], t[ 8], m, mp);
        sp_2048_mont_mul_16(t[17], t[ 9], t[ 8], m, mp);
        sp_2048_mont_sqr_16(t[18], t[ 9], m, mp);
        sp_2048_mont_mul_16(t[19], t[10], t[ 9], m, mp);
        sp_2048_mont_sqr_16(t[20], t[10], m, mp);
        sp_2048_mont_mul_16(t[21], t[11], t[10], m, mp);
        sp_2048_mont_sqr_16(t[22], t[11], m, mp);
        sp_2048_mont_mul_16(t[23], t[12], t[11], m, mp);
        sp_2048_mont_sqr_16(t[24], t[12], m, mp);
        sp_2048_mont_mul_16(t[25], t[13], t[12], m, mp);
        sp_2048_mont_sqr_16(t[26], t[13], m, mp);
        sp_2048_mont_mul_16(t[27], t[14], t[13], m, mp);
        sp_2048_mont_sqr_16(t[28], t[14], m, mp);
        sp_2048_mont_mul_16(t[29], t[15], t[14], m, mp);
        sp_2048_mont_sqr_16(t[30], t[15], m, mp);
        sp_2048_mont_mul_16(t[31], t[16], t[15], m, mp);

        i = (bits - 1) / 64;
        n = e[i--];
        y = n >> 59;
        n <<= 5;
        c = 59;
        XMEMCPY(r, t[y], sizeof(sp_digit) * 16);
        for (; i>=0 || c>=5; ) {
            if (c == 0) {
                n = e[i--];
                y = n >> 59;
                n <<= 5;
                c = 59;
            }
            else if (c < 5) {
                y = n >> 59;
                n = e[i--];
                c = 5 - c;
                y |= n >> (64 - c);
                n <<= c;
                c = 64 - c;
            }
            else {
                y = (n >> 59) & 0x1f;
                n <<= 5;
                c -= 5;
            }

            sp_2048_mont_sqr_16(r, r, m, mp);
            sp_2048_mont_sqr_16(r, r, m, mp);
            sp_2048_mont_sqr_16(r, r, m, mp);
            sp_2048_mont_sqr_16(r, r, m, mp);
            sp_2048_mont_sqr_16(r, r, m, mp);

            sp_2048_mont_mul_16(r, r, t[y], m, mp);
        }
        y = e[0] & ((1 << c) - 1);
        for (; c > 0; c--)
            sp_2048_mont_sqr_16(r, r, m, mp);
        sp_2048_mont_mul_16(r, r, t[y], m, mp);

        XMEMSET(&r[16], 0, sizeof(sp_digit) * 16);
        sp_2048_mont_reduce_16(r, m, mp);

        mask = 0 - (sp_2048_cmp_16(r, m) >= 0);
        sp_2048_cond_sub_16(r, r, m, mask);
    }

#ifdef WOLFSSL_SMALL_STACK
    if (td != NULL)
        XFREE(td, NULL, DYNAMIC_TYPE_TMP_BUFFER);
#endif

    return err;
}

#ifdef HAVE_INTEL_AVX2
/* Reduce the double-width number in a back to 16 digits using Montgomery
 * reduction (mulx/adcx/adox variant).
 *
 * The assembly loop runs 16 times. Each pass computes mu = a[i] * mp, adds
 * mu * m to the 17 digits starting at a[i] and then advances the pointer a
 * by one digit. Digits 0 to 31 of the input are accessed, so a must have
 * room for 32 digits. The reduced value ends up in the first 16 digits of
 * the buffer that was passed in.
 *
 * a   A single precision number to reduce in place.
 * m   The single precision number representing the modulus.
 * mp  The digit representing the negative inverse of m mod 2^n.
 */
/* NOTE(review): SP_NOINLINE is presumably required because the asm defines
 * the named label L_mont_loop_avx2_16, which must only be emitted once -
 * confirm. */
SP_NOINLINE static void sp_2048_mont_reduce_avx2_16(sp_digit* a, sp_digit* m,
        sp_digit mp)
{
    /* Carry out of the top digit, passed from one loop pass to the next. */
    sp_digit ca = 0;

    __asm__ __volatile__ (
        "# i = 0\n\t"
        "movq	0(%[a]), %%r12\n\t"
        "xorq	%%rcx, %%rcx\n\t"
        "\nL_mont_loop_avx2_16:\n\t"
        /* r9 = 0; the xor also clears both the carry and overflow flags
         * that the adcx/adox chains below start from. */
        "xorq	%%r9, %%r9\n\t"
        "movq	%%r12, %%r10\n\t"
        "# mu = a[i] * mp\n\t"
        "movq	%%r12, %%rdx\n\t"
        "mulxq	%[mp], %%rdx, %%r8\n\t"
        "# a[i+0] += m[0] * mu\n\t"
        "mulxq	0(%[m]), %%rax, %%r8\n\t"
        /* The updated a[i+1] stays in r12 and is not written back; it is
         * the a[i] of the next pass. */
        "movq	8(%[a]), %%r12\n\t"
        "adcxq	%%rax, %%r10\n\t"
        "adoxq	%%r8, %%r12\n\t"
        "# a[i+1] += m[1] * mu\n\t"
        "mulxq	8(%[m]), %%rax, %%r8\n\t"
        "movq	16(%[a]), %%r10\n\t"
        "adcxq	%%rax, %%r12\n\t"
        "adoxq	%%r8, %%r10\n\t"
        "# a[i+2] += m[2] * mu\n\t"
        "mulxq	16(%[m]), %%rax, %%r8\n\t"
        "movq	24(%[a]), %%r11\n\t"
        "adcxq	%%rax, %%r10\n\t"
        "adoxq	%%r8, %%r11\n\t"
        "movq	%%r10, 16(%[a])\n\t"
        "# a[i+3] += m[3] * mu\n\t"
        "mulxq	24(%[m]), %%rax, %%r8\n\t"
        "movq	32(%[a]), %%r10\n\t"
        "adcxq	%%rax, %%r11\n\t"
        "adoxq	%%r8, %%r10\n\t"
        "movq	%%r11, 24(%[a])\n\t"
        "# a[i+4] += m[4] * mu\n\t"
        "mulxq	32(%[m]), %%rax, %%r8\n\t"
        "movq	40(%[a]), %%r11\n\t"
        "adcxq	%%rax, %%r10\n\t"
        "adoxq	%%r8, %%r11\n\t"
        "movq	%%r10, 32(%[a])\n\t"
        "# a[i+5] += m[5] * mu\n\t"
        "mulxq	40(%[m]), %%rax, %%r8\n\t"
        "movq	48(%[a]), %%r10\n\t"
        "adcxq	%%rax, %%r11\n\t"
        "adoxq	%%r8, %%r10\n\t"
        "movq	%%r11, 40(%[a])\n\t"
        "# a[i+6] += m[6] * mu\n\t"
        "mulxq	48(%[m]), %%rax, %%r8\n\t"
        "movq	56(%[a]), %%r11\n\t"
        "adcxq	%%rax, %%r10\n\t"
        "adoxq	%%r8, %%r11\n\t"
        "movq	%%r10, 48(%[a])\n\t"
        "# a[i+7] += m[7] * mu\n\t"
        "mulxq	56(%[m]), %%rax, %%r8\n\t"
        "movq	64(%[a]), %%r10\n\t"
        "adcxq	%%rax, %%r11\n\t"
        "adoxq	%%r8, %%r10\n\t"
        "movq	%%r11, 56(%[a])\n\t"
        "# a[i+8] += m[8] * mu\n\t"
        "mulxq	64(%[m]), %%rax, %%r8\n\t"
        "movq	72(%[a]), %%r11\n\t"
        "adcxq	%%rax, %%r10\n\t"
        "adoxq	%%r8, %%r11\n\t"
        "movq	%%r10, 64(%[a])\n\t"
        "# a[i+9] += m[9] * mu\n\t"
        "mulxq	72(%[m]), %%rax, %%r8\n\t"
        "movq	80(%[a]), %%r10\n\t"
        "adcxq	%%rax, %%r11\n\t"
        "adoxq	%%r8, %%r10\n\t"
        "movq	%%r11, 72(%[a])\n\t"
        "# a[i+10] += m[10] * mu\n\t"
        "mulxq	80(%[m]), %%rax, %%r8\n\t"
        "movq	88(%[a]), %%r11\n\t"
        "adcxq	%%rax, %%r10\n\t"
        "adoxq	%%r8, %%r11\n\t"
        "movq	%%r10, 80(%[a])\n\t"
        "# a[i+11] += m[11] * mu\n\t"
        "mulxq	88(%[m]), %%rax, %%r8\n\t"
        "movq	96(%[a]), %%r10\n\t"
        "adcxq	%%rax, %%r11\n\t"
        "adoxq	%%r8, %%r10\n\t"
        "movq	%%r11, 88(%[a])\n\t"
        "# a[i+12] += m[12] * mu\n\t"
        "mulxq	96(%[m]), %%rax, %%r8\n\t"
        "movq	104(%[a]), %%r11\n\t"
        "adcxq	%%rax, %%r10\n\t"
        "adoxq	%%r8, %%r11\n\t"
        "movq	%%r10, 96(%[a])\n\t"
        "# a[i+13] += m[13] * mu\n\t"
        "mulxq	104(%[m]), %%rax, %%r8\n\t"
        "movq	112(%[a]), %%r10\n\t"
        "adcxq	%%rax, %%r11\n\t"
        "adoxq	%%r8, %%r10\n\t"
        "movq	%%r11, 104(%[a])\n\t"
        "# a[i+14] += m[14] * mu\n\t"
        "mulxq	112(%[m]), %%rax, %%r8\n\t"
        "movq	120(%[a]), %%r11\n\t"
        "adcxq	%%rax, %%r10\n\t"
        "adoxq	%%r8, %%r11\n\t"
        "movq	%%r10, 112(%[a])\n\t"
        "# a[i+15] += m[15] * mu\n\t"
        "mulxq	120(%[m]), %%rax, %%r8\n\t"
        "movq	128(%[a]), %%r10\n\t"
        "adcxq	%%rax, %%r11\n\t"
        "adoxq	%%r8, %%r10\n\t"
        "movq	%%r11, 120(%[a])\n\t"
        /* Fold the carry of the previous pass into a[i+16], then collect
         * both flag chains (r9 is zero) into ca for the next pass. */
        "adcxq	%[ca], %%r10\n\t"
        "movq	%%r9, %[ca]\n\t"
        "adoxq	%%r9, %[ca]\n\t"
        "adcxq	%%r9, %[ca]\n\t"
        "movq	%%r10, 128(%[a])\n\t"
        "# i += 1\n\t"
        "addq	$8, %[a]\n\t"
        "addq	$1, %%rcx\n\t"
        "cmpq	$16, %%rcx\n\t"
        "jl	L_mont_loop_avx2_16\n\t"
        /* Write back the digit that was still only held in r12. */
        "movq	%%r12, 0(%[a])\n\t"
        : [ca] "+r" (ca), [a] "+r" (a)
        : [m] "r" (m), [mp] "r" (mp)
        : "memory", "rax", "rdx", "rcx", "r8", "r9", "r10", "r11", "r12"
    );

    /* a now points 16 digits past the start of the buffer. Subtract m from
     * those upper 16 digits when ca is set (mask is 0 - ca) and store the
     * result in the first 16 digits of the buffer. */
    sp_2048_cond_sub_16(a - 16, a, m, (sp_digit)0 - ca);
}
#endif /* HAVE_INTEL_AVX2 */

#ifdef HAVE_INTEL_AVX2
/* Multiply two Montgomery form numbers mod the modulus (prime).
 * (r = a * b mod m)
 *
 * The product is written to r by sp_2048_mul_avx2_16() and then reduced in
 * place by sp_2048_mont_reduce_avx2_16(), which accesses 32 digits of its
 * buffer, so r needs room for 32 digits.
 *
 * r   Result of multiplication.
 * a   First number to multiply in Montgomery form.
 * b   Second number to multiply in Montgomery form.
 * m   Modulus (prime).
 * mp  Montgomery multiplier.
 */
static void sp_2048_mont_mul_avx2_16(sp_digit* r, sp_digit* a, sp_digit* b,
        sp_digit* m, sp_digit mp)
{
    sp_2048_mul_avx2_16(r, a, b);
    sp_2048_mont_reduce_avx2_16(r, m, mp);
}

#endif /* HAVE_INTEL_AVX2 */
#ifdef HAVE_INTEL_AVX2
/* Square the Montgomery form number. (r = a * a mod m)
 *
 * The square is written to r by sp_2048_sqr_avx2_16() and then reduced in
 * place by sp_2048_mont_reduce_avx2_16(), which accesses 32 digits of its
 * buffer, so r needs room for 32 digits.
 *
 * r   Result of squaring.
 * a   Number to square in Montgomery form.
 * m   Modulus (prime).
 * mp  Montgomery multiplier.
 */
static void sp_2048_mont_sqr_avx2_16(sp_digit* r, sp_digit* a, sp_digit* m,
        sp_digit mp)
{
    sp_2048_sqr_avx2_16(r, a);
    sp_2048_mont_reduce_avx2_16(r, m, mp);
}

#endif /* HAVE_INTEL_AVX2 */
#ifdef HAVE_INTEL_AVX2
/* Modular exponentiation with a fixed 5-bit window, using the AVX2 build of
 * the Montgomery multiply/square/reduce routines. (r = a^e mod m)
 *
 * A 32 entry table is built first: t[0] is the Montgomery normalizer,
 * t[1] is a converted to Montgomery form and every further entry t[k] is
 * produced from lower entries (a squaring for even k, a multiplication of
 * two neighbouring entries for odd k). The exponent is then read from its
 * most significant 64-bit word downwards, five bits at a time; each window
 * costs five Montgomery squarings and one Montgomery multiplication by the
 * selected table entry. Bits left over at the bottom of e[0] are handled
 * with one squaring each and one final multiplication.
 *
 * NOTE(review): t[y] is indexed directly with exponent bits, so the memory
 * access pattern depends on e - confirm whether that matters for callers
 * that pass a secret exponent.
 *
 * r        A single precision number that is the result of the operation.
 *          It is also the work area of the Montgomery routines and is
 *          written up to digit 31, so it needs room for 32 digits.
 * a        A single precision number being exponentiated.
 * e        A single precision number that is the exponent.
 * bits     The number of bits in the exponent.
 * m        A single precision number that is the modulus.
 * reduceA  Non-zero: a is first reduced modulo m with sp_2048_mod_16().
 *          Zero: the low 16 digits of a are copied unchanged.
 * returns 0 on success, MEMORY_E on dynamic memory allocation failure, or
 * the error reported by sp_2048_mod_16().
 */
static int sp_2048_mod_exp_avx2_16(sp_digit* r, sp_digit* a, sp_digit* e,
        int bits, sp_digit* m, int reduceA)
{
#ifndef WOLFSSL_SMALL_STACK
    sp_digit t[32][32];
#else
    sp_digit* t[32];
    sp_digit* td;
#endif
    sp_digit* norm;
    sp_digit mp = 1;
    sp_digit n;
    sp_digit mask;
    int i;
    int k;
    int half;
    int c, y;
    int err = MP_OKAY;

#ifdef WOLFSSL_SMALL_STACK
    /* One heap block holds all 32 table entries of 32 digits each. */
    td = (sp_digit*)XMALLOC(sizeof(sp_digit) * 32 * 32, NULL,
                            DYNAMIC_TYPE_TMP_BUFFER);
    if (td == NULL) {
        err = MEMORY_E;
    }
    else {
        for (i = 0; i < 32; i++)
            t[i] = td + i * 32;
        norm = t[0];
    }
#else
    norm = t[0];
#endif

    if (err == MP_OKAY) {
        sp_2048_mont_setup(m, &mp);
        sp_2048_mont_norm_16(norm, m);

        /* Place a in the upper 16 digits of t[1] above 16 zero digits, then
         * reduce the 32 digit value modulo m to get the Montgomery form. */
        XMEMSET(t[1], 0, sizeof(sp_digit) * 16);
        if (!reduceA) {
            XMEMCPY(t[1] + 16, a, sizeof(sp_digit) * 16);
        }
        else {
            err = sp_2048_mod_16(t[1] + 16, a, m);
        }
        if (err == MP_OKAY)
            err = sp_2048_mod_16(t[1], t[1], m);
    }

    if (err == MP_OKAY) {
        /* Fill t[2]..t[31] in ascending order:
         *   even k: t[k] = t[k/2]^2
         *   odd  k: t[k] = t[k/2 + 1] * t[k/2]
         */
        for (k = 2; k < 32; k++) {
            half = k >> 1;
            if (k & 1)
                sp_2048_mont_mul_avx2_16(t[k], t[half + 1], t[half], m, mp);
            else
                sp_2048_mont_sqr_avx2_16(t[k], t[half], m, mp);
        }

        /* Start with the top five bits of the most significant word. */
        i = (bits - 1) / 64;
        n = e[i--];
        y = n >> 59;
        n <<= 5;
        c = 59;
        XMEMCPY(r, t[y], sizeof(sp_digit) * 16);

        /* c counts the unread bits still held at the top of n. */
        while (i >= 0 || c >= 5) {
            if (c >= 5) {
                /* A whole window is available in the current word. */
                y = (n >> 59) & 0x1f;
                n <<= 5;
                c -= 5;
            }
            else if (c == 0) {
                /* Current word used up: take the window from the next. */
                n = e[i--];
                y = n >> 59;
                n <<= 5;
                c = 59;
            }
            else {
                /* Window straddles two words: remaining bits of this word
                 * on top, the missing low bits from the next word. */
                y = n >> 59;
                n = e[i--];
                c = 5 - c;
                y |= n >> (64 - c);
                n <<= c;
                c = 64 - c;
            }

            for (k = 0; k < 5; k++)
                sp_2048_mont_sqr_avx2_16(r, r, m, mp);

            sp_2048_mont_mul_avx2_16(r, r, t[y], m, mp);
        }

        /* Fewer than five bits remain; they are the low c bits of e[0]. */
        y = e[0] & ((1 << c) - 1);
        while (c > 0) {
            sp_2048_mont_sqr_avx2_16(r, r, m, mp);
            c--;
        }
        sp_2048_mont_mul_avx2_16(r, r, t[y], m, mp);

        /* Convert out of Montgomery form: reduce r with a zero top half. */
        XMEMSET(r + 16, 0, sizeof(sp_digit) * 16);
        sp_2048_mont_reduce_avx2_16(r, m, mp);

        /* Subtract m once more when the result is not below m. */
        mask = 0 - (sp_2048_cmp_16(r, m) >= 0);
        sp_2048_cond_sub_16(r, r, m, mask);
    }

#ifdef WOLFSSL_SMALL_STACK
    if (td != NULL)
        XFREE(td, NULL, DYNAMIC_TYPE_TMP_BUFFER);
#endif

    return err;
}
#endif /* HAVE_INTEL_AVX2 */

#endif /* !SP_RSA_PRIVATE_EXP_D && WOLFSSL_HAVE_SP_RSA */

/* r = 2^n mod m where n is the number of bits to reduce by.
 * Given m must be 2048 bits, just need to subtract.
 *
 * r is cleared to zero and m is then subtracted from it in place, so the
 * 32 digits of r hold 0 - m with wrap-around, i.e. 2^2048 - m.
 *
 * r  A single precision number.
 * m  A single precision number.
 */
static void sp_2048_mont_norm_32(sp_digit* r, sp_digit* m)
{
    XMEMSET(r, 0, sizeof(sp_digit) * 32);

    /* r = 2^n mod m */
    sp_2048_sub_in_place_32(r, m);
}

/* Conditionally subtract b from a using the mask m.
 * m is -1 (all bits set) to subtract and 0 to subtract nothing, in which
 * case r is a copy of a.
 *
 * Works in two passes over all 32 digits regardless of the mask: first a
 * masked copy of b (b[i] & m) is written to a local buffer, then that copy
 * is subtracted from a with a single borrow chain and stored in r.
 *
 * r  A single precision number representing condition subtract result.
 * a  A single precision number to subtract from.
 * b  A single precision number to subtract.
 * m  Mask value to apply.
 * returns 0 when there is no borrow out of the top digit, otherwise 0 - 1
 * (all bits set).
 */
static sp_digit sp_2048_cond_sub_32(sp_digit* r, sp_digit* a, sp_digit* b,
        sp_digit m)
{
    /* Masked copy of b. */
    sp_digit t[32];
    /* Starts at zero; the final borrow is subtracted from it. */
    sp_digit c = 0;

    __asm__ __volatile__ (
        /* Pass 1: t[i] = b[i] & m, two digits at a time. */
        "movq	0(%[b]), %%rax\n\t"
        "movq	8(%[b]), %%rcx\n\t"
        "andq	%[m], %%rax\n\t"
        "andq	%[m], %%rcx\n\t"
        "movq	%%rax, 0(%[t])\n\t"
        "movq	%%rcx, 8(%[t])\n\t"
        "movq	16(%[b]), %%rax\n\t"
        "movq	24(%[b]), %%rcx\n\t"
        "andq	%[m], %%rax\n\t"
        "andq	%[m], %%rcx\n\t"
        "movq	%%rax, 16(%[t])\n\t"
        "movq	%%rcx, 24(%[t])\n\t"
        "movq	32(%[b]), %%rax\n\t"
        "movq	40(%[b]), %%rcx\n\t"
        "andq	%[m], %%rax\n\t"
        "andq	%[m], %%rcx\n\t"
        "movq	%%rax, 32(%[t])\n\t"
        "movq	%%rcx, 40(%[t])\n\t"
        "movq	48(%[b]), %%rax\n\t"
        "movq	56(%[b]), %%rcx\n\t"
        "andq	%[m], %%rax\n\t"
        "andq	%[m], %%rcx\n\t"
        "movq	%%rax, 48(%[t])\n\t"
        "movq	%%rcx, 56(%[t])\n\t"
        "movq	64(%[b]), %%rax\n\t"
        "movq	72(%[b]), %%rcx\n\t"
        "andq	%[m], %%rax\n\t"
        "andq	%[m], %%rcx\n\t"
        "movq	%%rax, 64(%[t])\n\t"
        "movq	%%rcx, 72(%[t])\n\t"
        "movq	80(%[b]), %%rax\n\t"
        "movq	88(%[b]), %%rcx\n\t"
        "andq	%[m], %%rax\n\t"
        "andq	%[m], %%rcx\n\t"
        "movq	%%rax, 80(%[t])\n\t"
        "movq	%%rcx, 88(%[t])\n\t"
        "movq	96(%[b]), %%rax\n\t"
        "movq	104(%[b]), %%rcx\n\t"
        "andq	%[m], %%rax\n\t"
        "andq	%[m], %%rcx\n\t"
        "movq	%%rax, 96(%[t])\n\t"
        "movq	%%rcx, 104(%[t])\n\t"
        "movq	112(%[b]), %%rax\n\t"
        "movq	120(%[b]), %%rcx\n\t"
        "andq	%[m], %%rax\n\t"
        "andq	%[m], %%rcx\n\t"
        "movq	%%rax, 112(%[t])\n\t"
        "movq	%%rcx, 120(%[t])\n\t"
        "movq	128(%[b]), %%rax\n\t"
        "movq	136(%[b]), %%rcx\n\t"
        "andq	%[m], %%rax\n\t"
        "andq	%[m], %%rcx\n\t"
        "movq	%%rax, 128(%[t])\n\t"
        "movq	%%rcx, 136(%[t])\n\t"
        "movq	144(%[b]), %%rax\n\t"
        "movq	152(%[b]), %%rcx\n\t"
        "andq	%[m], %%rax\n\t"
        "andq	%[m], %%rcx\n\t"
        "movq	%%rax, 144(%[t])\n\t"
        "movq	%%rcx, 152(%[t])\n\t"
        "movq	160(%[b]), %%rax\n\t"
        "movq	168(%[b]), %%rcx\n\t"
        "andq	%[m], %%rax\n\t"
        "andq	%[m], %%rcx\n\t"
        "movq	%%rax, 160(%[t])\n\t"
        "movq	%%rcx, 168(%[t])\n\t"
        "movq	176(%[b]), %%rax\n\t"
        "movq	184(%[b]), %%rcx\n\t"
        "andq	%[m], %%rax\n\t"
        "andq	%[m], %%rcx\n\t"
        "movq	%%rax, 176(%[t])\n\t"
        "movq	%%rcx, 184(%[t])\n\t"
        "movq	192(%[b]), %%rax\n\t"
        "movq	200(%[b]), %%rcx\n\t"
        "andq	%[m], %%rax\n\t"
        "andq	%[m], %%rcx\n\t"
        "movq	%%rax, 192(%[t])\n\t"
        "movq	%%rcx, 200(%[t])\n\t"
        "movq	208(%[b]), %%rax\n\t"
        "movq	216(%[b]), %%rcx\n\t"
        "andq	%[m], %%rax\n\t"
        "andq	%[m], %%rcx\n\t"
        "movq	%%rax, 208(%[t])\n\t"
        "movq	%%rcx, 216(%[t])\n\t"
        "movq	224(%[b]), %%rax\n\t"
        "movq	232(%[b]), %%rcx\n\t"
        "andq	%[m], %%rax\n\t"
        "andq	%[m], %%rcx\n\t"
        "movq	%%rax, 224(%[t])\n\t"
        "movq	%%rcx, 232(%[t])\n\t"
        "movq	240(%[b]), %%rax\n\t"
        "movq	248(%[b]), %%rcx\n\t"
        "andq	%[m], %%rax\n\t"
        "andq	%[m], %%rcx\n\t"
        "movq	%%rax, 240(%[t])\n\t"
        "movq	%%rcx, 248(%[t])\n\t"
        /* Pass 2: r = a - t. One sub starts the borrow chain and sbb
         * continues it; each result is stored one step after it is
         * computed, alternating between rax and rcx. */
        "movq	(%[a]), %%rax\n\t"
        "movq	(%[t]), %%rdx\n\t"
        "subq	%%rdx,%%rax\n\t"
        "movq	8(%[a]), %%rcx\n\t"
        "movq	8(%[t]), %%rdx\n\t"
        "sbbq	%%rdx, %%rcx\n\t"
        "movq	%%rax, 0(%[r])\n\t"
        "movq	16(%[a]), %%rax\n\t"
        "movq	16(%[t]), %%rdx\n\t"
        "sbbq	%%rdx, %%rax\n\t"
        "movq	%%rcx, 8(%[r])\n\t"
        "movq	24(%[a]), %%rcx\n\t"
        "movq	24(%[t]), %%rdx\n\t"
        "sbbq	%%rdx, %%rcx\n\t"
        "movq	%%rax, 16(%[r])\n\t"
        "movq	32(%[a]), %%rax\n\t"
        "movq	32(%[t]), %%rdx\n\t"
        "sbbq	%%rdx, %%rax\n\t"
        "movq	%%rcx, 24(%[r])\n\t"
        "movq	40(%[a]), %%rcx\n\t"
        "movq	40(%[t]), %%rdx\n\t"
        "sbbq	%%rdx, %%rcx\n\t"
        "movq	%%rax, 32(%[r])\n\t"
        "movq	48(%[a]), %%rax\n\t"
        "movq	48(%[t]), %%rdx\n\t"
        "sbbq	%%rdx, %%rax\n\t"
        "movq	%%rcx, 40(%[r])\n\t"
        "movq	56(%[a]), %%rcx\n\t"
        "movq	56(%[t]), %%rdx\n\t"
        "sbbq	%%rdx, %%rcx\n\t"
        "movq	%%rax, 48(%[r])\n\t"
        "movq	64(%[a]), %%rax\n\t"
        "movq	64(%[t]), %%rdx\n\t"
        "sbbq	%%rdx, %%rax\n\t"
        "movq	%%rcx, 56(%[r])\n\t"
        "movq	72(%[a]), %%rcx\n\t"
        "movq	72(%[t]), %%rdx\n\t"
        "sbbq	%%rdx, %%rcx\n\t"
        "movq	%%rax, 64(%[r])\n\t"
        "movq	80(%[a]), %%rax\n\t"
        "movq	80(%[t]), %%rdx\n\t"
        "sbbq	%%rdx, %%rax\n\t"
        "movq	%%rcx, 72(%[r])\n\t"
        "movq	88(%[a]), %%rcx\n\t"
        "movq	88(%[t]), %%rdx\n\t"
        "sbbq	%%rdx, %%rcx\n\t"
        "movq	%%rax, 80(%[r])\n\t"
        "movq	96(%[a]), %%rax\n\t"
        "movq	96(%[t]), %%rdx\n\t"
        "sbbq	%%rdx, %%rax\n\t"
        "movq	%%rcx, 88(%[r])\n\t"
        "movq	104(%[a]), %%rcx\n\t"
        "movq	104(%[t]), %%rdx\n\t"
        "sbbq	%%rdx, %%rcx\n\t"
        "movq	%%rax, 96(%[r])\n\t"
        "movq	112(%[a]), %%rax\n\t"
        "movq	112(%[t]), %%rdx\n\t"
        "sbbq	%%rdx, %%rax\n\t"
        "movq	%%rcx, 104(%[r])\n\t"
        "movq	120(%[a]), %%rcx\n\t"
        "movq	120(%[t]), %%rdx\n\t"
        "sbbq	%%rdx, %%rcx\n\t"
        "movq	%%rax, 112(%[r])\n\t"
        "movq	128(%[a]), %%rax\n\t"
        "movq	128(%[t]), %%rdx\n\t"
        "sbbq	%%rdx, %%rax\n\t"
        "movq	%%rcx, 120(%[r])\n\t"
        "movq	136(%[a]), %%rcx\n\t"
        "movq	136(%[t]), %%rdx\n\t"
        "sbbq	%%rdx, %%rcx\n\t"
        "movq	%%rax, 128(%[r])\n\t"
        "movq	144(%[a]), %%rax\n\t"
        "movq	144(%[t]), %%rdx\n\t"
        "sbbq	%%rdx, %%rax\n\t"
        "movq	%%rcx, 136(%[r])\n\t"
        "movq	152(%[a]), %%rcx\n\t"
        "movq	152(%[t]), %%rdx\n\t"
        "sbbq	%%rdx, %%rcx\n\t"
        "movq	%%rax, 144(%[r])\n\t"
        "movq	160(%[a]), %%rax\n\t"
        "movq	160(%[t]), %%rdx\n\t"
        "sbbq	%%rdx, %%rax\n\t"
        "movq	%%rcx, 152(%[r])\n\t"
        "movq	168(%[a]), %%rcx\n\t"
        "movq	168(%[t]), %%rdx\n\t"
        "sbbq	%%rdx, %%rcx\n\t"
        "movq	%%rax, 160(%[r])\n\t"
        "movq	176(%[a]), %%rax\n\t"
        "movq	176(%[t]), %%rdx\n\t"
        "sbbq	%%rdx, %%rax\n\t"
        "movq	%%rcx, 168(%[r])\n\t"
        "movq	184(%[a]), %%rcx\n\t"
        "movq	184(%[t]), %%rdx\n\t"
        "sbbq	%%rdx, %%rcx\n\t"
        "movq	%%rax, 176(%[r])\n\t"
        "movq	192(%[a]), %%rax\n\t"
        "movq	192(%[t]), %%rdx\n\t"
        "sbbq	%%rdx, %%rax\n\t"
        "movq	%%rcx, 184(%[r])\n\t"
        "movq	200(%[a]), %%rcx\n\t"
        "movq	200(%[t]), %%rdx\n\t"
        "sbbq	%%rdx, %%rcx\n\t"
        "movq	%%rax, 192(%[r])\n\t"
        "movq	208(%[a]), %%rax\n\t"
        "movq	208(%[t]), %%rdx\n\t"
        "sbbq	%%rdx, %%rax\n\t"
        "movq	%%rcx, 200(%[r])\n\t"
        "movq	216(%[a]), %%rcx\n\t"
        "movq	216(%[t]), %%rdx\n\t"
        "sbbq	%%rdx, %%rcx\n\t"
        "movq	%%rax, 208(%[r])\n\t"
        "movq	224(%[a]), %%rax\n\t"
        "movq	224(%[t]), %%rdx\n\t"
        "sbbq	%%rdx, %%rax\n\t"
        "movq	%%rcx, 216(%[r])\n\t"
        "movq	232(%[a]), %%rcx\n\t"
        "movq	232(%[t]), %%rdx\n\t"
        "sbbq	%%rdx, %%rcx\n\t"
        "movq	%%rax, 224(%[r])\n\t"
        "movq	240(%[a]), %%rax\n\t"
        "movq	240(%[t]), %%rdx\n\t"
        "sbbq	%%rdx, %%rax\n\t"
        "movq	%%rcx, 232(%[r])\n\t"
        "movq	248(%[a]), %%rcx\n\t"
        "movq	248(%[t]), %%rdx\n\t"
        "sbbq	%%rdx, %%rcx\n\t"
        "movq	%%rax, 240(%[r])\n\t"
        "movq	%%rcx, 248(%[r])\n\t"
        /* c = 0 - borrow out of the top digit. */
        "sbbq	$0, %[c]\n\t"
        : [c] "+r" (c)
        : [r] "r" (r), [a] "r" (a), [b] "r" (b), [m] "r" (m), [t] "r" (t)
        : "memory", "rax", "rcx", "rdx"
    );

    return c;
}

/* Reduce the number back to 2048 bits using Montgomery reduction.
 *
 * The assembly loop runs 32 times (rcx counts bytes, 8 per pass, up to
 * 256). Each pass computes mu = a[i] * mp, adds mu * m to the digits
 * starting at a[i], carries into a[i+32] and then advances the pointer a by
 * one digit. Digits 0 to 63 of the input are accessed, so a must have room
 * for 64 digits. The reduced value ends up in the first 32 digits of the
 * buffer that was passed in.
 *
 * a   A single precision number to reduce in place.
 * m   The single precision number representing the modulus.
 * mp  The digit representing the negative inverse of m mod 2^n.
 */
/* NOTE(review): SP_NOINLINE is presumably required because the asm defines
 * the named label L_mont_loop_32, which must only be emitted once -
 * confirm. */
SP_NOINLINE static void sp_2048_mont_reduce_32(sp_digit* a, sp_digit* m,
        sp_digit mp)
{
    /* Carry out of the top digit, passed from one loop pass to the next. */
    sp_digit ca = 0;

    __asm__ __volatile__ (
        "# i = 0\n\t"
        "xorq	%%rcx, %%rcx\n\t"
        /* r12 and r13 hold a[i] and a[i+1] across passes; those two digits
         * are not written back to memory inside the loop. */
        "movq	0(%[a]), %%r12\n\t"
        "movq	8(%[a]), %%r13\n\t"
        "\nL_mont_loop_32:\n\t"
        "# mu = a[i] * mp\n\t"
        "movq	%%r12, %%r10\n\t"
        "imulq	%[mp], %%r10\n\t"
        "# a[i+0] += m[0] * mu\n\t"
        "movq	%%r10, %%rax\n\t"
        "xorq	%%r9, %%r9\n\t"
        "mulq	0(%[m])\n\t"
        "addq	%%rax, %%r12\n\t"
        "adcq	%%rdx, %%r9\n\t"
        "# a[i+1] += m[1] * mu\n\t"
        "movq	%%r10, %%rax\n\t"
        "xorq	%%r8, %%r8\n\t"
        "mulq	8(%[m])\n\t"
        "movq	%%r13, %%r12\n\t"
        "addq	%%rax, %%r12\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "addq	%%r9,  %%r12\n\t"
        "adcq	$0, %%r8\n\t"
        "# a[i+2] += m[2] * mu\n\t"
        "movq	%%r10, %%rax\n\t"
        "xorq	%%r9, %%r9\n\t"
        "mulq	16(%[m])\n\t"
        "movq	16(%[a]), %%r13\n\t"
        "addq	%%rax, %%r13\n\t"
        "adcq	%%rdx, %%r9\n\t"
        "addq	%%r8,  %%r13\n\t"
        "adcq	$0, %%r9\n\t"
        /* From here on each digit is loaded, updated and stored; r8 and r9
         * alternate as the carry into the next digit. */
        "# a[i+3] += m[3] * mu\n\t"
        "movq	%%r10, %%rax\n\t"
        "xorq	%%r8, %%r8\n\t"
        "mulq	24(%[m])\n\t"
        "movq	24(%[a]), %%r11\n\t"
        "addq	%%rax, %%r11\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "addq	%%r9,  %%r11\n\t"
        "movq	%%r11, 24(%[a])\n\t"
        "adcq	$0, %%r8\n\t"
        "# a[i+4] += m[4] * mu\n\t"
        "movq	%%r10, %%rax\n\t"
        "xorq	%%r9, %%r9\n\t"
        "mulq	32(%[m])\n\t"
        "movq	32(%[a]), %%r11\n\t"
        "addq	%%rax, %%r11\n\t"
        "adcq	%%rdx, %%r9\n\t"
        "addq	%%r8,  %%r11\n\t"
        "movq	%%r11, 32(%[a])\n\t"
        "adcq	$0, %%r9\n\t"
        "# a[i+5] += m[5] * mu\n\t"
        "movq	%%r10, %%rax\n\t"
        "xorq	%%r8, %%r8\n\t"
        "mulq	40(%[m])\n\t"
        "movq	40(%[a]), %%r11\n\t"
        "addq	%%rax, %%r11\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "addq	%%r9,  %%r11\n\t"
        "movq	%%r11, 40(%[a])\n\t"
        "adcq	$0, %%r8\n\t"
        "# a[i+6] += m[6] * mu\n\t"
        "movq	%%r10, %%rax\n\t"
        "xorq	%%r9, %%r9\n\t"
        "mulq	48(%[m])\n\t"
        "movq	48(%[a]), %%r11\n\t"
        "addq	%%rax, %%r11\n\t"
        "adcq	%%rdx, %%r9\n\t"
        "addq	%%r8,  %%r11\n\t"
        "movq	%%r11, 48(%[a])\n\t"
        "adcq	$0, %%r9\n\t"
        "# a[i+7] += m[7] * mu\n\t"
        "movq	%%r10, %%rax\n\t"
        "xorq	%%r8, %%r8\n\t"
        "mulq	56(%[m])\n\t"
        "movq	56(%[a]), %%r11\n\t"
        "addq	%%rax, %%r11\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "addq	%%r9,  %%r11\n\t"
        "movq	%%r11, 56(%[a])\n\t"
        "adcq	$0, %%r8\n\t"
        "# a[i+8] += m[8] * mu\n\t"
        "movq	%%r10, %%rax\n\t"
        "xorq	%%r9, %%r9\n\t"
        "mulq	64(%[m])\n\t"
        "movq	64(%[a]), %%r11\n\t"
        "addq	%%rax, %%r11\n\t"
        "adcq	%%rdx, %%r9\n\t"
        "addq	%%r8,  %%r11\n\t"
        "movq	%%r11, 64(%[a])\n\t"
        "adcq	$0, %%r9\n\t"
        "# a[i+9] += m[9] * mu\n\t"
        "movq	%%r10, %%rax\n\t"
        "xorq	%%r8, %%r8\n\t"
        "mulq	72(%[m])\n\t"
        "movq	72(%[a]), %%r11\n\t"
        "addq	%%rax, %%r11\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "addq	%%r9,  %%r11\n\t"
        "movq	%%r11, 72(%[a])\n\t"
        "adcq	$0, %%r8\n\t"
        "# a[i+10] += m[10] * mu\n\t"
        "movq	%%r10, %%rax\n\t"
        "xorq	%%r9, %%r9\n\t"
        "mulq	80(%[m])\n\t"
        "movq	80(%[a]), %%r11\n\t"
        "addq	%%rax, %%r11\n\t"
        "adcq	%%rdx, %%r9\n\t"
        "addq	%%r8,  %%r11\n\t"
        "movq	%%r11, 80(%[a])\n\t"
        "adcq	$0, %%r9\n\t"
        "# a[i+11] += m[11] * mu\n\t"
        "movq	%%r10, %%rax\n\t"
        "xorq	%%r8, %%r8\n\t"
        "mulq	88(%[m])\n\t"
        "movq	88(%[a]), %%r11\n\t"
        "addq	%%rax, %%r11\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "addq	%%r9,  %%r11\n\t"
        "movq	%%r11, 88(%[a])\n\t"
        "adcq	$0, %%r8\n\t"
        "# a[i+12] += m[12] * mu\n\t"
        "movq	%%r10, %%rax\n\t"
        "xorq	%%r9, %%r9\n\t"
        "mulq	96(%[m])\n\t"
        "movq	96(%[a]), %%r11\n\t"
        "addq	%%rax, %%r11\n\t"
        "adcq	%%rdx, %%r9\n\t"
        "addq	%%r8,  %%r11\n\t"
        "movq	%%r11, 96(%[a])\n\t"
        "adcq	$0, %%r9\n\t"
        "# a[i+13] += m[13] * mu\n\t"
        "movq	%%r10, %%rax\n\t"
        "xorq	%%r8, %%r8\n\t"
        "mulq	104(%[m])\n\t"
        "movq	104(%[a]), %%r11\n\t"
        "addq	%%rax, %%r11\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "addq	%%r9,  %%r11\n\t"
        "movq	%%r11, 104(%[a])\n\t"
        "adcq	$0, %%r8\n\t"
        "# a[i+14] += m[14] * mu\n\t"
        "movq	%%r10, %%rax\n\t"
        "xorq	%%r9, %%r9\n\t"
        "mulq	112(%[m])\n\t"
        "movq	112(%[a]), %%r11\n\t"
        "addq	%%rax, %%r11\n\t"
        "adcq	%%rdx, %%r9\n\t"
        "addq	%%r8,  %%r11\n\t"
        "movq	%%r11, 112(%[a])\n\t"
        "adcq	$0, %%r9\n\t"
        "# a[i+15] += m[15] * mu\n\t"
        "movq	%%r10, %%rax\n\t"
        "xorq	%%r8, %%r8\n\t"
        "mulq	120(%[m])\n\t"
        "movq	120(%[a]), %%r11\n\t"
        "addq	%%rax, %%r11\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "addq	%%r9,  %%r11\n\t"
        "movq	%%r11, 120(%[a])\n\t"
        "adcq	$0, %%r8\n\t"
        "# a[i+16] += m[16] * mu\n\t"
        "movq	%%r10, %%rax\n\t"
        "xorq	%%r9, %%r9\n\t"
        "mulq	128(%[m])\n\t"
        "movq	128(%[a]), %%r11\n\t"
        "addq	%%rax, %%r11\n\t"
        "adcq	%%rdx, %%r9\n\t"
        "addq	%%r8,  %%r11\n\t"
        "movq	%%r11, 128(%[a])\n\t"
        "adcq	$0, %%r9\n\t"
        "# a[i+17] += m[17] * mu\n\t"
        "movq	%%r10, %%rax\n\t"
        "xorq	%%r8, %%r8\n\t"
        "mulq	136(%[m])\n\t"
        "movq	136(%[a]), %%r11\n\t"
        "addq	%%rax, %%r11\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "addq	%%r9,  %%r11\n\t"
        "movq	%%r11, 136(%[a])\n\t"
        "adcq	$0, %%r8\n\t"
        "# a[i+18] += m[18] * mu\n\t"
        "movq	%%r10, %%rax\n\t"
        "xorq	%%r9, %%r9\n\t"
        "mulq	144(%[m])\n\t"
        "movq	144(%[a]), %%r11\n\t"
        "addq	%%rax, %%r11\n\t"
        "adcq	%%rdx, %%r9\n\t"
        "addq	%%r8,  %%r11\n\t"
        "movq	%%r11, 144(%[a])\n\t"
        "adcq	$0, %%r9\n\t"
        "# a[i+19] += m[19] * mu\n\t"
        "movq	%%r10, %%rax\n\t"
        "xorq	%%r8, %%r8\n\t"
        "mulq	152(%[m])\n\t"
        "movq	152(%[a]), %%r11\n\t"
        "addq	%%rax, %%r11\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "addq	%%r9,  %%r11\n\t"
        "movq	%%r11, 152(%[a])\n\t"
        "adcq	$0, %%r8\n\t"
        "# a[i+20] += m[20] * mu\n\t"
        "movq	%%r10, %%rax\n\t"
        "xorq	%%r9, %%r9\n\t"
        "mulq	160(%[m])\n\t"
        "movq	160(%[a]), %%r11\n\t"
        "addq	%%rax, %%r11\n\t"
        "adcq	%%rdx, %%r9\n\t"
        "addq	%%r8,  %%r11\n\t"
        "movq	%%r11, 160(%[a])\n\t"
        "adcq	$0, %%r9\n\t"
        "# a[i+21] += m[21] * mu\n\t"
        "movq	%%r10, %%rax\n\t"
        "xorq	%%r8, %%r8\n\t"
        "mulq	168(%[m])\n\t"
        "movq	168(%[a]), %%r11\n\t"
        "addq	%%rax, %%r11\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "addq	%%r9,  %%r11\n\t"
        "movq	%%r11, 168(%[a])\n\t"
        "adcq	$0, %%r8\n\t"
        "# a[i+22] += m[22] * mu\n\t"
        "movq	%%r10, %%rax\n\t"
        "xorq	%%r9, %%r9\n\t"
        "mulq	176(%[m])\n\t"
        "movq	176(%[a]), %%r11\n\t"
        "addq	%%rax, %%r11\n\t"
        "adcq	%%rdx, %%r9\n\t"
        "addq	%%r8,  %%r11\n\t"
        "movq	%%r11, 176(%[a])\n\t"
        "adcq	$0, %%r9\n\t"
        "# a[i+23] += m[23] * mu\n\t"
        "movq	%%r10, %%rax\n\t"
        "xorq	%%r8, %%r8\n\t"
        "mulq	184(%[m])\n\t"
        "movq	184(%[a]), %%r11\n\t"
        "addq	%%rax, %%r11\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "addq	%%r9,  %%r11\n\t"
        "movq	%%r11, 184(%[a])\n\t"
        "adcq	$0, %%r8\n\t"
        "# a[i+24] += m[24] * mu\n\t"
        "movq	%%r10, %%rax\n\t"
        "xorq	%%r9, %%r9\n\t"
        "mulq	192(%[m])\n\t"
        "movq	192(%[a]), %%r11\n\t"
        "addq	%%rax, %%r11\n\t"
        "adcq	%%rdx, %%r9\n\t"
        "addq	%%r8,  %%r11\n\t"
        "movq	%%r11, 192(%[a])\n\t"
        "adcq	$0, %%r9\n\t"
        "# a[i+25] += m[25] * mu\n\t"
        "movq	%%r10, %%rax\n\t"
        "xorq	%%r8, %%r8\n\t"
        "mulq	200(%[m])\n\t"
        "movq	200(%[a]), %%r11\n\t"
        "addq	%%rax, %%r11\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "addq	%%r9,  %%r11\n\t"
        "movq	%%r11, 200(%[a])\n\t"
        "adcq	$0, %%r8\n\t"
        "# a[i+26] += m[26] * mu\n\t"
        "movq	%%r10, %%rax\n\t"
        "xorq	%%r9, %%r9\n\t"
        "mulq	208(%[m])\n\t"
        "movq	208(%[a]), %%r11\n\t"
        "addq	%%rax, %%r11\n\t"
        "adcq	%%rdx, %%r9\n\t"
        "addq	%%r8,  %%r11\n\t"
        "movq	%%r11, 208(%[a])\n\t"
        "adcq	$0, %%r9\n\t"
        "# a[i+27] += m[27] * mu\n\t"
        "movq	%%r10, %%rax\n\t"
        "xorq	%%r8, %%r8\n\t"
        "mulq	216(%[m])\n\t"
        "movq	216(%[a]), %%r11\n\t"
        "addq	%%rax, %%r11\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "addq	%%r9,  %%r11\n\t"
        "movq	%%r11, 216(%[a])\n\t"
        "adcq	$0, %%r8\n\t"
        "# a[i+28] += m[28] * mu\n\t"
        "movq	%%r10, %%rax\n\t"
        "xorq	%%r9, %%r9\n\t"
        "mulq	224(%[m])\n\t"
        "movq	224(%[a]), %%r11\n\t"
        "addq	%%rax, %%r11\n\t"
        "adcq	%%rdx, %%r9\n\t"
        "addq	%%r8,  %%r11\n\t"
        "movq	%%r11, 224(%[a])\n\t"
        "adcq	$0, %%r9\n\t"
        "# a[i+29] += m[29] * mu\n\t"
        "movq	%%r10, %%rax\n\t"
        "xorq	%%r8, %%r8\n\t"
        "mulq	232(%[m])\n\t"
        "movq	232(%[a]), %%r11\n\t"
        "addq	%%rax, %%r11\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "addq	%%r9,  %%r11\n\t"
        "movq	%%r11, 232(%[a])\n\t"
        "adcq	$0, %%r8\n\t"
        "# a[i+30] += m[30] * mu\n\t"
        "movq	%%r10, %%rax\n\t"
        "xorq	%%r9, %%r9\n\t"
        "mulq	240(%[m])\n\t"
        "movq	240(%[a]), %%r11\n\t"
        "addq	%%rax, %%r11\n\t"
        "adcq	%%rdx, %%r9\n\t"
        "addq	%%r8,  %%r11\n\t"
        "movq	%%r11, 240(%[a])\n\t"
        "adcq	$0, %%r9\n\t"
        /* Last digit: the high half of the product plus the carry of the
         * previous pass (ca) goes into a[i+32]; ca is rebuilt from the
         * carries produced here. */
        "# a[i+31] += m[31] * mu\n\t"
        "movq	%%r10, %%rax\n\t"
        "mulq	248(%[m])\n\t"
        "movq	248(%[a]), %%r11\n\t"
        "addq	%%rax, %%r9\n\t"
        "adcq	%[ca], %%rdx\n\t"
        "movq	$0, %[ca]\n\t"
        "adcq	$0, %[ca]\n\t"
        "addq	%%r9, %%r11\n\t"
        "movq	%%r11, 248(%[a])\n\t"
        "adcq	%%rdx, 256(%[a])\n\t"
        "adcq	$0, %[ca]\n\t"
        "# i += 1\n\t"
        "addq	$8, %[a]\n\t"
        "addq	$8, %%rcx\n\t"
        "cmpq	$256, %%rcx\n\t"
        "jl	L_mont_loop_32\n\t"
        /* Write back the two digits that were still only held in
         * registers. */
        "movq	%%r12, 0(%[a])\n\t"
        "movq	%%r13, 8(%[a])\n\t"
        : [ca] "+r" (ca), [a] "+r" (a)
        : [m] "r" (m), [mp] "r" (mp)
        : "memory", "rax", "rdx", "rcx", "r8", "r9", "r10", "r11",
          "r12", "r13"
    );

    /* a now points 32 digits past the start of the buffer. Subtract m from
     * those upper 32 digits when ca is set (mask is 0 - ca) and store the
     * result in the first 32 digits of the buffer. */
    sp_2048_cond_sub_32(a - 32, a, m, (sp_digit)0 - ca);
}

/* Multiply two Montgomery form numbers mod the modulus (prime).
 * (r = a * b mod m)
 *
 * The product is written to r by sp_2048_mul_32() and then reduced in place
 * by sp_2048_mont_reduce_32(), which accesses 64 digits of its buffer, so r
 * needs room for 64 digits.
 *
 * r   Result of multiplication.
 * a   First number to multiply in Montgomery form.
 * b   Second number to multiply in Montgomery form.
 * m   Modulus (prime).
 * mp  Montgomery multiplier.
 */
static void sp_2048_mont_mul_32(sp_digit* r, sp_digit* a, sp_digit* b,
        sp_digit* m, sp_digit mp)
{
    sp_2048_mul_32(r, a, b);
    sp_2048_mont_reduce_32(r, m, mp);
}

/* Square the Montgomery form number. (r = a * a mod m)
 *
 * The full square is computed into r with sp_2048_sqr_32() and then
 * Montgomery reduced in place with sp_2048_mont_reduce_32().
 *
 * r   Result of squaring. Presumably must have room for the double-width
 *     square before it is reduced - confirm against sp_2048_sqr_32().
 * a   Number to square in Montgomery form.
 * m   Modulus (prime).
 * mp  Montgomery multiplier.
 */
static void sp_2048_mont_sqr_32(sp_digit* r, sp_digit* a, sp_digit* m,
        sp_digit mp)
{
    sp_2048_sqr_32(r, a);
    sp_2048_mont_reduce_32(r, m, mp);
}

/* Multiply a by the single digit b into r. (r = a * b)
 *
 * The 32 digits a[0..31] are read and 33 digits r[0..32] are written; the
 * extra top digit r[32] receives the final carry out of the multiplication.
 *
 * r  A single precision integer. Must have room for 33 digits.
 * a  A single precision integer (32 digits).
 * b  A single precision digit.
 */
SP_NOINLINE static void sp_2048_mul_d_32(sp_digit* r, const sp_digit* a,
        const sp_digit b)
{
    /* Fully unrolled: each step multiplies one digit of a by b with mulq
     * (result in rdx:rax) and folds it into a running accumulator.
     * rbx, rcx and r8 take turns, in rotation, as the accumulator words;
     * the lowest word is stored to r as soon as it is complete.
     */
    __asm__ __volatile__ (
        "# A[0] * B\n\t"
        "movq	%[b], %%rax\n\t"
        "xorq	%%r8, %%r8\n\t"
        "mulq	(%[a])\n\t"
        "movq	%%rax, %%rbx\n\t"
        "movq	%%rdx, %%rcx\n\t"
        "movq	%%rbx, 0(%[r])\n\t"
        "# A[1] * B\n\t"
        "movq	%[b], %%rax\n\t"
        "xorq	%%rbx, %%rbx\n\t"
        "mulq	8(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "movq	%%rcx, 8(%[r])\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "# A[2] * B\n\t"
        "movq	%[b], %%rax\n\t"
        "xorq	%%rcx, %%rcx\n\t"
        "mulq	16(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "movq	%%r8, 16(%[r])\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "# A[3] * B\n\t"
        "movq	%[b], %%rax\n\t"
        "xorq	%%r8, %%r8\n\t"
        "mulq	24(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "movq	%%rbx, 24(%[r])\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "# A[4] * B\n\t"
        "movq	%[b], %%rax\n\t"
        "xorq	%%rbx, %%rbx\n\t"
        "mulq	32(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "movq	%%rcx, 32(%[r])\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "# A[5] * B\n\t"
        "movq	%[b], %%rax\n\t"
        "xorq	%%rcx, %%rcx\n\t"
        "mulq	40(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "movq	%%r8, 40(%[r])\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "# A[6] * B\n\t"
        "movq	%[b], %%rax\n\t"
        "xorq	%%r8, %%r8\n\t"
        "mulq	48(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "movq	%%rbx, 48(%[r])\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "# A[7] * B\n\t"
        "movq	%[b], %%rax\n\t"
        "xorq	%%rbx, %%rbx\n\t"
        "mulq	56(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "movq	%%rcx, 56(%[r])\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "# A[8] * B\n\t"
        "movq	%[b], %%rax\n\t"
        "xorq	%%rcx, %%rcx\n\t"
        "mulq	64(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "movq	%%r8, 64(%[r])\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "# A[9] * B\n\t"
        "movq	%[b], %%rax\n\t"
        "xorq	%%r8, %%r8\n\t"
        "mulq	72(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "movq	%%rbx, 72(%[r])\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "# A[10] * B\n\t"
        "movq	%[b], %%rax\n\t"
        "xorq	%%rbx, %%rbx\n\t"
        "mulq	80(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "movq	%%rcx, 80(%[r])\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "# A[11] * B\n\t"
        "movq	%[b], %%rax\n\t"
        "xorq	%%rcx, %%rcx\n\t"
        "mulq	88(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "movq	%%r8, 88(%[r])\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "# A[12] * B\n\t"
        "movq	%[b], %%rax\n\t"
        "xorq	%%r8, %%r8\n\t"
        "mulq	96(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "movq	%%rbx, 96(%[r])\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "# A[13] * B\n\t"
        "movq	%[b], %%rax\n\t"
        "xorq	%%rbx, %%rbx\n\t"
        "mulq	104(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "movq	%%rcx, 104(%[r])\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "# A[14] * B\n\t"
        "movq	%[b], %%rax\n\t"
        "xorq	%%rcx, %%rcx\n\t"
        "mulq	112(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "movq	%%r8, 112(%[r])\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "# A[15] * B\n\t"
        "movq	%[b], %%rax\n\t"
        "xorq	%%r8, %%r8\n\t"
        "mulq	120(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "movq	%%rbx, 120(%[r])\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "# A[16] * B\n\t"
        "movq	%[b], %%rax\n\t"
        "xorq	%%rbx, %%rbx\n\t"
        "mulq	128(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "movq	%%rcx, 128(%[r])\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "# A[17] * B\n\t"
        "movq	%[b], %%rax\n\t"
        "xorq	%%rcx, %%rcx\n\t"
        "mulq	136(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "movq	%%r8, 136(%[r])\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "# A[18] * B\n\t"
        "movq	%[b], %%rax\n\t"
        "xorq	%%r8, %%r8\n\t"
        "mulq	144(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "movq	%%rbx, 144(%[r])\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "# A[19] * B\n\t"
        "movq	%[b], %%rax\n\t"
        "xorq	%%rbx, %%rbx\n\t"
        "mulq	152(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "movq	%%rcx, 152(%[r])\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "# A[20] * B\n\t"
        "movq	%[b], %%rax\n\t"
        "xorq	%%rcx, %%rcx\n\t"
        "mulq	160(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "movq	%%r8, 160(%[r])\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "# A[21] * B\n\t"
        "movq	%[b], %%rax\n\t"
        "xorq	%%r8, %%r8\n\t"
        "mulq	168(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "movq	%%rbx, 168(%[r])\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "# A[22] * B\n\t"
        "movq	%[b], %%rax\n\t"
        "xorq	%%rbx, %%rbx\n\t"
        "mulq	176(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "movq	%%rcx, 176(%[r])\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "# A[23] * B\n\t"
        "movq	%[b], %%rax\n\t"
        "xorq	%%rcx, %%rcx\n\t"
        "mulq	184(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "movq	%%r8, 184(%[r])\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "# A[24] * B\n\t"
        "movq	%[b], %%rax\n\t"
        "xorq	%%r8, %%r8\n\t"
        "mulq	192(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "movq	%%rbx, 192(%[r])\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "# A[25] * B\n\t"
        "movq	%[b], %%rax\n\t"
        "xorq	%%rbx, %%rbx\n\t"
        "mulq	200(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "movq	%%rcx, 200(%[r])\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "# A[26] * B\n\t"
        "movq	%[b], %%rax\n\t"
        "xorq	%%rcx, %%rcx\n\t"
        "mulq	208(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "movq	%%r8, 208(%[r])\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "# A[27] * B\n\t"
        "movq	%[b], %%rax\n\t"
        "xorq	%%r8, %%r8\n\t"
        "mulq	216(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "movq	%%rbx, 216(%[r])\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "# A[28] * B\n\t"
        "movq	%[b], %%rax\n\t"
        "xorq	%%rbx, %%rbx\n\t"
        "mulq	224(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "movq	%%rcx, 224(%[r])\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "# A[29] * B\n\t"
        "movq	%[b], %%rax\n\t"
        "xorq	%%rcx, %%rcx\n\t"
        "mulq	232(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "movq	%%r8, 232(%[r])\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "# A[30] * B\n\t"
        "movq	%[b], %%rax\n\t"
        "xorq	%%r8, %%r8\n\t"
        "mulq	240(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "movq	%%rbx, 240(%[r])\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "# A[31] * B\n\t"
        "movq	%[b], %%rax\n\t"
        "mulq	248(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "movq	%%rcx, 248(%[r])\n\t"
        "movq	%%r8, 256(%[r])\n\t"
        :
        : [r] "r" (r), [a] "r" (a), [b] "r" (b)
        : "memory", "rax", "rdx", "rbx", "rcx", "r8"
    );
}

#ifdef HAVE_INTEL_AVX2
/* Multiply a by the single digit b into r. (r = a * b)
 *
 * Variant of sp_2048_mul_d_32() built on the mulx, adcx and adox
 * instructions. The division routines in this file only call it after
 * checking IS_INTEL_BMI2() and IS_INTEL_ADX() on the CPU flags.
 *
 * The 32 digits a[0..31] are read and 33 digits r[0..32] are written; the
 * extra top digit r[32] receives the final carry out of the multiplication.
 *
 * r  A single precision integer. Must have room for 33 digits.
 * a  A single precision integer (32 digits).
 * b  A single precision digit.
 */
SP_NOINLINE static void sp_2048_mul_d_avx2_32(sp_digit* r, const sp_digit* a,
        const sp_digit b)
{
    /* b is held in rdx, the implicit multiplicand of mulx, for the whole
     * sequence. r10 is zeroed once and copied to start each new accumulator;
     * r8 and r9 alternate as the word being completed and the next word.
     * Low product halves are added with adcx and high halves with adox.
     */
    __asm__ __volatile__ (
        "# A[0] * B\n\t"
        "movq	%[b], %%rdx\n\t"
        "xorq	%%r10, %%r10\n\t"
        "mulxq	(%[a]), %%r8, %%r9\n\t"
        "movq	%%r8, 0(%[r])\n\t"
        "# A[1] * B\n\t"
        "mulxq	8(%[a]), %%rax, %%rcx\n\t"
        "movq	%%r10, %%r8\n\t"
        "adcxq	%%rax, %%r9\n\t"
        "movq	%%r9, 8(%[r])\n\t"
        "adoxq	%%rcx, %%r8\n\t"
        "# A[2] * B\n\t"
        "mulxq	16(%[a]), %%rax, %%rcx\n\t"
        "movq	%%r10, %%r9\n\t"
        "adcxq	%%rax, %%r8\n\t"
        "movq	%%r8, 16(%[r])\n\t"
        "adoxq	%%rcx, %%r9\n\t"
        "# A[3] * B\n\t"
        "mulxq	24(%[a]), %%rax, %%rcx\n\t"
        "movq	%%r10, %%r8\n\t"
        "adcxq	%%rax, %%r9\n\t"
        "movq	%%r9, 24(%[r])\n\t"
        "adoxq	%%rcx, %%r8\n\t"
        "# A[4] * B\n\t"
        "mulxq	32(%[a]), %%rax, %%rcx\n\t"
        "movq	%%r10, %%r9\n\t"
        "adcxq	%%rax, %%r8\n\t"
        "movq	%%r8, 32(%[r])\n\t"
        "adoxq	%%rcx, %%r9\n\t"
        "# A[5] * B\n\t"
        "mulxq	40(%[a]), %%rax, %%rcx\n\t"
        "movq	%%r10, %%r8\n\t"
        "adcxq	%%rax, %%r9\n\t"
        "movq	%%r9, 40(%[r])\n\t"
        "adoxq	%%rcx, %%r8\n\t"
        "# A[6] * B\n\t"
        "mulxq	48(%[a]), %%rax, %%rcx\n\t"
        "movq	%%r10, %%r9\n\t"
        "adcxq	%%rax, %%r8\n\t"
        "movq	%%r8, 48(%[r])\n\t"
        "adoxq	%%rcx, %%r9\n\t"
        "# A[7] * B\n\t"
        "mulxq	56(%[a]), %%rax, %%rcx\n\t"
        "movq	%%r10, %%r8\n\t"
        "adcxq	%%rax, %%r9\n\t"
        "movq	%%r9, 56(%[r])\n\t"
        "adoxq	%%rcx, %%r8\n\t"
        "# A[8] * B\n\t"
        "mulxq	64(%[a]), %%rax, %%rcx\n\t"
        "movq	%%r10, %%r9\n\t"
        "adcxq	%%rax, %%r8\n\t"
        "movq	%%r8, 64(%[r])\n\t"
        "adoxq	%%rcx, %%r9\n\t"
        "# A[9] * B\n\t"
        "mulxq	72(%[a]), %%rax, %%rcx\n\t"
        "movq	%%r10, %%r8\n\t"
        "adcxq	%%rax, %%r9\n\t"
        "movq	%%r9, 72(%[r])\n\t"
        "adoxq	%%rcx, %%r8\n\t"
        "# A[10] * B\n\t"
        "mulxq	80(%[a]), %%rax, %%rcx\n\t"
        "movq	%%r10, %%r9\n\t"
        "adcxq	%%rax, %%r8\n\t"
        "movq	%%r8, 80(%[r])\n\t"
        "adoxq	%%rcx, %%r9\n\t"
        "# A[11] * B\n\t"
        "mulxq	88(%[a]), %%rax, %%rcx\n\t"
        "movq	%%r10, %%r8\n\t"
        "adcxq	%%rax, %%r9\n\t"
        "movq	%%r9, 88(%[r])\n\t"
        "adoxq	%%rcx, %%r8\n\t"
        "# A[12] * B\n\t"
        "mulxq	96(%[a]), %%rax, %%rcx\n\t"
        "movq	%%r10, %%r9\n\t"
        "adcxq	%%rax, %%r8\n\t"
        "movq	%%r8, 96(%[r])\n\t"
        "adoxq	%%rcx, %%r9\n\t"
        "# A[13] * B\n\t"
        "mulxq	104(%[a]), %%rax, %%rcx\n\t"
        "movq	%%r10, %%r8\n\t"
        "adcxq	%%rax, %%r9\n\t"
        "movq	%%r9, 104(%[r])\n\t"
        "adoxq	%%rcx, %%r8\n\t"
        "# A[14] * B\n\t"
        "mulxq	112(%[a]), %%rax, %%rcx\n\t"
        "movq	%%r10, %%r9\n\t"
        "adcxq	%%rax, %%r8\n\t"
        "movq	%%r8, 112(%[r])\n\t"
        "adoxq	%%rcx, %%r9\n\t"
        "# A[15] * B\n\t"
        "mulxq	120(%[a]), %%rax, %%rcx\n\t"
        "movq	%%r10, %%r8\n\t"
        "adcxq	%%rax, %%r9\n\t"
        "movq	%%r9, 120(%[r])\n\t"
        "adoxq	%%rcx, %%r8\n\t"
        "# A[16] * B\n\t"
        "mulxq	128(%[a]), %%rax, %%rcx\n\t"
        "movq	%%r10, %%r9\n\t"
        "adcxq	%%rax, %%r8\n\t"
        "movq	%%r8, 128(%[r])\n\t"
        "adoxq	%%rcx, %%r9\n\t"
        "# A[17] * B\n\t"
        "mulxq	136(%[a]), %%rax, %%rcx\n\t"
        "movq	%%r10, %%r8\n\t"
        "adcxq	%%rax, %%r9\n\t"
        "movq	%%r9, 136(%[r])\n\t"
        "adoxq	%%rcx, %%r8\n\t"
        "# A[18] * B\n\t"
        "mulxq	144(%[a]), %%rax, %%rcx\n\t"
        "movq	%%r10, %%r9\n\t"
        "adcxq	%%rax, %%r8\n\t"
        "movq	%%r8, 144(%[r])\n\t"
        "adoxq	%%rcx, %%r9\n\t"
        "# A[19] * B\n\t"
        "mulxq	152(%[a]), %%rax, %%rcx\n\t"
        "movq	%%r10, %%r8\n\t"
        "adcxq	%%rax, %%r9\n\t"
        "movq	%%r9, 152(%[r])\n\t"
        "adoxq	%%rcx, %%r8\n\t"
        "# A[20] * B\n\t"
        "mulxq	160(%[a]), %%rax, %%rcx\n\t"
        "movq	%%r10, %%r9\n\t"
        "adcxq	%%rax, %%r8\n\t"
        "movq	%%r8, 160(%[r])\n\t"
        "adoxq	%%rcx, %%r9\n\t"
        "# A[21] * B\n\t"
        "mulxq	168(%[a]), %%rax, %%rcx\n\t"
        "movq	%%r10, %%r8\n\t"
        "adcxq	%%rax, %%r9\n\t"
        "movq	%%r9, 168(%[r])\n\t"
        "adoxq	%%rcx, %%r8\n\t"
        "# A[22] * B\n\t"
        "mulxq	176(%[a]), %%rax, %%rcx\n\t"
        "movq	%%r10, %%r9\n\t"
        "adcxq	%%rax, %%r8\n\t"
        "movq	%%r8, 176(%[r])\n\t"
        "adoxq	%%rcx, %%r9\n\t"
        "# A[23] * B\n\t"
        "mulxq	184(%[a]), %%rax, %%rcx\n\t"
        "movq	%%r10, %%r8\n\t"
        "adcxq	%%rax, %%r9\n\t"
        "movq	%%r9, 184(%[r])\n\t"
        "adoxq	%%rcx, %%r8\n\t"
        "# A[24] * B\n\t"
        "mulxq	192(%[a]), %%rax, %%rcx\n\t"
        "movq	%%r10, %%r9\n\t"
        "adcxq	%%rax, %%r8\n\t"
        "movq	%%r8, 192(%[r])\n\t"
        "adoxq	%%rcx, %%r9\n\t"
        "# A[25] * B\n\t"
        "mulxq	200(%[a]), %%rax, %%rcx\n\t"
        "movq	%%r10, %%r8\n\t"
        "adcxq	%%rax, %%r9\n\t"
        "movq	%%r9, 200(%[r])\n\t"
        "adoxq	%%rcx, %%r8\n\t"
        "# A[26] * B\n\t"
        "mulxq	208(%[a]), %%rax, %%rcx\n\t"
        "movq	%%r10, %%r9\n\t"
        "adcxq	%%rax, %%r8\n\t"
        "movq	%%r8, 208(%[r])\n\t"
        "adoxq	%%rcx, %%r9\n\t"
        "# A[27] * B\n\t"
        "mulxq	216(%[a]), %%rax, %%rcx\n\t"
        "movq	%%r10, %%r8\n\t"
        "adcxq	%%rax, %%r9\n\t"
        "movq	%%r9, 216(%[r])\n\t"
        "adoxq	%%rcx, %%r8\n\t"
        "# A[28] * B\n\t"
        "mulxq	224(%[a]), %%rax, %%rcx\n\t"
        "movq	%%r10, %%r9\n\t"
        "adcxq	%%rax, %%r8\n\t"
        "movq	%%r8, 224(%[r])\n\t"
        "adoxq	%%rcx, %%r9\n\t"
        "# A[29] * B\n\t"
        "mulxq	232(%[a]), %%rax, %%rcx\n\t"
        "movq	%%r10, %%r8\n\t"
        "adcxq	%%rax, %%r9\n\t"
        "movq	%%r9, 232(%[r])\n\t"
        "adoxq	%%rcx, %%r8\n\t"
        "# A[30] * B\n\t"
        "mulxq	240(%[a]), %%rax, %%rcx\n\t"
        "movq	%%r10, %%r9\n\t"
        "adcxq	%%rax, %%r8\n\t"
        "movq	%%r8, 240(%[r])\n\t"
        "adoxq	%%rcx, %%r9\n\t"
        "# A[31] * B\n\t"
        "mulxq	248(%[a]), %%rax, %%rcx\n\t"
        "movq	%%r10, %%r8\n\t"
        "adcxq	%%rax, %%r9\n\t"
        "adoxq	%%rcx, %%r8\n\t"
        "adcxq	%%r10, %%r8\n\t"
        "movq	%%r9, 248(%[r])\n\t"
        "movq	%%r8, 256(%[r])\n\t"
        :
        : [r] "r" (r), [a] "r" (a), [b] "r" (b)
        : "memory", "rax", "rdx", "rcx", "r8", "r9", "r10"
    );
}
#endif /* HAVE_INTEL_AVX2 */

/* Divide the double width number (d1|d0) by the divisor. (d1|d0 / div)
 *
 * Implemented with a single divq instruction; only the quotient is returned
 * and the remainder is discarded.
 *
 * NOTE: divq raises a divide error when the quotient does not fit in one
 * digit, so callers must ensure d1 < div (and therefore div != 0).
 *
 * d1   The high order half of the number to divide.
 * d0   The low order half of the number to divide.
 * div  The divisor.
 * returns the quotient of the division.
 */
static sp_digit div_2048_word_32(sp_digit d1, sp_digit d0, sp_digit div)
{
    sp_digit r;

    /* divq divides rdx:rax by its operand and leaves the quotient in rax. */
    __asm__ __volatile__ (
        "movq	%[d0], %%rax\n\t"
        "movq	%[d1], %%rdx\n\t"
        "divq	%[div]\n\t"
        "movq	%%rax, %[r]\n\t"
        : [r] "=r" (r)
        : [d1] "r" (d1), [d0] "r" (d0), [div] "r" (div)
        : "rax", "rdx"
    );

    return r;
}

/* Mask every digit of a with m and store the result in r.
 * (r[i] = a[i] & m for i = 0..31)
 *
 * r  A single precision integer that receives the masked digits.
 * a  A single precision integer to be masked.
 * m  Mask to AND against each digit.
 */
static void sp_2048_mask_32(sp_digit* r, sp_digit* a, sp_digit m)
{
#ifdef WOLFSSL_SP_SMALL
    int idx = 0;

    /* Compact form: one digit per iteration, lowest digit first. */
    while (idx < 32) {
        r[idx] = a[idx] & m;
        idx++;
    }
#else
    sp_digit* dst = r;
    sp_digit* src = a;
    int pass;

    /* Four passes of eight digits each, lowest digits first. */
    for (pass = 4; pass > 0; pass--) {
        dst[0] = src[0] & m;
        dst[1] = src[1] & m;
        dst[2] = src[2] & m;
        dst[3] = src[3] & m;
        dst[4] = src[4] & m;
        dst[5] = src[5] & m;
        dst[6] = src[6] & m;
        dst[7] = src[7] & m;
        dst += 8;
        src += 8;
    }
#endif
}

/* Compare a with b in constant time.
 *
 * All 32 digits of both numbers are always read, from the most significant
 * (index 31) down to the least significant (index 0), with no branches.
 *
 * a  A single precision integer (32 digits).
 * b  A single precision integer (32 digits).
 * return -ve, 0 or +ve if a is less than, equal to or greater than b
 * respectively.
 */
static int64_t sp_2048_cmp_32(sp_digit* a, sp_digit* b)
{
    sp_digit r = -1;
    sp_digit one = 1;

    /* rdx is an "undecided" mask that starts as all ones and rcx is zero.
     * For each digit pair, both digits are ANDed with the mask and
     * subtracted: r becomes 1 when a's digit is above b's, or the mask
     * (-1) when it is below, and the mask is then cleared so that all lower
     * digits compare equal. If every digit matched, the mask is still all
     * ones and the final xor turns the initial r of -1 into 0.
     *
     * The "memory" clobber is required because the digits are read through
     * the pointers a and b rather than being passed as memory operands;
     * without it the compiler is not obliged to have written pending stores
     * to those buffers before this statement runs.
     */
    __asm__ __volatile__ (
        "xorq	%%rcx, %%rcx\n\t"
        "movq	$-1, %%rdx\n\t"
        "movq	248(%[a]), %%rbx\n\t"
        "movq	248(%[b]), %%r8\n\t"
        "andq	%%rdx, %%rbx\n\t"
        "andq	%%rdx, %%r8\n\t"
        "subq	%%r8, %%rbx\n\t"
        "cmova	%[one], %[r]\n\t"
        "cmovc	%%rdx, %[r]\n\t"
        "cmovnz	%%rcx, %%rdx\n\t"
        "movq	240(%[a]), %%rbx\n\t"
        "movq	240(%[b]), %%r8\n\t"
        "andq	%%rdx, %%rbx\n\t"
        "andq	%%rdx, %%r8\n\t"
        "subq	%%r8, %%rbx\n\t"
        "cmova	%[one], %[r]\n\t"
        "cmovc	%%rdx, %[r]\n\t"
        "cmovnz	%%rcx, %%rdx\n\t"
        "movq	232(%[a]), %%rbx\n\t"
        "movq	232(%[b]), %%r8\n\t"
        "andq	%%rdx, %%rbx\n\t"
        "andq	%%rdx, %%r8\n\t"
        "subq	%%r8, %%rbx\n\t"
        "cmova	%[one], %[r]\n\t"
        "cmovc	%%rdx, %[r]\n\t"
        "cmovnz	%%rcx, %%rdx\n\t"
        "movq	224(%[a]), %%rbx\n\t"
        "movq	224(%[b]), %%r8\n\t"
        "andq	%%rdx, %%rbx\n\t"
        "andq	%%rdx, %%r8\n\t"
        "subq	%%r8, %%rbx\n\t"
        "cmova	%[one], %[r]\n\t"
        "cmovc	%%rdx, %[r]\n\t"
        "cmovnz	%%rcx, %%rdx\n\t"
        "movq	216(%[a]), %%rbx\n\t"
        "movq	216(%[b]), %%r8\n\t"
        "andq	%%rdx, %%rbx\n\t"
        "andq	%%rdx, %%r8\n\t"
        "subq	%%r8, %%rbx\n\t"
        "cmova	%[one], %[r]\n\t"
        "cmovc	%%rdx, %[r]\n\t"
        "cmovnz	%%rcx, %%rdx\n\t"
        "movq	208(%[a]), %%rbx\n\t"
        "movq	208(%[b]), %%r8\n\t"
        "andq	%%rdx, %%rbx\n\t"
        "andq	%%rdx, %%r8\n\t"
        "subq	%%r8, %%rbx\n\t"
        "cmova	%[one], %[r]\n\t"
        "cmovc	%%rdx, %[r]\n\t"
        "cmovnz	%%rcx, %%rdx\n\t"
        "movq	200(%[a]), %%rbx\n\t"
        "movq	200(%[b]), %%r8\n\t"
        "andq	%%rdx, %%rbx\n\t"
        "andq	%%rdx, %%r8\n\t"
        "subq	%%r8, %%rbx\n\t"
        "cmova	%[one], %[r]\n\t"
        "cmovc	%%rdx, %[r]\n\t"
        "cmovnz	%%rcx, %%rdx\n\t"
        "movq	192(%[a]), %%rbx\n\t"
        "movq	192(%[b]), %%r8\n\t"
        "andq	%%rdx, %%rbx\n\t"
        "andq	%%rdx, %%r8\n\t"
        "subq	%%r8, %%rbx\n\t"
        "cmova	%[one], %[r]\n\t"
        "cmovc	%%rdx, %[r]\n\t"
        "cmovnz	%%rcx, %%rdx\n\t"
        "movq	184(%[a]), %%rbx\n\t"
        "movq	184(%[b]), %%r8\n\t"
        "andq	%%rdx, %%rbx\n\t"
        "andq	%%rdx, %%r8\n\t"
        "subq	%%r8, %%rbx\n\t"
        "cmova	%[one], %[r]\n\t"
        "cmovc	%%rdx, %[r]\n\t"
        "cmovnz	%%rcx, %%rdx\n\t"
        "movq	176(%[a]), %%rbx\n\t"
        "movq	176(%[b]), %%r8\n\t"
        "andq	%%rdx, %%rbx\n\t"
        "andq	%%rdx, %%r8\n\t"
        "subq	%%r8, %%rbx\n\t"
        "cmova	%[one], %[r]\n\t"
        "cmovc	%%rdx, %[r]\n\t"
        "cmovnz	%%rcx, %%rdx\n\t"
        "movq	168(%[a]), %%rbx\n\t"
        "movq	168(%[b]), %%r8\n\t"
        "andq	%%rdx, %%rbx\n\t"
        "andq	%%rdx, %%r8\n\t"
        "subq	%%r8, %%rbx\n\t"
        "cmova	%[one], %[r]\n\t"
        "cmovc	%%rdx, %[r]\n\t"
        "cmovnz	%%rcx, %%rdx\n\t"
        "movq	160(%[a]), %%rbx\n\t"
        "movq	160(%[b]), %%r8\n\t"
        "andq	%%rdx, %%rbx\n\t"
        "andq	%%rdx, %%r8\n\t"
        "subq	%%r8, %%rbx\n\t"
        "cmova	%[one], %[r]\n\t"
        "cmovc	%%rdx, %[r]\n\t"
        "cmovnz	%%rcx, %%rdx\n\t"
        "movq	152(%[a]), %%rbx\n\t"
        "movq	152(%[b]), %%r8\n\t"
        "andq	%%rdx, %%rbx\n\t"
        "andq	%%rdx, %%r8\n\t"
        "subq	%%r8, %%rbx\n\t"
        "cmova	%[one], %[r]\n\t"
        "cmovc	%%rdx, %[r]\n\t"
        "cmovnz	%%rcx, %%rdx\n\t"
        "movq	144(%[a]), %%rbx\n\t"
        "movq	144(%[b]), %%r8\n\t"
        "andq	%%rdx, %%rbx\n\t"
        "andq	%%rdx, %%r8\n\t"
        "subq	%%r8, %%rbx\n\t"
        "cmova	%[one], %[r]\n\t"
        "cmovc	%%rdx, %[r]\n\t"
        "cmovnz	%%rcx, %%rdx\n\t"
        "movq	136(%[a]), %%rbx\n\t"
        "movq	136(%[b]), %%r8\n\t"
        "andq	%%rdx, %%rbx\n\t"
        "andq	%%rdx, %%r8\n\t"
        "subq	%%r8, %%rbx\n\t"
        "cmova	%[one], %[r]\n\t"
        "cmovc	%%rdx, %[r]\n\t"
        "cmovnz	%%rcx, %%rdx\n\t"
        "movq	128(%[a]), %%rbx\n\t"
        "movq	128(%[b]), %%r8\n\t"
        "andq	%%rdx, %%rbx\n\t"
        "andq	%%rdx, %%r8\n\t"
        "subq	%%r8, %%rbx\n\t"
        "cmova	%[one], %[r]\n\t"
        "cmovc	%%rdx, %[r]\n\t"
        "cmovnz	%%rcx, %%rdx\n\t"
        "movq	120(%[a]), %%rbx\n\t"
        "movq	120(%[b]), %%r8\n\t"
        "andq	%%rdx, %%rbx\n\t"
        "andq	%%rdx, %%r8\n\t"
        "subq	%%r8, %%rbx\n\t"
        "cmova	%[one], %[r]\n\t"
        "cmovc	%%rdx, %[r]\n\t"
        "cmovnz	%%rcx, %%rdx\n\t"
        "movq	112(%[a]), %%rbx\n\t"
        "movq	112(%[b]), %%r8\n\t"
        "andq	%%rdx, %%rbx\n\t"
        "andq	%%rdx, %%r8\n\t"
        "subq	%%r8, %%rbx\n\t"
        "cmova	%[one], %[r]\n\t"
        "cmovc	%%rdx, %[r]\n\t"
        "cmovnz	%%rcx, %%rdx\n\t"
        "movq	104(%[a]), %%rbx\n\t"
        "movq	104(%[b]), %%r8\n\t"
        "andq	%%rdx, %%rbx\n\t"
        "andq	%%rdx, %%r8\n\t"
        "subq	%%r8, %%rbx\n\t"
        "cmova	%[one], %[r]\n\t"
        "cmovc	%%rdx, %[r]\n\t"
        "cmovnz	%%rcx, %%rdx\n\t"
        "movq	96(%[a]), %%rbx\n\t"
        "movq	96(%[b]), %%r8\n\t"
        "andq	%%rdx, %%rbx\n\t"
        "andq	%%rdx, %%r8\n\t"
        "subq	%%r8, %%rbx\n\t"
        "cmova	%[one], %[r]\n\t"
        "cmovc	%%rdx, %[r]\n\t"
        "cmovnz	%%rcx, %%rdx\n\t"
        "movq	88(%[a]), %%rbx\n\t"
        "movq	88(%[b]), %%r8\n\t"
        "andq	%%rdx, %%rbx\n\t"
        "andq	%%rdx, %%r8\n\t"
        "subq	%%r8, %%rbx\n\t"
        "cmova	%[one], %[r]\n\t"
        "cmovc	%%rdx, %[r]\n\t"
        "cmovnz	%%rcx, %%rdx\n\t"
        "movq	80(%[a]), %%rbx\n\t"
        "movq	80(%[b]), %%r8\n\t"
        "andq	%%rdx, %%rbx\n\t"
        "andq	%%rdx, %%r8\n\t"
        "subq	%%r8, %%rbx\n\t"
        "cmova	%[one], %[r]\n\t"
        "cmovc	%%rdx, %[r]\n\t"
        "cmovnz	%%rcx, %%rdx\n\t"
        "movq	72(%[a]), %%rbx\n\t"
        "movq	72(%[b]), %%r8\n\t"
        "andq	%%rdx, %%rbx\n\t"
        "andq	%%rdx, %%r8\n\t"
        "subq	%%r8, %%rbx\n\t"
        "cmova	%[one], %[r]\n\t"
        "cmovc	%%rdx, %[r]\n\t"
        "cmovnz	%%rcx, %%rdx\n\t"
        "movq	64(%[a]), %%rbx\n\t"
        "movq	64(%[b]), %%r8\n\t"
        "andq	%%rdx, %%rbx\n\t"
        "andq	%%rdx, %%r8\n\t"
        "subq	%%r8, %%rbx\n\t"
        "cmova	%[one], %[r]\n\t"
        "cmovc	%%rdx, %[r]\n\t"
        "cmovnz	%%rcx, %%rdx\n\t"
        "movq	56(%[a]), %%rbx\n\t"
        "movq	56(%[b]), %%r8\n\t"
        "andq	%%rdx, %%rbx\n\t"
        "andq	%%rdx, %%r8\n\t"
        "subq	%%r8, %%rbx\n\t"
        "cmova	%[one], %[r]\n\t"
        "cmovc	%%rdx, %[r]\n\t"
        "cmovnz	%%rcx, %%rdx\n\t"
        "movq	48(%[a]), %%rbx\n\t"
        "movq	48(%[b]), %%r8\n\t"
        "andq	%%rdx, %%rbx\n\t"
        "andq	%%rdx, %%r8\n\t"
        "subq	%%r8, %%rbx\n\t"
        "cmova	%[one], %[r]\n\t"
        "cmovc	%%rdx, %[r]\n\t"
        "cmovnz	%%rcx, %%rdx\n\t"
        "movq	40(%[a]), %%rbx\n\t"
        "movq	40(%[b]), %%r8\n\t"
        "andq	%%rdx, %%rbx\n\t"
        "andq	%%rdx, %%r8\n\t"
        "subq	%%r8, %%rbx\n\t"
        "cmova	%[one], %[r]\n\t"
        "cmovc	%%rdx, %[r]\n\t"
        "cmovnz	%%rcx, %%rdx\n\t"
        "movq	32(%[a]), %%rbx\n\t"
        "movq	32(%[b]), %%r8\n\t"
        "andq	%%rdx, %%rbx\n\t"
        "andq	%%rdx, %%r8\n\t"
        "subq	%%r8, %%rbx\n\t"
        "cmova	%[one], %[r]\n\t"
        "cmovc	%%rdx, %[r]\n\t"
        "cmovnz	%%rcx, %%rdx\n\t"
        "movq	24(%[a]), %%rbx\n\t"
        "movq	24(%[b]), %%r8\n\t"
        "andq	%%rdx, %%rbx\n\t"
        "andq	%%rdx, %%r8\n\t"
        "subq	%%r8, %%rbx\n\t"
        "cmova	%[one], %[r]\n\t"
        "cmovc	%%rdx, %[r]\n\t"
        "cmovnz	%%rcx, %%rdx\n\t"
        "movq	16(%[a]), %%rbx\n\t"
        "movq	16(%[b]), %%r8\n\t"
        "andq	%%rdx, %%rbx\n\t"
        "andq	%%rdx, %%r8\n\t"
        "subq	%%r8, %%rbx\n\t"
        "cmova	%[one], %[r]\n\t"
        "cmovc	%%rdx, %[r]\n\t"
        "cmovnz	%%rcx, %%rdx\n\t"
        "movq	8(%[a]), %%rbx\n\t"
        "movq	8(%[b]), %%r8\n\t"
        "andq	%%rdx, %%rbx\n\t"
        "andq	%%rdx, %%r8\n\t"
        "subq	%%r8, %%rbx\n\t"
        "cmova	%[one], %[r]\n\t"
        "cmovc	%%rdx, %[r]\n\t"
        "cmovnz	%%rcx, %%rdx\n\t"
        "movq	0(%[a]), %%rbx\n\t"
        "movq	0(%[b]), %%r8\n\t"
        "andq	%%rdx, %%rbx\n\t"
        "andq	%%rdx, %%r8\n\t"
        "subq	%%r8, %%rbx\n\t"
        "cmova	%[one], %[r]\n\t"
        "cmovc	%%rdx, %[r]\n\t"
        "cmovnz	%%rcx, %%rdx\n\t"
        "xorq	%%rdx, %[r]\n\t"
        : [r] "+r" (r)
        : [a] "r" (a), [b] "r" (b), [one] "r" (one)
        : "memory", "rax", "rdx", "rcx", "rbx", "r8"
    );

    return r;
}

/* Divide a by d and put the remainder into r. (m*d + r = a)
 * m is not calculated as it is not needed at this time.
 *
 * Long division that produces one quotient digit per iteration. Each digit
 * is estimated from the top two digits of the running remainder and the top
 * digit of d, the estimate times d is subtracted, and d is then added back
 * (up to twice, selected by masks rather than branches) if the subtraction
 * went negative.
 *
 * NOTE(review): the estimate and the two add-backs presumably rely on d
 * having its top bit set and on the upper 32 digits of a being less than d -
 * confirm against the callers.
 *
 * a  Number to be divided. 64 digits are read.
 * d  Number to divide with.
 * m  Multiplier result. Not used.
 * r  Remainder from the division.
 * returns MP_OKAY indicating success.
 */
static INLINE int sp_2048_div_32(sp_digit* a, sp_digit* d, sp_digit* m,
        sp_digit* r)
{
    sp_digit t1[64], t2[33];
    sp_digit div, r1;
    sp_digit hi, top_eq;
    int i;
#ifdef HAVE_INTEL_AVX2
    word32 cpuid_flags = cpuid_get_flags();
#endif

    (void)m;

    div = d[31];
    XMEMCPY(t1, a, sizeof(*t1) * 2 * 32);
    for (i=31; i>=0; i--) {
        /* When the top digit of the remainder equals div the true two digit
         * by one digit quotient does not fit in a digit and divq would raise
         * a divide error. In that case divide with the high word reduced by
         * one (result discarded) and use the largest digit as the estimate.
         * top_eq is all ones in that case and zero otherwise.
         */
        top_eq = (sp_digit)0 - (sp_digit)(t1[32 + i] == div);
        hi = t1[32 + i] + top_eq;
        r1 = div_2048_word_32(hi, t1[32 + i - 1], div);
        r1 |= top_eq;

#ifdef HAVE_INTEL_AVX2
        if (IS_INTEL_BMI2(cpuid_flags) && IS_INTEL_ADX(cpuid_flags))
            sp_2048_mul_d_avx2_32(t2, d, r1);
        else
#endif
            sp_2048_mul_d_32(t2, d, r1);
        /* Subtract the 33 digit product; t1[32 + i] ends up non-zero when
         * the estimate was too large. */
        t1[32 + i] += sp_2048_sub_in_place_32(&t1[i], t2);
        t1[32 + i] -= t2[32];
        /* Add d back, masked by the top digit, at most twice. */
        sp_2048_mask_32(t2, d, t1[32 + i]);
        t1[32 + i] += sp_2048_add_32(&t1[i], &t1[i], t2);
        sp_2048_mask_32(t2, d, t1[32 + i]);
        t1[32 + i] += sp_2048_add_32(&t1[i], &t1[i], t2);
    }

    /* Final correction: subtract d itself when the remainder is not below
     * d. t2 must not be used here as it only holds the last masked copy of
     * d, which may be all zeros. */
    r1 = sp_2048_cmp_32(t1, d) >= 0;
    sp_2048_cond_sub_32(r, t1, d, (sp_digit)0 - r1);

    return MP_OKAY;
}

/* Reduce a modulo m into r. (r = a mod m)
 *
 * Thin wrapper around sp_2048_div_32() that asks only for the remainder;
 * NULL is passed for the unused multiplier result.
 *
 * r  A single precision number that is the reduced result.
 * a  A single precision number that is to be reduced. sp_2048_div_32()
 *    reads 64 digits from it.
 * m  A single precision number that is the modulus to reduce with.
 * returns MP_OKAY indicating success.
 */
static INLINE int sp_2048_mod_32(sp_digit* r, sp_digit* a, sp_digit* m)
{
    return sp_2048_div_32(a, m, NULL, r);
}

/* Divide a by d and put the remainder into r. (m*d + r = a)
 * m is not calculated as it is not needed at this time.
 *
 * Same long division as sp_2048_div_32() but the correction step uses
 * data-dependent branches instead of masks, so the running time depends on
 * the values being divided.
 *
 * NOTE(review): the estimate and the two add-backs presumably rely on d
 * having its top bit set and on the upper 32 digits of a being less than d -
 * confirm against the callers.
 *
 * a  Number to be divided. 64 digits are read.
 * d  Number to divide with.
 * m  Multiplier result. Not used.
 * r  Remainder from the division.
 * returns MP_OKAY indicating success.
 */
static INLINE int sp_2048_div_32_cond(sp_digit* a, sp_digit* d, sp_digit* m,
        sp_digit* r)
{
    sp_digit t1[64], t2[33];
    sp_digit div, r1;
    int i;
#ifdef HAVE_INTEL_AVX2
    word32 cpuid_flags = cpuid_get_flags();
#endif

    (void)m;

    div = d[31];
    XMEMCPY(t1, a, sizeof(*t1) * 2 * 32);
    for (i=31; i>=0; i--) {
        /* When the top digit of the remainder equals div the quotient does
         * not fit in a digit and divq would raise a divide error; use the
         * largest digit as the estimate instead. */
        if (t1[32 + i] == div)
            r1 = (sp_digit)0 - 1;
        else
            r1 = div_2048_word_32(t1[32 + i], t1[32 + i - 1], div);

#ifdef HAVE_INTEL_AVX2
        if (IS_INTEL_BMI2(cpuid_flags) && IS_INTEL_ADX(cpuid_flags))
            sp_2048_mul_d_avx2_32(t2, d, r1);
        else
#endif
            sp_2048_mul_d_32(t2, d, r1);
        /* Subtract the 33 digit product; t1[32 + i] ends up non-zero when
         * the estimate was too large. */
        t1[32 + i] += sp_2048_sub_in_place_32(&t1[i], t2);
        t1[32 + i] -= t2[32];
        /* Add d back, at most twice, while the top digit is non-zero. */
        if (t1[32 + i] != 0) {
            t1[32 + i] += sp_2048_add_32(&t1[i], &t1[i], d);
            if (t1[32 + i] != 0)
                t1[32 + i] += sp_2048_add_32(&t1[i], &t1[i], d);
        }
    }

    /* Final correction: subtract d itself when the remainder is not below
     * d. t2 must not be used here as it still holds the last product of d
     * and a quotient digit. */
    r1 = sp_2048_cmp_32(t1, d) >= 0;
    sp_2048_cond_sub_32(r, t1, d, (sp_digit)0 - r1);

    return MP_OKAY;
}

/* Reduce a modulo m into r. (r = a mod m)
 *
 * Thin wrapper around sp_2048_div_32_cond(), the division variant whose
 * correction step branches on the data; NULL is passed for the unused
 * multiplier result.
 *
 * r  A single precision number that is the reduced result.
 * a  A single precision number that is to be reduced.
 *    sp_2048_div_32_cond() reads 64 digits from it.
 * m  A single precision number that is the modulus to reduce with.
 * returns MP_OKAY indicating success.
 */
static INLINE int sp_2048_mod_32_cond(sp_digit* r, sp_digit* a, sp_digit* m)
{
    return sp_2048_div_32_cond(a, m, NULL, r);
}

#if defined(SP_RSA_PRIVATE_EXP_D) || defined(WOLFSSL_HAVE_SP_DH)
/* Modular exponentiate a to the e mod m. (r = a^e mod m)
 *
 * Fixed 5-bit window exponentiation in Montgomery form. A table of 32
 * entries is prepared (t[0] from sp_2048_mont_norm_32(), t[1] the converted
 * base and t[k] the k-th Montgomery power of t[1]) and the exponent is then
 * consumed five bits at a time starting at its most significant digit.
 *
 * r        A single precision number that is the result of the operation.
 * a        A single precision number being exponentiated.
 * e        A single precision number that is the exponent.
 * bits     The number of bits in the exponent.
 * m        A single precision number that is the modulus.
 * reduceA  When non-zero a is first reduced modulo m, otherwise a is used
 *          as given.
 * returns 0 on success, MEMORY_E on dynamic memory allocation failure and
 * otherwise the error returned by sp_2048_mod_32().
 */
static int sp_2048_mod_exp_32(sp_digit* r, sp_digit* a, sp_digit* e,
        int bits, sp_digit* m, int reduceA)
{
#ifdef WOLFSSL_SMALL_STACK
    sp_digit* t[32];
    sp_digit* td;
#else
    sp_digit t[32][64];
#endif
    sp_digit* norm;
    sp_digit mp = 1;
    sp_digit n;
    sp_digit mask;
    int i;
    int k;
    int c, y;
    int err = MP_OKAY;

#ifdef WOLFSSL_SMALL_STACK
    /* One heap buffer carved into 32 table entries of 64 digits each. */
    td = (sp_digit*)XMALLOC(sizeof(sp_digit) * 32 * 64, NULL,
                            DYNAMIC_TYPE_TMP_BUFFER);
    if (td == NULL) {
        err = MEMORY_E;
    }
    else {
        for (i = 0; i < 32; i++)
            t[i] = td + i * 64;
        norm = t[0];
    }
#else
    norm = t[0];
#endif

    if (err == MP_OKAY) {
        sp_2048_mont_setup(m, &mp);
        sp_2048_mont_norm_32(norm, m);

        /* Place the base in the upper half of t[1] with a zeroed lower half
         * (i.e. the base shifted up by 32 digits) and reduce modulo m. */
        XMEMSET(t[1], 0, sizeof(sp_digit) * 32);
        if (!reduceA) {
            XMEMCPY(t[1] + 32, a, sizeof(sp_digit) * 32);
        }
        else {
            err = sp_2048_mod_32(t[1] + 32, a, m);
        }
        if (err == MP_OKAY)
            err = sp_2048_mod_32(t[1], t[1], m);
    }

    if (err == MP_OKAY) {
        /* Fill t[2]..t[31]: even entries by squaring t[k], odd entries by
         * multiplying the two neighbouring entries t[k+1] and t[k]. */
        for (k = 1; k < 16; k++) {
            sp_2048_mont_sqr_32(t[2 * k], t[k], m, mp);
            sp_2048_mont_mul_32(t[2 * k + 1], t[k + 1], t[k], m, mp);
        }

        /* First window: top five bits of the most significant digit. */
        i = (bits - 1) / 64;
        n = e[i];
        i--;
        y = n >> 59;
        n <<= 5;
        c = 59;
        XMEMCPY(r, t[y], sizeof(sp_digit) * 32);

        /* c counts the exponent bits still held in n. */
        while (i >= 0 || c >= 5) {
            if (c >= 5) {
                /* Whole window available in the current digit. */
                y = (n >> 59) & 0x1f;
                n <<= 5;
                c -= 5;
            }
            else if (c == 0) {
                /* Current digit used up: start on the next one. */
                n = e[i];
                i--;
                y = n >> 59;
                n <<= 5;
                c = 59;
            }
            else {
                /* Window straddles two digits. */
                y = n >> 59;
                n = e[i];
                i--;
                c = 5 - c;
                y |= n >> (64 - c);
                n <<= c;
                c = 64 - c;
            }

            for (k = 0; k < 5; k++)
                sp_2048_mont_sqr_32(r, r, m, mp);

            sp_2048_mont_mul_32(r, r, t[y], m, mp);
        }

        /* Fewer than five bits remain: they are the low bits of e[0]. */
        y = e[0] & ((1 << c) - 1);
        while (c > 0) {
            sp_2048_mont_sqr_32(r, r, m, mp);
            c--;
        }
        sp_2048_mont_mul_32(r, r, t[y], m, mp);

        /* Leave Montgomery form and apply a final conditional subtract. */
        XMEMSET(&r[32], 0, sizeof(sp_digit) * 32);
        sp_2048_mont_reduce_32(r, m, mp);

        mask = 0 - (sp_2048_cmp_32(r, m) >= 0);
        sp_2048_cond_sub_32(r, r, m, mask);
    }

#ifdef WOLFSSL_SMALL_STACK
    if (td != NULL)
        XFREE(td, NULL, DYNAMIC_TYPE_TMP_BUFFER);
#endif

    return err;
}
#endif /* SP_RSA_PRIVATE_EXP_D || WOLFSSL_HAVE_SP_DH */

#ifdef HAVE_INTEL_AVX2
/* Reduce the number back to 2048 bits using Montgomery reduction.
 *
 * Variant written with the mulx, adcx and adox instructions. The assembly
 * loop runs 32 times; on each pass it computes mu = a[i] * mp (low 64 bits),
 * adds m[0..31] * mu into a[i..i+31] using two independent carry chains
 * (adcx and adox), folds the carry from the previous pass (ca) into a[i+32],
 * and then advances the pointer a by one digit. The highest digit touched is
 * therefore a[63]: a must address 64 digits.
 *
 * After the loop a points at the upper 32 digits; m is conditionally
 * subtracted from them (when the final carry ca is set) and the result is
 * stored back at the original start of a.
 *
 * NOTE(review): SP_NOINLINE is presumably needed because the assembly
 * defines the plain label L_mont_loop_avx2_32, which would be duplicated if
 * this function were inlined at several call sites - confirm.
 *
 * a   A single precision number to reduce in place.
 * m   The single precision number representing the modulus.
 * mp  The digit representing the negative inverse of m mod 2^n.
 */
SP_NOINLINE static void sp_2048_mont_reduce_avx2_32(sp_digit* a, sp_digit* m,
        sp_digit mp)
{
    /* Carry out of the top digit, passed from one loop pass to the next. */
    sp_digit ca = 0;

    /* Register use inside the loop:
     *   rcx       loop counter i
     *   rdx       mu (implicit multiplicand of mulx)
     *   rax, r8   low and high halves of m[j] * mu
     *   r9        zero
     *   r10, r11  alternating accumulators for a[i+j]
     *   r12       a[i+1], kept in a register for the next pass
     */
    __asm__ __volatile__ (
        "# i = 0\n\t"
        "movq	0(%[a]), %%r12\n\t"
        "xorq	%%rcx, %%rcx\n\t"
        "\nL_mont_loop_avx2_32:\n\t"
        "xorq	%%r9, %%r9\n\t"
        "movq	%%r12, %%r10\n\t"
        "# mu = a[i] * mp\n\t"
        "movq	%%r12, %%rdx\n\t"
        "mulxq	%[mp], %%rdx, %%r8\n\t"
        "# a[i+0] += m[0] * mu\n\t"
        "mulxq	0(%[m]), %%rax, %%r8\n\t"
        "movq	8(%[a]), %%r12\n\t"
        "adcxq	%%rax, %%r10\n\t"
        "adoxq	%%r8, %%r12\n\t"
        "# a[i+1] += m[1] * mu\n\t"
        "mulxq	8(%[m]), %%rax, %%r8\n\t"
        "movq	16(%[a]), %%r10\n\t"
        "adcxq	%%rax, %%r12\n\t"
        "adoxq	%%r8, %%r10\n\t"
        "# a[i+2] += m[2] * mu\n\t"
        "mulxq	16(%[m]), %%rax, %%r8\n\t"
        "movq	24(%[a]), %%r11\n\t"
        "adcxq	%%rax, %%r10\n\t"
        "adoxq	%%r8, %%r11\n\t"
        "movq	%%r10, 16(%[a])\n\t"
        "# a[i+3] += m[3] * mu\n\t"
        "mulxq	24(%[m]), %%rax, %%r8\n\t"
        "movq	32(%[a]), %%r10\n\t"
        "adcxq	%%rax, %%r11\n\t"
        "adoxq	%%r8, %%r10\n\t"
        "movq	%%r11, 24(%[a])\n\t"
        "# a[i+4] += m[4] * mu\n\t"
        "mulxq	32(%[m]), %%rax, %%r8\n\t"
        "movq	40(%[a]), %%r11\n\t"
        "adcxq	%%rax, %%r10\n\t"
        "adoxq	%%r8, %%r11\n\t"
        "movq	%%r10, 32(%[a])\n\t"
        "# a[i+5] += m[5] * mu\n\t"
        "mulxq	40(%[m]), %%rax, %%r8\n\t"
        "movq	48(%[a]), %%r10\n\t"
        "adcxq	%%rax, %%r11\n\t"
        "adoxq	%%r8, %%r10\n\t"
        "movq	%%r11, 40(%[a])\n\t"
        "# a[i+6] += m[6] * mu\n\t"
        "mulxq	48(%[m]), %%rax, %%r8\n\t"
        "movq	56(%[a]), %%r11\n\t"
        "adcxq	%%rax, %%r10\n\t"
        "adoxq	%%r8, %%r11\n\t"
        "movq	%%r10, 48(%[a])\n\t"
        "# a[i+7] += m[7] * mu\n\t"
        "mulxq	56(%[m]), %%rax, %%r8\n\t"
        "movq	64(%[a]), %%r10\n\t"
        "adcxq	%%rax, %%r11\n\t"
        "adoxq	%%r8, %%r10\n\t"
        "movq	%%r11, 56(%[a])\n\t"
        "# a[i+8] += m[8] * mu\n\t"
        "mulxq	64(%[m]), %%rax, %%r8\n\t"
        "movq	72(%[a]), %%r11\n\t"
        "adcxq	%%rax, %%r10\n\t"
        "adoxq	%%r8, %%r11\n\t"
        "movq	%%r10, 64(%[a])\n\t"
        "# a[i+9] += m[9] * mu\n\t"
        "mulxq	72(%[m]), %%rax, %%r8\n\t"
        "movq	80(%[a]), %%r10\n\t"
        "adcxq	%%rax, %%r11\n\t"
        "adoxq	%%r8, %%r10\n\t"
        "movq	%%r11, 72(%[a])\n\t"
        "# a[i+10] += m[10] * mu\n\t"
        "mulxq	80(%[m]), %%rax, %%r8\n\t"
        "movq	88(%[a]), %%r11\n\t"
        "adcxq	%%rax, %%r10\n\t"
        "adoxq	%%r8, %%r11\n\t"
        "movq	%%r10, 80(%[a])\n\t"
        "# a[i+11] += m[11] * mu\n\t"
        "mulxq	88(%[m]), %%rax, %%r8\n\t"
        "movq	96(%[a]), %%r10\n\t"
        "adcxq	%%rax, %%r11\n\t"
        "adoxq	%%r8, %%r10\n\t"
        "movq	%%r11, 88(%[a])\n\t"
        "# a[i+12] += m[12] * mu\n\t"
        "mulxq	96(%[m]), %%rax, %%r8\n\t"
        "movq	104(%[a]), %%r11\n\t"
        "adcxq	%%rax, %%r10\n\t"
        "adoxq	%%r8, %%r11\n\t"
        "movq	%%r10, 96(%[a])\n\t"
        "# a[i+13] += m[13] * mu\n\t"
        "mulxq	104(%[m]), %%rax, %%r8\n\t"
        "movq	112(%[a]), %%r10\n\t"
        "adcxq	%%rax, %%r11\n\t"
        "adoxq	%%r8, %%r10\n\t"
        "movq	%%r11, 104(%[a])\n\t"
        "# a[i+14] += m[14] * mu\n\t"
        "mulxq	112(%[m]), %%rax, %%r8\n\t"
        "movq	120(%[a]), %%r11\n\t"
        "adcxq	%%rax, %%r10\n\t"
        "adoxq	%%r8, %%r11\n\t"
        "movq	%%r10, 112(%[a])\n\t"
        "# a[i+15] += m[15] * mu\n\t"
        "mulxq	120(%[m]), %%rax, %%r8\n\t"
        "movq	128(%[a]), %%r10\n\t"
        "adcxq	%%rax, %%r11\n\t"
        "adoxq	%%r8, %%r10\n\t"
        "movq	%%r11, 120(%[a])\n\t"
        "# a[i+16] += m[16] * mu\n\t"
        "mulxq	128(%[m]), %%rax, %%r8\n\t"
        "movq	136(%[a]), %%r11\n\t"
        "adcxq	%%rax, %%r10\n\t"
        "adoxq	%%r8, %%r11\n\t"
        "movq	%%r10, 128(%[a])\n\t"
        "# a[i+17] += m[17] * mu\n\t"
        "mulxq	136(%[m]), %%rax, %%r8\n\t"
        "movq	144(%[a]), %%r10\n\t"
        "adcxq	%%rax, %%r11\n\t"
        "adoxq	%%r8, %%r10\n\t"
        "movq	%%r11, 136(%[a])\n\t"
        "# a[i+18] += m[18] * mu\n\t"
        "mulxq	144(%[m]), %%rax, %%r8\n\t"
        "movq	152(%[a]), %%r11\n\t"
        "adcxq	%%rax, %%r10\n\t"
        "adoxq	%%r8, %%r11\n\t"
        "movq	%%r10, 144(%[a])\n\t"
        "# a[i+19] += m[19] * mu\n\t"
        "mulxq	152(%[m]), %%rax, %%r8\n\t"
        "movq	160(%[a]), %%r10\n\t"
        "adcxq	%%rax, %%r11\n\t"
        "adoxq	%%r8, %%r10\n\t"
        "movq	%%r11, 152(%[a])\n\t"
        "# a[i+20] += m[20] * mu\n\t"
        "mulxq	160(%[m]), %%rax, %%r8\n\t"
        "movq	168(%[a]), %%r11\n\t"
        "adcxq	%%rax, %%r10\n\t"
        "adoxq	%%r8, %%r11\n\t"
        "movq	%%r10, 160(%[a])\n\t"
        "# a[i+21] += m[21] * mu\n\t"
        "mulxq	168(%[m]), %%rax, %%r8\n\t"
        "movq	176(%[a]), %%r10\n\t"
        "adcxq	%%rax, %%r11\n\t"
        "adoxq	%%r8, %%r10\n\t"
        "movq	%%r11, 168(%[a])\n\t"
        "# a[i+22] += m[22] * mu\n\t"
        "mulxq	176(%[m]), %%rax, %%r8\n\t"
        "movq	184(%[a]), %%r11\n\t"
        "adcxq	%%rax, %%r10\n\t"
        "adoxq	%%r8, %%r11\n\t"
        "movq	%%r10, 176(%[a])\n\t"
        "# a[i+23] += m[23] * mu\n\t"
        "mulxq	184(%[m]), %%rax, %%r8\n\t"
        "movq	192(%[a]), %%r10\n\t"
        "adcxq	%%rax, %%r11\n\t"
        "adoxq	%%r8, %%r10\n\t"
        "movq	%%r11, 184(%[a])\n\t"
        "# a[i+24] += m[24] * mu\n\t"
        "mulxq	192(%[m]), %%rax, %%r8\n\t"
        "movq	200(%[a]), %%r11\n\t"
        "adcxq	%%rax, %%r10\n\t"
        "adoxq	%%r8, %%r11\n\t"
        "movq	%%r10, 192(%[a])\n\t"
        "# a[i+25] += m[25] * mu\n\t"
        "mulxq	200(%[m]), %%rax, %%r8\n\t"
        "movq	208(%[a]), %%r10\n\t"
        "adcxq	%%rax, %%r11\n\t"
        "adoxq	%%r8, %%r10\n\t"
        "movq	%%r11, 200(%[a])\n\t"
        "# a[i+26] += m[26] * mu\n\t"
        "mulxq	208(%[m]), %%rax, %%r8\n\t"
        "movq	216(%[a]), %%r11\n\t"
        "adcxq	%%rax, %%r10\n\t"
        "adoxq	%%r8, %%r11\n\t"
        "movq	%%r10, 208(%[a])\n\t"
        "# a[i+27] += m[27] * mu\n\t"
        "mulxq	216(%[m]), %%rax, %%r8\n\t"
        "movq	224(%[a]), %%r10\n\t"
        "adcxq	%%rax, %%r11\n\t"
        "adoxq	%%r8, %%r10\n\t"
        "movq	%%r11, 216(%[a])\n\t"
        "# a[i+28] += m[28] * mu\n\t"
        "mulxq	224(%[m]), %%rax, %%r8\n\t"
        "movq	232(%[a]), %%r11\n\t"
        "adcxq	%%rax, %%r10\n\t"
        "adoxq	%%r8, %%r11\n\t"
        "movq	%%r10, 224(%[a])\n\t"
        "# a[i+29] += m[29] * mu\n\t"
        "mulxq	232(%[m]), %%rax, %%r8\n\t"
        "movq	240(%[a]), %%r10\n\t"
        "adcxq	%%rax, %%r11\n\t"
        "adoxq	%%r8, %%r10\n\t"
        "movq	%%r11, 232(%[a])\n\t"
        "# a[i+30] += m[30] * mu\n\t"
        "mulxq	240(%[m]), %%rax, %%r8\n\t"
        "movq	248(%[a]), %%r11\n\t"
        "adcxq	%%rax, %%r10\n\t"
        "adoxq	%%r8, %%r11\n\t"
        "movq	%%r10, 240(%[a])\n\t"
        "# a[i+31] += m[31] * mu\n\t"
        "mulxq	248(%[m]), %%rax, %%r8\n\t"
        "movq	256(%[a]), %%r10\n\t"
        "adcxq	%%rax, %%r11\n\t"
        "adoxq	%%r8, %%r10\n\t"
        "movq	%%r11, 248(%[a])\n\t"
        "adcxq	%[ca], %%r10\n\t"
        "movq	%%r9, %[ca]\n\t"
        "adoxq	%%r9, %[ca]\n\t"
        "adcxq	%%r9, %[ca]\n\t"
        "movq	%%r10, 256(%[a])\n\t"
        "# i += 1\n\t"
        "addq	$8, %[a]\n\t"
        "addq	$1, %%rcx\n\t"
        "cmpq	$32, %%rcx\n\t"
        "jl	L_mont_loop_avx2_32\n\t"
        "movq	%%r12, 0(%[a])\n\t"
        : [ca] "+r" (ca), [a] "+r" (a)
        : [m] "r" (m), [mp] "r" (mp)
        : "memory", "rax", "rdx", "rcx", "r8", "r9", "r10", "r11", "r12"
    );

    /* a was advanced by 32 digits in the loop: subtract m from the upper
     * half when the carry is set (mask is all ones) and write the result to
     * the original lower half. */
    sp_2048_cond_sub_32(a - 32, a, m, (sp_digit)0 - ca);
}
#endif /* HAVE_INTEL_AVX2 */

#ifdef HAVE_INTEL_AVX2
/* Multiply two Montgomery form numbers mod the modulus (prime).
 * (r = a * b mod m)
 *
 * The full product is computed into r by sp_2048_mul_avx2_32() and then
 * reduced in place by sp_2048_mont_reduce_avx2_32().
 *
 * r   Result of multiplication.
 * a   First number to multiply in Montgomery form.
 * b   Second number to multiply in Montgomery form.
 * m   Modulus (prime).
 * mp  Montgomery multiplier.
 */
static void sp_2048_mont_mul_avx2_32(sp_digit* r, sp_digit* a, sp_digit* b,
        sp_digit* m, sp_digit mp)
{
    sp_2048_mul_avx2_32(r, a, b);
    sp_2048_mont_reduce_avx2_32(r, m, mp);
}

#endif /* HAVE_INTEL_AVX2 */
#ifdef HAVE_INTEL_AVX2
/* Square the Montgomery form number. (r = a * a mod m)
 *
 * The full square is computed into r by sp_2048_sqr_avx2_32() and then
 * reduced in place by sp_2048_mont_reduce_avx2_32().
 *
 * r   Result of squaring.
 * a   Number to square in Montgomery form.
 * m   Modulus (prime).
 * mp  Montgomery multiplier.
 */
static void sp_2048_mont_sqr_avx2_32(sp_digit* r, sp_digit* a, sp_digit* m,
        sp_digit mp)
{
    sp_2048_sqr_avx2_32(r, a);
    sp_2048_mont_reduce_avx2_32(r, m, mp);
}

#endif /* HAVE_INTEL_AVX2 */
#if defined(SP_RSA_PRIVATE_EXP_D) || defined(WOLFSSL_HAVE_SP_DH)
#ifdef HAVE_INTEL_AVX2
/* Modular exponentiate a to the e mod m. (r = a^e mod m)
 *
 * Fixed 5-bit window exponentiation in Montgomery form built on the AVX2
 * multiply, square and reduce routines. A table of 32 entries is prepared
 * (t[0] from sp_2048_mont_norm_32(), t[1] the converted base and t[k] the
 * k-th Montgomery power of t[1]) and the exponent is then consumed five bits
 * at a time starting at its most significant digit.
 *
 * r        A single precision number that is the result of the operation.
 * a        A single precision number being exponentiated.
 * e        A single precision number that is the exponent.
 * bits     The number of bits in the exponent.
 * m        A single precision number that is the modulus.
 * reduceA  When non-zero a is first reduced modulo m, otherwise a is used
 *          as given.
 * returns 0 on success, MEMORY_E on dynamic memory allocation failure and
 * otherwise the error returned by sp_2048_mod_32().
 */
static int sp_2048_mod_exp_avx2_32(sp_digit* r, sp_digit* a, sp_digit* e,
        int bits, sp_digit* m, int reduceA)
{
#ifdef WOLFSSL_SMALL_STACK
    sp_digit* t[32];
    sp_digit* td;
#else
    sp_digit t[32][64];
#endif
    sp_digit* norm;
    sp_digit mp = 1;
    sp_digit n;
    sp_digit mask;
    int i;
    int k;
    int c, y;
    int err = MP_OKAY;

#ifdef WOLFSSL_SMALL_STACK
    /* One heap buffer carved into 32 table entries of 64 digits each. */
    td = (sp_digit*)XMALLOC(sizeof(sp_digit) * 32 * 64, NULL,
                            DYNAMIC_TYPE_TMP_BUFFER);
    if (td == NULL) {
        err = MEMORY_E;
    }
    else {
        for (i = 0; i < 32; i++)
            t[i] = td + i * 64;
        norm = t[0];
    }
#else
    norm = t[0];
#endif

    if (err == MP_OKAY) {
        sp_2048_mont_setup(m, &mp);
        sp_2048_mont_norm_32(norm, m);

        /* Place the base in the upper half of t[1] with a zeroed lower half
         * (i.e. the base shifted up by 32 digits) and reduce modulo m. */
        XMEMSET(t[1], 0, sizeof(sp_digit) * 32);
        if (!reduceA) {
            XMEMCPY(t[1] + 32, a, sizeof(sp_digit) * 32);
        }
        else {
            err = sp_2048_mod_32(t[1] + 32, a, m);
        }
        if (err == MP_OKAY)
            err = sp_2048_mod_32(t[1], t[1], m);
    }

    if (err == MP_OKAY) {
        /* Fill t[2]..t[31]: even entries by squaring t[k], odd entries by
         * multiplying the two neighbouring entries t[k+1] and t[k]. */
        for (k = 1; k < 16; k++) {
            sp_2048_mont_sqr_avx2_32(t[2 * k], t[k], m, mp);
            sp_2048_mont_mul_avx2_32(t[2 * k + 1], t[k + 1], t[k], m, mp);
        }

        /* First window: top five bits of the most significant digit. */
        i = (bits - 1) / 64;
        n = e[i];
        i--;
        y = n >> 59;
        n <<= 5;
        c = 59;
        XMEMCPY(r, t[y], sizeof(sp_digit) * 32);

        /* c counts the exponent bits still held in n. */
        while (i >= 0 || c >= 5) {
            if (c >= 5) {
                /* Whole window available in the current digit. */
                y = (n >> 59) & 0x1f;
                n <<= 5;
                c -= 5;
            }
            else if (c == 0) {
                /* Current digit used up: start on the next one. */
                n = e[i];
                i--;
                y = n >> 59;
                n <<= 5;
                c = 59;
            }
            else {
                /* Window straddles two digits. */
                y = n >> 59;
                n = e[i];
                i--;
                c = 5 - c;
                y |= n >> (64 - c);
                n <<= c;
                c = 64 - c;
            }

            for (k = 0; k < 5; k++)
                sp_2048_mont_sqr_avx2_32(r, r, m, mp);

            sp_2048_mont_mul_avx2_32(r, r, t[y], m, mp);
        }

        /* Fewer than five bits remain: they are the low bits of e[0]. */
        y = e[0] & ((1 << c) - 1);
        while (c > 0) {
            sp_2048_mont_sqr_avx2_32(r, r, m, mp);
            c--;
        }
        sp_2048_mont_mul_avx2_32(r, r, t[y], m, mp);

        /* Leave Montgomery form and apply a final conditional subtract. */
        XMEMSET(&r[32], 0, sizeof(sp_digit) * 32);
        sp_2048_mont_reduce_avx2_32(r, m, mp);

        mask = 0 - (sp_2048_cmp_32(r, m) >= 0);
        sp_2048_cond_sub_32(r, r, m, mask);
    }

#ifdef WOLFSSL_SMALL_STACK
    if (td != NULL)
        XFREE(td, NULL, DYNAMIC_TYPE_TMP_BUFFER);
#endif

    return err;
}
#endif /* HAVE_INTEL_AVX2 */
#endif /* SP_RSA_PRIVATE_EXP_D || WOLFSSL_HAVE_SP_DH */

#ifdef WOLFSSL_HAVE_SP_RSA
/* RSA public key operation.
 *
 * The public exponent must fit in 64 bits and the modulus must be exactly
 * 2048 bits. An exponent of 3 is handled with one square and one multiply
 * followed by plain modular reductions; any other exponent uses a
 * left-to-right square-and-multiply in Montgomery form. The AVX2 routines
 * are used when the CPU reports both BMI2 and ADX.
 *
 * in      Array of bytes representing the number to exponentiate, base.
 * inLen   Number of bytes in base.
 * em      Public exponent.
 * mm      Modulus.
 * out     Buffer to hold big-endian bytes of exponentiation result.
 *         Must be at least 256 bytes long.
 * outLen  On entry the size of out in bytes, on success set to 256.
 * returns 0 on success, MP_TO_E when the outLen is too small, MP_READ_E when
 * an array is too long, MP_EXPTMOD_E when the exponent is zero and MEMORY_E
 * when dynamic memory allocation fails.
 */
int sp_RsaPublic_2048(const byte* in, word32 inLen, mp_int* em, mp_int* mm,
    byte* out, word32* outLen)
{
#if !defined(WOLFSSL_SP_SMALL) && !defined(WOLFSSL_SMALL_STACK)
    sp_digit ad[64], md[32], rd[64];
#else
    sp_digit* d = NULL;
#endif
    sp_digit* a;
    sp_digit *ah;
    sp_digit* m;
    sp_digit* r;
    sp_digit e[1];
    int err = MP_OKAY;
#ifdef HAVE_INTEL_AVX2
    word32 cpuid_flags = cpuid_get_flags();
#endif

    if (*outLen < 256)
        err = MP_TO_E;
    if (err == MP_OKAY && (mp_count_bits(em) > 64 || inLen > 256 ||
                                                     mp_count_bits(mm) != 2048))
        err = MP_READ_E;

#if defined(WOLFSSL_SP_SMALL) || defined(WOLFSSL_SMALL_STACK)
    if (err == MP_OKAY) {
        /* One buffer: a (64 digits), r (64 digits), m (32 digits). */
        d = (sp_digit*)XMALLOC(sizeof(sp_digit) * 32 * 5, NULL,
                               DYNAMIC_TYPE_TMP_BUFFER);
        if (d == NULL)
            err = MEMORY_E;
    }

    if (err == MP_OKAY) {
        a = d;
        r = a + 32 * 2;
        m = r + 32 * 2;
        ah = a + 32;
    }
#else
    a = ad;
    m = md;
    r = rd;
    ah = a + 32;
#endif

    if (err == MP_OKAY) {
        /* The base is loaded into the upper half of a so that zeroing the
         * lower half later gives the base shifted up by 32 digits. */
        sp_2048_from_bin(ah, 32, in, inLen);
#if DIGIT_BIT >= 64
        e[0] = em->dp[0];
#else
        {
            /* Gather every used digit of the exponent. The exponent has
             * already been checked to be at most 64 bits, but with small
             * digits (e.g. 28 bits) that can span more than two digits. */
            int k;
            int shift = 0;

            e[0] = 0;
            for (k = 0; k < em->used && shift < 64; k++) {
                e[0] |= ((sp_digit)em->dp[k]) << shift;
                shift += DIGIT_BIT;
            }
        }
#endif
        if (e[0] == 0)
            err = MP_EXPTMOD_E;
    }
    if (err == MP_OKAY) {
        sp_2048_from_mp(m, 32, mm);

        if (e[0] == 0x3) {
            /* r = in^2 mod m, then r = in * r mod m. */
#ifdef HAVE_INTEL_AVX2
            if (IS_INTEL_BMI2(cpuid_flags) && IS_INTEL_ADX(cpuid_flags)) {
                if (err == MP_OKAY) {
                    sp_2048_sqr_avx2_32(r, ah);
                    err = sp_2048_mod_32_cond(r, r, m);
                }
                if (err == MP_OKAY) {
                    sp_2048_mul_avx2_32(r, ah, r);
                    err = sp_2048_mod_32_cond(r, r, m);
                }
            }
            else
#endif
            {
                if (err == MP_OKAY) {
                    sp_2048_sqr_32(r, ah);
                    err = sp_2048_mod_32_cond(r, r, m);
                }
                if (err == MP_OKAY) {
                    sp_2048_mul_32(r, ah, r);
                    err = sp_2048_mod_32_cond(r, r, m);
                }
            }
        }
        else {
            int i;
            sp_digit mp;

            sp_2048_mont_setup(m, &mp);

            /* Convert to Montgomery form. */
            XMEMSET(a, 0, sizeof(sp_digit) * 32);
            err = sp_2048_mod_32_cond(a, a, m);

            if (err == MP_OKAY) {
                /* Find the index of the most significant set exponent bit. */
                for (i=63; i>=0; i--)
                    if (e[0] >> i)
                        break;

                /* The top bit is accounted for by starting with r = a. */
                XMEMCPY(r, a, sizeof(sp_digit) * 32);
#ifdef HAVE_INTEL_AVX2
                if (IS_INTEL_BMI2(cpuid_flags) && IS_INTEL_ADX(cpuid_flags)) {
                    for (i--; i>=0; i--) {
                        sp_2048_mont_sqr_avx2_32(r, r, m, mp);
                        if (((e[0] >> i) & 1) == 1)
                            sp_2048_mont_mul_avx2_32(r, r, a, m, mp);
                    }
                    XMEMSET(&r[32], 0, sizeof(sp_digit) * 32);
                    sp_2048_mont_reduce_avx2_32(r, m, mp);
                }
                else
#endif
                {
                    for (i--; i>=0; i--) {
                        sp_2048_mont_sqr_32(r, r, m, mp);
                        if (((e[0] >> i) & 1) == 1)
                            sp_2048_mont_mul_32(r, r, a, m, mp);
                    }
                    XMEMSET(&r[32], 0, sizeof(sp_digit) * 32);
                    sp_2048_mont_reduce_32(r, m, mp);
                }

                /* Final subtract when r >= m: locate the most significant
                 * digit that differs (or digit 0) and compare it. */
                for (i = 31; i > 0; i--) {
                    if (r[i] != m[i])
                        break;
                }
                if (r[i] >= m[i])
                    sp_2048_sub_in_place_32(r, m);
            }
        }
    }

    if (err == MP_OKAY) {
        sp_2048_to_bin(r, out);
        *outLen = 256;
    }

#if defined(WOLFSSL_SP_SMALL) || defined(WOLFSSL_SMALL_STACK)
    if (d != NULL)
        XFREE(d, NULL, DYNAMIC_TYPE_TMP_BUFFER);
#endif

    return err;
}

/* RSA private key operation.
 *
 * Computed with the Chinese Remainder Theorem parameters:
 *   tmpa = in^dp mod p
 *   tmpb = in^dq mod q
 *   tmpa = ((tmpa - tmpb) * qi) mod p
 *   r    = tmpb + q * tmpa
 * The private exponent dm is not used. The AVX2 routines are used when the
 * CPU reports both BMI2 and ADX. All temporaries are zeroized before
 * returning.
 *
 * in      Array of bytes representing the number to exponentiate, base.
 * inLen   Number of bytes in base.
 * dm      Private exponent.
 * pm      First prime.
 * qm      Second prime.
 * dpm     First prime's CRT exponent.
 * dqm     Second prime's CRT exponent.
 * qim     Inverse of second prime mod p.
 * mm      Modulus.
 * out     Buffer to hold big-endian bytes of exponentiation result.
 *         Must be at least 256 bytes long.
 * outLen  On entry the size of out in bytes, on success set to 256.
 * returns 0 on success, MP_TO_E when the outLen is too small, MP_READ_E when
 * an array is too long (input over 256 bytes, modulus not 2048 bits or a CRT
 * parameter over 1024 bits) and MEMORY_E when dynamic memory allocation
 * fails.
 */
int sp_RsaPrivate_2048(const byte* in, word32 inLen, mp_int* dm,
    mp_int* pm, mp_int* qm, mp_int* dpm, mp_int* dqm, mp_int* qim, mp_int* mm,
    byte* out, word32* outLen)
{
#if !defined(WOLFSSL_SP_SMALL) && !defined(WOLFSSL_SMALL_STACK)
    sp_digit ad[32 * 2];
    sp_digit pd[16], qd[16], dpd[16];
    sp_digit tmpad[32], tmpbd[32];
#else
    sp_digit* t = NULL;
#endif
    sp_digit* a;
    sp_digit* p;
    sp_digit* q;
    sp_digit* dp;
    sp_digit* dq;
    sp_digit* qi;
    sp_digit* tmp;
    sp_digit* tmpa;
    sp_digit* tmpb;
    sp_digit* r;
    sp_digit c;
    int err = MP_OKAY;
#ifdef HAVE_INTEL_AVX2
    word32 cpuid_flags = cpuid_get_flags();
#endif

    (void)dm;

    if (*outLen < 256)
        err = MP_TO_E;
    if (err == MP_OKAY && (inLen > 256 || mp_count_bits(mm) != 2048))
        err = MP_READ_E;
    /* Each CRT parameter is loaded into a 16 digit (1024 bit) buffer below;
     * reject anything larger rather than risk writing past the buffer. */
    if (err == MP_OKAY && (mp_count_bits(pm) > 1024 ||
                           mp_count_bits(qm) > 1024 ||
                           mp_count_bits(dpm) > 1024 ||
                           mp_count_bits(dqm) > 1024 ||
                           mp_count_bits(qim) > 1024))
        err = MP_READ_E;

#if defined(WOLFSSL_SP_SMALL) || defined(WOLFSSL_SMALL_STACK)
    if (err == MP_OKAY) {
        /* One buffer: a (64), p (16), q (16), dp/dq/qi (16, shared),
         * tmpa (32), tmpb (32) digits. */
        t = (sp_digit*)XMALLOC(sizeof(sp_digit) * 16 * 11, NULL,
                               DYNAMIC_TYPE_TMP_BUFFER);
        if (t == NULL)
            err = MEMORY_E;
    }
    if (err == MP_OKAY) {
        a = t;
        p = a + 32 * 2;
        q = p + 16;
        qi = dq = dp = q + 16;
        tmpa = qi + 16;
        tmpb = tmpa + 32;

        /* tmp and r reuse the storage of a once a is no longer needed. */
        tmp = t;
        r = tmp + 32;
    }
#else
    r = a = ad;
    p = pd;
    q = qd;
    qi = dq = dp = dpd;
    tmpa = tmpad;
    tmpb = tmpbd;
    tmp = a + 32;
#endif

    if (err == MP_OKAY) {
        sp_2048_from_bin(a, 32, in, inLen);
        sp_2048_from_mp(p, 16, pm);
        sp_2048_from_mp(q, 16, qm);
        sp_2048_from_mp(dp, 16, dpm);

        /* tmpa = a^dp mod p */
#ifdef HAVE_INTEL_AVX2
        if (IS_INTEL_BMI2(cpuid_flags) && IS_INTEL_ADX(cpuid_flags))
            err = sp_2048_mod_exp_avx2_16(tmpa, a, dp, 1024, p, 1);
        else
#endif
            err = sp_2048_mod_exp_16(tmpa, a, dp, 1024, p, 1);
    }
    if (err == MP_OKAY) {
        /* dq shares storage with dp, which is no longer needed. */
        sp_2048_from_mp(dq, 16, dqm);
        /* tmpb = a^dq mod q */
#ifdef HAVE_INTEL_AVX2
        if (IS_INTEL_BMI2(cpuid_flags) && IS_INTEL_ADX(cpuid_flags))
            err = sp_2048_mod_exp_avx2_16(tmpb, a, dq, 1024, q, 1);
        else
#endif
            err = sp_2048_mod_exp_16(tmpb, a, dq, 1024, q, 1);
    }

    if (err == MP_OKAY) {
        /* tmpa = tmpa - tmpb, adding p back when the subtraction borrowed
         * (c is used as the mask selecting p). */
        c = sp_2048_sub_in_place_16(tmpa, tmpb);
        sp_2048_mask_16(tmp, p, c);
        sp_2048_add_16(tmpa, tmpa, tmp);

        /* tmpa = (tmpa * qi) mod p; qi shares storage with dq. */
        sp_2048_from_mp(qi, 16, qim);
#ifdef HAVE_INTEL_AVX2
        if (IS_INTEL_BMI2(cpuid_flags) && IS_INTEL_ADX(cpuid_flags))
            sp_2048_mul_avx2_16(tmpa, tmpa, qi);
        else
#endif
            sp_2048_mul_16(tmpa, tmpa, qi);
        err = sp_2048_mod_16(tmpa, tmpa, p);
    }

    if (err == MP_OKAY) {
        /* r = tmpb + q * tmpa */
#ifdef HAVE_INTEL_AVX2
        if (IS_INTEL_BMI2(cpuid_flags) && IS_INTEL_ADX(cpuid_flags))
            sp_2048_mul_avx2_16(tmpa, q, tmpa);
        else
#endif
            sp_2048_mul_16(tmpa, q, tmpa);
        XMEMSET(&tmpb[16], 0, sizeof(sp_digit) * 16);
        sp_2048_add_32(r, tmpb, tmpa);

        sp_2048_to_bin(r, out);
        *outLen = 256;
    }

    /* Wipe every temporary: they hold key material, the input and the
     * result. */
#if defined(WOLFSSL_SP_SMALL) || defined(WOLFSSL_SMALL_STACK)
    if (t != NULL) {
        XMEMSET(t, 0, sizeof(sp_digit) * 16 * 11);
        XFREE(t, NULL, DYNAMIC_TYPE_TMP_BUFFER);
    }
#else
    XMEMSET(ad, 0, sizeof(ad));
    XMEMSET(tmpad, 0, sizeof(tmpad));
    XMEMSET(tmpbd, 0, sizeof(tmpbd));
    XMEMSET(pd, 0, sizeof(pd));
    XMEMSET(qd, 0, sizeof(qd));
    XMEMSET(dpd, 0, sizeof(dpd));
#endif

    return err;
}
#endif /* WOLFSSL_HAVE_SP_RSA */
#ifdef WOLFSSL_HAVE_SP_DH
/* Convert an array of sp_digit to an mp_int.
 *
 * Reads 32 digits of 64 bits from a and repacks them into digits of
 * DIGIT_BIT bits in r. r is grown to hold 2048 bits first and clamped
 * afterwards. One of three code paths is selected at compile time depending
 * on how DIGIT_BIT compares with 64.
 *
 * a  A single precision integer.
 * r  A multi-precision integer.
 * returns MP_OKAY on success, otherwise the error returned by mp_grow().
 */
static int sp_2048_to_mp(sp_digit* a, mp_int* r)
{
    int err;

    err = mp_grow(r, (2048 + DIGIT_BIT - 1) / DIGIT_BIT);
    if (err == MP_OKAY) {
#if DIGIT_BIT == 64
        /* Same digit size: straight copy. */
        XMEMCPY(r->dp, a, sizeof(sp_digit) * 32);
        r->used = 32;
        mp_clamp(r);
#elif DIGIT_BIT < 64
        /* mp digits are narrower: each 64-bit digit is spread over several
         * mp digits. i indexes a, j indexes r->dp and s is the bit offset
         * within the digit currently being filled. */
        int i, j = 0, s = 0;

        r->dp[0] = 0;
        for (i = 0; i < 32; i++) {
            /* Top up the partially filled mp digit. */
            r->dp[j] |= a[i] << s;
            r->dp[j] &= (1l << DIGIT_BIT) - 1;
            s = DIGIT_BIT - s;
            r->dp[++j] = a[i] >> s;
            /* Emit further whole mp digits while they fit in a[i]. */
            while (s + DIGIT_BIT <= 64) {
                s += DIGIT_BIT;
                r->dp[j] &= (1l << DIGIT_BIT) - 1;
                r->dp[++j] = a[i] >> s;
            }
            s = 64 - s;
        }
        r->used = (2048 + DIGIT_BIT - 1) / DIGIT_BIT;
        mp_clamp(r);
#else
        /* mp digits are wider than 64 bits: several 64-bit digits are
         * packed into each mp digit. */
        int i, j = 0, s = 0;

        r->dp[0] = 0;
        for (i = 0; i < 32; i++) {
            r->dp[j] |= ((mp_digit)a[i]) << s;
            if (s + 64 >= DIGIT_BIT) {
    /* This condition can never hold here: this branch is only compiled when
     * DIGIT_BIT is greater than 64. */
    #if DIGIT_BIT < 64
                r->dp[j] &= (1l << DIGIT_BIT) - 1;
    #endif
                s = DIGIT_BIT - s;
                r->dp[++j] = a[i] >> s;
                s = 64 - s;
            }
            else
                s += 64;
        }
        r->used = (2048 + DIGIT_BIT - 1) / DIGIT_BIT;
        mp_clamp(r);
#endif
    }

    return err;
}

/* Perform the modular exponentiation for Diffie-Hellman.
 * (res = base^exp mod mod)
 *
 * The base and exponent may be at most 2048 bits and the modulus must be
 * exactly 2048 bits. The local copy of the exponent is zeroized before
 * returning.
 *
 * base  Base. MP integer.
 * exp   Exponent. MP integer.
 * mod   Modulus. MP integer.
 * res   Result. MP integer.
 * returns 0 on success, MP_READ_E if there are too many bytes in an array
 * and MEMORY_E if memory allocation fails.
 */
int sp_ModExp_2048(mp_int* base, mp_int* exp, mp_int* mod, mp_int* res)
{
    int err = MP_OKAY;
    sp_digit b[64], e[32], m[32];
    /* The result is produced in place over the base. */
    sp_digit* r = b;
#ifdef HAVE_INTEL_AVX2
    word32 cpuid_flags = cpuid_get_flags();
#endif
    int expBits = mp_count_bits(exp);

    if (mp_count_bits(base) <= 2048 && expBits <= 2048 &&
                                                   mp_count_bits(mod) == 2048) {
        sp_2048_from_mp(b, 32, base);
        sp_2048_from_mp(e, 32, exp);
        sp_2048_from_mp(m, 32, mod);

#ifdef HAVE_INTEL_AVX2
        if (IS_INTEL_BMI2(cpuid_flags) && IS_INTEL_ADX(cpuid_flags))
            err = sp_2048_mod_exp_avx2_32(r, b, e, expBits, m, 0);
        else
#endif
            err = sp_2048_mod_exp_32(r, b, e, expBits, m, 0);

        if (err == MP_OKAY)
            err = sp_2048_to_mp(r, res);
    }
    else {
        err = MP_READ_E;
    }

    /* Do not leave the exponent on the stack. */
    XMEMSET(e, 0, sizeof(e));

    return err;
}

/* Perform the modular exponentiation for Diffie-Hellman.
 * (out = base^exp mod mod)
 *
 * The exponent is supplied as big-endian bytes and the result is written as
 * big-endian bytes with leading zero bytes removed. The value of *outLen on
 * entry is not consulted. The local copy of the exponent is zeroized before
 * returning.
 *
 * base     Base.
 * exp      Array of bytes that is the exponent.
 * expLen   Length of data, in bytes, in exponent.
 * mod      Modulus.
 * out      Buffer to hold big-endian bytes of exponentiation result.
 *          Must be at least 256 bytes long.
 * outLen   Length, in bytes, of exponentiation result.
 * returns 0 on success, MP_READ_E if there are too many bytes in an array
 * and MEMORY_E if memory allocation fails.
 */
int sp_DhExp_2048(mp_int* base, const byte* exp, word32 expLen,
    mp_int* mod, byte* out, word32* outLen)
{
    int err = MP_OKAY;
    sp_digit b[64], e[32], m[32];
    /* The result is produced in place over the base. */
    sp_digit* r = b;
    word32 lead;
#ifdef HAVE_INTEL_AVX2
    word32 cpuid_flags = cpuid_get_flags();
#endif

    if (mp_count_bits(base) <= 2048 && expLen <= 256 &&
                                                   mp_count_bits(mod) == 2048) {
        sp_2048_from_mp(b, 32, base);
        sp_2048_from_bin(e, 32, exp, expLen);
        sp_2048_from_mp(m, 32, mod);

#ifdef HAVE_INTEL_AVX2
        if (IS_INTEL_BMI2(cpuid_flags) && IS_INTEL_ADX(cpuid_flags))
            err = sp_2048_mod_exp_avx2_32(r, b, e, expLen * 8, m, 0);
        else
#endif
            err = sp_2048_mod_exp_32(r, b, e, expLen * 8, m, 0);

        if (err == MP_OKAY) {
            sp_2048_to_bin(r, out);
            *outLen = 256;

            /* Count the leading zero bytes and shift the rest down. */
            lead = 0;
            while (lead < 256 && out[lead] == 0)
                lead++;
            *outLen -= lead;
            XMEMMOVE(out, out + lead, *outLen);
        }
    }
    else {
        err = MP_READ_E;
    }

    /* Do not leave the exponent on the stack. */
    XMEMSET(e, 0, sizeof(e));

    return err;
}
#endif /* WOLFSSL_HAVE_SP_DH */

#endif /* WOLFSSL_SP_NO_2048 */

#ifndef WOLFSSL_SP_NO_3072
/* Read big endian unsigned byte array into r.
 *
 * Bytes are consumed from the end of the array (least significant first)
 * and packed eight to a digit. Reading stops once max digits are full; any
 * digits not reached are set to zero.
 *
 * r    A single precision integer.
 * max  Number of digits available in r.
 * a    Byte array.
 * n    Number of bytes in array to read.
 */
static void sp_3072_from_bin(sp_digit* r, int max, const byte* a, int n)
{
    int pos = n - 1;
    int word = 0;
    int shift = 0;

    r[0] = 0;
    while (pos >= 0) {
        r[word] |= ((sp_digit)a[pos]) << shift;
        if (shift < 56) {
            /* Room left in this digit for another byte. */
            shift += 8;
        }
        else {
            /* Digit complete: move on to the next one if there is one. */
            r[word] &= 0xffffffffffffffffl;
            shift = 64 - shift;
            if (word + 1 >= max)
                break;
            word++;
            r[word] = a[pos] >> shift;
            shift = 8 - shift;
        }
        pos--;
    }

    /* Clear the digits above the last one written. */
    word++;
    while (word < max) {
        r[word] = 0;
        word++;
    }
}

/* Convert an mp_int to an array of sp_digit.
 *
 * Repacks the DIGIT_BIT wide digits of a into 64-bit digits in r. At most
 * max digits of r are written: a value that needs more is truncated, a value
 * that needs fewer is zero padded. One of three code paths is selected at
 * compile time depending on how DIGIT_BIT compares with 64.
 *
 * r    A single precision integer.
 * max  Number of digits available in r.
 * a    A multi-precision integer.
 */
static void sp_3072_from_mp(sp_digit* r, int max, mp_int* a)
{
#if DIGIT_BIT == 64
    int j;
    int used = a->used;

    /* Never copy more digits than r can hold. */
    if (used > max)
        used = max;

    XMEMCPY(r, a->dp, sizeof(sp_digit) * used);

    for (j = used; j < max; j++)
        r[j] = 0;
#elif DIGIT_BIT > 64
    /* mp digits are wider: each one is split over several 64-bit digits.
     * i indexes a->dp, j indexes r and s is the current bit offset. */
    int i, j = 0, s = 0;

    r[0] = 0;
    for (i = 0; i < a->used && j < max; i++) {
        r[j] |= a->dp[i] << s;
        r[j] &= 0xffffffffffffffffl;
        s = 64 - s;
        if (j + 1 >= max)
            break;
        r[++j] = a->dp[i] >> s;
        while (s + 64 <= DIGIT_BIT) {
            s += 64;
            r[j] &= 0xffffffffffffffffl;
            if (j + 1 >= max)
                break;
            if (s < DIGIT_BIT)
                r[++j] = a->dp[i] >> s;
            else
                r[++j] = 0;
        }
        s = DIGIT_BIT - s;
    }

    for (j++; j < max; j++)
        r[j] = 0;
#else
    /* mp digits are narrower: several are packed into each 64-bit digit.
     * i indexes a->dp, j indexes r and s is the bit offset within r[j]. */
    int i, j = 0, s = 0;

    r[0] = 0;
    for (i = 0; i < a->used && j < max; i++) {
        r[j] |= ((sp_digit)a->dp[i]) << s;
        if (s + DIGIT_BIT >= 64) {
            r[j] &= 0xffffffffffffffffl;
            if (j + 1 >= max)
                break;
            s = 64 - s;
            if (s == DIGIT_BIT) {
                /* The mp digit ended exactly on the 64-bit boundary. */
                r[++j] = 0;
                s = 0;
            }
            else {
                /* Carry the remaining high bits into the next digit. */
                r[++j] = a->dp[i] >> s;
                s = DIGIT_BIT - s;
            }
        }
        else
            s += DIGIT_BIT;
    }

    for (j++; j < max; j++)
        r[j] = 0;
#endif
}

/* Encode r into a byte array in big endian order.
 * The output length is fixed: 384 bytes (3072 / 8) are written.
 *
 * r  A single precision integer (48 words are read).
 * a  Byte array to hold the encoding.
 */
static void sp_3072_to_bin(sp_digit* r, byte* a)
{
    int word;
    int pos = 3072 / 8 - 1;
    int shift = 0;
    int bits;

    /* Start at the last byte of the output (least significant). */
    a[pos] = 0;
    for (word = 0; word < 48 && pos >= 0; word++) {
        /* OR this word, shifted by the carry-over amount, into the current
         * byte and move on to the next more significant byte. */
        a[pos] |= r[word] << shift;
        pos--;
        bits = 8 - shift;
        if (pos < 0)
            break;

        /* Emit the rest of the word one byte at a time. */
        while (bits < 64) {
            a[pos] = r[word] >> bits;
            pos--;
            bits += 8;
            if (pos < 0)
                break;
        }

        /* Work out the shift for the next word and prepare the byte it
         * will be merged into. */
        shift = 8 - (bits - 64);
        if (pos >= 0)
            a[pos] = 0;
        if (shift != 0)
            pos++;
    }
}

/* Multiply a and b into r. (r = a * b)
 *
 * r  A single precision integer.
 * a  A single precision integer.
 * b  A single precision integer.
 */
SP_NOINLINE static void sp_3072_mul_24(sp_digit* r, const sp_digit* a,
    const sp_digit* b)
{
    sp_digit tmp[24];

    __asm__ __volatile__ (
        "#  A[0] * B[0]\n\t"
        "movq	0(%[b]), %%rax\n\t"
        "mulq	0(%[a])\n\t"
        "xorq	%%r8, %%r8\n\t"
        "movq	%%rax, (%[tmp])\n\t"
        "movq	%%rdx, %%rcx\n\t"
        "#  A[0] * B[1]\n\t"
        "movq	8(%[b]), %%rax\n\t"
        "mulq	0(%[a])\n\t"
        "xorq	%%rbx, %%rbx\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[1] * B[0]\n\t"
        "movq	0(%[b]), %%rax\n\t"
        "mulq	8(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "movq	%%rcx, 8(%[tmp])\n\t"
        "#  A[0] * B[2]\n\t"
        "movq	16(%[b]), %%rax\n\t"
        "mulq	0(%[a])\n\t"
        "xorq	%%rcx, %%rcx\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[1] * B[1]\n\t"
        "movq	8(%[b]), %%rax\n\t"
        "mulq	8(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[2] * B[0]\n\t"
        "movq	0(%[b]), %%rax\n\t"
        "mulq	16(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "movq	%%r8, 16(%[tmp])\n\t"
        "#  A[0] * B[3]\n\t"
        "movq	24(%[b]), %%rax\n\t"
        "mulq	0(%[a])\n\t"
        "xorq	%%r8, %%r8\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[1] * B[2]\n\t"
        "movq	16(%[b]), %%rax\n\t"
        "mulq	8(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[2] * B[1]\n\t"
        "movq	8(%[b]), %%rax\n\t"
        "mulq	16(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[3] * B[0]\n\t"
        "movq	0(%[b]), %%rax\n\t"
        "mulq	24(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "movq	%%rbx, 24(%[tmp])\n\t"
        "#  A[0] * B[4]\n\t"
        "movq	32(%[b]), %%rax\n\t"
        "mulq	0(%[a])\n\t"
        "xorq	%%rbx, %%rbx\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[1] * B[3]\n\t"
        "movq	24(%[b]), %%rax\n\t"
        "mulq	8(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[2] * B[2]\n\t"
        "movq	16(%[b]), %%rax\n\t"
        "mulq	16(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[3] * B[1]\n\t"
        "movq	8(%[b]), %%rax\n\t"
        "mulq	24(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[4] * B[0]\n\t"
        "movq	0(%[b]), %%rax\n\t"
        "mulq	32(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "movq	%%rcx, 32(%[tmp])\n\t"
        "#  A[0] * B[5]\n\t"
        "movq	40(%[b]), %%rax\n\t"
        "mulq	0(%[a])\n\t"
        "xorq	%%rcx, %%rcx\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[1] * B[4]\n\t"
        "movq	32(%[b]), %%rax\n\t"
        "mulq	8(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[2] * B[3]\n\t"
        "movq	24(%[b]), %%rax\n\t"
        "mulq	16(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[3] * B[2]\n\t"
        "movq	16(%[b]), %%rax\n\t"
        "mulq	24(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[4] * B[1]\n\t"
        "movq	8(%[b]), %%rax\n\t"
        "mulq	32(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[5] * B[0]\n\t"
        "movq	0(%[b]), %%rax\n\t"
        "mulq	40(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "movq	%%r8, 40(%[tmp])\n\t"
        "#  A[0] * B[6]\n\t"
        "movq	48(%[b]), %%rax\n\t"
        "mulq	0(%[a])\n\t"
        "xorq	%%r8, %%r8\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[1] * B[5]\n\t"
        "movq	40(%[b]), %%rax\n\t"
        "mulq	8(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[2] * B[4]\n\t"
        "movq	32(%[b]), %%rax\n\t"
        "mulq	16(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[3] * B[3]\n\t"
        "movq	24(%[b]), %%rax\n\t"
        "mulq	24(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[4] * B[2]\n\t"
        "movq	16(%[b]), %%rax\n\t"
        "mulq	32(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[5] * B[1]\n\t"
        "movq	8(%[b]), %%rax\n\t"
        "mulq	40(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[6] * B[0]\n\t"
        "movq	0(%[b]), %%rax\n\t"
        "mulq	48(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "movq	%%rbx, 48(%[tmp])\n\t"
        "#  A[0] * B[7]\n\t"
        "movq	56(%[b]), %%rax\n\t"
        "mulq	0(%[a])\n\t"
        "xorq	%%rbx, %%rbx\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[1] * B[6]\n\t"
        "movq	48(%[b]), %%rax\n\t"
        "mulq	8(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[2] * B[5]\n\t"
        "movq	40(%[b]), %%rax\n\t"
        "mulq	16(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[3] * B[4]\n\t"
        "movq	32(%[b]), %%rax\n\t"
        "mulq	24(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[4] * B[3]\n\t"
        "movq	24(%[b]), %%rax\n\t"
        "mulq	32(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[5] * B[2]\n\t"
        "movq	16(%[b]), %%rax\n\t"
        "mulq	40(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[6] * B[1]\n\t"
        "movq	8(%[b]), %%rax\n\t"
        "mulq	48(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[7] * B[0]\n\t"
        "movq	0(%[b]), %%rax\n\t"
        "mulq	56(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "movq	%%rcx, 56(%[tmp])\n\t"
        "#  A[0] * B[8]\n\t"
        "movq	64(%[b]), %%rax\n\t"
        "mulq	0(%[a])\n\t"
        "xorq	%%rcx, %%rcx\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[1] * B[7]\n\t"
        "movq	56(%[b]), %%rax\n\t"
        "mulq	8(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[2] * B[6]\n\t"
        "movq	48(%[b]), %%rax\n\t"
        "mulq	16(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[3] * B[5]\n\t"
        "movq	40(%[b]), %%rax\n\t"
        "mulq	24(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[4] * B[4]\n\t"
        "movq	32(%[b]), %%rax\n\t"
        "mulq	32(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[5] * B[3]\n\t"
        "movq	24(%[b]), %%rax\n\t"
        "mulq	40(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[6] * B[2]\n\t"
        "movq	16(%[b]), %%rax\n\t"
        "mulq	48(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[7] * B[1]\n\t"
        "movq	8(%[b]), %%rax\n\t"
        "mulq	56(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[8] * B[0]\n\t"
        "movq	0(%[b]), %%rax\n\t"
        "mulq	64(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "movq	%%r8, 64(%[tmp])\n\t"
        "#  A[0] * B[9]\n\t"
        "movq	72(%[b]), %%rax\n\t"
        "mulq	0(%[a])\n\t"
        "xorq	%%r8, %%r8\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[1] * B[8]\n\t"
        "movq	64(%[b]), %%rax\n\t"
        "mulq	8(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[2] * B[7]\n\t"
        "movq	56(%[b]), %%rax\n\t"
        "mulq	16(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[3] * B[6]\n\t"
        "movq	48(%[b]), %%rax\n\t"
        "mulq	24(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[4] * B[5]\n\t"
        "movq	40(%[b]), %%rax\n\t"
        "mulq	32(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[5] * B[4]\n\t"
        "movq	32(%[b]), %%rax\n\t"
        "mulq	40(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[6] * B[3]\n\t"
        "movq	24(%[b]), %%rax\n\t"
        "mulq	48(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[7] * B[2]\n\t"
        "movq	16(%[b]), %%rax\n\t"
        "mulq	56(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[8] * B[1]\n\t"
        "movq	8(%[b]), %%rax\n\t"
        "mulq	64(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[9] * B[0]\n\t"
        "movq	0(%[b]), %%rax\n\t"
        "mulq	72(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "movq	%%rbx, 72(%[tmp])\n\t"
        "#  A[0] * B[10]\n\t"
        "movq	80(%[b]), %%rax\n\t"
        "mulq	0(%[a])\n\t"
        "xorq	%%rbx, %%rbx\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[1] * B[9]\n\t"
        "movq	72(%[b]), %%rax\n\t"
        "mulq	8(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[2] * B[8]\n\t"
        "movq	64(%[b]), %%rax\n\t"
        "mulq	16(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[3] * B[7]\n\t"
        "movq	56(%[b]), %%rax\n\t"
        "mulq	24(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[4] * B[6]\n\t"
        "movq	48(%[b]), %%rax\n\t"
        "mulq	32(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[5] * B[5]\n\t"
        "movq	40(%[b]), %%rax\n\t"
        "mulq	40(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[6] * B[4]\n\t"
        "movq	32(%[b]), %%rax\n\t"
        "mulq	48(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[7] * B[3]\n\t"
        "movq	24(%[b]), %%rax\n\t"
        "mulq	56(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[8] * B[2]\n\t"
        "movq	16(%[b]), %%rax\n\t"
        "mulq	64(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[9] * B[1]\n\t"
        "movq	8(%[b]), %%rax\n\t"
        "mulq	72(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[10] * B[0]\n\t"
        "movq	0(%[b]), %%rax\n\t"
        "mulq	80(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "movq	%%rcx, 80(%[tmp])\n\t"
        "#  A[0] * B[11]\n\t"
        "movq	88(%[b]), %%rax\n\t"
        "mulq	0(%[a])\n\t"
        "xorq	%%rcx, %%rcx\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[1] * B[10]\n\t"
        "movq	80(%[b]), %%rax\n\t"
        "mulq	8(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[2] * B[9]\n\t"
        "movq	72(%[b]), %%rax\n\t"
        "mulq	16(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[3] * B[8]\n\t"
        "movq	64(%[b]), %%rax\n\t"
        "mulq	24(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[4] * B[7]\n\t"
        "movq	56(%[b]), %%rax\n\t"
        "mulq	32(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[5] * B[6]\n\t"
        "movq	48(%[b]), %%rax\n\t"
        "mulq	40(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[6] * B[5]\n\t"
        "movq	40(%[b]), %%rax\n\t"
        "mulq	48(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[7] * B[4]\n\t"
        "movq	32(%[b]), %%rax\n\t"
        "mulq	56(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[8] * B[3]\n\t"
        "movq	24(%[b]), %%rax\n\t"
        "mulq	64(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[9] * B[2]\n\t"
        "movq	16(%[b]), %%rax\n\t"
        "mulq	72(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[10] * B[1]\n\t"
        "movq	8(%[b]), %%rax\n\t"
        "mulq	80(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[11] * B[0]\n\t"
        "movq	0(%[b]), %%rax\n\t"
        "mulq	88(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "movq	%%r8, 88(%[tmp])\n\t"
        "#  A[0] * B[12]\n\t"
        "movq	96(%[b]), %%rax\n\t"
        "mulq	0(%[a])\n\t"
        "xorq	%%r8, %%r8\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[1] * B[11]\n\t"
        "movq	88(%[b]), %%rax\n\t"
        "mulq	8(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[2] * B[10]\n\t"
        "movq	80(%[b]), %%rax\n\t"
        "mulq	16(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[3] * B[9]\n\t"
        "movq	72(%[b]), %%rax\n\t"
        "mulq	24(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[4] * B[8]\n\t"
        "movq	64(%[b]), %%rax\n\t"
        "mulq	32(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[5] * B[7]\n\t"
        "movq	56(%[b]), %%rax\n\t"
        "mulq	40(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[6] * B[6]\n\t"
        "movq	48(%[b]), %%rax\n\t"
        "mulq	48(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[7] * B[5]\n\t"
        "movq	40(%[b]), %%rax\n\t"
        "mulq	56(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[8] * B[4]\n\t"
        "movq	32(%[b]), %%rax\n\t"
        "mulq	64(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[9] * B[3]\n\t"
        "movq	24(%[b]), %%rax\n\t"
        "mulq	72(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[10] * B[2]\n\t"
        "movq	16(%[b]), %%rax\n\t"
        "mulq	80(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[11] * B[1]\n\t"
        "movq	8(%[b]), %%rax\n\t"
        "mulq	88(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[12] * B[0]\n\t"
        "movq	0(%[b]), %%rax\n\t"
        "mulq	96(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "movq	%%rbx, 96(%[tmp])\n\t"
        "#  A[0] * B[13]\n\t"
        "movq	104(%[b]), %%rax\n\t"
        "mulq	0(%[a])\n\t"
        "xorq	%%rbx, %%rbx\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[1] * B[12]\n\t"
        "movq	96(%[b]), %%rax\n\t"
        "mulq	8(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[2] * B[11]\n\t"
        "movq	88(%[b]), %%rax\n\t"
        "mulq	16(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[3] * B[10]\n\t"
        "movq	80(%[b]), %%rax\n\t"
        "mulq	24(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[4] * B[9]\n\t"
        "movq	72(%[b]), %%rax\n\t"
        "mulq	32(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[5] * B[8]\n\t"
        "movq	64(%[b]), %%rax\n\t"
        "mulq	40(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[6] * B[7]\n\t"
        "movq	56(%[b]), %%rax\n\t"
        "mulq	48(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[7] * B[6]\n\t"
        "movq	48(%[b]), %%rax\n\t"
        "mulq	56(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[8] * B[5]\n\t"
        "movq	40(%[b]), %%rax\n\t"
        "mulq	64(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[9] * B[4]\n\t"
        "movq	32(%[b]), %%rax\n\t"
        "mulq	72(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[10] * B[3]\n\t"
        "movq	24(%[b]), %%rax\n\t"
        "mulq	80(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[11] * B[2]\n\t"
        "movq	16(%[b]), %%rax\n\t"
        "mulq	88(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[12] * B[1]\n\t"
        "movq	8(%[b]), %%rax\n\t"
        "mulq	96(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[13] * B[0]\n\t"
        "movq	0(%[b]), %%rax\n\t"
        "mulq	104(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "movq	%%rcx, 104(%[tmp])\n\t"
        "#  A[0] * B[14]\n\t"
        "movq	112(%[b]), %%rax\n\t"
        "mulq	0(%[a])\n\t"
        "xorq	%%rcx, %%rcx\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[1] * B[13]\n\t"
        "movq	104(%[b]), %%rax\n\t"
        "mulq	8(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[2] * B[12]\n\t"
        "movq	96(%[b]), %%rax\n\t"
        "mulq	16(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[3] * B[11]\n\t"
        "movq	88(%[b]), %%rax\n\t"
        "mulq	24(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[4] * B[10]\n\t"
        "movq	80(%[b]), %%rax\n\t"
        "mulq	32(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[5] * B[9]\n\t"
        "movq	72(%[b]), %%rax\n\t"
        "mulq	40(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[6] * B[8]\n\t"
        "movq	64(%[b]), %%rax\n\t"
        "mulq	48(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[7] * B[7]\n\t"
        "movq	56(%[b]), %%rax\n\t"
        "mulq	56(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[8] * B[6]\n\t"
        "movq	48(%[b]), %%rax\n\t"
        "mulq	64(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[9] * B[5]\n\t"
        "movq	40(%[b]), %%rax\n\t"
        "mulq	72(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[10] * B[4]\n\t"
        "movq	32(%[b]), %%rax\n\t"
        "mulq	80(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[11] * B[3]\n\t"
        "movq	24(%[b]), %%rax\n\t"
        "mulq	88(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[12] * B[2]\n\t"
        "movq	16(%[b]), %%rax\n\t"
        "mulq	96(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[13] * B[1]\n\t"
        "movq	8(%[b]), %%rax\n\t"
        "mulq	104(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[14] * B[0]\n\t"
        "movq	0(%[b]), %%rax\n\t"
        "mulq	112(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "movq	%%r8, 112(%[tmp])\n\t"
        "#  A[0] * B[15]\n\t"
        "movq	120(%[b]), %%rax\n\t"
        "mulq	0(%[a])\n\t"
        "xorq	%%r8, %%r8\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[1] * B[14]\n\t"
        "movq	112(%[b]), %%rax\n\t"
        "mulq	8(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[2] * B[13]\n\t"
        "movq	104(%[b]), %%rax\n\t"
        "mulq	16(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[3] * B[12]\n\t"
        "movq	96(%[b]), %%rax\n\t"
        "mulq	24(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[4] * B[11]\n\t"
        "movq	88(%[b]), %%rax\n\t"
        "mulq	32(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[5] * B[10]\n\t"
        "movq	80(%[b]), %%rax\n\t"
        "mulq	40(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[6] * B[9]\n\t"
        "movq	72(%[b]), %%rax\n\t"
        "mulq	48(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[7] * B[8]\n\t"
        "movq	64(%[b]), %%rax\n\t"
        "mulq	56(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[8] * B[7]\n\t"
        "movq	56(%[b]), %%rax\n\t"
        "mulq	64(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[9] * B[6]\n\t"
        "movq	48(%[b]), %%rax\n\t"
        "mulq	72(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[10] * B[5]\n\t"
        "movq	40(%[b]), %%rax\n\t"
        "mulq	80(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[11] * B[4]\n\t"
        "movq	32(%[b]), %%rax\n\t"
        "mulq	88(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[12] * B[3]\n\t"
        "movq	24(%[b]), %%rax\n\t"
        "mulq	96(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[13] * B[2]\n\t"
        "movq	16(%[b]), %%rax\n\t"
        "mulq	104(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[14] * B[1]\n\t"
        "movq	8(%[b]), %%rax\n\t"
        "mulq	112(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[15] * B[0]\n\t"
        "movq	0(%[b]), %%rax\n\t"
        "mulq	120(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "movq	%%rbx, 120(%[tmp])\n\t"
        "#  A[0] * B[16]\n\t"
        "movq	128(%[b]), %%rax\n\t"
        "mulq	0(%[a])\n\t"
        "xorq	%%rbx, %%rbx\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[1] * B[15]\n\t"
        "movq	120(%[b]), %%rax\n\t"
        "mulq	8(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[2] * B[14]\n\t"
        "movq	112(%[b]), %%rax\n\t"
        "mulq	16(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[3] * B[13]\n\t"
        "movq	104(%[b]), %%rax\n\t"
        "mulq	24(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[4] * B[12]\n\t"
        "movq	96(%[b]), %%rax\n\t"
        "mulq	32(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[5] * B[11]\n\t"
        "movq	88(%[b]), %%rax\n\t"
        "mulq	40(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[6] * B[10]\n\t"
        "movq	80(%[b]), %%rax\n\t"
        "mulq	48(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[7] * B[9]\n\t"
        "movq	72(%[b]), %%rax\n\t"
        "mulq	56(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[8] * B[8]\n\t"
        "movq	64(%[b]), %%rax\n\t"
        "mulq	64(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[9] * B[7]\n\t"
        "movq	56(%[b]), %%rax\n\t"
        "mulq	72(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[10] * B[6]\n\t"
        "movq	48(%[b]), %%rax\n\t"
        "mulq	80(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[11] * B[5]\n\t"
        "movq	40(%[b]), %%rax\n\t"
        "mulq	88(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[12] * B[4]\n\t"
        "movq	32(%[b]), %%rax\n\t"
        "mulq	96(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[13] * B[3]\n\t"
        "movq	24(%[b]), %%rax\n\t"
        "mulq	104(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[14] * B[2]\n\t"
        "movq	16(%[b]), %%rax\n\t"
        "mulq	112(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[15] * B[1]\n\t"
        "movq	8(%[b]), %%rax\n\t"
        "mulq	120(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[16] * B[0]\n\t"
        "movq	0(%[b]), %%rax\n\t"
        "mulq	128(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "movq	%%rcx, 128(%[tmp])\n\t"
        "#  A[0] * B[17]\n\t"
        "movq	136(%[b]), %%rax\n\t"
        "mulq	0(%[a])\n\t"
        "xorq	%%rcx, %%rcx\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[1] * B[16]\n\t"
        "movq	128(%[b]), %%rax\n\t"
        "mulq	8(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[2] * B[15]\n\t"
        "movq	120(%[b]), %%rax\n\t"
        "mulq	16(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[3] * B[14]\n\t"
        "movq	112(%[b]), %%rax\n\t"
        "mulq	24(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[4] * B[13]\n\t"
        "movq	104(%[b]), %%rax\n\t"
        "mulq	32(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[5] * B[12]\n\t"
        "movq	96(%[b]), %%rax\n\t"
        "mulq	40(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[6] * B[11]\n\t"
        "movq	88(%[b]), %%rax\n\t"
        "mulq	48(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[7] * B[10]\n\t"
        "movq	80(%[b]), %%rax\n\t"
        "mulq	56(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[8] * B[9]\n\t"
        "movq	72(%[b]), %%rax\n\t"
        "mulq	64(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[9] * B[8]\n\t"
        "movq	64(%[b]), %%rax\n\t"
        "mulq	72(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[10] * B[7]\n\t"
        "movq	56(%[b]), %%rax\n\t"
        "mulq	80(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[11] * B[6]\n\t"
        "movq	48(%[b]), %%rax\n\t"
        "mulq	88(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[12] * B[5]\n\t"
        "movq	40(%[b]), %%rax\n\t"
        "mulq	96(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[13] * B[4]\n\t"
        "movq	32(%[b]), %%rax\n\t"
        "mulq	104(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[14] * B[3]\n\t"
        "movq	24(%[b]), %%rax\n\t"
        "mulq	112(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[15] * B[2]\n\t"
        "movq	16(%[b]), %%rax\n\t"
        "mulq	120(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[16] * B[1]\n\t"
        "movq	8(%[b]), %%rax\n\t"
        "mulq	128(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[17] * B[0]\n\t"
        "movq	0(%[b]), %%rax\n\t"
        "mulq	136(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "movq	%%r8, 136(%[tmp])\n\t"
        "#  A[0] * B[18]\n\t"
        "movq	144(%[b]), %%rax\n\t"
        "mulq	0(%[a])\n\t"
        "xorq	%%r8, %%r8\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[1] * B[17]\n\t"
        "movq	136(%[b]), %%rax\n\t"
        "mulq	8(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[2] * B[16]\n\t"
        "movq	128(%[b]), %%rax\n\t"
        "mulq	16(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[3] * B[15]\n\t"
        "movq	120(%[b]), %%rax\n\t"
        "mulq	24(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[4] * B[14]\n\t"
        "movq	112(%[b]), %%rax\n\t"
        "mulq	32(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[5] * B[13]\n\t"
        "movq	104(%[b]), %%rax\n\t"
        "mulq	40(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[6] * B[12]\n\t"
        "movq	96(%[b]), %%rax\n\t"
        "mulq	48(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[7] * B[11]\n\t"
        "movq	88(%[b]), %%rax\n\t"
        "mulq	56(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[8] * B[10]\n\t"
        "movq	80(%[b]), %%rax\n\t"
        "mulq	64(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[9] * B[9]\n\t"
        "movq	72(%[b]), %%rax\n\t"
        "mulq	72(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[10] * B[8]\n\t"
        "movq	64(%[b]), %%rax\n\t"
        "mulq	80(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[11] * B[7]\n\t"
        "movq	56(%[b]), %%rax\n\t"
        "mulq	88(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[12] * B[6]\n\t"
        "movq	48(%[b]), %%rax\n\t"
        "mulq	96(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[13] * B[5]\n\t"
        "movq	40(%[b]), %%rax\n\t"
        "mulq	104(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[14] * B[4]\n\t"
        "movq	32(%[b]), %%rax\n\t"
        "mulq	112(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[15] * B[3]\n\t"
        "movq	24(%[b]), %%rax\n\t"
        "mulq	120(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[16] * B[2]\n\t"
        "movq	16(%[b]), %%rax\n\t"
        "mulq	128(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[17] * B[1]\n\t"
        "movq	8(%[b]), %%rax\n\t"
        "mulq	136(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[18] * B[0]\n\t"
        "movq	0(%[b]), %%rax\n\t"
        "mulq	144(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "movq	%%rbx, 144(%[tmp])\n\t"
        "#  A[0] * B[19]\n\t"
        "movq	152(%[b]), %%rax\n\t"
        "mulq	0(%[a])\n\t"
        "xorq	%%rbx, %%rbx\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[1] * B[18]\n\t"
        "movq	144(%[b]), %%rax\n\t"
        "mulq	8(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[2] * B[17]\n\t"
        "movq	136(%[b]), %%rax\n\t"
        "mulq	16(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[3] * B[16]\n\t"
        "movq	128(%[b]), %%rax\n\t"
        "mulq	24(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[4] * B[15]\n\t"
        "movq	120(%[b]), %%rax\n\t"
        "mulq	32(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[5] * B[14]\n\t"
        "movq	112(%[b]), %%rax\n\t"
        "mulq	40(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[6] * B[13]\n\t"
        "movq	104(%[b]), %%rax\n\t"
        "mulq	48(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[7] * B[12]\n\t"
        "movq	96(%[b]), %%rax\n\t"
        "mulq	56(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[8] * B[11]\n\t"
        "movq	88(%[b]), %%rax\n\t"
        "mulq	64(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[9] * B[10]\n\t"
        "movq	80(%[b]), %%rax\n\t"
        "mulq	72(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[10] * B[9]\n\t"
        "movq	72(%[b]), %%rax\n\t"
        "mulq	80(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[11] * B[8]\n\t"
        "movq	64(%[b]), %%rax\n\t"
        "mulq	88(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[12] * B[7]\n\t"
        "movq	56(%[b]), %%rax\n\t"
        "mulq	96(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[13] * B[6]\n\t"
        "movq	48(%[b]), %%rax\n\t"
        "mulq	104(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[14] * B[5]\n\t"
        "movq	40(%[b]), %%rax\n\t"
        "mulq	112(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[15] * B[4]\n\t"
        "movq	32(%[b]), %%rax\n\t"
        "mulq	120(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[16] * B[3]\n\t"
        "movq	24(%[b]), %%rax\n\t"
        "mulq	128(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[17] * B[2]\n\t"
        "movq	16(%[b]), %%rax\n\t"
        "mulq	136(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[18] * B[1]\n\t"
        "movq	8(%[b]), %%rax\n\t"
        "mulq	144(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[19] * B[0]\n\t"
        "movq	0(%[b]), %%rax\n\t"
        "mulq	152(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "movq	%%rcx, 152(%[tmp])\n\t"
        "#  A[0] * B[20]\n\t"
        "movq	160(%[b]), %%rax\n\t"
        "mulq	0(%[a])\n\t"
        "xorq	%%rcx, %%rcx\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[1] * B[19]\n\t"
        "movq	152(%[b]), %%rax\n\t"
        "mulq	8(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[2] * B[18]\n\t"
        "movq	144(%[b]), %%rax\n\t"
        "mulq	16(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[3] * B[17]\n\t"
        "movq	136(%[b]), %%rax\n\t"
        "mulq	24(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[4] * B[16]\n\t"
        "movq	128(%[b]), %%rax\n\t"
        "mulq	32(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[5] * B[15]\n\t"
        "movq	120(%[b]), %%rax\n\t"
        "mulq	40(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[6] * B[14]\n\t"
        "movq	112(%[b]), %%rax\n\t"
        "mulq	48(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[7] * B[13]\n\t"
        "movq	104(%[b]), %%rax\n\t"
        "mulq	56(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[8] * B[12]\n\t"
        "movq	96(%[b]), %%rax\n\t"
        "mulq	64(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[9] * B[11]\n\t"
        "movq	88(%[b]), %%rax\n\t"
        "mulq	72(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[10] * B[10]\n\t"
        "movq	80(%[b]), %%rax\n\t"
        "mulq	80(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[11] * B[9]\n\t"
        "movq	72(%[b]), %%rax\n\t"
        "mulq	88(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[12] * B[8]\n\t"
        "movq	64(%[b]), %%rax\n\t"
        "mulq	96(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[13] * B[7]\n\t"
        "movq	56(%[b]), %%rax\n\t"
        "mulq	104(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[14] * B[6]\n\t"
        "movq	48(%[b]), %%rax\n\t"
        "mulq	112(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[15] * B[5]\n\t"
        "movq	40(%[b]), %%rax\n\t"
        "mulq	120(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[16] * B[4]\n\t"
        "movq	32(%[b]), %%rax\n\t"
        "mulq	128(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[17] * B[3]\n\t"
        "movq	24(%[b]), %%rax\n\t"
        "mulq	136(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[18] * B[2]\n\t"
        "movq	16(%[b]), %%rax\n\t"
        "mulq	144(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[19] * B[1]\n\t"
        "movq	8(%[b]), %%rax\n\t"
        "mulq	152(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[20] * B[0]\n\t"
        "movq	0(%[b]), %%rax\n\t"
        "mulq	160(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "movq	%%r8, 160(%[tmp])\n\t"
        "#  A[0] * B[21]\n\t"
        "movq	168(%[b]), %%rax\n\t"
        "mulq	0(%[a])\n\t"
        "xorq	%%r8, %%r8\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[1] * B[20]\n\t"
        "movq	160(%[b]), %%rax\n\t"
        "mulq	8(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[2] * B[19]\n\t"
        "movq	152(%[b]), %%rax\n\t"
        "mulq	16(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[3] * B[18]\n\t"
        "movq	144(%[b]), %%rax\n\t"
        "mulq	24(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[4] * B[17]\n\t"
        "movq	136(%[b]), %%rax\n\t"
        "mulq	32(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[5] * B[16]\n\t"
        "movq	128(%[b]), %%rax\n\t"
        "mulq	40(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[6] * B[15]\n\t"
        "movq	120(%[b]), %%rax\n\t"
        "mulq	48(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[7] * B[14]\n\t"
        "movq	112(%[b]), %%rax\n\t"
        "mulq	56(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[8] * B[13]\n\t"
        "movq	104(%[b]), %%rax\n\t"
        "mulq	64(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[9] * B[12]\n\t"
        "movq	96(%[b]), %%rax\n\t"
        "mulq	72(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[10] * B[11]\n\t"
        "movq	88(%[b]), %%rax\n\t"
        "mulq	80(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[11] * B[10]\n\t"
        "movq	80(%[b]), %%rax\n\t"
        "mulq	88(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[12] * B[9]\n\t"
        "movq	72(%[b]), %%rax\n\t"
        "mulq	96(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[13] * B[8]\n\t"
        "movq	64(%[b]), %%rax\n\t"
        "mulq	104(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[14] * B[7]\n\t"
        "movq	56(%[b]), %%rax\n\t"
        "mulq	112(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[15] * B[6]\n\t"
        "movq	48(%[b]), %%rax\n\t"
        "mulq	120(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[16] * B[5]\n\t"
        "movq	40(%[b]), %%rax\n\t"
        "mulq	128(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[17] * B[4]\n\t"
        "movq	32(%[b]), %%rax\n\t"
        "mulq	136(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[18] * B[3]\n\t"
        "movq	24(%[b]), %%rax\n\t"
        "mulq	144(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[19] * B[2]\n\t"
        "movq	16(%[b]), %%rax\n\t"
        "mulq	152(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[20] * B[1]\n\t"
        "movq	8(%[b]), %%rax\n\t"
        "mulq	160(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[21] * B[0]\n\t"
        "movq	0(%[b]), %%rax\n\t"
        "mulq	168(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "movq	%%rbx, 168(%[tmp])\n\t"
        "#  A[0] * B[22]\n\t"
        "movq	176(%[b]), %%rax\n\t"
        "mulq	0(%[a])\n\t"
        "xorq	%%rbx, %%rbx\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[1] * B[21]\n\t"
        "movq	168(%[b]), %%rax\n\t"
        "mulq	8(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[2] * B[20]\n\t"
        "movq	160(%[b]), %%rax\n\t"
        "mulq	16(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[3] * B[19]\n\t"
        "movq	152(%[b]), %%rax\n\t"
        "mulq	24(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[4] * B[18]\n\t"
        "movq	144(%[b]), %%rax\n\t"
        "mulq	32(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[5] * B[17]\n\t"
        "movq	136(%[b]), %%rax\n\t"
        "mulq	40(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[6] * B[16]\n\t"
        "movq	128(%[b]), %%rax\n\t"
        "mulq	48(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[7] * B[15]\n\t"
        "movq	120(%[b]), %%rax\n\t"
        "mulq	56(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[8] * B[14]\n\t"
        "movq	112(%[b]), %%rax\n\t"
        "mulq	64(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[9] * B[13]\n\t"
        "movq	104(%[b]), %%rax\n\t"
        "mulq	72(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[10] * B[12]\n\t"
        "movq	96(%[b]), %%rax\n\t"
        "mulq	80(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[11] * B[11]\n\t"
        "movq	88(%[b]), %%rax\n\t"
        "mulq	88(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[12] * B[10]\n\t"
        "movq	80(%[b]), %%rax\n\t"
        "mulq	96(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[13] * B[9]\n\t"
        "movq	72(%[b]), %%rax\n\t"
        "mulq	104(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[14] * B[8]\n\t"
        "movq	64(%[b]), %%rax\n\t"
        "mulq	112(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[15] * B[7]\n\t"
        "movq	56(%[b]), %%rax\n\t"
        "mulq	120(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[16] * B[6]\n\t"
        "movq	48(%[b]), %%rax\n\t"
        "mulq	128(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[17] * B[5]\n\t"
        "movq	40(%[b]), %%rax\n\t"
        "mulq	136(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[18] * B[4]\n\t"
        "movq	32(%[b]), %%rax\n\t"
        "mulq	144(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[19] * B[3]\n\t"
        "movq	24(%[b]), %%rax\n\t"
        "mulq	152(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[20] * B[2]\n\t"
        "movq	16(%[b]), %%rax\n\t"
        "mulq	160(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[21] * B[1]\n\t"
        "movq	8(%[b]), %%rax\n\t"
        "mulq	168(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[22] * B[0]\n\t"
        "movq	0(%[b]), %%rax\n\t"
        "mulq	176(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "movq	%%rcx, 176(%[tmp])\n\t"
        "#  A[0] * B[23]\n\t"
        "movq	184(%[b]), %%rax\n\t"
        "mulq	0(%[a])\n\t"
        "xorq	%%rcx, %%rcx\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[1] * B[22]\n\t"
        "movq	176(%[b]), %%rax\n\t"
        "mulq	8(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[2] * B[21]\n\t"
        "movq	168(%[b]), %%rax\n\t"
        "mulq	16(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[3] * B[20]\n\t"
        "movq	160(%[b]), %%rax\n\t"
        "mulq	24(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[4] * B[19]\n\t"
        "movq	152(%[b]), %%rax\n\t"
        "mulq	32(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[5] * B[18]\n\t"
        "movq	144(%[b]), %%rax\n\t"
        "mulq	40(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[6] * B[17]\n\t"
        "movq	136(%[b]), %%rax\n\t"
        "mulq	48(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[7] * B[16]\n\t"
        "movq	128(%[b]), %%rax\n\t"
        "mulq	56(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[8] * B[15]\n\t"
        "movq	120(%[b]), %%rax\n\t"
        "mulq	64(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[9] * B[14]\n\t"
        "movq	112(%[b]), %%rax\n\t"
        "mulq	72(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[10] * B[13]\n\t"
        "movq	104(%[b]), %%rax\n\t"
        "mulq	80(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[11] * B[12]\n\t"
        "movq	96(%[b]), %%rax\n\t"
        "mulq	88(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[12] * B[11]\n\t"
        "movq	88(%[b]), %%rax\n\t"
        "mulq	96(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[13] * B[10]\n\t"
        "movq	80(%[b]), %%rax\n\t"
        "mulq	104(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[14] * B[9]\n\t"
        "movq	72(%[b]), %%rax\n\t"
        "mulq	112(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[15] * B[8]\n\t"
        "movq	64(%[b]), %%rax\n\t"
        "mulq	120(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[16] * B[7]\n\t"
        "movq	56(%[b]), %%rax\n\t"
        "mulq	128(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[17] * B[6]\n\t"
        "movq	48(%[b]), %%rax\n\t"
        "mulq	136(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[18] * B[5]\n\t"
        "movq	40(%[b]), %%rax\n\t"
        "mulq	144(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[19] * B[4]\n\t"
        "movq	32(%[b]), %%rax\n\t"
        "mulq	152(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[20] * B[3]\n\t"
        "movq	24(%[b]), %%rax\n\t"
        "mulq	160(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[21] * B[2]\n\t"
        "movq	16(%[b]), %%rax\n\t"
        "mulq	168(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[22] * B[1]\n\t"
        "movq	8(%[b]), %%rax\n\t"
        "mulq	176(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[23] * B[0]\n\t"
        "movq	0(%[b]), %%rax\n\t"
        "mulq	184(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "movq	%%r8, 184(%[tmp])\n\t"
        "#  A[1] * B[23]\n\t"
        "movq	184(%[b]), %%rax\n\t"
        "mulq	8(%[a])\n\t"
        "xorq	%%r8, %%r8\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[2] * B[22]\n\t"
        "movq	176(%[b]), %%rax\n\t"
        "mulq	16(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[3] * B[21]\n\t"
        "movq	168(%[b]), %%rax\n\t"
        "mulq	24(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[4] * B[20]\n\t"
        "movq	160(%[b]), %%rax\n\t"
        "mulq	32(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[5] * B[19]\n\t"
        "movq	152(%[b]), %%rax\n\t"
        "mulq	40(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[6] * B[18]\n\t"
        "movq	144(%[b]), %%rax\n\t"
        "mulq	48(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[7] * B[17]\n\t"
        "movq	136(%[b]), %%rax\n\t"
        "mulq	56(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[8] * B[16]\n\t"
        "movq	128(%[b]), %%rax\n\t"
        "mulq	64(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[9] * B[15]\n\t"
        "movq	120(%[b]), %%rax\n\t"
        "mulq	72(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[10] * B[14]\n\t"
        "movq	112(%[b]), %%rax\n\t"
        "mulq	80(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[11] * B[13]\n\t"
        "movq	104(%[b]), %%rax\n\t"
        "mulq	88(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[12] * B[12]\n\t"
        "movq	96(%[b]), %%rax\n\t"
        "mulq	96(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[13] * B[11]\n\t"
        "movq	88(%[b]), %%rax\n\t"
        "mulq	104(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[14] * B[10]\n\t"
        "movq	80(%[b]), %%rax\n\t"
        "mulq	112(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[15] * B[9]\n\t"
        "movq	72(%[b]), %%rax\n\t"
        "mulq	120(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[16] * B[8]\n\t"
        "movq	64(%[b]), %%rax\n\t"
        "mulq	128(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[17] * B[7]\n\t"
        "movq	56(%[b]), %%rax\n\t"
        "mulq	136(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[18] * B[6]\n\t"
        "movq	48(%[b]), %%rax\n\t"
        "mulq	144(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[19] * B[5]\n\t"
        "movq	40(%[b]), %%rax\n\t"
        "mulq	152(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[20] * B[4]\n\t"
        "movq	32(%[b]), %%rax\n\t"
        "mulq	160(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[21] * B[3]\n\t"
        "movq	24(%[b]), %%rax\n\t"
        "mulq	168(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[22] * B[2]\n\t"
        "movq	16(%[b]), %%rax\n\t"
        "mulq	176(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[23] * B[1]\n\t"
        "movq	8(%[b]), %%rax\n\t"
        "mulq	184(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "movq	%%rbx, 192(%[r])\n\t"
        "#  A[2] * B[23]\n\t"
        "movq	184(%[b]), %%rax\n\t"
        "mulq	16(%[a])\n\t"
        "xorq	%%rbx, %%rbx\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[3] * B[22]\n\t"
        "movq	176(%[b]), %%rax\n\t"
        "mulq	24(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[4] * B[21]\n\t"
        "movq	168(%[b]), %%rax\n\t"
        "mulq	32(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[5] * B[20]\n\t"
        "movq	160(%[b]), %%rax\n\t"
        "mulq	40(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[6] * B[19]\n\t"
        "movq	152(%[b]), %%rax\n\t"
        "mulq	48(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[7] * B[18]\n\t"
        "movq	144(%[b]), %%rax\n\t"
        "mulq	56(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[8] * B[17]\n\t"
        "movq	136(%[b]), %%rax\n\t"
        "mulq	64(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[9] * B[16]\n\t"
        "movq	128(%[b]), %%rax\n\t"
        "mulq	72(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[10] * B[15]\n\t"
        "movq	120(%[b]), %%rax\n\t"
        "mulq	80(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[11] * B[14]\n\t"
        "movq	112(%[b]), %%rax\n\t"
        "mulq	88(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[12] * B[13]\n\t"
        "movq	104(%[b]), %%rax\n\t"
        "mulq	96(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[13] * B[12]\n\t"
        "movq	96(%[b]), %%rax\n\t"
        "mulq	104(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[14] * B[11]\n\t"
        "movq	88(%[b]), %%rax\n\t"
        "mulq	112(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[15] * B[10]\n\t"
        "movq	80(%[b]), %%rax\n\t"
        "mulq	120(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[16] * B[9]\n\t"
        "movq	72(%[b]), %%rax\n\t"
        "mulq	128(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[17] * B[8]\n\t"
        "movq	64(%[b]), %%rax\n\t"
        "mulq	136(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[18] * B[7]\n\t"
        "movq	56(%[b]), %%rax\n\t"
        "mulq	144(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[19] * B[6]\n\t"
        "movq	48(%[b]), %%rax\n\t"
        "mulq	152(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[20] * B[5]\n\t"
        "movq	40(%[b]), %%rax\n\t"
        "mulq	160(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[21] * B[4]\n\t"
        "movq	32(%[b]), %%rax\n\t"
        "mulq	168(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[22] * B[3]\n\t"
        "movq	24(%[b]), %%rax\n\t"
        "mulq	176(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[23] * B[2]\n\t"
        "movq	16(%[b]), %%rax\n\t"
        "mulq	184(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "movq	%%rcx, 200(%[r])\n\t"
        "#  A[3] * B[23]\n\t"
        "movq	184(%[b]), %%rax\n\t"
        "mulq	24(%[a])\n\t"
        "xorq	%%rcx, %%rcx\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[4] * B[22]\n\t"
        "movq	176(%[b]), %%rax\n\t"
        "mulq	32(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[5] * B[21]\n\t"
        "movq	168(%[b]), %%rax\n\t"
        "mulq	40(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[6] * B[20]\n\t"
        "movq	160(%[b]), %%rax\n\t"
        "mulq	48(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[7] * B[19]\n\t"
        "movq	152(%[b]), %%rax\n\t"
        "mulq	56(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[8] * B[18]\n\t"
        "movq	144(%[b]), %%rax\n\t"
        "mulq	64(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[9] * B[17]\n\t"
        "movq	136(%[b]), %%rax\n\t"
        "mulq	72(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[10] * B[16]\n\t"
        "movq	128(%[b]), %%rax\n\t"
        "mulq	80(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[11] * B[15]\n\t"
        "movq	120(%[b]), %%rax\n\t"
        "mulq	88(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[12] * B[14]\n\t"
        "movq	112(%[b]), %%rax\n\t"
        "mulq	96(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[13] * B[13]\n\t"
        "movq	104(%[b]), %%rax\n\t"
        "mulq	104(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[14] * B[12]\n\t"
        "movq	96(%[b]), %%rax\n\t"
        "mulq	112(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[15] * B[11]\n\t"
        "movq	88(%[b]), %%rax\n\t"
        "mulq	120(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[16] * B[10]\n\t"
        "movq	80(%[b]), %%rax\n\t"
        "mulq	128(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[17] * B[9]\n\t"
        "movq	72(%[b]), %%rax\n\t"
        "mulq	136(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[18] * B[8]\n\t"
        "movq	64(%[b]), %%rax\n\t"
        "mulq	144(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[19] * B[7]\n\t"
        "movq	56(%[b]), %%rax\n\t"
        "mulq	152(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[20] * B[6]\n\t"
        "movq	48(%[b]), %%rax\n\t"
        "mulq	160(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[21] * B[5]\n\t"
        "movq	40(%[b]), %%rax\n\t"
        "mulq	168(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[22] * B[4]\n\t"
        "movq	32(%[b]), %%rax\n\t"
        "mulq	176(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[23] * B[3]\n\t"
        "movq	24(%[b]), %%rax\n\t"
        "mulq	184(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "movq	%%r8, 208(%[r])\n\t"
        "#  A[4] * B[23]\n\t"
        "movq	184(%[b]), %%rax\n\t"
        "mulq	32(%[a])\n\t"
        "xorq	%%r8, %%r8\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[5] * B[22]\n\t"
        "movq	176(%[b]), %%rax\n\t"
        "mulq	40(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[6] * B[21]\n\t"
        "movq	168(%[b]), %%rax\n\t"
        "mulq	48(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[7] * B[20]\n\t"
        "movq	160(%[b]), %%rax\n\t"
        "mulq	56(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[8] * B[19]\n\t"
        "movq	152(%[b]), %%rax\n\t"
        "mulq	64(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[9] * B[18]\n\t"
        "movq	144(%[b]), %%rax\n\t"
        "mulq	72(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[10] * B[17]\n\t"
        "movq	136(%[b]), %%rax\n\t"
        "mulq	80(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[11] * B[16]\n\t"
        "movq	128(%[b]), %%rax\n\t"
        "mulq	88(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[12] * B[15]\n\t"
        "movq	120(%[b]), %%rax\n\t"
        "mulq	96(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[13] * B[14]\n\t"
        "movq	112(%[b]), %%rax\n\t"
        "mulq	104(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[14] * B[13]\n\t"
        "movq	104(%[b]), %%rax\n\t"
        "mulq	112(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[15] * B[12]\n\t"
        "movq	96(%[b]), %%rax\n\t"
        "mulq	120(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[16] * B[11]\n\t"
        "movq	88(%[b]), %%rax\n\t"
        "mulq	128(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[17] * B[10]\n\t"
        "movq	80(%[b]), %%rax\n\t"
        "mulq	136(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[18] * B[9]\n\t"
        "movq	72(%[b]), %%rax\n\t"
        "mulq	144(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[19] * B[8]\n\t"
        "movq	64(%[b]), %%rax\n\t"
        "mulq	152(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[20] * B[7]\n\t"
        "movq	56(%[b]), %%rax\n\t"
        "mulq	160(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[21] * B[6]\n\t"
        "movq	48(%[b]), %%rax\n\t"
        "mulq	168(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[22] * B[5]\n\t"
        "movq	40(%[b]), %%rax\n\t"
        "mulq	176(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[23] * B[4]\n\t"
        "movq	32(%[b]), %%rax\n\t"
        "mulq	184(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "movq	%%rbx, 216(%[r])\n\t"
        "#  A[5] * B[23]\n\t"
        "movq	184(%[b]), %%rax\n\t"
        "mulq	40(%[a])\n\t"
        "xorq	%%rbx, %%rbx\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[6] * B[22]\n\t"
        "movq	176(%[b]), %%rax\n\t"
        "mulq	48(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[7] * B[21]\n\t"
        "movq	168(%[b]), %%rax\n\t"
        "mulq	56(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[8] * B[20]\n\t"
        "movq	160(%[b]), %%rax\n\t"
        "mulq	64(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[9] * B[19]\n\t"
        "movq	152(%[b]), %%rax\n\t"
        "mulq	72(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[10] * B[18]\n\t"
        "movq	144(%[b]), %%rax\n\t"
        "mulq	80(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[11] * B[17]\n\t"
        "movq	136(%[b]), %%rax\n\t"
        "mulq	88(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[12] * B[16]\n\t"
        "movq	128(%[b]), %%rax\n\t"
        "mulq	96(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[13] * B[15]\n\t"
        "movq	120(%[b]), %%rax\n\t"
        "mulq	104(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[14] * B[14]\n\t"
        "movq	112(%[b]), %%rax\n\t"
        "mulq	112(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[15] * B[13]\n\t"
        "movq	104(%[b]), %%rax\n\t"
        "mulq	120(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[16] * B[12]\n\t"
        "movq	96(%[b]), %%rax\n\t"
        "mulq	128(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[17] * B[11]\n\t"
        "movq	88(%[b]), %%rax\n\t"
        "mulq	136(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[18] * B[10]\n\t"
        "movq	80(%[b]), %%rax\n\t"
        "mulq	144(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[19] * B[9]\n\t"
        "movq	72(%[b]), %%rax\n\t"
        "mulq	152(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[20] * B[8]\n\t"
        "movq	64(%[b]), %%rax\n\t"
        "mulq	160(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[21] * B[7]\n\t"
        "movq	56(%[b]), %%rax\n\t"
        "mulq	168(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[22] * B[6]\n\t"
        "movq	48(%[b]), %%rax\n\t"
        "mulq	176(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[23] * B[5]\n\t"
        "movq	40(%[b]), %%rax\n\t"
        "mulq	184(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "movq	%%rcx, 224(%[r])\n\t"
        "#  A[6] * B[23]\n\t"
        "movq	184(%[b]), %%rax\n\t"
        "mulq	48(%[a])\n\t"
        "xorq	%%rcx, %%rcx\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[7] * B[22]\n\t"
        "movq	176(%[b]), %%rax\n\t"
        "mulq	56(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[8] * B[21]\n\t"
        "movq	168(%[b]), %%rax\n\t"
        "mulq	64(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[9] * B[20]\n\t"
        "movq	160(%[b]), %%rax\n\t"
        "mulq	72(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[10] * B[19]\n\t"
        "movq	152(%[b]), %%rax\n\t"
        "mulq	80(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[11] * B[18]\n\t"
        "movq	144(%[b]), %%rax\n\t"
        "mulq	88(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[12] * B[17]\n\t"
        "movq	136(%[b]), %%rax\n\t"
        "mulq	96(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[13] * B[16]\n\t"
        "movq	128(%[b]), %%rax\n\t"
        "mulq	104(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[14] * B[15]\n\t"
        "movq	120(%[b]), %%rax\n\t"
        "mulq	112(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[15] * B[14]\n\t"
        "movq	112(%[b]), %%rax\n\t"
        "mulq	120(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[16] * B[13]\n\t"
        "movq	104(%[b]), %%rax\n\t"
        "mulq	128(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[17] * B[12]\n\t"
        "movq	96(%[b]), %%rax\n\t"
        "mulq	136(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[18] * B[11]\n\t"
        "movq	88(%[b]), %%rax\n\t"
        "mulq	144(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[19] * B[10]\n\t"
        "movq	80(%[b]), %%rax\n\t"
        "mulq	152(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[20] * B[9]\n\t"
        "movq	72(%[b]), %%rax\n\t"
        "mulq	160(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[21] * B[8]\n\t"
        "movq	64(%[b]), %%rax\n\t"
        "mulq	168(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[22] * B[7]\n\t"
        "movq	56(%[b]), %%rax\n\t"
        "mulq	176(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[23] * B[6]\n\t"
        "movq	48(%[b]), %%rax\n\t"
        "mulq	184(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "movq	%%r8, 232(%[r])\n\t"
        "#  A[7] * B[23]\n\t"
        "movq	184(%[b]), %%rax\n\t"
        "mulq	56(%[a])\n\t"
        "xorq	%%r8, %%r8\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[8] * B[22]\n\t"
        "movq	176(%[b]), %%rax\n\t"
        "mulq	64(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[9] * B[21]\n\t"
        "movq	168(%[b]), %%rax\n\t"
        "mulq	72(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[10] * B[20]\n\t"
        "movq	160(%[b]), %%rax\n\t"
        "mulq	80(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[11] * B[19]\n\t"
        "movq	152(%[b]), %%rax\n\t"
        "mulq	88(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[12] * B[18]\n\t"
        "movq	144(%[b]), %%rax\n\t"
        "mulq	96(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[13] * B[17]\n\t"
        "movq	136(%[b]), %%rax\n\t"
        "mulq	104(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[14] * B[16]\n\t"
        "movq	128(%[b]), %%rax\n\t"
        "mulq	112(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[15] * B[15]\n\t"
        "movq	120(%[b]), %%rax\n\t"
        "mulq	120(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[16] * B[14]\n\t"
        "movq	112(%[b]), %%rax\n\t"
        "mulq	128(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[17] * B[13]\n\t"
        "movq	104(%[b]), %%rax\n\t"
        "mulq	136(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[18] * B[12]\n\t"
        "movq	96(%[b]), %%rax\n\t"
        "mulq	144(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[19] * B[11]\n\t"
        "movq	88(%[b]), %%rax\n\t"
        "mulq	152(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[20] * B[10]\n\t"
        "movq	80(%[b]), %%rax\n\t"
        "mulq	160(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[21] * B[9]\n\t"
        "movq	72(%[b]), %%rax\n\t"
        "mulq	168(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[22] * B[8]\n\t"
        "movq	64(%[b]), %%rax\n\t"
        "mulq	176(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[23] * B[7]\n\t"
        "movq	56(%[b]), %%rax\n\t"
        "mulq	184(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "movq	%%rbx, 240(%[r])\n\t"
        "#  A[8] * B[23]\n\t"
        "movq	184(%[b]), %%rax\n\t"
        "mulq	64(%[a])\n\t"
        "xorq	%%rbx, %%rbx\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[9] * B[22]\n\t"
        "movq	176(%[b]), %%rax\n\t"
        "mulq	72(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[10] * B[21]\n\t"
        "movq	168(%[b]), %%rax\n\t"
        "mulq	80(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[11] * B[20]\n\t"
        "movq	160(%[b]), %%rax\n\t"
        "mulq	88(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[12] * B[19]\n\t"
        "movq	152(%[b]), %%rax\n\t"
        "mulq	96(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[13] * B[18]\n\t"
        "movq	144(%[b]), %%rax\n\t"
        "mulq	104(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[14] * B[17]\n\t"
        "movq	136(%[b]), %%rax\n\t"
        "mulq	112(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[15] * B[16]\n\t"
        "movq	128(%[b]), %%rax\n\t"
        "mulq	120(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[16] * B[15]\n\t"
        "movq	120(%[b]), %%rax\n\t"
        "mulq	128(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[17] * B[14]\n\t"
        "movq	112(%[b]), %%rax\n\t"
        "mulq	136(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[18] * B[13]\n\t"
        "movq	104(%[b]), %%rax\n\t"
        "mulq	144(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[19] * B[12]\n\t"
        "movq	96(%[b]), %%rax\n\t"
        "mulq	152(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[20] * B[11]\n\t"
        "movq	88(%[b]), %%rax\n\t"
        "mulq	160(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[21] * B[10]\n\t"
        "movq	80(%[b]), %%rax\n\t"
        "mulq	168(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[22] * B[9]\n\t"
        "movq	72(%[b]), %%rax\n\t"
        "mulq	176(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[23] * B[8]\n\t"
        "movq	64(%[b]), %%rax\n\t"
        "mulq	184(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "movq	%%rcx, 248(%[r])\n\t"
        "#  A[9] * B[23]\n\t"
        "movq	184(%[b]), %%rax\n\t"
        "mulq	72(%[a])\n\t"
        "xorq	%%rcx, %%rcx\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[10] * B[22]\n\t"
        "movq	176(%[b]), %%rax\n\t"
        "mulq	80(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[11] * B[21]\n\t"
        "movq	168(%[b]), %%rax\n\t"
        "mulq	88(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[12] * B[20]\n\t"
        "movq	160(%[b]), %%rax\n\t"
        "mulq	96(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[13] * B[19]\n\t"
        "movq	152(%[b]), %%rax\n\t"
        "mulq	104(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[14] * B[18]\n\t"
        "movq	144(%[b]), %%rax\n\t"
        "mulq	112(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[15] * B[17]\n\t"
        "movq	136(%[b]), %%rax\n\t"
        "mulq	120(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[16] * B[16]\n\t"
        "movq	128(%[b]), %%rax\n\t"
        "mulq	128(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[17] * B[15]\n\t"
        "movq	120(%[b]), %%rax\n\t"
        "mulq	136(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[18] * B[14]\n\t"
        "movq	112(%[b]), %%rax\n\t"
        "mulq	144(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[19] * B[13]\n\t"
        "movq	104(%[b]), %%rax\n\t"
        "mulq	152(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[20] * B[12]\n\t"
        "movq	96(%[b]), %%rax\n\t"
        "mulq	160(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[21] * B[11]\n\t"
        "movq	88(%[b]), %%rax\n\t"
        "mulq	168(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[22] * B[10]\n\t"
        "movq	80(%[b]), %%rax\n\t"
        "mulq	176(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[23] * B[9]\n\t"
        "movq	72(%[b]), %%rax\n\t"
        "mulq	184(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "movq	%%r8, 256(%[r])\n\t"
        "#  A[10] * B[23]\n\t"
        "movq	184(%[b]), %%rax\n\t"
        "mulq	80(%[a])\n\t"
        "xorq	%%r8, %%r8\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[11] * B[22]\n\t"
        "movq	176(%[b]), %%rax\n\t"
        "mulq	88(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[12] * B[21]\n\t"
        "movq	168(%[b]), %%rax\n\t"
        "mulq	96(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[13] * B[20]\n\t"
        "movq	160(%[b]), %%rax\n\t"
        "mulq	104(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[14] * B[19]\n\t"
        "movq	152(%[b]), %%rax\n\t"
        "mulq	112(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[15] * B[18]\n\t"
        "movq	144(%[b]), %%rax\n\t"
        "mulq	120(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[16] * B[17]\n\t"
        "movq	136(%[b]), %%rax\n\t"
        "mulq	128(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[17] * B[16]\n\t"
        "movq	128(%[b]), %%rax\n\t"
        "mulq	136(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[18] * B[15]\n\t"
        "movq	120(%[b]), %%rax\n\t"
        "mulq	144(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[19] * B[14]\n\t"
        "movq	112(%[b]), %%rax\n\t"
        "mulq	152(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[20] * B[13]\n\t"
        "movq	104(%[b]), %%rax\n\t"
        "mulq	160(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[21] * B[12]\n\t"
        "movq	96(%[b]), %%rax\n\t"
        "mulq	168(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[22] * B[11]\n\t"
        "movq	88(%[b]), %%rax\n\t"
        "mulq	176(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[23] * B[10]\n\t"
        "movq	80(%[b]), %%rax\n\t"
        "mulq	184(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "movq	%%rbx, 264(%[r])\n\t"
        "#  A[11] * B[23]\n\t"
        "movq	184(%[b]), %%rax\n\t"
        "mulq	88(%[a])\n\t"
        "xorq	%%rbx, %%rbx\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[12] * B[22]\n\t"
        "movq	176(%[b]), %%rax\n\t"
        "mulq	96(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[13] * B[21]\n\t"
        "movq	168(%[b]), %%rax\n\t"
        "mulq	104(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[14] * B[20]\n\t"
        "movq	160(%[b]), %%rax\n\t"
        "mulq	112(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[15] * B[19]\n\t"
        "movq	152(%[b]), %%rax\n\t"
        "mulq	120(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[16] * B[18]\n\t"
        "movq	144(%[b]), %%rax\n\t"
        "mulq	128(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[17] * B[17]\n\t"
        "movq	136(%[b]), %%rax\n\t"
        "mulq	136(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[18] * B[16]\n\t"
        "movq	128(%[b]), %%rax\n\t"
        "mulq	144(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[19] * B[15]\n\t"
        "movq	120(%[b]), %%rax\n\t"
        "mulq	152(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[20] * B[14]\n\t"
        "movq	112(%[b]), %%rax\n\t"
        "mulq	160(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[21] * B[13]\n\t"
        "movq	104(%[b]), %%rax\n\t"
        "mulq	168(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[22] * B[12]\n\t"
        "movq	96(%[b]), %%rax\n\t"
        "mulq	176(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[23] * B[11]\n\t"
        "movq	88(%[b]), %%rax\n\t"
        "mulq	184(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "movq	%%rcx, 272(%[r])\n\t"
        "#  A[12] * B[23]\n\t"
        "movq	184(%[b]), %%rax\n\t"
        "mulq	96(%[a])\n\t"
        "xorq	%%rcx, %%rcx\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[13] * B[22]\n\t"
        "movq	176(%[b]), %%rax\n\t"
        "mulq	104(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[14] * B[21]\n\t"
        "movq	168(%[b]), %%rax\n\t"
        "mulq	112(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[15] * B[20]\n\t"
        "movq	160(%[b]), %%rax\n\t"
        "mulq	120(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[16] * B[19]\n\t"
        "movq	152(%[b]), %%rax\n\t"
        "mulq	128(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[17] * B[18]\n\t"
        "movq	144(%[b]), %%rax\n\t"
        "mulq	136(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[18] * B[17]\n\t"
        "movq	136(%[b]), %%rax\n\t"
        "mulq	144(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[19] * B[16]\n\t"
        "movq	128(%[b]), %%rax\n\t"
        "mulq	152(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[20] * B[15]\n\t"
        "movq	120(%[b]), %%rax\n\t"
        "mulq	160(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[21] * B[14]\n\t"
        "movq	112(%[b]), %%rax\n\t"
        "mulq	168(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[22] * B[13]\n\t"
        "movq	104(%[b]), %%rax\n\t"
        "mulq	176(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[23] * B[12]\n\t"
        "movq	96(%[b]), %%rax\n\t"
        "mulq	184(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "movq	%%r8, 280(%[r])\n\t"
        "#  A[13] * B[23]\n\t"
        "movq	184(%[b]), %%rax\n\t"
        "mulq	104(%[a])\n\t"
        "xorq	%%r8, %%r8\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[14] * B[22]\n\t"
        "movq	176(%[b]), %%rax\n\t"
        "mulq	112(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[15] * B[21]\n\t"
        "movq	168(%[b]), %%rax\n\t"
        "mulq	120(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[16] * B[20]\n\t"
        "movq	160(%[b]), %%rax\n\t"
        "mulq	128(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[17] * B[19]\n\t"
        "movq	152(%[b]), %%rax\n\t"
        "mulq	136(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[18] * B[18]\n\t"
        "movq	144(%[b]), %%rax\n\t"
        "mulq	144(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[19] * B[17]\n\t"
        "movq	136(%[b]), %%rax\n\t"
        "mulq	152(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[20] * B[16]\n\t"
        "movq	128(%[b]), %%rax\n\t"
        "mulq	160(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[21] * B[15]\n\t"
        "movq	120(%[b]), %%rax\n\t"
        "mulq	168(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[22] * B[14]\n\t"
        "movq	112(%[b]), %%rax\n\t"
        "mulq	176(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[23] * B[13]\n\t"
        "movq	104(%[b]), %%rax\n\t"
        "mulq	184(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "movq	%%rbx, 288(%[r])\n\t"
        "#  A[14] * B[23]\n\t"
        "movq	184(%[b]), %%rax\n\t"
        "mulq	112(%[a])\n\t"
        "xorq	%%rbx, %%rbx\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[15] * B[22]\n\t"
        "movq	176(%[b]), %%rax\n\t"
        "mulq	120(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[16] * B[21]\n\t"
        "movq	168(%[b]), %%rax\n\t"
        "mulq	128(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[17] * B[20]\n\t"
        "movq	160(%[b]), %%rax\n\t"
        "mulq	136(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[18] * B[19]\n\t"
        "movq	152(%[b]), %%rax\n\t"
        "mulq	144(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[19] * B[18]\n\t"
        "movq	144(%[b]), %%rax\n\t"
        "mulq	152(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[20] * B[17]\n\t"
        "movq	136(%[b]), %%rax\n\t"
        "mulq	160(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[21] * B[16]\n\t"
        "movq	128(%[b]), %%rax\n\t"
        "mulq	168(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[22] * B[15]\n\t"
        "movq	120(%[b]), %%rax\n\t"
        "mulq	176(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[23] * B[14]\n\t"
        "movq	112(%[b]), %%rax\n\t"
        "mulq	184(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "movq	%%rcx, 296(%[r])\n\t"
        "#  A[15] * B[23]\n\t"
        "movq	184(%[b]), %%rax\n\t"
        "mulq	120(%[a])\n\t"
        "xorq	%%rcx, %%rcx\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[16] * B[22]\n\t"
        "movq	176(%[b]), %%rax\n\t"
        "mulq	128(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[17] * B[21]\n\t"
        "movq	168(%[b]), %%rax\n\t"
        "mulq	136(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[18] * B[20]\n\t"
        "movq	160(%[b]), %%rax\n\t"
        "mulq	144(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[19] * B[19]\n\t"
        "movq	152(%[b]), %%rax\n\t"
        "mulq	152(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[20] * B[18]\n\t"
        "movq	144(%[b]), %%rax\n\t"
        "mulq	160(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[21] * B[17]\n\t"
        "movq	136(%[b]), %%rax\n\t"
        "mulq	168(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[22] * B[16]\n\t"
        "movq	128(%[b]), %%rax\n\t"
        "mulq	176(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[23] * B[15]\n\t"
        "movq	120(%[b]), %%rax\n\t"
        "mulq	184(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "movq	%%r8, 304(%[r])\n\t"
        "#  A[16] * B[23]\n\t"
        "movq	184(%[b]), %%rax\n\t"
        "mulq	128(%[a])\n\t"
        "xorq	%%r8, %%r8\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[17] * B[22]\n\t"
        "movq	176(%[b]), %%rax\n\t"
        "mulq	136(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[18] * B[21]\n\t"
        "movq	168(%[b]), %%rax\n\t"
        "mulq	144(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[19] * B[20]\n\t"
        "movq	160(%[b]), %%rax\n\t"
        "mulq	152(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[20] * B[19]\n\t"
        "movq	152(%[b]), %%rax\n\t"
        "mulq	160(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[21] * B[18]\n\t"
        "movq	144(%[b]), %%rax\n\t"
        "mulq	168(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[22] * B[17]\n\t"
        "movq	136(%[b]), %%rax\n\t"
        "mulq	176(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[23] * B[16]\n\t"
        "movq	128(%[b]), %%rax\n\t"
        "mulq	184(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "movq	%%rbx, 312(%[r])\n\t"
        "#  A[17] * B[23]\n\t"
        "movq	184(%[b]), %%rax\n\t"
        "mulq	136(%[a])\n\t"
        "xorq	%%rbx, %%rbx\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[18] * B[22]\n\t"
        "movq	176(%[b]), %%rax\n\t"
        "mulq	144(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[19] * B[21]\n\t"
        "movq	168(%[b]), %%rax\n\t"
        "mulq	152(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[20] * B[20]\n\t"
        "movq	160(%[b]), %%rax\n\t"
        "mulq	160(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[21] * B[19]\n\t"
        "movq	152(%[b]), %%rax\n\t"
        "mulq	168(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[22] * B[18]\n\t"
        "movq	144(%[b]), %%rax\n\t"
        "mulq	176(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[23] * B[17]\n\t"
        "movq	136(%[b]), %%rax\n\t"
        "mulq	184(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "movq	%%rcx, 320(%[r])\n\t"
        "#  A[18] * B[23]\n\t"
        "movq	184(%[b]), %%rax\n\t"
        "mulq	144(%[a])\n\t"
        "xorq	%%rcx, %%rcx\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[19] * B[22]\n\t"
        "movq	176(%[b]), %%rax\n\t"
        "mulq	152(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[20] * B[21]\n\t"
        "movq	168(%[b]), %%rax\n\t"
        "mulq	160(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[21] * B[20]\n\t"
        "movq	160(%[b]), %%rax\n\t"
        "mulq	168(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[22] * B[19]\n\t"
        "movq	152(%[b]), %%rax\n\t"
        "mulq	176(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[23] * B[18]\n\t"
        "movq	144(%[b]), %%rax\n\t"
        "mulq	184(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "movq	%%r8, 328(%[r])\n\t"
        "#  A[19] * B[23]\n\t"
        "movq	184(%[b]), %%rax\n\t"
        "mulq	152(%[a])\n\t"
        "xorq	%%r8, %%r8\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[20] * B[22]\n\t"
        "movq	176(%[b]), %%rax\n\t"
        "mulq	160(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[21] * B[21]\n\t"
        "movq	168(%[b]), %%rax\n\t"
        "mulq	168(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[22] * B[20]\n\t"
        "movq	160(%[b]), %%rax\n\t"
        "mulq	176(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[23] * B[19]\n\t"
        "movq	152(%[b]), %%rax\n\t"
        "mulq	184(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "movq	%%rbx, 336(%[r])\n\t"
        "#  A[20] * B[23]\n\t"
        "movq	184(%[b]), %%rax\n\t"
        "mulq	160(%[a])\n\t"
        "xorq	%%rbx, %%rbx\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[21] * B[22]\n\t"
        "movq	176(%[b]), %%rax\n\t"
        "mulq	168(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[22] * B[21]\n\t"
        "movq	168(%[b]), %%rax\n\t"
        "mulq	176(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "#  A[23] * B[20]\n\t"
        "movq	160(%[b]), %%rax\n\t"
        "mulq	184(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "movq	%%rcx, 344(%[r])\n\t"
        "#  A[21] * B[23]\n\t"
        "movq	184(%[b]), %%rax\n\t"
        "mulq	168(%[a])\n\t"
        "xorq	%%rcx, %%rcx\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[22] * B[22]\n\t"
        "movq	176(%[b]), %%rax\n\t"
        "mulq	176(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[23] * B[21]\n\t"
        "movq	168(%[b]), %%rax\n\t"
        "mulq	184(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "movq	%%r8, 352(%[r])\n\t"
        "#  A[22] * B[23]\n\t"
        "movq	184(%[b]), %%rax\n\t"
        "mulq	176(%[a])\n\t"
        "xorq	%%r8, %%r8\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[23] * B[22]\n\t"
        "movq	176(%[b]), %%rax\n\t"
        "mulq	184(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "movq	%%rbx, 360(%[r])\n\t"
        "#  A[23] * B[23]\n\t"
        "movq	184(%[b]), %%rax\n\t"
        "mulq	184(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "movq	%%rcx, 368(%[r])\n\t"
        "movq	%%r8, 376(%[r])\n\t"
        :
        : [r] "r" (r), [a] "r" (a), [b] "r" (b), [tmp] "r" (tmp)
        : "memory", "rax", "rdx", "rbx", "rcx", "r8"
    );

    XMEMCPY(r, tmp, sizeof(tmp));
}

/* Square a and put result in r. (r = a * a)
 *
 * r  A single precision integer that receives the result of the squaring.
 * a  A single precision integer to be squared.
 */
SP_NOINLINE static void sp_3072_sqr_24(sp_digit* r, const sp_digit* a)
{
    sp_digit tmp[24];

    __asm__ __volatile__ (
        "#  A[0] * A[0]\n\t"
        "movq	0(%[a]), %%rax\n\t"
        "mulq	%%rax\n\t"
        "xorq	%%r9, %%r9\n\t"
        "movq	%%rax, (%[tmp])\n\t"
        "movq	%%rdx, %%r8\n\t"
        "#  A[0] * A[1]\n\t"
        "movq	8(%[a]), %%rax\n\t"
        "mulq	0(%[a])\n\t"
        "xorq	%%rcx, %%rcx\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%r9\n\t"
        "adcq	$0, %%rcx\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%r9\n\t"
        "adcq	$0, %%rcx\n\t"
        "movq	%%r8, 8(%[tmp])\n\t"
        "#  A[0] * A[2]\n\t"
        "movq	16(%[a]), %%rax\n\t"
        "mulq	0(%[a])\n\t"
        "xorq	%%r8, %%r8\n\t"
        "addq	%%rax, %%r9\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "addq	%%rax, %%r9\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[1] * A[1]\n\t"
        "movq	8(%[a]), %%rax\n\t"
        "mulq	%%rax\n\t"
        "addq	%%rax, %%r9\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "movq	%%r9, 16(%[tmp])\n\t"
        "#  A[0] * A[3]\n\t"
        "movq	24(%[a]), %%rax\n\t"
        "mulq	0(%[a])\n\t"
        "xorq	%%r9, %%r9\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%r9\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%r9\n\t"
        "#  A[1] * A[2]\n\t"
        "movq	16(%[a]), %%rax\n\t"
        "mulq	8(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%r9\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%r9\n\t"
        "movq	%%rcx, 24(%[tmp])\n\t"
        "#  A[0] * A[4]\n\t"
        "movq	32(%[a]), %%rax\n\t"
        "mulq	0(%[a])\n\t"
        "xorq	%%rcx, %%rcx\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%r9\n\t"
        "adcq	$0, %%rcx\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%r9\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[1] * A[3]\n\t"
        "movq	24(%[a]), %%rax\n\t"
        "mulq	8(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%r9\n\t"
        "adcq	$0, %%rcx\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%r9\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[2] * A[2]\n\t"
        "movq	16(%[a]), %%rax\n\t"
        "mulq	%%rax\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%r9\n\t"
        "adcq	$0, %%rcx\n\t"
        "movq	%%r8, 32(%[tmp])\n\t"
        "#  A[0] * A[5]\n\t"
        "movq	40(%[a]), %%rax\n\t"
        "mulq	0(%[a])\n\t"
        "xorq	%%r8, %%r8\n\t"
        "xorq	%%r12, %%r12\n\t"
        "movq	%%rax, %%r10\n\t"
        "movq	%%rdx, %%r11\n\t"
        "#  A[1] * A[4]\n\t"
        "movq	32(%[a]), %%rax\n\t"
        "mulq	8(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[2] * A[3]\n\t"
        "movq	24(%[a]), %%rax\n\t"
        "mulq	16(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "addq	%%r10, %%r10\n\t"
        "adcq	%%r11, %%r11\n\t"
        "adcq	%%r12, %%r12\n\t"
        "addq	%%r10, %%r9\n\t"
        "adcq	%%r11, %%rcx\n\t"
        "adcq	%%r12, %%r8\n\t"
        "movq	%%r9, 40(%[tmp])\n\t"
        "#  A[0] * A[6]\n\t"
        "movq	48(%[a]), %%rax\n\t"
        "mulq	0(%[a])\n\t"
        "xorq	%%r9, %%r9\n\t"
        "xorq	%%r12, %%r12\n\t"
        "movq	%%rax, %%r10\n\t"
        "movq	%%rdx, %%r11\n\t"
        "#  A[1] * A[5]\n\t"
        "movq	40(%[a]), %%rax\n\t"
        "mulq	8(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[2] * A[4]\n\t"
        "movq	32(%[a]), %%rax\n\t"
        "mulq	16(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[3] * A[3]\n\t"
        "movq	24(%[a]), %%rax\n\t"
        "mulq	%%rax\n\t"
        "addq	%%r10, %%r10\n\t"
        "adcq	%%r11, %%r11\n\t"
        "adcq	%%r12, %%r12\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "addq	%%r10, %%rcx\n\t"
        "adcq	%%r11, %%r8\n\t"
        "adcq	%%r12, %%r9\n\t"
        "movq	%%rcx, 48(%[tmp])\n\t"
        "#  A[0] * A[7]\n\t"
        "movq	56(%[a]), %%rax\n\t"
        "mulq	0(%[a])\n\t"
        "xorq	%%rcx, %%rcx\n\t"
        "xorq	%%r12, %%r12\n\t"
        "movq	%%rax, %%r10\n\t"
        "movq	%%rdx, %%r11\n\t"
        "#  A[1] * A[6]\n\t"
        "movq	48(%[a]), %%rax\n\t"
        "mulq	8(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[2] * A[5]\n\t"
        "movq	40(%[a]), %%rax\n\t"
        "mulq	16(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[3] * A[4]\n\t"
        "movq	32(%[a]), %%rax\n\t"
        "mulq	24(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "addq	%%r10, %%r10\n\t"
        "adcq	%%r11, %%r11\n\t"
        "adcq	%%r12, %%r12\n\t"
        "addq	%%r10, %%r8\n\t"
        "adcq	%%r11, %%r9\n\t"
        "adcq	%%r12, %%rcx\n\t"
        "movq	%%r8, 56(%[tmp])\n\t"
        "#  A[0] * A[8]\n\t"
        "movq	64(%[a]), %%rax\n\t"
        "mulq	0(%[a])\n\t"
        "xorq	%%r8, %%r8\n\t"
        "xorq	%%r12, %%r12\n\t"
        "movq	%%rax, %%r10\n\t"
        "movq	%%rdx, %%r11\n\t"
        "#  A[1] * A[7]\n\t"
        "movq	56(%[a]), %%rax\n\t"
        "mulq	8(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[2] * A[6]\n\t"
        "movq	48(%[a]), %%rax\n\t"
        "mulq	16(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[3] * A[5]\n\t"
        "movq	40(%[a]), %%rax\n\t"
        "mulq	24(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[4] * A[4]\n\t"
        "movq	32(%[a]), %%rax\n\t"
        "mulq	%%rax\n\t"
        "addq	%%r10, %%r10\n\t"
        "adcq	%%r11, %%r11\n\t"
        "adcq	%%r12, %%r12\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "addq	%%r10, %%r9\n\t"
        "adcq	%%r11, %%rcx\n\t"
        "adcq	%%r12, %%r8\n\t"
        "movq	%%r9, 64(%[tmp])\n\t"
        "#  A[0] * A[9]\n\t"
        "movq	72(%[a]), %%rax\n\t"
        "mulq	0(%[a])\n\t"
        "xorq	%%r9, %%r9\n\t"
        "xorq	%%r12, %%r12\n\t"
        "movq	%%rax, %%r10\n\t"
        "movq	%%rdx, %%r11\n\t"
        "#  A[1] * A[8]\n\t"
        "movq	64(%[a]), %%rax\n\t"
        "mulq	8(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[2] * A[7]\n\t"
        "movq	56(%[a]), %%rax\n\t"
        "mulq	16(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[3] * A[6]\n\t"
        "movq	48(%[a]), %%rax\n\t"
        "mulq	24(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[4] * A[5]\n\t"
        "movq	40(%[a]), %%rax\n\t"
        "mulq	32(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "addq	%%r10, %%r10\n\t"
        "adcq	%%r11, %%r11\n\t"
        "adcq	%%r12, %%r12\n\t"
        "addq	%%r10, %%rcx\n\t"
        "adcq	%%r11, %%r8\n\t"
        "adcq	%%r12, %%r9\n\t"
        "movq	%%rcx, 72(%[tmp])\n\t"
        "#  A[0] * A[10]\n\t"
        "movq	80(%[a]), %%rax\n\t"
        "mulq	0(%[a])\n\t"
        "xorq	%%rcx, %%rcx\n\t"
        "xorq	%%r12, %%r12\n\t"
        "movq	%%rax, %%r10\n\t"
        "movq	%%rdx, %%r11\n\t"
        "#  A[1] * A[9]\n\t"
        "movq	72(%[a]), %%rax\n\t"
        "mulq	8(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[2] * A[8]\n\t"
        "movq	64(%[a]), %%rax\n\t"
        "mulq	16(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[3] * A[7]\n\t"
        "movq	56(%[a]), %%rax\n\t"
        "mulq	24(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[4] * A[6]\n\t"
        "movq	48(%[a]), %%rax\n\t"
        "mulq	32(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[5] * A[5]\n\t"
        "movq	40(%[a]), %%rax\n\t"
        "mulq	%%rax\n\t"
        "addq	%%r10, %%r10\n\t"
        "adcq	%%r11, %%r11\n\t"
        "adcq	%%r12, %%r12\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "addq	%%r10, %%r8\n\t"
        "adcq	%%r11, %%r9\n\t"
        "adcq	%%r12, %%rcx\n\t"
        "movq	%%r8, 80(%[tmp])\n\t"
        "#  A[0] * A[11]\n\t"
        "movq	88(%[a]), %%rax\n\t"
        "mulq	0(%[a])\n\t"
        "xorq	%%r8, %%r8\n\t"
        "xorq	%%r12, %%r12\n\t"
        "movq	%%rax, %%r10\n\t"
        "movq	%%rdx, %%r11\n\t"
        "#  A[1] * A[10]\n\t"
        "movq	80(%[a]), %%rax\n\t"
        "mulq	8(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[2] * A[9]\n\t"
        "movq	72(%[a]), %%rax\n\t"
        "mulq	16(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[3] * A[8]\n\t"
        "movq	64(%[a]), %%rax\n\t"
        "mulq	24(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[4] * A[7]\n\t"
        "movq	56(%[a]), %%rax\n\t"
        "mulq	32(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[5] * A[6]\n\t"
        "movq	48(%[a]), %%rax\n\t"
        "mulq	40(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "addq	%%r10, %%r10\n\t"
        "adcq	%%r11, %%r11\n\t"
        "adcq	%%r12, %%r12\n\t"
        "addq	%%r10, %%r9\n\t"
        "adcq	%%r11, %%rcx\n\t"
        "adcq	%%r12, %%r8\n\t"
        "movq	%%r9, 88(%[tmp])\n\t"
        "#  A[0] * A[12]\n\t"
        "movq	96(%[a]), %%rax\n\t"
        "mulq	0(%[a])\n\t"
        "xorq	%%r9, %%r9\n\t"
        "xorq	%%r12, %%r12\n\t"
        "movq	%%rax, %%r10\n\t"
        "movq	%%rdx, %%r11\n\t"
        "#  A[1] * A[11]\n\t"
        "movq	88(%[a]), %%rax\n\t"
        "mulq	8(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[2] * A[10]\n\t"
        "movq	80(%[a]), %%rax\n\t"
        "mulq	16(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[3] * A[9]\n\t"
        "movq	72(%[a]), %%rax\n\t"
        "mulq	24(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[4] * A[8]\n\t"
        "movq	64(%[a]), %%rax\n\t"
        "mulq	32(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[5] * A[7]\n\t"
        "movq	56(%[a]), %%rax\n\t"
        "mulq	40(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[6] * A[6]\n\t"
        "movq	48(%[a]), %%rax\n\t"
        "mulq	%%rax\n\t"
        "addq	%%r10, %%r10\n\t"
        "adcq	%%r11, %%r11\n\t"
        "adcq	%%r12, %%r12\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "addq	%%r10, %%rcx\n\t"
        "adcq	%%r11, %%r8\n\t"
        "adcq	%%r12, %%r9\n\t"
        "movq	%%rcx, 96(%[tmp])\n\t"
        "#  A[0] * A[13]\n\t"
        "movq	104(%[a]), %%rax\n\t"
        "mulq	0(%[a])\n\t"
        "xorq	%%rcx, %%rcx\n\t"
        "xorq	%%r12, %%r12\n\t"
        "movq	%%rax, %%r10\n\t"
        "movq	%%rdx, %%r11\n\t"
        "#  A[1] * A[12]\n\t"
        "movq	96(%[a]), %%rax\n\t"
        "mulq	8(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[2] * A[11]\n\t"
        "movq	88(%[a]), %%rax\n\t"
        "mulq	16(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[3] * A[10]\n\t"
        "movq	80(%[a]), %%rax\n\t"
        "mulq	24(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[4] * A[9]\n\t"
        "movq	72(%[a]), %%rax\n\t"
        "mulq	32(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[5] * A[8]\n\t"
        "movq	64(%[a]), %%rax\n\t"
        "mulq	40(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[6] * A[7]\n\t"
        "movq	56(%[a]), %%rax\n\t"
        "mulq	48(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "addq	%%r10, %%r10\n\t"
        "adcq	%%r11, %%r11\n\t"
        "adcq	%%r12, %%r12\n\t"
        "addq	%%r10, %%r8\n\t"
        "adcq	%%r11, %%r9\n\t"
        "adcq	%%r12, %%rcx\n\t"
        "movq	%%r8, 104(%[tmp])\n\t"
        "#  A[0] * A[14]\n\t"
        "movq	112(%[a]), %%rax\n\t"
        "mulq	0(%[a])\n\t"
        "xorq	%%r8, %%r8\n\t"
        "xorq	%%r12, %%r12\n\t"
        "movq	%%rax, %%r10\n\t"
        "movq	%%rdx, %%r11\n\t"
        "#  A[1] * A[13]\n\t"
        "movq	104(%[a]), %%rax\n\t"
        "mulq	8(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[2] * A[12]\n\t"
        "movq	96(%[a]), %%rax\n\t"
        "mulq	16(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[3] * A[11]\n\t"
        "movq	88(%[a]), %%rax\n\t"
        "mulq	24(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[4] * A[10]\n\t"
        "movq	80(%[a]), %%rax\n\t"
        "mulq	32(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[5] * A[9]\n\t"
        "movq	72(%[a]), %%rax\n\t"
        "mulq	40(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[6] * A[8]\n\t"
        "movq	64(%[a]), %%rax\n\t"
        "mulq	48(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[7] * A[7]\n\t"
        "movq	56(%[a]), %%rax\n\t"
        "mulq	%%rax\n\t"
        "addq	%%r10, %%r10\n\t"
        "adcq	%%r11, %%r11\n\t"
        "adcq	%%r12, %%r12\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "addq	%%r10, %%r9\n\t"
        "adcq	%%r11, %%rcx\n\t"
        "adcq	%%r12, %%r8\n\t"
        "movq	%%r9, 112(%[tmp])\n\t"
        "#  A[0] * A[15]\n\t"
        "movq	120(%[a]), %%rax\n\t"
        "mulq	0(%[a])\n\t"
        "xorq	%%r9, %%r9\n\t"
        "xorq	%%r12, %%r12\n\t"
        "movq	%%rax, %%r10\n\t"
        "movq	%%rdx, %%r11\n\t"
        "#  A[1] * A[14]\n\t"
        "movq	112(%[a]), %%rax\n\t"
        "mulq	8(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[2] * A[13]\n\t"
        "movq	104(%[a]), %%rax\n\t"
        "mulq	16(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[3] * A[12]\n\t"
        "movq	96(%[a]), %%rax\n\t"
        "mulq	24(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[4] * A[11]\n\t"
        "movq	88(%[a]), %%rax\n\t"
        "mulq	32(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[5] * A[10]\n\t"
        "movq	80(%[a]), %%rax\n\t"
        "mulq	40(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[6] * A[9]\n\t"
        "movq	72(%[a]), %%rax\n\t"
        "mulq	48(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[7] * A[8]\n\t"
        "movq	64(%[a]), %%rax\n\t"
        "mulq	56(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "addq	%%r10, %%r10\n\t"
        "adcq	%%r11, %%r11\n\t"
        "adcq	%%r12, %%r12\n\t"
        "addq	%%r10, %%rcx\n\t"
        "adcq	%%r11, %%r8\n\t"
        "adcq	%%r12, %%r9\n\t"
        "movq	%%rcx, 120(%[tmp])\n\t"
        "#  A[0] * A[16]\n\t"
        "movq	128(%[a]), %%rax\n\t"
        "mulq	0(%[a])\n\t"
        "xorq	%%rcx, %%rcx\n\t"
        "xorq	%%r12, %%r12\n\t"
        "movq	%%rax, %%r10\n\t"
        "movq	%%rdx, %%r11\n\t"
        "#  A[1] * A[15]\n\t"
        "movq	120(%[a]), %%rax\n\t"
        "mulq	8(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[2] * A[14]\n\t"
        "movq	112(%[a]), %%rax\n\t"
        "mulq	16(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[3] * A[13]\n\t"
        "movq	104(%[a]), %%rax\n\t"
        "mulq	24(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[4] * A[12]\n\t"
        "movq	96(%[a]), %%rax\n\t"
        "mulq	32(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[5] * A[11]\n\t"
        "movq	88(%[a]), %%rax\n\t"
        "mulq	40(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[6] * A[10]\n\t"
        "movq	80(%[a]), %%rax\n\t"
        "mulq	48(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[7] * A[9]\n\t"
        "movq	72(%[a]), %%rax\n\t"
        "mulq	56(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[8] * A[8]\n\t"
        "movq	64(%[a]), %%rax\n\t"
        "mulq	%%rax\n\t"
        "addq	%%r10, %%r10\n\t"
        "adcq	%%r11, %%r11\n\t"
        "adcq	%%r12, %%r12\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "addq	%%r10, %%r8\n\t"
        "adcq	%%r11, %%r9\n\t"
        "adcq	%%r12, %%rcx\n\t"
        "movq	%%r8, 128(%[tmp])\n\t"
        "#  A[0] * A[17]\n\t"
        "movq	136(%[a]), %%rax\n\t"
        "mulq	0(%[a])\n\t"
        "xorq	%%r8, %%r8\n\t"
        "xorq	%%r12, %%r12\n\t"
        "movq	%%rax, %%r10\n\t"
        "movq	%%rdx, %%r11\n\t"
        "#  A[1] * A[16]\n\t"
        "movq	128(%[a]), %%rax\n\t"
        "mulq	8(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[2] * A[15]\n\t"
        "movq	120(%[a]), %%rax\n\t"
        "mulq	16(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[3] * A[14]\n\t"
        "movq	112(%[a]), %%rax\n\t"
        "mulq	24(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[4] * A[13]\n\t"
        "movq	104(%[a]), %%rax\n\t"
        "mulq	32(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[5] * A[12]\n\t"
        "movq	96(%[a]), %%rax\n\t"
        "mulq	40(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[6] * A[11]\n\t"
        "movq	88(%[a]), %%rax\n\t"
        "mulq	48(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[7] * A[10]\n\t"
        "movq	80(%[a]), %%rax\n\t"
        "mulq	56(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[8] * A[9]\n\t"
        "movq	72(%[a]), %%rax\n\t"
        "mulq	64(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "addq	%%r10, %%r10\n\t"
        "adcq	%%r11, %%r11\n\t"
        "adcq	%%r12, %%r12\n\t"
        "addq	%%r10, %%r9\n\t"
        "adcq	%%r11, %%rcx\n\t"
        "adcq	%%r12, %%r8\n\t"
        "movq	%%r9, 136(%[tmp])\n\t"
        "#  A[0] * A[18]\n\t"
        "movq	144(%[a]), %%rax\n\t"
        "mulq	0(%[a])\n\t"
        "xorq	%%r9, %%r9\n\t"
        "xorq	%%r12, %%r12\n\t"
        "movq	%%rax, %%r10\n\t"
        "movq	%%rdx, %%r11\n\t"
        "#  A[1] * A[17]\n\t"
        "movq	136(%[a]), %%rax\n\t"
        "mulq	8(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[2] * A[16]\n\t"
        "movq	128(%[a]), %%rax\n\t"
        "mulq	16(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[3] * A[15]\n\t"
        "movq	120(%[a]), %%rax\n\t"
        "mulq	24(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[4] * A[14]\n\t"
        "movq	112(%[a]), %%rax\n\t"
        "mulq	32(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[5] * A[13]\n\t"
        "movq	104(%[a]), %%rax\n\t"
        "mulq	40(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[6] * A[12]\n\t"
        "movq	96(%[a]), %%rax\n\t"
        "mulq	48(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[7] * A[11]\n\t"
        "movq	88(%[a]), %%rax\n\t"
        "mulq	56(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[8] * A[10]\n\t"
        "movq	80(%[a]), %%rax\n\t"
        "mulq	64(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[9] * A[9]\n\t"
        "movq	72(%[a]), %%rax\n\t"
        "mulq	%%rax\n\t"
        "addq	%%r10, %%r10\n\t"
        "adcq	%%r11, %%r11\n\t"
        "adcq	%%r12, %%r12\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "addq	%%r10, %%rcx\n\t"
        "adcq	%%r11, %%r8\n\t"
        "adcq	%%r12, %%r9\n\t"
        "movq	%%rcx, 144(%[tmp])\n\t"
        "#  A[0] * A[19]\n\t"
        "movq	152(%[a]), %%rax\n\t"
        "mulq	0(%[a])\n\t"
        "xorq	%%rcx, %%rcx\n\t"
        "xorq	%%r12, %%r12\n\t"
        "movq	%%rax, %%r10\n\t"
        "movq	%%rdx, %%r11\n\t"
        "#  A[1] * A[18]\n\t"
        "movq	144(%[a]), %%rax\n\t"
        "mulq	8(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[2] * A[17]\n\t"
        "movq	136(%[a]), %%rax\n\t"
        "mulq	16(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[3] * A[16]\n\t"
        "movq	128(%[a]), %%rax\n\t"
        "mulq	24(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[4] * A[15]\n\t"
        "movq	120(%[a]), %%rax\n\t"
        "mulq	32(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[5] * A[14]\n\t"
        "movq	112(%[a]), %%rax\n\t"
        "mulq	40(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[6] * A[13]\n\t"
        "movq	104(%[a]), %%rax\n\t"
        "mulq	48(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[7] * A[12]\n\t"
        "movq	96(%[a]), %%rax\n\t"
        "mulq	56(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[8] * A[11]\n\t"
        "movq	88(%[a]), %%rax\n\t"
        "mulq	64(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[9] * A[10]\n\t"
        "movq	80(%[a]), %%rax\n\t"
        "mulq	72(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "addq	%%r10, %%r10\n\t"
        "adcq	%%r11, %%r11\n\t"
        "adcq	%%r12, %%r12\n\t"
        "addq	%%r10, %%r8\n\t"
        "adcq	%%r11, %%r9\n\t"
        "adcq	%%r12, %%rcx\n\t"
        "movq	%%r8, 152(%[tmp])\n\t"
        "#  A[0] * A[20]\n\t"
        "movq	160(%[a]), %%rax\n\t"
        "mulq	0(%[a])\n\t"
        "xorq	%%r8, %%r8\n\t"
        "xorq	%%r12, %%r12\n\t"
        "movq	%%rax, %%r10\n\t"
        "movq	%%rdx, %%r11\n\t"
        "#  A[1] * A[19]\n\t"
        "movq	152(%[a]), %%rax\n\t"
        "mulq	8(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[2] * A[18]\n\t"
        "movq	144(%[a]), %%rax\n\t"
        "mulq	16(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[3] * A[17]\n\t"
        "movq	136(%[a]), %%rax\n\t"
        "mulq	24(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[4] * A[16]\n\t"
        "movq	128(%[a]), %%rax\n\t"
        "mulq	32(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[5] * A[15]\n\t"
        "movq	120(%[a]), %%rax\n\t"
        "mulq	40(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[6] * A[14]\n\t"
        "movq	112(%[a]), %%rax\n\t"
        "mulq	48(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[7] * A[13]\n\t"
        "movq	104(%[a]), %%rax\n\t"
        "mulq	56(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[8] * A[12]\n\t"
        "movq	96(%[a]), %%rax\n\t"
        "mulq	64(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[9] * A[11]\n\t"
        "movq	88(%[a]), %%rax\n\t"
        "mulq	72(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[10] * A[10]\n\t"
        "movq	80(%[a]), %%rax\n\t"
        "mulq	%%rax\n\t"
        "addq	%%r10, %%r10\n\t"
        "adcq	%%r11, %%r11\n\t"
        "adcq	%%r12, %%r12\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "addq	%%r10, %%r9\n\t"
        "adcq	%%r11, %%rcx\n\t"
        "adcq	%%r12, %%r8\n\t"
        "movq	%%r9, 160(%[tmp])\n\t"
        "#  A[0] * A[21]\n\t"
        "movq	168(%[a]), %%rax\n\t"
        "mulq	0(%[a])\n\t"
        "xorq	%%r9, %%r9\n\t"
        "xorq	%%r12, %%r12\n\t"
        "movq	%%rax, %%r10\n\t"
        "movq	%%rdx, %%r11\n\t"
        "#  A[1] * A[20]\n\t"
        "movq	160(%[a]), %%rax\n\t"
        "mulq	8(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[2] * A[19]\n\t"
        "movq	152(%[a]), %%rax\n\t"
        "mulq	16(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[3] * A[18]\n\t"
        "movq	144(%[a]), %%rax\n\t"
        "mulq	24(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[4] * A[17]\n\t"
        "movq	136(%[a]), %%rax\n\t"
        "mulq	32(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[5] * A[16]\n\t"
        "movq	128(%[a]), %%rax\n\t"
        "mulq	40(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[6] * A[15]\n\t"
        "movq	120(%[a]), %%rax\n\t"
        "mulq	48(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[7] * A[14]\n\t"
        "movq	112(%[a]), %%rax\n\t"
        "mulq	56(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[8] * A[13]\n\t"
        "movq	104(%[a]), %%rax\n\t"
        "mulq	64(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[9] * A[12]\n\t"
        "movq	96(%[a]), %%rax\n\t"
        "mulq	72(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[10] * A[11]\n\t"
        "movq	88(%[a]), %%rax\n\t"
        "mulq	80(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "addq	%%r10, %%r10\n\t"
        "adcq	%%r11, %%r11\n\t"
        "adcq	%%r12, %%r12\n\t"
        "addq	%%r10, %%rcx\n\t"
        "adcq	%%r11, %%r8\n\t"
        "adcq	%%r12, %%r9\n\t"
        "movq	%%rcx, 168(%[tmp])\n\t"
        "#  A[0] * A[22]\n\t"
        "movq	176(%[a]), %%rax\n\t"
        "mulq	0(%[a])\n\t"
        "xorq	%%rcx, %%rcx\n\t"
        "xorq	%%r12, %%r12\n\t"
        "movq	%%rax, %%r10\n\t"
        "movq	%%rdx, %%r11\n\t"
        "#  A[1] * A[21]\n\t"
        "movq	168(%[a]), %%rax\n\t"
        "mulq	8(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[2] * A[20]\n\t"
        "movq	160(%[a]), %%rax\n\t"
        "mulq	16(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[3] * A[19]\n\t"
        "movq	152(%[a]), %%rax\n\t"
        "mulq	24(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[4] * A[18]\n\t"
        "movq	144(%[a]), %%rax\n\t"
        "mulq	32(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[5] * A[17]\n\t"
        "movq	136(%[a]), %%rax\n\t"
        "mulq	40(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[6] * A[16]\n\t"
        "movq	128(%[a]), %%rax\n\t"
        "mulq	48(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[7] * A[15]\n\t"
        "movq	120(%[a]), %%rax\n\t"
        "mulq	56(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[8] * A[14]\n\t"
        "movq	112(%[a]), %%rax\n\t"
        "mulq	64(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[9] * A[13]\n\t"
        "movq	104(%[a]), %%rax\n\t"
        "mulq	72(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[10] * A[12]\n\t"
        "movq	96(%[a]), %%rax\n\t"
        "mulq	80(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[11] * A[11]\n\t"
        "movq	88(%[a]), %%rax\n\t"
        "mulq	%%rax\n\t"
        "addq	%%r10, %%r10\n\t"
        "adcq	%%r11, %%r11\n\t"
        "adcq	%%r12, %%r12\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "addq	%%r10, %%r8\n\t"
        "adcq	%%r11, %%r9\n\t"
        "adcq	%%r12, %%rcx\n\t"
        "movq	%%r8, 176(%[tmp])\n\t"
        "#  A[0] * A[23]\n\t"
        "movq	184(%[a]), %%rax\n\t"
        "mulq	0(%[a])\n\t"
        "xorq	%%r8, %%r8\n\t"
        "xorq	%%r12, %%r12\n\t"
        "movq	%%rax, %%r10\n\t"
        "movq	%%rdx, %%r11\n\t"
        "#  A[1] * A[22]\n\t"
        "movq	176(%[a]), %%rax\n\t"
        "mulq	8(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[2] * A[21]\n\t"
        "movq	168(%[a]), %%rax\n\t"
        "mulq	16(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[3] * A[20]\n\t"
        "movq	160(%[a]), %%rax\n\t"
        "mulq	24(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[4] * A[19]\n\t"
        "movq	152(%[a]), %%rax\n\t"
        "mulq	32(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[5] * A[18]\n\t"
        "movq	144(%[a]), %%rax\n\t"
        "mulq	40(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[6] * A[17]\n\t"
        "movq	136(%[a]), %%rax\n\t"
        "mulq	48(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[7] * A[16]\n\t"
        "movq	128(%[a]), %%rax\n\t"
        "mulq	56(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[8] * A[15]\n\t"
        "movq	120(%[a]), %%rax\n\t"
        "mulq	64(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[9] * A[14]\n\t"
        "movq	112(%[a]), %%rax\n\t"
        "mulq	72(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[10] * A[13]\n\t"
        "movq	104(%[a]), %%rax\n\t"
        "mulq	80(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[11] * A[12]\n\t"
        "movq	96(%[a]), %%rax\n\t"
        "mulq	88(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "addq	%%r10, %%r10\n\t"
        "adcq	%%r11, %%r11\n\t"
        "adcq	%%r12, %%r12\n\t"
        "addq	%%r10, %%r9\n\t"
        "adcq	%%r11, %%rcx\n\t"
        "adcq	%%r12, %%r8\n\t"
        "movq	%%r9, 184(%[tmp])\n\t"
        "#  A[1] * A[23]\n\t"
        "movq	184(%[a]), %%rax\n\t"
        "mulq	8(%[a])\n\t"
        "xorq	%%r9, %%r9\n\t"
        "xorq	%%r12, %%r12\n\t"
        "movq	%%rax, %%r10\n\t"
        "movq	%%rdx, %%r11\n\t"
        "#  A[2] * A[22]\n\t"
        "movq	176(%[a]), %%rax\n\t"
        "mulq	16(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[3] * A[21]\n\t"
        "movq	168(%[a]), %%rax\n\t"
        "mulq	24(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[4] * A[20]\n\t"
        "movq	160(%[a]), %%rax\n\t"
        "mulq	32(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[5] * A[19]\n\t"
        "movq	152(%[a]), %%rax\n\t"
        "mulq	40(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[6] * A[18]\n\t"
        "movq	144(%[a]), %%rax\n\t"
        "mulq	48(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[7] * A[17]\n\t"
        "movq	136(%[a]), %%rax\n\t"
        "mulq	56(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[8] * A[16]\n\t"
        "movq	128(%[a]), %%rax\n\t"
        "mulq	64(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[9] * A[15]\n\t"
        "movq	120(%[a]), %%rax\n\t"
        "mulq	72(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[10] * A[14]\n\t"
        "movq	112(%[a]), %%rax\n\t"
        "mulq	80(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[11] * A[13]\n\t"
        "movq	104(%[a]), %%rax\n\t"
        "mulq	88(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[12] * A[12]\n\t"
        "movq	96(%[a]), %%rax\n\t"
        "mulq	%%rax\n\t"
        "addq	%%r10, %%r10\n\t"
        "adcq	%%r11, %%r11\n\t"
        "adcq	%%r12, %%r12\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "addq	%%r10, %%rcx\n\t"
        "adcq	%%r11, %%r8\n\t"
        "adcq	%%r12, %%r9\n\t"
        "movq	%%rcx, 192(%[r])\n\t"
        "#  A[2] * A[23]\n\t"
        "movq	184(%[a]), %%rax\n\t"
        "mulq	16(%[a])\n\t"
        "xorq	%%rcx, %%rcx\n\t"
        "xorq	%%r12, %%r12\n\t"
        "movq	%%rax, %%r10\n\t"
        "movq	%%rdx, %%r11\n\t"
        "#  A[3] * A[22]\n\t"
        "movq	176(%[a]), %%rax\n\t"
        "mulq	24(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[4] * A[21]\n\t"
        "movq	168(%[a]), %%rax\n\t"
        "mulq	32(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[5] * A[20]\n\t"
        "movq	160(%[a]), %%rax\n\t"
        "mulq	40(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[6] * A[19]\n\t"
        "movq	152(%[a]), %%rax\n\t"
        "mulq	48(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[7] * A[18]\n\t"
        "movq	144(%[a]), %%rax\n\t"
        "mulq	56(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[8] * A[17]\n\t"
        "movq	136(%[a]), %%rax\n\t"
        "mulq	64(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[9] * A[16]\n\t"
        "movq	128(%[a]), %%rax\n\t"
        "mulq	72(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[10] * A[15]\n\t"
        "movq	120(%[a]), %%rax\n\t"
        "mulq	80(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[11] * A[14]\n\t"
        "movq	112(%[a]), %%rax\n\t"
        "mulq	88(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[12] * A[13]\n\t"
        "movq	104(%[a]), %%rax\n\t"
        "mulq	96(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "addq	%%r10, %%r10\n\t"
        "adcq	%%r11, %%r11\n\t"
        "adcq	%%r12, %%r12\n\t"
        "addq	%%r10, %%r8\n\t"
        "adcq	%%r11, %%r9\n\t"
        "adcq	%%r12, %%rcx\n\t"
        "movq	%%r8, 200(%[r])\n\t"
        "#  A[3] * A[23]\n\t"
        "movq	184(%[a]), %%rax\n\t"
        "mulq	24(%[a])\n\t"
        "xorq	%%r8, %%r8\n\t"
        "xorq	%%r12, %%r12\n\t"
        "movq	%%rax, %%r10\n\t"
        "movq	%%rdx, %%r11\n\t"
        "#  A[4] * A[22]\n\t"
        "movq	176(%[a]), %%rax\n\t"
        "mulq	32(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[5] * A[21]\n\t"
        "movq	168(%[a]), %%rax\n\t"
        "mulq	40(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[6] * A[20]\n\t"
        "movq	160(%[a]), %%rax\n\t"
        "mulq	48(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[7] * A[19]\n\t"
        "movq	152(%[a]), %%rax\n\t"
        "mulq	56(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[8] * A[18]\n\t"
        "movq	144(%[a]), %%rax\n\t"
        "mulq	64(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[9] * A[17]\n\t"
        "movq	136(%[a]), %%rax\n\t"
        "mulq	72(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[10] * A[16]\n\t"
        "movq	128(%[a]), %%rax\n\t"
        "mulq	80(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[11] * A[15]\n\t"
        "movq	120(%[a]), %%rax\n\t"
        "mulq	88(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[12] * A[14]\n\t"
        "movq	112(%[a]), %%rax\n\t"
        "mulq	96(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[13] * A[13]\n\t"
        "movq	104(%[a]), %%rax\n\t"
        "mulq	%%rax\n\t"
        "addq	%%r10, %%r10\n\t"
        "adcq	%%r11, %%r11\n\t"
        "adcq	%%r12, %%r12\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "addq	%%r10, %%r9\n\t"
        "adcq	%%r11, %%rcx\n\t"
        "adcq	%%r12, %%r8\n\t"
        "movq	%%r9, 208(%[r])\n\t"
        "#  A[4] * A[23]\n\t"
        "movq	184(%[a]), %%rax\n\t"
        "mulq	32(%[a])\n\t"
        "xorq	%%r9, %%r9\n\t"
        "xorq	%%r12, %%r12\n\t"
        "movq	%%rax, %%r10\n\t"
        "movq	%%rdx, %%r11\n\t"
        "#  A[5] * A[22]\n\t"
        "movq	176(%[a]), %%rax\n\t"
        "mulq	40(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[6] * A[21]\n\t"
        "movq	168(%[a]), %%rax\n\t"
        "mulq	48(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[7] * A[20]\n\t"
        "movq	160(%[a]), %%rax\n\t"
        "mulq	56(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[8] * A[19]\n\t"
        "movq	152(%[a]), %%rax\n\t"
        "mulq	64(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[9] * A[18]\n\t"
        "movq	144(%[a]), %%rax\n\t"
        "mulq	72(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[10] * A[17]\n\t"
        "movq	136(%[a]), %%rax\n\t"
        "mulq	80(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[11] * A[16]\n\t"
        "movq	128(%[a]), %%rax\n\t"
        "mulq	88(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[12] * A[15]\n\t"
        "movq	120(%[a]), %%rax\n\t"
        "mulq	96(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[13] * A[14]\n\t"
        "movq	112(%[a]), %%rax\n\t"
        "mulq	104(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "addq	%%r10, %%r10\n\t"
        "adcq	%%r11, %%r11\n\t"
        "adcq	%%r12, %%r12\n\t"
        "addq	%%r10, %%rcx\n\t"
        "adcq	%%r11, %%r8\n\t"
        "adcq	%%r12, %%r9\n\t"
        "movq	%%rcx, 216(%[r])\n\t"
        "#  A[5] * A[23]\n\t"
        "movq	184(%[a]), %%rax\n\t"
        "mulq	40(%[a])\n\t"
        "xorq	%%rcx, %%rcx\n\t"
        "xorq	%%r12, %%r12\n\t"
        "movq	%%rax, %%r10\n\t"
        "movq	%%rdx, %%r11\n\t"
        "#  A[6] * A[22]\n\t"
        "movq	176(%[a]), %%rax\n\t"
        "mulq	48(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[7] * A[21]\n\t"
        "movq	168(%[a]), %%rax\n\t"
        "mulq	56(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[8] * A[20]\n\t"
        "movq	160(%[a]), %%rax\n\t"
        "mulq	64(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[9] * A[19]\n\t"
        "movq	152(%[a]), %%rax\n\t"
        "mulq	72(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[10] * A[18]\n\t"
        "movq	144(%[a]), %%rax\n\t"
        "mulq	80(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[11] * A[17]\n\t"
        "movq	136(%[a]), %%rax\n\t"
        "mulq	88(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[12] * A[16]\n\t"
        "movq	128(%[a]), %%rax\n\t"
        "mulq	96(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[13] * A[15]\n\t"
        "movq	120(%[a]), %%rax\n\t"
        "mulq	104(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[14] * A[14]\n\t"
        "movq	112(%[a]), %%rax\n\t"
        "mulq	%%rax\n\t"
        "addq	%%r10, %%r10\n\t"
        "adcq	%%r11, %%r11\n\t"
        "adcq	%%r12, %%r12\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "addq	%%r10, %%r8\n\t"
        "adcq	%%r11, %%r9\n\t"
        "adcq	%%r12, %%rcx\n\t"
        "movq	%%r8, 224(%[r])\n\t"
        "#  A[6] * A[23]\n\t"
        "movq	184(%[a]), %%rax\n\t"
        "mulq	48(%[a])\n\t"
        "xorq	%%r8, %%r8\n\t"
        "xorq	%%r12, %%r12\n\t"
        "movq	%%rax, %%r10\n\t"
        "movq	%%rdx, %%r11\n\t"
        "#  A[7] * A[22]\n\t"
        "movq	176(%[a]), %%rax\n\t"
        "mulq	56(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[8] * A[21]\n\t"
        "movq	168(%[a]), %%rax\n\t"
        "mulq	64(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[9] * A[20]\n\t"
        "movq	160(%[a]), %%rax\n\t"
        "mulq	72(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[10] * A[19]\n\t"
        "movq	152(%[a]), %%rax\n\t"
        "mulq	80(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[11] * A[18]\n\t"
        "movq	144(%[a]), %%rax\n\t"
        "mulq	88(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[12] * A[17]\n\t"
        "movq	136(%[a]), %%rax\n\t"
        "mulq	96(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[13] * A[16]\n\t"
        "movq	128(%[a]), %%rax\n\t"
        "mulq	104(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[14] * A[15]\n\t"
        "movq	120(%[a]), %%rax\n\t"
        "mulq	112(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "addq	%%r10, %%r10\n\t"
        "adcq	%%r11, %%r11\n\t"
        "adcq	%%r12, %%r12\n\t"
        "addq	%%r10, %%r9\n\t"
        "adcq	%%r11, %%rcx\n\t"
        "adcq	%%r12, %%r8\n\t"
        "movq	%%r9, 232(%[r])\n\t"
        "#  A[7] * A[23]\n\t"
        "movq	184(%[a]), %%rax\n\t"
        "mulq	56(%[a])\n\t"
        "xorq	%%r9, %%r9\n\t"
        "xorq	%%r12, %%r12\n\t"
        "movq	%%rax, %%r10\n\t"
        "movq	%%rdx, %%r11\n\t"
        "#  A[8] * A[22]\n\t"
        "movq	176(%[a]), %%rax\n\t"
        "mulq	64(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[9] * A[21]\n\t"
        "movq	168(%[a]), %%rax\n\t"
        "mulq	72(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[10] * A[20]\n\t"
        "movq	160(%[a]), %%rax\n\t"
        "mulq	80(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[11] * A[19]\n\t"
        "movq	152(%[a]), %%rax\n\t"
        "mulq	88(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[12] * A[18]\n\t"
        "movq	144(%[a]), %%rax\n\t"
        "mulq	96(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[13] * A[17]\n\t"
        "movq	136(%[a]), %%rax\n\t"
        "mulq	104(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[14] * A[16]\n\t"
        "movq	128(%[a]), %%rax\n\t"
        "mulq	112(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[15] * A[15]\n\t"
        "movq	120(%[a]), %%rax\n\t"
        "mulq	%%rax\n\t"
        "addq	%%r10, %%r10\n\t"
        "adcq	%%r11, %%r11\n\t"
        "adcq	%%r12, %%r12\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "addq	%%r10, %%rcx\n\t"
        "adcq	%%r11, %%r8\n\t"
        "adcq	%%r12, %%r9\n\t"
        "movq	%%rcx, 240(%[r])\n\t"
        "#  A[8] * A[23]\n\t"
        "movq	184(%[a]), %%rax\n\t"
        "mulq	64(%[a])\n\t"
        "xorq	%%rcx, %%rcx\n\t"
        "xorq	%%r12, %%r12\n\t"
        "movq	%%rax, %%r10\n\t"
        "movq	%%rdx, %%r11\n\t"
        "#  A[9] * A[22]\n\t"
        "movq	176(%[a]), %%rax\n\t"
        "mulq	72(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[10] * A[21]\n\t"
        "movq	168(%[a]), %%rax\n\t"
        "mulq	80(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[11] * A[20]\n\t"
        "movq	160(%[a]), %%rax\n\t"
        "mulq	88(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[12] * A[19]\n\t"
        "movq	152(%[a]), %%rax\n\t"
        "mulq	96(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[13] * A[18]\n\t"
        "movq	144(%[a]), %%rax\n\t"
        "mulq	104(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[14] * A[17]\n\t"
        "movq	136(%[a]), %%rax\n\t"
        "mulq	112(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[15] * A[16]\n\t"
        "movq	128(%[a]), %%rax\n\t"
        "mulq	120(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "addq	%%r10, %%r10\n\t"
        "adcq	%%r11, %%r11\n\t"
        "adcq	%%r12, %%r12\n\t"
        "addq	%%r10, %%r8\n\t"
        "adcq	%%r11, %%r9\n\t"
        "adcq	%%r12, %%rcx\n\t"
        "movq	%%r8, 248(%[r])\n\t"
        "#  A[9] * A[23]\n\t"
        "movq	184(%[a]), %%rax\n\t"
        "mulq	72(%[a])\n\t"
        "xorq	%%r8, %%r8\n\t"
        "xorq	%%r12, %%r12\n\t"
        "movq	%%rax, %%r10\n\t"
        "movq	%%rdx, %%r11\n\t"
        "#  A[10] * A[22]\n\t"
        "movq	176(%[a]), %%rax\n\t"
        "mulq	80(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[11] * A[21]\n\t"
        "movq	168(%[a]), %%rax\n\t"
        "mulq	88(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[12] * A[20]\n\t"
        "movq	160(%[a]), %%rax\n\t"
        "mulq	96(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[13] * A[19]\n\t"
        "movq	152(%[a]), %%rax\n\t"
        "mulq	104(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[14] * A[18]\n\t"
        "movq	144(%[a]), %%rax\n\t"
        "mulq	112(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[15] * A[17]\n\t"
        "movq	136(%[a]), %%rax\n\t"
        "mulq	120(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[16] * A[16]\n\t"
        "movq	128(%[a]), %%rax\n\t"
        "mulq	%%rax\n\t"
        "addq	%%r10, %%r10\n\t"
        "adcq	%%r11, %%r11\n\t"
        "adcq	%%r12, %%r12\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "addq	%%r10, %%r9\n\t"
        "adcq	%%r11, %%rcx\n\t"
        "adcq	%%r12, %%r8\n\t"
        "movq	%%r9, 256(%[r])\n\t"
        "#  A[10] * A[23]\n\t"
        "movq	184(%[a]), %%rax\n\t"
        "mulq	80(%[a])\n\t"
        "xorq	%%r9, %%r9\n\t"
        "xorq	%%r12, %%r12\n\t"
        "movq	%%rax, %%r10\n\t"
        "movq	%%rdx, %%r11\n\t"
        "#  A[11] * A[22]\n\t"
        "movq	176(%[a]), %%rax\n\t"
        "mulq	88(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[12] * A[21]\n\t"
        "movq	168(%[a]), %%rax\n\t"
        "mulq	96(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[13] * A[20]\n\t"
        "movq	160(%[a]), %%rax\n\t"
        "mulq	104(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[14] * A[19]\n\t"
        "movq	152(%[a]), %%rax\n\t"
        "mulq	112(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[15] * A[18]\n\t"
        "movq	144(%[a]), %%rax\n\t"
        "mulq	120(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[16] * A[17]\n\t"
        "movq	136(%[a]), %%rax\n\t"
        "mulq	128(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "addq	%%r10, %%r10\n\t"
        "adcq	%%r11, %%r11\n\t"
        "adcq	%%r12, %%r12\n\t"
        "addq	%%r10, %%rcx\n\t"
        "adcq	%%r11, %%r8\n\t"
        "adcq	%%r12, %%r9\n\t"
        "movq	%%rcx, 264(%[r])\n\t"
        "#  A[11] * A[23]\n\t"
        "movq	184(%[a]), %%rax\n\t"
        "mulq	88(%[a])\n\t"
        "xorq	%%rcx, %%rcx\n\t"
        "xorq	%%r12, %%r12\n\t"
        "movq	%%rax, %%r10\n\t"
        "movq	%%rdx, %%r11\n\t"
        "#  A[12] * A[22]\n\t"
        "movq	176(%[a]), %%rax\n\t"
        "mulq	96(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[13] * A[21]\n\t"
        "movq	168(%[a]), %%rax\n\t"
        "mulq	104(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[14] * A[20]\n\t"
        "movq	160(%[a]), %%rax\n\t"
        "mulq	112(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[15] * A[19]\n\t"
        "movq	152(%[a]), %%rax\n\t"
        "mulq	120(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[16] * A[18]\n\t"
        "movq	144(%[a]), %%rax\n\t"
        "mulq	128(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[17] * A[17]\n\t"
        "movq	136(%[a]), %%rax\n\t"
        "mulq	%%rax\n\t"
        "addq	%%r10, %%r10\n\t"
        "adcq	%%r11, %%r11\n\t"
        "adcq	%%r12, %%r12\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "addq	%%r10, %%r8\n\t"
        "adcq	%%r11, %%r9\n\t"
        "adcq	%%r12, %%rcx\n\t"
        "movq	%%r8, 272(%[r])\n\t"
        "#  A[12] * A[23]\n\t"
        "movq	184(%[a]), %%rax\n\t"
        "mulq	96(%[a])\n\t"
        "xorq	%%r8, %%r8\n\t"
        "xorq	%%r12, %%r12\n\t"
        "movq	%%rax, %%r10\n\t"
        "movq	%%rdx, %%r11\n\t"
        "#  A[13] * A[22]\n\t"
        "movq	176(%[a]), %%rax\n\t"
        "mulq	104(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[14] * A[21]\n\t"
        "movq	168(%[a]), %%rax\n\t"
        "mulq	112(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[15] * A[20]\n\t"
        "movq	160(%[a]), %%rax\n\t"
        "mulq	120(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[16] * A[19]\n\t"
        "movq	152(%[a]), %%rax\n\t"
        "mulq	128(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[17] * A[18]\n\t"
        "movq	144(%[a]), %%rax\n\t"
        "mulq	136(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "addq	%%r10, %%r10\n\t"
        "adcq	%%r11, %%r11\n\t"
        "adcq	%%r12, %%r12\n\t"
        "addq	%%r10, %%r9\n\t"
        "adcq	%%r11, %%rcx\n\t"
        "adcq	%%r12, %%r8\n\t"
        "movq	%%r9, 280(%[r])\n\t"
        "#  A[13] * A[23]\n\t"
        "movq	184(%[a]), %%rax\n\t"
        "mulq	104(%[a])\n\t"
        "xorq	%%r9, %%r9\n\t"
        "xorq	%%r12, %%r12\n\t"
        "movq	%%rax, %%r10\n\t"
        "movq	%%rdx, %%r11\n\t"
        "#  A[14] * A[22]\n\t"
        "movq	176(%[a]), %%rax\n\t"
        "mulq	112(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[15] * A[21]\n\t"
        "movq	168(%[a]), %%rax\n\t"
        "mulq	120(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[16] * A[20]\n\t"
        "movq	160(%[a]), %%rax\n\t"
        "mulq	128(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[17] * A[19]\n\t"
        "movq	152(%[a]), %%rax\n\t"
        "mulq	136(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[18] * A[18]\n\t"
        "movq	144(%[a]), %%rax\n\t"
        "mulq	%%rax\n\t"
        "addq	%%r10, %%r10\n\t"
        "adcq	%%r11, %%r11\n\t"
        "adcq	%%r12, %%r12\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "addq	%%r10, %%rcx\n\t"
        "adcq	%%r11, %%r8\n\t"
        "adcq	%%r12, %%r9\n\t"
        "movq	%%rcx, 288(%[r])\n\t"
        "#  A[14] * A[23]\n\t"
        "movq	184(%[a]), %%rax\n\t"
        "mulq	112(%[a])\n\t"
        "xorq	%%rcx, %%rcx\n\t"
        "xorq	%%r12, %%r12\n\t"
        "movq	%%rax, %%r10\n\t"
        "movq	%%rdx, %%r11\n\t"
        "#  A[15] * A[22]\n\t"
        "movq	176(%[a]), %%rax\n\t"
        "mulq	120(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[16] * A[21]\n\t"
        "movq	168(%[a]), %%rax\n\t"
        "mulq	128(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[17] * A[20]\n\t"
        "movq	160(%[a]), %%rax\n\t"
        "mulq	136(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[18] * A[19]\n\t"
        "movq	152(%[a]), %%rax\n\t"
        "mulq	144(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "addq	%%r10, %%r10\n\t"
        "adcq	%%r11, %%r11\n\t"
        "adcq	%%r12, %%r12\n\t"
        "addq	%%r10, %%r8\n\t"
        "adcq	%%r11, %%r9\n\t"
        "adcq	%%r12, %%rcx\n\t"
        "movq	%%r8, 296(%[r])\n\t"
        "#  A[15] * A[23]\n\t"
        "movq	184(%[a]), %%rax\n\t"
        "mulq	120(%[a])\n\t"
        "xorq	%%r8, %%r8\n\t"
        "xorq	%%r12, %%r12\n\t"
        "movq	%%rax, %%r10\n\t"
        "movq	%%rdx, %%r11\n\t"
        "#  A[16] * A[22]\n\t"
        "movq	176(%[a]), %%rax\n\t"
        "mulq	128(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[17] * A[21]\n\t"
        "movq	168(%[a]), %%rax\n\t"
        "mulq	136(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[18] * A[20]\n\t"
        "movq	160(%[a]), %%rax\n\t"
        "mulq	144(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[19] * A[19]\n\t"
        "movq	152(%[a]), %%rax\n\t"
        "mulq	%%rax\n\t"
        "addq	%%r10, %%r10\n\t"
        "adcq	%%r11, %%r11\n\t"
        "adcq	%%r12, %%r12\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "addq	%%r10, %%r9\n\t"
        "adcq	%%r11, %%rcx\n\t"
        "adcq	%%r12, %%r8\n\t"
        "movq	%%r9, 304(%[r])\n\t"
        "#  A[16] * A[23]\n\t"
        "movq	184(%[a]), %%rax\n\t"
        "mulq	128(%[a])\n\t"
        "xorq	%%r9, %%r9\n\t"
        "xorq	%%r12, %%r12\n\t"
        "movq	%%rax, %%r10\n\t"
        "movq	%%rdx, %%r11\n\t"
        "#  A[17] * A[22]\n\t"
        "movq	176(%[a]), %%rax\n\t"
        "mulq	136(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[18] * A[21]\n\t"
        "movq	168(%[a]), %%rax\n\t"
        "mulq	144(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[19] * A[20]\n\t"
        "movq	160(%[a]), %%rax\n\t"
        "mulq	152(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "addq	%%r10, %%r10\n\t"
        "adcq	%%r11, %%r11\n\t"
        "adcq	%%r12, %%r12\n\t"
        "addq	%%r10, %%rcx\n\t"
        "adcq	%%r11, %%r8\n\t"
        "adcq	%%r12, %%r9\n\t"
        "movq	%%rcx, 312(%[r])\n\t"
        "#  A[17] * A[23]\n\t"
        "movq	184(%[a]), %%rax\n\t"
        "mulq	136(%[a])\n\t"
        "xorq	%%rcx, %%rcx\n\t"
        "xorq	%%r12, %%r12\n\t"
        "movq	%%rax, %%r10\n\t"
        "movq	%%rdx, %%r11\n\t"
        "#  A[18] * A[22]\n\t"
        "movq	176(%[a]), %%rax\n\t"
        "mulq	144(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[19] * A[21]\n\t"
        "movq	168(%[a]), %%rax\n\t"
        "mulq	152(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[20] * A[20]\n\t"
        "movq	160(%[a]), %%rax\n\t"
        "mulq	%%rax\n\t"
        "addq	%%r10, %%r10\n\t"
        "adcq	%%r11, %%r11\n\t"
        "adcq	%%r12, %%r12\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "addq	%%r10, %%r8\n\t"
        "adcq	%%r11, %%r9\n\t"
        "adcq	%%r12, %%rcx\n\t"
        "movq	%%r8, 320(%[r])\n\t"
        "#  A[18] * A[23]\n\t"
        "movq	184(%[a]), %%rax\n\t"
        "mulq	144(%[a])\n\t"
        "xorq	%%r8, %%r8\n\t"
        "xorq	%%r12, %%r12\n\t"
        "movq	%%rax, %%r10\n\t"
        "movq	%%rdx, %%r11\n\t"
        "#  A[19] * A[22]\n\t"
        "movq	176(%[a]), %%rax\n\t"
        "mulq	152(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[20] * A[21]\n\t"
        "movq	168(%[a]), %%rax\n\t"
        "mulq	160(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "addq	%%r10, %%r10\n\t"
        "adcq	%%r11, %%r11\n\t"
        "adcq	%%r12, %%r12\n\t"
        "addq	%%r10, %%r9\n\t"
        "adcq	%%r11, %%rcx\n\t"
        "adcq	%%r12, %%r8\n\t"
        "movq	%%r9, 328(%[r])\n\t"
        "#  A[19] * A[23]\n\t"
        "movq	184(%[a]), %%rax\n\t"
        "mulq	152(%[a])\n\t"
        "xorq	%%r9, %%r9\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%r9\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%r9\n\t"
        "#  A[20] * A[22]\n\t"
        "movq	176(%[a]), %%rax\n\t"
        "mulq	160(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%r9\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%r9\n\t"
        "#  A[21] * A[21]\n\t"
        "movq	168(%[a]), %%rax\n\t"
        "mulq	%%rax\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%r9\n\t"
        "movq	%%rcx, 336(%[r])\n\t"
        "#  A[20] * A[23]\n\t"
        "movq	184(%[a]), %%rax\n\t"
        "mulq	160(%[a])\n\t"
        "xorq	%%rcx, %%rcx\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%r9\n\t"
        "adcq	$0, %%rcx\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%r9\n\t"
        "adcq	$0, %%rcx\n\t"
        "#  A[21] * A[22]\n\t"
        "movq	176(%[a]), %%rax\n\t"
        "mulq	168(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%r9\n\t"
        "adcq	$0, %%rcx\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%r9\n\t"
        "adcq	$0, %%rcx\n\t"
        "movq	%%r8, 344(%[r])\n\t"
        "#  A[21] * A[23]\n\t"
        "movq	184(%[a]), %%rax\n\t"
        "mulq	168(%[a])\n\t"
        "xorq	%%r8, %%r8\n\t"
        "addq	%%rax, %%r9\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "addq	%%rax, %%r9\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "#  A[22] * A[22]\n\t"
        "movq	176(%[a]), %%rax\n\t"
        "mulq	%%rax\n\t"
        "addq	%%rax, %%r9\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "movq	%%r9, 352(%[r])\n\t"
        "#  A[22] * A[23]\n\t"
        "movq	184(%[a]), %%rax\n\t"
        "mulq	176(%[a])\n\t"
        "xorq	%%r9, %%r9\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%r9\n\t"
        "addq	%%rax, %%rcx\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%r9\n\t"
        "movq	%%rcx, 360(%[r])\n\t"
        "#  A[23] * A[23]\n\t"
        "movq	184(%[a]), %%rax\n\t"
        "mulq	%%rax\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%r9\n\t"
        "movq	%%r8, 368(%[r])\n\t"
        "movq	%%r9, 376(%[r])\n\t"
        :
        : [r] "r" (r), [a] "r" (a), [tmp] "r" (tmp)
        : "memory", "rax", "rdx", "rcx", "r8", "r9", "r10", "r11", "r12"
    );

    XMEMCPY(r, tmp, sizeof(tmp));
}

#ifdef HAVE_INTEL_AVX2
/* Multiply a and b into r. (r = a * b)
 *
 * Implemented with the MULX, ADCX and ADOX instructions.
 *
 * r   Result of multiplication.
 * a   First number to multiply.
 * b   Second number to multiply. Digits 0..23 are read.
 */
SP_NOINLINE static void sp_3072_mul_avx2_24(sp_digit* r, const sp_digit* a,
        const sp_digit* b)
{
    sp_digit tmp[2*24];

    __asm__ __volatile__ (
        "movq	0(%[a]), %%rdx\n\t"
        "xorq	%%r15, %%r15\n\t"
        "# A[0] * B[0]\n\t"
        "mulx	0(%[b]), %%r10, %%r11\n\t"
        "# A[0] * B[1]\n\t"
        "mulx	8(%[b]), %%r8, %%r12\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "# A[0] * B[2]\n\t"
        "mulx	16(%[b]), %%r8, %%r13\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "# A[0] * B[3]\n\t"
        "mulx	24(%[b]), %%r8, %%r14\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "movq	%%r10, 0(%[t])\n\t"
        "movq	%%r11, 8(%[t])\n\t"
        "movq	%%r12, 16(%[t])\n\t"
        "movq	%%r13, 24(%[t])\n\t"
        "# A[0] * B[4]\n\t"
        "mulx	32(%[b]), %%r8, %%rax\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "# A[0] * B[5]\n\t"
        "mulx	40(%[b]), %%r8, %%r10\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "# A[0] * B[6]\n\t"
        "mulx	48(%[b]), %%r8, %%r11\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "# A[0] * B[7]\n\t"
        "mulx	56(%[b]), %%r8, %%r12\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "movq	%%r14, 32(%[t])\n\t"
        "movq	%%rax, 40(%[t])\n\t"
        "movq	%%r10, 48(%[t])\n\t"
        "movq	%%r11, 56(%[t])\n\t"
        "# A[0] * B[8]\n\t"
        "mulx	64(%[b]), %%r8, %%r13\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "# A[0] * B[9]\n\t"
        "mulx	72(%[b]), %%r8, %%r14\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "# A[0] * B[10]\n\t"
        "mulx	80(%[b]), %%r8, %%rax\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "# A[0] * B[11]\n\t"
        "mulx	88(%[b]), %%r8, %%r10\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "movq	%%r12, 64(%[t])\n\t"
        "movq	%%r13, 72(%[t])\n\t"
        "movq	%%r14, 80(%[t])\n\t"
        "movq	%%rax, 88(%[t])\n\t"
        "# A[0] * B[12]\n\t"
        "mulx	96(%[b]), %%r8, %%r11\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "# A[0] * B[13]\n\t"
        "mulx	104(%[b]), %%r8, %%r12\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "# A[0] * B[14]\n\t"
        "mulx	112(%[b]), %%r8, %%r13\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "# A[0] * B[15]\n\t"
        "mulx	120(%[b]), %%r8, %%r14\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "movq	%%r10, 96(%[t])\n\t"
        "movq	%%r11, 104(%[t])\n\t"
        "movq	%%r12, 112(%[t])\n\t"
        "movq	%%r13, 120(%[t])\n\t"
        "# A[0] * B[16]\n\t"
        "mulx	128(%[b]), %%r8, %%rax\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "# A[0] * B[17]\n\t"
        "mulx	136(%[b]), %%r8, %%r10\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "# A[0] * B[18]\n\t"
        "mulx	144(%[b]), %%r8, %%r11\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "# A[0] * B[19]\n\t"
        "mulx	152(%[b]), %%r8, %%r12\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "movq	%%r14, 128(%[t])\n\t"
        "movq	%%rax, 136(%[t])\n\t"
        "movq	%%r10, 144(%[t])\n\t"
        "movq	%%r11, 152(%[t])\n\t"
        "# A[0] * B[20]\n\t"
        "mulx	160(%[b]), %%r8, %%r13\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "# A[0] * B[21]\n\t"
        "mulx	168(%[b]), %%r8, %%r14\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "# A[0] * B[22]\n\t"
        "mulx	176(%[b]), %%r8, %%rax\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "# A[0] * B[23]\n\t"
        "mulx	184(%[b]), %%r8, %%r10\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adcxq	%%r15, %%r10\n\t"
        "movq	%%r15, %%rcx\n\t"
        "adcxq	%%r15, %%rcx\n\t"
        "movq	%%r12, 160(%[t])\n\t"
        "movq	%%r13, 168(%[t])\n\t"
        "movq	%%r14, 176(%[t])\n\t"
        "movq	%%rax, 184(%[t])\n\t"
        "movq	%%r10, 192(%[t])\n\t"
        "movq	8(%[a]), %%rdx\n\t"
        "xorq	%%r15, %%r15\n\t"
        "movq	8(%[t]), %%r11\n\t"
        "movq	16(%[t]), %%r12\n\t"
        "movq	24(%[t]), %%r13\n\t"
        "movq	32(%[t]), %%r14\n\t"
        "movq	40(%[t]), %%rax\n\t"
        "# A[1] * B[0]\n\t"
        "mulx	0(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "adoxq	%%r9, %%r12\n\t"
        "# A[1] * B[1]\n\t"
        "mulx	8(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "# A[1] * B[2]\n\t"
        "mulx	16(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "# A[1] * B[3]\n\t"
        "mulx	24(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "movq	%%r11, 8(%[t])\n\t"
        "movq	%%r12, 16(%[t])\n\t"
        "movq	%%r13, 24(%[t])\n\t"
        "movq	%%r14, 32(%[t])\n\t"
        "movq	48(%[t]), %%r10\n\t"
        "movq	56(%[t]), %%r11\n\t"
        "movq	64(%[t]), %%r12\n\t"
        "movq	72(%[t]), %%r13\n\t"
        "# A[1] * B[4]\n\t"
        "mulx	32(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "# A[1] * B[5]\n\t"
        "mulx	40(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "adoxq	%%r9, %%r11\n\t"
        "# A[1] * B[6]\n\t"
        "mulx	48(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "adoxq	%%r9, %%r12\n\t"
        "# A[1] * B[7]\n\t"
        "mulx	56(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "movq	%%rax, 40(%[t])\n\t"
        "movq	%%r10, 48(%[t])\n\t"
        "movq	%%r11, 56(%[t])\n\t"
        "movq	%%r12, 64(%[t])\n\t"
        "movq	80(%[t]), %%r14\n\t"
        "movq	88(%[t]), %%rax\n\t"
        "movq	96(%[t]), %%r10\n\t"
        "movq	104(%[t]), %%r11\n\t"
        "# A[1] * B[8]\n\t"
        "mulx	64(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "# A[1] * B[9]\n\t"
        "mulx	72(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "# A[1] * B[10]\n\t"
        "mulx	80(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "# A[1] * B[11]\n\t"
        "mulx	88(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "adoxq	%%r9, %%r11\n\t"
        "movq	%%r13, 72(%[t])\n\t"
        "movq	%%r14, 80(%[t])\n\t"
        "movq	%%rax, 88(%[t])\n\t"
        "movq	%%r10, 96(%[t])\n\t"
        "movq	112(%[t]), %%r12\n\t"
        "movq	120(%[t]), %%r13\n\t"
        "movq	128(%[t]), %%r14\n\t"
        "movq	136(%[t]), %%rax\n\t"
        "# A[1] * B[12]\n\t"
        "mulx	96(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "adoxq	%%r9, %%r12\n\t"
        "# A[1] * B[13]\n\t"
        "mulx	104(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "# A[1] * B[14]\n\t"
        "mulx	112(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "# A[1] * B[15]\n\t"
        "mulx	120(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "movq	%%r11, 104(%[t])\n\t"
        "movq	%%r12, 112(%[t])\n\t"
        "movq	%%r13, 120(%[t])\n\t"
        "movq	%%r14, 128(%[t])\n\t"
        "movq	144(%[t]), %%r10\n\t"
        "movq	152(%[t]), %%r11\n\t"
        "movq	160(%[t]), %%r12\n\t"
        "movq	168(%[t]), %%r13\n\t"
        "# A[1] * B[16]\n\t"
        "mulx	128(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "# A[1] * B[17]\n\t"
        "mulx	136(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "adoxq	%%r9, %%r11\n\t"
        "# A[1] * B[18]\n\t"
        "mulx	144(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "adoxq	%%r9, %%r12\n\t"
        "# A[1] * B[19]\n\t"
        "mulx	152(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "movq	%%rax, 136(%[t])\n\t"
        "movq	%%r10, 144(%[t])\n\t"
        "movq	%%r11, 152(%[t])\n\t"
        "movq	%%r12, 160(%[t])\n\t"
        "movq	176(%[t]), %%r14\n\t"
        "movq	184(%[t]), %%rax\n\t"
        "movq	192(%[t]), %%r10\n\t"
        "# A[1] * B[20]\n\t"
        "mulx	160(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "# A[1] * B[21]\n\t"
        "mulx	168(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "# A[1] * B[22]\n\t"
        "mulx	176(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "# A[1] * B[23]\n\t"
        "mulx	184(%[b]), %%r8, %%r9\n\t"
        "movq	%%r15, %%r11\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "adoxq	%%r9, %%r11\n\t"
        "adcxq	%%rcx, %%r11\n\t"
        "movq	%%r15, %%rcx\n\t"
        "adoxq	%%r15, %%rcx\n\t"
        "adcxq	%%r15, %%rcx\n\t"
        "movq	%%r13, 168(%[t])\n\t"
        "movq	%%r14, 176(%[t])\n\t"
        "movq	%%rax, 184(%[t])\n\t"
        "movq	%%r10, 192(%[t])\n\t"
        "movq	%%r11, 200(%[t])\n\t"
        "movq	16(%[a]), %%rdx\n\t"
        "xorq	%%r15, %%r15\n\t"
        "movq	16(%[t]), %%r12\n\t"
        "movq	24(%[t]), %%r13\n\t"
        "movq	32(%[t]), %%r14\n\t"
        "movq	40(%[t]), %%rax\n\t"
        "movq	48(%[t]), %%r10\n\t"
        "# A[2] * B[0]\n\t"
        "mulx	0(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "# A[2] * B[1]\n\t"
        "mulx	8(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "# A[2] * B[2]\n\t"
        "mulx	16(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "# A[2] * B[3]\n\t"
        "mulx	24(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "movq	%%r12, 16(%[t])\n\t"
        "movq	%%r13, 24(%[t])\n\t"
        "movq	%%r14, 32(%[t])\n\t"
        "movq	%%rax, 40(%[t])\n\t"
        "movq	56(%[t]), %%r11\n\t"
        "movq	64(%[t]), %%r12\n\t"
        "movq	72(%[t]), %%r13\n\t"
        "movq	80(%[t]), %%r14\n\t"
        "# A[2] * B[4]\n\t"
        "mulx	32(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "adoxq	%%r9, %%r11\n\t"
        "# A[2] * B[5]\n\t"
        "mulx	40(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "adoxq	%%r9, %%r12\n\t"
        "# A[2] * B[6]\n\t"
        "mulx	48(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "# A[2] * B[7]\n\t"
        "mulx	56(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "movq	%%r10, 48(%[t])\n\t"
        "movq	%%r11, 56(%[t])\n\t"
        "movq	%%r12, 64(%[t])\n\t"
        "movq	%%r13, 72(%[t])\n\t"
        "movq	88(%[t]), %%rax\n\t"
        "movq	96(%[t]), %%r10\n\t"
        "movq	104(%[t]), %%r11\n\t"
        "movq	112(%[t]), %%r12\n\t"
        "# A[2] * B[8]\n\t"
        "mulx	64(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "# A[2] * B[9]\n\t"
        "mulx	72(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "# A[2] * B[10]\n\t"
        "mulx	80(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "adoxq	%%r9, %%r11\n\t"
        "# A[2] * B[11]\n\t"
        "mulx	88(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "adoxq	%%r9, %%r12\n\t"
        "movq	%%r14, 80(%[t])\n\t"
        "movq	%%rax, 88(%[t])\n\t"
        "movq	%%r10, 96(%[t])\n\t"
        "movq	%%r11, 104(%[t])\n\t"
        "movq	120(%[t]), %%r13\n\t"
        "movq	128(%[t]), %%r14\n\t"
        "movq	136(%[t]), %%rax\n\t"
        "movq	144(%[t]), %%r10\n\t"
        "# A[2] * B[12]\n\t"
        "mulx	96(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "# A[2] * B[13]\n\t"
        "mulx	104(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "# A[2] * B[14]\n\t"
        "mulx	112(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "# A[2] * B[15]\n\t"
        "mulx	120(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "movq	%%r12, 112(%[t])\n\t"
        "movq	%%r13, 120(%[t])\n\t"
        "movq	%%r14, 128(%[t])\n\t"
        "movq	%%rax, 136(%[t])\n\t"
        "movq	152(%[t]), %%r11\n\t"
        "movq	160(%[t]), %%r12\n\t"
        "movq	168(%[t]), %%r13\n\t"
        "movq	176(%[t]), %%r14\n\t"
        "# A[2] * B[16]\n\t"
        "mulx	128(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "adoxq	%%r9, %%r11\n\t"
        "# A[2] * B[17]\n\t"
        "mulx	136(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "adoxq	%%r9, %%r12\n\t"
        "# A[2] * B[18]\n\t"
        "mulx	144(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "# A[2] * B[19]\n\t"
        "mulx	152(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "movq	%%r10, 144(%[t])\n\t"
        "movq	%%r11, 152(%[t])\n\t"
        "movq	%%r12, 160(%[t])\n\t"
        "movq	%%r13, 168(%[t])\n\t"
        "movq	184(%[t]), %%rax\n\t"
        "movq	192(%[t]), %%r10\n\t"
        "movq	200(%[t]), %%r11\n\t"
        "# A[2] * B[20]\n\t"
        "mulx	160(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "# A[2] * B[21]\n\t"
        "mulx	168(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "# A[2] * B[22]\n\t"
        "mulx	176(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "adoxq	%%r9, %%r11\n\t"
        "# A[2] * B[23]\n\t"
        "mulx	184(%[b]), %%r8, %%r9\n\t"
        "movq	%%r15, %%r12\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "adoxq	%%r9, %%r12\n\t"
        "adcxq	%%rcx, %%r12\n\t"
        "movq	%%r15, %%rcx\n\t"
        "adoxq	%%r15, %%rcx\n\t"
        "adcxq	%%r15, %%rcx\n\t"
        "movq	%%r14, 176(%[t])\n\t"
        "movq	%%rax, 184(%[t])\n\t"
        "movq	%%r10, 192(%[t])\n\t"
        "movq	%%r11, 200(%[t])\n\t"
        "movq	%%r12, 208(%[t])\n\t"
        "movq	24(%[a]), %%rdx\n\t"
        "xorq	%%r15, %%r15\n\t"
        "movq	24(%[t]), %%r13\n\t"
        "movq	32(%[t]), %%r14\n\t"
        "movq	40(%[t]), %%rax\n\t"
        "movq	48(%[t]), %%r10\n\t"
        "movq	56(%[t]), %%r11\n\t"
        "# A[3] * B[0]\n\t"
        "mulx	0(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "# A[3] * B[1]\n\t"
        "mulx	8(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "# A[3] * B[2]\n\t"
        "mulx	16(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "# A[3] * B[3]\n\t"
        "mulx	24(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "adoxq	%%r9, %%r11\n\t"
        "movq	%%r13, 24(%[t])\n\t"
        "movq	%%r14, 32(%[t])\n\t"
        "movq	%%rax, 40(%[t])\n\t"
        "movq	%%r10, 48(%[t])\n\t"
        "movq	64(%[t]), %%r12\n\t"
        "movq	72(%[t]), %%r13\n\t"
        "movq	80(%[t]), %%r14\n\t"
        "movq	88(%[t]), %%rax\n\t"
        "# A[3] * B[4]\n\t"
        "mulx	32(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "adoxq	%%r9, %%r12\n\t"
        "# A[3] * B[5]\n\t"
        "mulx	40(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "# A[3] * B[6]\n\t"
        "mulx	48(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "# A[3] * B[7]\n\t"
        "mulx	56(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "movq	%%r11, 56(%[t])\n\t"
        "movq	%%r12, 64(%[t])\n\t"
        "movq	%%r13, 72(%[t])\n\t"
        "movq	%%r14, 80(%[t])\n\t"
        "movq	96(%[t]), %%r10\n\t"
        "movq	104(%[t]), %%r11\n\t"
        "movq	112(%[t]), %%r12\n\t"
        "movq	120(%[t]), %%r13\n\t"
        "# A[3] * B[8]\n\t"
        "mulx	64(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "# A[3] * B[9]\n\t"
        "mulx	72(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "adoxq	%%r9, %%r11\n\t"
        "# A[3] * B[10]\n\t"
        "mulx	80(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "adoxq	%%r9, %%r12\n\t"
        "# A[3] * B[11]\n\t"
        "mulx	88(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "movq	%%rax, 88(%[t])\n\t"
        "movq	%%r10, 96(%[t])\n\t"
        "movq	%%r11, 104(%[t])\n\t"
        "movq	%%r12, 112(%[t])\n\t"
        "movq	128(%[t]), %%r14\n\t"
        "movq	136(%[t]), %%rax\n\t"
        "movq	144(%[t]), %%r10\n\t"
        "movq	152(%[t]), %%r11\n\t"
        "# A[3] * B[12]\n\t"
        "mulx	96(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "# A[3] * B[13]\n\t"
        "mulx	104(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "# A[3] * B[14]\n\t"
        "mulx	112(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "# A[3] * B[15]\n\t"
        "mulx	120(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "adoxq	%%r9, %%r11\n\t"
        "movq	%%r13, 120(%[t])\n\t"
        "movq	%%r14, 128(%[t])\n\t"
        "movq	%%rax, 136(%[t])\n\t"
        "movq	%%r10, 144(%[t])\n\t"
        "movq	160(%[t]), %%r12\n\t"
        "movq	168(%[t]), %%r13\n\t"
        "movq	176(%[t]), %%r14\n\t"
        "movq	184(%[t]), %%rax\n\t"
        "# A[3] * B[16]\n\t"
        "mulx	128(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "adoxq	%%r9, %%r12\n\t"
        "# A[3] * B[17]\n\t"
        "mulx	136(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "# A[3] * B[18]\n\t"
        "mulx	144(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "# A[3] * B[19]\n\t"
        "mulx	152(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "movq	%%r11, 152(%[t])\n\t"
        "movq	%%r12, 160(%[t])\n\t"
        "movq	%%r13, 168(%[t])\n\t"
        "movq	%%r14, 176(%[t])\n\t"
        "movq	192(%[t]), %%r10\n\t"
        "movq	200(%[t]), %%r11\n\t"
        "movq	208(%[t]), %%r12\n\t"
        "# A[3] * B[20]\n\t"
        "mulx	160(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "# A[3] * B[21]\n\t"
        "mulx	168(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "adoxq	%%r9, %%r11\n\t"
        "# A[3] * B[22]\n\t"
        "mulx	176(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "adoxq	%%r9, %%r12\n\t"
        "# A[3] * B[23]\n\t"
        "mulx	184(%[b]), %%r8, %%r9\n\t"
        "movq	%%r15, %%r13\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "adcxq	%%rcx, %%r13\n\t"
        "movq	%%r15, %%rcx\n\t"
        "adoxq	%%r15, %%rcx\n\t"
        "adcxq	%%r15, %%rcx\n\t"
        "movq	%%rax, 184(%[t])\n\t"
        "movq	%%r10, 192(%[t])\n\t"
        "movq	%%r11, 200(%[t])\n\t"
        "movq	%%r12, 208(%[t])\n\t"
        "movq	%%r13, 216(%[t])\n\t"
        "movq	32(%[a]), %%rdx\n\t"
        "xorq	%%r15, %%r15\n\t"
        "movq	32(%[t]), %%r14\n\t"
        "movq	40(%[t]), %%rax\n\t"
        "movq	48(%[t]), %%r10\n\t"
        "movq	56(%[t]), %%r11\n\t"
        "movq	64(%[t]), %%r12\n\t"
        "# A[4] * B[0]\n\t"
        "mulx	0(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "# A[4] * B[1]\n\t"
        "mulx	8(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "# A[4] * B[2]\n\t"
        "mulx	16(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "adoxq	%%r9, %%r11\n\t"
        "# A[4] * B[3]\n\t"
        "mulx	24(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "adoxq	%%r9, %%r12\n\t"
        "movq	%%r14, 32(%[t])\n\t"
        "movq	%%rax, 40(%[t])\n\t"
        "movq	%%r10, 48(%[t])\n\t"
        "movq	%%r11, 56(%[t])\n\t"
        "movq	72(%[t]), %%r13\n\t"
        "movq	80(%[t]), %%r14\n\t"
        "movq	88(%[t]), %%rax\n\t"
        "movq	96(%[t]), %%r10\n\t"
        "# A[4] * B[4]\n\t"
        "mulx	32(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "# A[4] * B[5]\n\t"
        "mulx	40(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "# A[4] * B[6]\n\t"
        "mulx	48(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "# A[4] * B[7]\n\t"
        "mulx	56(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "movq	%%r12, 64(%[t])\n\t"
        "movq	%%r13, 72(%[t])\n\t"
        "movq	%%r14, 80(%[t])\n\t"
        "movq	%%rax, 88(%[t])\n\t"
        "movq	104(%[t]), %%r11\n\t"
        "movq	112(%[t]), %%r12\n\t"
        "movq	120(%[t]), %%r13\n\t"
        "movq	128(%[t]), %%r14\n\t"
        "# A[4] * B[8]\n\t"
        "mulx	64(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "adoxq	%%r9, %%r11\n\t"
        "# A[4] * B[9]\n\t"
        "mulx	72(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "adoxq	%%r9, %%r12\n\t"
        "# A[4] * B[10]\n\t"
        "mulx	80(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "# A[4] * B[11]\n\t"
        "mulx	88(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "movq	%%r10, 96(%[t])\n\t"
        "movq	%%r11, 104(%[t])\n\t"
        "movq	%%r12, 112(%[t])\n\t"
        "movq	%%r13, 120(%[t])\n\t"
        "movq	136(%[t]), %%rax\n\t"
        "movq	144(%[t]), %%r10\n\t"
        "movq	152(%[t]), %%r11\n\t"
        "movq	160(%[t]), %%r12\n\t"
        "# A[4] * B[12]\n\t"
        "mulx	96(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "# A[4] * B[13]\n\t"
        "mulx	104(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "# A[4] * B[14]\n\t"
        "mulx	112(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "adoxq	%%r9, %%r11\n\t"
        "# A[4] * B[15]\n\t"
        "mulx	120(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "adoxq	%%r9, %%r12\n\t"
        "movq	%%r14, 128(%[t])\n\t"
        "movq	%%rax, 136(%[t])\n\t"
        "movq	%%r10, 144(%[t])\n\t"
        "movq	%%r11, 152(%[t])\n\t"
        "movq	168(%[t]), %%r13\n\t"
        "movq	176(%[t]), %%r14\n\t"
        "movq	184(%[t]), %%rax\n\t"
        "movq	192(%[t]), %%r10\n\t"
        "# A[4] * B[16]\n\t"
        "mulx	128(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "# A[4] * B[17]\n\t"
        "mulx	136(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "# A[4] * B[18]\n\t"
        "mulx	144(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "# A[4] * B[19]\n\t"
        "mulx	152(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "movq	%%r12, 160(%[t])\n\t"
        "movq	%%r13, 168(%[t])\n\t"
        "movq	%%r14, 176(%[t])\n\t"
        "movq	%%rax, 184(%[t])\n\t"
        "movq	200(%[t]), %%r11\n\t"
        "movq	208(%[t]), %%r12\n\t"
        "movq	216(%[t]), %%r13\n\t"
        "# A[4] * B[20]\n\t"
        "mulx	160(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "adoxq	%%r9, %%r11\n\t"
        "# A[4] * B[21]\n\t"
        "mulx	168(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "adoxq	%%r9, %%r12\n\t"
        "# A[4] * B[22]\n\t"
        "mulx	176(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "# A[4] * B[23]\n\t"
        "mulx	184(%[b]), %%r8, %%r9\n\t"
        "movq	%%r15, %%r14\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "adcxq	%%rcx, %%r14\n\t"
        "movq	%%r15, %%rcx\n\t"
        "adoxq	%%r15, %%rcx\n\t"
        "adcxq	%%r15, %%rcx\n\t"
        "movq	%%r10, 192(%[t])\n\t"
        "movq	%%r11, 200(%[t])\n\t"
        "movq	%%r12, 208(%[t])\n\t"
        "movq	%%r13, 216(%[t])\n\t"
        "movq	%%r14, 224(%[t])\n\t"
        "movq	40(%[a]), %%rdx\n\t"
        "xorq	%%r15, %%r15\n\t"
        "movq	40(%[t]), %%rax\n\t"
        "movq	48(%[t]), %%r10\n\t"
        "movq	56(%[t]), %%r11\n\t"
        "movq	64(%[t]), %%r12\n\t"
        "movq	72(%[t]), %%r13\n\t"
        "# A[5] * B[0]\n\t"
        "mulx	0(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "# A[5] * B[1]\n\t"
        "mulx	8(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "adoxq	%%r9, %%r11\n\t"
        "# A[5] * B[2]\n\t"
        "mulx	16(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "adoxq	%%r9, %%r12\n\t"
        "# A[5] * B[3]\n\t"
        "mulx	24(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "movq	%%rax, 40(%[t])\n\t"
        "movq	%%r10, 48(%[t])\n\t"
        "movq	%%r11, 56(%[t])\n\t"
        "movq	%%r12, 64(%[t])\n\t"
        "movq	80(%[t]), %%r14\n\t"
        "movq	88(%[t]), %%rax\n\t"
        "movq	96(%[t]), %%r10\n\t"
        "movq	104(%[t]), %%r11\n\t"
        "# A[5] * B[4]\n\t"
        "mulx	32(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "# A[5] * B[5]\n\t"
        "mulx	40(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "# A[5] * B[6]\n\t"
        "mulx	48(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "# A[5] * B[7]\n\t"
        "mulx	56(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "adoxq	%%r9, %%r11\n\t"
        "movq	%%r13, 72(%[t])\n\t"
        "movq	%%r14, 80(%[t])\n\t"
        "movq	%%rax, 88(%[t])\n\t"
        "movq	%%r10, 96(%[t])\n\t"
        "movq	112(%[t]), %%r12\n\t"
        "movq	120(%[t]), %%r13\n\t"
        "movq	128(%[t]), %%r14\n\t"
        "movq	136(%[t]), %%rax\n\t"
        "# A[5] * B[8]\n\t"
        "mulx	64(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "adoxq	%%r9, %%r12\n\t"
        "# A[5] * B[9]\n\t"
        "mulx	72(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "# A[5] * B[10]\n\t"
        "mulx	80(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "# A[5] * B[11]\n\t"
        "mulx	88(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "movq	%%r11, 104(%[t])\n\t"
        "movq	%%r12, 112(%[t])\n\t"
        "movq	%%r13, 120(%[t])\n\t"
        "movq	%%r14, 128(%[t])\n\t"
        "movq	144(%[t]), %%r10\n\t"
        "movq	152(%[t]), %%r11\n\t"
        "movq	160(%[t]), %%r12\n\t"
        "movq	168(%[t]), %%r13\n\t"
        "# A[5] * B[12]\n\t"
        "mulx	96(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "# A[5] * B[13]\n\t"
        "mulx	104(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "adoxq	%%r9, %%r11\n\t"
        "# A[5] * B[14]\n\t"
        "mulx	112(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "adoxq	%%r9, %%r12\n\t"
        "# A[5] * B[15]\n\t"
        "mulx	120(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "movq	%%rax, 136(%[t])\n\t"
        "movq	%%r10, 144(%[t])\n\t"
        "movq	%%r11, 152(%[t])\n\t"
        "movq	%%r12, 160(%[t])\n\t"
        "movq	176(%[t]), %%r14\n\t"
        "movq	184(%[t]), %%rax\n\t"
        "movq	192(%[t]), %%r10\n\t"
        "movq	200(%[t]), %%r11\n\t"
        "# A[5] * B[16]\n\t"
        "mulx	128(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "# A[5] * B[17]\n\t"
        "mulx	136(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "# A[5] * B[18]\n\t"
        "mulx	144(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "# A[5] * B[19]\n\t"
        "mulx	152(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "adoxq	%%r9, %%r11\n\t"
        "movq	%%r13, 168(%[t])\n\t"
        "movq	%%r14, 176(%[t])\n\t"
        "movq	%%rax, 184(%[t])\n\t"
        "movq	%%r10, 192(%[t])\n\t"
        "movq	208(%[t]), %%r12\n\t"
        "movq	216(%[t]), %%r13\n\t"
        "movq	224(%[t]), %%r14\n\t"
        "# A[5] * B[20]\n\t"
        "mulx	160(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "adoxq	%%r9, %%r12\n\t"
        "# A[5] * B[21]\n\t"
        "mulx	168(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "# A[5] * B[22]\n\t"
        "mulx	176(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "# A[5] * B[23]\n\t"
        "mulx	184(%[b]), %%r8, %%r9\n\t"
        "movq	%%r15, %%rax\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "adcxq	%%rcx, %%rax\n\t"
        "movq	%%r15, %%rcx\n\t"
        "adoxq	%%r15, %%rcx\n\t"
        "adcxq	%%r15, %%rcx\n\t"
        "movq	%%r11, 200(%[t])\n\t"
        "movq	%%r12, 208(%[t])\n\t"
        "movq	%%r13, 216(%[t])\n\t"
        "movq	%%r14, 224(%[t])\n\t"
        "movq	%%rax, 232(%[t])\n\t"
        "movq	48(%[a]), %%rdx\n\t"
        "xorq	%%r15, %%r15\n\t"
        "movq	48(%[t]), %%r10\n\t"
        "movq	56(%[t]), %%r11\n\t"
        "movq	64(%[t]), %%r12\n\t"
        "movq	72(%[t]), %%r13\n\t"
        "movq	80(%[t]), %%r14\n\t"
        "# A[6] * B[0]\n\t"
        "mulx	0(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "adoxq	%%r9, %%r11\n\t"
        "# A[6] * B[1]\n\t"
        "mulx	8(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "adoxq	%%r9, %%r12\n\t"
        "# A[6] * B[2]\n\t"
        "mulx	16(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "# A[6] * B[3]\n\t"
        "mulx	24(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "movq	%%r10, 48(%[t])\n\t"
        "movq	%%r11, 56(%[t])\n\t"
        "movq	%%r12, 64(%[t])\n\t"
        "movq	%%r13, 72(%[t])\n\t"
        "movq	88(%[t]), %%rax\n\t"
        "movq	96(%[t]), %%r10\n\t"
        "movq	104(%[t]), %%r11\n\t"
        "movq	112(%[t]), %%r12\n\t"
        "# A[6] * B[4]\n\t"
        "mulx	32(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "# A[6] * B[5]\n\t"
        "mulx	40(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "# A[6] * B[6]\n\t"
        "mulx	48(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "adoxq	%%r9, %%r11\n\t"
        "# A[6] * B[7]\n\t"
        "mulx	56(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "adoxq	%%r9, %%r12\n\t"
        "movq	%%r14, 80(%[t])\n\t"
        "movq	%%rax, 88(%[t])\n\t"
        "movq	%%r10, 96(%[t])\n\t"
        "movq	%%r11, 104(%[t])\n\t"
        "movq	120(%[t]), %%r13\n\t"
        "movq	128(%[t]), %%r14\n\t"
        "movq	136(%[t]), %%rax\n\t"
        "movq	144(%[t]), %%r10\n\t"
        "# A[6] * B[8]\n\t"
        "mulx	64(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "# A[6] * B[9]\n\t"
        "mulx	72(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "# A[6] * B[10]\n\t"
        "mulx	80(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "# A[6] * B[11]\n\t"
        "mulx	88(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "movq	%%r12, 112(%[t])\n\t"
        "movq	%%r13, 120(%[t])\n\t"
        "movq	%%r14, 128(%[t])\n\t"
        "movq	%%rax, 136(%[t])\n\t"
        "movq	152(%[t]), %%r11\n\t"
        "movq	160(%[t]), %%r12\n\t"
        "movq	168(%[t]), %%r13\n\t"
        "movq	176(%[t]), %%r14\n\t"
        "# A[6] * B[12]\n\t"
        "mulx	96(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "adoxq	%%r9, %%r11\n\t"
        "# A[6] * B[13]\n\t"
        "mulx	104(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "adoxq	%%r9, %%r12\n\t"
        "# A[6] * B[14]\n\t"
        "mulx	112(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "# A[6] * B[15]\n\t"
        "mulx	120(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "movq	%%r10, 144(%[t])\n\t"
        "movq	%%r11, 152(%[t])\n\t"
        "movq	%%r12, 160(%[t])\n\t"
        "movq	%%r13, 168(%[t])\n\t"
        "movq	184(%[t]), %%rax\n\t"
        "movq	192(%[t]), %%r10\n\t"
        "movq	200(%[t]), %%r11\n\t"
        "movq	208(%[t]), %%r12\n\t"
        "# A[6] * B[16]\n\t"
        "mulx	128(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "# A[6] * B[17]\n\t"
        "mulx	136(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "# A[6] * B[18]\n\t"
        "mulx	144(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "adoxq	%%r9, %%r11\n\t"
        "# A[6] * B[19]\n\t"
        "mulx	152(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "adoxq	%%r9, %%r12\n\t"
        "movq	%%r14, 176(%[t])\n\t"
        "movq	%%rax, 184(%[t])\n\t"
        "movq	%%r10, 192(%[t])\n\t"
        "movq	%%r11, 200(%[t])\n\t"
        "movq	216(%[t]), %%r13\n\t"
        "movq	224(%[t]), %%r14\n\t"
        "movq	232(%[t]), %%rax\n\t"
        "# A[6] * B[20]\n\t"
        "mulx	160(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "# A[6] * B[21]\n\t"
        "mulx	168(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "# A[6] * B[22]\n\t"
        "mulx	176(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "# A[6] * B[23]\n\t"
        "mulx	184(%[b]), %%r8, %%r9\n\t"
        "movq	%%r15, %%r10\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "adcxq	%%rcx, %%r10\n\t"
        "movq	%%r15, %%rcx\n\t"
        "adoxq	%%r15, %%rcx\n\t"
        "adcxq	%%r15, %%rcx\n\t"
        "movq	%%r12, 208(%[t])\n\t"
        "movq	%%r13, 216(%[t])\n\t"
        "movq	%%r14, 224(%[t])\n\t"
        "movq	%%rax, 232(%[t])\n\t"
        "movq	%%r10, 240(%[t])\n\t"
        "movq	56(%[a]), %%rdx\n\t"
        "xorq	%%r15, %%r15\n\t"
        "movq	56(%[t]), %%r11\n\t"
        "movq	64(%[t]), %%r12\n\t"
        "movq	72(%[t]), %%r13\n\t"
        "movq	80(%[t]), %%r14\n\t"
        "movq	88(%[t]), %%rax\n\t"
        "# A[7] * B[0]\n\t"
        "mulx	0(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "adoxq	%%r9, %%r12\n\t"
        "# A[7] * B[1]\n\t"
        "mulx	8(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "# A[7] * B[2]\n\t"
        "mulx	16(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "# A[7] * B[3]\n\t"
        "mulx	24(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "movq	%%r11, 56(%[t])\n\t"
        "movq	%%r12, 64(%[t])\n\t"
        "movq	%%r13, 72(%[t])\n\t"
        "movq	%%r14, 80(%[t])\n\t"
        "movq	96(%[t]), %%r10\n\t"
        "movq	104(%[t]), %%r11\n\t"
        "movq	112(%[t]), %%r12\n\t"
        "movq	120(%[t]), %%r13\n\t"
        "# A[7] * B[4]\n\t"
        "mulx	32(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "# A[7] * B[5]\n\t"
        "mulx	40(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "adoxq	%%r9, %%r11\n\t"
        "# A[7] * B[6]\n\t"
        "mulx	48(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "adoxq	%%r9, %%r12\n\t"
        "# A[7] * B[7]\n\t"
        "mulx	56(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "movq	%%rax, 88(%[t])\n\t"
        "movq	%%r10, 96(%[t])\n\t"
        "movq	%%r11, 104(%[t])\n\t"
        "movq	%%r12, 112(%[t])\n\t"
        "movq	128(%[t]), %%r14\n\t"
        "movq	136(%[t]), %%rax\n\t"
        "movq	144(%[t]), %%r10\n\t"
        "movq	152(%[t]), %%r11\n\t"
        "# A[7] * B[8]\n\t"
        "mulx	64(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "# A[7] * B[9]\n\t"
        "mulx	72(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "# A[7] * B[10]\n\t"
        "mulx	80(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "# A[7] * B[11]\n\t"
        "mulx	88(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "adoxq	%%r9, %%r11\n\t"
        "movq	%%r13, 120(%[t])\n\t"
        "movq	%%r14, 128(%[t])\n\t"
        "movq	%%rax, 136(%[t])\n\t"
        "movq	%%r10, 144(%[t])\n\t"
        "movq	160(%[t]), %%r12\n\t"
        "movq	168(%[t]), %%r13\n\t"
        "movq	176(%[t]), %%r14\n\t"
        "movq	184(%[t]), %%rax\n\t"
        "# A[7] * B[12]\n\t"
        "mulx	96(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "adoxq	%%r9, %%r12\n\t"
        "# A[7] * B[13]\n\t"
        "mulx	104(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "# A[7] * B[14]\n\t"
        "mulx	112(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "# A[7] * B[15]\n\t"
        "mulx	120(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "movq	%%r11, 152(%[t])\n\t"
        "movq	%%r12, 160(%[t])\n\t"
        "movq	%%r13, 168(%[t])\n\t"
        "movq	%%r14, 176(%[t])\n\t"
        "movq	192(%[t]), %%r10\n\t"
        "movq	200(%[t]), %%r11\n\t"
        "movq	208(%[t]), %%r12\n\t"
        "movq	216(%[t]), %%r13\n\t"
        "# A[7] * B[16]\n\t"
        "mulx	128(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "# A[7] * B[17]\n\t"
        "mulx	136(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "adoxq	%%r9, %%r11\n\t"
        "# A[7] * B[18]\n\t"
        "mulx	144(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "adoxq	%%r9, %%r12\n\t"
        "# A[7] * B[19]\n\t"
        "mulx	152(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "movq	%%rax, 184(%[t])\n\t"
        "movq	%%r10, 192(%[t])\n\t"
        "movq	%%r11, 200(%[t])\n\t"
        "movq	%%r12, 208(%[t])\n\t"
        "movq	224(%[t]), %%r14\n\t"
        "movq	232(%[t]), %%rax\n\t"
        "movq	240(%[t]), %%r10\n\t"
        "# A[7] * B[20]\n\t"
        "mulx	160(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "# A[7] * B[21]\n\t"
        "mulx	168(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "# A[7] * B[22]\n\t"
        "mulx	176(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "# A[7] * B[23]\n\t"
        "mulx	184(%[b]), %%r8, %%r9\n\t"
        "movq	%%r15, %%r11\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "adoxq	%%r9, %%r11\n\t"
        "adcxq	%%rcx, %%r11\n\t"
        "movq	%%r15, %%rcx\n\t"
        "adoxq	%%r15, %%rcx\n\t"
        "adcxq	%%r15, %%rcx\n\t"
        "movq	%%r13, 216(%[t])\n\t"
        "movq	%%r14, 224(%[t])\n\t"
        "movq	%%rax, 232(%[t])\n\t"
        "movq	%%r10, 240(%[t])\n\t"
        "movq	%%r11, 248(%[t])\n\t"
        "movq	64(%[a]), %%rdx\n\t"
        "xorq	%%r15, %%r15\n\t"
        "movq	64(%[t]), %%r12\n\t"
        "movq	72(%[t]), %%r13\n\t"
        "movq	80(%[t]), %%r14\n\t"
        "movq	88(%[t]), %%rax\n\t"
        "movq	96(%[t]), %%r10\n\t"
        "# A[8] * B[0]\n\t"
        "mulx	0(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "# A[8] * B[1]\n\t"
        "mulx	8(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "# A[8] * B[2]\n\t"
        "mulx	16(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "# A[8] * B[3]\n\t"
        "mulx	24(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "movq	%%r12, 64(%[t])\n\t"
        "movq	%%r13, 72(%[t])\n\t"
        "movq	%%r14, 80(%[t])\n\t"
        "movq	%%rax, 88(%[t])\n\t"
        "movq	104(%[t]), %%r11\n\t"
        "movq	112(%[t]), %%r12\n\t"
        "movq	120(%[t]), %%r13\n\t"
        "movq	128(%[t]), %%r14\n\t"
        "# A[8] * B[4]\n\t"
        "mulx	32(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "adoxq	%%r9, %%r11\n\t"
        "# A[8] * B[5]\n\t"
        "mulx	40(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "adoxq	%%r9, %%r12\n\t"
        "# A[8] * B[6]\n\t"
        "mulx	48(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "# A[8] * B[7]\n\t"
        "mulx	56(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "movq	%%r10, 96(%[t])\n\t"
        "movq	%%r11, 104(%[t])\n\t"
        "movq	%%r12, 112(%[t])\n\t"
        "movq	%%r13, 120(%[t])\n\t"
        "movq	136(%[t]), %%rax\n\t"
        "movq	144(%[t]), %%r10\n\t"
        "movq	152(%[t]), %%r11\n\t"
        "movq	160(%[t]), %%r12\n\t"
        "# A[8] * B[8]\n\t"
        "mulx	64(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "# A[8] * B[9]\n\t"
        "mulx	72(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "# A[8] * B[10]\n\t"
        "mulx	80(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "adoxq	%%r9, %%r11\n\t"
        "# A[8] * B[11]\n\t"
        "mulx	88(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "adoxq	%%r9, %%r12\n\t"
        "movq	%%r14, 128(%[t])\n\t"
        "movq	%%rax, 136(%[t])\n\t"
        "movq	%%r10, 144(%[t])\n\t"
        "movq	%%r11, 152(%[t])\n\t"
        "movq	168(%[t]), %%r13\n\t"
        "movq	176(%[t]), %%r14\n\t"
        "movq	184(%[t]), %%rax\n\t"
        "movq	192(%[t]), %%r10\n\t"
        "# A[8] * B[12]\n\t"
        "mulx	96(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "# A[8] * B[13]\n\t"
        "mulx	104(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "# A[8] * B[14]\n\t"
        "mulx	112(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "# A[8] * B[15]\n\t"
        "mulx	120(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "movq	%%r12, 160(%[t])\n\t"
        "movq	%%r13, 168(%[t])\n\t"
        "movq	%%r14, 176(%[t])\n\t"
        "movq	%%rax, 184(%[t])\n\t"
        "movq	200(%[t]), %%r11\n\t"
        "movq	208(%[t]), %%r12\n\t"
        "movq	216(%[t]), %%r13\n\t"
        "movq	224(%[t]), %%r14\n\t"
        "# A[8] * B[16]\n\t"
        "mulx	128(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "adoxq	%%r9, %%r11\n\t"
        "# A[8] * B[17]\n\t"
        "mulx	136(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "adoxq	%%r9, %%r12\n\t"
        "# A[8] * B[18]\n\t"
        "mulx	144(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "# A[8] * B[19]\n\t"
        "mulx	152(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "movq	%%r10, 192(%[t])\n\t"
        "movq	%%r11, 200(%[t])\n\t"
        "movq	%%r12, 208(%[t])\n\t"
        "movq	%%r13, 216(%[t])\n\t"
        "movq	232(%[t]), %%rax\n\t"
        "movq	240(%[t]), %%r10\n\t"
        "movq	248(%[t]), %%r11\n\t"
        "# A[8] * B[20]\n\t"
        "mulx	160(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "# A[8] * B[21]\n\t"
        "mulx	168(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "# A[8] * B[22]\n\t"
        "mulx	176(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "adoxq	%%r9, %%r11\n\t"
        "# A[8] * B[23]\n\t"
        "mulx	184(%[b]), %%r8, %%r9\n\t"
        "movq	%%r15, %%r12\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "adoxq	%%r9, %%r12\n\t"
        "adcxq	%%rcx, %%r12\n\t"
        "movq	%%r15, %%rcx\n\t"
        "adoxq	%%r15, %%rcx\n\t"
        "adcxq	%%r15, %%rcx\n\t"
        "movq	%%r14, 224(%[t])\n\t"
        "movq	%%rax, 232(%[t])\n\t"
        "movq	%%r10, 240(%[t])\n\t"
        "movq	%%r11, 248(%[t])\n\t"
        "movq	%%r12, 256(%[t])\n\t"
        "movq	72(%[a]), %%rdx\n\t"
        "xorq	%%r15, %%r15\n\t"
        "movq	72(%[t]), %%r13\n\t"
        "movq	80(%[t]), %%r14\n\t"
        "movq	88(%[t]), %%rax\n\t"
        "movq	96(%[t]), %%r10\n\t"
        "movq	104(%[t]), %%r11\n\t"
        "# A[9] * B[0]\n\t"
        "mulx	0(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "# A[9] * B[1]\n\t"
        "mulx	8(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "# A[9] * B[2]\n\t"
        "mulx	16(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "# A[9] * B[3]\n\t"
        "mulx	24(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "adoxq	%%r9, %%r11\n\t"
        "movq	%%r13, 72(%[t])\n\t"
        "movq	%%r14, 80(%[t])\n\t"
        "movq	%%rax, 88(%[t])\n\t"
        "movq	%%r10, 96(%[t])\n\t"
        "movq	112(%[t]), %%r12\n\t"
        "movq	120(%[t]), %%r13\n\t"
        "movq	128(%[t]), %%r14\n\t"
        "movq	136(%[t]), %%rax\n\t"
        "# A[9] * B[4]\n\t"
        "mulx	32(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "adoxq	%%r9, %%r12\n\t"
        "# A[9] * B[5]\n\t"
        "mulx	40(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "# A[9] * B[6]\n\t"
        "mulx	48(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "# A[9] * B[7]\n\t"
        "mulx	56(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "movq	%%r11, 104(%[t])\n\t"
        "movq	%%r12, 112(%[t])\n\t"
        "movq	%%r13, 120(%[t])\n\t"
        "movq	%%r14, 128(%[t])\n\t"
        "movq	144(%[t]), %%r10\n\t"
        "movq	152(%[t]), %%r11\n\t"
        "movq	160(%[t]), %%r12\n\t"
        "movq	168(%[t]), %%r13\n\t"
        "# A[9] * B[8]\n\t"
        "mulx	64(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "# A[9] * B[9]\n\t"
        "mulx	72(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "adoxq	%%r9, %%r11\n\t"
        "# A[9] * B[10]\n\t"
        "mulx	80(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "adoxq	%%r9, %%r12\n\t"
        "# A[9] * B[11]\n\t"
        "mulx	88(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "movq	%%rax, 136(%[t])\n\t"
        "movq	%%r10, 144(%[t])\n\t"
        "movq	%%r11, 152(%[t])\n\t"
        "movq	%%r12, 160(%[t])\n\t"
        "movq	176(%[t]), %%r14\n\t"
        "movq	184(%[t]), %%rax\n\t"
        "movq	192(%[t]), %%r10\n\t"
        "movq	200(%[t]), %%r11\n\t"
        "# A[9] * B[12]\n\t"
        "mulx	96(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "# A[9] * B[13]\n\t"
        "mulx	104(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "# A[9] * B[14]\n\t"
        "mulx	112(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "# A[9] * B[15]\n\t"
        "mulx	120(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "adoxq	%%r9, %%r11\n\t"
        "movq	%%r13, 168(%[t])\n\t"
        "movq	%%r14, 176(%[t])\n\t"
        "movq	%%rax, 184(%[t])\n\t"
        "movq	%%r10, 192(%[t])\n\t"
        "movq	208(%[t]), %%r12\n\t"
        "movq	216(%[t]), %%r13\n\t"
        "movq	224(%[t]), %%r14\n\t"
        "movq	232(%[t]), %%rax\n\t"
        "# A[9] * B[16]\n\t"
        "mulx	128(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "adoxq	%%r9, %%r12\n\t"
        "# A[9] * B[17]\n\t"
        "mulx	136(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "# A[9] * B[18]\n\t"
        "mulx	144(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "# A[9] * B[19]\n\t"
        "mulx	152(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "movq	%%r11, 200(%[t])\n\t"
        "movq	%%r12, 208(%[t])\n\t"
        "movq	%%r13, 216(%[t])\n\t"
        "movq	%%r14, 224(%[t])\n\t"
        "movq	240(%[t]), %%r10\n\t"
        "movq	248(%[t]), %%r11\n\t"
        "movq	256(%[t]), %%r12\n\t"
        "# A[9] * B[20]\n\t"
        "mulx	160(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "# A[9] * B[21]\n\t"
        "mulx	168(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "adoxq	%%r9, %%r11\n\t"
        "# A[9] * B[22]\n\t"
        "mulx	176(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "adoxq	%%r9, %%r12\n\t"
        "# A[9] * B[23]\n\t"
        "mulx	184(%[b]), %%r8, %%r9\n\t"
        "movq	%%r15, %%r13\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "adcxq	%%rcx, %%r13\n\t"
        "movq	%%r15, %%rcx\n\t"
        "adoxq	%%r15, %%rcx\n\t"
        "adcxq	%%r15, %%rcx\n\t"
        "movq	%%rax, 232(%[t])\n\t"
        "movq	%%r10, 240(%[t])\n\t"
        "movq	%%r11, 248(%[t])\n\t"
        "movq	%%r12, 256(%[t])\n\t"
        "movq	%%r13, 264(%[t])\n\t"
        "movq	80(%[a]), %%rdx\n\t"
        "xorq	%%r15, %%r15\n\t"
        "movq	80(%[t]), %%r14\n\t"
        "movq	88(%[t]), %%rax\n\t"
        "movq	96(%[t]), %%r10\n\t"
        "movq	104(%[t]), %%r11\n\t"
        "movq	112(%[t]), %%r12\n\t"
        "# A[10] * B[0]\n\t"
        "mulx	0(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "# A[10] * B[1]\n\t"
        "mulx	8(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "# A[10] * B[2]\n\t"
        "mulx	16(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "adoxq	%%r9, %%r11\n\t"
        "# A[10] * B[3]\n\t"
        "mulx	24(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "adoxq	%%r9, %%r12\n\t"
        "movq	%%r14, 80(%[t])\n\t"
        "movq	%%rax, 88(%[t])\n\t"
        "movq	%%r10, 96(%[t])\n\t"
        "movq	%%r11, 104(%[t])\n\t"
        "movq	120(%[t]), %%r13\n\t"
        "movq	128(%[t]), %%r14\n\t"
        "movq	136(%[t]), %%rax\n\t"
        "movq	144(%[t]), %%r10\n\t"
        "# A[10] * B[4]\n\t"
        "mulx	32(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "# A[10] * B[5]\n\t"
        "mulx	40(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "# A[10] * B[6]\n\t"
        "mulx	48(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "# A[10] * B[7]\n\t"
        "mulx	56(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "movq	%%r12, 112(%[t])\n\t"
        "movq	%%r13, 120(%[t])\n\t"
        "movq	%%r14, 128(%[t])\n\t"
        "movq	%%rax, 136(%[t])\n\t"
        "movq	152(%[t]), %%r11\n\t"
        "movq	160(%[t]), %%r12\n\t"
        "movq	168(%[t]), %%r13\n\t"
        "movq	176(%[t]), %%r14\n\t"
        "# A[10] * B[8]\n\t"
        "mulx	64(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "adoxq	%%r9, %%r11\n\t"
        "# A[10] * B[9]\n\t"
        "mulx	72(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "adoxq	%%r9, %%r12\n\t"
        "# A[10] * B[10]\n\t"
        "mulx	80(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "# A[10] * B[11]\n\t"
        "mulx	88(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "movq	%%r10, 144(%[t])\n\t"
        "movq	%%r11, 152(%[t])\n\t"
        "movq	%%r12, 160(%[t])\n\t"
        "movq	%%r13, 168(%[t])\n\t"
        "movq	184(%[t]), %%rax\n\t"
        "movq	192(%[t]), %%r10\n\t"
        "movq	200(%[t]), %%r11\n\t"
        "movq	208(%[t]), %%r12\n\t"
        "# A[10] * B[12]\n\t"
        "mulx	96(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "# A[10] * B[13]\n\t"
        "mulx	104(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "# A[10] * B[14]\n\t"
        "mulx	112(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "adoxq	%%r9, %%r11\n\t"
        "# A[10] * B[15]\n\t"
        "mulx	120(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "adoxq	%%r9, %%r12\n\t"
        "movq	%%r14, 176(%[t])\n\t"
        "movq	%%rax, 184(%[t])\n\t"
        "movq	%%r10, 192(%[t])\n\t"
        "movq	%%r11, 200(%[t])\n\t"
        "movq	216(%[t]), %%r13\n\t"
        "movq	224(%[t]), %%r14\n\t"
        "movq	232(%[t]), %%rax\n\t"
        "movq	240(%[t]), %%r10\n\t"
        "# A[10] * B[16]\n\t"
        "mulx	128(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "# A[10] * B[17]\n\t"
        "mulx	136(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "# A[10] * B[18]\n\t"
        "mulx	144(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "# A[10] * B[19]\n\t"
        "mulx	152(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "movq	%%r12, 208(%[t])\n\t"
        "movq	%%r13, 216(%[t])\n\t"
        "movq	%%r14, 224(%[t])\n\t"
        "movq	%%rax, 232(%[t])\n\t"
        "movq	248(%[t]), %%r11\n\t"
        "movq	256(%[t]), %%r12\n\t"
        "movq	264(%[t]), %%r13\n\t"
        "# A[10] * B[20]\n\t"
        "mulx	160(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "adoxq	%%r9, %%r11\n\t"
        "# A[10] * B[21]\n\t"
        "mulx	168(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "adoxq	%%r9, %%r12\n\t"
        "# A[10] * B[22]\n\t"
        "mulx	176(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "# A[10] * B[23]\n\t"
        "mulx	184(%[b]), %%r8, %%r9\n\t"
        "movq	%%r15, %%r14\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "adcxq	%%rcx, %%r14\n\t"
        "movq	%%r15, %%rcx\n\t"
        "adoxq	%%r15, %%rcx\n\t"
        "adcxq	%%r15, %%rcx\n\t"
        "movq	%%r10, 240(%[t])\n\t"
        "movq	%%r11, 248(%[t])\n\t"
        "movq	%%r12, 256(%[t])\n\t"
        "movq	%%r13, 264(%[t])\n\t"
        "movq	%%r14, 272(%[t])\n\t"
        "movq	88(%[a]), %%rdx\n\t"
        "xorq	%%r15, %%r15\n\t"
        "movq	88(%[t]), %%rax\n\t"
        "movq	96(%[t]), %%r10\n\t"
        "movq	104(%[t]), %%r11\n\t"
        "movq	112(%[t]), %%r12\n\t"
        "movq	120(%[t]), %%r13\n\t"
        "# A[11] * B[0]\n\t"
        "mulx	0(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "# A[11] * B[1]\n\t"
        "mulx	8(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "adoxq	%%r9, %%r11\n\t"
        "# A[11] * B[2]\n\t"
        "mulx	16(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "adoxq	%%r9, %%r12\n\t"
        "# A[11] * B[3]\n\t"
        "mulx	24(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "movq	%%rax, 88(%[t])\n\t"
        "movq	%%r10, 96(%[t])\n\t"
        "movq	%%r11, 104(%[t])\n\t"
        "movq	%%r12, 112(%[t])\n\t"
        "movq	128(%[t]), %%r14\n\t"
        "movq	136(%[t]), %%rax\n\t"
        "movq	144(%[t]), %%r10\n\t"
        "movq	152(%[t]), %%r11\n\t"
        "# A[11] * B[4]\n\t"
        "mulx	32(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "# A[11] * B[5]\n\t"
        "mulx	40(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "# A[11] * B[6]\n\t"
        "mulx	48(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "# A[11] * B[7]\n\t"
        "mulx	56(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "adoxq	%%r9, %%r11\n\t"
        "movq	%%r13, 120(%[t])\n\t"
        "movq	%%r14, 128(%[t])\n\t"
        "movq	%%rax, 136(%[t])\n\t"
        "movq	%%r10, 144(%[t])\n\t"
        "movq	160(%[t]), %%r12\n\t"
        "movq	168(%[t]), %%r13\n\t"
        "movq	176(%[t]), %%r14\n\t"
        "movq	184(%[t]), %%rax\n\t"
        "# A[11] * B[8]\n\t"
        "mulx	64(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "adoxq	%%r9, %%r12\n\t"
        "# A[11] * B[9]\n\t"
        "mulx	72(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "# A[11] * B[10]\n\t"
        "mulx	80(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "# A[11] * B[11]\n\t"
        "mulx	88(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "movq	%%r11, 152(%[t])\n\t"
        "movq	%%r12, 160(%[t])\n\t"
        "movq	%%r13, 168(%[t])\n\t"
        "movq	%%r14, 176(%[t])\n\t"
        "movq	192(%[t]), %%r10\n\t"
        "movq	200(%[t]), %%r11\n\t"
        "movq	208(%[t]), %%r12\n\t"
        "movq	216(%[t]), %%r13\n\t"
        "# A[11] * B[12]\n\t"
        "mulx	96(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "# A[11] * B[13]\n\t"
        "mulx	104(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "adoxq	%%r9, %%r11\n\t"
        "# A[11] * B[14]\n\t"
        "mulx	112(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "adoxq	%%r9, %%r12\n\t"
        "# A[11] * B[15]\n\t"
        "mulx	120(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "movq	%%rax, 184(%[t])\n\t"
        "movq	%%r10, 192(%[t])\n\t"
        "movq	%%r11, 200(%[t])\n\t"
        "movq	%%r12, 208(%[t])\n\t"
        "movq	224(%[t]), %%r14\n\t"
        "movq	232(%[t]), %%rax\n\t"
        "movq	240(%[t]), %%r10\n\t"
        "movq	248(%[t]), %%r11\n\t"
        "# A[11] * B[16]\n\t"
        "mulx	128(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "# A[11] * B[17]\n\t"
        "mulx	136(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "# A[11] * B[18]\n\t"
        "mulx	144(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "# A[11] * B[19]\n\t"
        "mulx	152(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "adoxq	%%r9, %%r11\n\t"
        "movq	%%r13, 216(%[t])\n\t"
        "movq	%%r14, 224(%[t])\n\t"
        "movq	%%rax, 232(%[t])\n\t"
        "movq	%%r10, 240(%[t])\n\t"
        "movq	256(%[t]), %%r12\n\t"
        "movq	264(%[t]), %%r13\n\t"
        "movq	272(%[t]), %%r14\n\t"
        "# A[11] * B[20]\n\t"
        "mulx	160(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "adoxq	%%r9, %%r12\n\t"
        "# A[11] * B[21]\n\t"
        "mulx	168(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "# A[11] * B[22]\n\t"
        "mulx	176(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "# A[11] * B[23]\n\t"
        "mulx	184(%[b]), %%r8, %%r9\n\t"
        "movq	%%r15, %%rax\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "adcxq	%%rcx, %%rax\n\t"
        "movq	%%r15, %%rcx\n\t"
        "adoxq	%%r15, %%rcx\n\t"
        "adcxq	%%r15, %%rcx\n\t"
        "movq	%%r11, 248(%[t])\n\t"
        "movq	%%r12, 256(%[t])\n\t"
        "movq	%%r13, 264(%[t])\n\t"
        "movq	%%r14, 272(%[t])\n\t"
        "movq	%%rax, 280(%[t])\n\t"
        "movq	96(%[a]), %%rdx\n\t"
        "xorq	%%r15, %%r15\n\t"
        "movq	96(%[t]), %%r10\n\t"
        "movq	104(%[t]), %%r11\n\t"
        "movq	112(%[t]), %%r12\n\t"
        "movq	120(%[t]), %%r13\n\t"
        "movq	128(%[t]), %%r14\n\t"
        "# A[12] * B[0]\n\t"
        "mulx	0(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "adoxq	%%r9, %%r11\n\t"
        "# A[12] * B[1]\n\t"
        "mulx	8(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "adoxq	%%r9, %%r12\n\t"
        "# A[12] * B[2]\n\t"
        "mulx	16(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "# A[12] * B[3]\n\t"
        "mulx	24(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "movq	%%r10, 96(%[t])\n\t"
        "movq	%%r11, 104(%[t])\n\t"
        "movq	%%r12, 112(%[t])\n\t"
        "movq	%%r13, 120(%[t])\n\t"
        "movq	136(%[t]), %%rax\n\t"
        "movq	144(%[t]), %%r10\n\t"
        "movq	152(%[t]), %%r11\n\t"
        "movq	160(%[t]), %%r12\n\t"
        "# A[12] * B[4]\n\t"
        "mulx	32(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "# A[12] * B[5]\n\t"
        "mulx	40(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "# A[12] * B[6]\n\t"
        "mulx	48(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "adoxq	%%r9, %%r11\n\t"
        "# A[12] * B[7]\n\t"
        "mulx	56(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "adoxq	%%r9, %%r12\n\t"
        "movq	%%r14, 128(%[t])\n\t"
        "movq	%%rax, 136(%[t])\n\t"
        "movq	%%r10, 144(%[t])\n\t"
        "movq	%%r11, 152(%[t])\n\t"
        "movq	168(%[t]), %%r13\n\t"
        "movq	176(%[t]), %%r14\n\t"
        "movq	184(%[t]), %%rax\n\t"
        "movq	192(%[t]), %%r10\n\t"
        "# A[12] * B[8]\n\t"
        "mulx	64(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "# A[12] * B[9]\n\t"
        "mulx	72(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "# A[12] * B[10]\n\t"
        "mulx	80(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "# A[12] * B[11]\n\t"
        "mulx	88(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "movq	%%r12, 160(%[t])\n\t"
        "movq	%%r13, 168(%[t])\n\t"
        "movq	%%r14, 176(%[t])\n\t"
        "movq	%%rax, 184(%[t])\n\t"
        "movq	200(%[t]), %%r11\n\t"
        "movq	208(%[t]), %%r12\n\t"
        "movq	216(%[t]), %%r13\n\t"
        "movq	224(%[t]), %%r14\n\t"
        "# A[12] * B[12]\n\t"
        "mulx	96(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "adoxq	%%r9, %%r11\n\t"
        "# A[12] * B[13]\n\t"
        "mulx	104(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "adoxq	%%r9, %%r12\n\t"
        "# A[12] * B[14]\n\t"
        "mulx	112(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "# A[12] * B[15]\n\t"
        "mulx	120(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "movq	%%r10, 192(%[t])\n\t"
        "movq	%%r11, 200(%[t])\n\t"
        "movq	%%r12, 208(%[t])\n\t"
        "movq	%%r13, 216(%[t])\n\t"
        "movq	232(%[t]), %%rax\n\t"
        "movq	240(%[t]), %%r10\n\t"
        "movq	248(%[t]), %%r11\n\t"
        "movq	256(%[t]), %%r12\n\t"
        "# A[12] * B[16]\n\t"
        "mulx	128(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "# A[12] * B[17]\n\t"
        "mulx	136(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "# A[12] * B[18]\n\t"
        "mulx	144(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "adoxq	%%r9, %%r11\n\t"
        "# A[12] * B[19]\n\t"
        "mulx	152(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "adoxq	%%r9, %%r12\n\t"
        "movq	%%r14, 224(%[t])\n\t"
        "movq	%%rax, 232(%[t])\n\t"
        "movq	%%r10, 240(%[t])\n\t"
        "movq	%%r11, 248(%[t])\n\t"
        "movq	264(%[t]), %%r13\n\t"
        "movq	272(%[t]), %%r14\n\t"
        "movq	280(%[t]), %%rax\n\t"
        "# A[12] * B[20]\n\t"
        "mulx	160(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "# A[12] * B[21]\n\t"
        "mulx	168(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "# A[12] * B[22]\n\t"
        "mulx	176(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "# A[12] * B[23]\n\t"
        "mulx	184(%[b]), %%r8, %%r9\n\t"
        "movq	%%r15, %%r10\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "adcxq	%%rcx, %%r10\n\t"
        "movq	%%r15, %%rcx\n\t"
        "adoxq	%%r15, %%rcx\n\t"
        "adcxq	%%r15, %%rcx\n\t"
        "movq	%%r12, 256(%[t])\n\t"
        "movq	%%r13, 264(%[t])\n\t"
        "movq	%%r14, 272(%[t])\n\t"
        "movq	%%rax, 280(%[t])\n\t"
        "movq	%%r10, 288(%[t])\n\t"
        "movq	104(%[a]), %%rdx\n\t"
        "xorq	%%r15, %%r15\n\t"
        "movq	104(%[t]), %%r11\n\t"
        "movq	112(%[t]), %%r12\n\t"
        "movq	120(%[t]), %%r13\n\t"
        "movq	128(%[t]), %%r14\n\t"
        "movq	136(%[t]), %%rax\n\t"
        "# A[13] * B[0]\n\t"
        "mulx	0(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "adoxq	%%r9, %%r12\n\t"
        "# A[13] * B[1]\n\t"
        "mulx	8(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "# A[13] * B[2]\n\t"
        "mulx	16(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "# A[13] * B[3]\n\t"
        "mulx	24(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "movq	%%r11, 104(%[t])\n\t"
        "movq	%%r12, 112(%[t])\n\t"
        "movq	%%r13, 120(%[t])\n\t"
        "movq	%%r14, 128(%[t])\n\t"
        "movq	144(%[t]), %%r10\n\t"
        "movq	152(%[t]), %%r11\n\t"
        "movq	160(%[t]), %%r12\n\t"
        "movq	168(%[t]), %%r13\n\t"
        "# A[13] * B[4]\n\t"
        "mulx	32(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "# A[13] * B[5]\n\t"
        "mulx	40(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "adoxq	%%r9, %%r11\n\t"
        "# A[13] * B[6]\n\t"
        "mulx	48(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "adoxq	%%r9, %%r12\n\t"
        "# A[13] * B[7]\n\t"
        "mulx	56(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "movq	%%rax, 136(%[t])\n\t"
        "movq	%%r10, 144(%[t])\n\t"
        "movq	%%r11, 152(%[t])\n\t"
        "movq	%%r12, 160(%[t])\n\t"
        "movq	176(%[t]), %%r14\n\t"
        "movq	184(%[t]), %%rax\n\t"
        "movq	192(%[t]), %%r10\n\t"
        "movq	200(%[t]), %%r11\n\t"
        "# A[13] * B[8]\n\t"
        "mulx	64(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "# A[13] * B[9]\n\t"
        "mulx	72(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "# A[13] * B[10]\n\t"
        "mulx	80(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "# A[13] * B[11]\n\t"
        "mulx	88(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "adoxq	%%r9, %%r11\n\t"
        "movq	%%r13, 168(%[t])\n\t"
        "movq	%%r14, 176(%[t])\n\t"
        "movq	%%rax, 184(%[t])\n\t"
        "movq	%%r10, 192(%[t])\n\t"
        "movq	208(%[t]), %%r12\n\t"
        "movq	216(%[t]), %%r13\n\t"
        "movq	224(%[t]), %%r14\n\t"
        "movq	232(%[t]), %%rax\n\t"
        "# A[13] * B[12]\n\t"
        "mulx	96(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "adoxq	%%r9, %%r12\n\t"
        "# A[13] * B[13]\n\t"
        "mulx	104(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "# A[13] * B[14]\n\t"
        "mulx	112(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "# A[13] * B[15]\n\t"
        "mulx	120(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "movq	%%r11, 200(%[t])\n\t"
        "movq	%%r12, 208(%[t])\n\t"
        "movq	%%r13, 216(%[t])\n\t"
        "movq	%%r14, 224(%[t])\n\t"
        "movq	240(%[t]), %%r10\n\t"
        "movq	248(%[t]), %%r11\n\t"
        "movq	256(%[t]), %%r12\n\t"
        "movq	264(%[t]), %%r13\n\t"
        "# A[13] * B[16]\n\t"
        "mulx	128(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "# A[13] * B[17]\n\t"
        "mulx	136(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "adoxq	%%r9, %%r11\n\t"
        "# A[13] * B[18]\n\t"
        "mulx	144(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "adoxq	%%r9, %%r12\n\t"
        "# A[13] * B[19]\n\t"
        "mulx	152(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "movq	%%rax, 232(%[t])\n\t"
        "movq	%%r10, 240(%[t])\n\t"
        "movq	%%r11, 248(%[t])\n\t"
        "movq	%%r12, 256(%[t])\n\t"
        "movq	272(%[t]), %%r14\n\t"
        "movq	280(%[t]), %%rax\n\t"
        "movq	288(%[t]), %%r10\n\t"
        "# A[13] * B[20]\n\t"
        "mulx	160(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "# A[13] * B[21]\n\t"
        "mulx	168(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "# A[13] * B[22]\n\t"
        "mulx	176(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "# A[13] * B[23]\n\t"
        "mulx	184(%[b]), %%r8, %%r9\n\t"
        "movq	%%r15, %%r11\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "adoxq	%%r9, %%r11\n\t"
        "adcxq	%%rcx, %%r11\n\t"
        "movq	%%r15, %%rcx\n\t"
        "adoxq	%%r15, %%rcx\n\t"
        "adcxq	%%r15, %%rcx\n\t"
        "movq	%%r13, 264(%[t])\n\t"
        "movq	%%r14, 272(%[t])\n\t"
        "movq	%%rax, 280(%[t])\n\t"
        "movq	%%r10, 288(%[t])\n\t"
        "movq	%%r11, 296(%[t])\n\t"
        "movq	112(%[a]), %%rdx\n\t"
        "xorq	%%r15, %%r15\n\t"
        "movq	112(%[t]), %%r12\n\t"
        "movq	120(%[t]), %%r13\n\t"
        "movq	128(%[t]), %%r14\n\t"
        "movq	136(%[t]), %%rax\n\t"
        "movq	144(%[t]), %%r10\n\t"
        "# A[14] * B[0]\n\t"
        "mulx	0(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "# A[14] * B[1]\n\t"
        "mulx	8(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "# A[14] * B[2]\n\t"
        "mulx	16(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "# A[14] * B[3]\n\t"
        "mulx	24(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "movq	%%r12, 112(%[t])\n\t"
        "movq	%%r13, 120(%[t])\n\t"
        "movq	%%r14, 128(%[t])\n\t"
        "movq	%%rax, 136(%[t])\n\t"
        "movq	152(%[t]), %%r11\n\t"
        "movq	160(%[t]), %%r12\n\t"
        "movq	168(%[t]), %%r13\n\t"
        "movq	176(%[t]), %%r14\n\t"
        "# A[14] * B[4]\n\t"
        "mulx	32(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "adoxq	%%r9, %%r11\n\t"
        "# A[14] * B[5]\n\t"
        "mulx	40(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "adoxq	%%r9, %%r12\n\t"
        "# A[14] * B[6]\n\t"
        "mulx	48(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "# A[14] * B[7]\n\t"
        "mulx	56(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "movq	%%r10, 144(%[t])\n\t"
        "movq	%%r11, 152(%[t])\n\t"
        "movq	%%r12, 160(%[t])\n\t"
        "movq	%%r13, 168(%[t])\n\t"
        "movq	184(%[t]), %%rax\n\t"
        "movq	192(%[t]), %%r10\n\t"
        "movq	200(%[t]), %%r11\n\t"
        "movq	208(%[t]), %%r12\n\t"
        "# A[14] * B[8]\n\t"
        "mulx	64(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "# A[14] * B[9]\n\t"
        "mulx	72(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "# A[14] * B[10]\n\t"
        "mulx	80(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "adoxq	%%r9, %%r11\n\t"
        "# A[14] * B[11]\n\t"
        "mulx	88(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "adoxq	%%r9, %%r12\n\t"
        "movq	%%r14, 176(%[t])\n\t"
        "movq	%%rax, 184(%[t])\n\t"
        "movq	%%r10, 192(%[t])\n\t"
        "movq	%%r11, 200(%[t])\n\t"
        "movq	216(%[t]), %%r13\n\t"
        "movq	224(%[t]), %%r14\n\t"
        "movq	232(%[t]), %%rax\n\t"
        "movq	240(%[t]), %%r10\n\t"
        "# A[14] * B[12]\n\t"
        "mulx	96(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "# A[14] * B[13]\n\t"
        "mulx	104(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "# A[14] * B[14]\n\t"
        "mulx	112(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "# A[14] * B[15]\n\t"
        "mulx	120(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "movq	%%r12, 208(%[t])\n\t"
        "movq	%%r13, 216(%[t])\n\t"
        "movq	%%r14, 224(%[t])\n\t"
        "movq	%%rax, 232(%[t])\n\t"
        "movq	248(%[t]), %%r11\n\t"
        "movq	256(%[t]), %%r12\n\t"
        "movq	264(%[t]), %%r13\n\t"
        "movq	272(%[t]), %%r14\n\t"
        "# A[14] * B[16]\n\t"
        "mulx	128(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "adoxq	%%r9, %%r11\n\t"
        "# A[14] * B[17]\n\t"
        "mulx	136(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "adoxq	%%r9, %%r12\n\t"
        "# A[14] * B[18]\n\t"
        "mulx	144(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "# A[14] * B[19]\n\t"
        "mulx	152(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "movq	%%r10, 240(%[t])\n\t"
        "movq	%%r11, 248(%[t])\n\t"
        "movq	%%r12, 256(%[t])\n\t"
        "movq	%%r13, 264(%[t])\n\t"
        "movq	280(%[t]), %%rax\n\t"
        "movq	288(%[t]), %%r10\n\t"
        "movq	296(%[t]), %%r11\n\t"
        "# A[14] * B[20]\n\t"
        "mulx	160(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "# A[14] * B[21]\n\t"
        "mulx	168(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "# A[14] * B[22]\n\t"
        "mulx	176(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "adoxq	%%r9, %%r11\n\t"
        "# A[14] * B[23]\n\t"
        "mulx	184(%[b]), %%r8, %%r9\n\t"
        "movq	%%r15, %%r12\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "adoxq	%%r9, %%r12\n\t"
        "adcxq	%%rcx, %%r12\n\t"
        "movq	%%r15, %%rcx\n\t"
        "adoxq	%%r15, %%rcx\n\t"
        "adcxq	%%r15, %%rcx\n\t"
        "movq	%%r14, 272(%[t])\n\t"
        "movq	%%rax, 280(%[t])\n\t"
        "movq	%%r10, 288(%[t])\n\t"
        "movq	%%r11, 296(%[t])\n\t"
        "movq	%%r12, 304(%[t])\n\t"
        "movq	120(%[a]), %%rdx\n\t"
        "xorq	%%r15, %%r15\n\t"
        "movq	120(%[t]), %%r13\n\t"
        "movq	128(%[t]), %%r14\n\t"
        "movq	136(%[t]), %%rax\n\t"
        "movq	144(%[t]), %%r10\n\t"
        "movq	152(%[t]), %%r11\n\t"
        "# A[15] * B[0]\n\t"
        "mulx	0(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "# A[15] * B[1]\n\t"
        "mulx	8(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "# A[15] * B[2]\n\t"
        "mulx	16(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "# A[15] * B[3]\n\t"
        "mulx	24(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "adoxq	%%r9, %%r11\n\t"
        "movq	%%r13, 120(%[t])\n\t"
        "movq	%%r14, 128(%[t])\n\t"
        "movq	%%rax, 136(%[t])\n\t"
        "movq	%%r10, 144(%[t])\n\t"
        "movq	160(%[t]), %%r12\n\t"
        "movq	168(%[t]), %%r13\n\t"
        "movq	176(%[t]), %%r14\n\t"
        "movq	184(%[t]), %%rax\n\t"
        "# A[15] * B[4]\n\t"
        "mulx	32(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "adoxq	%%r9, %%r12\n\t"
        "# A[15] * B[5]\n\t"
        "mulx	40(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "# A[15] * B[6]\n\t"
        "mulx	48(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "# A[15] * B[7]\n\t"
        "mulx	56(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "movq	%%r11, 152(%[t])\n\t"
        "movq	%%r12, 160(%[t])\n\t"
        "movq	%%r13, 168(%[t])\n\t"
        "movq	%%r14, 176(%[t])\n\t"
        "movq	192(%[t]), %%r10\n\t"
        "movq	200(%[t]), %%r11\n\t"
        "movq	208(%[t]), %%r12\n\t"
        "movq	216(%[t]), %%r13\n\t"
        "# A[15] * B[8]\n\t"
        "mulx	64(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "# A[15] * B[9]\n\t"
        "mulx	72(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "adoxq	%%r9, %%r11\n\t"
        "# A[15] * B[10]\n\t"
        "mulx	80(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "adoxq	%%r9, %%r12\n\t"
        "# A[15] * B[11]\n\t"
        "mulx	88(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "movq	%%rax, 184(%[t])\n\t"
        "movq	%%r10, 192(%[t])\n\t"
        "movq	%%r11, 200(%[t])\n\t"
        "movq	%%r12, 208(%[t])\n\t"
        "movq	224(%[t]), %%r14\n\t"
        "movq	232(%[t]), %%rax\n\t"
        "movq	240(%[t]), %%r10\n\t"
        "movq	248(%[t]), %%r11\n\t"
        "# A[15] * B[12]\n\t"
        "mulx	96(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "# A[15] * B[13]\n\t"
        "mulx	104(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "# A[15] * B[14]\n\t"
        "mulx	112(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "# A[15] * B[15]\n\t"
        "mulx	120(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "adoxq	%%r9, %%r11\n\t"
        "movq	%%r13, 216(%[t])\n\t"
        "movq	%%r14, 224(%[t])\n\t"
        "movq	%%rax, 232(%[t])\n\t"
        "movq	%%r10, 240(%[t])\n\t"
        "movq	256(%[t]), %%r12\n\t"
        "movq	264(%[t]), %%r13\n\t"
        "movq	272(%[t]), %%r14\n\t"
        "movq	280(%[t]), %%rax\n\t"
        "# A[15] * B[16]\n\t"
        "mulx	128(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "adoxq	%%r9, %%r12\n\t"
        "# A[15] * B[17]\n\t"
        "mulx	136(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "# A[15] * B[18]\n\t"
        "mulx	144(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "# A[15] * B[19]\n\t"
        "mulx	152(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "movq	%%r11, 248(%[t])\n\t"
        "movq	%%r12, 256(%[t])\n\t"
        "movq	%%r13, 264(%[t])\n\t"
        "movq	%%r14, 272(%[t])\n\t"
        "movq	288(%[t]), %%r10\n\t"
        "movq	296(%[t]), %%r11\n\t"
        "movq	304(%[t]), %%r12\n\t"
        "# A[15] * B[20]\n\t"
        "mulx	160(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "# A[15] * B[21]\n\t"
        "mulx	168(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "adoxq	%%r9, %%r11\n\t"
        "# A[15] * B[22]\n\t"
        "mulx	176(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "adoxq	%%r9, %%r12\n\t"
        "# A[15] * B[23]\n\t"
        "mulx	184(%[b]), %%r8, %%r9\n\t"
        "movq	%%r15, %%r13\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "adcxq	%%rcx, %%r13\n\t"
        "movq	%%r15, %%rcx\n\t"
        "adoxq	%%r15, %%rcx\n\t"
        "adcxq	%%r15, %%rcx\n\t"
        "movq	%%rax, 280(%[t])\n\t"
        "movq	%%r10, 288(%[t])\n\t"
        "movq	%%r11, 296(%[t])\n\t"
        "movq	%%r12, 304(%[t])\n\t"
        "movq	%%r13, 312(%[t])\n\t"
        "movq	128(%[a]), %%rdx\n\t"
        "xorq	%%r15, %%r15\n\t"
        "movq	128(%[t]), %%r14\n\t"
        "movq	136(%[t]), %%rax\n\t"
        "movq	144(%[t]), %%r10\n\t"
        "movq	152(%[t]), %%r11\n\t"
        "movq	160(%[t]), %%r12\n\t"
        "# A[16] * B[0]\n\t"
        "mulx	0(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "# A[16] * B[1]\n\t"
        "mulx	8(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "# A[16] * B[2]\n\t"
        "mulx	16(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "adoxq	%%r9, %%r11\n\t"
        "# A[16] * B[3]\n\t"
        "mulx	24(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "adoxq	%%r9, %%r12\n\t"
        "movq	%%r14, 128(%[t])\n\t"
        "movq	%%rax, 136(%[t])\n\t"
        "movq	%%r10, 144(%[t])\n\t"
        "movq	%%r11, 152(%[t])\n\t"
        "movq	168(%[t]), %%r13\n\t"
        "movq	176(%[t]), %%r14\n\t"
        "movq	184(%[t]), %%rax\n\t"
        "movq	192(%[t]), %%r10\n\t"
        "# A[16] * B[4]\n\t"
        "mulx	32(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "# A[16] * B[5]\n\t"
        "mulx	40(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "# A[16] * B[6]\n\t"
        "mulx	48(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "# A[16] * B[7]\n\t"
        "mulx	56(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "movq	%%r12, 160(%[t])\n\t"
        "movq	%%r13, 168(%[t])\n\t"
        "movq	%%r14, 176(%[t])\n\t"
        "movq	%%rax, 184(%[t])\n\t"
        "movq	200(%[t]), %%r11\n\t"
        "movq	208(%[t]), %%r12\n\t"
        "movq	216(%[t]), %%r13\n\t"
        "movq	224(%[t]), %%r14\n\t"
        "# A[16] * B[8]\n\t"
        "mulx	64(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "adoxq	%%r9, %%r11\n\t"
        "# A[16] * B[9]\n\t"
        "mulx	72(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "adoxq	%%r9, %%r12\n\t"
        "# A[16] * B[10]\n\t"
        "mulx	80(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "# A[16] * B[11]\n\t"
        "mulx	88(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "movq	%%r10, 192(%[t])\n\t"
        "movq	%%r11, 200(%[t])\n\t"
        "movq	%%r12, 208(%[t])\n\t"
        "movq	%%r13, 216(%[t])\n\t"
        "movq	232(%[t]), %%rax\n\t"
        "movq	240(%[t]), %%r10\n\t"
        "movq	248(%[t]), %%r11\n\t"
        "movq	256(%[t]), %%r12\n\t"
        "# A[16] * B[12]\n\t"
        "mulx	96(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "# A[16] * B[13]\n\t"
        "mulx	104(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "# A[16] * B[14]\n\t"
        "mulx	112(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "adoxq	%%r9, %%r11\n\t"
        "# A[16] * B[15]\n\t"
        "mulx	120(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "adoxq	%%r9, %%r12\n\t"
        "movq	%%r14, 224(%[t])\n\t"
        "movq	%%rax, 232(%[t])\n\t"
        "movq	%%r10, 240(%[t])\n\t"
        "movq	%%r11, 248(%[t])\n\t"
        "movq	264(%[t]), %%r13\n\t"
        "movq	272(%[t]), %%r14\n\t"
        "movq	280(%[t]), %%rax\n\t"
        "movq	288(%[t]), %%r10\n\t"
        "# A[16] * B[16]\n\t"
        "mulx	128(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "# A[16] * B[17]\n\t"
        "mulx	136(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "# A[16] * B[18]\n\t"
        "mulx	144(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "# A[16] * B[19]\n\t"
        "mulx	152(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "movq	%%r12, 256(%[t])\n\t"
        "movq	%%r13, 264(%[t])\n\t"
        "movq	%%r14, 272(%[t])\n\t"
        "movq	%%rax, 280(%[t])\n\t"
        "movq	296(%[t]), %%r11\n\t"
        "movq	304(%[t]), %%r12\n\t"
        "movq	312(%[t]), %%r13\n\t"
        "# A[16] * B[20]\n\t"
        "mulx	160(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "adoxq	%%r9, %%r11\n\t"
        "# A[16] * B[21]\n\t"
        "mulx	168(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "adoxq	%%r9, %%r12\n\t"
        "# A[16] * B[22]\n\t"
        "mulx	176(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "# A[16] * B[23]\n\t"
        "mulx	184(%[b]), %%r8, %%r9\n\t"
        "movq	%%r15, %%r14\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "adcxq	%%rcx, %%r14\n\t"
        "movq	%%r15, %%rcx\n\t"
        "adoxq	%%r15, %%rcx\n\t"
        "adcxq	%%r15, %%rcx\n\t"
        "movq	%%r10, 288(%[t])\n\t"
        "movq	%%r11, 296(%[t])\n\t"
        "movq	%%r12, 304(%[t])\n\t"
        "movq	%%r13, 312(%[t])\n\t"
        "movq	%%r14, 320(%[t])\n\t"
        "movq	136(%[a]), %%rdx\n\t"
        "xorq	%%r15, %%r15\n\t"
        "movq	136(%[t]), %%rax\n\t"
        "movq	144(%[t]), %%r10\n\t"
        "movq	152(%[t]), %%r11\n\t"
        "movq	160(%[t]), %%r12\n\t"
        "movq	168(%[t]), %%r13\n\t"
        "# A[17] * B[0]\n\t"
        "mulx	0(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "# A[17] * B[1]\n\t"
        "mulx	8(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "adoxq	%%r9, %%r11\n\t"
        "# A[17] * B[2]\n\t"
        "mulx	16(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "adoxq	%%r9, %%r12\n\t"
        "# A[17] * B[3]\n\t"
        "mulx	24(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "movq	%%rax, 136(%[t])\n\t"
        "movq	%%r10, 144(%[t])\n\t"
        "movq	%%r11, 152(%[t])\n\t"
        "movq	%%r12, 160(%[t])\n\t"
        "movq	176(%[t]), %%r14\n\t"
        "movq	184(%[t]), %%rax\n\t"
        "movq	192(%[t]), %%r10\n\t"
        "movq	200(%[t]), %%r11\n\t"
        "# A[17] * B[4]\n\t"
        "mulx	32(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "# A[17] * B[5]\n\t"
        "mulx	40(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "# A[17] * B[6]\n\t"
        "mulx	48(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "# A[17] * B[7]\n\t"
        "mulx	56(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "adoxq	%%r9, %%r11\n\t"
        "movq	%%r13, 168(%[t])\n\t"
        "movq	%%r14, 176(%[t])\n\t"
        "movq	%%rax, 184(%[t])\n\t"
        "movq	%%r10, 192(%[t])\n\t"
        "movq	208(%[t]), %%r12\n\t"
        "movq	216(%[t]), %%r13\n\t"
        "movq	224(%[t]), %%r14\n\t"
        "movq	232(%[t]), %%rax\n\t"
        "# A[17] * B[8]\n\t"
        "mulx	64(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "adoxq	%%r9, %%r12\n\t"
        "# A[17] * B[9]\n\t"
        "mulx	72(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "# A[17] * B[10]\n\t"
        "mulx	80(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "# A[17] * B[11]\n\t"
        "mulx	88(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "movq	%%r11, 200(%[t])\n\t"
        "movq	%%r12, 208(%[t])\n\t"
        "movq	%%r13, 216(%[t])\n\t"
        "movq	%%r14, 224(%[t])\n\t"
        "movq	240(%[t]), %%r10\n\t"
        "movq	248(%[t]), %%r11\n\t"
        "movq	256(%[t]), %%r12\n\t"
        "movq	264(%[t]), %%r13\n\t"
        "# A[17] * B[12]\n\t"
        "mulx	96(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "# A[17] * B[13]\n\t"
        "mulx	104(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "adoxq	%%r9, %%r11\n\t"
        "# A[17] * B[14]\n\t"
        "mulx	112(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "adoxq	%%r9, %%r12\n\t"
        "# A[17] * B[15]\n\t"
        "mulx	120(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "movq	%%rax, 232(%[t])\n\t"
        "movq	%%r10, 240(%[t])\n\t"
        "movq	%%r11, 248(%[t])\n\t"
        "movq	%%r12, 256(%[t])\n\t"
        "movq	272(%[t]), %%r14\n\t"
        "movq	280(%[t]), %%rax\n\t"
        "movq	288(%[t]), %%r10\n\t"
        "movq	296(%[t]), %%r11\n\t"
        "# A[17] * B[16]\n\t"
        "mulx	128(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "# A[17] * B[17]\n\t"
        "mulx	136(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "# A[17] * B[18]\n\t"
        "mulx	144(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "# A[17] * B[19]\n\t"
        "mulx	152(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "adoxq	%%r9, %%r11\n\t"
        "movq	%%r13, 264(%[t])\n\t"
        "movq	%%r14, 272(%[t])\n\t"
        "movq	%%rax, 280(%[t])\n\t"
        "movq	%%r10, 288(%[t])\n\t"
        "movq	304(%[t]), %%r12\n\t"
        "movq	312(%[t]), %%r13\n\t"
        "movq	320(%[t]), %%r14\n\t"
        "# A[17] * B[20]\n\t"
        "mulx	160(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "adoxq	%%r9, %%r12\n\t"
        "# A[17] * B[21]\n\t"
        "mulx	168(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "# A[17] * B[22]\n\t"
        "mulx	176(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "# A[17] * B[23]\n\t"
        "mulx	184(%[b]), %%r8, %%r9\n\t"
        "movq	%%r15, %%rax\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "adcxq	%%rcx, %%rax\n\t"
        "movq	%%r15, %%rcx\n\t"
        "adoxq	%%r15, %%rcx\n\t"
        "adcxq	%%r15, %%rcx\n\t"
        "movq	%%r11, 296(%[t])\n\t"
        "movq	%%r12, 304(%[t])\n\t"
        "movq	%%r13, 312(%[t])\n\t"
        "movq	%%r14, 320(%[t])\n\t"
        "movq	%%rax, 328(%[t])\n\t"
        "movq	144(%[a]), %%rdx\n\t"
        "xorq	%%r15, %%r15\n\t"
        "movq	144(%[t]), %%r10\n\t"
        "movq	152(%[t]), %%r11\n\t"
        "movq	160(%[t]), %%r12\n\t"
        "movq	168(%[t]), %%r13\n\t"
        "movq	176(%[t]), %%r14\n\t"
        "# A[18] * B[0]\n\t"
        "mulx	0(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "adoxq	%%r9, %%r11\n\t"
        "# A[18] * B[1]\n\t"
        "mulx	8(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "adoxq	%%r9, %%r12\n\t"
        "# A[18] * B[2]\n\t"
        "mulx	16(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "# A[18] * B[3]\n\t"
        "mulx	24(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "movq	%%r10, 144(%[t])\n\t"
        "movq	%%r11, 152(%[t])\n\t"
        "movq	%%r12, 160(%[t])\n\t"
        "movq	%%r13, 168(%[t])\n\t"
        "movq	184(%[t]), %%rax\n\t"
        "movq	192(%[t]), %%r10\n\t"
        "movq	200(%[t]), %%r11\n\t"
        "movq	208(%[t]), %%r12\n\t"
        "# A[18] * B[4]\n\t"
        "mulx	32(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "# A[18] * B[5]\n\t"
        "mulx	40(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "# A[18] * B[6]\n\t"
        "mulx	48(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "adoxq	%%r9, %%r11\n\t"
        "# A[18] * B[7]\n\t"
        "mulx	56(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "adoxq	%%r9, %%r12\n\t"
        "movq	%%r14, 176(%[t])\n\t"
        "movq	%%rax, 184(%[t])\n\t"
        "movq	%%r10, 192(%[t])\n\t"
        "movq	%%r11, 200(%[t])\n\t"
        "movq	216(%[t]), %%r13\n\t"
        "movq	224(%[t]), %%r14\n\t"
        "movq	232(%[t]), %%rax\n\t"
        "movq	240(%[t]), %%r10\n\t"
        "# A[18] * B[8]\n\t"
        "mulx	64(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "# A[18] * B[9]\n\t"
        "mulx	72(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "# A[18] * B[10]\n\t"
        "mulx	80(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "# A[18] * B[11]\n\t"
        "mulx	88(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "movq	%%r12, 208(%[t])\n\t"
        "movq	%%r13, 216(%[t])\n\t"
        "movq	%%r14, 224(%[t])\n\t"
        "movq	%%rax, 232(%[t])\n\t"
        "movq	248(%[t]), %%r11\n\t"
        "movq	256(%[t]), %%r12\n\t"
        "movq	264(%[t]), %%r13\n\t"
        "movq	272(%[t]), %%r14\n\t"
        "# A[18] * B[12]\n\t"
        "mulx	96(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "adoxq	%%r9, %%r11\n\t"
        "# A[18] * B[13]\n\t"
        "mulx	104(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "adoxq	%%r9, %%r12\n\t"
        "# A[18] * B[14]\n\t"
        "mulx	112(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "# A[18] * B[15]\n\t"
        "mulx	120(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "movq	%%r10, 240(%[t])\n\t"
        "movq	%%r11, 248(%[t])\n\t"
        "movq	%%r12, 256(%[t])\n\t"
        "movq	%%r13, 264(%[t])\n\t"
        "movq	280(%[t]), %%rax\n\t"
        "movq	288(%[t]), %%r10\n\t"
        "movq	296(%[t]), %%r11\n\t"
        "movq	304(%[t]), %%r12\n\t"
        "# A[18] * B[16]\n\t"
        "mulx	128(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "# A[18] * B[17]\n\t"
        "mulx	136(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "# A[18] * B[18]\n\t"
        "mulx	144(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "adoxq	%%r9, %%r11\n\t"
        "# A[18] * B[19]\n\t"
        "mulx	152(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "adoxq	%%r9, %%r12\n\t"
        "movq	%%r14, 272(%[t])\n\t"
        "movq	%%rax, 280(%[t])\n\t"
        "movq	%%r10, 288(%[t])\n\t"
        "movq	%%r11, 296(%[t])\n\t"
        "movq	312(%[t]), %%r13\n\t"
        "movq	320(%[t]), %%r14\n\t"
        "movq	328(%[t]), %%rax\n\t"
        "# A[18] * B[20]\n\t"
        "mulx	160(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "# A[18] * B[21]\n\t"
        "mulx	168(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "# A[18] * B[22]\n\t"
        "mulx	176(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "# A[18] * B[23]\n\t"
        "mulx	184(%[b]), %%r8, %%r9\n\t"
        "movq	%%r15, %%r10\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "adcxq	%%rcx, %%r10\n\t"
        "movq	%%r15, %%rcx\n\t"
        "adoxq	%%r15, %%rcx\n\t"
        "adcxq	%%r15, %%rcx\n\t"
        "movq	%%r12, 304(%[t])\n\t"
        "movq	%%r13, 312(%[t])\n\t"
        "movq	%%r14, 320(%[t])\n\t"
        "movq	%%rax, 328(%[t])\n\t"
        "movq	%%r10, 336(%[t])\n\t"
        "movq	152(%[a]), %%rdx\n\t"
        "xorq	%%r15, %%r15\n\t"
        "movq	152(%[t]), %%r11\n\t"
        "movq	160(%[t]), %%r12\n\t"
        "movq	168(%[t]), %%r13\n\t"
        "movq	176(%[t]), %%r14\n\t"
        "movq	184(%[t]), %%rax\n\t"
        "# A[19] * B[0]\n\t"
        "mulx	0(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "adoxq	%%r9, %%r12\n\t"
        "# A[19] * B[1]\n\t"
        "mulx	8(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "# A[19] * B[2]\n\t"
        "mulx	16(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "# A[19] * B[3]\n\t"
        "mulx	24(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "movq	%%r11, 152(%[t])\n\t"
        "movq	%%r12, 160(%[t])\n\t"
        "movq	%%r13, 168(%[t])\n\t"
        "movq	%%r14, 176(%[t])\n\t"
        "movq	192(%[t]), %%r10\n\t"
        "movq	200(%[t]), %%r11\n\t"
        "movq	208(%[t]), %%r12\n\t"
        "movq	216(%[t]), %%r13\n\t"
        "# A[19] * B[4]\n\t"
        "mulx	32(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "# A[19] * B[5]\n\t"
        "mulx	40(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "adoxq	%%r9, %%r11\n\t"
        "# A[19] * B[6]\n\t"
        "mulx	48(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "adoxq	%%r9, %%r12\n\t"
        "# A[19] * B[7]\n\t"
        "mulx	56(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "movq	%%rax, 184(%[t])\n\t"
        "movq	%%r10, 192(%[t])\n\t"
        "movq	%%r11, 200(%[t])\n\t"
        "movq	%%r12, 208(%[t])\n\t"
        "movq	224(%[t]), %%r14\n\t"
        "movq	232(%[t]), %%rax\n\t"
        "movq	240(%[t]), %%r10\n\t"
        "movq	248(%[t]), %%r11\n\t"
        "# A[19] * B[8]\n\t"
        "mulx	64(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "# A[19] * B[9]\n\t"
        "mulx	72(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "# A[19] * B[10]\n\t"
        "mulx	80(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "# A[19] * B[11]\n\t"
        "mulx	88(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "adoxq	%%r9, %%r11\n\t"
        "movq	%%r13, 216(%[t])\n\t"
        "movq	%%r14, 224(%[t])\n\t"
        "movq	%%rax, 232(%[t])\n\t"
        "movq	%%r10, 240(%[t])\n\t"
        "movq	256(%[t]), %%r12\n\t"
        "movq	264(%[t]), %%r13\n\t"
        "movq	272(%[t]), %%r14\n\t"
        "movq	280(%[t]), %%rax\n\t"
        "# A[19] * B[12]\n\t"
        "mulx	96(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "adoxq	%%r9, %%r12\n\t"
        "# A[19] * B[13]\n\t"
        "mulx	104(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "# A[19] * B[14]\n\t"
        "mulx	112(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "# A[19] * B[15]\n\t"
        "mulx	120(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "movq	%%r11, 248(%[t])\n\t"
        "movq	%%r12, 256(%[t])\n\t"
        "movq	%%r13, 264(%[t])\n\t"
        "movq	%%r14, 272(%[t])\n\t"
        "movq	288(%[t]), %%r10\n\t"
        "movq	296(%[t]), %%r11\n\t"
        "movq	304(%[t]), %%r12\n\t"
        "movq	312(%[t]), %%r13\n\t"
        "# A[19] * B[16]\n\t"
        "mulx	128(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "# A[19] * B[17]\n\t"
        "mulx	136(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "adoxq	%%r9, %%r11\n\t"
        "# A[19] * B[18]\n\t"
        "mulx	144(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "adoxq	%%r9, %%r12\n\t"
        "# A[19] * B[19]\n\t"
        "mulx	152(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "movq	%%rax, 280(%[t])\n\t"
        "movq	%%r10, 288(%[t])\n\t"
        "movq	%%r11, 296(%[t])\n\t"
        "movq	%%r12, 304(%[t])\n\t"
        "movq	320(%[t]), %%r14\n\t"
        "movq	328(%[t]), %%rax\n\t"
        "movq	336(%[t]), %%r10\n\t"
        "# A[19] * B[20]\n\t"
        "mulx	160(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "# A[19] * B[21]\n\t"
        "mulx	168(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "# A[19] * B[22]\n\t"
        "mulx	176(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "# A[19] * B[23]\n\t"
        "mulx	184(%[b]), %%r8, %%r9\n\t"
        "movq	%%r15, %%r11\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "adoxq	%%r9, %%r11\n\t"
        "adcxq	%%rcx, %%r11\n\t"
        "movq	%%r15, %%rcx\n\t"
        "adoxq	%%r15, %%rcx\n\t"
        "adcxq	%%r15, %%rcx\n\t"
        "movq	%%r13, 312(%[t])\n\t"
        "movq	%%r14, 320(%[t])\n\t"
        "movq	%%rax, 328(%[t])\n\t"
        "movq	%%r10, 336(%[t])\n\t"
        "movq	%%r11, 344(%[t])\n\t"
        "movq	160(%[a]), %%rdx\n\t"
        "xorq	%%r15, %%r15\n\t"
        "movq	160(%[t]), %%r12\n\t"
        "movq	168(%[t]), %%r13\n\t"
        "movq	176(%[t]), %%r14\n\t"
        "movq	184(%[t]), %%rax\n\t"
        "movq	192(%[t]), %%r10\n\t"
        "# A[20] * B[0]\n\t"
        "mulx	0(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "# A[20] * B[1]\n\t"
        "mulx	8(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "# A[20] * B[2]\n\t"
        "mulx	16(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "# A[20] * B[3]\n\t"
        "mulx	24(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "movq	%%r12, 160(%[t])\n\t"
        "movq	%%r13, 168(%[t])\n\t"
        "movq	%%r14, 176(%[t])\n\t"
        "movq	%%rax, 184(%[t])\n\t"
        "movq	200(%[t]), %%r11\n\t"
        "movq	208(%[t]), %%r12\n\t"
        "movq	216(%[t]), %%r13\n\t"
        "movq	224(%[t]), %%r14\n\t"
        "# A[20] * B[4]\n\t"
        "mulx	32(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "adoxq	%%r9, %%r11\n\t"
        "# A[20] * B[5]\n\t"
        "mulx	40(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "adoxq	%%r9, %%r12\n\t"
        "# A[20] * B[6]\n\t"
        "mulx	48(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "# A[20] * B[7]\n\t"
        "mulx	56(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "movq	%%r10, 192(%[t])\n\t"
        "movq	%%r11, 200(%[t])\n\t"
        "movq	%%r12, 208(%[t])\n\t"
        "movq	%%r13, 216(%[t])\n\t"
        "movq	232(%[t]), %%rax\n\t"
        "movq	240(%[t]), %%r10\n\t"
        "movq	248(%[t]), %%r11\n\t"
        "movq	256(%[t]), %%r12\n\t"
        "# A[20] * B[8]\n\t"
        "mulx	64(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "# A[20] * B[9]\n\t"
        "mulx	72(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "# A[20] * B[10]\n\t"
        "mulx	80(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "adoxq	%%r9, %%r11\n\t"
        "# A[20] * B[11]\n\t"
        "mulx	88(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "adoxq	%%r9, %%r12\n\t"
        "movq	%%r14, 224(%[t])\n\t"
        "movq	%%rax, 232(%[t])\n\t"
        "movq	%%r10, 240(%[t])\n\t"
        "movq	%%r11, 248(%[t])\n\t"
        "movq	264(%[t]), %%r13\n\t"
        "movq	272(%[t]), %%r14\n\t"
        "movq	280(%[t]), %%rax\n\t"
        "movq	288(%[t]), %%r10\n\t"
        "# A[20] * B[12]\n\t"
        "mulx	96(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "# A[20] * B[13]\n\t"
        "mulx	104(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "# A[20] * B[14]\n\t"
        "mulx	112(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "# A[20] * B[15]\n\t"
        "mulx	120(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "movq	%%r12, 256(%[t])\n\t"
        "movq	%%r13, 264(%[t])\n\t"
        "movq	%%r14, 272(%[t])\n\t"
        "movq	%%rax, 280(%[t])\n\t"
        "movq	296(%[t]), %%r11\n\t"
        "movq	304(%[t]), %%r12\n\t"
        "movq	312(%[t]), %%r13\n\t"
        "movq	320(%[t]), %%r14\n\t"
        "# A[20] * B[16]\n\t"
        "mulx	128(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "adoxq	%%r9, %%r11\n\t"
        "# A[20] * B[17]\n\t"
        "mulx	136(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "adoxq	%%r9, %%r12\n\t"
        "# A[20] * B[18]\n\t"
        "mulx	144(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "# A[20] * B[19]\n\t"
        "mulx	152(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "movq	%%r10, 288(%[t])\n\t"
        "movq	%%r11, 296(%[t])\n\t"
        "movq	%%r12, 304(%[t])\n\t"
        "movq	%%r13, 312(%[t])\n\t"
        "movq	328(%[t]), %%rax\n\t"
        "movq	336(%[t]), %%r10\n\t"
        "movq	344(%[t]), %%r11\n\t"
        "# A[20] * B[20]\n\t"
        "mulx	160(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "# A[20] * B[21]\n\t"
        "mulx	168(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "# A[20] * B[22]\n\t"
        "mulx	176(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "adoxq	%%r9, %%r11\n\t"
        "# A[20] * B[23]\n\t"
        "mulx	184(%[b]), %%r8, %%r9\n\t"
        "movq	%%r15, %%r12\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "adoxq	%%r9, %%r12\n\t"
        "adcxq	%%rcx, %%r12\n\t"
        "movq	%%r15, %%rcx\n\t"
        "adoxq	%%r15, %%rcx\n\t"
        "adcxq	%%r15, %%rcx\n\t"
        "movq	%%r14, 320(%[t])\n\t"
        "movq	%%rax, 328(%[t])\n\t"
        "movq	%%r10, 336(%[t])\n\t"
        "movq	%%r11, 344(%[t])\n\t"
        "movq	%%r12, 352(%[t])\n\t"
        "movq	168(%[a]), %%rdx\n\t"
        "xorq	%%r15, %%r15\n\t"
        "movq	168(%[t]), %%r13\n\t"
        "movq	176(%[t]), %%r14\n\t"
        "movq	184(%[t]), %%rax\n\t"
        "movq	192(%[t]), %%r10\n\t"
        "movq	200(%[t]), %%r11\n\t"
        "# A[21] * B[0]\n\t"
        "mulx	0(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "# A[21] * B[1]\n\t"
        "mulx	8(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "# A[21] * B[2]\n\t"
        "mulx	16(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "# A[21] * B[3]\n\t"
        "mulx	24(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "adoxq	%%r9, %%r11\n\t"
        "movq	%%r13, 168(%[t])\n\t"
        "movq	%%r14, 176(%[t])\n\t"
        "movq	%%rax, 184(%[t])\n\t"
        "movq	%%r10, 192(%[t])\n\t"
        "movq	208(%[t]), %%r12\n\t"
        "movq	216(%[t]), %%r13\n\t"
        "movq	224(%[t]), %%r14\n\t"
        "movq	232(%[t]), %%rax\n\t"
        "# A[21] * B[4]\n\t"
        "mulx	32(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "adoxq	%%r9, %%r12\n\t"
        "# A[21] * B[5]\n\t"
        "mulx	40(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "# A[21] * B[6]\n\t"
        "mulx	48(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "# A[21] * B[7]\n\t"
        "mulx	56(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "movq	%%r11, 200(%[t])\n\t"
        "movq	%%r12, 208(%[t])\n\t"
        "movq	%%r13, 216(%[t])\n\t"
        "movq	%%r14, 224(%[t])\n\t"
        "movq	240(%[t]), %%r10\n\t"
        "movq	248(%[t]), %%r11\n\t"
        "movq	256(%[t]), %%r12\n\t"
        "movq	264(%[t]), %%r13\n\t"
        "# A[21] * B[8]\n\t"
        "mulx	64(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "# A[21] * B[9]\n\t"
        "mulx	72(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "adoxq	%%r9, %%r11\n\t"
        "# A[21] * B[10]\n\t"
        "mulx	80(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "adoxq	%%r9, %%r12\n\t"
        "# A[21] * B[11]\n\t"
        "mulx	88(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "movq	%%rax, 232(%[t])\n\t"
        "movq	%%r10, 240(%[t])\n\t"
        "movq	%%r11, 248(%[t])\n\t"
        "movq	%%r12, 256(%[t])\n\t"
        "movq	272(%[t]), %%r14\n\t"
        "movq	280(%[t]), %%rax\n\t"
        "movq	288(%[t]), %%r10\n\t"
        "movq	296(%[t]), %%r11\n\t"
        "# A[21] * B[12]\n\t"
        "mulx	96(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "# A[21] * B[13]\n\t"
        "mulx	104(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "# A[21] * B[14]\n\t"
        "mulx	112(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "# A[21] * B[15]\n\t"
        "mulx	120(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "adoxq	%%r9, %%r11\n\t"
        "movq	%%r13, 264(%[t])\n\t"
        "movq	%%r14, 272(%[t])\n\t"
        "movq	%%rax, 280(%[t])\n\t"
        "movq	%%r10, 288(%[t])\n\t"
        "movq	304(%[t]), %%r12\n\t"
        "movq	312(%[t]), %%r13\n\t"
        "movq	320(%[t]), %%r14\n\t"
        "movq	328(%[t]), %%rax\n\t"
        "# A[21] * B[16]\n\t"
        "mulx	128(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "adoxq	%%r9, %%r12\n\t"
        "# A[21] * B[17]\n\t"
        "mulx	136(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "# A[21] * B[18]\n\t"
        "mulx	144(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "# A[21] * B[19]\n\t"
        "mulx	152(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "movq	%%r11, 296(%[t])\n\t"
        "movq	%%r12, 304(%[t])\n\t"
        "movq	%%r13, 312(%[t])\n\t"
        "movq	%%r14, 320(%[t])\n\t"
        "movq	336(%[t]), %%r10\n\t"
        "movq	344(%[t]), %%r11\n\t"
        "movq	352(%[t]), %%r12\n\t"
        "# A[21] * B[20]\n\t"
        "mulx	160(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "# A[21] * B[21]\n\t"
        "mulx	168(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "adoxq	%%r9, %%r11\n\t"
        "# A[21] * B[22]\n\t"
        "mulx	176(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "adoxq	%%r9, %%r12\n\t"
        "# A[21] * B[23]\n\t"
        "mulx	184(%[b]), %%r8, %%r9\n\t"
        "movq	%%r15, %%r13\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "adcxq	%%rcx, %%r13\n\t"
        "movq	%%r15, %%rcx\n\t"
        "adoxq	%%r15, %%rcx\n\t"
        "adcxq	%%r15, %%rcx\n\t"
        "movq	%%rax, 328(%[t])\n\t"
        "movq	%%r10, 336(%[t])\n\t"
        "movq	%%r11, 344(%[t])\n\t"
        "movq	%%r12, 352(%[t])\n\t"
        "movq	%%r13, 360(%[t])\n\t"
        "movq	176(%[a]), %%rdx\n\t"
        "xorq	%%r15, %%r15\n\t"
        "movq	176(%[t]), %%r14\n\t"
        "movq	184(%[t]), %%rax\n\t"
        "movq	192(%[t]), %%r10\n\t"
        "movq	200(%[t]), %%r11\n\t"
        "movq	208(%[t]), %%r12\n\t"
        "# A[22] * B[0]\n\t"
        "mulx	0(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "# A[22] * B[1]\n\t"
        "mulx	8(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "# A[22] * B[2]\n\t"
        "mulx	16(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "adoxq	%%r9, %%r11\n\t"
        "# A[22] * B[3]\n\t"
        "mulx	24(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "adoxq	%%r9, %%r12\n\t"
        "movq	%%r14, 176(%[t])\n\t"
        "movq	%%rax, 184(%[t])\n\t"
        "movq	%%r10, 192(%[t])\n\t"
        "movq	%%r11, 200(%[t])\n\t"
        "movq	216(%[t]), %%r13\n\t"
        "movq	224(%[t]), %%r14\n\t"
        "movq	232(%[t]), %%rax\n\t"
        "movq	240(%[t]), %%r10\n\t"
        "# A[22] * B[4]\n\t"
        "mulx	32(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "# A[22] * B[5]\n\t"
        "mulx	40(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "# A[22] * B[6]\n\t"
        "mulx	48(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "# A[22] * B[7]\n\t"
        "mulx	56(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "movq	%%r12, 208(%[t])\n\t"
        "movq	%%r13, 216(%[t])\n\t"
        "movq	%%r14, 224(%[t])\n\t"
        "movq	%%rax, 232(%[t])\n\t"
        "movq	248(%[t]), %%r11\n\t"
        "movq	256(%[t]), %%r12\n\t"
        "movq	264(%[t]), %%r13\n\t"
        "movq	272(%[t]), %%r14\n\t"
        "# A[22] * B[8]\n\t"
        "mulx	64(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "adoxq	%%r9, %%r11\n\t"
        "# A[22] * B[9]\n\t"
        "mulx	72(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "adoxq	%%r9, %%r12\n\t"
        "# A[22] * B[10]\n\t"
        "mulx	80(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "# A[22] * B[11]\n\t"
        "mulx	88(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "movq	%%r10, 240(%[t])\n\t"
        "movq	%%r11, 248(%[t])\n\t"
        "movq	%%r12, 256(%[t])\n\t"
        "movq	%%r13, 264(%[t])\n\t"
        "movq	280(%[t]), %%rax\n\t"
        "movq	288(%[t]), %%r10\n\t"
        "movq	296(%[t]), %%r11\n\t"
        "movq	304(%[t]), %%r12\n\t"
        "# A[22] * B[12]\n\t"
        "mulx	96(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "# A[22] * B[13]\n\t"
        "mulx	104(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "# A[22] * B[14]\n\t"
        "mulx	112(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "adoxq	%%r9, %%r11\n\t"
        "# A[22] * B[15]\n\t"
        "mulx	120(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "adoxq	%%r9, %%r12\n\t"
        "movq	%%r14, 272(%[t])\n\t"
        "movq	%%rax, 280(%[t])\n\t"
        "movq	%%r10, 288(%[t])\n\t"
        "movq	%%r11, 296(%[t])\n\t"
        "movq	312(%[t]), %%r13\n\t"
        "movq	320(%[t]), %%r14\n\t"
        "movq	328(%[t]), %%rax\n\t"
        "movq	336(%[t]), %%r10\n\t"
        "# A[22] * B[16]\n\t"
        "mulx	128(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "# A[22] * B[17]\n\t"
        "mulx	136(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "# A[22] * B[18]\n\t"
        "mulx	144(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "# A[22] * B[19]\n\t"
        "mulx	152(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "movq	%%r12, 304(%[t])\n\t"
        "movq	%%r13, 312(%[t])\n\t"
        "movq	%%r14, 320(%[t])\n\t"
        "movq	%%rax, 328(%[t])\n\t"
        "movq	344(%[t]), %%r11\n\t"
        "movq	352(%[t]), %%r12\n\t"
        "movq	360(%[t]), %%r13\n\t"
        "# A[22] * B[20]\n\t"
        "mulx	160(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "adoxq	%%r9, %%r11\n\t"
        "# A[22] * B[21]\n\t"
        "mulx	168(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "adoxq	%%r9, %%r12\n\t"
        "# A[22] * B[22]\n\t"
        "mulx	176(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "# A[22] * B[23]\n\t"
        "mulx	184(%[b]), %%r8, %%r9\n\t"
        "movq	%%r15, %%r14\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "adcxq	%%rcx, %%r14\n\t"
        "movq	%%r15, %%rcx\n\t"
        "adoxq	%%r15, %%rcx\n\t"
        "adcxq	%%r15, %%rcx\n\t"
        "movq	%%r10, 336(%[t])\n\t"
        "movq	%%r11, 344(%[t])\n\t"
        "movq	%%r12, 352(%[t])\n\t"
        "movq	%%r13, 360(%[t])\n\t"
        "movq	%%r14, 368(%[t])\n\t"
        "movq	184(%[a]), %%rdx\n\t"
        "xorq	%%r15, %%r15\n\t"
        "movq	184(%[t]), %%rax\n\t"
        "movq	192(%[t]), %%r10\n\t"
        "movq	200(%[t]), %%r11\n\t"
        "movq	208(%[t]), %%r12\n\t"
        "movq	216(%[t]), %%r13\n\t"
        "# A[23] * B[0]\n\t"
        "mulx	0(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "# A[23] * B[1]\n\t"
        "mulx	8(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "adoxq	%%r9, %%r11\n\t"
        "# A[23] * B[2]\n\t"
        "mulx	16(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "adoxq	%%r9, %%r12\n\t"
        "# A[23] * B[3]\n\t"
        "mulx	24(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "movq	%%rax, 184(%[t])\n\t"
        "movq	%%r10, 192(%[t])\n\t"
        "movq	%%r11, 200(%[t])\n\t"
        "movq	%%r12, 208(%[t])\n\t"
        "movq	224(%[t]), %%r14\n\t"
        "movq	232(%[t]), %%rax\n\t"
        "movq	240(%[t]), %%r10\n\t"
        "movq	248(%[t]), %%r11\n\t"
        "# A[23] * B[4]\n\t"
        "mulx	32(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "# A[23] * B[5]\n\t"
        "mulx	40(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "# A[23] * B[6]\n\t"
        "mulx	48(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "# A[23] * B[7]\n\t"
        "mulx	56(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "adoxq	%%r9, %%r11\n\t"
        "movq	%%r13, 216(%[t])\n\t"
        "movq	%%r14, 224(%[t])\n\t"
        "movq	%%rax, 232(%[t])\n\t"
        "movq	%%r10, 240(%[t])\n\t"
        "movq	256(%[t]), %%r12\n\t"
        "movq	264(%[t]), %%r13\n\t"
        "movq	272(%[t]), %%r14\n\t"
        "movq	280(%[t]), %%rax\n\t"
        "# A[23] * B[8]\n\t"
        "mulx	64(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "adoxq	%%r9, %%r12\n\t"
        "# A[23] * B[9]\n\t"
        "mulx	72(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "# A[23] * B[10]\n\t"
        "mulx	80(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "# A[23] * B[11]\n\t"
        "mulx	88(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "movq	%%r11, 248(%[t])\n\t"
        "movq	%%r12, 256(%[t])\n\t"
        "movq	%%r13, 264(%[t])\n\t"
        "movq	%%r14, 272(%[t])\n\t"
        "movq	288(%[t]), %%r10\n\t"
        "movq	296(%[t]), %%r11\n\t"
        "movq	304(%[t]), %%r12\n\t"
        "movq	312(%[t]), %%r13\n\t"
        "# A[23] * B[12]\n\t"
        "mulx	96(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "# A[23] * B[13]\n\t"
        "mulx	104(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "adoxq	%%r9, %%r11\n\t"
        "# A[23] * B[14]\n\t"
        "mulx	112(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "adoxq	%%r9, %%r12\n\t"
        "# A[23] * B[15]\n\t"
        "mulx	120(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "movq	%%rax, 280(%[t])\n\t"
        "movq	%%r10, 288(%[t])\n\t"
        "movq	%%r11, 296(%[t])\n\t"
        "movq	%%r12, 304(%[t])\n\t"
        "movq	320(%[t]), %%r14\n\t"
        "movq	328(%[t]), %%rax\n\t"
        "movq	336(%[t]), %%r10\n\t"
        "movq	344(%[t]), %%r11\n\t"
        "# A[23] * B[16]\n\t"
        "mulx	128(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "# A[23] * B[17]\n\t"
        "mulx	136(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "# A[23] * B[18]\n\t"
        "mulx	144(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%rax\n\t"
        "adoxq	%%r9, %%r10\n\t"
        "# A[23] * B[19]\n\t"
        "mulx	152(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r10\n\t"
        "adoxq	%%r9, %%r11\n\t"
        "movq	%%r13, 312(%[t])\n\t"
        "movq	%%r14, 320(%[t])\n\t"
        "movq	%%rax, 328(%[t])\n\t"
        "movq	%%r10, 336(%[t])\n\t"
        "movq	352(%[t]), %%r12\n\t"
        "movq	360(%[t]), %%r13\n\t"
        "movq	368(%[t]), %%r14\n\t"
        "# A[23] * B[20]\n\t"
        "mulx	160(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r11\n\t"
        "adoxq	%%r9, %%r12\n\t"
        "# A[23] * B[21]\n\t"
        "mulx	168(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r12\n\t"
        "adoxq	%%r9, %%r13\n\t"
        "# A[23] * B[22]\n\t"
        "mulx	176(%[b]), %%r8, %%r9\n\t"
        "adcxq	%%r8, %%r13\n\t"
        "adoxq	%%r9, %%r14\n\t"
        "# A[23] * B[23]\n\t"
        "mulx	184(%[b]), %%r8, %%r9\n\t"
        "movq	%%r15, %%rax\n\t"
        "adcxq	%%r8, %%r14\n\t"
        "adoxq	%%r9, %%rax\n\t"
        "adcxq	%%rcx, %%rax\n\t"
        "movq	%%r11, 344(%[t])\n\t"
        "movq	%%r12, 352(%[t])\n\t"
        "movq	%%r13, 360(%[t])\n\t"
        "movq	%%r14, 368(%[t])\n\t"
        "movq	%%rax, 376(%[t])\n\t"
        :
        : [a] "r" (a), [b] "r" (b), [t] "r" (tmp)
        : "memory", "rax", "rdx", "rcx",
          "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15"
    );

    XMEMCPY(r, tmp, sizeof(tmp));
}
#endif /* HAVE_INTEL_AVX2 */

#ifdef HAVE_INTEL_AVX2
/* Square a and put the result in r. (r = a * a)
 *
 * r  A single precision integer that receives the result.
 * a  A single precision integer to square.
 */
SP_NOINLINE static void sp_3072_sqr_avx2_24(sp_digit* r, const sp_digit* a)
{
    sp_digit tmp[48];

    __asm__ __volatile__ (
        "xorq	%%r8, %%r8\n\t"
        "# Diagonal 1\n\t"
        "xorq	%%r10, %%r10\n\t"
        "xorq	%%r11, %%r11\n\t"
        "xorq	%%r12, %%r12\n\t"
        "xorq	%%r13, %%r13\n\t"
        "xorq	%%r14, %%r14\n\t"
        "xorq	%%r15, %%r15\n\t"
        "# A[1] x A[0]\n\t"
        "movq	0(%[a]), %%rdx\n\t"
        "mulxq	8(%[a]), %%r10, %%r11\n\t"
        "# A[2] x A[0]\n\t"
        "mulxq	16(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r11\n\t"
        "adoxq	%%rcx, %%r12\n\t"
        "# A[3] x A[0]\n\t"
        "mulxq	24(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r12\n\t"
        "adoxq	%%rcx, %%r13\n\t"
        "# A[4] x A[0]\n\t"
        "mulxq	32(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r13\n\t"
        "adoxq	%%rcx, %%r14\n\t"
        "# A[5] x A[0]\n\t"
        "mulxq	40(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r14\n\t"
        "adoxq	%%rcx, %%r15\n\t"
        "movq	%%r10, 8(%[tmp])\n\t"
        "movq	%%r11, 16(%[tmp])\n\t"
        "movq	%%r12, 24(%[tmp])\n\t"
        "movq	%%r13, 32(%[tmp])\n\t"
        "movq	%%r14, 40(%[tmp])\n\t"
        "movq	%%r8, %%r10\n\t"
        "movq	%%r8, %%r11\n\t"
        "movq	%%r8, %%r12\n\t"
        "movq	%%r8, %%r13\n\t"
        "movq	%%r8, %%r14\n\t"
        "# A[6] x A[0]\n\t"
        "mulxq	48(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r15\n\t"
        "adoxq	%%rcx, %%r10\n\t"
        "# A[7] x A[0]\n\t"
        "mulxq	56(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r10\n\t"
        "adoxq	%%rcx, %%r11\n\t"
        "# A[8] x A[0]\n\t"
        "mulxq	64(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r11\n\t"
        "adoxq	%%rcx, %%r12\n\t"
        "# A[9] x A[0]\n\t"
        "mulxq	72(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r12\n\t"
        "adoxq	%%rcx, %%r13\n\t"
        "# A[10] x A[0]\n\t"
        "mulxq	80(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r13\n\t"
        "adoxq	%%rcx, %%r14\n\t"
        "movq	%%r15, 48(%[tmp])\n\t"
        "movq	%%r10, 56(%[tmp])\n\t"
        "movq	%%r11, 64(%[tmp])\n\t"
        "movq	%%r12, 72(%[tmp])\n\t"
        "movq	%%r13, 80(%[tmp])\n\t"
        "movq	%%r8, %%r15\n\t"
        "movq	%%r8, %%r10\n\t"
        "movq	%%r8, %%r11\n\t"
        "movq	%%r8, %%r12\n\t"
        "movq	%%r8, %%r13\n\t"
        "# A[11] x A[0]\n\t"
        "mulxq	88(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r14\n\t"
        "adoxq	%%rcx, %%r15\n\t"
        "# A[12] x A[0]\n\t"
        "mulxq	96(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r15\n\t"
        "adoxq	%%rcx, %%r10\n\t"
        "# A[13] x A[0]\n\t"
        "mulxq	104(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r10\n\t"
        "adoxq	%%rcx, %%r11\n\t"
        "# A[14] x A[0]\n\t"
        "mulxq	112(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r11\n\t"
        "adoxq	%%rcx, %%r12\n\t"
        "# A[15] x A[0]\n\t"
        "mulxq	120(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r12\n\t"
        "adoxq	%%rcx, %%r13\n\t"
        "movq	%%r14, 88(%[tmp])\n\t"
        "movq	%%r15, 96(%[tmp])\n\t"
        "movq	%%r10, 104(%[tmp])\n\t"
        "movq	%%r11, 112(%[tmp])\n\t"
        "movq	%%r12, 120(%[tmp])\n\t"
        "movq	%%r8, %%r14\n\t"
        "movq	%%r8, %%r15\n\t"
        "movq	%%r8, %%r10\n\t"
        "movq	%%r8, %%r11\n\t"
        "movq	%%r8, %%r12\n\t"
        "# A[16] x A[0]\n\t"
        "mulxq	128(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r13\n\t"
        "adoxq	%%rcx, %%r14\n\t"
        "# A[17] x A[0]\n\t"
        "mulxq	136(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r14\n\t"
        "adoxq	%%rcx, %%r15\n\t"
        "# A[18] x A[0]\n\t"
        "mulxq	144(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r15\n\t"
        "adoxq	%%rcx, %%r10\n\t"
        "# A[19] x A[0]\n\t"
        "mulxq	152(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r10\n\t"
        "adoxq	%%rcx, %%r11\n\t"
        "# A[20] x A[0]\n\t"
        "mulxq	160(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r11\n\t"
        "adoxq	%%rcx, %%r12\n\t"
        "movq	%%r13, 128(%[tmp])\n\t"
        "movq	%%r14, 136(%[tmp])\n\t"
        "movq	%%r15, 144(%[tmp])\n\t"
        "movq	%%r10, 152(%[tmp])\n\t"
        "movq	%%r11, 160(%[tmp])\n\t"
        "movq	%%r8, %%r13\n\t"
        "movq	%%r8, %%r14\n\t"
        "movq	%%r8, %%r15\n\t"
        "# A[21] x A[0]\n\t"
        "mulxq	168(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r12\n\t"
        "adoxq	%%rcx, %%r13\n\t"
        "# A[22] x A[0]\n\t"
        "mulxq	176(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r13\n\t"
        "adoxq	%%rcx, %%r14\n\t"
        "# A[23] x A[0]\n\t"
        "mulxq	184(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r14\n\t"
        "adoxq	%%rcx, %%r15\n\t"
        "movq	%%r12, 168(%[tmp])\n\t"
        "movq	%%r13, 176(%[tmp])\n\t"
        "movq	%%r14, 184(%[tmp])\n\t"
        "#  Carry\n\t"
        "adcxq	%%r8, %%r15\n\t"
        "movq	%%r8, %%r9\n\t"
        "adcxq	%%r8, %%r9\n\t"
        "adoxq	%%r8, %%r9\n\t"
        "movq	%%r15, 192(%[tmp])\n\t"
        "xorq	%%r8, %%r8\n\t"
        "# Diagonal 2\n\t"
        "movq	24(%[tmp]), %%r15\n\t"
        "movq	32(%[tmp]), %%r10\n\t"
        "movq	40(%[tmp]), %%r11\n\t"
        "movq	48(%[tmp]), %%r12\n\t"
        "movq	56(%[tmp]), %%r13\n\t"
        "movq	64(%[tmp]), %%r14\n\t"
        "# A[2] x A[1]\n\t"
        "movq	8(%[a]), %%rdx\n\t"
        "mulxq	16(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r15\n\t"
        "adoxq	%%rcx, %%r10\n\t"
        "# A[3] x A[1]\n\t"
        "mulxq	24(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r10\n\t"
        "adoxq	%%rcx, %%r11\n\t"
        "# A[4] x A[1]\n\t"
        "mulxq	32(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r11\n\t"
        "adoxq	%%rcx, %%r12\n\t"
        "# A[5] x A[1]\n\t"
        "mulxq	40(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r12\n\t"
        "adoxq	%%rcx, %%r13\n\t"
        "# A[6] x A[1]\n\t"
        "mulxq	48(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r13\n\t"
        "adoxq	%%rcx, %%r14\n\t"
        "movq	%%r15, 24(%[tmp])\n\t"
        "movq	%%r10, 32(%[tmp])\n\t"
        "movq	%%r11, 40(%[tmp])\n\t"
        "movq	%%r12, 48(%[tmp])\n\t"
        "movq	%%r13, 56(%[tmp])\n\t"
        "movq	72(%[tmp]), %%r15\n\t"
        "movq	80(%[tmp]), %%r10\n\t"
        "movq	88(%[tmp]), %%r11\n\t"
        "movq	96(%[tmp]), %%r12\n\t"
        "movq	104(%[tmp]), %%r13\n\t"
        "# A[7] x A[1]\n\t"
        "mulxq	56(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r14\n\t"
        "adoxq	%%rcx, %%r15\n\t"
        "# A[8] x A[1]\n\t"
        "mulxq	64(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r15\n\t"
        "adoxq	%%rcx, %%r10\n\t"
        "# A[9] x A[1]\n\t"
        "mulxq	72(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r10\n\t"
        "adoxq	%%rcx, %%r11\n\t"
        "# A[10] x A[1]\n\t"
        "mulxq	80(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r11\n\t"
        "adoxq	%%rcx, %%r12\n\t"
        "# A[11] x A[1]\n\t"
        "mulxq	88(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r12\n\t"
        "adoxq	%%rcx, %%r13\n\t"
        "movq	%%r14, 64(%[tmp])\n\t"
        "movq	%%r15, 72(%[tmp])\n\t"
        "movq	%%r10, 80(%[tmp])\n\t"
        "movq	%%r11, 88(%[tmp])\n\t"
        "movq	%%r12, 96(%[tmp])\n\t"
        "movq	112(%[tmp]), %%r14\n\t"
        "movq	120(%[tmp]), %%r15\n\t"
        "movq	128(%[tmp]), %%r10\n\t"
        "movq	136(%[tmp]), %%r11\n\t"
        "movq	144(%[tmp]), %%r12\n\t"
        "# A[12] x A[1]\n\t"
        "mulxq	96(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r13\n\t"
        "adoxq	%%rcx, %%r14\n\t"
        "# A[13] x A[1]\n\t"
        "mulxq	104(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r14\n\t"
        "adoxq	%%rcx, %%r15\n\t"
        "# A[14] x A[1]\n\t"
        "mulxq	112(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r15\n\t"
        "adoxq	%%rcx, %%r10\n\t"
        "# A[15] x A[1]\n\t"
        "mulxq	120(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r10\n\t"
        "adoxq	%%rcx, %%r11\n\t"
        "# A[16] x A[1]\n\t"
        "mulxq	128(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r11\n\t"
        "adoxq	%%rcx, %%r12\n\t"
        "movq	%%r13, 104(%[tmp])\n\t"
        "movq	%%r14, 112(%[tmp])\n\t"
        "movq	%%r15, 120(%[tmp])\n\t"
        "movq	%%r10, 128(%[tmp])\n\t"
        "movq	%%r11, 136(%[tmp])\n\t"
        "movq	152(%[tmp]), %%r13\n\t"
        "movq	160(%[tmp]), %%r14\n\t"
        "movq	168(%[tmp]), %%r15\n\t"
        "movq	176(%[tmp]), %%r10\n\t"
        "movq	184(%[tmp]), %%r11\n\t"
        "# A[17] x A[1]\n\t"
        "mulxq	136(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r12\n\t"
        "adoxq	%%rcx, %%r13\n\t"
        "# A[18] x A[1]\n\t"
        "mulxq	144(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r13\n\t"
        "adoxq	%%rcx, %%r14\n\t"
        "# A[19] x A[1]\n\t"
        "mulxq	152(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r14\n\t"
        "adoxq	%%rcx, %%r15\n\t"
        "# A[20] x A[1]\n\t"
        "mulxq	160(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r15\n\t"
        "adoxq	%%rcx, %%r10\n\t"
        "# A[21] x A[1]\n\t"
        "mulxq	168(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r10\n\t"
        "adoxq	%%rcx, %%r11\n\t"
        "movq	%%r12, 144(%[tmp])\n\t"
        "movq	%%r13, 152(%[tmp])\n\t"
        "movq	%%r14, 160(%[tmp])\n\t"
        "movq	%%r15, 168(%[tmp])\n\t"
        "movq	%%r10, 176(%[tmp])\n\t"
        "movq	192(%[tmp]), %%r12\n\t"
        "movq	%%r8, %%r13\n\t"
        "movq	%%r8, %%r14\n\t"
        "# A[22] x A[1]\n\t"
        "mulxq	176(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r11\n\t"
        "adoxq	%%rcx, %%r12\n\t"
        "# A[23] x A[1]\n\t"
        "mulxq	184(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r12\n\t"
        "adoxq	%%rcx, %%r13\n\t"
        "# A[23] x A[2]\n\t"
        "movq	16(%[a]), %%rdx\n\t"
        "mulxq	184(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r13\n\t"
        "adoxq	%%rcx, %%r14\n\t"
        "movq	%%r11, 184(%[tmp])\n\t"
        "movq	%%r12, 192(%[tmp])\n\t"
        "movq	%%r13, 200(%[tmp])\n\t"
        "#  Carry\n\t"
        "adcxq	%%r9, %%r14\n\t"
        "movq	%%r8, %%r9\n\t"
        "adcxq	%%r8, %%r9\n\t"
        "adoxq	%%r8, %%r9\n\t"
        "movq	%%r14, 208(%[tmp])\n\t"
        "xorq	%%r8, %%r8\n\t"
        "# Diagonal 3\n\t"
        "movq	40(%[tmp]), %%r14\n\t"
        "movq	48(%[tmp]), %%r15\n\t"
        "movq	56(%[tmp]), %%r10\n\t"
        "movq	64(%[tmp]), %%r11\n\t"
        "movq	72(%[tmp]), %%r12\n\t"
        "movq	80(%[tmp]), %%r13\n\t"
        "# A[3] x A[2]\n\t"
        "mulxq	24(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r14\n\t"
        "adoxq	%%rcx, %%r15\n\t"
        "# A[4] x A[2]\n\t"
        "mulxq	32(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r15\n\t"
        "adoxq	%%rcx, %%r10\n\t"
        "# A[5] x A[2]\n\t"
        "mulxq	40(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r10\n\t"
        "adoxq	%%rcx, %%r11\n\t"
        "# A[6] x A[2]\n\t"
        "mulxq	48(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r11\n\t"
        "adoxq	%%rcx, %%r12\n\t"
        "# A[7] x A[2]\n\t"
        "mulxq	56(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r12\n\t"
        "adoxq	%%rcx, %%r13\n\t"
        "movq	%%r14, 40(%[tmp])\n\t"
        "movq	%%r15, 48(%[tmp])\n\t"
        "movq	%%r10, 56(%[tmp])\n\t"
        "movq	%%r11, 64(%[tmp])\n\t"
        "movq	%%r12, 72(%[tmp])\n\t"
        "movq	88(%[tmp]), %%r14\n\t"
        "movq	96(%[tmp]), %%r15\n\t"
        "movq	104(%[tmp]), %%r10\n\t"
        "movq	112(%[tmp]), %%r11\n\t"
        "movq	120(%[tmp]), %%r12\n\t"
        "# A[8] x A[2]\n\t"
        "mulxq	64(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r13\n\t"
        "adoxq	%%rcx, %%r14\n\t"
        "# A[9] x A[2]\n\t"
        "mulxq	72(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r14\n\t"
        "adoxq	%%rcx, %%r15\n\t"
        "# A[10] x A[2]\n\t"
        "mulxq	80(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r15\n\t"
        "adoxq	%%rcx, %%r10\n\t"
        "# A[11] x A[2]\n\t"
        "mulxq	88(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r10\n\t"
        "adoxq	%%rcx, %%r11\n\t"
        "# A[12] x A[2]\n\t"
        "mulxq	96(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r11\n\t"
        "adoxq	%%rcx, %%r12\n\t"
        "movq	%%r13, 80(%[tmp])\n\t"
        "movq	%%r14, 88(%[tmp])\n\t"
        "movq	%%r15, 96(%[tmp])\n\t"
        "movq	%%r10, 104(%[tmp])\n\t"
        "movq	%%r11, 112(%[tmp])\n\t"
        "movq	128(%[tmp]), %%r13\n\t"
        "movq	136(%[tmp]), %%r14\n\t"
        "movq	144(%[tmp]), %%r15\n\t"
        "movq	152(%[tmp]), %%r10\n\t"
        "movq	160(%[tmp]), %%r11\n\t"
        "# A[13] x A[2]\n\t"
        "mulxq	104(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r12\n\t"
        "adoxq	%%rcx, %%r13\n\t"
        "# A[14] x A[2]\n\t"
        "mulxq	112(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r13\n\t"
        "adoxq	%%rcx, %%r14\n\t"
        "# A[15] x A[2]\n\t"
        "mulxq	120(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r14\n\t"
        "adoxq	%%rcx, %%r15\n\t"
        "# A[16] x A[2]\n\t"
        "mulxq	128(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r15\n\t"
        "adoxq	%%rcx, %%r10\n\t"
        "# A[17] x A[2]\n\t"
        "mulxq	136(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r10\n\t"
        "adoxq	%%rcx, %%r11\n\t"
        "movq	%%r12, 120(%[tmp])\n\t"
        "movq	%%r13, 128(%[tmp])\n\t"
        "movq	%%r14, 136(%[tmp])\n\t"
        "movq	%%r15, 144(%[tmp])\n\t"
        "movq	%%r10, 152(%[tmp])\n\t"
        "movq	168(%[tmp]), %%r12\n\t"
        "movq	176(%[tmp]), %%r13\n\t"
        "movq	184(%[tmp]), %%r14\n\t"
        "movq	192(%[tmp]), %%r15\n\t"
        "movq	200(%[tmp]), %%r10\n\t"
        "# A[18] x A[2]\n\t"
        "mulxq	144(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r11\n\t"
        "adoxq	%%rcx, %%r12\n\t"
        "# A[19] x A[2]\n\t"
        "mulxq	152(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r12\n\t"
        "adoxq	%%rcx, %%r13\n\t"
        "# A[20] x A[2]\n\t"
        "mulxq	160(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r13\n\t"
        "adoxq	%%rcx, %%r14\n\t"
        "# A[21] x A[2]\n\t"
        "mulxq	168(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r14\n\t"
        "adoxq	%%rcx, %%r15\n\t"
        "# A[22] x A[2]\n\t"
        "mulxq	176(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r15\n\t"
        "adoxq	%%rcx, %%r10\n\t"
        "movq	%%r11, 160(%[tmp])\n\t"
        "movq	%%r12, 168(%[tmp])\n\t"
        "movq	%%r13, 176(%[tmp])\n\t"
        "movq	%%r14, 184(%[tmp])\n\t"
        "movq	%%r15, 192(%[tmp])\n\t"
        "movq	208(%[tmp]), %%r11\n\t"
        "movq	%%r8, %%r12\n\t"
        "movq	%%r8, %%r13\n\t"
        "# A[22] x A[3]\n\t"
        "movq	176(%[a]), %%rdx\n\t"
        "mulxq	24(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r10\n\t"
        "adoxq	%%rcx, %%r11\n\t"
        "# A[22] x A[4]\n\t"
        "mulxq	32(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r11\n\t"
        "adoxq	%%rcx, %%r12\n\t"
        "# A[22] x A[5]\n\t"
        "mulxq	40(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r12\n\t"
        "adoxq	%%rcx, %%r13\n\t"
        "movq	%%r10, 200(%[tmp])\n\t"
        "movq	%%r11, 208(%[tmp])\n\t"
        "movq	%%r12, 216(%[tmp])\n\t"
        "#  Carry\n\t"
        "adcxq	%%r9, %%r13\n\t"
        "movq	%%r8, %%r9\n\t"
        "adcxq	%%r8, %%r9\n\t"
        "adoxq	%%r8, %%r9\n\t"
        "movq	%%r13, 224(%[tmp])\n\t"
        "xorq	%%r8, %%r8\n\t"
        "# Diagonal 4\n\t"
        "movq	56(%[tmp]), %%r13\n\t"
        "movq	64(%[tmp]), %%r14\n\t"
        "movq	72(%[tmp]), %%r15\n\t"
        "movq	80(%[tmp]), %%r10\n\t"
        "movq	88(%[tmp]), %%r11\n\t"
        "movq	96(%[tmp]), %%r12\n\t"
        "# A[4] x A[3]\n\t"
        "movq	24(%[a]), %%rdx\n\t"
        "mulxq	32(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r13\n\t"
        "adoxq	%%rcx, %%r14\n\t"
        "# A[5] x A[3]\n\t"
        "mulxq	40(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r14\n\t"
        "adoxq	%%rcx, %%r15\n\t"
        "# A[6] x A[3]\n\t"
        "mulxq	48(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r15\n\t"
        "adoxq	%%rcx, %%r10\n\t"
        "# A[7] x A[3]\n\t"
        "mulxq	56(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r10\n\t"
        "adoxq	%%rcx, %%r11\n\t"
        "# A[8] x A[3]\n\t"
        "mulxq	64(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r11\n\t"
        "adoxq	%%rcx, %%r12\n\t"
        "movq	%%r13, 56(%[tmp])\n\t"
        "movq	%%r14, 64(%[tmp])\n\t"
        "movq	%%r15, 72(%[tmp])\n\t"
        "movq	%%r10, 80(%[tmp])\n\t"
        "movq	%%r11, 88(%[tmp])\n\t"
        "movq	104(%[tmp]), %%r13\n\t"
        "movq	112(%[tmp]), %%r14\n\t"
        "movq	120(%[tmp]), %%r15\n\t"
        "movq	128(%[tmp]), %%r10\n\t"
        "movq	136(%[tmp]), %%r11\n\t"
        "# A[9] x A[3]\n\t"
        "mulxq	72(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r12\n\t"
        "adoxq	%%rcx, %%r13\n\t"
        "# A[10] x A[3]\n\t"
        "mulxq	80(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r13\n\t"
        "adoxq	%%rcx, %%r14\n\t"
        "# A[11] x A[3]\n\t"
        "mulxq	88(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r14\n\t"
        "adoxq	%%rcx, %%r15\n\t"
        "# A[12] x A[3]\n\t"
        "mulxq	96(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r15\n\t"
        "adoxq	%%rcx, %%r10\n\t"
        "# A[13] x A[3]\n\t"
        "mulxq	104(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r10\n\t"
        "adoxq	%%rcx, %%r11\n\t"
        "movq	%%r12, 96(%[tmp])\n\t"
        "movq	%%r13, 104(%[tmp])\n\t"
        "movq	%%r14, 112(%[tmp])\n\t"
        "movq	%%r15, 120(%[tmp])\n\t"
        "movq	%%r10, 128(%[tmp])\n\t"
        "movq	144(%[tmp]), %%r12\n\t"
        "movq	152(%[tmp]), %%r13\n\t"
        "movq	160(%[tmp]), %%r14\n\t"
        "movq	168(%[tmp]), %%r15\n\t"
        "movq	176(%[tmp]), %%r10\n\t"
        "# A[14] x A[3]\n\t"
        "mulxq	112(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r11\n\t"
        "adoxq	%%rcx, %%r12\n\t"
        "# A[15] x A[3]\n\t"
        "mulxq	120(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r12\n\t"
        "adoxq	%%rcx, %%r13\n\t"
        "# A[16] x A[3]\n\t"
        "mulxq	128(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r13\n\t"
        "adoxq	%%rcx, %%r14\n\t"
        "# A[17] x A[3]\n\t"
        "mulxq	136(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r14\n\t"
        "adoxq	%%rcx, %%r15\n\t"
        "# A[18] x A[3]\n\t"
        "mulxq	144(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r15\n\t"
        "adoxq	%%rcx, %%r10\n\t"
        "movq	%%r11, 136(%[tmp])\n\t"
        "movq	%%r12, 144(%[tmp])\n\t"
        "movq	%%r13, 152(%[tmp])\n\t"
        "movq	%%r14, 160(%[tmp])\n\t"
        "movq	%%r15, 168(%[tmp])\n\t"
        "movq	184(%[tmp]), %%r11\n\t"
        "movq	192(%[tmp]), %%r12\n\t"
        "movq	200(%[tmp]), %%r13\n\t"
        "movq	208(%[tmp]), %%r14\n\t"
        "movq	216(%[tmp]), %%r15\n\t"
        "# A[19] x A[3]\n\t"
        "mulxq	152(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r10\n\t"
        "adoxq	%%rcx, %%r11\n\t"
        "# A[20] x A[3]\n\t"
        "mulxq	160(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r11\n\t"
        "adoxq	%%rcx, %%r12\n\t"
        "# A[21] x A[3]\n\t"
        "mulxq	168(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r12\n\t"
        "adoxq	%%rcx, %%r13\n\t"
        "# A[21] x A[4]\n\t"
        "movq	168(%[a]), %%rdx\n\t"
        "mulxq	32(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r13\n\t"
        "adoxq	%%rcx, %%r14\n\t"
        "# A[21] x A[5]\n\t"
        "mulxq	40(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r14\n\t"
        "adoxq	%%rcx, %%r15\n\t"
        "movq	%%r10, 176(%[tmp])\n\t"
        "movq	%%r11, 184(%[tmp])\n\t"
        "movq	%%r12, 192(%[tmp])\n\t"
        "movq	%%r13, 200(%[tmp])\n\t"
        "movq	%%r14, 208(%[tmp])\n\t"
        "movq	224(%[tmp]), %%r10\n\t"
        "movq	%%r8, %%r11\n\t"
        "movq	%%r8, %%r12\n\t"
        "# A[21] x A[6]\n\t"
        "mulxq	48(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r15\n\t"
        "adoxq	%%rcx, %%r10\n\t"
        "# A[21] x A[7]\n\t"
        "mulxq	56(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r10\n\t"
        "adoxq	%%rcx, %%r11\n\t"
        "# A[21] x A[8]\n\t"
        "mulxq	64(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r11\n\t"
        "adoxq	%%rcx, %%r12\n\t"
        "movq	%%r15, 216(%[tmp])\n\t"
        "movq	%%r10, 224(%[tmp])\n\t"
        "movq	%%r11, 232(%[tmp])\n\t"
        "#  Carry\n\t"
        "adcxq	%%r9, %%r12\n\t"
        "movq	%%r8, %%r9\n\t"
        "adcxq	%%r8, %%r9\n\t"
        "adoxq	%%r8, %%r9\n\t"
        "movq	%%r12, 240(%[tmp])\n\t"
        "xorq	%%r8, %%r8\n\t"
        "# Diagonal 5\n\t"
        "movq	72(%[tmp]), %%r12\n\t"
        "movq	80(%[tmp]), %%r13\n\t"
        "movq	88(%[tmp]), %%r14\n\t"
        "movq	96(%[tmp]), %%r15\n\t"
        "movq	104(%[tmp]), %%r10\n\t"
        "movq	112(%[tmp]), %%r11\n\t"
        "# A[5] x A[4]\n\t"
        "movq	32(%[a]), %%rdx\n\t"
        "mulxq	40(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r12\n\t"
        "adoxq	%%rcx, %%r13\n\t"
        "# A[6] x A[4]\n\t"
        "mulxq	48(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r13\n\t"
        "adoxq	%%rcx, %%r14\n\t"
        "# A[7] x A[4]\n\t"
        "mulxq	56(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r14\n\t"
        "adoxq	%%rcx, %%r15\n\t"
        "# A[8] x A[4]\n\t"
        "mulxq	64(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r15\n\t"
        "adoxq	%%rcx, %%r10\n\t"
        "# A[9] x A[4]\n\t"
        "mulxq	72(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r10\n\t"
        "adoxq	%%rcx, %%r11\n\t"
        "movq	%%r12, 72(%[tmp])\n\t"
        "movq	%%r13, 80(%[tmp])\n\t"
        "movq	%%r14, 88(%[tmp])\n\t"
        "movq	%%r15, 96(%[tmp])\n\t"
        "movq	%%r10, 104(%[tmp])\n\t"
        "movq	120(%[tmp]), %%r12\n\t"
        "movq	128(%[tmp]), %%r13\n\t"
        "movq	136(%[tmp]), %%r14\n\t"
        "movq	144(%[tmp]), %%r15\n\t"
        "movq	152(%[tmp]), %%r10\n\t"
        "# A[10] x A[4]\n\t"
        "mulxq	80(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r11\n\t"
        "adoxq	%%rcx, %%r12\n\t"
        "# A[11] x A[4]\n\t"
        "mulxq	88(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r12\n\t"
        "adoxq	%%rcx, %%r13\n\t"
        "# A[12] x A[4]\n\t"
        "mulxq	96(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r13\n\t"
        "adoxq	%%rcx, %%r14\n\t"
        "# A[13] x A[4]\n\t"
        "mulxq	104(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r14\n\t"
        "adoxq	%%rcx, %%r15\n\t"
        "# A[14] x A[4]\n\t"
        "mulxq	112(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r15\n\t"
        "adoxq	%%rcx, %%r10\n\t"
        "movq	%%r11, 112(%[tmp])\n\t"
        "movq	%%r12, 120(%[tmp])\n\t"
        "movq	%%r13, 128(%[tmp])\n\t"
        "movq	%%r14, 136(%[tmp])\n\t"
        "movq	%%r15, 144(%[tmp])\n\t"
        "movq	160(%[tmp]), %%r11\n\t"
        "movq	168(%[tmp]), %%r12\n\t"
        "movq	176(%[tmp]), %%r13\n\t"
        "movq	184(%[tmp]), %%r14\n\t"
        "movq	192(%[tmp]), %%r15\n\t"
        "# A[15] x A[4]\n\t"
        "mulxq	120(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r10\n\t"
        "adoxq	%%rcx, %%r11\n\t"
        "# A[16] x A[4]\n\t"
        "mulxq	128(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r11\n\t"
        "adoxq	%%rcx, %%r12\n\t"
        "# A[17] x A[4]\n\t"
        "mulxq	136(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r12\n\t"
        "adoxq	%%rcx, %%r13\n\t"
        "# A[18] x A[4]\n\t"
        "mulxq	144(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r13\n\t"
        "adoxq	%%rcx, %%r14\n\t"
        "# A[19] x A[4]\n\t"
        "mulxq	152(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r14\n\t"
        "adoxq	%%rcx, %%r15\n\t"
        "movq	%%r10, 152(%[tmp])\n\t"
        "movq	%%r11, 160(%[tmp])\n\t"
        "movq	%%r12, 168(%[tmp])\n\t"
        "movq	%%r13, 176(%[tmp])\n\t"
        "movq	%%r14, 184(%[tmp])\n\t"
        "movq	200(%[tmp]), %%r10\n\t"
        "movq	208(%[tmp]), %%r11\n\t"
        "movq	216(%[tmp]), %%r12\n\t"
        "movq	224(%[tmp]), %%r13\n\t"
        "movq	232(%[tmp]), %%r14\n\t"
        "# A[20] x A[4]\n\t"
        "mulxq	160(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r15\n\t"
        "adoxq	%%rcx, %%r10\n\t"
        "# A[20] x A[5]\n\t"
        "movq	160(%[a]), %%rdx\n\t"
        "mulxq	40(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r10\n\t"
        "adoxq	%%rcx, %%r11\n\t"
        "# A[20] x A[6]\n\t"
        "mulxq	48(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r11\n\t"
        "adoxq	%%rcx, %%r12\n\t"
        "# A[20] x A[7]\n\t"
        "mulxq	56(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r12\n\t"
        "adoxq	%%rcx, %%r13\n\t"
        "# A[20] x A[8]\n\t"
        "mulxq	64(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r13\n\t"
        "adoxq	%%rcx, %%r14\n\t"
        "movq	%%r15, 192(%[tmp])\n\t"
        "movq	%%r10, 200(%[tmp])\n\t"
        "movq	%%r11, 208(%[tmp])\n\t"
        "movq	%%r12, 216(%[tmp])\n\t"
        "movq	%%r13, 224(%[tmp])\n\t"
        "movq	240(%[tmp]), %%r15\n\t"
        "movq	%%r8, %%r10\n\t"
        "movq	%%r8, %%r11\n\t"
        "# A[20] x A[9]\n\t"
        "mulxq	72(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r14\n\t"
        "adoxq	%%rcx, %%r15\n\t"
        "# A[20] x A[10]\n\t"
        "mulxq	80(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r15\n\t"
        "adoxq	%%rcx, %%r10\n\t"
        "# A[20] x A[11]\n\t"
        "mulxq	88(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r10\n\t"
        "adoxq	%%rcx, %%r11\n\t"
        "movq	%%r14, 232(%[tmp])\n\t"
        "movq	%%r15, 240(%[tmp])\n\t"
        "movq	%%r10, 248(%[tmp])\n\t"
        "#  Carry\n\t"
        "adcxq	%%r9, %%r11\n\t"
        "movq	%%r8, %%r9\n\t"
        "adcxq	%%r8, %%r9\n\t"
        "adoxq	%%r8, %%r9\n\t"
        "movq	%%r11, 256(%[tmp])\n\t"
        "xorq	%%r8, %%r8\n\t"
        "# Diagonal 6\n\t"
        "movq	88(%[tmp]), %%r11\n\t"
        "movq	96(%[tmp]), %%r12\n\t"
        "movq	104(%[tmp]), %%r13\n\t"
        "movq	112(%[tmp]), %%r14\n\t"
        "movq	120(%[tmp]), %%r15\n\t"
        "movq	128(%[tmp]), %%r10\n\t"
        "# A[6] x A[5]\n\t"
        "movq	40(%[a]), %%rdx\n\t"
        "mulxq	48(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r11\n\t"
        "adoxq	%%rcx, %%r12\n\t"
        "# A[7] x A[5]\n\t"
        "mulxq	56(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r12\n\t"
        "adoxq	%%rcx, %%r13\n\t"
        "# A[8] x A[5]\n\t"
        "mulxq	64(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r13\n\t"
        "adoxq	%%rcx, %%r14\n\t"
        "# A[9] x A[5]\n\t"
        "mulxq	72(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r14\n\t"
        "adoxq	%%rcx, %%r15\n\t"
        "# A[10] x A[5]\n\t"
        "mulxq	80(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r15\n\t"
        "adoxq	%%rcx, %%r10\n\t"
        "movq	%%r11, 88(%[tmp])\n\t"
        "movq	%%r12, 96(%[tmp])\n\t"
        "movq	%%r13, 104(%[tmp])\n\t"
        "movq	%%r14, 112(%[tmp])\n\t"
        "movq	%%r15, 120(%[tmp])\n\t"
        "movq	136(%[tmp]), %%r11\n\t"
        "movq	144(%[tmp]), %%r12\n\t"
        "movq	152(%[tmp]), %%r13\n\t"
        "movq	160(%[tmp]), %%r14\n\t"
        "movq	168(%[tmp]), %%r15\n\t"
        "# A[11] x A[5]\n\t"
        "mulxq	88(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r10\n\t"
        "adoxq	%%rcx, %%r11\n\t"
        "# A[12] x A[5]\n\t"
        "mulxq	96(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r11\n\t"
        "adoxq	%%rcx, %%r12\n\t"
        "# A[13] x A[5]\n\t"
        "mulxq	104(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r12\n\t"
        "adoxq	%%rcx, %%r13\n\t"
        "# A[14] x A[5]\n\t"
        "mulxq	112(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r13\n\t"
        "adoxq	%%rcx, %%r14\n\t"
        "# A[15] x A[5]\n\t"
        "mulxq	120(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r14\n\t"
        "adoxq	%%rcx, %%r15\n\t"
        "movq	%%r10, 128(%[tmp])\n\t"
        "movq	%%r11, 136(%[tmp])\n\t"
        "movq	%%r12, 144(%[tmp])\n\t"
        "movq	%%r13, 152(%[tmp])\n\t"
        "movq	%%r14, 160(%[tmp])\n\t"
        "movq	176(%[tmp]), %%r10\n\t"
        "movq	184(%[tmp]), %%r11\n\t"
        "movq	192(%[tmp]), %%r12\n\t"
        "movq	200(%[tmp]), %%r13\n\t"
        "movq	208(%[tmp]), %%r14\n\t"
        "# A[16] x A[5]\n\t"
        "mulxq	128(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r15\n\t"
        "adoxq	%%rcx, %%r10\n\t"
        "# A[17] x A[5]\n\t"
        "mulxq	136(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r10\n\t"
        "adoxq	%%rcx, %%r11\n\t"
        "# A[18] x A[5]\n\t"
        "mulxq	144(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r11\n\t"
        "adoxq	%%rcx, %%r12\n\t"
        "# A[19] x A[5]\n\t"
        "mulxq	152(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r12\n\t"
        "adoxq	%%rcx, %%r13\n\t"
        "# A[19] x A[6]\n\t"
        "movq	152(%[a]), %%rdx\n\t"
        "mulxq	48(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r13\n\t"
        "adoxq	%%rcx, %%r14\n\t"
        "movq	%%r15, 168(%[tmp])\n\t"
        "movq	%%r10, 176(%[tmp])\n\t"
        "movq	%%r11, 184(%[tmp])\n\t"
        "movq	%%r12, 192(%[tmp])\n\t"
        "movq	%%r13, 200(%[tmp])\n\t"
        "movq	216(%[tmp]), %%r15\n\t"
        "movq	224(%[tmp]), %%r10\n\t"
        "movq	232(%[tmp]), %%r11\n\t"
        "movq	240(%[tmp]), %%r12\n\t"
        "movq	248(%[tmp]), %%r13\n\t"
        "# A[19] x A[7]\n\t"
        "mulxq	56(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r14\n\t"
        "adoxq	%%rcx, %%r15\n\t"
        "# A[19] x A[8]\n\t"
        "mulxq	64(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r15\n\t"
        "adoxq	%%rcx, %%r10\n\t"
        "# A[19] x A[9]\n\t"
        "mulxq	72(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r10\n\t"
        "adoxq	%%rcx, %%r11\n\t"
        "# A[19] x A[10]\n\t"
        "mulxq	80(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r11\n\t"
        "adoxq	%%rcx, %%r12\n\t"
        "# A[19] x A[11]\n\t"
        "mulxq	88(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r12\n\t"
        "adoxq	%%rcx, %%r13\n\t"
        "movq	%%r14, 208(%[tmp])\n\t"
        "movq	%%r15, 216(%[tmp])\n\t"
        "movq	%%r10, 224(%[tmp])\n\t"
        "movq	%%r11, 232(%[tmp])\n\t"
        "movq	%%r12, 240(%[tmp])\n\t"
        "movq	256(%[tmp]), %%r14\n\t"
        "movq	%%r8, %%r15\n\t"
        "movq	%%r8, %%r10\n\t"
        "# A[19] x A[12]\n\t"
        "mulxq	96(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r13\n\t"
        "adoxq	%%rcx, %%r14\n\t"
        "# A[19] x A[13]\n\t"
        "mulxq	104(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r14\n\t"
        "adoxq	%%rcx, %%r15\n\t"
        "# A[19] x A[14]\n\t"
        "mulxq	112(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r15\n\t"
        "adoxq	%%rcx, %%r10\n\t"
        "movq	%%r13, 248(%[tmp])\n\t"
        "movq	%%r14, 256(%[tmp])\n\t"
        "movq	%%r15, 264(%[tmp])\n\t"
        "#  Carry\n\t"
        "adcxq	%%r9, %%r10\n\t"
        "movq	%%r8, %%r9\n\t"
        "adcxq	%%r8, %%r9\n\t"
        "adoxq	%%r8, %%r9\n\t"
        "movq	%%r10, 272(%[tmp])\n\t"
        "xorq	%%r8, %%r8\n\t"
        "# Diagonal 7\n\t"
        "movq	104(%[tmp]), %%r10\n\t"
        "movq	112(%[tmp]), %%r11\n\t"
        "movq	120(%[tmp]), %%r12\n\t"
        "movq	128(%[tmp]), %%r13\n\t"
        "movq	136(%[tmp]), %%r14\n\t"
        "movq	144(%[tmp]), %%r15\n\t"
        "# A[7] x A[6]\n\t"
        "movq	48(%[a]), %%rdx\n\t"
        "mulxq	56(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r10\n\t"
        "adoxq	%%rcx, %%r11\n\t"
        "# A[8] x A[6]\n\t"
        "mulxq	64(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r11\n\t"
        "adoxq	%%rcx, %%r12\n\t"
        "# A[9] x A[6]\n\t"
        "mulxq	72(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r12\n\t"
        "adoxq	%%rcx, %%r13\n\t"
        "# A[10] x A[6]\n\t"
        "mulxq	80(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r13\n\t"
        "adoxq	%%rcx, %%r14\n\t"
        "# A[11] x A[6]\n\t"
        "mulxq	88(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r14\n\t"
        "adoxq	%%rcx, %%r15\n\t"
        "movq	%%r10, 104(%[tmp])\n\t"
        "movq	%%r11, 112(%[tmp])\n\t"
        "movq	%%r12, 120(%[tmp])\n\t"
        "movq	%%r13, 128(%[tmp])\n\t"
        "movq	%%r14, 136(%[tmp])\n\t"
        "movq	152(%[tmp]), %%r10\n\t"
        "movq	160(%[tmp]), %%r11\n\t"
        "movq	168(%[tmp]), %%r12\n\t"
        "movq	176(%[tmp]), %%r13\n\t"
        "movq	184(%[tmp]), %%r14\n\t"
        "# A[12] x A[6]\n\t"
        "mulxq	96(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r15\n\t"
        "adoxq	%%rcx, %%r10\n\t"
        "# A[13] x A[6]\n\t"
        "mulxq	104(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r10\n\t"
        "adoxq	%%rcx, %%r11\n\t"
        "# A[14] x A[6]\n\t"
        "mulxq	112(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r11\n\t"
        "adoxq	%%rcx, %%r12\n\t"
        "# A[15] x A[6]\n\t"
        "mulxq	120(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r12\n\t"
        "adoxq	%%rcx, %%r13\n\t"
        "# A[16] x A[6]\n\t"
        "mulxq	128(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r13\n\t"
        "adoxq	%%rcx, %%r14\n\t"
        "movq	%%r15, 144(%[tmp])\n\t"
        "movq	%%r10, 152(%[tmp])\n\t"
        "movq	%%r11, 160(%[tmp])\n\t"
        "movq	%%r12, 168(%[tmp])\n\t"
        "movq	%%r13, 176(%[tmp])\n\t"
        "movq	192(%[tmp]), %%r15\n\t"
        "movq	200(%[tmp]), %%r10\n\t"
        "movq	208(%[tmp]), %%r11\n\t"
        "movq	216(%[tmp]), %%r12\n\t"
        "movq	224(%[tmp]), %%r13\n\t"
        "# A[17] x A[6]\n\t"
        "mulxq	136(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r14\n\t"
        "adoxq	%%rcx, %%r15\n\t"
        "# A[18] x A[6]\n\t"
        "mulxq	144(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r15\n\t"
        "adoxq	%%rcx, %%r10\n\t"
        "# A[18] x A[7]\n\t"
        "movq	144(%[a]), %%rdx\n\t"
        "mulxq	56(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r10\n\t"
        "adoxq	%%rcx, %%r11\n\t"
        "# A[18] x A[8]\n\t"
        "mulxq	64(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r11\n\t"
        "adoxq	%%rcx, %%r12\n\t"
        "# A[18] x A[9]\n\t"
        "mulxq	72(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r12\n\t"
        "adoxq	%%rcx, %%r13\n\t"
        "movq	%%r14, 184(%[tmp])\n\t"
        "movq	%%r15, 192(%[tmp])\n\t"
        "movq	%%r10, 200(%[tmp])\n\t"
        "movq	%%r11, 208(%[tmp])\n\t"
        "movq	%%r12, 216(%[tmp])\n\t"
        "movq	232(%[tmp]), %%r14\n\t"
        "movq	240(%[tmp]), %%r15\n\t"
        "movq	248(%[tmp]), %%r10\n\t"
        "movq	256(%[tmp]), %%r11\n\t"
        "movq	264(%[tmp]), %%r12\n\t"
        "# A[18] x A[10]\n\t"
        "mulxq	80(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r13\n\t"
        "adoxq	%%rcx, %%r14\n\t"
        "# A[18] x A[11]\n\t"
        "mulxq	88(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r14\n\t"
        "adoxq	%%rcx, %%r15\n\t"
        "# A[18] x A[12]\n\t"
        "mulxq	96(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r15\n\t"
        "adoxq	%%rcx, %%r10\n\t"
        "# A[18] x A[13]\n\t"
        "mulxq	104(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r10\n\t"
        "adoxq	%%rcx, %%r11\n\t"
        "# A[18] x A[14]\n\t"
        "mulxq	112(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r11\n\t"
        "adoxq	%%rcx, %%r12\n\t"
        "movq	%%r13, 224(%[tmp])\n\t"
        "movq	%%r14, 232(%[tmp])\n\t"
        "movq	%%r15, 240(%[tmp])\n\t"
        "movq	%%r10, 248(%[tmp])\n\t"
        "movq	%%r11, 256(%[tmp])\n\t"
        "movq	272(%[tmp]), %%r13\n\t"
        "movq	%%r8, %%r14\n\t"
        "movq	%%r8, %%r15\n\t"
        "# A[18] x A[15]\n\t"
        "mulxq	120(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r12\n\t"
        "adoxq	%%rcx, %%r13\n\t"
        "# A[18] x A[16]\n\t"
        "mulxq	128(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r13\n\t"
        "adoxq	%%rcx, %%r14\n\t"
        "# A[18] x A[17]\n\t"
        "mulxq	136(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r14\n\t"
        "adoxq	%%rcx, %%r15\n\t"
        "movq	%%r12, 264(%[tmp])\n\t"
        "movq	%%r13, 272(%[tmp])\n\t"
        "movq	%%r14, 280(%[tmp])\n\t"
        "#  Carry\n\t"
        "adcxq	%%r9, %%r15\n\t"
        "movq	%%r8, %%r9\n\t"
        "adcxq	%%r8, %%r9\n\t"
        "adoxq	%%r8, %%r9\n\t"
        "movq	%%r15, 288(%[tmp])\n\t"
        "xorq	%%r8, %%r8\n\t"
        "# Diagonal 8\n\t"
        "movq	120(%[tmp]), %%r15\n\t"
        "movq	128(%[tmp]), %%r10\n\t"
        "movq	136(%[tmp]), %%r11\n\t"
        "movq	144(%[tmp]), %%r12\n\t"
        "movq	152(%[tmp]), %%r13\n\t"
        "movq	160(%[tmp]), %%r14\n\t"
        "# A[8] x A[7]\n\t"
        "movq	56(%[a]), %%rdx\n\t"
        "mulxq	64(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r15\n\t"
        "adoxq	%%rcx, %%r10\n\t"
        "# A[9] x A[7]\n\t"
        "mulxq	72(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r10\n\t"
        "adoxq	%%rcx, %%r11\n\t"
        "# A[10] x A[7]\n\t"
        "mulxq	80(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r11\n\t"
        "adoxq	%%rcx, %%r12\n\t"
        "# A[11] x A[7]\n\t"
        "mulxq	88(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r12\n\t"
        "adoxq	%%rcx, %%r13\n\t"
        "# A[12] x A[7]\n\t"
        "mulxq	96(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r13\n\t"
        "adoxq	%%rcx, %%r14\n\t"
        "movq	%%r15, 120(%[tmp])\n\t"
        "movq	%%r10, 128(%[tmp])\n\t"
        "movq	%%r11, 136(%[tmp])\n\t"
        "movq	%%r12, 144(%[tmp])\n\t"
        "movq	%%r13, 152(%[tmp])\n\t"
        "movq	168(%[tmp]), %%r15\n\t"
        "movq	176(%[tmp]), %%r10\n\t"
        "movq	184(%[tmp]), %%r11\n\t"
        "movq	192(%[tmp]), %%r12\n\t"
        "movq	200(%[tmp]), %%r13\n\t"
        "# A[13] x A[7]\n\t"
        "mulxq	104(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r14\n\t"
        "adoxq	%%rcx, %%r15\n\t"
        "# A[14] x A[7]\n\t"
        "mulxq	112(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r15\n\t"
        "adoxq	%%rcx, %%r10\n\t"
        "# A[15] x A[7]\n\t"
        "mulxq	120(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r10\n\t"
        "adoxq	%%rcx, %%r11\n\t"
        "# A[16] x A[7]\n\t"
        "mulxq	128(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r11\n\t"
        "adoxq	%%rcx, %%r12\n\t"
        "# A[17] x A[7]\n\t"
        "mulxq	136(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r12\n\t"
        "adoxq	%%rcx, %%r13\n\t"
        "movq	%%r14, 160(%[tmp])\n\t"
        "movq	%%r15, 168(%[tmp])\n\t"
        "movq	%%r10, 176(%[tmp])\n\t"
        "movq	%%r11, 184(%[tmp])\n\t"
        "movq	%%r12, 192(%[tmp])\n\t"
        "movq	208(%[tmp]), %%r14\n\t"
        "movq	216(%[tmp]), %%r15\n\t"
        "movq	224(%[tmp]), %%r10\n\t"
        "movq	232(%[tmp]), %%r11\n\t"
        "movq	240(%[tmp]), %%r12\n\t"
        "# A[17] x A[8]\n\t"
        "movq	136(%[a]), %%rdx\n\t"
        "mulxq	64(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r13\n\t"
        "adoxq	%%rcx, %%r14\n\t"
        "# A[17] x A[9]\n\t"
        "mulxq	72(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r14\n\t"
        "adoxq	%%rcx, %%r15\n\t"
        "# A[17] x A[10]\n\t"
        "mulxq	80(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r15\n\t"
        "adoxq	%%rcx, %%r10\n\t"
        "# A[17] x A[11]\n\t"
        "mulxq	88(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r10\n\t"
        "adoxq	%%rcx, %%r11\n\t"
        "# A[17] x A[12]\n\t"
        "mulxq	96(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r11\n\t"
        "adoxq	%%rcx, %%r12\n\t"
        "movq	%%r13, 200(%[tmp])\n\t"
        "movq	%%r14, 208(%[tmp])\n\t"
        "movq	%%r15, 216(%[tmp])\n\t"
        "movq	%%r10, 224(%[tmp])\n\t"
        "movq	%%r11, 232(%[tmp])\n\t"
        "movq	248(%[tmp]), %%r13\n\t"
        "movq	256(%[tmp]), %%r14\n\t"
        "movq	264(%[tmp]), %%r15\n\t"
        "movq	272(%[tmp]), %%r10\n\t"
        "movq	280(%[tmp]), %%r11\n\t"
        "# A[17] x A[13]\n\t"
        "mulxq	104(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r12\n\t"
        "adoxq	%%rcx, %%r13\n\t"
        "# A[17] x A[14]\n\t"
        "mulxq	112(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r13\n\t"
        "adoxq	%%rcx, %%r14\n\t"
        "# A[17] x A[15]\n\t"
        "mulxq	120(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r14\n\t"
        "adoxq	%%rcx, %%r15\n\t"
        "# A[17] x A[16]\n\t"
        "mulxq	128(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r15\n\t"
        "adoxq	%%rcx, %%r10\n\t"
        "# A[19] x A[15]\n\t"
        "movq	152(%[a]), %%rdx\n\t"
        "mulxq	120(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r10\n\t"
        "adoxq	%%rcx, %%r11\n\t"
        "movq	%%r12, 240(%[tmp])\n\t"
        "movq	%%r13, 248(%[tmp])\n\t"
        "movq	%%r14, 256(%[tmp])\n\t"
        "movq	%%r15, 264(%[tmp])\n\t"
        "movq	%%r10, 272(%[tmp])\n\t"
        "movq	288(%[tmp]), %%r12\n\t"
        "movq	%%r8, %%r13\n\t"
        "movq	%%r8, %%r14\n\t"
        "# A[19] x A[16]\n\t"
        "mulxq	128(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r11\n\t"
        "adoxq	%%rcx, %%r12\n\t"
        "# A[19] x A[17]\n\t"
        "mulxq	136(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r12\n\t"
        "adoxq	%%rcx, %%r13\n\t"
        "# A[19] x A[18]\n\t"
        "mulxq	144(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r13\n\t"
        "adoxq	%%rcx, %%r14\n\t"
        "movq	%%r11, 280(%[tmp])\n\t"
        "movq	%%r12, 288(%[tmp])\n\t"
        "movq	%%r13, 296(%[tmp])\n\t"
        "#  Carry\n\t"
        "adcxq	%%r9, %%r14\n\t"
        "movq	%%r8, %%r9\n\t"
        "adcxq	%%r8, %%r9\n\t"
        "adoxq	%%r8, %%r9\n\t"
        "movq	%%r14, 304(%[tmp])\n\t"
        "xorq	%%r8, %%r8\n\t"
        "# Diagonal 9\n\t"
        "movq	136(%[tmp]), %%r14\n\t"
        "movq	144(%[tmp]), %%r15\n\t"
        "movq	152(%[tmp]), %%r10\n\t"
        "movq	160(%[tmp]), %%r11\n\t"
        "movq	168(%[tmp]), %%r12\n\t"
        "movq	176(%[tmp]), %%r13\n\t"
        "# A[9] x A[8]\n\t"
        "movq	64(%[a]), %%rdx\n\t"
        "mulxq	72(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r14\n\t"
        "adoxq	%%rcx, %%r15\n\t"
        "# A[10] x A[8]\n\t"
        "mulxq	80(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r15\n\t"
        "adoxq	%%rcx, %%r10\n\t"
        "# A[11] x A[8]\n\t"
        "mulxq	88(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r10\n\t"
        "adoxq	%%rcx, %%r11\n\t"
        "# A[12] x A[8]\n\t"
        "mulxq	96(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r11\n\t"
        "adoxq	%%rcx, %%r12\n\t"
        "# A[13] x A[8]\n\t"
        "mulxq	104(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r12\n\t"
        "adoxq	%%rcx, %%r13\n\t"
        "movq	%%r14, 136(%[tmp])\n\t"
        "movq	%%r15, 144(%[tmp])\n\t"
        "movq	%%r10, 152(%[tmp])\n\t"
        "movq	%%r11, 160(%[tmp])\n\t"
        "movq	%%r12, 168(%[tmp])\n\t"
        "movq	184(%[tmp]), %%r14\n\t"
        "movq	192(%[tmp]), %%r15\n\t"
        "movq	200(%[tmp]), %%r10\n\t"
        "movq	208(%[tmp]), %%r11\n\t"
        "movq	216(%[tmp]), %%r12\n\t"
        "# A[14] x A[8]\n\t"
        "mulxq	112(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r13\n\t"
        "adoxq	%%rcx, %%r14\n\t"
        "# A[15] x A[8]\n\t"
        "mulxq	120(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r14\n\t"
        "adoxq	%%rcx, %%r15\n\t"
        "# A[16] x A[8]\n\t"
        "mulxq	128(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r15\n\t"
        "adoxq	%%rcx, %%r10\n\t"
        "# A[16] x A[9]\n\t"
        "movq	128(%[a]), %%rdx\n\t"
        "mulxq	72(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r10\n\t"
        "adoxq	%%rcx, %%r11\n\t"
        "# A[16] x A[10]\n\t"
        "mulxq	80(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r11\n\t"
        "adoxq	%%rcx, %%r12\n\t"
        "movq	%%r13, 176(%[tmp])\n\t"
        "movq	%%r14, 184(%[tmp])\n\t"
        "movq	%%r15, 192(%[tmp])\n\t"
        "movq	%%r10, 200(%[tmp])\n\t"
        "movq	%%r11, 208(%[tmp])\n\t"
        "movq	224(%[tmp]), %%r13\n\t"
        "movq	232(%[tmp]), %%r14\n\t"
        "movq	240(%[tmp]), %%r15\n\t"
        "movq	248(%[tmp]), %%r10\n\t"
        "movq	256(%[tmp]), %%r11\n\t"
        "# A[16] x A[11]\n\t"
        "mulxq	88(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r12\n\t"
        "adoxq	%%rcx, %%r13\n\t"
        "# A[16] x A[12]\n\t"
        "mulxq	96(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r13\n\t"
        "adoxq	%%rcx, %%r14\n\t"
        "# A[16] x A[13]\n\t"
        "mulxq	104(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r14\n\t"
        "adoxq	%%rcx, %%r15\n\t"
        "# A[16] x A[14]\n\t"
        "mulxq	112(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r15\n\t"
        "adoxq	%%rcx, %%r10\n\t"
        "# A[16] x A[15]\n\t"
        "mulxq	120(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r10\n\t"
        "adoxq	%%rcx, %%r11\n\t"
        "movq	%%r12, 216(%[tmp])\n\t"
        "movq	%%r13, 224(%[tmp])\n\t"
        "movq	%%r14, 232(%[tmp])\n\t"
        "movq	%%r15, 240(%[tmp])\n\t"
        "movq	%%r10, 248(%[tmp])\n\t"
        "movq	264(%[tmp]), %%r12\n\t"
        "movq	272(%[tmp]), %%r13\n\t"
        "movq	280(%[tmp]), %%r14\n\t"
        "movq	288(%[tmp]), %%r15\n\t"
        "movq	296(%[tmp]), %%r10\n\t"
        "# A[20] x A[12]\n\t"
        "movq	160(%[a]), %%rdx\n\t"
        "mulxq	96(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r11\n\t"
        "adoxq	%%rcx, %%r12\n\t"
        "# A[20] x A[13]\n\t"
        "mulxq	104(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r12\n\t"
        "adoxq	%%rcx, %%r13\n\t"
        "# A[20] x A[14]\n\t"
        "mulxq	112(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r13\n\t"
        "adoxq	%%rcx, %%r14\n\t"
        "# A[20] x A[15]\n\t"
        "mulxq	120(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r14\n\t"
        "adoxq	%%rcx, %%r15\n\t"
        "# A[20] x A[16]\n\t"
        "mulxq	128(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r15\n\t"
        "adoxq	%%rcx, %%r10\n\t"
        "movq	%%r11, 256(%[tmp])\n\t"
        "movq	%%r12, 264(%[tmp])\n\t"
        "movq	%%r13, 272(%[tmp])\n\t"
        "movq	%%r14, 280(%[tmp])\n\t"
        "movq	%%r15, 288(%[tmp])\n\t"
        "movq	304(%[tmp]), %%r11\n\t"
        "movq	%%r8, %%r12\n\t"
        "movq	%%r8, %%r13\n\t"
        "# A[20] x A[17]\n\t"
        "mulxq	136(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r10\n\t"
        "adoxq	%%rcx, %%r11\n\t"
        "# A[20] x A[18]\n\t"
        "mulxq	144(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r11\n\t"
        "adoxq	%%rcx, %%r12\n\t"
        "# A[20] x A[19]\n\t"
        "mulxq	152(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r12\n\t"
        "adoxq	%%rcx, %%r13\n\t"
        "movq	%%r10, 296(%[tmp])\n\t"
        "movq	%%r11, 304(%[tmp])\n\t"
        "movq	%%r12, 312(%[tmp])\n\t"
        "#  Carry\n\t"
        "adcxq	%%r9, %%r13\n\t"
        "movq	%%r8, %%r9\n\t"
        "adcxq	%%r8, %%r9\n\t"
        "adoxq	%%r8, %%r9\n\t"
        "movq	%%r13, 320(%[tmp])\n\t"
        "xorq	%%r8, %%r8\n\t"
        "# Diagonal 10\n\t"
        "movq	152(%[tmp]), %%r13\n\t"
        "movq	160(%[tmp]), %%r14\n\t"
        "movq	168(%[tmp]), %%r15\n\t"
        "movq	176(%[tmp]), %%r10\n\t"
        "movq	184(%[tmp]), %%r11\n\t"
        "movq	192(%[tmp]), %%r12\n\t"
        "# A[10] x A[9]\n\t"
        "movq	72(%[a]), %%rdx\n\t"
        "mulxq	80(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r13\n\t"
        "adoxq	%%rcx, %%r14\n\t"
        "# A[11] x A[9]\n\t"
        "mulxq	88(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r14\n\t"
        "adoxq	%%rcx, %%r15\n\t"
        "# A[12] x A[9]\n\t"
        "mulxq	96(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r15\n\t"
        "adoxq	%%rcx, %%r10\n\t"
        "# A[13] x A[9]\n\t"
        "mulxq	104(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r10\n\t"
        "adoxq	%%rcx, %%r11\n\t"
        "# A[14] x A[9]\n\t"
        "mulxq	112(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r11\n\t"
        "adoxq	%%rcx, %%r12\n\t"
        "movq	%%r13, 152(%[tmp])\n\t"
        "movq	%%r14, 160(%[tmp])\n\t"
        "movq	%%r15, 168(%[tmp])\n\t"
        "movq	%%r10, 176(%[tmp])\n\t"
        "movq	%%r11, 184(%[tmp])\n\t"
        "movq	200(%[tmp]), %%r13\n\t"
        "movq	208(%[tmp]), %%r14\n\t"
        "movq	216(%[tmp]), %%r15\n\t"
        "movq	224(%[tmp]), %%r10\n\t"
        "movq	232(%[tmp]), %%r11\n\t"
        "# A[15] x A[9]\n\t"
        "mulxq	120(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r12\n\t"
        "adoxq	%%rcx, %%r13\n\t"
        "# A[15] x A[10]\n\t"
        "movq	120(%[a]), %%rdx\n\t"
        "mulxq	80(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r13\n\t"
        "adoxq	%%rcx, %%r14\n\t"
        "# A[15] x A[11]\n\t"
        "mulxq	88(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r14\n\t"
        "adoxq	%%rcx, %%r15\n\t"
        "# A[15] x A[12]\n\t"
        "mulxq	96(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r15\n\t"
        "adoxq	%%rcx, %%r10\n\t"
        "# A[15] x A[13]\n\t"
        "mulxq	104(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r10\n\t"
        "adoxq	%%rcx, %%r11\n\t"
        "movq	%%r12, 192(%[tmp])\n\t"
        "movq	%%r13, 200(%[tmp])\n\t"
        "movq	%%r14, 208(%[tmp])\n\t"
        "movq	%%r15, 216(%[tmp])\n\t"
        "movq	%%r10, 224(%[tmp])\n\t"
        "movq	240(%[tmp]), %%r12\n\t"
        "movq	248(%[tmp]), %%r13\n\t"
        "movq	256(%[tmp]), %%r14\n\t"
        "movq	264(%[tmp]), %%r15\n\t"
        "movq	272(%[tmp]), %%r10\n\t"
        "# A[15] x A[14]\n\t"
        "mulxq	112(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r11\n\t"
        "adoxq	%%rcx, %%r12\n\t"
        "# A[21] x A[9]\n\t"
        "movq	168(%[a]), %%rdx\n\t"
        "mulxq	72(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r12\n\t"
        "adoxq	%%rcx, %%r13\n\t"
        "# A[21] x A[10]\n\t"
        "mulxq	80(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r13\n\t"
        "adoxq	%%rcx, %%r14\n\t"
        "# A[21] x A[11]\n\t"
        "mulxq	88(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r14\n\t"
        "adoxq	%%rcx, %%r15\n\t"
        "# A[21] x A[12]\n\t"
        "mulxq	96(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r15\n\t"
        "adoxq	%%rcx, %%r10\n\t"
        "movq	%%r11, 232(%[tmp])\n\t"
        "movq	%%r12, 240(%[tmp])\n\t"
        "movq	%%r13, 248(%[tmp])\n\t"
        "movq	%%r14, 256(%[tmp])\n\t"
        "movq	%%r15, 264(%[tmp])\n\t"
        "movq	280(%[tmp]), %%r11\n\t"
        "movq	288(%[tmp]), %%r12\n\t"
        "movq	296(%[tmp]), %%r13\n\t"
        "movq	304(%[tmp]), %%r14\n\t"
        "movq	312(%[tmp]), %%r15\n\t"
        "# A[21] x A[13]\n\t"
        "mulxq	104(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r10\n\t"
        "adoxq	%%rcx, %%r11\n\t"
        "# A[21] x A[14]\n\t"
        "mulxq	112(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r11\n\t"
        "adoxq	%%rcx, %%r12\n\t"
        "# A[21] x A[15]\n\t"
        "mulxq	120(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r12\n\t"
        "adoxq	%%rcx, %%r13\n\t"
        "# A[21] x A[16]\n\t"
        "mulxq	128(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r13\n\t"
        "adoxq	%%rcx, %%r14\n\t"
        "# A[21] x A[17]\n\t"
        "mulxq	136(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r14\n\t"
        "adoxq	%%rcx, %%r15\n\t"
        "movq	%%r10, 272(%[tmp])\n\t"
        "movq	%%r11, 280(%[tmp])\n\t"
        "movq	%%r12, 288(%[tmp])\n\t"
        "movq	%%r13, 296(%[tmp])\n\t"
        "movq	%%r14, 304(%[tmp])\n\t"
        "movq	320(%[tmp]), %%r10\n\t"
        "movq	%%r8, %%r11\n\t"
        "movq	%%r8, %%r12\n\t"
        "# A[21] x A[18]\n\t"
        "mulxq	144(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r15\n\t"
        "adoxq	%%rcx, %%r10\n\t"
        "# A[21] x A[19]\n\t"
        "mulxq	152(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r10\n\t"
        "adoxq	%%rcx, %%r11\n\t"
        "# A[21] x A[20]\n\t"
        "mulxq	160(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r11\n\t"
        "adoxq	%%rcx, %%r12\n\t"
        "movq	%%r15, 312(%[tmp])\n\t"
        "movq	%%r10, 320(%[tmp])\n\t"
        "movq	%%r11, 328(%[tmp])\n\t"
        "#  Carry\n\t"
        "adcxq	%%r9, %%r12\n\t"
        "movq	%%r8, %%r9\n\t"
        "adcxq	%%r8, %%r9\n\t"
        "adoxq	%%r8, %%r9\n\t"
        "movq	%%r12, 336(%[tmp])\n\t"
        "xorq	%%r8, %%r8\n\t"
        "# Diagonal 11\n\t"
        "movq	168(%[tmp]), %%r12\n\t"
        "movq	176(%[tmp]), %%r13\n\t"
        "movq	184(%[tmp]), %%r14\n\t"
        "movq	192(%[tmp]), %%r15\n\t"
        "movq	200(%[tmp]), %%r10\n\t"
        "movq	208(%[tmp]), %%r11\n\t"
        "# A[11] x A[10]\n\t"
        "movq	80(%[a]), %%rdx\n\t"
        "mulxq	88(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r12\n\t"
        "adoxq	%%rcx, %%r13\n\t"
        "# A[12] x A[10]\n\t"
        "mulxq	96(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r13\n\t"
        "adoxq	%%rcx, %%r14\n\t"
        "# A[13] x A[10]\n\t"
        "mulxq	104(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r14\n\t"
        "adoxq	%%rcx, %%r15\n\t"
        "# A[14] x A[10]\n\t"
        "mulxq	112(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r15\n\t"
        "adoxq	%%rcx, %%r10\n\t"
        "# A[14] x A[11]\n\t"
        "movq	112(%[a]), %%rdx\n\t"
        "mulxq	88(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r10\n\t"
        "adoxq	%%rcx, %%r11\n\t"
        "movq	%%r12, 168(%[tmp])\n\t"
        "movq	%%r13, 176(%[tmp])\n\t"
        "movq	%%r14, 184(%[tmp])\n\t"
        "movq	%%r15, 192(%[tmp])\n\t"
        "movq	%%r10, 200(%[tmp])\n\t"
        "movq	216(%[tmp]), %%r12\n\t"
        "movq	224(%[tmp]), %%r13\n\t"
        "movq	232(%[tmp]), %%r14\n\t"
        "movq	240(%[tmp]), %%r15\n\t"
        "movq	248(%[tmp]), %%r10\n\t"
        "# A[14] x A[12]\n\t"
        "mulxq	96(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r11\n\t"
        "adoxq	%%rcx, %%r12\n\t"
        "# A[14] x A[13]\n\t"
        "mulxq	104(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r12\n\t"
        "adoxq	%%rcx, %%r13\n\t"
        "# A[22] x A[6]\n\t"
        "movq	176(%[a]), %%rdx\n\t"
        "mulxq	48(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r13\n\t"
        "adoxq	%%rcx, %%r14\n\t"
        "# A[22] x A[7]\n\t"
        "mulxq	56(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r14\n\t"
        "adoxq	%%rcx, %%r15\n\t"
        "# A[22] x A[8]\n\t"
        "mulxq	64(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r15\n\t"
        "adoxq	%%rcx, %%r10\n\t"
        "movq	%%r11, 208(%[tmp])\n\t"
        "movq	%%r12, 216(%[tmp])\n\t"
        "movq	%%r13, 224(%[tmp])\n\t"
        "movq	%%r14, 232(%[tmp])\n\t"
        "movq	%%r15, 240(%[tmp])\n\t"
        "movq	256(%[tmp]), %%r11\n\t"
        "movq	264(%[tmp]), %%r12\n\t"
        "movq	272(%[tmp]), %%r13\n\t"
        "movq	280(%[tmp]), %%r14\n\t"
        "movq	288(%[tmp]), %%r15\n\t"
        "# A[22] x A[9]\n\t"
        "mulxq	72(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r10\n\t"
        "adoxq	%%rcx, %%r11\n\t"
        "# A[22] x A[10]\n\t"
        "mulxq	80(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r11\n\t"
        "adoxq	%%rcx, %%r12\n\t"
        "# A[22] x A[11]\n\t"
        "mulxq	88(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r12\n\t"
        "adoxq	%%rcx, %%r13\n\t"
        "# A[22] x A[12]\n\t"
        "mulxq	96(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r13\n\t"
        "adoxq	%%rcx, %%r14\n\t"
        "# A[22] x A[13]\n\t"
        "mulxq	104(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r14\n\t"
        "adoxq	%%rcx, %%r15\n\t"
        "movq	%%r10, 248(%[tmp])\n\t"
        "movq	%%r11, 256(%[tmp])\n\t"
        "movq	%%r12, 264(%[tmp])\n\t"
        "movq	%%r13, 272(%[tmp])\n\t"
        "movq	%%r14, 280(%[tmp])\n\t"
        "movq	296(%[tmp]), %%r10\n\t"
        "movq	304(%[tmp]), %%r11\n\t"
        "movq	312(%[tmp]), %%r12\n\t"
        "movq	320(%[tmp]), %%r13\n\t"
        "movq	328(%[tmp]), %%r14\n\t"
        "# A[22] x A[14]\n\t"
        "mulxq	112(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r15\n\t"
        "adoxq	%%rcx, %%r10\n\t"
        "# A[22] x A[15]\n\t"
        "mulxq	120(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r10\n\t"
        "adoxq	%%rcx, %%r11\n\t"
        "# A[22] x A[16]\n\t"
        "mulxq	128(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r11\n\t"
        "adoxq	%%rcx, %%r12\n\t"
        "# A[22] x A[17]\n\t"
        "mulxq	136(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r12\n\t"
        "adoxq	%%rcx, %%r13\n\t"
        "# A[22] x A[18]\n\t"
        "mulxq	144(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r13\n\t"
        "adoxq	%%rcx, %%r14\n\t"
        "movq	%%r15, 288(%[tmp])\n\t"
        "movq	%%r10, 296(%[tmp])\n\t"
        "movq	%%r11, 304(%[tmp])\n\t"
        "movq	%%r12, 312(%[tmp])\n\t"
        "movq	%%r13, 320(%[tmp])\n\t"
        "movq	336(%[tmp]), %%r15\n\t"
        "movq	%%r8, %%r10\n\t"
        "movq	%%r8, %%r11\n\t"
        "# A[22] x A[19]\n\t"
        "mulxq	152(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r14\n\t"
        "adoxq	%%rcx, %%r15\n\t"
        "# A[22] x A[20]\n\t"
        "mulxq	160(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r15\n\t"
        "adoxq	%%rcx, %%r10\n\t"
        "# A[22] x A[21]\n\t"
        "mulxq	168(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r10\n\t"
        "adoxq	%%rcx, %%r11\n\t"
        "movq	%%r14, 328(%[tmp])\n\t"
        "movq	%%r15, 336(%[tmp])\n\t"
        "movq	%%r10, 344(%[tmp])\n\t"
        "#  Carry\n\t"
        "adcxq	%%r9, %%r11\n\t"
        "movq	%%r8, %%r9\n\t"
        "adcxq	%%r8, %%r9\n\t"
        "adoxq	%%r8, %%r9\n\t"
        "movq	%%r11, 352(%[tmp])\n\t"
        "xorq	%%r8, %%r8\n\t"
        "# Diagonal 12\n\t"
        "movq	184(%[tmp]), %%r11\n\t"
        "movq	192(%[tmp]), %%r12\n\t"
        "movq	200(%[tmp]), %%r13\n\t"
        "movq	208(%[tmp]), %%r14\n\t"
        "movq	216(%[tmp]), %%r15\n\t"
        "movq	224(%[tmp]), %%r10\n\t"
        "# A[12] x A[11]\n\t"
        "movq	88(%[a]), %%rdx\n\t"
        "mulxq	96(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r11\n\t"
        "adoxq	%%rcx, %%r12\n\t"
        "# A[13] x A[11]\n\t"
        "mulxq	104(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r12\n\t"
        "adoxq	%%rcx, %%r13\n\t"
        "# A[13] x A[12]\n\t"
        "movq	96(%[a]), %%rdx\n\t"
        "mulxq	104(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r13\n\t"
        "adoxq	%%rcx, %%r14\n\t"
        "# A[23] x A[3]\n\t"
        "movq	184(%[a]), %%rdx\n\t"
        "mulxq	24(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r14\n\t"
        "adoxq	%%rcx, %%r15\n\t"
        "# A[23] x A[4]\n\t"
        "mulxq	32(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r15\n\t"
        "adoxq	%%rcx, %%r10\n\t"
        "movq	%%r11, 184(%[tmp])\n\t"
        "movq	%%r12, 192(%[tmp])\n\t"
        "movq	%%r13, 200(%[tmp])\n\t"
        "movq	%%r14, 208(%[tmp])\n\t"
        "movq	%%r15, 216(%[tmp])\n\t"
        "movq	232(%[tmp]), %%r11\n\t"
        "movq	240(%[tmp]), %%r12\n\t"
        "movq	248(%[tmp]), %%r13\n\t"
        "movq	256(%[tmp]), %%r14\n\t"
        "movq	264(%[tmp]), %%r15\n\t"
        "# A[23] x A[5]\n\t"
        "mulxq	40(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r10\n\t"
        "adoxq	%%rcx, %%r11\n\t"
        "# A[23] x A[6]\n\t"
        "mulxq	48(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r11\n\t"
        "adoxq	%%rcx, %%r12\n\t"
        "# A[23] x A[7]\n\t"
        "mulxq	56(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r12\n\t"
        "adoxq	%%rcx, %%r13\n\t"
        "# A[23] x A[8]\n\t"
        "mulxq	64(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r13\n\t"
        "adoxq	%%rcx, %%r14\n\t"
        "# A[23] x A[9]\n\t"
        "mulxq	72(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r14\n\t"
        "adoxq	%%rcx, %%r15\n\t"
        "movq	%%r10, 224(%[tmp])\n\t"
        "movq	%%r11, 232(%[tmp])\n\t"
        "movq	%%r12, 240(%[tmp])\n\t"
        "movq	%%r13, 248(%[tmp])\n\t"
        "movq	%%r14, 256(%[tmp])\n\t"
        "movq	272(%[tmp]), %%r10\n\t"
        "movq	280(%[tmp]), %%r11\n\t"
        "movq	288(%[tmp]), %%r12\n\t"
        "movq	296(%[tmp]), %%r13\n\t"
        "movq	304(%[tmp]), %%r14\n\t"
        "# A[23] x A[10]\n\t"
        "mulxq	80(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r15\n\t"
        "adoxq	%%rcx, %%r10\n\t"
        "# A[23] x A[11]\n\t"
        "mulxq	88(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r10\n\t"
        "adoxq	%%rcx, %%r11\n\t"
        "# A[23] x A[12]\n\t"
        "mulxq	96(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r11\n\t"
        "adoxq	%%rcx, %%r12\n\t"
        "# A[23] x A[13]\n\t"
        "mulxq	104(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r12\n\t"
        "adoxq	%%rcx, %%r13\n\t"
        "# A[23] x A[14]\n\t"
        "mulxq	112(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r13\n\t"
        "adoxq	%%rcx, %%r14\n\t"
        "movq	%%r15, 264(%[tmp])\n\t"
        "movq	%%r10, 272(%[tmp])\n\t"
        "movq	%%r11, 280(%[tmp])\n\t"
        "movq	%%r12, 288(%[tmp])\n\t"
        "movq	%%r13, 296(%[tmp])\n\t"
        "movq	312(%[tmp]), %%r15\n\t"
        "movq	320(%[tmp]), %%r10\n\t"
        "movq	328(%[tmp]), %%r11\n\t"
        "movq	336(%[tmp]), %%r12\n\t"
        "movq	344(%[tmp]), %%r13\n\t"
        "# A[23] x A[15]\n\t"
        "mulxq	120(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r14\n\t"
        "adoxq	%%rcx, %%r15\n\t"
        "# A[23] x A[16]\n\t"
        "mulxq	128(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r15\n\t"
        "adoxq	%%rcx, %%r10\n\t"
        "# A[23] x A[17]\n\t"
        "mulxq	136(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r10\n\t"
        "adoxq	%%rcx, %%r11\n\t"
        "# A[23] x A[18]\n\t"
        "mulxq	144(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r11\n\t"
        "adoxq	%%rcx, %%r12\n\t"
        "# A[23] x A[19]\n\t"
        "mulxq	152(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r12\n\t"
        "adoxq	%%rcx, %%r13\n\t"
        "movq	%%r14, 304(%[tmp])\n\t"
        "movq	%%r15, 312(%[tmp])\n\t"
        "movq	%%r10, 320(%[tmp])\n\t"
        "movq	%%r11, 328(%[tmp])\n\t"
        "movq	%%r12, 336(%[tmp])\n\t"
        "movq	352(%[tmp]), %%r14\n\t"
        "movq	%%r8, %%r15\n\t"
        "movq	%%r8, %%r10\n\t"
        "# A[23] x A[20]\n\t"
        "mulxq	160(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r13\n\t"
        "adoxq	%%rcx, %%r14\n\t"
        "# A[23] x A[21]\n\t"
        "mulxq	168(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r14\n\t"
        "adoxq	%%rcx, %%r15\n\t"
        "# A[23] x A[22]\n\t"
        "mulxq	176(%[a]), %%rax, %%rcx\n\t"
        "adcxq	%%rax, %%r15\n\t"
        "adoxq	%%rcx, %%r10\n\t"
        "movq	%%r13, 344(%[tmp])\n\t"
        "movq	%%r14, 352(%[tmp])\n\t"
        "movq	%%r15, 360(%[tmp])\n\t"
        "#  Carry\n\t"
        "adcxq	%%r9, %%r10\n\t"
        "movq	%%r8, %%r9\n\t"
        "adcxq	%%r8, %%r9\n\t"
        "adoxq	%%r8, %%r9\n\t"
        "movq	%%r10, 368(%[tmp])\n\t"
        "movq	%%r9, 376(%[tmp])\n\t"
        "xorq	%%r8, %%r8\n\t"
        "# Double and Add in A[i] x A[i]\n\t"
        "movq	8(%[tmp]), %%r11\n\t"
        "# A[0] x A[0]\n\t"
        "movq	0(%[a]), %%rdx\n\t"
        "mulxq	%%rdx, %%rax, %%rcx\n\t"
        "movq	%%rax, 0(%[tmp])\n\t"
        "adoxq	%%r11, %%r11\n\t"
        "adcxq	%%rcx, %%r11\n\t"
        "movq	%%r11, 8(%[tmp])\n\t"
        "movq	16(%[tmp]), %%r10\n\t"
        "movq	24(%[tmp]), %%r11\n\t"
        "# A[1] x A[1]\n\t"
        "movq	8(%[a]), %%rdx\n\t"
        "mulxq	%%rdx, %%rax, %%rcx\n\t"
        "adoxq	%%r10, %%r10\n\t"
        "adcxq	%%rax, %%r10\n\t"
        "adoxq	%%r11, %%r11\n\t"
        "adcxq	%%rcx, %%r11\n\t"
        "movq	%%r10, 16(%[tmp])\n\t"
        "movq	%%r11, 24(%[tmp])\n\t"
        "movq	32(%[tmp]), %%r10\n\t"
        "movq	40(%[tmp]), %%r11\n\t"
        "# A[2] x A[2]\n\t"
        "movq	16(%[a]), %%rdx\n\t"
        "mulxq	%%rdx, %%rax, %%rcx\n\t"
        "adoxq	%%r10, %%r10\n\t"
        "adcxq	%%rax, %%r10\n\t"
        "adoxq	%%r11, %%r11\n\t"
        "adcxq	%%rcx, %%r11\n\t"
        "movq	%%r10, 32(%[tmp])\n\t"
        "movq	%%r11, 40(%[tmp])\n\t"
        "movq	48(%[tmp]), %%r10\n\t"
        "movq	56(%[tmp]), %%r11\n\t"
        "# A[3] x A[3]\n\t"
        "movq	24(%[a]), %%rdx\n\t"
        "mulxq	%%rdx, %%rax, %%rcx\n\t"
        "adoxq	%%r10, %%r10\n\t"
        "adcxq	%%rax, %%r10\n\t"
        "adoxq	%%r11, %%r11\n\t"
        "adcxq	%%rcx, %%r11\n\t"
        "movq	%%r10, 48(%[tmp])\n\t"
        "movq	%%r11, 56(%[tmp])\n\t"
        "movq	64(%[tmp]), %%r10\n\t"
        "movq	72(%[tmp]), %%r11\n\t"
        "# A[4] x A[4]\n\t"
        "movq	32(%[a]), %%rdx\n\t"
        "mulxq	%%rdx, %%rax, %%rcx\n\t"
        "adoxq	%%r10, %%r10\n\t"
        "adcxq	%%rax, %%r10\n\t"
        "adoxq	%%r11, %%r11\n\t"
        "adcxq	%%rcx, %%r11\n\t"
        "movq	%%r10, 64(%[tmp])\n\t"
        "movq	%%r11, 72(%[tmp])\n\t"
        "movq	80(%[tmp]), %%r10\n\t"
        "movq	88(%[tmp]), %%r11\n\t"
        "# A[5] x A[5]\n\t"
        "movq	40(%[a]), %%rdx\n\t"
        "mulxq	%%rdx, %%rax, %%rcx\n\t"
        "adoxq	%%r10, %%r10\n\t"
        "adcxq	%%rax, %%r10\n\t"
        "adoxq	%%r11, %%r11\n\t"
        "adcxq	%%rcx, %%r11\n\t"
        "movq	%%r10, 80(%[tmp])\n\t"
        "movq	%%r11, 88(%[tmp])\n\t"
        "movq	96(%[tmp]), %%r10\n\t"
        "movq	104(%[tmp]), %%r11\n\t"
        "# A[6] x A[6]\n\t"
        "movq	48(%[a]), %%rdx\n\t"
        "mulxq	%%rdx, %%rax, %%rcx\n\t"
        "adoxq	%%r10, %%r10\n\t"
        "adcxq	%%rax, %%r10\n\t"
        "adoxq	%%r11, %%r11\n\t"
        "adcxq	%%rcx, %%r11\n\t"
        "movq	%%r10, 96(%[tmp])\n\t"
        "movq	%%r11, 104(%[tmp])\n\t"
        "movq	112(%[tmp]), %%r10\n\t"
        "movq	120(%[tmp]), %%r11\n\t"
        "# A[7] x A[7]\n\t"
        "movq	56(%[a]), %%rdx\n\t"
        "mulxq	%%rdx, %%rax, %%rcx\n\t"
        "adoxq	%%r10, %%r10\n\t"
        "adcxq	%%rax, %%r10\n\t"
        "adoxq	%%r11, %%r11\n\t"
        "adcxq	%%rcx, %%r11\n\t"
        "movq	%%r10, 112(%[tmp])\n\t"
        "movq	%%r11, 120(%[tmp])\n\t"
        "movq	128(%[tmp]), %%r10\n\t"
        "movq	136(%[tmp]), %%r11\n\t"
        "# A[8] x A[8]\n\t"
        "movq	64(%[a]), %%rdx\n\t"
        "mulxq	%%rdx, %%rax, %%rcx\n\t"
        "adoxq	%%r10, %%r10\n\t"
        "adcxq	%%rax, %%r10\n\t"
        "adoxq	%%r11, %%r11\n\t"
        "adcxq	%%rcx, %%r11\n\t"
        "movq	%%r10, 128(%[tmp])\n\t"
        "movq	%%r11, 136(%[tmp])\n\t"
        "movq	144(%[tmp]), %%r10\n\t"
        "movq	152(%[tmp]), %%r11\n\t"
        "# A[9] x A[9]\n\t"
        "movq	72(%[a]), %%rdx\n\t"
        "mulxq	%%rdx, %%rax, %%rcx\n\t"
        "adoxq	%%r10, %%r10\n\t"
        "adcxq	%%rax, %%r10\n\t"
        "adoxq	%%r11, %%r11\n\t"
        "adcxq	%%rcx, %%r11\n\t"
        "movq	%%r10, 144(%[tmp])\n\t"
        "movq	%%r11, 152(%[tmp])\n\t"
        "movq	160(%[tmp]), %%r10\n\t"
        "movq	168(%[tmp]), %%r11\n\t"
        "# A[10] x A[10]\n\t"
        "movq	80(%[a]), %%rdx\n\t"
        "mulxq	%%rdx, %%rax, %%rcx\n\t"
        "adoxq	%%r10, %%r10\n\t"
        "adcxq	%%rax, %%r10\n\t"
        "adoxq	%%r11, %%r11\n\t"
        "adcxq	%%rcx, %%r11\n\t"
        "movq	%%r10, 160(%[tmp])\n\t"
        "movq	%%r11, 168(%[tmp])\n\t"
        "movq	176(%[tmp]), %%r10\n\t"
        "movq	184(%[tmp]), %%r11\n\t"
        "# A[11] x A[11]\n\t"
        "movq	88(%[a]), %%rdx\n\t"
        "mulxq	%%rdx, %%rax, %%rcx\n\t"
        "adoxq	%%r10, %%r10\n\t"
        "adcxq	%%rax, %%r10\n\t"
        "adoxq	%%r11, %%r11\n\t"
        "adcxq	%%rcx, %%r11\n\t"
        "movq	%%r10, 176(%[tmp])\n\t"
        "movq	%%r11, 184(%[tmp])\n\t"
        "movq	192(%[tmp]), %%r10\n\t"
        "movq	200(%[tmp]), %%r11\n\t"
        "# A[12] x A[12]\n\t"
        "movq	96(%[a]), %%rdx\n\t"
        "mulxq	%%rdx, %%rax, %%rcx\n\t"
        "adoxq	%%r10, %%r10\n\t"
        "adcxq	%%rax, %%r10\n\t"
        "adoxq	%%r11, %%r11\n\t"
        "adcxq	%%rcx, %%r11\n\t"
        "movq	%%r10, 192(%[r])\n\t"
        "movq	%%r11, 200(%[r])\n\t"
        "movq	208(%[tmp]), %%r10\n\t"
        "movq	216(%[tmp]), %%r11\n\t"
        "# A[13] x A[13]\n\t"
        "movq	104(%[a]), %%rdx\n\t"
        "mulxq	%%rdx, %%rax, %%rcx\n\t"
        "adoxq	%%r10, %%r10\n\t"
        "adcxq	%%rax, %%r10\n\t"
        "adoxq	%%r11, %%r11\n\t"
        "adcxq	%%rcx, %%r11\n\t"
        "movq	%%r10, 208(%[r])\n\t"
        "movq	%%r11, 216(%[r])\n\t"
        "movq	224(%[tmp]), %%r10\n\t"
        "movq	232(%[tmp]), %%r11\n\t"
        "# A[14] x A[14]\n\t"
        "movq	112(%[a]), %%rdx\n\t"
        "mulxq	%%rdx, %%rax, %%rcx\n\t"
        "adoxq	%%r10, %%r10\n\t"
        "adcxq	%%rax, %%r10\n\t"
        "adoxq	%%r11, %%r11\n\t"
        "adcxq	%%rcx, %%r11\n\t"
        "movq	%%r10, 224(%[r])\n\t"
        "movq	%%r11, 232(%[r])\n\t"
        "movq	240(%[tmp]), %%r10\n\t"
        "movq	248(%[tmp]), %%r11\n\t"
        "# A[15] x A[15]\n\t"
        "movq	120(%[a]), %%rdx\n\t"
        "mulxq	%%rdx, %%rax, %%rcx\n\t"
        "adoxq	%%r10, %%r10\n\t"
        "adcxq	%%rax, %%r10\n\t"
        "adoxq	%%r11, %%r11\n\t"
        "adcxq	%%rcx, %%r11\n\t"
        "movq	%%r10, 240(%[r])\n\t"
        "movq	%%r11, 248(%[r])\n\t"
        "movq	256(%[tmp]), %%r10\n\t"
        "movq	264(%[tmp]), %%r11\n\t"
        "# A[16] x A[16]\n\t"
        "movq	128(%[a]), %%rdx\n\t"
        "mulxq	%%rdx, %%rax, %%rcx\n\t"
        "adoxq	%%r10, %%r10\n\t"
        "adcxq	%%rax, %%r10\n\t"
        "adoxq	%%r11, %%r11\n\t"
        "adcxq	%%rcx, %%r11\n\t"
        "movq	%%r10, 256(%[r])\n\t"
        "movq	%%r11, 264(%[r])\n\t"
        "movq	272(%[tmp]), %%r10\n\t"
        "movq	280(%[tmp]), %%r11\n\t"
        "# A[17] x A[17]\n\t"
        "movq	136(%[a]), %%rdx\n\t"
        "mulxq	%%rdx, %%rax, %%rcx\n\t"
        "adoxq	%%r10, %%r10\n\t"
        "adcxq	%%rax, %%r10\n\t"
        "adoxq	%%r11, %%r11\n\t"
        "adcxq	%%rcx, %%r11\n\t"
        "movq	%%r10, 272(%[r])\n\t"
        "movq	%%r11, 280(%[r])\n\t"
        "movq	288(%[tmp]), %%r10\n\t"
        "movq	296(%[tmp]), %%r11\n\t"
        "# A[18] x A[18]\n\t"
        "movq	144(%[a]), %%rdx\n\t"
        "mulxq	%%rdx, %%rax, %%rcx\n\t"
        "adoxq	%%r10, %%r10\n\t"
        "adcxq	%%rax, %%r10\n\t"
        "adoxq	%%r11, %%r11\n\t"
        "adcxq	%%rcx, %%r11\n\t"
        "movq	%%r10, 288(%[r])\n\t"
        "movq	%%r11, 296(%[r])\n\t"
        "movq	304(%[tmp]), %%r10\n\t"
        "movq	312(%[tmp]), %%r11\n\t"
        "# A[19] x A[19]\n\t"
        "movq	152(%[a]), %%rdx\n\t"
        "mulxq	%%rdx, %%rax, %%rcx\n\t"
        "adoxq	%%r10, %%r10\n\t"
        "adcxq	%%rax, %%r10\n\t"
        "adoxq	%%r11, %%r11\n\t"
        "adcxq	%%rcx, %%r11\n\t"
        "movq	%%r10, 304(%[r])\n\t"
        "movq	%%r11, 312(%[r])\n\t"
        "movq	320(%[tmp]), %%r10\n\t"
        "movq	328(%[tmp]), %%r11\n\t"
        "# A[20] x A[20]\n\t"
        "movq	160(%[a]), %%rdx\n\t"
        "mulxq	%%rdx, %%rax, %%rcx\n\t"
        "adoxq	%%r10, %%r10\n\t"
        "adcxq	%%rax, %%r10\n\t"
        "adoxq	%%r11, %%r11\n\t"
        "adcxq	%%rcx, %%r11\n\t"
        "movq	%%r10, 320(%[r])\n\t"
        "movq	%%r11, 328(%[r])\n\t"
        "movq	336(%[tmp]), %%r10\n\t"
        "movq	344(%[tmp]), %%r11\n\t"
        "# A[21] x A[21]\n\t"
        "movq	168(%[a]), %%rdx\n\t"
        "mulxq	%%rdx, %%rax, %%rcx\n\t"
        "adoxq	%%r10, %%r10\n\t"
        "adcxq	%%rax, %%r10\n\t"
        "adoxq	%%r11, %%r11\n\t"
        "adcxq	%%rcx, %%r11\n\t"
        "movq	%%r10, 336(%[r])\n\t"
        "movq	%%r11, 344(%[r])\n\t"
        "movq	352(%[tmp]), %%r10\n\t"
        "movq	360(%[tmp]), %%r11\n\t"
        "# A[22] x A[22]\n\t"
        "movq	176(%[a]), %%rdx\n\t"
        "mulxq	%%rdx, %%rax, %%rcx\n\t"
        "adoxq	%%r10, %%r10\n\t"
        "adcxq	%%rax, %%r10\n\t"
        "adoxq	%%r11, %%r11\n\t"
        "adcxq	%%rcx, %%r11\n\t"
        "movq	%%r10, 352(%[r])\n\t"
        "movq	%%r11, 360(%[r])\n\t"
        "movq	368(%[tmp]), %%r10\n\t"
        "movq	376(%[tmp]), %%r11\n\t"
        "# A[23] x A[23]\n\t"
        "movq	184(%[a]), %%rdx\n\t"
        "mulxq	%%rdx, %%rax, %%rcx\n\t"
        "adoxq	%%r10, %%r10\n\t"
        "adcxq	%%rax, %%r10\n\t"
        "adoxq	%%r11, %%r11\n\t"
        "adcxq	%%rcx, %%r11\n\t"
        "movq	%%r10, 368(%[r])\n\t"
        "movq	%%r11, 376(%[r])\n\t"
        :
        : [r] "r" (r), [a] "r" (a), [tmp] "r" (tmp)
        : "memory", "rax", "rdx", "rcx", "r8", "r9", "r10", "r11",
          "r12", "r13", "r14", "r15"
    );

    XMEMCPY(r, tmp, sizeof(tmp)/2);
}
#endif /* HAVE_INTEL_AVX2 */

/* One add-with-carry step on the 64-bit digit at byte offset 'off':
 * load the digit of a, add the digit of b plus the carry flag, store to r.
 */
#define SP_3072_ADD_24_DIGIT(off)                       \
        "movq\t" #off "(%[a]), %%rax\n\t"               \
        "adcq\t" #off "(%[b]), %%rax\n\t"               \
        "movq\t%%rax, " #off "(%[r])\n\t"

/* Compute r = a + b over 24 64-bit digits and report the carry out.
 *
 * Digits are processed from least to most significant; each digit of a and b
 * is read before the matching digit of r is stored.
 *
 * r  A single precision integer that receives the 24 digit sum.
 * a  A single precision integer (first addend).
 * b  A single precision integer (second addend).
 * returns the carry out of the most significant digit (0 or 1).
 */
SP_NOINLINE static sp_digit sp_3072_add_24(sp_digit* r, const sp_digit* a,
        const sp_digit* b)
{
    sp_digit carry = 0;

    __asm__ __volatile__ (
        /* Digit 0 starts the chain with a plain add (no incoming carry). */
        "movq\t(%[a]), %%rax\n\t"
        "addq\t(%[b]), %%rax\n\t"
        "movq\t%%rax, (%[r])\n\t"
        /* Digits 1..23 propagate the carry flag from the previous digit. */
        SP_3072_ADD_24_DIGIT(8)
        SP_3072_ADD_24_DIGIT(16)
        SP_3072_ADD_24_DIGIT(24)
        SP_3072_ADD_24_DIGIT(32)
        SP_3072_ADD_24_DIGIT(40)
        SP_3072_ADD_24_DIGIT(48)
        SP_3072_ADD_24_DIGIT(56)
        SP_3072_ADD_24_DIGIT(64)
        SP_3072_ADD_24_DIGIT(72)
        SP_3072_ADD_24_DIGIT(80)
        SP_3072_ADD_24_DIGIT(88)
        SP_3072_ADD_24_DIGIT(96)
        SP_3072_ADD_24_DIGIT(104)
        SP_3072_ADD_24_DIGIT(112)
        SP_3072_ADD_24_DIGIT(120)
        SP_3072_ADD_24_DIGIT(128)
        SP_3072_ADD_24_DIGIT(136)
        SP_3072_ADD_24_DIGIT(144)
        SP_3072_ADD_24_DIGIT(152)
        SP_3072_ADD_24_DIGIT(160)
        SP_3072_ADD_24_DIGIT(168)
        SP_3072_ADD_24_DIGIT(176)
        SP_3072_ADD_24_DIGIT(184)
        /* Fold the final carry flag into the (zero-initialised) result. */
        "adcq\t$0, %[carry]\n\t"
        : [carry] "+r" (carry)
        : [r] "r" (r), [a] "r" (a), [b] "r" (b)
        : "memory", "rax"
    );

    return carry;
}

#undef SP_3072_ADD_24_DIGIT

/* Sub b from a into a. (a -= b)
 *
 * a  A single precision integer and result.
 * b  A single precision integer.
 */
SP_NOINLINE static sp_digit sp_3072_sub_in_place_48(sp_digit* a,
    const sp_digit* b)
{
    sp_digit c = 0;

    __asm__ __volatile__ (
        "movq	0(%[a]), %%r8\n\t"
        "movq	8(%[a]), %%r9\n\t"
        "movq	0(%[b]), %%rdx\n\t"
        "movq	8(%[b]), %%rcx\n\t"
        "subq	%%rdx, %%r8\n\t"
        "movq	16(%[b]), %%rdx\n\t"
        "movq	%%r8, 0(%[a])\n\t"
        "movq	16(%[a]), %%r8\n\t"
        "sbbq	%%rcx, %%r9\n\t"
        "movq	24(%[b]), %%rcx\n\t"
        "movq	%%r9, 8(%[a])\n\t"
        "movq	24(%[a]), %%r9\n\t"
        "sbbq	%%rdx, %%r8\n\t"
        "movq	32(%[b]), %%rdx\n\t"
        "movq	%%r8, 16(%[a])\n\t"
        "movq	32(%[a]), %%r8\n\t"
        "sbbq	%%rcx, %%r9\n\t"
        "movq	40(%[b]), %%rcx\n\t"
        "movq	%%r9, 24(%[a])\n\t"
        "movq	40(%[a]), %%r9\n\t"
        "sbbq	%%rdx, %%r8\n\t"
        "movq	48(%[b]), %%rdx\n\t"
        "movq	%%r8, 32(%[a])\n\t"
        "movq	48(%[a]), %%r8\n\t"
        "sbbq	%%rcx, %%r9\n\t"
        "movq	56(%[b]), %%rcx\n\t"
        "movq	%%r9, 40(%[a])\n\t"
        "movq	56(%[a]), %%r9\n\t"
        "sbbq	%%rdx, %%r8\n\t"
        "movq	64(%[b]), %%rdx\n\t"
        "movq	%%r8, 48(%[a])\n\t"
        "movq	64(%[a]), %%r8\n\t"
        "sbbq	%%rcx, %%r9\n\t"
        "movq	72(%[b]), %%rcx\n\t"
        "movq	%%r9, 56(%[a])\n\t"
        "movq	72(%[a]), %%r9\n\t"
        "sbbq	%%rdx, %%r8\n\t"
        "movq	80(%[b]), %%rdx\n\t"
        "movq	%%r8, 64(%[a])\n\t"
        "movq	80(%[a]), %%r8\n\t"
        "sbbq	%%rcx, %%r9\n\t"
        "movq	88(%[b]), %%rcx\n\t"
        "movq	%%r9, 72(%[a])\n\t"
        "movq	88(%[a]), %%r9\n\t"
        "sbbq	%%rdx, %%r8\n\t"
        "movq	96(%[b]), %%rdx\n\t"
        "movq	%%r8, 80(%[a])\n\t"
        "movq	96(%[a]), %%r8\n\t"
        "sbbq	%%rcx, %%r9\n\t"
        "movq	104(%[b]), %%rcx\n\t"
        "movq	%%r9, 88(%[a])\n\t"
        "movq	104(%[a]), %%r9\n\t"
        "sbbq	%%rdx, %%r8\n\t"
        "movq	112(%[b]), %%rdx\n\t"
        "movq	%%r8, 96(%[a])\n\t"
        "movq	112(%[a]), %%r8\n\t"
        "sbbq	%%rcx, %%r9\n\t"
        "movq	120(%[b]), %%rcx\n\t"
        "movq	%%r9, 104(%[a])\n\t"
        "movq	120(%[a]), %%r9\n\t"
        "sbbq	%%rdx, %%r8\n\t"
        "movq	128(%[b]), %%rdx\n\t"
        "movq	%%r8, 112(%[a])\n\t"
        "movq	128(%[a]), %%r8\n\t"
        "sbbq	%%rcx, %%r9\n\t"
        "movq	136(%[b]), %%rcx\n\t"
        "movq	%%r9, 120(%[a])\n\t"
        "movq	136(%[a]), %%r9\n\t"
        "sbbq	%%rdx, %%r8\n\t"
        "movq	144(%[b]), %%rdx\n\t"
        "movq	%%r8, 128(%[a])\n\t"
        "movq	144(%[a]), %%r8\n\t"
        "sbbq	%%rcx, %%r9\n\t"
        "movq	152(%[b]), %%rcx\n\t"
        "movq	%%r9, 136(%[a])\n\t"
        "movq	152(%[a]), %%r9\n\t"
        "sbbq	%%rdx, %%r8\n\t"
        "movq	160(%[b]), %%rdx\n\t"
        "movq	%%r8, 144(%[a])\n\t"
        "movq	160(%[a]), %%r8\n\t"
        "sbbq	%%rcx, %%r9\n\t"
        "movq	168(%[b]), %%rcx\n\t"
        "movq	%%r9, 152(%[a])\n\t"
        "movq	168(%[a]), %%r9\n\t"
        "sbbq	%%rdx, %%r8\n\t"
        "movq	176(%[b]), %%rdx\n\t"
        "movq	%%r8, 160(%[a])\n\t"
        "movq	176(%[a]), %%r8\n\t"
        "sbbq	%%rcx, %%r9\n\t"
        "movq	184(%[b]), %%rcx\n\t"
        "movq	%%r9, 168(%[a])\n\t"
        "movq	184(%[a]), %%r9\n\t"
        "sbbq	%%rdx, %%r8\n\t"
        "movq	192(%[b]), %%rdx\n\t"
        "movq	%%r8, 176(%[a])\n\t"
        "movq	192(%[a]), %%r8\n\t"
        "sbbq	%%rcx, %%r9\n\t"
        "movq	200(%[b]), %%rcx\n\t"
        "movq	%%r9, 184(%[a])\n\t"
        "movq	200(%[a]), %%r9\n\t"
        "sbbq	%%rdx, %%r8\n\t"
        "movq	208(%[b]), %%rdx\n\t"
        "movq	%%r8, 192(%[a])\n\t"
        "movq	208(%[a]), %%r8\n\t"
        "sbbq	%%rcx, %%r9\n\t"
        "movq	216(%[b]), %%rcx\n\t"
        "movq	%%r9, 200(%[a])\n\t"
        "movq	216(%[a]), %%r9\n\t"
        "sbbq	%%rdx, %%r8\n\t"
        "movq	224(%[b]), %%rdx\n\t"
        "movq	%%r8, 208(%[a])\n\t"
        "movq	224(%[a]), %%r8\n\t"
        "sbbq	%%rcx, %%r9\n\t"
        "movq	232(%[b]), %%rcx\n\t"
        "movq	%%r9, 216(%[a])\n\t"
        "movq	232(%[a]), %%r9\n\t"
        "sbbq	%%rdx, %%r8\n\t"
        "movq	240(%[b]), %%rdx\n\t"
        "movq	%%r8, 224(%[a])\n\t"
        "movq	240(%[a]), %%r8\n\t"
        "sbbq	%%rcx, %%r9\n\t"
        "movq	248(%[b]), %%rcx\n\t"
        "movq	%%r9, 232(%[a])\n\t"
        "movq	248(%[a]), %%r9\n\t"
        "sbbq	%%rdx, %%r8\n\t"
        "movq	256(%[b]), %%rdx\n\t"
        "movq	%%r8, 240(%[a])\n\t"
        "movq	256(%[a]), %%r8\n\t"
        "sbbq	%%rcx, %%r9\n\t"
        "movq	264(%[b]), %%rcx\n\t"
        "movq	%%r9, 248(%[a])\n\t"
        "movq	264(%[a]), %%r9\n\t"
        "sbbq	%%rdx, %%r8\n\t"
        "movq	272(%[b]), %%rdx\n\t"
        "movq	%%r8, 256(%[a])\n\t"
        "movq	272(%[a]), %%r8\n\t"
        "sbbq	%%rcx, %%r9\n\t"
        "movq	280(%[b]), %%rcx\n\t"
        "movq	%%r9, 264(%[a])\n\t"
        "movq	280(%[a]), %%r9\n\t"
        "sbbq	%%rdx, %%r8\n\t"
        "movq	288(%[b]), %%rdx\n\t"
        "movq	%%r8, 272(%[a])\n\t"
        "movq	288(%[a]), %%r8\n\t"
        "sbbq	%%rcx, %%r9\n\t"
        "movq	296(%[b]), %%rcx\n\t"
        "movq	%%r9, 280(%[a])\n\t"
        "movq	296(%[a]), %%r9\n\t"
        "sbbq	%%rdx, %%r8\n\t"
        "movq	304(%[b]), %%rdx\n\t"
        "movq	%%r8, 288(%[a])\n\t"
        "movq	304(%[a]), %%r8\n\t"
        "sbbq	%%rcx, %%r9\n\t"
        "movq	312(%[b]), %%rcx\n\t"
        "movq	%%r9, 296(%[a])\n\t"
        "movq	312(%[a]), %%r9\n\t"
        "sbbq	%%rdx, %%r8\n\t"
        "movq	320(%[b]), %%rdx\n\t"
        "movq	%%r8, 304(%[a])\n\t"
        "movq	320(%[a]), %%r8\n\t"
        "sbbq	%%rcx, %%r9\n\t"
        "movq	328(%[b]), %%rcx\n\t"
        "movq	%%r9, 312(%[a])\n\t"
        "movq	328(%[a]), %%r9\n\t"
        "sbbq	%%rdx, %%r8\n\t"
        "movq	336(%[b]), %%rdx\n\t"
        "movq	%%r8, 320(%[a])\n\t"
        "movq	336(%[a]), %%r8\n\t"
        "sbbq	%%rcx, %%r9\n\t"
        "movq	344(%[b]), %%rcx\n\t"
        "movq	%%r9, 328(%[a])\n\t"
        "movq	344(%[a]), %%r9\n\t"
        "sbbq	%%rdx, %%r8\n\t"
        "movq	352(%[b]), %%rdx\n\t"
        "movq	%%r8, 336(%[a])\n\t"
        "movq	352(%[a]), %%r8\n\t"
        "sbbq	%%rcx, %%r9\n\t"
        "movq	360(%[b]), %%rcx\n\t"
        "movq	%%r9, 344(%[a])\n\t"
        "movq	360(%[a]), %%r9\n\t"
        "sbbq	%%rdx, %%r8\n\t"
        "movq	368(%[b]), %%rdx\n\t"
        "movq	%%r8, 352(%[a])\n\t"
        "movq	368(%[a]), %%r8\n\t"
        "sbbq	%%rcx, %%r9\n\t"
        "movq	376(%[b]), %%rcx\n\t"
        "movq	%%r9, 360(%[a])\n\t"
        "movq	376(%[a]), %%r9\n\t"
        "sbbq	%%rdx, %%r8\n\t"
        "movq	%%r8, 368(%[a])\n\t"
        "sbbq	%%rcx, %%r9\n\t"
        "movq	%%r9, 376(%[a])\n\t"
        "sbbq	$0, %[c]\n\t"
        : [c] "+r" (c)
        : [a] "r" (a), [b] "r" (b)
        : "memory", "rdx", "rcx", "r8", "r9"
    );

    return c;
}

/* Add b to a into r. (r = a + b)
 *
 * Fully unrolled add-with-carry chain over 48 64-bit digits (byte offsets
 * 0 to 376). For every digit the operands are read before the matching digit
 * of r is written, so r may be exactly the same array as a or b.
 *
 * r  A single precision integer that receives the 48 digit sum.
 * a  A single precision integer (48 digits).
 * b  A single precision integer (48 digits).
 *
 * Returns the carry out of the top digit: 0 or 1.
 */
SP_NOINLINE static sp_digit sp_3072_add_48(sp_digit* r, const sp_digit* a,
        const sp_digit* b)
{
    /* Starts at zero; the final adcq folds the carry flag into it. */
    sp_digit c = 0;

    __asm__ __volatile__ (
        /* Digit 0 uses addq (no incoming carry); all others use adcq. */
        "movq	(%[a]), %%rax\n\t"
        "addq	(%[b]), %%rax\n\t"
        "movq	%%rax, (%[r])\n\t"
        "movq	8(%[a]), %%rax\n\t"
        "adcq	8(%[b]), %%rax\n\t"
        "movq	%%rax, 8(%[r])\n\t"
        "movq	16(%[a]), %%rax\n\t"
        "adcq	16(%[b]), %%rax\n\t"
        "movq	%%rax, 16(%[r])\n\t"
        "movq	24(%[a]), %%rax\n\t"
        "adcq	24(%[b]), %%rax\n\t"
        "movq	%%rax, 24(%[r])\n\t"
        "movq	32(%[a]), %%rax\n\t"
        "adcq	32(%[b]), %%rax\n\t"
        "movq	%%rax, 32(%[r])\n\t"
        "movq	40(%[a]), %%rax\n\t"
        "adcq	40(%[b]), %%rax\n\t"
        "movq	%%rax, 40(%[r])\n\t"
        "movq	48(%[a]), %%rax\n\t"
        "adcq	48(%[b]), %%rax\n\t"
        "movq	%%rax, 48(%[r])\n\t"
        "movq	56(%[a]), %%rax\n\t"
        "adcq	56(%[b]), %%rax\n\t"
        "movq	%%rax, 56(%[r])\n\t"
        "movq	64(%[a]), %%rax\n\t"
        "adcq	64(%[b]), %%rax\n\t"
        "movq	%%rax, 64(%[r])\n\t"
        "movq	72(%[a]), %%rax\n\t"
        "adcq	72(%[b]), %%rax\n\t"
        "movq	%%rax, 72(%[r])\n\t"
        "movq	80(%[a]), %%rax\n\t"
        "adcq	80(%[b]), %%rax\n\t"
        "movq	%%rax, 80(%[r])\n\t"
        "movq	88(%[a]), %%rax\n\t"
        "adcq	88(%[b]), %%rax\n\t"
        "movq	%%rax, 88(%[r])\n\t"
        "movq	96(%[a]), %%rax\n\t"
        "adcq	96(%[b]), %%rax\n\t"
        "movq	%%rax, 96(%[r])\n\t"
        "movq	104(%[a]), %%rax\n\t"
        "adcq	104(%[b]), %%rax\n\t"
        "movq	%%rax, 104(%[r])\n\t"
        "movq	112(%[a]), %%rax\n\t"
        "adcq	112(%[b]), %%rax\n\t"
        "movq	%%rax, 112(%[r])\n\t"
        "movq	120(%[a]), %%rax\n\t"
        "adcq	120(%[b]), %%rax\n\t"
        "movq	%%rax, 120(%[r])\n\t"
        "movq	128(%[a]), %%rax\n\t"
        "adcq	128(%[b]), %%rax\n\t"
        "movq	%%rax, 128(%[r])\n\t"
        "movq	136(%[a]), %%rax\n\t"
        "adcq	136(%[b]), %%rax\n\t"
        "movq	%%rax, 136(%[r])\n\t"
        "movq	144(%[a]), %%rax\n\t"
        "adcq	144(%[b]), %%rax\n\t"
        "movq	%%rax, 144(%[r])\n\t"
        "movq	152(%[a]), %%rax\n\t"
        "adcq	152(%[b]), %%rax\n\t"
        "movq	%%rax, 152(%[r])\n\t"
        "movq	160(%[a]), %%rax\n\t"
        "adcq	160(%[b]), %%rax\n\t"
        "movq	%%rax, 160(%[r])\n\t"
        "movq	168(%[a]), %%rax\n\t"
        "adcq	168(%[b]), %%rax\n\t"
        "movq	%%rax, 168(%[r])\n\t"
        "movq	176(%[a]), %%rax\n\t"
        "adcq	176(%[b]), %%rax\n\t"
        "movq	%%rax, 176(%[r])\n\t"
        "movq	184(%[a]), %%rax\n\t"
        "adcq	184(%[b]), %%rax\n\t"
        "movq	%%rax, 184(%[r])\n\t"
        "movq	192(%[a]), %%rax\n\t"
        "adcq	192(%[b]), %%rax\n\t"
        "movq	%%rax, 192(%[r])\n\t"
        "movq	200(%[a]), %%rax\n\t"
        "adcq	200(%[b]), %%rax\n\t"
        "movq	%%rax, 200(%[r])\n\t"
        "movq	208(%[a]), %%rax\n\t"
        "adcq	208(%[b]), %%rax\n\t"
        "movq	%%rax, 208(%[r])\n\t"
        "movq	216(%[a]), %%rax\n\t"
        "adcq	216(%[b]), %%rax\n\t"
        "movq	%%rax, 216(%[r])\n\t"
        "movq	224(%[a]), %%rax\n\t"
        "adcq	224(%[b]), %%rax\n\t"
        "movq	%%rax, 224(%[r])\n\t"
        "movq	232(%[a]), %%rax\n\t"
        "adcq	232(%[b]), %%rax\n\t"
        "movq	%%rax, 232(%[r])\n\t"
        "movq	240(%[a]), %%rax\n\t"
        "adcq	240(%[b]), %%rax\n\t"
        "movq	%%rax, 240(%[r])\n\t"
        "movq	248(%[a]), %%rax\n\t"
        "adcq	248(%[b]), %%rax\n\t"
        "movq	%%rax, 248(%[r])\n\t"
        "movq	256(%[a]), %%rax\n\t"
        "adcq	256(%[b]), %%rax\n\t"
        "movq	%%rax, 256(%[r])\n\t"
        "movq	264(%[a]), %%rax\n\t"
        "adcq	264(%[b]), %%rax\n\t"
        "movq	%%rax, 264(%[r])\n\t"
        "movq	272(%[a]), %%rax\n\t"
        "adcq	272(%[b]), %%rax\n\t"
        "movq	%%rax, 272(%[r])\n\t"
        "movq	280(%[a]), %%rax\n\t"
        "adcq	280(%[b]), %%rax\n\t"
        "movq	%%rax, 280(%[r])\n\t"
        "movq	288(%[a]), %%rax\n\t"
        "adcq	288(%[b]), %%rax\n\t"
        "movq	%%rax, 288(%[r])\n\t"
        "movq	296(%[a]), %%rax\n\t"
        "adcq	296(%[b]), %%rax\n\t"
        "movq	%%rax, 296(%[r])\n\t"
        "movq	304(%[a]), %%rax\n\t"
        "adcq	304(%[b]), %%rax\n\t"
        "movq	%%rax, 304(%[r])\n\t"
        "movq	312(%[a]), %%rax\n\t"
        "adcq	312(%[b]), %%rax\n\t"
        "movq	%%rax, 312(%[r])\n\t"
        "movq	320(%[a]), %%rax\n\t"
        "adcq	320(%[b]), %%rax\n\t"
        "movq	%%rax, 320(%[r])\n\t"
        "movq	328(%[a]), %%rax\n\t"
        "adcq	328(%[b]), %%rax\n\t"
        "movq	%%rax, 328(%[r])\n\t"
        "movq	336(%[a]), %%rax\n\t"
        "adcq	336(%[b]), %%rax\n\t"
        "movq	%%rax, 336(%[r])\n\t"
        "movq	344(%[a]), %%rax\n\t"
        "adcq	344(%[b]), %%rax\n\t"
        "movq	%%rax, 344(%[r])\n\t"
        "movq	352(%[a]), %%rax\n\t"
        "adcq	352(%[b]), %%rax\n\t"
        "movq	%%rax, 352(%[r])\n\t"
        "movq	360(%[a]), %%rax\n\t"
        "adcq	360(%[b]), %%rax\n\t"
        "movq	%%rax, 360(%[r])\n\t"
        "movq	368(%[a]), %%rax\n\t"
        "adcq	368(%[b]), %%rax\n\t"
        "movq	%%rax, 368(%[r])\n\t"
        "movq	376(%[a]), %%rax\n\t"
        "adcq	376(%[b]), %%rax\n\t"
        "movq	%%rax, 376(%[r])\n\t"
        /* c = 0 + 0 + carry flag: the carry out of digit 47. */
        "adcq	$0, %[c]\n\t"
        : [c] "+r" (c)
        : [r] "r" (r), [a] "r" (a), [b] "r" (b)
        : "memory", "rax"
    );

    return c;
}

/* Bitwise AND every digit of a with the mask m and write the result to r.
 *
 * Digits are processed one at a time from index 0 upwards, each read before
 * the matching write, so r may be the same array as a.
 *
 * r  A single precision integer that receives the 24 masked digits.
 * a  A single precision integer (24 digits).
 * m  Mask to AND against each digit.
 */
static void sp_3072_mask_24(sp_digit* r, sp_digit* a, sp_digit m)
{
#ifdef WOLFSSL_SP_SMALL
    int idx = 0;

    /* Compact form: one digit per iteration. */
    while (idx < 24) {
        r[idx] = a[idx] & m;
        idx++;
    }
#else
    int base;

    /* Unrolled form: three passes of eight digits each. */
    for (base = 0; base < 24; base += 8) {
        sp_digit* dst = r + base;
        sp_digit* src = a + base;

        dst[0] = src[0] & m;
        dst[1] = src[1] & m;
        dst[2] = src[2] & m;
        dst[3] = src[3] & m;
        dst[4] = src[4] & m;
        dst[5] = src[5] & m;
        dst[6] = src[6] & m;
        dst[7] = src[7] & m;
    }
#endif
}

/* Multiply a and b into r. (r = a * b)
 *
 * Karatsuba split into 24 digit halves: with a = a0 + a1*B and
 * b = b0 + b1*B (B = 24 digits), three half-size products are formed,
 * a0*b0, a1*b1 and (a0 + a1)*(b0 + b1), and the middle term is recovered by
 * subtracting the first two from the third.
 *
 * The low product is written into r only after every read of a and b has
 * been done, so the statement order below must be kept.
 *
 * r  A single precision integer that receives the 96 digit product.
 * a  A single precision integer (48 digits).
 * b  A single precision integer (48 digits).
 */
SP_NOINLINE static void sp_3072_mul_48(sp_digit* r, const sp_digit* a,
        const sp_digit* b)
{
    sp_digit mid[48];       /* (a0 + a1) * (b0 + b1), then the middle term */
    sp_digit hi[48];        /* a1 * b1 */
    sp_digit a_sum[24];     /* a0 + a1 without its carry */
    sp_digit b_sum[24];     /* b0 + b1 without its carry */
    sp_digit carry_a;
    sp_digit carry_b;
    sp_digit top;

    /* Half sums; the carries are handled separately further down. */
    carry_a = sp_3072_add_24(a_sum, a, a + 24);
    carry_b = sp_3072_add_24(b_sum, b, b + 24);
    top = carry_a & carry_b;

    /* The three half-size products; the one landing in r goes last. */
    sp_3072_mul_24(mid, a_sum, b_sum);
    sp_3072_mul_24(hi, a + 24, b + 24);
    sp_3072_mul_24(r, a, b);

    /* Carry correction: r[48..71] = (carry_b ? a_sum : 0) +
     * (carry_a ? b_sum : 0), selected with masks rather than branches. */
    sp_3072_mask_24(r + 48, a_sum, (sp_digit)0 - carry_b);
    sp_3072_mask_24(b_sum, b_sum, (sp_digit)0 - carry_a);
    top += sp_3072_add_24(r + 48, r + 48, b_sum);

    /* mid -= hi; mid -= lo; then fold the middle term in at digit 24. */
    top += sp_3072_sub_in_place_48(mid, hi);
    top += sp_3072_sub_in_place_48(mid, r);
    top += sp_3072_add_48(r + 24, r + 24, mid);

    /* Accumulated carries/borrows go in digit 72; digits 73..95 start at 0. */
    r[72] = top;
    XMEMSET(r + 73, 0, sizeof(sp_digit) * 23);

    /* Finally add the high product at digit 48. */
    sp_3072_add_48(r + 48, r + 48, hi);
}

/* Square a and put result in r. (r = a * a)
 *
 * Karatsuba split into 24 digit halves: with a = a0 + a1*B (B = 24 digits),
 * three half-size squares are formed, a0^2, a1^2 and (a0 + a1)^2, and the
 * middle term is recovered by subtracting the first two from the third.
 *
 * The low square is written into r only after every read of a has been
 * done, so the statement order below must be kept.
 *
 * r  A single precision integer that receives the 96 digit square.
 * a  A single precision integer (48 digits).
 */
SP_NOINLINE static void sp_3072_sqr_48(sp_digit* r, const sp_digit* a)
{
    sp_digit hi[48];        /* a1^2 */
    sp_digit mid[48];       /* (a0 + a1)^2, then the middle term */
    sp_digit a_sum[24];     /* a0 + a1 without its carry */
    sp_digit top;

    /* Half sum; its carry is handled separately further down. */
    top = sp_3072_add_24(a_sum, a, a + 24);

    /* The three half-size squares; the one landing in r goes last. */
    sp_3072_sqr_24(mid, a_sum);
    sp_3072_sqr_24(hi, a + 24);
    sp_3072_sqr_24(r, a);

    /* Carry correction: r[48..71] = carry ? 2 * a_sum : 0. */
    sp_3072_mask_24(r + 48, a_sum, (sp_digit)0 - top);
    top += sp_3072_add_24(r + 48, r + 48, r + 48);

    /* mid -= hi; mid -= lo; then fold the middle term in at digit 24. */
    top += sp_3072_sub_in_place_48(mid, hi);
    top += sp_3072_sub_in_place_48(mid, r);
    top += sp_3072_add_48(r + 24, r + 24, mid);

    /* Accumulated carries/borrows go in digit 72; digits 73..95 start at 0. */
    r[72] = top;
    XMEMSET(r + 73, 0, sizeof(sp_digit) * 23);

    /* Finally add the high square at digit 48. */
    sp_3072_add_48(r + 48, r + 48, hi);
}

#ifdef HAVE_INTEL_AVX2
/* Multiply a and b into r. (r = a * b)
 *
 * Same Karatsuba split as sp_3072_mul_48, but the three half-size products
 * are computed with sp_3072_mul_avx2_24.
 *
 * The low product is written into r only after every read of a and b has
 * been done, so the statement order below must be kept.
 *
 * r  A single precision integer that receives the 96 digit product.
 * a  A single precision integer (48 digits).
 * b  A single precision integer (48 digits).
 */
SP_NOINLINE static void sp_3072_mul_avx2_48(sp_digit* r, const sp_digit* a,
        const sp_digit* b)
{
    sp_digit mid[48];       /* (a0 + a1) * (b0 + b1), then the middle term */
    sp_digit hi[48];        /* a1 * b1 */
    sp_digit a_sum[24];     /* a0 + a1 without its carry */
    sp_digit b_sum[24];     /* b0 + b1 without its carry */
    sp_digit carry_a;
    sp_digit carry_b;
    sp_digit top;

    /* Half sums; the carries are handled separately further down. */
    carry_a = sp_3072_add_24(a_sum, a, a + 24);
    carry_b = sp_3072_add_24(b_sum, b, b + 24);
    top = carry_a & carry_b;

    /* The three half-size products; the one landing in r goes last. */
    sp_3072_mul_avx2_24(mid, a_sum, b_sum);
    sp_3072_mul_avx2_24(hi, a + 24, b + 24);
    sp_3072_mul_avx2_24(r, a, b);

    /* Carry correction: r[48..71] = (carry_b ? a_sum : 0) +
     * (carry_a ? b_sum : 0), selected with masks rather than branches. */
    sp_3072_mask_24(r + 48, a_sum, (sp_digit)0 - carry_b);
    sp_3072_mask_24(b_sum, b_sum, (sp_digit)0 - carry_a);
    top += sp_3072_add_24(r + 48, r + 48, b_sum);

    /* mid -= hi; mid -= lo; then fold the middle term in at digit 24. */
    top += sp_3072_sub_in_place_48(mid, hi);
    top += sp_3072_sub_in_place_48(mid, r);
    top += sp_3072_add_48(r + 24, r + 24, mid);

    /* Accumulated carries/borrows go in digit 72; digits 73..95 start at 0. */
    r[72] = top;
    XMEMSET(r + 73, 0, sizeof(sp_digit) * 23);

    /* Finally add the high product at digit 48. */
    sp_3072_add_48(r + 48, r + 48, hi);
}
#endif /* HAVE_INTEL_AVX2 */

#ifdef HAVE_INTEL_AVX2
/* Square a and put result in r. (r = a * a)
 *
 * Same Karatsuba split as sp_3072_sqr_48, but the three half-size squares
 * are computed with sp_3072_sqr_avx2_24.
 *
 * The low square is written into r only after every read of a has been
 * done, so the statement order below must be kept.
 *
 * r  A single precision integer that receives the 96 digit square.
 * a  A single precision integer (48 digits).
 */
SP_NOINLINE static void sp_3072_sqr_avx2_48(sp_digit* r, const sp_digit* a)
{
    sp_digit hi[48];        /* a1^2 */
    sp_digit mid[48];       /* (a0 + a1)^2, then the middle term */
    sp_digit a_sum[24];     /* a0 + a1 without its carry */
    sp_digit top;

    /* Half sum; its carry is handled separately further down. */
    top = sp_3072_add_24(a_sum, a, a + 24);

    /* The three half-size squares; the one landing in r goes last. */
    sp_3072_sqr_avx2_24(mid, a_sum);
    sp_3072_sqr_avx2_24(hi, a + 24);
    sp_3072_sqr_avx2_24(r, a);

    /* Carry correction: r[48..71] = carry ? 2 * a_sum : 0. */
    sp_3072_mask_24(r + 48, a_sum, (sp_digit)0 - top);
    top += sp_3072_add_24(r + 48, r + 48, r + 48);

    /* mid -= hi; mid -= lo; then fold the middle term in at digit 24. */
    top += sp_3072_sub_in_place_48(mid, hi);
    top += sp_3072_sub_in_place_48(mid, r);
    top += sp_3072_add_48(r + 24, r + 24, mid);

    /* Accumulated carries/borrows go in digit 72; digits 73..95 start at 0. */
    r[72] = top;
    XMEMSET(r + 73, 0, sizeof(sp_digit) * 23);

    /* Finally add the high square at digit 48. */
    sp_3072_add_48(r + 48, r + 48, hi);
}
#endif /* HAVE_INTEL_AVX2 */

#if !defined(SP_RSA_PRIVATE_EXP_D) && defined(WOLFSSL_HAVE_SP_RSA)
#endif /* !SP_RSA_PRIVATE_EXP_D && WOLFSSL_HAVE_SP_RSA */

/* Calculate the bottom digit of -1/a mod 2^n.
 *
 * Only a[0] is read. A 4-bit inverse of a[0] is formed directly and then
 * refined with four Newton steps (x = x * (2 - a[0] * x)), each of which
 * doubles the number of correct low bits: 8, 16, 32, 64.
 * (Assumes a[0] is odd, otherwise no inverse exists - confirm at callers.)
 *
 * a    A single precision number.
 * rho  Bottom word of inverse.
 */
static void sp_3072_mont_setup(sp_digit* a, sp_digit* rho)
{
    sp_digit m0 = a[0];
    sp_digit inv;
    int step;

    /* Seed value: inv * m0 == 1 mod 2**4. */
    inv = m0 + (((m0 + 2) & 4) << 1);

    /* After the passes inv * m0 == 1 mod 2**8, 2**16, 2**32, 2**64. */
    for (step = 0; step < 4; step++) {
        inv *= 2 - m0 * inv;
    }

    /* rho = -1/a[0] mod 2**64 */
    *rho = -inv;
}

#if !defined(SP_RSA_PRIVATE_EXP_D) && defined(WOLFSSL_HAVE_SP_RSA)
/* Sub b from a into a. (a -= b)
 *
 * Fully unrolled subtract-with-borrow chain over 24 64-bit digits (byte
 * offsets 0 to 184). Loads of the next digits are interleaved with the
 * subtract and store of the current ones, alternating between the register
 * pairs r8/rdx and r9/rcx.
 *
 * a  A single precision integer and result (24 digits).
 * b  A single precision integer (24 digits).
 *
 * Returns 0 when no borrow came out of the top digit and all ones (-1 as a
 * digit) when one did.
 */
SP_NOINLINE static sp_digit sp_3072_sub_in_place_24(sp_digit* a,
    const sp_digit* b)
{
    /* Starts at zero; the final sbbq turns the borrow flag into 0 or -1. */
    sp_digit c = 0;

    __asm__ __volatile__ (
        /* Digit 0 uses subq (no incoming borrow); all others use sbbq. */
        "movq	0(%[a]), %%r8\n\t"
        "movq	8(%[a]), %%r9\n\t"
        "movq	0(%[b]), %%rdx\n\t"
        "movq	8(%[b]), %%rcx\n\t"
        "subq	%%rdx, %%r8\n\t"
        "movq	16(%[b]), %%rdx\n\t"
        "movq	%%r8, 0(%[a])\n\t"
        "movq	16(%[a]), %%r8\n\t"
        "sbbq	%%rcx, %%r9\n\t"
        "movq	24(%[b]), %%rcx\n\t"
        "movq	%%r9, 8(%[a])\n\t"
        "movq	24(%[a]), %%r9\n\t"
        "sbbq	%%rdx, %%r8\n\t"
        "movq	32(%[b]), %%rdx\n\t"
        "movq	%%r8, 16(%[a])\n\t"
        "movq	32(%[a]), %%r8\n\t"
        "sbbq	%%rcx, %%r9\n\t"
        "movq	40(%[b]), %%rcx\n\t"
        "movq	%%r9, 24(%[a])\n\t"
        "movq	40(%[a]), %%r9\n\t"
        "sbbq	%%rdx, %%r8\n\t"
        "movq	48(%[b]), %%rdx\n\t"
        "movq	%%r8, 32(%[a])\n\t"
        "movq	48(%[a]), %%r8\n\t"
        "sbbq	%%rcx, %%r9\n\t"
        "movq	56(%[b]), %%rcx\n\t"
        "movq	%%r9, 40(%[a])\n\t"
        "movq	56(%[a]), %%r9\n\t"
        "sbbq	%%rdx, %%r8\n\t"
        "movq	64(%[b]), %%rdx\n\t"
        "movq	%%r8, 48(%[a])\n\t"
        "movq	64(%[a]), %%r8\n\t"
        "sbbq	%%rcx, %%r9\n\t"
        "movq	72(%[b]), %%rcx\n\t"
        "movq	%%r9, 56(%[a])\n\t"
        "movq	72(%[a]), %%r9\n\t"
        "sbbq	%%rdx, %%r8\n\t"
        "movq	80(%[b]), %%rdx\n\t"
        "movq	%%r8, 64(%[a])\n\t"
        "movq	80(%[a]), %%r8\n\t"
        "sbbq	%%rcx, %%r9\n\t"
        "movq	88(%[b]), %%rcx\n\t"
        "movq	%%r9, 72(%[a])\n\t"
        "movq	88(%[a]), %%r9\n\t"
        "sbbq	%%rdx, %%r8\n\t"
        "movq	96(%[b]), %%rdx\n\t"
        "movq	%%r8, 80(%[a])\n\t"
        "movq	96(%[a]), %%r8\n\t"
        "sbbq	%%rcx, %%r9\n\t"
        "movq	104(%[b]), %%rcx\n\t"
        "movq	%%r9, 88(%[a])\n\t"
        "movq	104(%[a]), %%r9\n\t"
        "sbbq	%%rdx, %%r8\n\t"
        "movq	112(%[b]), %%rdx\n\t"
        "movq	%%r8, 96(%[a])\n\t"
        "movq	112(%[a]), %%r8\n\t"
        "sbbq	%%rcx, %%r9\n\t"
        "movq	120(%[b]), %%rcx\n\t"
        "movq	%%r9, 104(%[a])\n\t"
        "movq	120(%[a]), %%r9\n\t"
        "sbbq	%%rdx, %%r8\n\t"
        "movq	128(%[b]), %%rdx\n\t"
        "movq	%%r8, 112(%[a])\n\t"
        "movq	128(%[a]), %%r8\n\t"
        "sbbq	%%rcx, %%r9\n\t"
        "movq	136(%[b]), %%rcx\n\t"
        "movq	%%r9, 120(%[a])\n\t"
        "movq	136(%[a]), %%r9\n\t"
        "sbbq	%%rdx, %%r8\n\t"
        "movq	144(%[b]), %%rdx\n\t"
        "movq	%%r8, 128(%[a])\n\t"
        "movq	144(%[a]), %%r8\n\t"
        "sbbq	%%rcx, %%r9\n\t"
        "movq	152(%[b]), %%rcx\n\t"
        "movq	%%r9, 136(%[a])\n\t"
        "movq	152(%[a]), %%r9\n\t"
        "sbbq	%%rdx, %%r8\n\t"
        "movq	160(%[b]), %%rdx\n\t"
        "movq	%%r8, 144(%[a])\n\t"
        "movq	160(%[a]), %%r8\n\t"
        "sbbq	%%rcx, %%r9\n\t"
        "movq	168(%[b]), %%rcx\n\t"
        "movq	%%r9, 152(%[a])\n\t"
        "movq	168(%[a]), %%r9\n\t"
        "sbbq	%%rdx, %%r8\n\t"
        "movq	176(%[b]), %%rdx\n\t"
        "movq	%%r8, 160(%[a])\n\t"
        "movq	176(%[a]), %%r8\n\t"
        "sbbq	%%rcx, %%r9\n\t"
        "movq	184(%[b]), %%rcx\n\t"
        "movq	%%r9, 168(%[a])\n\t"
        "movq	184(%[a]), %%r9\n\t"
        "sbbq	%%rdx, %%r8\n\t"
        "movq	%%r8, 176(%[a])\n\t"
        "sbbq	%%rcx, %%r9\n\t"
        "movq	%%r9, 184(%[a])\n\t"
        /* c = 0 - 0 - borrow flag. */
        "sbbq	$0, %[c]\n\t"
        : [c] "+r" (c)
        : [a] "r" (a), [b] "r" (b)
        : "memory", "rdx", "rcx", "r8", "r9"
    );

    return c;
}

/* r = 2^n mod m where n is the number of bits to reduce by.
 *
 * r is cleared and m is subtracted from it, so r ends up holding
 * 2^n - m truncated to 24 digits. That equals 2^n mod m only when m occupies
 * the full 24 digits (top bit set) - presumably guaranteed by the callers.
 *
 * r  A single precision number (24 digits) that receives the result.
 * m  A single precision number (24 digits), the modulus.
 */
static void sp_3072_mont_norm_24(sp_digit* r, sp_digit* m)
{
    /* Start from zero so that the subtraction below yields 0 - m. */
    XMEMSET(r, 0, 24 * sizeof(sp_digit));

    /* r = 2^n mod m; the borrow returned by the subtraction is not needed. */
    (void)sp_3072_sub_in_place_24(r, m);
}

/* Conditionally subtract b from a using the mask m.
 * m is -1 (all bits set) to subtract and 0 to copy a to r unchanged.
 *
 * Phase one ANDs every digit of b with m into the local buffer t; phase two
 * computes r = a - t with a single borrow chain. There are no branches, so
 * the same instructions run for either mask value.
 *
 * r  A single precision number representing condition subtract result.
 * a  A single precision number to subtract from.
 * b  A single precision number to subtract.
 * m  Mask value to apply.
 *
 * Returns 0 when no borrow came out of the top digit and all ones (-1 as a
 * digit) when one did.
 */
static sp_digit sp_3072_cond_sub_24(sp_digit* r, sp_digit* a, sp_digit* b,
        sp_digit m)
{
    /* Scratch copy of b masked with m; filled and consumed inside the asm. */
    sp_digit t[24];
    /* Starts at zero; the final sbbq turns the borrow flag into 0 or -1. */
    sp_digit c = 0;

    __asm__ __volatile__ (
        /* Phase 1: t[i] = b[i] & m, two digits per step. */
        "movq	0(%[b]), %%rax\n\t"
        "movq	8(%[b]), %%rcx\n\t"
        "andq	%[m], %%rax\n\t"
        "andq	%[m], %%rcx\n\t"
        "movq	%%rax, 0(%[t])\n\t"
        "movq	%%rcx, 8(%[t])\n\t"
        "movq	16(%[b]), %%rax\n\t"
        "movq	24(%[b]), %%rcx\n\t"
        "andq	%[m], %%rax\n\t"
        "andq	%[m], %%rcx\n\t"
        "movq	%%rax, 16(%[t])\n\t"
        "movq	%%rcx, 24(%[t])\n\t"
        "movq	32(%[b]), %%rax\n\t"
        "movq	40(%[b]), %%rcx\n\t"
        "andq	%[m], %%rax\n\t"
        "andq	%[m], %%rcx\n\t"
        "movq	%%rax, 32(%[t])\n\t"
        "movq	%%rcx, 40(%[t])\n\t"
        "movq	48(%[b]), %%rax\n\t"
        "movq	56(%[b]), %%rcx\n\t"
        "andq	%[m], %%rax\n\t"
        "andq	%[m], %%rcx\n\t"
        "movq	%%rax, 48(%[t])\n\t"
        "movq	%%rcx, 56(%[t])\n\t"
        "movq	64(%[b]), %%rax\n\t"
        "movq	72(%[b]), %%rcx\n\t"
        "andq	%[m], %%rax\n\t"
        "andq	%[m], %%rcx\n\t"
        "movq	%%rax, 64(%[t])\n\t"
        "movq	%%rcx, 72(%[t])\n\t"
        "movq	80(%[b]), %%rax\n\t"
        "movq	88(%[b]), %%rcx\n\t"
        "andq	%[m], %%rax\n\t"
        "andq	%[m], %%rcx\n\t"
        "movq	%%rax, 80(%[t])\n\t"
        "movq	%%rcx, 88(%[t])\n\t"
        "movq	96(%[b]), %%rax\n\t"
        "movq	104(%[b]), %%rcx\n\t"
        "andq	%[m], %%rax\n\t"
        "andq	%[m], %%rcx\n\t"
        "movq	%%rax, 96(%[t])\n\t"
        "movq	%%rcx, 104(%[t])\n\t"
        "movq	112(%[b]), %%rax\n\t"
        "movq	120(%[b]), %%rcx\n\t"
        "andq	%[m], %%rax\n\t"
        "andq	%[m], %%rcx\n\t"
        "movq	%%rax, 112(%[t])\n\t"
        "movq	%%rcx, 120(%[t])\n\t"
        "movq	128(%[b]), %%rax\n\t"
        "movq	136(%[b]), %%rcx\n\t"
        "andq	%[m], %%rax\n\t"
        "andq	%[m], %%rcx\n\t"
        "movq	%%rax, 128(%[t])\n\t"
        "movq	%%rcx, 136(%[t])\n\t"
        "movq	144(%[b]), %%rax\n\t"
        "movq	152(%[b]), %%rcx\n\t"
        "andq	%[m], %%rax\n\t"
        "andq	%[m], %%rcx\n\t"
        "movq	%%rax, 144(%[t])\n\t"
        "movq	%%rcx, 152(%[t])\n\t"
        "movq	160(%[b]), %%rax\n\t"
        "movq	168(%[b]), %%rcx\n\t"
        "andq	%[m], %%rax\n\t"
        "andq	%[m], %%rcx\n\t"
        "movq	%%rax, 160(%[t])\n\t"
        "movq	%%rcx, 168(%[t])\n\t"
        "movq	176(%[b]), %%rax\n\t"
        "movq	184(%[b]), %%rcx\n\t"
        "andq	%[m], %%rax\n\t"
        "andq	%[m], %%rcx\n\t"
        "movq	%%rax, 176(%[t])\n\t"
        "movq	%%rcx, 184(%[t])\n\t"
        /* Phase 2: r[i] = a[i] - t[i] - borrow. The store of each result is
         * delayed until after the next digit's sbbq; movq does not alter
         * the flags, so the borrow chain stays intact. */
        "movq	(%[a]), %%rax\n\t"
        "movq	(%[t]), %%rdx\n\t"
        "subq	%%rdx,%%rax\n\t"
        "movq	8(%[a]), %%rcx\n\t"
        "movq	8(%[t]), %%rdx\n\t"
        "sbbq	%%rdx, %%rcx\n\t"
        "movq	%%rax, 0(%[r])\n\t"
        "movq	16(%[a]), %%rax\n\t"
        "movq	16(%[t]), %%rdx\n\t"
        "sbbq	%%rdx, %%rax\n\t"
        "movq	%%rcx, 8(%[r])\n\t"
        "movq	24(%[a]), %%rcx\n\t"
        "movq	24(%[t]), %%rdx\n\t"
        "sbbq	%%rdx, %%rcx\n\t"
        "movq	%%rax, 16(%[r])\n\t"
        "movq	32(%[a]), %%rax\n\t"
        "movq	32(%[t]), %%rdx\n\t"
        "sbbq	%%rdx, %%rax\n\t"
        "movq	%%rcx, 24(%[r])\n\t"
        "movq	40(%[a]), %%rcx\n\t"
        "movq	40(%[t]), %%rdx\n\t"
        "sbbq	%%rdx, %%rcx\n\t"
        "movq	%%rax, 32(%[r])\n\t"
        "movq	48(%[a]), %%rax\n\t"
        "movq	48(%[t]), %%rdx\n\t"
        "sbbq	%%rdx, %%rax\n\t"
        "movq	%%rcx, 40(%[r])\n\t"
        "movq	56(%[a]), %%rcx\n\t"
        "movq	56(%[t]), %%rdx\n\t"
        "sbbq	%%rdx, %%rcx\n\t"
        "movq	%%rax, 48(%[r])\n\t"
        "movq	64(%[a]), %%rax\n\t"
        "movq	64(%[t]), %%rdx\n\t"
        "sbbq	%%rdx, %%rax\n\t"
        "movq	%%rcx, 56(%[r])\n\t"
        "movq	72(%[a]), %%rcx\n\t"
        "movq	72(%[t]), %%rdx\n\t"
        "sbbq	%%rdx, %%rcx\n\t"
        "movq	%%rax, 64(%[r])\n\t"
        "movq	80(%[a]), %%rax\n\t"
        "movq	80(%[t]), %%rdx\n\t"
        "sbbq	%%rdx, %%rax\n\t"
        "movq	%%rcx, 72(%[r])\n\t"
        "movq	88(%[a]), %%rcx\n\t"
        "movq	88(%[t]), %%rdx\n\t"
        "sbbq	%%rdx, %%rcx\n\t"
        "movq	%%rax, 80(%[r])\n\t"
        "movq	96(%[a]), %%rax\n\t"
        "movq	96(%[t]), %%rdx\n\t"
        "sbbq	%%rdx, %%rax\n\t"
        "movq	%%rcx, 88(%[r])\n\t"
        "movq	104(%[a]), %%rcx\n\t"
        "movq	104(%[t]), %%rdx\n\t"
        "sbbq	%%rdx, %%rcx\n\t"
        "movq	%%rax, 96(%[r])\n\t"
        "movq	112(%[a]), %%rax\n\t"
        "movq	112(%[t]), %%rdx\n\t"
        "sbbq	%%rdx, %%rax\n\t"
        "movq	%%rcx, 104(%[r])\n\t"
        "movq	120(%[a]), %%rcx\n\t"
        "movq	120(%[t]), %%rdx\n\t"
        "sbbq	%%rdx, %%rcx\n\t"
        "movq	%%rax, 112(%[r])\n\t"
        "movq	128(%[a]), %%rax\n\t"
        "movq	128(%[t]), %%rdx\n\t"
        "sbbq	%%rdx, %%rax\n\t"
        "movq	%%rcx, 120(%[r])\n\t"
        "movq	136(%[a]), %%rcx\n\t"
        "movq	136(%[t]), %%rdx\n\t"
        "sbbq	%%rdx, %%rcx\n\t"
        "movq	%%rax, 128(%[r])\n\t"
        "movq	144(%[a]), %%rax\n\t"
        "movq	144(%[t]), %%rdx\n\t"
        "sbbq	%%rdx, %%rax\n\t"
        "movq	%%rcx, 136(%[r])\n\t"
        "movq	152(%[a]), %%rcx\n\t"
        "movq	152(%[t]), %%rdx\n\t"
        "sbbq	%%rdx, %%rcx\n\t"
        "movq	%%rax, 144(%[r])\n\t"
        "movq	160(%[a]), %%rax\n\t"
        "movq	160(%[t]), %%rdx\n\t"
        "sbbq	%%rdx, %%rax\n\t"
        "movq	%%rcx, 152(%[r])\n\t"
        "movq	168(%[a]), %%rcx\n\t"
        "movq	168(%[t]), %%rdx\n\t"
        "sbbq	%%rdx, %%rcx\n\t"
        "movq	%%rax, 160(%[r])\n\t"
        "movq	176(%[a]), %%rax\n\t"
        "movq	176(%[t]), %%rdx\n\t"
        "sbbq	%%rdx, %%rax\n\t"
        "movq	%%rcx, 168(%[r])\n\t"
        "movq	184(%[a]), %%rcx\n\t"
        "movq	184(%[t]), %%rdx\n\t"
        "sbbq	%%rdx, %%rcx\n\t"
        "movq	%%rax, 176(%[r])\n\t"
        "movq	%%rcx, 184(%[r])\n\t"
        /* c = 0 - 0 - borrow flag. */
        "sbbq	$0, %[c]\n\t"
        : [c] "+r" (c)
        : [r] "r" (r), [a] "r" (a), [b] "r" (b), [m] "r" (m), [t] "r" (t)
        : "memory", "rax", "rcx", "rdx"
    );

    return c;
}

/* Reduce a 48 digit number to 24 digits using Montgomery reduction.
 *
 * The asm loop runs 24 times (rcx counts bytes from 0 to 192). In each pass
 * it computes mu = a[i] * mp, adds m * mu to a[i..i+23], propagates the
 * carry into a[i+24] and then advances the a pointer by one digit. The two
 * lowest digits of the current window are kept in r12/r13 instead of being
 * written back on every pass. ca holds the carry out of the top digit and
 * is carried from one pass to the next.
 *
 * After the loop a final masked subtract of m, driven by ca, writes the
 * 24 digit result to the start of the caller's array.
 *
 * a   A single precision number to reduce in place (48 digits on entry,
 *     result in the first 24 digits).
 * m   The single precision number representing the modulus (24 digits).
 * mp  The digit representing the negative inverse of m mod 2^n.
 *
 * NOTE(review): L_mont_loop_24 is a plain assembler label, so this asm must
 * be emitted exactly once per object file; SP_NOINLINE presumably ensures
 * that - confirm, or switch to a compiler-generated unique label.
 */
SP_NOINLINE static void sp_3072_mont_reduce_24(sp_digit* a, sp_digit* m,
        sp_digit mp)
{
    /* Carry out of the top digit; updated by the asm on every pass. */
    sp_digit ca = 0;

    __asm__ __volatile__ (
        "# i = 0\n\t"
        "xorq	%%rcx, %%rcx\n\t"
        "movq	0(%[a]), %%r12\n\t"
        "movq	8(%[a]), %%r13\n\t"
        "\nL_mont_loop_24:\n\t"
        "# mu = a[i] * mp\n\t"
        "movq	%%r12, %%r10\n\t"
        "imulq	%[mp], %%r10\n\t"
        "# a[i+0] += m[0] * mu\n\t"
        "movq	%%r10, %%rax\n\t"
        "xorq	%%r9, %%r9\n\t"
        "mulq	0(%[m])\n\t"
        "addq	%%rax, %%r12\n\t"
        "adcq	%%rdx, %%r9\n\t"
        "# a[i+1] += m[1] * mu\n\t"
        "movq	%%r10, %%rax\n\t"
        "xorq	%%r8, %%r8\n\t"
        "mulq	8(%[m])\n\t"
        "movq	%%r13, %%r12\n\t"
        "addq	%%rax, %%r12\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "addq	%%r9,  %%r12\n\t"
        "adcq	$0, %%r8\n\t"
        "# a[i+2] += m[2] * mu\n\t"
        "movq	%%r10, %%rax\n\t"
        "xorq	%%r9, %%r9\n\t"
        "mulq	16(%[m])\n\t"
        "movq	16(%[a]), %%r13\n\t"
        "addq	%%rax, %%r13\n\t"
        "adcq	%%rdx, %%r9\n\t"
        "addq	%%r8,  %%r13\n\t"
        "adcq	$0, %%r9\n\t"
        "# a[i+3] += m[3] * mu\n\t"
        "movq	%%r10, %%rax\n\t"
        "xorq	%%r8, %%r8\n\t"
        "mulq	24(%[m])\n\t"
        "movq	24(%[a]), %%r11\n\t"
        "addq	%%rax, %%r11\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "addq	%%r9,  %%r11\n\t"
        "movq	%%r11, 24(%[a])\n\t"
        "adcq	$0, %%r8\n\t"
        "# a[i+4] += m[4] * mu\n\t"
        "movq	%%r10, %%rax\n\t"
        "xorq	%%r9, %%r9\n\t"
        "mulq	32(%[m])\n\t"
        "movq	32(%[a]), %%r11\n\t"
        "addq	%%rax, %%r11\n\t"
        "adcq	%%rdx, %%r9\n\t"
        "addq	%%r8,  %%r11\n\t"
        "movq	%%r11, 32(%[a])\n\t"
        "adcq	$0, %%r9\n\t"
        "# a[i+5] += m[5] * mu\n\t"
        "movq	%%r10, %%rax\n\t"
        "xorq	%%r8, %%r8\n\t"
        "mulq	40(%[m])\n\t"
        "movq	40(%[a]), %%r11\n\t"
        "addq	%%rax, %%r11\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "addq	%%r9,  %%r11\n\t"
        "movq	%%r11, 40(%[a])\n\t"
        "adcq	$0, %%r8\n\t"
        "# a[i+6] += m[6] * mu\n\t"
        "movq	%%r10, %%rax\n\t"
        "xorq	%%r9, %%r9\n\t"
        "mulq	48(%[m])\n\t"
        "movq	48(%[a]), %%r11\n\t"
        "addq	%%rax, %%r11\n\t"
        "adcq	%%rdx, %%r9\n\t"
        "addq	%%r8,  %%r11\n\t"
        "movq	%%r11, 48(%[a])\n\t"
        "adcq	$0, %%r9\n\t"
        "# a[i+7] += m[7] * mu\n\t"
        "movq	%%r10, %%rax\n\t"
        "xorq	%%r8, %%r8\n\t"
        "mulq	56(%[m])\n\t"
        "movq	56(%[a]), %%r11\n\t"
        "addq	%%rax, %%r11\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "addq	%%r9,  %%r11\n\t"
        "movq	%%r11, 56(%[a])\n\t"
        "adcq	$0, %%r8\n\t"
        "# a[i+8] += m[8] * mu\n\t"
        "movq	%%r10, %%rax\n\t"
        "xorq	%%r9, %%r9\n\t"
        "mulq	64(%[m])\n\t"
        "movq	64(%[a]), %%r11\n\t"
        "addq	%%rax, %%r11\n\t"
        "adcq	%%rdx, %%r9\n\t"
        "addq	%%r8,  %%r11\n\t"
        "movq	%%r11, 64(%[a])\n\t"
        "adcq	$0, %%r9\n\t"
        "# a[i+9] += m[9] * mu\n\t"
        "movq	%%r10, %%rax\n\t"
        "xorq	%%r8, %%r8\n\t"
        "mulq	72(%[m])\n\t"
        "movq	72(%[a]), %%r11\n\t"
        "addq	%%rax, %%r11\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "addq	%%r9,  %%r11\n\t"
        "movq	%%r11, 72(%[a])\n\t"
        "adcq	$0, %%r8\n\t"
        "# a[i+10] += m[10] * mu\n\t"
        "movq	%%r10, %%rax\n\t"
        "xorq	%%r9, %%r9\n\t"
        "mulq	80(%[m])\n\t"
        "movq	80(%[a]), %%r11\n\t"
        "addq	%%rax, %%r11\n\t"
        "adcq	%%rdx, %%r9\n\t"
        "addq	%%r8,  %%r11\n\t"
        "movq	%%r11, 80(%[a])\n\t"
        "adcq	$0, %%r9\n\t"
        "# a[i+11] += m[11] * mu\n\t"
        "movq	%%r10, %%rax\n\t"
        "xorq	%%r8, %%r8\n\t"
        "mulq	88(%[m])\n\t"
        "movq	88(%[a]), %%r11\n\t"
        "addq	%%rax, %%r11\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "addq	%%r9,  %%r11\n\t"
        "movq	%%r11, 88(%[a])\n\t"
        "adcq	$0, %%r8\n\t"
        "# a[i+12] += m[12] * mu\n\t"
        "movq	%%r10, %%rax\n\t"
        "xorq	%%r9, %%r9\n\t"
        "mulq	96(%[m])\n\t"
        "movq	96(%[a]), %%r11\n\t"
        "addq	%%rax, %%r11\n\t"
        "adcq	%%rdx, %%r9\n\t"
        "addq	%%r8,  %%r11\n\t"
        "movq	%%r11, 96(%[a])\n\t"
        "adcq	$0, %%r9\n\t"
        "# a[i+13] += m[13] * mu\n\t"
        "movq	%%r10, %%rax\n\t"
        "xorq	%%r8, %%r8\n\t"
        "mulq	104(%[m])\n\t"
        "movq	104(%[a]), %%r11\n\t"
        "addq	%%rax, %%r11\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "addq	%%r9,  %%r11\n\t"
        "movq	%%r11, 104(%[a])\n\t"
        "adcq	$0, %%r8\n\t"
        "# a[i+14] += m[14] * mu\n\t"
        "movq	%%r10, %%rax\n\t"
        "xorq	%%r9, %%r9\n\t"
        "mulq	112(%[m])\n\t"
        "movq	112(%[a]), %%r11\n\t"
        "addq	%%rax, %%r11\n\t"
        "adcq	%%rdx, %%r9\n\t"
        "addq	%%r8,  %%r11\n\t"
        "movq	%%r11, 112(%[a])\n\t"
        "adcq	$0, %%r9\n\t"
        "# a[i+15] += m[15] * mu\n\t"
        "movq	%%r10, %%rax\n\t"
        "xorq	%%r8, %%r8\n\t"
        "mulq	120(%[m])\n\t"
        "movq	120(%[a]), %%r11\n\t"
        "addq	%%rax, %%r11\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "addq	%%r9,  %%r11\n\t"
        "movq	%%r11, 120(%[a])\n\t"
        "adcq	$0, %%r8\n\t"
        "# a[i+16] += m[16] * mu\n\t"
        "movq	%%r10, %%rax\n\t"
        "xorq	%%r9, %%r9\n\t"
        "mulq	128(%[m])\n\t"
        "movq	128(%[a]), %%r11\n\t"
        "addq	%%rax, %%r11\n\t"
        "adcq	%%rdx, %%r9\n\t"
        "addq	%%r8,  %%r11\n\t"
        "movq	%%r11, 128(%[a])\n\t"
        "adcq	$0, %%r9\n\t"
        "# a[i+17] += m[17] * mu\n\t"
        "movq	%%r10, %%rax\n\t"
        "xorq	%%r8, %%r8\n\t"
        "mulq	136(%[m])\n\t"
        "movq	136(%[a]), %%r11\n\t"
        "addq	%%rax, %%r11\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "addq	%%r9,  %%r11\n\t"
        "movq	%%r11, 136(%[a])\n\t"
        "adcq	$0, %%r8\n\t"
        "# a[i+18] += m[18] * mu\n\t"
        "movq	%%r10, %%rax\n\t"
        "xorq	%%r9, %%r9\n\t"
        "mulq	144(%[m])\n\t"
        "movq	144(%[a]), %%r11\n\t"
        "addq	%%rax, %%r11\n\t"
        "adcq	%%rdx, %%r9\n\t"
        "addq	%%r8,  %%r11\n\t"
        "movq	%%r11, 144(%[a])\n\t"
        "adcq	$0, %%r9\n\t"
        "# a[i+19] += m[19] * mu\n\t"
        "movq	%%r10, %%rax\n\t"
        "xorq	%%r8, %%r8\n\t"
        "mulq	152(%[m])\n\t"
        "movq	152(%[a]), %%r11\n\t"
        "addq	%%rax, %%r11\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "addq	%%r9,  %%r11\n\t"
        "movq	%%r11, 152(%[a])\n\t"
        "adcq	$0, %%r8\n\t"
        "# a[i+20] += m[20] * mu\n\t"
        "movq	%%r10, %%rax\n\t"
        "xorq	%%r9, %%r9\n\t"
        "mulq	160(%[m])\n\t"
        "movq	160(%[a]), %%r11\n\t"
        "addq	%%rax, %%r11\n\t"
        "adcq	%%rdx, %%r9\n\t"
        "addq	%%r8,  %%r11\n\t"
        "movq	%%r11, 160(%[a])\n\t"
        "adcq	$0, %%r9\n\t"
        "# a[i+21] += m[21] * mu\n\t"
        "movq	%%r10, %%rax\n\t"
        "xorq	%%r8, %%r8\n\t"
        "mulq	168(%[m])\n\t"
        "movq	168(%[a]), %%r11\n\t"
        "addq	%%rax, %%r11\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "addq	%%r9,  %%r11\n\t"
        "movq	%%r11, 168(%[a])\n\t"
        "adcq	$0, %%r8\n\t"
        "# a[i+22] += m[22] * mu\n\t"
        "movq	%%r10, %%rax\n\t"
        "xorq	%%r9, %%r9\n\t"
        "mulq	176(%[m])\n\t"
        "movq	176(%[a]), %%r11\n\t"
        "addq	%%rax, %%r11\n\t"
        "adcq	%%rdx, %%r9\n\t"
        "addq	%%r8,  %%r11\n\t"
        "movq	%%r11, 176(%[a])\n\t"
        "adcq	$0, %%r9\n\t"
        /* Top digit: the previous carry ca is folded into the high word
         * (rdx), which is then added to a[i+24]; ca is rebuilt from the
         * carries produced here. */
        "# a[i+23] += m[23] * mu\n\t"
        "movq	%%r10, %%rax\n\t"
        "mulq	184(%[m])\n\t"
        "movq	184(%[a]), %%r11\n\t"
        "addq	%%rax, %%r9\n\t"
        "adcq	%[ca], %%rdx\n\t"
        "movq	$0, %[ca]\n\t"
        "adcq	$0, %[ca]\n\t"
        "addq	%%r9, %%r11\n\t"
        "movq	%%r11, 184(%[a])\n\t"
        "adcq	%%rdx, 192(%[a])\n\t"
        "adcq	$0, %[ca]\n\t"
        "# i += 1\n\t"
        "addq	$8, %[a]\n\t"
        "addq	$8, %%rcx\n\t"
        "cmpq	$192, %%rcx\n\t"
        "jl	L_mont_loop_24\n\t"
        /* Flush the two cached digits into the (advanced) a[0] and a[1]. */
        "movq	%%r12, 0(%[a])\n\t"
        "movq	%%r13, 8(%[a])\n\t"
        : [ca] "+r" (ca), [a] "+r" (a)
        : [m] "r" (m), [mp] "r" (mp)
        : "memory", "rax", "rdx", "rcx", "r8", "r9", "r10", "r11",
          "r12", "r13"
    );

    /* The asm advanced a by 24 digits, so a now addresses the upper half and
     * a - 24 is the caller's original pointer. Subtract m from the upper
     * half when ca is set (mask of all ones), otherwise just copy it down. */
    sp_3072_cond_sub_24(a - 24, a, m, (sp_digit)0 - ca);
}

/* Multiply two Montgomery form numbers mod the modulus (prime).
 * (r = a * b mod m)
 *
 * The full product is written to r first and then reduced in place, so r
 * must have room for the unreduced product.
 *
 * r   Result of multiplication.
 * a   First number to multiply in Montgomery form.
 * b   Second number to multiply in Montgomery form.
 * m   Modulus (prime).
 * mp  Montgomery multiplier.
 */
static void sp_3072_mont_mul_24(sp_digit* r, sp_digit* a, sp_digit* b,
                                sp_digit* m, sp_digit mp)
{
    /* Step 1: plain multi-precision product into r. */
    sp_3072_mul_24(r, a, b);

    /* Step 2: Montgomery reduction of r in place. */
    sp_3072_mont_reduce_24(r, m, mp);
}

/* Square the Montgomery form number. (r = a * a mod m)
 *
 * r is first filled by sp_3072_sqr_24() and then reduced in place by
 * sp_3072_mont_reduce_24().
 * NOTE(review): r presumably needs room for the 48 digit square - confirm
 * against sp_3072_sqr_24().
 *
 * r   Result of squaring.
 * a   Number to square in Montgomery form.
 * m   Modulus (prime).
 * mp  Montgomery multiplier.
 */
static void sp_3072_mont_sqr_24(sp_digit* r, sp_digit* a, sp_digit* m,
        sp_digit mp)
{
    sp_3072_sqr_24(r, a);
    sp_3072_mont_reduce_24(r, m, mp);
}

/* Mul a by digit b into r. (r = a * b)
 *
 * Reads the 24 digits a[0..23] and writes 25 digits r[0..24]; the last
 * store (byte offset 192) is the final carry word, so r must have room for
 * 25 digits.
 *
 * r  A single precision integer (25 digits written).
 * a  A single precision integer (24 digits read).
 * b  A single precision digit.
 */
SP_NOINLINE static void sp_3072_mul_d_24(sp_digit* r, const sp_digit* a,
        const sp_digit b)
{
    /* rbx, rcx and r8 rotate as a three word accumulator: for each digit the
     * low half of the 128-bit product (rax) is added to the current word,
     * which is then stored, and the high half (rdx) plus carry is added to
     * the next word. The register that was just stored is zeroed and reused
     * as the new top word of the accumulator.
     */
    __asm__ __volatile__ (
        "# A[0] * B\n\t"
        "movq	%[b], %%rax\n\t"
        "xorq	%%r8, %%r8\n\t"
        "mulq	(%[a])\n\t"
        "movq	%%rax, %%rbx\n\t"
        "movq	%%rdx, %%rcx\n\t"
        "movq	%%rbx, 0(%[r])\n\t"
        "# A[1] * B\n\t"
        "movq	%[b], %%rax\n\t"
        "xorq	%%rbx, %%rbx\n\t"
        "mulq	8(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "movq	%%rcx, 8(%[r])\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "# A[2] * B\n\t"
        "movq	%[b], %%rax\n\t"
        "xorq	%%rcx, %%rcx\n\t"
        "mulq	16(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "movq	%%r8, 16(%[r])\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "# A[3] * B\n\t"
        "movq	%[b], %%rax\n\t"
        "xorq	%%r8, %%r8\n\t"
        "mulq	24(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "movq	%%rbx, 24(%[r])\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "# A[4] * B\n\t"
        "movq	%[b], %%rax\n\t"
        "xorq	%%rbx, %%rbx\n\t"
        "mulq	32(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "movq	%%rcx, 32(%[r])\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "# A[5] * B\n\t"
        "movq	%[b], %%rax\n\t"
        "xorq	%%rcx, %%rcx\n\t"
        "mulq	40(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "movq	%%r8, 40(%[r])\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "# A[6] * B\n\t"
        "movq	%[b], %%rax\n\t"
        "xorq	%%r8, %%r8\n\t"
        "mulq	48(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "movq	%%rbx, 48(%[r])\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "# A[7] * B\n\t"
        "movq	%[b], %%rax\n\t"
        "xorq	%%rbx, %%rbx\n\t"
        "mulq	56(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "movq	%%rcx, 56(%[r])\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "# A[8] * B\n\t"
        "movq	%[b], %%rax\n\t"
        "xorq	%%rcx, %%rcx\n\t"
        "mulq	64(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "movq	%%r8, 64(%[r])\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "# A[9] * B\n\t"
        "movq	%[b], %%rax\n\t"
        "xorq	%%r8, %%r8\n\t"
        "mulq	72(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "movq	%%rbx, 72(%[r])\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "# A[10] * B\n\t"
        "movq	%[b], %%rax\n\t"
        "xorq	%%rbx, %%rbx\n\t"
        "mulq	80(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "movq	%%rcx, 80(%[r])\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "# A[11] * B\n\t"
        "movq	%[b], %%rax\n\t"
        "xorq	%%rcx, %%rcx\n\t"
        "mulq	88(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "movq	%%r8, 88(%[r])\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "# A[12] * B\n\t"
        "movq	%[b], %%rax\n\t"
        "xorq	%%r8, %%r8\n\t"
        "mulq	96(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "movq	%%rbx, 96(%[r])\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "# A[13] * B\n\t"
        "movq	%[b], %%rax\n\t"
        "xorq	%%rbx, %%rbx\n\t"
        "mulq	104(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "movq	%%rcx, 104(%[r])\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "# A[14] * B\n\t"
        "movq	%[b], %%rax\n\t"
        "xorq	%%rcx, %%rcx\n\t"
        "mulq	112(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "movq	%%r8, 112(%[r])\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "# A[15] * B\n\t"
        "movq	%[b], %%rax\n\t"
        "xorq	%%r8, %%r8\n\t"
        "mulq	120(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "movq	%%rbx, 120(%[r])\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "# A[16] * B\n\t"
        "movq	%[b], %%rax\n\t"
        "xorq	%%rbx, %%rbx\n\t"
        "mulq	128(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "movq	%%rcx, 128(%[r])\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "# A[17] * B\n\t"
        "movq	%[b], %%rax\n\t"
        "xorq	%%rcx, %%rcx\n\t"
        "mulq	136(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "movq	%%r8, 136(%[r])\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "# A[18] * B\n\t"
        "movq	%[b], %%rax\n\t"
        "xorq	%%r8, %%r8\n\t"
        "mulq	144(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "movq	%%rbx, 144(%[r])\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "# A[19] * B\n\t"
        "movq	%[b], %%rax\n\t"
        "xorq	%%rbx, %%rbx\n\t"
        "mulq	152(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "movq	%%rcx, 152(%[r])\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "# A[20] * B\n\t"
        "movq	%[b], %%rax\n\t"
        "xorq	%%rcx, %%rcx\n\t"
        "mulq	160(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "movq	%%r8, 160(%[r])\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "# A[21] * B\n\t"
        "movq	%[b], %%rax\n\t"
        "xorq	%%r8, %%r8\n\t"
        "mulq	168(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "movq	%%rbx, 168(%[r])\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "# A[22] * B\n\t"
        "movq	%[b], %%rax\n\t"
        "xorq	%%rbx, %%rbx\n\t"
        "mulq	176(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "movq	%%rcx, 176(%[r])\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        /* Last digit: store both the low word and the final carry word. */
        "# A[23] * B\n\t"
        "movq	%[b], %%rax\n\t"
        "mulq	184(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "movq	%%r8, 184(%[r])\n\t"
        "movq	%%rbx, 192(%[r])\n\t"
        :
        : [r] "r" (r), [a] "r" (a), [b] "r" (b)
        : "memory", "rax", "rdx", "rbx", "rcx", "r8"
    );
}

#ifdef HAVE_INTEL_AVX2
/* Mul a by digit b into r. (r = a * b)
 *
 * Variant using the MULX, ADCX and ADOX instructions; the caller in this
 * file only selects it when both BMI2 and ADX are reported by cpuid.
 * Reads the 24 digits a[0..23] and writes 25 digits r[0..24]; the last
 * store (byte offset 192) is the final carry word, so r must have room for
 * 25 digits.
 *
 * r  A single precision integer (25 digits written).
 * a  A single precision integer (24 digits read).
 * b  A single precision digit.
 */
SP_NOINLINE static void sp_3072_mul_d_avx2_24(sp_digit* r, const sp_digit* a,
        const sp_digit b)
{
    /* rdx holds b as the implicit MULX multiplicand for the whole sequence.
     * r10 is kept at zero (the xorq also clears CF and OF before the first
     * ADCX/ADOX). r8 and r9 alternate as the current/next result word: the
     * low product half is added on the CF chain (ADCX) and the high half is
     * added to the next word on the OF chain (ADOX).
     */
    __asm__ __volatile__ (
        "# A[0] * B\n\t"
        "movq	%[b], %%rdx\n\t"
        "xorq	%%r10, %%r10\n\t"
        "mulxq	(%[a]), %%r8, %%r9\n\t"
        "movq	%%r8, 0(%[r])\n\t"
        "# A[1] * B\n\t"
        "mulxq	8(%[a]), %%rax, %%rcx\n\t"
        "movq	%%r10, %%r8\n\t"
        "adcxq	%%rax, %%r9\n\t"
        "movq	%%r9, 8(%[r])\n\t"
        "adoxq	%%rcx, %%r8\n\t"
        "# A[2] * B\n\t"
        "mulxq	16(%[a]), %%rax, %%rcx\n\t"
        "movq	%%r10, %%r9\n\t"
        "adcxq	%%rax, %%r8\n\t"
        "movq	%%r8, 16(%[r])\n\t"
        "adoxq	%%rcx, %%r9\n\t"
        "# A[3] * B\n\t"
        "mulxq	24(%[a]), %%rax, %%rcx\n\t"
        "movq	%%r10, %%r8\n\t"
        "adcxq	%%rax, %%r9\n\t"
        "movq	%%r9, 24(%[r])\n\t"
        "adoxq	%%rcx, %%r8\n\t"
        "# A[4] * B\n\t"
        "mulxq	32(%[a]), %%rax, %%rcx\n\t"
        "movq	%%r10, %%r9\n\t"
        "adcxq	%%rax, %%r8\n\t"
        "movq	%%r8, 32(%[r])\n\t"
        "adoxq	%%rcx, %%r9\n\t"
        "# A[5] * B\n\t"
        "mulxq	40(%[a]), %%rax, %%rcx\n\t"
        "movq	%%r10, %%r8\n\t"
        "adcxq	%%rax, %%r9\n\t"
        "movq	%%r9, 40(%[r])\n\t"
        "adoxq	%%rcx, %%r8\n\t"
        "# A[6] * B\n\t"
        "mulxq	48(%[a]), %%rax, %%rcx\n\t"
        "movq	%%r10, %%r9\n\t"
        "adcxq	%%rax, %%r8\n\t"
        "movq	%%r8, 48(%[r])\n\t"
        "adoxq	%%rcx, %%r9\n\t"
        "# A[7] * B\n\t"
        "mulxq	56(%[a]), %%rax, %%rcx\n\t"
        "movq	%%r10, %%r8\n\t"
        "adcxq	%%rax, %%r9\n\t"
        "movq	%%r9, 56(%[r])\n\t"
        "adoxq	%%rcx, %%r8\n\t"
        "# A[8] * B\n\t"
        "mulxq	64(%[a]), %%rax, %%rcx\n\t"
        "movq	%%r10, %%r9\n\t"
        "adcxq	%%rax, %%r8\n\t"
        "movq	%%r8, 64(%[r])\n\t"
        "adoxq	%%rcx, %%r9\n\t"
        "# A[9] * B\n\t"
        "mulxq	72(%[a]), %%rax, %%rcx\n\t"
        "movq	%%r10, %%r8\n\t"
        "adcxq	%%rax, %%r9\n\t"
        "movq	%%r9, 72(%[r])\n\t"
        "adoxq	%%rcx, %%r8\n\t"
        "# A[10] * B\n\t"
        "mulxq	80(%[a]), %%rax, %%rcx\n\t"
        "movq	%%r10, %%r9\n\t"
        "adcxq	%%rax, %%r8\n\t"
        "movq	%%r8, 80(%[r])\n\t"
        "adoxq	%%rcx, %%r9\n\t"
        "# A[11] * B\n\t"
        "mulxq	88(%[a]), %%rax, %%rcx\n\t"
        "movq	%%r10, %%r8\n\t"
        "adcxq	%%rax, %%r9\n\t"
        "movq	%%r9, 88(%[r])\n\t"
        "adoxq	%%rcx, %%r8\n\t"
        "# A[12] * B\n\t"
        "mulxq	96(%[a]), %%rax, %%rcx\n\t"
        "movq	%%r10, %%r9\n\t"
        "adcxq	%%rax, %%r8\n\t"
        "movq	%%r8, 96(%[r])\n\t"
        "adoxq	%%rcx, %%r9\n\t"
        "# A[13] * B\n\t"
        "mulxq	104(%[a]), %%rax, %%rcx\n\t"
        "movq	%%r10, %%r8\n\t"
        "adcxq	%%rax, %%r9\n\t"
        "movq	%%r9, 104(%[r])\n\t"
        "adoxq	%%rcx, %%r8\n\t"
        "# A[14] * B\n\t"
        "mulxq	112(%[a]), %%rax, %%rcx\n\t"
        "movq	%%r10, %%r9\n\t"
        "adcxq	%%rax, %%r8\n\t"
        "movq	%%r8, 112(%[r])\n\t"
        "adoxq	%%rcx, %%r9\n\t"
        "# A[15] * B\n\t"
        "mulxq	120(%[a]), %%rax, %%rcx\n\t"
        "movq	%%r10, %%r8\n\t"
        "adcxq	%%rax, %%r9\n\t"
        "movq	%%r9, 120(%[r])\n\t"
        "adoxq	%%rcx, %%r8\n\t"
        "# A[16] * B\n\t"
        "mulxq	128(%[a]), %%rax, %%rcx\n\t"
        "movq	%%r10, %%r9\n\t"
        "adcxq	%%rax, %%r8\n\t"
        "movq	%%r8, 128(%[r])\n\t"
        "adoxq	%%rcx, %%r9\n\t"
        "# A[17] * B\n\t"
        "mulxq	136(%[a]), %%rax, %%rcx\n\t"
        "movq	%%r10, %%r8\n\t"
        "adcxq	%%rax, %%r9\n\t"
        "movq	%%r9, 136(%[r])\n\t"
        "adoxq	%%rcx, %%r8\n\t"
        "# A[18] * B\n\t"
        "mulxq	144(%[a]), %%rax, %%rcx\n\t"
        "movq	%%r10, %%r9\n\t"
        "adcxq	%%rax, %%r8\n\t"
        "movq	%%r8, 144(%[r])\n\t"
        "adoxq	%%rcx, %%r9\n\t"
        "# A[19] * B\n\t"
        "mulxq	152(%[a]), %%rax, %%rcx\n\t"
        "movq	%%r10, %%r8\n\t"
        "adcxq	%%rax, %%r9\n\t"
        "movq	%%r9, 152(%[r])\n\t"
        "adoxq	%%rcx, %%r8\n\t"
        "# A[20] * B\n\t"
        "mulxq	160(%[a]), %%rax, %%rcx\n\t"
        "movq	%%r10, %%r9\n\t"
        "adcxq	%%rax, %%r8\n\t"
        "movq	%%r8, 160(%[r])\n\t"
        "adoxq	%%rcx, %%r9\n\t"
        "# A[21] * B\n\t"
        "mulxq	168(%[a]), %%rax, %%rcx\n\t"
        "movq	%%r10, %%r8\n\t"
        "adcxq	%%rax, %%r9\n\t"
        "movq	%%r9, 168(%[r])\n\t"
        "adoxq	%%rcx, %%r8\n\t"
        "# A[22] * B\n\t"
        "mulxq	176(%[a]), %%rax, %%rcx\n\t"
        "movq	%%r10, %%r9\n\t"
        "adcxq	%%rax, %%r8\n\t"
        "movq	%%r8, 176(%[r])\n\t"
        "adoxq	%%rcx, %%r9\n\t"
        /* Last digit: fold the outstanding CF carry into the top word. */
        "# A[23] * B\n\t"
        "mulxq	184(%[a]), %%rax, %%rcx\n\t"
        "movq	%%r10, %%r8\n\t"
        "adcxq	%%rax, %%r9\n\t"
        "adoxq	%%rcx, %%r8\n\t"
        "adcxq	%%r10, %%r8\n\t"
        "movq	%%r9, 184(%[r])\n\t"
        "movq	%%r8, 192(%[r])\n\t"
        :
        : [r] "r" (r), [a] "r" (a), [b] "r" (b)
        : "memory", "rax", "rdx", "rcx", "r8", "r9", "r10"
    );
}
#endif /* HAVE_INTEL_AVX2 */

/* Divide the double width number (d1|d0) by the divisor. (d1|d0 / div)
 *
 * Issues a single 128-by-64 bit DIVQ. The quotient must fit in one digit,
 * i.e. the caller has to guarantee d1 < div; otherwise the processor raises
 * a divide error.
 *
 * d1   The high order half of the number to divide.
 * d0   The low order half of the number to divide.
 * div  The divisor.
 * returns the quotient of the division (the remainder is discarded).
 */
static sp_digit div_3072_word_24(sp_digit d1, sp_digit d0, sp_digit div)
{
    sp_digit quot;
    sp_digit rem;

    /* DIVQ takes the dividend in rdx:rax and leaves the quotient in rax and
     * the remainder in rdx; the register constraints express exactly that.
     */
    __asm__ __volatile__ (
        "divq	%[div]\n\t"
        : "=a" (quot), "=d" (rem)
        : "a" (d0), "d" (d1), [div] "r" (div)
    );

    (void)rem;

    return quot;
}

/* Compare a with b in constant time.
 *
 * All 24 digits are always visited, from most to least significant, using
 * only conditional moves. rdx is an all-ones mask while the digits seen so
 * far are equal; once a difference is found it is cleared, so every lower
 * digit pair is masked to zero and can no longer change the result.
 *
 * a  A single precision integer (24 digits).
 * b  A single precision integer (24 digits).
 * return -ve, 0 or +ve if a is less than, equal to or greater than b
 * respectively.
 */
static int64_t sp_3072_cmp_24(sp_digit* a, sp_digit* b)
{
    sp_digit r = -1;
    sp_digit one = 1;

    __asm__ __volatile__ (
        "xorq	%%rcx, %%rcx\n\t"
        "movq	$-1, %%rdx\n\t"
        "movq	184(%[a]), %%rbx\n\t"
        "movq	184(%[b]), %%r8\n\t"
        "andq	%%rdx, %%rbx\n\t"
        "andq	%%rdx, %%r8\n\t"
        "subq	%%r8, %%rbx\n\t"
        "cmova	%[one], %[r]\n\t"
        "cmovc	%%rdx, %[r]\n\t"
        "cmovnz	%%rcx, %%rdx\n\t"
        "movq	176(%[a]), %%rbx\n\t"
        "movq	176(%[b]), %%r8\n\t"
        "andq	%%rdx, %%rbx\n\t"
        "andq	%%rdx, %%r8\n\t"
        "subq	%%r8, %%rbx\n\t"
        "cmova	%[one], %[r]\n\t"
        "cmovc	%%rdx, %[r]\n\t"
        "cmovnz	%%rcx, %%rdx\n\t"
        "movq	168(%[a]), %%rbx\n\t"
        "movq	168(%[b]), %%r8\n\t"
        "andq	%%rdx, %%rbx\n\t"
        "andq	%%rdx, %%r8\n\t"
        "subq	%%r8, %%rbx\n\t"
        "cmova	%[one], %[r]\n\t"
        "cmovc	%%rdx, %[r]\n\t"
        "cmovnz	%%rcx, %%rdx\n\t"
        "movq	160(%[a]), %%rbx\n\t"
        "movq	160(%[b]), %%r8\n\t"
        "andq	%%rdx, %%rbx\n\t"
        "andq	%%rdx, %%r8\n\t"
        "subq	%%r8, %%rbx\n\t"
        "cmova	%[one], %[r]\n\t"
        "cmovc	%%rdx, %[r]\n\t"
        "cmovnz	%%rcx, %%rdx\n\t"
        "movq	152(%[a]), %%rbx\n\t"
        "movq	152(%[b]), %%r8\n\t"
        "andq	%%rdx, %%rbx\n\t"
        "andq	%%rdx, %%r8\n\t"
        "subq	%%r8, %%rbx\n\t"
        "cmova	%[one], %[r]\n\t"
        "cmovc	%%rdx, %[r]\n\t"
        "cmovnz	%%rcx, %%rdx\n\t"
        "movq	144(%[a]), %%rbx\n\t"
        "movq	144(%[b]), %%r8\n\t"
        "andq	%%rdx, %%rbx\n\t"
        "andq	%%rdx, %%r8\n\t"
        "subq	%%r8, %%rbx\n\t"
        "cmova	%[one], %[r]\n\t"
        "cmovc	%%rdx, %[r]\n\t"
        "cmovnz	%%rcx, %%rdx\n\t"
        "movq	136(%[a]), %%rbx\n\t"
        "movq	136(%[b]), %%r8\n\t"
        "andq	%%rdx, %%rbx\n\t"
        "andq	%%rdx, %%r8\n\t"
        "subq	%%r8, %%rbx\n\t"
        "cmova	%[one], %[r]\n\t"
        "cmovc	%%rdx, %[r]\n\t"
        "cmovnz	%%rcx, %%rdx\n\t"
        "movq	128(%[a]), %%rbx\n\t"
        "movq	128(%[b]), %%r8\n\t"
        "andq	%%rdx, %%rbx\n\t"
        "andq	%%rdx, %%r8\n\t"
        "subq	%%r8, %%rbx\n\t"
        "cmova	%[one], %[r]\n\t"
        "cmovc	%%rdx, %[r]\n\t"
        "cmovnz	%%rcx, %%rdx\n\t"
        "movq	120(%[a]), %%rbx\n\t"
        "movq	120(%[b]), %%r8\n\t"
        "andq	%%rdx, %%rbx\n\t"
        "andq	%%rdx, %%r8\n\t"
        "subq	%%r8, %%rbx\n\t"
        "cmova	%[one], %[r]\n\t"
        "cmovc	%%rdx, %[r]\n\t"
        "cmovnz	%%rcx, %%rdx\n\t"
        "movq	112(%[a]), %%rbx\n\t"
        "movq	112(%[b]), %%r8\n\t"
        "andq	%%rdx, %%rbx\n\t"
        "andq	%%rdx, %%r8\n\t"
        "subq	%%r8, %%rbx\n\t"
        "cmova	%[one], %[r]\n\t"
        "cmovc	%%rdx, %[r]\n\t"
        "cmovnz	%%rcx, %%rdx\n\t"
        "movq	104(%[a]), %%rbx\n\t"
        "movq	104(%[b]), %%r8\n\t"
        "andq	%%rdx, %%rbx\n\t"
        "andq	%%rdx, %%r8\n\t"
        "subq	%%r8, %%rbx\n\t"
        "cmova	%[one], %[r]\n\t"
        "cmovc	%%rdx, %[r]\n\t"
        "cmovnz	%%rcx, %%rdx\n\t"
        "movq	96(%[a]), %%rbx\n\t"
        "movq	96(%[b]), %%r8\n\t"
        "andq	%%rdx, %%rbx\n\t"
        "andq	%%rdx, %%r8\n\t"
        "subq	%%r8, %%rbx\n\t"
        "cmova	%[one], %[r]\n\t"
        "cmovc	%%rdx, %[r]\n\t"
        "cmovnz	%%rcx, %%rdx\n\t"
        "movq	88(%[a]), %%rbx\n\t"
        "movq	88(%[b]), %%r8\n\t"
        "andq	%%rdx, %%rbx\n\t"
        "andq	%%rdx, %%r8\n\t"
        "subq	%%r8, %%rbx\n\t"
        "cmova	%[one], %[r]\n\t"
        "cmovc	%%rdx, %[r]\n\t"
        "cmovnz	%%rcx, %%rdx\n\t"
        "movq	80(%[a]), %%rbx\n\t"
        "movq	80(%[b]), %%r8\n\t"
        "andq	%%rdx, %%rbx\n\t"
        "andq	%%rdx, %%r8\n\t"
        "subq	%%r8, %%rbx\n\t"
        "cmova	%[one], %[r]\n\t"
        "cmovc	%%rdx, %[r]\n\t"
        "cmovnz	%%rcx, %%rdx\n\t"
        "movq	72(%[a]), %%rbx\n\t"
        "movq	72(%[b]), %%r8\n\t"
        "andq	%%rdx, %%rbx\n\t"
        "andq	%%rdx, %%r8\n\t"
        "subq	%%r8, %%rbx\n\t"
        "cmova	%[one], %[r]\n\t"
        "cmovc	%%rdx, %[r]\n\t"
        "cmovnz	%%rcx, %%rdx\n\t"
        "movq	64(%[a]), %%rbx\n\t"
        "movq	64(%[b]), %%r8\n\t"
        "andq	%%rdx, %%rbx\n\t"
        "andq	%%rdx, %%r8\n\t"
        "subq	%%r8, %%rbx\n\t"
        "cmova	%[one], %[r]\n\t"
        "cmovc	%%rdx, %[r]\n\t"
        "cmovnz	%%rcx, %%rdx\n\t"
        "movq	56(%[a]), %%rbx\n\t"
        "movq	56(%[b]), %%r8\n\t"
        "andq	%%rdx, %%rbx\n\t"
        "andq	%%rdx, %%r8\n\t"
        "subq	%%r8, %%rbx\n\t"
        "cmova	%[one], %[r]\n\t"
        "cmovc	%%rdx, %[r]\n\t"
        "cmovnz	%%rcx, %%rdx\n\t"
        "movq	48(%[a]), %%rbx\n\t"
        "movq	48(%[b]), %%r8\n\t"
        "andq	%%rdx, %%rbx\n\t"
        "andq	%%rdx, %%r8\n\t"
        "subq	%%r8, %%rbx\n\t"
        "cmova	%[one], %[r]\n\t"
        "cmovc	%%rdx, %[r]\n\t"
        "cmovnz	%%rcx, %%rdx\n\t"
        "movq	40(%[a]), %%rbx\n\t"
        "movq	40(%[b]), %%r8\n\t"
        "andq	%%rdx, %%rbx\n\t"
        "andq	%%rdx, %%r8\n\t"
        "subq	%%r8, %%rbx\n\t"
        "cmova	%[one], %[r]\n\t"
        "cmovc	%%rdx, %[r]\n\t"
        "cmovnz	%%rcx, %%rdx\n\t"
        "movq	32(%[a]), %%rbx\n\t"
        "movq	32(%[b]), %%r8\n\t"
        "andq	%%rdx, %%rbx\n\t"
        "andq	%%rdx, %%r8\n\t"
        "subq	%%r8, %%rbx\n\t"
        "cmova	%[one], %[r]\n\t"
        "cmovc	%%rdx, %[r]\n\t"
        "cmovnz	%%rcx, %%rdx\n\t"
        "movq	24(%[a]), %%rbx\n\t"
        "movq	24(%[b]), %%r8\n\t"
        "andq	%%rdx, %%rbx\n\t"
        "andq	%%rdx, %%r8\n\t"
        "subq	%%r8, %%rbx\n\t"
        "cmova	%[one], %[r]\n\t"
        "cmovc	%%rdx, %[r]\n\t"
        "cmovnz	%%rcx, %%rdx\n\t"
        "movq	16(%[a]), %%rbx\n\t"
        "movq	16(%[b]), %%r8\n\t"
        "andq	%%rdx, %%rbx\n\t"
        "andq	%%rdx, %%r8\n\t"
        "subq	%%r8, %%rbx\n\t"
        "cmova	%[one], %[r]\n\t"
        "cmovc	%%rdx, %[r]\n\t"
        "cmovnz	%%rcx, %%rdx\n\t"
        "movq	8(%[a]), %%rbx\n\t"
        "movq	8(%[b]), %%r8\n\t"
        "andq	%%rdx, %%rbx\n\t"
        "andq	%%rdx, %%r8\n\t"
        "subq	%%r8, %%rbx\n\t"
        "cmova	%[one], %[r]\n\t"
        "cmovc	%%rdx, %[r]\n\t"
        "cmovnz	%%rcx, %%rdx\n\t"
        "movq	0(%[a]), %%rbx\n\t"
        "movq	0(%[b]), %%r8\n\t"
        "andq	%%rdx, %%rbx\n\t"
        "andq	%%rdx, %%r8\n\t"
        "subq	%%r8, %%rbx\n\t"
        "cmova	%[one], %[r]\n\t"
        "cmovc	%%rdx, %[r]\n\t"
        "cmovnz	%%rcx, %%rdx\n\t"
        /* r started as -1: if every digit matched, rdx is still all ones and
         * the xor yields 0; otherwise rdx is 0 and r keeps its +1 / -1. */
        "xorq	%%rdx, %[r]\n\t"
        : [r] "+r" (r)
        : [a] "r" (a), [b] "r" (b), [one] "r" (one)
        /* "memory": the asm reads a[] and b[] through the pointer operands,
         * which the compiler cannot see from the operand list alone. */
        : "memory", "rax", "rdx", "rcx", "rbx", "r8"
    );

    return r;
}

/* Divide a by d and put the remainder into r (m*d + r = a)
 * m is not calculated as it is not needed at this time.
 *
 * Schoolbook long division, one quotient digit per iteration. Each digit is
 * estimated from the top two words of the running remainder and the top
 * digit of d, the estimate times d is subtracted, and d is conditionally
 * added back (up to twice) while the running remainder is negative.
 *
 * a  Number to be divided (48 digits are read).
 * d  Number to divide with (24 digits).
 * m  Multiplier result (unused).
 * r  Remainder from the division (24 digits).
 * returns MP_OKAY indicating success.
 */
static INLINE int sp_3072_div_24(sp_digit* a, sp_digit* d, sp_digit* m,
        sp_digit* r)
{
    sp_digit t1[48], t2[25];
    sp_digit div, r1;
    sp_digit mask, hi;
    int i;
#ifdef HAVE_INTEL_AVX2
    word32 cpuid_flags = cpuid_get_flags();
#endif

    (void)m;

    div = d[23];
    XMEMCPY(t1, a, sizeof(*t1) * 2 * 24);

    /* The first quotient digit only fits in one word when the top half of
     * the number is below d. Subtracting d from the top half does not change
     * the remainder, so do it when the top half is not already smaller.
     */
    r1 = sp_3072_cmp_24(&t1[24], d) >= 0;
    sp_3072_cond_sub_24(&t1[24], &t1[24], d, (sp_digit)0 - r1);

    for (i=23; i>=0; i--) {
        /* When the high word equals the top divisor digit the true word
         * quotient does not fit in 64 bits and DIVQ would fault. Divide with
         * a high word that is one smaller and force the estimate to the
         * maximum digit; the add-back steps below correct any overshoot.
         */
        mask = (sp_digit)0 - (sp_digit)(t1[24 + i] == div);
        hi = t1[24 + i] + mask;
        r1 = div_3072_word_24(hi, t1[24 + i - 1], div);
        r1 |= mask;

#ifdef HAVE_INTEL_AVX2
        if (IS_INTEL_BMI2(cpuid_flags) && IS_INTEL_ADX(cpuid_flags))
            sp_3072_mul_d_avx2_24(t2, d, r1);
        else
#endif
            sp_3072_mul_d_24(t2, d, r1);
        t1[24 + i] += sp_3072_sub_in_place_24(&t1[i], t2);
        t1[24 + i] -= t2[24];
        /* t1[24 + i] is now either zero or all ones (negative remainder);
         * use it as the mask selecting whether d is added back. */
        sp_3072_mask_24(t2, d, t1[24 + i]);
        t1[24 + i] += sp_3072_add_24(&t1[i], &t1[i], t2);
        sp_3072_mask_24(t2, d, t1[24 + i]);
        t1[24 + i] += sp_3072_add_24(&t1[i], &t1[i], t2);
    }

    /* Final correction must subtract the divisor itself; t2 only holds
     * whatever the last masking step left behind (zero or d). */
    r1 = sp_3072_cmp_24(t1, d) >= 0;
    sp_3072_cond_sub_24(r, t1, d, (sp_digit)0 - r1);

    return MP_OKAY;
}

/* Reduce a modulo m into r. (r = a mod m)
 *
 * Thin wrapper around sp_3072_div_24() that asks only for the remainder
 * (the quotient pointer is passed as NULL).
 *
 * r  A single precision number that is the reduced result.
 * a  A single precision number that is to be reduced.
 * m  A single precision number that is the modulus to reduce with.
 * returns MP_OKAY indicating success.
 */
static INLINE int sp_3072_mod_24(sp_digit* r, sp_digit* a, sp_digit* m)
{
    return sp_3072_div_24(a, m, NULL, r);
}

/* Modular exponentiate a to the e mod m. (r = a^e mod m)
 *
 * Fixed 5-bit window exponentiation in Montgomery form: a table of the 32
 * powers a^0..a^31 is built, then the exponent is consumed 5 bits at a time
 * from the most significant end (five squarings and one table multiply per
 * window). Any bits left over at the bottom are handled separately before
 * the result is converted back out of Montgomery form.
 *
 * r        A single precision number that is the result of the operation.
 * a        A single precision number being exponentiated.
 * e        A single precision number that is the exponent.
 * bits     The number of bits in the exponent.
 * m        A single precision number that is the modulus.
 * reduceA  When non-zero a is reduced modulo m before being converted to
 *          Montgomery form.
 * returns 0 on success and MEMORY_E on dynamic memory allocation failure.
 */
static int sp_3072_mod_exp_24(sp_digit* r, sp_digit* a, sp_digit* e,
        int bits, sp_digit* m, int reduceA)
{
#ifndef WOLFSSL_SMALL_STACK
    sp_digit t[32][48];
#else
    sp_digit* t[32];
    sp_digit* td;
#endif
    sp_digit* norm;
    sp_digit mp = 1;
    sp_digit n;
    sp_digit mask;
    int i;
    int k;
    int c, y;
    int err = MP_OKAY;

#ifdef WOLFSSL_SMALL_STACK
    td = (sp_digit*)XMALLOC(sizeof(sp_digit) * 32 * 48, NULL,
                            DYNAMIC_TYPE_TMP_BUFFER);
    if (td == NULL)
        err = MEMORY_E;

    if (err == MP_OKAY) {
        /* Carve the single allocation into 32 table entries of 48 digits. */
        for (i=0; i<32; i++)
            t[i] = td + i * 48;
        norm = t[0];
    }
#else
    norm = t[0];
#endif

    if (err == MP_OKAY) {
        sp_3072_mont_setup(m, &mp);
        sp_3072_mont_norm_24(norm, m);

        /* t[1] = a * 2^3072 mod m: a goes in the upper half, zeros below. */
        XMEMSET(t[1], 0, sizeof(sp_digit) * 24);
        if (reduceA)
            err = sp_3072_mod_24(t[1] + 24, a, m);
        else
            XMEMCPY(t[1] + 24, a, sizeof(sp_digit) * 24);
        if (err == MP_OKAY)
            err = sp_3072_mod_24(t[1], t[1], m);
    }

    if (err == MP_OKAY) {
        /* Fill in t[2]..t[31]: even entries are the square of entry k/2,
         * odd entries the product of entries k/2+1 and k/2. */
        for (k = 2; k < 32; k++) {
            if ((k & 1) == 0)
                sp_3072_mont_sqr_24(t[k], t[k / 2], m, mp);
            else
                sp_3072_mont_mul_24(t[k], t[k / 2 + 1], t[k / 2], m, mp);
        }

        /* Start with the top 5 bits of the most significant exponent word.
         * n holds the not yet consumed bits left-aligned, c counts them. */
        i = (bits - 1) / 64;
        n = e[i--];
        y = n >> 59;
        n <<= 5;
        c = 59;
        XMEMCPY(r, t[y], sizeof(sp_digit) * 24);
        while (i >= 0 || c >= 5) {
            if (c == 0) {
                /* Current word exhausted: window starts a fresh word. */
                n = e[i--];
                y = n >> 59;
                n <<= 5;
                c = 59;
            }
            else if (c < 5) {
                /* Window straddles two exponent words. */
                y = n >> 59;
                n = e[i--];
                c = 5 - c;
                y |= n >> (64 - c);
                n <<= c;
                c = 64 - c;
            }
            else {
                y = (n >> 59) & 0x1f;
                n <<= 5;
                c -= 5;
            }

            for (k = 0; k < 5; k++)
                sp_3072_mont_sqr_24(r, r, m, mp);

            sp_3072_mont_mul_24(r, r, t[y], m, mp);
        }

        /* Fewer than 5 bits remain: they are the low c bits of e[0]. */
        y = e[0] & ((1 << c) - 1);
        while (c > 0) {
            sp_3072_mont_sqr_24(r, r, m, mp);
            c--;
        }
        sp_3072_mont_mul_24(r, r, t[y], m, mp);

        /* Leave Montgomery form: reduce r with a zeroed upper half. */
        XMEMSET(&r[24], 0, sizeof(sp_digit) * 24);
        sp_3072_mont_reduce_24(r, m, mp);

        /* Final conditional subtract brings r below m. */
        mask = 0 - (sp_3072_cmp_24(r, m) >= 0);
        sp_3072_cond_sub_24(r, r, m, mask);
    }

#ifdef WOLFSSL_SMALL_STACK
    if (td != NULL)
        XFREE(td, NULL, DYNAMIC_TYPE_TMP_BUFFER);
#endif

    return err;
}

#ifdef HAVE_INTEL_AVX2
/* Reduce the number back to 3072 bits using Montgomery reduction.
 *
 * Variant using the MULX, ADCX and ADOX instructions. The loop runs 24
 * times; each pass computes mu = a[i] * mp, adds m * mu to a[i..i+23] and
 * propagates the carry into a[i+24], so a must hold 48 digits. The pointer
 * a is advanced by one digit per pass, so after the loop it addresses the
 * upper half; the final conditional subtract writes the 24 digit result
 * back to the start of the original buffer (a - 24).
 *
 * a   A single precision number to reduce in place.
 * m   The single precision number representing the modulus.
 * mp  The digit representing the negative inverse of m mod 2^n.
 */
SP_NOINLINE static void sp_3072_mont_reduce_avx2_24(sp_digit* a, sp_digit* m,
        sp_digit mp)
{
    /* Carry out of the top digit, accumulated across loop passes. */
    sp_digit ca = 0;

    /* Register use inside the loop:
     *   rcx      loop counter i
     *   rdx      mu (implicit MULX multiplicand)
     *   r9       zero (its xorq also clears CF and OF at the top of a pass)
     *   r12      a[i+1] carried to the next pass as the new a[i]; it is
     *            only written to memory once, after the loop
     *   r10/r11  alternate as the digit currently being accumulated
     *   rax/r8   low/high halves of each m[j] * mu product
     * Low halves are added on the CF chain (ADCX), high halves on the OF
     * chain (ADOX).
     */
    __asm__ __volatile__ (
        "# i = 0\n\t"
        "movq	0(%[a]), %%r12\n\t"
        "xorq	%%rcx, %%rcx\n\t"
        "\nL_mont_loop_avx2_24:\n\t"
        "xorq	%%r9, %%r9\n\t"
        "movq	%%r12, %%r10\n\t"
        "# mu = a[i] * mp\n\t"
        "movq	%%r12, %%rdx\n\t"
        "mulxq	%[mp], %%rdx, %%r8\n\t"
        "# a[i+0] += m[0] * mu\n\t"
        "mulxq	0(%[m]), %%rax, %%r8\n\t"
        "movq	8(%[a]), %%r12\n\t"
        "adcxq	%%rax, %%r10\n\t"
        "adoxq	%%r8, %%r12\n\t"
        "# a[i+1] += m[1] * mu\n\t"
        "mulxq	8(%[m]), %%rax, %%r8\n\t"
        "movq	16(%[a]), %%r10\n\t"
        "adcxq	%%rax, %%r12\n\t"
        "adoxq	%%r8, %%r10\n\t"
        "# a[i+2] += m[2] * mu\n\t"
        "mulxq	16(%[m]), %%rax, %%r8\n\t"
        "movq	24(%[a]), %%r11\n\t"
        "adcxq	%%rax, %%r10\n\t"
        "adoxq	%%r8, %%r11\n\t"
        "movq	%%r10, 16(%[a])\n\t"
        "# a[i+3] += m[3] * mu\n\t"
        "mulxq	24(%[m]), %%rax, %%r8\n\t"
        "movq	32(%[a]), %%r10\n\t"
        "adcxq	%%rax, %%r11\n\t"
        "adoxq	%%r8, %%r10\n\t"
        "movq	%%r11, 24(%[a])\n\t"
        "# a[i+4] += m[4] * mu\n\t"
        "mulxq	32(%[m]), %%rax, %%r8\n\t"
        "movq	40(%[a]), %%r11\n\t"
        "adcxq	%%rax, %%r10\n\t"
        "adoxq	%%r8, %%r11\n\t"
        "movq	%%r10, 32(%[a])\n\t"
        "# a[i+5] += m[5] * mu\n\t"
        "mulxq	40(%[m]), %%rax, %%r8\n\t"
        "movq	48(%[a]), %%r10\n\t"
        "adcxq	%%rax, %%r11\n\t"
        "adoxq	%%r8, %%r10\n\t"
        "movq	%%r11, 40(%[a])\n\t"
        "# a[i+6] += m[6] * mu\n\t"
        "mulxq	48(%[m]), %%rax, %%r8\n\t"
        "movq	56(%[a]), %%r11\n\t"
        "adcxq	%%rax, %%r10\n\t"
        "adoxq	%%r8, %%r11\n\t"
        "movq	%%r10, 48(%[a])\n\t"
        "# a[i+7] += m[7] * mu\n\t"
        "mulxq	56(%[m]), %%rax, %%r8\n\t"
        "movq	64(%[a]), %%r10\n\t"
        "adcxq	%%rax, %%r11\n\t"
        "adoxq	%%r8, %%r10\n\t"
        "movq	%%r11, 56(%[a])\n\t"
        "# a[i+8] += m[8] * mu\n\t"
        "mulxq	64(%[m]), %%rax, %%r8\n\t"
        "movq	72(%[a]), %%r11\n\t"
        "adcxq	%%rax, %%r10\n\t"
        "adoxq	%%r8, %%r11\n\t"
        "movq	%%r10, 64(%[a])\n\t"
        "# a[i+9] += m[9] * mu\n\t"
        "mulxq	72(%[m]), %%rax, %%r8\n\t"
        "movq	80(%[a]), %%r10\n\t"
        "adcxq	%%rax, %%r11\n\t"
        "adoxq	%%r8, %%r10\n\t"
        "movq	%%r11, 72(%[a])\n\t"
        "# a[i+10] += m[10] * mu\n\t"
        "mulxq	80(%[m]), %%rax, %%r8\n\t"
        "movq	88(%[a]), %%r11\n\t"
        "adcxq	%%rax, %%r10\n\t"
        "adoxq	%%r8, %%r11\n\t"
        "movq	%%r10, 80(%[a])\n\t"
        "# a[i+11] += m[11] * mu\n\t"
        "mulxq	88(%[m]), %%rax, %%r8\n\t"
        "movq	96(%[a]), %%r10\n\t"
        "adcxq	%%rax, %%r11\n\t"
        "adoxq	%%r8, %%r10\n\t"
        "movq	%%r11, 88(%[a])\n\t"
        "# a[i+12] += m[12] * mu\n\t"
        "mulxq	96(%[m]), %%rax, %%r8\n\t"
        "movq	104(%[a]), %%r11\n\t"
        "adcxq	%%rax, %%r10\n\t"
        "adoxq	%%r8, %%r11\n\t"
        "movq	%%r10, 96(%[a])\n\t"
        "# a[i+13] += m[13] * mu\n\t"
        "mulxq	104(%[m]), %%rax, %%r8\n\t"
        "movq	112(%[a]), %%r10\n\t"
        "adcxq	%%rax, %%r11\n\t"
        "adoxq	%%r8, %%r10\n\t"
        "movq	%%r11, 104(%[a])\n\t"
        "# a[i+14] += m[14] * mu\n\t"
        "mulxq	112(%[m]), %%rax, %%r8\n\t"
        "movq	120(%[a]), %%r11\n\t"
        "adcxq	%%rax, %%r10\n\t"
        "adoxq	%%r8, %%r11\n\t"
        "movq	%%r10, 112(%[a])\n\t"
        "# a[i+15] += m[15] * mu\n\t"
        "mulxq	120(%[m]), %%rax, %%r8\n\t"
        "movq	128(%[a]), %%r10\n\t"
        "adcxq	%%rax, %%r11\n\t"
        "adoxq	%%r8, %%r10\n\t"
        "movq	%%r11, 120(%[a])\n\t"
        "# a[i+16] += m[16] * mu\n\t"
        "mulxq	128(%[m]), %%rax, %%r8\n\t"
        "movq	136(%[a]), %%r11\n\t"
        "adcxq	%%rax, %%r10\n\t"
        "adoxq	%%r8, %%r11\n\t"
        "movq	%%r10, 128(%[a])\n\t"
        "# a[i+17] += m[17] * mu\n\t"
        "mulxq	136(%[m]), %%rax, %%r8\n\t"
        "movq	144(%[a]), %%r10\n\t"
        "adcxq	%%rax, %%r11\n\t"
        "adoxq	%%r8, %%r10\n\t"
        "movq	%%r11, 136(%[a])\n\t"
        "# a[i+18] += m[18] * mu\n\t"
        "mulxq	144(%[m]), %%rax, %%r8\n\t"
        "movq	152(%[a]), %%r11\n\t"
        "adcxq	%%rax, %%r10\n\t"
        "adoxq	%%r8, %%r11\n\t"
        "movq	%%r10, 144(%[a])\n\t"
        "# a[i+19] += m[19] * mu\n\t"
        "mulxq	152(%[m]), %%rax, %%r8\n\t"
        "movq	160(%[a]), %%r10\n\t"
        "adcxq	%%rax, %%r11\n\t"
        "adoxq	%%r8, %%r10\n\t"
        "movq	%%r11, 152(%[a])\n\t"
        "# a[i+20] += m[20] * mu\n\t"
        "mulxq	160(%[m]), %%rax, %%r8\n\t"
        "movq	168(%[a]), %%r11\n\t"
        "adcxq	%%rax, %%r10\n\t"
        "adoxq	%%r8, %%r11\n\t"
        "movq	%%r10, 160(%[a])\n\t"
        "# a[i+21] += m[21] * mu\n\t"
        "mulxq	168(%[m]), %%rax, %%r8\n\t"
        "movq	176(%[a]), %%r10\n\t"
        "adcxq	%%rax, %%r11\n\t"
        "adoxq	%%r8, %%r10\n\t"
        "movq	%%r11, 168(%[a])\n\t"
        "# a[i+22] += m[22] * mu\n\t"
        "mulxq	176(%[m]), %%rax, %%r8\n\t"
        "movq	184(%[a]), %%r11\n\t"
        "adcxq	%%rax, %%r10\n\t"
        "adoxq	%%r8, %%r11\n\t"
        "movq	%%r10, 176(%[a])\n\t"
        "# a[i+23] += m[23] * mu\n\t"
        "mulxq	184(%[m]), %%rax, %%r8\n\t"
        "movq	192(%[a]), %%r10\n\t"
        "adcxq	%%rax, %%r11\n\t"
        "adoxq	%%r8, %%r10\n\t"
        "movq	%%r11, 184(%[a])\n\t"
        /* Add the carry from the previous pass into a[i+24], then collect
         * both outstanding carry flags (OF and CF) as the new ca. */
        "adcxq	%[ca], %%r10\n\t"
        "movq	%%r9, %[ca]\n\t"
        "adoxq	%%r9, %[ca]\n\t"
        "adcxq	%%r9, %[ca]\n\t"
        "movq	%%r10, 192(%[a])\n\t"
        "# i += 1\n\t"
        "addq	$8, %[a]\n\t"
        "addq	$1, %%rcx\n\t"
        "cmpq	$24, %%rcx\n\t"
        "jl	L_mont_loop_avx2_24\n\t"
        "movq	%%r12, 0(%[a])\n\t"
        : [ca] "+r" (ca), [a] "+r" (a)
        : [m] "r" (m), [mp] "r" (mp)
        : "memory", "rax", "rdx", "rcx", "r8", "r9", "r10", "r11", "r12"
    );

    /* a now points at the upper 24 digits; subtract m when the reduction
     * carried out of the top digit and store the result at the buffer start.
     */
    sp_3072_cond_sub_24(a - 24, a, m, (sp_digit)0 - ca);
}
#endif /* HAVE_INTEL_AVX2 */

#ifdef HAVE_INTEL_AVX2
/* Montgomery multiplication of two 24-digit values using the AVX2 code path.
 * (r = a * b mod m, operands and result in Montgomery form)
 *
 * r   Receives the result; it is first filled with the full product by
 *     sp_3072_mul_avx2_24() and then reduced in place.
 * a   First multiplicand in Montgomery form.
 * b   Second multiplicand in Montgomery form.
 * m   Modulus (prime).
 * mp  Montgomery multiplier passed through to the reduction step.
 */
static void sp_3072_mont_mul_avx2_24(sp_digit* r, sp_digit* a, sp_digit* b,
        sp_digit* m, sp_digit mp)
{
    /* Plain multiplication into r ... */
    sp_3072_mul_avx2_24(r, a, b);
    /* ... followed by an in-place Montgomery reduction of that product. */
    sp_3072_mont_reduce_avx2_24(r, m, mp);
}

#endif /* HAVE_INTEL_AVX2 */
#ifdef HAVE_INTEL_AVX2
/* Montgomery squaring of a 24-digit value using the AVX2 code path.
 * (r = a * a mod m, operand and result in Montgomery form)
 *
 * r   Receives the result; it is first filled with the full square by
 *     sp_3072_sqr_avx2_24() and then reduced in place.
 * a   Number to square in Montgomery form.
 * m   Modulus (prime).
 * mp  Montgomery multiplier passed through to the reduction step.
 */
static void sp_3072_mont_sqr_avx2_24(sp_digit* r, sp_digit* a, sp_digit* m,
        sp_digit mp)
{
    /* Plain squaring into r ... */
    sp_3072_sqr_avx2_24(r, a);
    /* ... followed by an in-place Montgomery reduction of that square. */
    sp_3072_mont_reduce_avx2_24(r, m, mp);
}

#endif /* HAVE_INTEL_AVX2 */
#ifdef HAVE_INTEL_AVX2
/* Modular exponentiation with a fixed 5-bit window. (r = a^e mod m)
 *
 * A table of the 32 Montgomery-form powers a^0 .. a^31 is built first. The
 * exponent is then consumed from its most significant digit downwards, five
 * bits at a time: five Montgomery squarings followed by one Montgomery
 * multiplication by the table entry selected by the window.
 *
 * r        Result of the operation. Used as a 48-digit work area; the low
 *          24 digits hold the result on return.
 * a        Number being exponentiated (24 digits).
 * e        Exponent, least significant digit first.
 * bits     Number of bits in the exponent.
 * m        Modulus (24 digits).
 * reduceA  When non-zero a is reduced modulo m before being converted to
 *          Montgomery form; when zero a is used as given.
 * returns 0 on success, MEMORY_E when the table cannot be allocated
 * (WOLFSSL_SMALL_STACK builds only), or the error code reported by
 * sp_3072_mod_24().
 */
static int sp_3072_mod_exp_avx2_24(sp_digit* r, sp_digit* a, sp_digit* e,
        int bits, sp_digit* m, int reduceA)
{
#ifdef WOLFSSL_SMALL_STACK
    sp_digit* td;
    sp_digit* t[32];
#else
    sp_digit t[32][48];
#endif
    sp_digit* norm;
    sp_digit mp = 1;
    sp_digit n;
    sp_digit mask;
    int err = MP_OKAY;
    int i;
    int k;
    int c;
    int y;

#ifdef WOLFSSL_SMALL_STACK
    /* One heap block holds all 32 table entries of 48 digits each. */
    td = (sp_digit*)XMALLOC(sizeof(sp_digit) * 32 * 48, NULL,
                            DYNAMIC_TYPE_TMP_BUFFER);
    if (td == NULL) {
        err = MEMORY_E;
    }
    else {
        for (i = 0; i < 32; i++) {
            t[i] = &td[i * 48];
        }
        norm = t[0];
    }
#else
    norm = t[0];
#endif

    if (err == MP_OKAY) {
        sp_3072_mont_setup(m, &mp);
        /* t[0] doubles as the Montgomery normalizer. */
        sp_3072_mont_norm_24(norm, m);

        /* Build a 48-digit value with a (or a mod m) in the upper 24 digits
         * and zeros below, then reduce it modulo m to obtain t[1]. */
        XMEMSET(t[1], 0, sizeof(sp_digit) * 24);
        if (reduceA) {
            err = sp_3072_mod_24(t[1] + 24, a, m);
        }
        else {
            XMEMCPY(t[1] + 24, a, sizeof(sp_digit) * 24);
        }
        if (err == MP_OKAY) {
            err = sp_3072_mod_24(t[1], t[1], m);
        }
    }

    if (err == MP_OKAY) {
        /* Fill t[2] .. t[31]: even entries are the square of t[k/2], odd
         * entries are the product of t[k/2 + 1] and t[k/2]. */
        for (k = 2; k < 32; k++) {
            if ((k & 1) == 0) {
                sp_3072_mont_sqr_avx2_24(t[k], t[k >> 1], m, mp);
            }
            else {
                sp_3072_mont_mul_avx2_24(t[k], t[(k >> 1) + 1], t[k >> 1],
                                         m, mp);
            }
        }

        /* Start with the top five bits of the most significant digit. */
        i = (bits - 1) / 64;
        n = e[i--];
        y = n >> 59;
        n <<= 5;
        c = 59;
        XMEMCPY(r, t[y], sizeof(sp_digit) * 24);

        /* c counts the unread bits left in n; i indexes the next digit. */
        while (i >= 0 || c >= 5) {
            if (c >= 5) {
                /* Whole window available in the current digit. */
                y = (n >> 59) & 0x1f;
                n <<= 5;
                c -= 5;
            }
            else if (c == 0) {
                /* Current digit exhausted: window starts a new digit. */
                n = e[i--];
                y = n >> 59;
                n <<= 5;
                c = 59;
            }
            else {
                /* Window straddles two digits: take the c remaining bits,
                 * then the missing 5 - c bits from the next digit. */
                y = n >> 59;
                n = e[i--];
                c = 5 - c;
                y |= n >> (64 - c);
                n <<= c;
                c = 64 - c;
            }

            for (k = 0; k < 5; k++) {
                sp_3072_mont_sqr_avx2_24(r, r, m, mp);
            }

            sp_3072_mont_mul_avx2_24(r, r, t[y], m, mp);
        }

        /* Fewer than five bits remain: they are the low c bits of e[0]. */
        y = e[0] & ((1 << c) - 1);
        while (c > 0) {
            sp_3072_mont_sqr_avx2_24(r, r, m, mp);
            c--;
        }
        sp_3072_mont_mul_avx2_24(r, r, t[y], m, mp);

        /* Clear the upper half and reduce once more to leave Montgomery
         * form. */
        XMEMSET(&r[24], 0, sizeof(sp_digit) * 24);
        sp_3072_mont_reduce_avx2_24(r, m, mp);

        /* Final subtraction of m when r >= m, selected by mask. */
        mask = 0 - (sp_3072_cmp_24(r, m) >= 0);
        sp_3072_cond_sub_24(r, r, m, mask);
    }

#ifdef WOLFSSL_SMALL_STACK
    if (td != NULL) {
        XFREE(td, NULL, DYNAMIC_TYPE_TMP_BUFFER);
    }
#endif

    return err;
}
#endif /* HAVE_INTEL_AVX2 */

#endif /* !SP_RSA_PRIVATE_EXP_D && WOLFSSL_HAVE_SP_RSA */

/* Compute the Montgomery normalizer: r = 2^n mod m, where n is the number of
 * bits being reduced by.
 * The modulus is required to be a full 3072 bits, so the value is obtained
 * with a single subtraction of m from zero.
 *
 * r  A single precision number (48 digits) that receives the result.
 * m  A single precision number that is the modulus.
 */
static void sp_3072_mont_norm_48(sp_digit* r, sp_digit* m)
{
    /* Start from zero across all 48 digits ... */
    XMEMSET(r, 0, sizeof(*r) * 48);

    /* ... and subtract the modulus in place: r = 2^n mod m */
    sp_3072_sub_in_place_48(r, m);
}

/* Conditionally subtract b from a using the mask m. (r = a - (b & m))
 * m is -1 (all bits set) to subtract and 0 to leave a unchanged.
 *
 * Every digit of b is ANDed with m into a local copy, and that copy is then
 * subtracted from a, so the same instruction sequence is executed for either
 * mask value.
 *
 * r  A single precision number representing condition subtract result.
 * a  A single precision number to subtract from.
 * b  A single precision number to subtract.
 * m  Mask value to apply.
 * returns 0 when the subtraction did not borrow out of the top digit and
 * all bits set (0 - 1) when it did.
 */
static sp_digit sp_3072_cond_sub_48(sp_digit* r, sp_digit* a, sp_digit* b,
        sp_digit m)
{
    /* Masked copy of b: t[i] = b[i] & m. */
    sp_digit t[48];
    /* Borrow accumulator; only touched by the final sbbq. */
    sp_digit c = 0;

    __asm__ __volatile__ (
        /* Phase 1: t[i] = b[i] & m, two digits per step. */
        "movq	0(%[b]), %%rax\n\t"
        "movq	8(%[b]), %%rcx\n\t"
        "andq	%[m], %%rax\n\t"
        "andq	%[m], %%rcx\n\t"
        "movq	%%rax, 0(%[t])\n\t"
        "movq	%%rcx, 8(%[t])\n\t"
        "movq	16(%[b]), %%rax\n\t"
        "movq	24(%[b]), %%rcx\n\t"
        "andq	%[m], %%rax\n\t"
        "andq	%[m], %%rcx\n\t"
        "movq	%%rax, 16(%[t])\n\t"
        "movq	%%rcx, 24(%[t])\n\t"
        "movq	32(%[b]), %%rax\n\t"
        "movq	40(%[b]), %%rcx\n\t"
        "andq	%[m], %%rax\n\t"
        "andq	%[m], %%rcx\n\t"
        "movq	%%rax, 32(%[t])\n\t"
        "movq	%%rcx, 40(%[t])\n\t"
        "movq	48(%[b]), %%rax\n\t"
        "movq	56(%[b]), %%rcx\n\t"
        "andq	%[m], %%rax\n\t"
        "andq	%[m], %%rcx\n\t"
        "movq	%%rax, 48(%[t])\n\t"
        "movq	%%rcx, 56(%[t])\n\t"
        "movq	64(%[b]), %%rax\n\t"
        "movq	72(%[b]), %%rcx\n\t"
        "andq	%[m], %%rax\n\t"
        "andq	%[m], %%rcx\n\t"
        "movq	%%rax, 64(%[t])\n\t"
        "movq	%%rcx, 72(%[t])\n\t"
        "movq	80(%[b]), %%rax\n\t"
        "movq	88(%[b]), %%rcx\n\t"
        "andq	%[m], %%rax\n\t"
        "andq	%[m], %%rcx\n\t"
        "movq	%%rax, 80(%[t])\n\t"
        "movq	%%rcx, 88(%[t])\n\t"
        "movq	96(%[b]), %%rax\n\t"
        "movq	104(%[b]), %%rcx\n\t"
        "andq	%[m], %%rax\n\t"
        "andq	%[m], %%rcx\n\t"
        "movq	%%rax, 96(%[t])\n\t"
        "movq	%%rcx, 104(%[t])\n\t"
        "movq	112(%[b]), %%rax\n\t"
        "movq	120(%[b]), %%rcx\n\t"
        "andq	%[m], %%rax\n\t"
        "andq	%[m], %%rcx\n\t"
        "movq	%%rax, 112(%[t])\n\t"
        "movq	%%rcx, 120(%[t])\n\t"
        "movq	128(%[b]), %%rax\n\t"
        "movq	136(%[b]), %%rcx\n\t"
        "andq	%[m], %%rax\n\t"
        "andq	%[m], %%rcx\n\t"
        "movq	%%rax, 128(%[t])\n\t"
        "movq	%%rcx, 136(%[t])\n\t"
        "movq	144(%[b]), %%rax\n\t"
        "movq	152(%[b]), %%rcx\n\t"
        "andq	%[m], %%rax\n\t"
        "andq	%[m], %%rcx\n\t"
        "movq	%%rax, 144(%[t])\n\t"
        "movq	%%rcx, 152(%[t])\n\t"
        "movq	160(%[b]), %%rax\n\t"
        "movq	168(%[b]), %%rcx\n\t"
        "andq	%[m], %%rax\n\t"
        "andq	%[m], %%rcx\n\t"
        "movq	%%rax, 160(%[t])\n\t"
        "movq	%%rcx, 168(%[t])\n\t"
        "movq	176(%[b]), %%rax\n\t"
        "movq	184(%[b]), %%rcx\n\t"
        "andq	%[m], %%rax\n\t"
        "andq	%[m], %%rcx\n\t"
        "movq	%%rax, 176(%[t])\n\t"
        "movq	%%rcx, 184(%[t])\n\t"
        "movq	192(%[b]), %%rax\n\t"
        "movq	200(%[b]), %%rcx\n\t"
        "andq	%[m], %%rax\n\t"
        "andq	%[m], %%rcx\n\t"
        "movq	%%rax, 192(%[t])\n\t"
        "movq	%%rcx, 200(%[t])\n\t"
        "movq	208(%[b]), %%rax\n\t"
        "movq	216(%[b]), %%rcx\n\t"
        "andq	%[m], %%rax\n\t"
        "andq	%[m], %%rcx\n\t"
        "movq	%%rax, 208(%[t])\n\t"
        "movq	%%rcx, 216(%[t])\n\t"
        "movq	224(%[b]), %%rax\n\t"
        "movq	232(%[b]), %%rcx\n\t"
        "andq	%[m], %%rax\n\t"
        "andq	%[m], %%rcx\n\t"
        "movq	%%rax, 224(%[t])\n\t"
        "movq	%%rcx, 232(%[t])\n\t"
        "movq	240(%[b]), %%rax\n\t"
        "movq	248(%[b]), %%rcx\n\t"
        "andq	%[m], %%rax\n\t"
        "andq	%[m], %%rcx\n\t"
        "movq	%%rax, 240(%[t])\n\t"
        "movq	%%rcx, 248(%[t])\n\t"
        "movq	256(%[b]), %%rax\n\t"
        "movq	264(%[b]), %%rcx\n\t"
        "andq	%[m], %%rax\n\t"
        "andq	%[m], %%rcx\n\t"
        "movq	%%rax, 256(%[t])\n\t"
        "movq	%%rcx, 264(%[t])\n\t"
        "movq	272(%[b]), %%rax\n\t"
        "movq	280(%[b]), %%rcx\n\t"
        "andq	%[m], %%rax\n\t"
        "andq	%[m], %%rcx\n\t"
        "movq	%%rax, 272(%[t])\n\t"
        "movq	%%rcx, 280(%[t])\n\t"
        "movq	288(%[b]), %%rax\n\t"
        "movq	296(%[b]), %%rcx\n\t"
        "andq	%[m], %%rax\n\t"
        "andq	%[m], %%rcx\n\t"
        "movq	%%rax, 288(%[t])\n\t"
        "movq	%%rcx, 296(%[t])\n\t"
        "movq	304(%[b]), %%rax\n\t"
        "movq	312(%[b]), %%rcx\n\t"
        "andq	%[m], %%rax\n\t"
        "andq	%[m], %%rcx\n\t"
        "movq	%%rax, 304(%[t])\n\t"
        "movq	%%rcx, 312(%[t])\n\t"
        "movq	320(%[b]), %%rax\n\t"
        "movq	328(%[b]), %%rcx\n\t"
        "andq	%[m], %%rax\n\t"
        "andq	%[m], %%rcx\n\t"
        "movq	%%rax, 320(%[t])\n\t"
        "movq	%%rcx, 328(%[t])\n\t"
        "movq	336(%[b]), %%rax\n\t"
        "movq	344(%[b]), %%rcx\n\t"
        "andq	%[m], %%rax\n\t"
        "andq	%[m], %%rcx\n\t"
        "movq	%%rax, 336(%[t])\n\t"
        "movq	%%rcx, 344(%[t])\n\t"
        "movq	352(%[b]), %%rax\n\t"
        "movq	360(%[b]), %%rcx\n\t"
        "andq	%[m], %%rax\n\t"
        "andq	%[m], %%rcx\n\t"
        "movq	%%rax, 352(%[t])\n\t"
        "movq	%%rcx, 360(%[t])\n\t"
        "movq	368(%[b]), %%rax\n\t"
        "movq	376(%[b]), %%rcx\n\t"
        "andq	%[m], %%rax\n\t"
        "andq	%[m], %%rcx\n\t"
        "movq	%%rax, 368(%[t])\n\t"
        "movq	%%rcx, 376(%[t])\n\t"
        /* Phase 2: r = a - t. The first digit uses subq, the remaining 47
         * use sbbq so the borrow is chained through the carry flag. The
         * store of digit k is issued after the subtract of digit k+1,
         * alternating between rax and rcx. */
        "movq	(%[a]), %%rax\n\t"
        "movq	(%[t]), %%rdx\n\t"
        "subq	%%rdx,%%rax\n\t"
        "movq	8(%[a]), %%rcx\n\t"
        "movq	8(%[t]), %%rdx\n\t"
        "sbbq	%%rdx, %%rcx\n\t"
        "movq	%%rax, 0(%[r])\n\t"
        "movq	16(%[a]), %%rax\n\t"
        "movq	16(%[t]), %%rdx\n\t"
        "sbbq	%%rdx, %%rax\n\t"
        "movq	%%rcx, 8(%[r])\n\t"
        "movq	24(%[a]), %%rcx\n\t"
        "movq	24(%[t]), %%rdx\n\t"
        "sbbq	%%rdx, %%rcx\n\t"
        "movq	%%rax, 16(%[r])\n\t"
        "movq	32(%[a]), %%rax\n\t"
        "movq	32(%[t]), %%rdx\n\t"
        "sbbq	%%rdx, %%rax\n\t"
        "movq	%%rcx, 24(%[r])\n\t"
        "movq	40(%[a]), %%rcx\n\t"
        "movq	40(%[t]), %%rdx\n\t"
        "sbbq	%%rdx, %%rcx\n\t"
        "movq	%%rax, 32(%[r])\n\t"
        "movq	48(%[a]), %%rax\n\t"
        "movq	48(%[t]), %%rdx\n\t"
        "sbbq	%%rdx, %%rax\n\t"
        "movq	%%rcx, 40(%[r])\n\t"
        "movq	56(%[a]), %%rcx\n\t"
        "movq	56(%[t]), %%rdx\n\t"
        "sbbq	%%rdx, %%rcx\n\t"
        "movq	%%rax, 48(%[r])\n\t"
        "movq	64(%[a]), %%rax\n\t"
        "movq	64(%[t]), %%rdx\n\t"
        "sbbq	%%rdx, %%rax\n\t"
        "movq	%%rcx, 56(%[r])\n\t"
        "movq	72(%[a]), %%rcx\n\t"
        "movq	72(%[t]), %%rdx\n\t"
        "sbbq	%%rdx, %%rcx\n\t"
        "movq	%%rax, 64(%[r])\n\t"
        "movq	80(%[a]), %%rax\n\t"
        "movq	80(%[t]), %%rdx\n\t"
        "sbbq	%%rdx, %%rax\n\t"
        "movq	%%rcx, 72(%[r])\n\t"
        "movq	88(%[a]), %%rcx\n\t"
        "movq	88(%[t]), %%rdx\n\t"
        "sbbq	%%rdx, %%rcx\n\t"
        "movq	%%rax, 80(%[r])\n\t"
        "movq	96(%[a]), %%rax\n\t"
        "movq	96(%[t]), %%rdx\n\t"
        "sbbq	%%rdx, %%rax\n\t"
        "movq	%%rcx, 88(%[r])\n\t"
        "movq	104(%[a]), %%rcx\n\t"
        "movq	104(%[t]), %%rdx\n\t"
        "sbbq	%%rdx, %%rcx\n\t"
        "movq	%%rax, 96(%[r])\n\t"
        "movq	112(%[a]), %%rax\n\t"
        "movq	112(%[t]), %%rdx\n\t"
        "sbbq	%%rdx, %%rax\n\t"
        "movq	%%rcx, 104(%[r])\n\t"
        "movq	120(%[a]), %%rcx\n\t"
        "movq	120(%[t]), %%rdx\n\t"
        "sbbq	%%rdx, %%rcx\n\t"
        "movq	%%rax, 112(%[r])\n\t"
        "movq	128(%[a]), %%rax\n\t"
        "movq	128(%[t]), %%rdx\n\t"
        "sbbq	%%rdx, %%rax\n\t"
        "movq	%%rcx, 120(%[r])\n\t"
        "movq	136(%[a]), %%rcx\n\t"
        "movq	136(%[t]), %%rdx\n\t"
        "sbbq	%%rdx, %%rcx\n\t"
        "movq	%%rax, 128(%[r])\n\t"
        "movq	144(%[a]), %%rax\n\t"
        "movq	144(%[t]), %%rdx\n\t"
        "sbbq	%%rdx, %%rax\n\t"
        "movq	%%rcx, 136(%[r])\n\t"
        "movq	152(%[a]), %%rcx\n\t"
        "movq	152(%[t]), %%rdx\n\t"
        "sbbq	%%rdx, %%rcx\n\t"
        "movq	%%rax, 144(%[r])\n\t"
        "movq	160(%[a]), %%rax\n\t"
        "movq	160(%[t]), %%rdx\n\t"
        "sbbq	%%rdx, %%rax\n\t"
        "movq	%%rcx, 152(%[r])\n\t"
        "movq	168(%[a]), %%rcx\n\t"
        "movq	168(%[t]), %%rdx\n\t"
        "sbbq	%%rdx, %%rcx\n\t"
        "movq	%%rax, 160(%[r])\n\t"
        "movq	176(%[a]), %%rax\n\t"
        "movq	176(%[t]), %%rdx\n\t"
        "sbbq	%%rdx, %%rax\n\t"
        "movq	%%rcx, 168(%[r])\n\t"
        "movq	184(%[a]), %%rcx\n\t"
        "movq	184(%[t]), %%rdx\n\t"
        "sbbq	%%rdx, %%rcx\n\t"
        "movq	%%rax, 176(%[r])\n\t"
        "movq	192(%[a]), %%rax\n\t"
        "movq	192(%[t]), %%rdx\n\t"
        "sbbq	%%rdx, %%rax\n\t"
        "movq	%%rcx, 184(%[r])\n\t"
        "movq	200(%[a]), %%rcx\n\t"
        "movq	200(%[t]), %%rdx\n\t"
        "sbbq	%%rdx, %%rcx\n\t"
        "movq	%%rax, 192(%[r])\n\t"
        "movq	208(%[a]), %%rax\n\t"
        "movq	208(%[t]), %%rdx\n\t"
        "sbbq	%%rdx, %%rax\n\t"
        "movq	%%rcx, 200(%[r])\n\t"
        "movq	216(%[a]), %%rcx\n\t"
        "movq	216(%[t]), %%rdx\n\t"
        "sbbq	%%rdx, %%rcx\n\t"
        "movq	%%rax, 208(%[r])\n\t"
        "movq	224(%[a]), %%rax\n\t"
        "movq	224(%[t]), %%rdx\n\t"
        "sbbq	%%rdx, %%rax\n\t"
        "movq	%%rcx, 216(%[r])\n\t"
        "movq	232(%[a]), %%rcx\n\t"
        "movq	232(%[t]), %%rdx\n\t"
        "sbbq	%%rdx, %%rcx\n\t"
        "movq	%%rax, 224(%[r])\n\t"
        "movq	240(%[a]), %%rax\n\t"
        "movq	240(%[t]), %%rdx\n\t"
        "sbbq	%%rdx, %%rax\n\t"
        "movq	%%rcx, 232(%[r])\n\t"
        "movq	248(%[a]), %%rcx\n\t"
        "movq	248(%[t]), %%rdx\n\t"
        "sbbq	%%rdx, %%rcx\n\t"
        "movq	%%rax, 240(%[r])\n\t"
        "movq	256(%[a]), %%rax\n\t"
        "movq	256(%[t]), %%rdx\n\t"
        "sbbq	%%rdx, %%rax\n\t"
        "movq	%%rcx, 248(%[r])\n\t"
        "movq	264(%[a]), %%rcx\n\t"
        "movq	264(%[t]), %%rdx\n\t"
        "sbbq	%%rdx, %%rcx\n\t"
        "movq	%%rax, 256(%[r])\n\t"
        "movq	272(%[a]), %%rax\n\t"
        "movq	272(%[t]), %%rdx\n\t"
        "sbbq	%%rdx, %%rax\n\t"
        "movq	%%rcx, 264(%[r])\n\t"
        "movq	280(%[a]), %%rcx\n\t"
        "movq	280(%[t]), %%rdx\n\t"
        "sbbq	%%rdx, %%rcx\n\t"
        "movq	%%rax, 272(%[r])\n\t"
        "movq	288(%[a]), %%rax\n\t"
        "movq	288(%[t]), %%rdx\n\t"
        "sbbq	%%rdx, %%rax\n\t"
        "movq	%%rcx, 280(%[r])\n\t"
        "movq	296(%[a]), %%rcx\n\t"
        "movq	296(%[t]), %%rdx\n\t"
        "sbbq	%%rdx, %%rcx\n\t"
        "movq	%%rax, 288(%[r])\n\t"
        "movq	304(%[a]), %%rax\n\t"
        "movq	304(%[t]), %%rdx\n\t"
        "sbbq	%%rdx, %%rax\n\t"
        "movq	%%rcx, 296(%[r])\n\t"
        "movq	312(%[a]), %%rcx\n\t"
        "movq	312(%[t]), %%rdx\n\t"
        "sbbq	%%rdx, %%rcx\n\t"
        "movq	%%rax, 304(%[r])\n\t"
        "movq	320(%[a]), %%rax\n\t"
        "movq	320(%[t]), %%rdx\n\t"
        "sbbq	%%rdx, %%rax\n\t"
        "movq	%%rcx, 312(%[r])\n\t"
        "movq	328(%[a]), %%rcx\n\t"
        "movq	328(%[t]), %%rdx\n\t"
        "sbbq	%%rdx, %%rcx\n\t"
        "movq	%%rax, 320(%[r])\n\t"
        "movq	336(%[a]), %%rax\n\t"
        "movq	336(%[t]), %%rdx\n\t"
        "sbbq	%%rdx, %%rax\n\t"
        "movq	%%rcx, 328(%[r])\n\t"
        "movq	344(%[a]), %%rcx\n\t"
        "movq	344(%[t]), %%rdx\n\t"
        "sbbq	%%rdx, %%rcx\n\t"
        "movq	%%rax, 336(%[r])\n\t"
        "movq	352(%[a]), %%rax\n\t"
        "movq	352(%[t]), %%rdx\n\t"
        "sbbq	%%rdx, %%rax\n\t"
        "movq	%%rcx, 344(%[r])\n\t"
        "movq	360(%[a]), %%rcx\n\t"
        "movq	360(%[t]), %%rdx\n\t"
        "sbbq	%%rdx, %%rcx\n\t"
        "movq	%%rax, 352(%[r])\n\t"
        "movq	368(%[a]), %%rax\n\t"
        "movq	368(%[t]), %%rdx\n\t"
        "sbbq	%%rdx, %%rax\n\t"
        "movq	%%rcx, 360(%[r])\n\t"
        "movq	376(%[a]), %%rcx\n\t"
        "movq	376(%[t]), %%rdx\n\t"
        "sbbq	%%rdx, %%rcx\n\t"
        "movq	%%rax, 368(%[r])\n\t"
        "movq	%%rcx, 376(%[r])\n\t"
        /* c = 0 - final borrow: stays 0 without a borrow, becomes all ones
         * with one. */
        "sbbq	$0, %[c]\n\t"
        : [c] "+r" (c)
        : [r] "r" (r), [a] "r" (a), [b] "r" (b), [m] "r" (m), [t] "r" (t)
        : "memory", "rax", "rcx", "rdx"
    );

    return c;
}

/* Reduce the number back to 3072 bits using Montgomery reduction.
 *
 * The loop runs 48 times. Each pass computes mu = a[i] * mp (low 64 bits),
 * adds mu * m to a[i..i+47] and folds the resulting carry into a[i+48].
 * The pointer a is advanced by one digit per pass, so after the loop it
 * points at the upper 48 digits; the modulus is then conditionally
 * subtracted from that upper half and the result written to the original
 * start of a.
 *
 * a   A single precision number to reduce in place.
 * m   The single precision number representing the modulus.
 * mp  The digit representing the negative inverse of m mod 2^n.
 */
SP_NOINLINE static void sp_3072_mont_reduce_48(sp_digit* a, sp_digit* m,
        sp_digit mp)
{
    /* Carry out of the top digit, kept across loop iterations. */
    sp_digit ca = 0;

    /* Register use inside the loop:
     *   rcx      byte offset counter (0, 8, ..., 376)
     *   r12/r13  a[i] and a[i+1], kept in registers between iterations
     *   r10      mu
     *   r8/r9    alternating carry words between consecutive digits
     *   r11      digit of a currently being updated
     */
    __asm__ __volatile__ (
        "# i = 0\n\t"
        "xorq	%%rcx, %%rcx\n\t"
        "movq	0(%[a]), %%r12\n\t"
        "movq	8(%[a]), %%r13\n\t"
        "\nL_mont_loop_48:\n\t"
        "# mu = a[i] * mp\n\t"
        "movq	%%r12, %%r10\n\t"
        "imulq	%[mp], %%r10\n\t"
        /* Digit 0: only the carry (r9) is kept; the sum in r12 is
         * overwritten below without being stored. */
        "# a[i+0] += m[0] * mu\n\t"
        "movq	%%r10, %%rax\n\t"
        "xorq	%%r9, %%r9\n\t"
        "mulq	0(%[m])\n\t"
        "addq	%%rax, %%r12\n\t"
        "adcq	%%rdx, %%r9\n\t"
        /* Digit 1 becomes next iteration's a[i] and stays in r12. */
        "# a[i+1] += m[1] * mu\n\t"
        "movq	%%r10, %%rax\n\t"
        "xorq	%%r8, %%r8\n\t"
        "mulq	8(%[m])\n\t"
        "movq	%%r13, %%r12\n\t"
        "addq	%%rax, %%r12\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "addq	%%r9,  %%r12\n\t"
        "adcq	$0, %%r8\n\t"
        /* Digit 2 becomes next iteration's a[i+1] and stays in r13. */
        "# a[i+2] += m[2] * mu\n\t"
        "movq	%%r10, %%rax\n\t"
        "xorq	%%r9, %%r9\n\t"
        "mulq	16(%[m])\n\t"
        "movq	16(%[a]), %%r13\n\t"
        "addq	%%rax, %%r13\n\t"
        "adcq	%%rdx, %%r9\n\t"
        "addq	%%r8,  %%r13\n\t"
        "adcq	$0, %%r9\n\t"
        /* Digits 3..46 are read from and written back to memory. */
        "# a[i+3] += m[3] * mu\n\t"
        "movq	%%r10, %%rax\n\t"
        "xorq	%%r8, %%r8\n\t"
        "mulq	24(%[m])\n\t"
        "movq	24(%[a]), %%r11\n\t"
        "addq	%%rax, %%r11\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "addq	%%r9,  %%r11\n\t"
        "movq	%%r11, 24(%[a])\n\t"
        "adcq	$0, %%r8\n\t"
        "# a[i+4] += m[4] * mu\n\t"
        "movq	%%r10, %%rax\n\t"
        "xorq	%%r9, %%r9\n\t"
        "mulq	32(%[m])\n\t"
        "movq	32(%[a]), %%r11\n\t"
        "addq	%%rax, %%r11\n\t"
        "adcq	%%rdx, %%r9\n\t"
        "addq	%%r8,  %%r11\n\t"
        "movq	%%r11, 32(%[a])\n\t"
        "adcq	$0, %%r9\n\t"
        "# a[i+5] += m[5] * mu\n\t"
        "movq	%%r10, %%rax\n\t"
        "xorq	%%r8, %%r8\n\t"
        "mulq	40(%[m])\n\t"
        "movq	40(%[a]), %%r11\n\t"
        "addq	%%rax, %%r11\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "addq	%%r9,  %%r11\n\t"
        "movq	%%r11, 40(%[a])\n\t"
        "adcq	$0, %%r8\n\t"
        "# a[i+6] += m[6] * mu\n\t"
        "movq	%%r10, %%rax\n\t"
        "xorq	%%r9, %%r9\n\t"
        "mulq	48(%[m])\n\t"
        "movq	48(%[a]), %%r11\n\t"
        "addq	%%rax, %%r11\n\t"
        "adcq	%%rdx, %%r9\n\t"
        "addq	%%r8,  %%r11\n\t"
        "movq	%%r11, 48(%[a])\n\t"
        "adcq	$0, %%r9\n\t"
        "# a[i+7] += m[7] * mu\n\t"
        "movq	%%r10, %%rax\n\t"
        "xorq	%%r8, %%r8\n\t"
        "mulq	56(%[m])\n\t"
        "movq	56(%[a]), %%r11\n\t"
        "addq	%%rax, %%r11\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "addq	%%r9,  %%r11\n\t"
        "movq	%%r11, 56(%[a])\n\t"
        "adcq	$0, %%r8\n\t"
        "# a[i+8] += m[8] * mu\n\t"
        "movq	%%r10, %%rax\n\t"
        "xorq	%%r9, %%r9\n\t"
        "mulq	64(%[m])\n\t"
        "movq	64(%[a]), %%r11\n\t"
        "addq	%%rax, %%r11\n\t"
        "adcq	%%rdx, %%r9\n\t"
        "addq	%%r8,  %%r11\n\t"
        "movq	%%r11, 64(%[a])\n\t"
        "adcq	$0, %%r9\n\t"
        "# a[i+9] += m[9] * mu\n\t"
        "movq	%%r10, %%rax\n\t"
        "xorq	%%r8, %%r8\n\t"
        "mulq	72(%[m])\n\t"
        "movq	72(%[a]), %%r11\n\t"
        "addq	%%rax, %%r11\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "addq	%%r9,  %%r11\n\t"
        "movq	%%r11, 72(%[a])\n\t"
        "adcq	$0, %%r8\n\t"
        "# a[i+10] += m[10] * mu\n\t"
        "movq	%%r10, %%rax\n\t"
        "xorq	%%r9, %%r9\n\t"
        "mulq	80(%[m])\n\t"
        "movq	80(%[a]), %%r11\n\t"
        "addq	%%rax, %%r11\n\t"
        "adcq	%%rdx, %%r9\n\t"
        "addq	%%r8,  %%r11\n\t"
        "movq	%%r11, 80(%[a])\n\t"
        "adcq	$0, %%r9\n\t"
        "# a[i+11] += m[11] * mu\n\t"
        "movq	%%r10, %%rax\n\t"
        "xorq	%%r8, %%r8\n\t"
        "mulq	88(%[m])\n\t"
        "movq	88(%[a]), %%r11\n\t"
        "addq	%%rax, %%r11\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "addq	%%r9,  %%r11\n\t"
        "movq	%%r11, 88(%[a])\n\t"
        "adcq	$0, %%r8\n\t"
        "# a[i+12] += m[12] * mu\n\t"
        "movq	%%r10, %%rax\n\t"
        "xorq	%%r9, %%r9\n\t"
        "mulq	96(%[m])\n\t"
        "movq	96(%[a]), %%r11\n\t"
        "addq	%%rax, %%r11\n\t"
        "adcq	%%rdx, %%r9\n\t"
        "addq	%%r8,  %%r11\n\t"
        "movq	%%r11, 96(%[a])\n\t"
        "adcq	$0, %%r9\n\t"
        "# a[i+13] += m[13] * mu\n\t"
        "movq	%%r10, %%rax\n\t"
        "xorq	%%r8, %%r8\n\t"
        "mulq	104(%[m])\n\t"
        "movq	104(%[a]), %%r11\n\t"
        "addq	%%rax, %%r11\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "addq	%%r9,  %%r11\n\t"
        "movq	%%r11, 104(%[a])\n\t"
        "adcq	$0, %%r8\n\t"
        "# a[i+14] += m[14] * mu\n\t"
        "movq	%%r10, %%rax\n\t"
        "xorq	%%r9, %%r9\n\t"
        "mulq	112(%[m])\n\t"
        "movq	112(%[a]), %%r11\n\t"
        "addq	%%rax, %%r11\n\t"
        "adcq	%%rdx, %%r9\n\t"
        "addq	%%r8,  %%r11\n\t"
        "movq	%%r11, 112(%[a])\n\t"
        "adcq	$0, %%r9\n\t"
        "# a[i+15] += m[15] * mu\n\t"
        "movq	%%r10, %%rax\n\t"
        "xorq	%%r8, %%r8\n\t"
        "mulq	120(%[m])\n\t"
        "movq	120(%[a]), %%r11\n\t"
        "addq	%%rax, %%r11\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "addq	%%r9,  %%r11\n\t"
        "movq	%%r11, 120(%[a])\n\t"
        "adcq	$0, %%r8\n\t"
        "# a[i+16] += m[16] * mu\n\t"
        "movq	%%r10, %%rax\n\t"
        "xorq	%%r9, %%r9\n\t"
        "mulq	128(%[m])\n\t"
        "movq	128(%[a]), %%r11\n\t"
        "addq	%%rax, %%r11\n\t"
        "adcq	%%rdx, %%r9\n\t"
        "addq	%%r8,  %%r11\n\t"
        "movq	%%r11, 128(%[a])\n\t"
        "adcq	$0, %%r9\n\t"
        "# a[i+17] += m[17] * mu\n\t"
        "movq	%%r10, %%rax\n\t"
        "xorq	%%r8, %%r8\n\t"
        "mulq	136(%[m])\n\t"
        "movq	136(%[a]), %%r11\n\t"
        "addq	%%rax, %%r11\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "addq	%%r9,  %%r11\n\t"
        "movq	%%r11, 136(%[a])\n\t"
        "adcq	$0, %%r8\n\t"
        "# a[i+18] += m[18] * mu\n\t"
        "movq	%%r10, %%rax\n\t"
        "xorq	%%r9, %%r9\n\t"
        "mulq	144(%[m])\n\t"
        "movq	144(%[a]), %%r11\n\t"
        "addq	%%rax, %%r11\n\t"
        "adcq	%%rdx, %%r9\n\t"
        "addq	%%r8,  %%r11\n\t"
        "movq	%%r11, 144(%[a])\n\t"
        "adcq	$0, %%r9\n\t"
        "# a[i+19] += m[19] * mu\n\t"
        "movq	%%r10, %%rax\n\t"
        "xorq	%%r8, %%r8\n\t"
        "mulq	152(%[m])\n\t"
        "movq	152(%[a]), %%r11\n\t"
        "addq	%%rax, %%r11\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "addq	%%r9,  %%r11\n\t"
        "movq	%%r11, 152(%[a])\n\t"
        "adcq	$0, %%r8\n\t"
        "# a[i+20] += m[20] * mu\n\t"
        "movq	%%r10, %%rax\n\t"
        "xorq	%%r9, %%r9\n\t"
        "mulq	160(%[m])\n\t"
        "movq	160(%[a]), %%r11\n\t"
        "addq	%%rax, %%r11\n\t"
        "adcq	%%rdx, %%r9\n\t"
        "addq	%%r8,  %%r11\n\t"
        "movq	%%r11, 160(%[a])\n\t"
        "adcq	$0, %%r9\n\t"
        "# a[i+21] += m[21] * mu\n\t"
        "movq	%%r10, %%rax\n\t"
        "xorq	%%r8, %%r8\n\t"
        "mulq	168(%[m])\n\t"
        "movq	168(%[a]), %%r11\n\t"
        "addq	%%rax, %%r11\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "addq	%%r9,  %%r11\n\t"
        "movq	%%r11, 168(%[a])\n\t"
        "adcq	$0, %%r8\n\t"
        "# a[i+22] += m[22] * mu\n\t"
        "movq	%%r10, %%rax\n\t"
        "xorq	%%r9, %%r9\n\t"
        "mulq	176(%[m])\n\t"
        "movq	176(%[a]), %%r11\n\t"
        "addq	%%rax, %%r11\n\t"
        "adcq	%%rdx, %%r9\n\t"
        "addq	%%r8,  %%r11\n\t"
        "movq	%%r11, 176(%[a])\n\t"
        "adcq	$0, %%r9\n\t"
        "# a[i+23] += m[23] * mu\n\t"
        "movq	%%r10, %%rax\n\t"
        "xorq	%%r8, %%r8\n\t"
        "mulq	184(%[m])\n\t"
        "movq	184(%[a]), %%r11\n\t"
        "addq	%%rax, %%r11\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "addq	%%r9,  %%r11\n\t"
        "movq	%%r11, 184(%[a])\n\t"
        "adcq	$0, %%r8\n\t"
        "# a[i+24] += m[24] * mu\n\t"
        "movq	%%r10, %%rax\n\t"
        "xorq	%%r9, %%r9\n\t"
        "mulq	192(%[m])\n\t"
        "movq	192(%[a]), %%r11\n\t"
        "addq	%%rax, %%r11\n\t"
        "adcq	%%rdx, %%r9\n\t"
        "addq	%%r8,  %%r11\n\t"
        "movq	%%r11, 192(%[a])\n\t"
        "adcq	$0, %%r9\n\t"
        "# a[i+25] += m[25] * mu\n\t"
        "movq	%%r10, %%rax\n\t"
        "xorq	%%r8, %%r8\n\t"
        "mulq	200(%[m])\n\t"
        "movq	200(%[a]), %%r11\n\t"
        "addq	%%rax, %%r11\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "addq	%%r9,  %%r11\n\t"
        "movq	%%r11, 200(%[a])\n\t"
        "adcq	$0, %%r8\n\t"
        "# a[i+26] += m[26] * mu\n\t"
        "movq	%%r10, %%rax\n\t"
        "xorq	%%r9, %%r9\n\t"
        "mulq	208(%[m])\n\t"
        "movq	208(%[a]), %%r11\n\t"
        "addq	%%rax, %%r11\n\t"
        "adcq	%%rdx, %%r9\n\t"
        "addq	%%r8,  %%r11\n\t"
        "movq	%%r11, 208(%[a])\n\t"
        "adcq	$0, %%r9\n\t"
        "# a[i+27] += m[27] * mu\n\t"
        "movq	%%r10, %%rax\n\t"
        "xorq	%%r8, %%r8\n\t"
        "mulq	216(%[m])\n\t"
        "movq	216(%[a]), %%r11\n\t"
        "addq	%%rax, %%r11\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "addq	%%r9,  %%r11\n\t"
        "movq	%%r11, 216(%[a])\n\t"
        "adcq	$0, %%r8\n\t"
        "# a[i+28] += m[28] * mu\n\t"
        "movq	%%r10, %%rax\n\t"
        "xorq	%%r9, %%r9\n\t"
        "mulq	224(%[m])\n\t"
        "movq	224(%[a]), %%r11\n\t"
        "addq	%%rax, %%r11\n\t"
        "adcq	%%rdx, %%r9\n\t"
        "addq	%%r8,  %%r11\n\t"
        "movq	%%r11, 224(%[a])\n\t"
        "adcq	$0, %%r9\n\t"
        "# a[i+29] += m[29] * mu\n\t"
        "movq	%%r10, %%rax\n\t"
        "xorq	%%r8, %%r8\n\t"
        "mulq	232(%[m])\n\t"
        "movq	232(%[a]), %%r11\n\t"
        "addq	%%rax, %%r11\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "addq	%%r9,  %%r11\n\t"
        "movq	%%r11, 232(%[a])\n\t"
        "adcq	$0, %%r8\n\t"
        "# a[i+30] += m[30] * mu\n\t"
        "movq	%%r10, %%rax\n\t"
        "xorq	%%r9, %%r9\n\t"
        "mulq	240(%[m])\n\t"
        "movq	240(%[a]), %%r11\n\t"
        "addq	%%rax, %%r11\n\t"
        "adcq	%%rdx, %%r9\n\t"
        "addq	%%r8,  %%r11\n\t"
        "movq	%%r11, 240(%[a])\n\t"
        "adcq	$0, %%r9\n\t"
        "# a[i+31] += m[31] * mu\n\t"
        "movq	%%r10, %%rax\n\t"
        "xorq	%%r8, %%r8\n\t"
        "mulq	248(%[m])\n\t"
        "movq	248(%[a]), %%r11\n\t"
        "addq	%%rax, %%r11\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "addq	%%r9,  %%r11\n\t"
        "movq	%%r11, 248(%[a])\n\t"
        "adcq	$0, %%r8\n\t"
        "# a[i+32] += m[32] * mu\n\t"
        "movq	%%r10, %%rax\n\t"
        "xorq	%%r9, %%r9\n\t"
        "mulq	256(%[m])\n\t"
        "movq	256(%[a]), %%r11\n\t"
        "addq	%%rax, %%r11\n\t"
        "adcq	%%rdx, %%r9\n\t"
        "addq	%%r8,  %%r11\n\t"
        "movq	%%r11, 256(%[a])\n\t"
        "adcq	$0, %%r9\n\t"
        "# a[i+33] += m[33] * mu\n\t"
        "movq	%%r10, %%rax\n\t"
        "xorq	%%r8, %%r8\n\t"
        "mulq	264(%[m])\n\t"
        "movq	264(%[a]), %%r11\n\t"
        "addq	%%rax, %%r11\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "addq	%%r9,  %%r11\n\t"
        "movq	%%r11, 264(%[a])\n\t"
        "adcq	$0, %%r8\n\t"
        "# a[i+34] += m[34] * mu\n\t"
        "movq	%%r10, %%rax\n\t"
        "xorq	%%r9, %%r9\n\t"
        "mulq	272(%[m])\n\t"
        "movq	272(%[a]), %%r11\n\t"
        "addq	%%rax, %%r11\n\t"
        "adcq	%%rdx, %%r9\n\t"
        "addq	%%r8,  %%r11\n\t"
        "movq	%%r11, 272(%[a])\n\t"
        "adcq	$0, %%r9\n\t"
        "# a[i+35] += m[35] * mu\n\t"
        "movq	%%r10, %%rax\n\t"
        "xorq	%%r8, %%r8\n\t"
        "mulq	280(%[m])\n\t"
        "movq	280(%[a]), %%r11\n\t"
        "addq	%%rax, %%r11\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "addq	%%r9,  %%r11\n\t"
        "movq	%%r11, 280(%[a])\n\t"
        "adcq	$0, %%r8\n\t"
        "# a[i+36] += m[36] * mu\n\t"
        "movq	%%r10, %%rax\n\t"
        "xorq	%%r9, %%r9\n\t"
        "mulq	288(%[m])\n\t"
        "movq	288(%[a]), %%r11\n\t"
        "addq	%%rax, %%r11\n\t"
        "adcq	%%rdx, %%r9\n\t"
        "addq	%%r8,  %%r11\n\t"
        "movq	%%r11, 288(%[a])\n\t"
        "adcq	$0, %%r9\n\t"
        "# a[i+37] += m[37] * mu\n\t"
        "movq	%%r10, %%rax\n\t"
        "xorq	%%r8, %%r8\n\t"
        "mulq	296(%[m])\n\t"
        "movq	296(%[a]), %%r11\n\t"
        "addq	%%rax, %%r11\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "addq	%%r9,  %%r11\n\t"
        "movq	%%r11, 296(%[a])\n\t"
        "adcq	$0, %%r8\n\t"
        "# a[i+38] += m[38] * mu\n\t"
        "movq	%%r10, %%rax\n\t"
        "xorq	%%r9, %%r9\n\t"
        "mulq	304(%[m])\n\t"
        "movq	304(%[a]), %%r11\n\t"
        "addq	%%rax, %%r11\n\t"
        "adcq	%%rdx, %%r9\n\t"
        "addq	%%r8,  %%r11\n\t"
        "movq	%%r11, 304(%[a])\n\t"
        "adcq	$0, %%r9\n\t"
        "# a[i+39] += m[39] * mu\n\t"
        "movq	%%r10, %%rax\n\t"
        "xorq	%%r8, %%r8\n\t"
        "mulq	312(%[m])\n\t"
        "movq	312(%[a]), %%r11\n\t"
        "addq	%%rax, %%r11\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "addq	%%r9,  %%r11\n\t"
        "movq	%%r11, 312(%[a])\n\t"
        "adcq	$0, %%r8\n\t"
        "# a[i+40] += m[40] * mu\n\t"
        "movq	%%r10, %%rax\n\t"
        "xorq	%%r9, %%r9\n\t"
        "mulq	320(%[m])\n\t"
        "movq	320(%[a]), %%r11\n\t"
        "addq	%%rax, %%r11\n\t"
        "adcq	%%rdx, %%r9\n\t"
        "addq	%%r8,  %%r11\n\t"
        "movq	%%r11, 320(%[a])\n\t"
        "adcq	$0, %%r9\n\t"
        "# a[i+41] += m[41] * mu\n\t"
        "movq	%%r10, %%rax\n\t"
        "xorq	%%r8, %%r8\n\t"
        "mulq	328(%[m])\n\t"
        "movq	328(%[a]), %%r11\n\t"
        "addq	%%rax, %%r11\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "addq	%%r9,  %%r11\n\t"
        "movq	%%r11, 328(%[a])\n\t"
        "adcq	$0, %%r8\n\t"
        "# a[i+42] += m[42] * mu\n\t"
        "movq	%%r10, %%rax\n\t"
        "xorq	%%r9, %%r9\n\t"
        "mulq	336(%[m])\n\t"
        "movq	336(%[a]), %%r11\n\t"
        "addq	%%rax, %%r11\n\t"
        "adcq	%%rdx, %%r9\n\t"
        "addq	%%r8,  %%r11\n\t"
        "movq	%%r11, 336(%[a])\n\t"
        "adcq	$0, %%r9\n\t"
        "# a[i+43] += m[43] * mu\n\t"
        "movq	%%r10, %%rax\n\t"
        "xorq	%%r8, %%r8\n\t"
        "mulq	344(%[m])\n\t"
        "movq	344(%[a]), %%r11\n\t"
        "addq	%%rax, %%r11\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "addq	%%r9,  %%r11\n\t"
        "movq	%%r11, 344(%[a])\n\t"
        "adcq	$0, %%r8\n\t"
        "# a[i+44] += m[44] * mu\n\t"
        "movq	%%r10, %%rax\n\t"
        "xorq	%%r9, %%r9\n\t"
        "mulq	352(%[m])\n\t"
        "movq	352(%[a]), %%r11\n\t"
        "addq	%%rax, %%r11\n\t"
        "adcq	%%rdx, %%r9\n\t"
        "addq	%%r8,  %%r11\n\t"
        "movq	%%r11, 352(%[a])\n\t"
        "adcq	$0, %%r9\n\t"
        "# a[i+45] += m[45] * mu\n\t"
        "movq	%%r10, %%rax\n\t"
        "xorq	%%r8, %%r8\n\t"
        "mulq	360(%[m])\n\t"
        "movq	360(%[a]), %%r11\n\t"
        "addq	%%rax, %%r11\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "addq	%%r9,  %%r11\n\t"
        "movq	%%r11, 360(%[a])\n\t"
        "adcq	$0, %%r8\n\t"
        "# a[i+46] += m[46] * mu\n\t"
        "movq	%%r10, %%rax\n\t"
        "xorq	%%r9, %%r9\n\t"
        "mulq	368(%[m])\n\t"
        "movq	368(%[a]), %%r11\n\t"
        "addq	%%rax, %%r11\n\t"
        "adcq	%%rdx, %%r9\n\t"
        "addq	%%r8,  %%r11\n\t"
        "movq	%%r11, 368(%[a])\n\t"
        "adcq	$0, %%r9\n\t"
        /* Top digit: the high product word plus the carry saved in ca from
         * the previous pass is added into a[i+48]; ca is then rebuilt from
         * the carries produced here. */
        "# a[i+47] += m[47] * mu\n\t"
        "movq	%%r10, %%rax\n\t"
        "mulq	376(%[m])\n\t"
        "movq	376(%[a]), %%r11\n\t"
        "addq	%%rax, %%r9\n\t"
        "adcq	%[ca], %%rdx\n\t"
        "movq	$0, %[ca]\n\t"
        "adcq	$0, %[ca]\n\t"
        "addq	%%r9, %%r11\n\t"
        "movq	%%r11, 376(%[a])\n\t"
        "adcq	%%rdx, 384(%[a])\n\t"
        "adcq	$0, %[ca]\n\t"
        /* Advance a by one digit and the byte counter by 8; loop while the
         * counter is below 384 (48 digits). */
        "# i += 1\n\t"
        "addq	$8, %[a]\n\t"
        "addq	$8, %%rcx\n\t"
        "cmpq	$384, %%rcx\n\t"
        "jl	L_mont_loop_48\n\t"
        /* Flush the two digits still held in registers. */
        "movq	%%r12, 0(%[a])\n\t"
        "movq	%%r13, 8(%[a])\n\t"
        : [ca] "+r" (ca), [a] "+r" (a)
        : [m] "r" (m), [mp] "r" (mp)
        : "memory", "rax", "rdx", "rcx", "r8", "r9", "r10", "r11",
          "r12", "r13"
    );

    /* a now points 48 digits past its original value. Subtract m from the
     * upper half when ca is set (mask 0 - ca), writing the result to the
     * original start of the number. */
    sp_3072_cond_sub_48(a - 48, a, m, (sp_digit)0 - ca);
}

/* Montgomery multiplication of two 48-digit values.
 * (r = a * b mod m, operands and result in Montgomery form)
 *
 * r   Receives the result; it is first filled with the full product by
 *     sp_3072_mul_48() and then reduced in place.
 * a   First multiplicand in Montgomery form.
 * b   Second multiplicand in Montgomery form.
 * m   Modulus (prime).
 * mp  Montgomery multiplier passed through to the reduction step.
 */
static void sp_3072_mont_mul_48(sp_digit* r, sp_digit* a, sp_digit* b,
        sp_digit* m, sp_digit mp)
{
    /* Plain multiplication into r ... */
    sp_3072_mul_48(r, a, b);
    /* ... followed by an in-place Montgomery reduction of that product. */
    sp_3072_mont_reduce_48(r, m, mp);
}

/* Square the Montgomery form number. (r = a * a mod m)
 *
 * The square is computed into r by sp_3072_sqr_48() and r is then reduced
 * in place by sp_3072_mont_reduce_48().
 *
 * r   Result of squaring.
 *     NOTE(review): presumably must have room for the unreduced double-width
 *     square before reduction - confirm against sp_3072_sqr_48.
 * a   Number to square in Montgomery form.
 * m   Modulus (prime).
 * mp  Montgomery multiplier.
 */
static void sp_3072_mont_sqr_48(sp_digit* r, sp_digit* a, sp_digit* m,
        sp_digit mp)
{
    sp_3072_sqr_48(r, a);
    sp_3072_mont_reduce_48(r, m, mp);
}

/* Mul a by digit b into r. (r = a * b)
 *
 * Fully unrolled multiply of the 48-digit number a by the single digit b.
 * Words 0..47 of the product are stored to r[0..47] and the final carry
 * word is stored to r[48], so 49 digits of r are written.
 *
 * r  A single precision integer. Receives 49 digits.
 * a  A single precision integer. 48 digits are read.
 * b  A single precision digit.
 */
SP_NOINLINE static void sp_3072_mul_d_48(sp_digit* r, const sp_digit* a,
        const sp_digit b)
{
    __asm__ __volatile__ (
        /* rbx, rcx and r8 rotate through three roles from one step to the
         * next: the word being completed and stored, the word collecting the
         * high half of the current product, and a freshly zeroed word that
         * absorbs the carry out of that addition. b is reloaded into rax for
         * every step because mulq leaves its product in rdx:rax. */
        "# A[0] * B\n\t"
        "movq	%[b], %%rax\n\t"
        "xorq	%%r8, %%r8\n\t"
        "mulq	(%[a])\n\t"
        "movq	%%rax, %%rbx\n\t"
        "movq	%%rdx, %%rcx\n\t"
        "movq	%%rbx, 0(%[r])\n\t"
        "# A[1] * B\n\t"
        "movq	%[b], %%rax\n\t"
        "xorq	%%rbx, %%rbx\n\t"
        "mulq	8(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "movq	%%rcx, 8(%[r])\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "# A[2] * B\n\t"
        "movq	%[b], %%rax\n\t"
        "xorq	%%rcx, %%rcx\n\t"
        "mulq	16(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "movq	%%r8, 16(%[r])\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "# A[3] * B\n\t"
        "movq	%[b], %%rax\n\t"
        "xorq	%%r8, %%r8\n\t"
        "mulq	24(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "movq	%%rbx, 24(%[r])\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "# A[4] * B\n\t"
        "movq	%[b], %%rax\n\t"
        "xorq	%%rbx, %%rbx\n\t"
        "mulq	32(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "movq	%%rcx, 32(%[r])\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "# A[5] * B\n\t"
        "movq	%[b], %%rax\n\t"
        "xorq	%%rcx, %%rcx\n\t"
        "mulq	40(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "movq	%%r8, 40(%[r])\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "# A[6] * B\n\t"
        "movq	%[b], %%rax\n\t"
        "xorq	%%r8, %%r8\n\t"
        "mulq	48(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "movq	%%rbx, 48(%[r])\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "# A[7] * B\n\t"
        "movq	%[b], %%rax\n\t"
        "xorq	%%rbx, %%rbx\n\t"
        "mulq	56(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "movq	%%rcx, 56(%[r])\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "# A[8] * B\n\t"
        "movq	%[b], %%rax\n\t"
        "xorq	%%rcx, %%rcx\n\t"
        "mulq	64(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "movq	%%r8, 64(%[r])\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "# A[9] * B\n\t"
        "movq	%[b], %%rax\n\t"
        "xorq	%%r8, %%r8\n\t"
        "mulq	72(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "movq	%%rbx, 72(%[r])\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "# A[10] * B\n\t"
        "movq	%[b], %%rax\n\t"
        "xorq	%%rbx, %%rbx\n\t"
        "mulq	80(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "movq	%%rcx, 80(%[r])\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "# A[11] * B\n\t"
        "movq	%[b], %%rax\n\t"
        "xorq	%%rcx, %%rcx\n\t"
        "mulq	88(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "movq	%%r8, 88(%[r])\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "# A[12] * B\n\t"
        "movq	%[b], %%rax\n\t"
        "xorq	%%r8, %%r8\n\t"
        "mulq	96(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "movq	%%rbx, 96(%[r])\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "# A[13] * B\n\t"
        "movq	%[b], %%rax\n\t"
        "xorq	%%rbx, %%rbx\n\t"
        "mulq	104(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "movq	%%rcx, 104(%[r])\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "# A[14] * B\n\t"
        "movq	%[b], %%rax\n\t"
        "xorq	%%rcx, %%rcx\n\t"
        "mulq	112(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "movq	%%r8, 112(%[r])\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "# A[15] * B\n\t"
        "movq	%[b], %%rax\n\t"
        "xorq	%%r8, %%r8\n\t"
        "mulq	120(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "movq	%%rbx, 120(%[r])\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "# A[16] * B\n\t"
        "movq	%[b], %%rax\n\t"
        "xorq	%%rbx, %%rbx\n\t"
        "mulq	128(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "movq	%%rcx, 128(%[r])\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "# A[17] * B\n\t"
        "movq	%[b], %%rax\n\t"
        "xorq	%%rcx, %%rcx\n\t"
        "mulq	136(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "movq	%%r8, 136(%[r])\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "# A[18] * B\n\t"
        "movq	%[b], %%rax\n\t"
        "xorq	%%r8, %%r8\n\t"
        "mulq	144(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "movq	%%rbx, 144(%[r])\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "# A[19] * B\n\t"
        "movq	%[b], %%rax\n\t"
        "xorq	%%rbx, %%rbx\n\t"
        "mulq	152(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "movq	%%rcx, 152(%[r])\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "# A[20] * B\n\t"
        "movq	%[b], %%rax\n\t"
        "xorq	%%rcx, %%rcx\n\t"
        "mulq	160(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "movq	%%r8, 160(%[r])\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "# A[21] * B\n\t"
        "movq	%[b], %%rax\n\t"
        "xorq	%%r8, %%r8\n\t"
        "mulq	168(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "movq	%%rbx, 168(%[r])\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "# A[22] * B\n\t"
        "movq	%[b], %%rax\n\t"
        "xorq	%%rbx, %%rbx\n\t"
        "mulq	176(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "movq	%%rcx, 176(%[r])\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "# A[23] * B\n\t"
        "movq	%[b], %%rax\n\t"
        "xorq	%%rcx, %%rcx\n\t"
        "mulq	184(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "movq	%%r8, 184(%[r])\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "# A[24] * B\n\t"
        "movq	%[b], %%rax\n\t"
        "xorq	%%r8, %%r8\n\t"
        "mulq	192(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "movq	%%rbx, 192(%[r])\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "# A[25] * B\n\t"
        "movq	%[b], %%rax\n\t"
        "xorq	%%rbx, %%rbx\n\t"
        "mulq	200(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "movq	%%rcx, 200(%[r])\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "# A[26] * B\n\t"
        "movq	%[b], %%rax\n\t"
        "xorq	%%rcx, %%rcx\n\t"
        "mulq	208(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "movq	%%r8, 208(%[r])\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "# A[27] * B\n\t"
        "movq	%[b], %%rax\n\t"
        "xorq	%%r8, %%r8\n\t"
        "mulq	216(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "movq	%%rbx, 216(%[r])\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "# A[28] * B\n\t"
        "movq	%[b], %%rax\n\t"
        "xorq	%%rbx, %%rbx\n\t"
        "mulq	224(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "movq	%%rcx, 224(%[r])\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "# A[29] * B\n\t"
        "movq	%[b], %%rax\n\t"
        "xorq	%%rcx, %%rcx\n\t"
        "mulq	232(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "movq	%%r8, 232(%[r])\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "# A[30] * B\n\t"
        "movq	%[b], %%rax\n\t"
        "xorq	%%r8, %%r8\n\t"
        "mulq	240(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "movq	%%rbx, 240(%[r])\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "# A[31] * B\n\t"
        "movq	%[b], %%rax\n\t"
        "xorq	%%rbx, %%rbx\n\t"
        "mulq	248(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "movq	%%rcx, 248(%[r])\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "# A[32] * B\n\t"
        "movq	%[b], %%rax\n\t"
        "xorq	%%rcx, %%rcx\n\t"
        "mulq	256(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "movq	%%r8, 256(%[r])\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "# A[33] * B\n\t"
        "movq	%[b], %%rax\n\t"
        "xorq	%%r8, %%r8\n\t"
        "mulq	264(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "movq	%%rbx, 264(%[r])\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "# A[34] * B\n\t"
        "movq	%[b], %%rax\n\t"
        "xorq	%%rbx, %%rbx\n\t"
        "mulq	272(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "movq	%%rcx, 272(%[r])\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "# A[35] * B\n\t"
        "movq	%[b], %%rax\n\t"
        "xorq	%%rcx, %%rcx\n\t"
        "mulq	280(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "movq	%%r8, 280(%[r])\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "# A[36] * B\n\t"
        "movq	%[b], %%rax\n\t"
        "xorq	%%r8, %%r8\n\t"
        "mulq	288(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "movq	%%rbx, 288(%[r])\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "# A[37] * B\n\t"
        "movq	%[b], %%rax\n\t"
        "xorq	%%rbx, %%rbx\n\t"
        "mulq	296(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "movq	%%rcx, 296(%[r])\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "# A[38] * B\n\t"
        "movq	%[b], %%rax\n\t"
        "xorq	%%rcx, %%rcx\n\t"
        "mulq	304(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "movq	%%r8, 304(%[r])\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "# A[39] * B\n\t"
        "movq	%[b], %%rax\n\t"
        "xorq	%%r8, %%r8\n\t"
        "mulq	312(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "movq	%%rbx, 312(%[r])\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "# A[40] * B\n\t"
        "movq	%[b], %%rax\n\t"
        "xorq	%%rbx, %%rbx\n\t"
        "mulq	320(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "movq	%%rcx, 320(%[r])\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "# A[41] * B\n\t"
        "movq	%[b], %%rax\n\t"
        "xorq	%%rcx, %%rcx\n\t"
        "mulq	328(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "movq	%%r8, 328(%[r])\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "# A[42] * B\n\t"
        "movq	%[b], %%rax\n\t"
        "xorq	%%r8, %%r8\n\t"
        "mulq	336(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "movq	%%rbx, 336(%[r])\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "# A[43] * B\n\t"
        "movq	%[b], %%rax\n\t"
        "xorq	%%rbx, %%rbx\n\t"
        "mulq	344(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "movq	%%rcx, 344(%[r])\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        "# A[44] * B\n\t"
        "movq	%[b], %%rax\n\t"
        "xorq	%%rcx, %%rcx\n\t"
        "mulq	352(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "movq	%%r8, 352(%[r])\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "adcq	$0, %%rcx\n\t"
        "# A[45] * B\n\t"
        "movq	%[b], %%rax\n\t"
        "xorq	%%r8, %%r8\n\t"
        "mulq	360(%[a])\n\t"
        "addq	%%rax, %%rbx\n\t"
        "movq	%%rbx, 360(%[r])\n\t"
        "adcq	%%rdx, %%rcx\n\t"
        "adcq	$0, %%r8\n\t"
        "# A[46] * B\n\t"
        "movq	%[b], %%rax\n\t"
        "xorq	%%rbx, %%rbx\n\t"
        "mulq	368(%[a])\n\t"
        "addq	%%rax, %%rcx\n\t"
        "movq	%%rcx, 368(%[r])\n\t"
        "adcq	%%rdx, %%r8\n\t"
        "adcq	$0, %%rbx\n\t"
        /* Last step: no further word follows, so the high half plus carry
         * accumulated in rbx is written out directly as r[48]. */
        "# A[47] * B\n\t"
        "movq	%[b], %%rax\n\t"
        "mulq	376(%[a])\n\t"
        "addq	%%rax, %%r8\n\t"
        "adcq	%%rdx, %%rbx\n\t"
        "movq	%%r8, 376(%[r])\n\t"
        "movq	%%rbx, 384(%[r])\n\t"
        :
        : [r] "r" (r), [a] "r" (a), [b] "r" (b)
        : "memory", "rax", "rdx", "rbx", "rcx", "r8"
    );
}

#ifdef HAVE_INTEL_AVX2
/* Mul a by digit b into r. (r = a * b)
 *
 * Fully unrolled multiply of the 48-digit number a by the single digit b
 * using mulx with two independent carry chains (adcx on CF, adox on OF).
 * Words 0..47 of the product are stored to r[0..47] and the final carry
 * word is stored to r[48], so 49 digits of r are written.
 *
 * NOTE(review): the instructions used here (mulx, adcx, adox) belong to the
 * BMI2 and ADX extensions rather than AVX2 itself; presumably callers gate
 * this function on a CPU feature check - confirm.
 *
 * r  A single precision integer. Receives 49 digits.
 * a  A single precision integer. 48 digits are read.
 * b  A single precision digit.
 */
SP_NOINLINE static void sp_3072_mul_d_avx2_48(sp_digit* r, const sp_digit* a,
        const sp_digit b)
{
    __asm__ __volatile__ (
        /* b is loaded into rdx once: mulx takes rdx as its implicit
         * multiplicand and leaves both rdx and the flags untouched. The xor
         * zeroes r10 (used as the constant 0 below) and clears CF and OF,
         * which starts both carry chains: adcx (CF) adds the low half of each
         * product into the current word, adox (OF) adds the high half into
         * the next word. r8 and r9 alternate as current/next word. */
        "# A[0] * B\n\t"
        "movq	%[b], %%rdx\n\t"
        "xorq	%%r10, %%r10\n\t"
        "mulxq	(%[a]), %%r8, %%r9\n\t"
        "movq	%%r8, 0(%[r])\n\t"
        "# A[1] * B\n\t"
        "mulxq	8(%[a]), %%rax, %%rcx\n\t"
        "movq	%%r10, %%r8\n\t"
        "adcxq	%%rax, %%r9\n\t"
        "movq	%%r9, 8(%[r])\n\t"
        "adoxq	%%rcx, %%r8\n\t"
        "# A[2] * B\n\t"
        "mulxq	16(%[a]), %%rax, %%rcx\n\t"
        "movq	%%r10, %%r9\n\t"
        "adcxq	%%rax, %%r8\n\t"
        "movq	%%r8, 16(%[r])\n\t"
        "adoxq	%%rcx, %%r9\n\t"
        "# A[3] * B\n\t"
        "mulxq	24(%[a]), %%rax, %%rcx\n\t"
        "movq	%%r10, %%r8\n\t"
        "adcxq	%%rax, %%r9\n\t"
        "movq	%%r9, 24(%[r])\n\t"
        "adoxq	%%rcx, %%r8\n\t"
        "# A[4] * B\n\t"
        "mulxq	32(%[a]), %%rax, %%rcx\n\t"
        "movq	%%r10, %%r9\n\t"
        "adcxq	%%rax, %%r8\n\t"
        "movq	%%r8, 32(%[r])\n\t"
        "adoxq	%%rcx, %%r9\n\t"
        "# A[5] * B\n\t"
        "mulxq	40(%[a]), %%rax, %%rcx\n\t"
        "movq	%%r10, %%r8\n\t"
        "adcxq	%%rax, %%r9\n\t"
        "movq	%%r9, 40(%[r])\n\t"
        "adoxq	%%rcx, %%r8\n\t"
        "# A[6] * B\n\t"
        "mulxq	48(%[a]), %%rax, %%rcx\n\t"
        "movq	%%r10, %%r9\n\t"
        "adcxq	%%rax, %%r8\n\t"
        "movq	%%r8, 48(%[r])\n\t"
        "adoxq	%%rcx, %%r9\n\t"
        "# A[7] * B\n\t"
        "mulxq	56(%[a]), %%rax, %%rcx\n\t"
        "movq	%%r10, %%r8\n\t"
        "adcxq	%%rax, %%r9\n\t"
        "movq	%%r9, 56(%[r])\n\t"
        "adoxq	%%rcx, %%r8\n\t"
        "# A[8] * B\n\t"
        "mulxq	64(%[a]), %%rax, %%rcx\n\t"
        "movq	%%r10, %%r9\n\t"
        "adcxq	%%rax, %%r8\n\t"
        "movq	%%r8, 64(%[r])\n\t"
        "adoxq	%%rcx, %%r9\n\t"
        "# A[9] * B\n\t"
        "mulxq	72(%[a]), %%rax, %%rcx\n\t"
        "movq	%%r10, %%r8\n\t"
        "adcxq	%%rax, %%r9\n\t"
        "movq	%%r9, 72(%[r])\n\t"
        "adoxq	%%rcx, %%r8\n\t"
        "# A[10] * B\n\t"
        "mulxq	80(%[a]), %%rax, %%rcx\n\t"
        "movq	%%r10, %%r9\n\t"
        "adcxq	%%rax, %%r8\n\t"
        "movq	%%r8, 80(%[r])\n\t"
        "adoxq	%%rcx, %%r9\n\t"
        "# A[11] * B\n\t"
        "mulxq	88(%[a]), %%rax, %%rcx\n\t"
        "movq	%%r10, %%r8\n\t"
        "adcxq	%%rax, %%r9\n\t"
        "movq	%%r9, 88(%[r])\n\t"
        "adoxq	%%rcx, %%r8\n\t"
        "# A[12] * B\n\t"
        "mulxq	96(%[a]), %%rax, %%rcx\n\t"
        "movq	%%r10, %%r9\n\t"
        "adcxq	%%rax, %%r8\n\t"
        "movq	%%r8, 96(%[r])\n\t"
        "adoxq	%%rcx, %%r9\n\t"
        "# A[13] * B\n\t"
        "mulxq	104(%[a]), %%rax, %%rcx\n\t"
        "movq	%%r10, %%r8\n\t"
        "adcxq	%%rax, %%r9\n\t"
        "movq	%%r9, 104(%[r])\n\t"
        "adoxq	%%rcx, %%r8\n\t"
        "# A[14] * B\n\t"
        "mulxq	112(%[a]), %%rax, %%rcx\n\t"
        "movq	%%r10, %%r9\n\t"
        "adcxq	%%rax, %%r8\n\t"
        "movq	%%r8, 112(%[r])\n\t"
        "adoxq	%%rcx, %%r9\n\t"
        "# A[15] * B\n\t"
        "mulxq	120(%[a]), %%rax, %%rcx\n\t"
        "movq	%%r10, %%r8\n\t"
        "adcxq	%%rax, %%r9\n\t"
        "movq	%%r9, 120(%[r])\n\t"
        "adoxq	%%rcx, %%r8\n\t"
        "# A[16] * B\n\t"
        "mulxq	128(%[a]), %%rax, %%rcx\n\t"
        "movq	%%r10, %%r9\n\t"
        "adcxq	%%rax, %%r8\n\t"
        "movq	%%r8, 128(%[r])\n\t"
        "adoxq	%%rcx, %%r9\n\t"
        "# A[17] * B\n\t"
        "mulxq	136(%[a]), %%rax, %%rcx\n\t"
        "movq	%%r10, %%r8\n\t"
        "adcxq	%%rax, %%r9\n\t"
        "movq	%%r9, 136(%[r])\n\t"
        "adoxq	%%rcx, %%r8\n\t"
        "# A[18] * B\n\t"
        "mulxq	144(%[a]), %%rax, %%rcx\n\t"
        "movq	%%r10, %%r9\n\t"
        "adcxq	%%rax, %%r8\n\t"
        "movq	%%r8, 144(%[r])\n\t"
        "adoxq	%%rcx, %%r9\n\t"
        "# A[19] * B\n\t"
        "mulxq	152(%[a]), %%rax, %%rcx\n\t"
        "movq	%%r10, %%r8\n\t"
        "adcxq	%%rax, %%r9\n\t"
        "movq	%%r9, 152(%[r])\n\t"
        "adoxq	%%rcx, %%r8\n\t"
        "# A[20] * B\n\t"
        "mulxq	160(%[a]), %%rax, %%rcx\n\t"
        "movq	%%r10, %%r9\n\t"
        "adcxq	%%rax, %%r8\n\t"
        "movq	%%r8, 160(%[r])\n\t"
        "adoxq	%%rcx, %%r9\n\t"
        "# A[21] * B\n\t"
        "mulxq	168(%[a]), %%rax, %%rcx\n\t"
        "movq	%%r10, %%r8\n\t"
        "adcxq	%%rax, %%r9\n\t"
        "movq	%%r9, 168(%[r])\n\t"
        "adoxq	%%rcx, %%r8\n\t"
        "# A[22] * B\n\t"
        "mulxq	176(%[a]), %%rax, %%rcx\n\t"
        "movq	%%r10, %%r9\n\t"
        "adcxq	%%rax, %%r8\n\t"
        "movq	%%r8, 176(%[r])\n\t"
        "adoxq	%%rcx, %%r9\n\t"
        "# A[23] * B\n\t"
        "mulxq	184(%[a]), %%rax, %%rcx\n\t"
        "movq	%%r10, %%r8\n\t"
        "adcxq	%%rax, %%r9\n\t"
        "movq	%%r9, 184(%[r])\n\t"
        "adoxq	%%rcx, %%r8\n\t"
        "# A[24] * B\n\t"
        "mulxq	192(%[a]), %%rax, %%rcx\n\t"
        "movq	%%r10, %%r9\n\t"
        "adcxq	%%rax, %%r8\n\t"
        "movq	%%r8, 192(%[r])\n\t"
        "adoxq	%%rcx, %%r9\n\t"
        "# A[25] * B\n\t"
        "mulxq	200(%[a]), %%rax, %%rcx\n\t"
        "movq	%%r10, %%r8\n\t"
        "adcxq	%%rax, %%r9\n\t"
        "movq	%%r9, 200(%[r])\n\t"
        "adoxq	%%rcx, %%r8\n\t"
        "# A[26] * B\n\t"
        "mulxq	208(%[a]), %%rax, %%rcx\n\t"
        "movq	%%r10, %%r9\n\t"
        "adcxq	%%rax, %%r8\n\t"
        "movq	%%r8, 208(%[r])\n\t"
        "adoxq	%%rcx, %%r9\n\t"
        "# A[27] * B\n\t"
        "mulxq	216(%[a]), %%rax, %%rcx\n\t"
        "movq	%%r10, %%r8\n\t"
        "adcxq	%%rax, %%r9\n\t"
        "movq	%%r9, 216(%[r])\n\t"
        "adoxq	%%rcx, %%r8\n\t"
        "# A[28] * B\n\t"
        "mulxq	224(%[a]), %%rax, %%rcx\n\t"
        "movq	%%r10, %%r9\n\t"
        "adcxq	%%rax, %%r8\n\t"
        "movq	%%r8, 224(%[r])\n\t"
        "adoxq	%%rcx, %%r9\n\t"
        "# A[29] * B\n\t"
        "mulxq	232(%[a]), %%rax, %%rcx\n\t"
        "movq	%%r10, %%r8\n\t"
        "adcxq	%%rax, %%r9\n\t"
        "movq	%%r9, 232(%[r])\n\t"
        "adoxq	%%rcx, %%r8\n\t"
        "# A[30] * B\n\t"
        "mulxq	240(%[a]), %%rax, %%rcx\n\t"
        "movq	%%r10, %%r9\n\t"
        "adcxq	%%rax, %%r8\n\t"
        "movq	%%r8, 240(%[r])\n\t"
        "adoxq	%%rcx, %%r9\n\t"
        "# A[31] * B\n\t"
        "mulxq	248(%[a]), %%rax, %%rcx\n\t"
        "movq	%%r10, %%r8\n\t"
        "adcxq	%%rax, %%r9\n\t"
        "movq	%%r9, 248(%[r])\n\t"
        "adoxq	%%rcx, %%r8\n\t"
        "# A[32] * B\n\t"
        "mulxq	256(%[a]), %%rax, %%rcx\n\t"
        "movq	%%r10, %%r9\n\t"
        "adcxq	%%rax, %%r8\n\t"
        "movq	%%r8, 256(%[r])\n\t"
        "adoxq	%%rcx, %%r9\n\t"
        "# A[33] * B\n\t"
        "mulxq	264(%[a]), %%rax, %%rcx\n\t"
        "movq	%%r10, %%r8\n\t"
        "adcxq	%%rax, %%r9\n\t"
        "movq	%%r9, 264(%[r])\n\t"
        "adoxq	%%rcx, %%r8\n\t"
        "# A[34] * B\n\t"
        "mulxq	272(%[a]), %%rax, %%rcx\n\t"
        "movq	%%r10, %%r9\n\t"
        "adcxq	%%rax, %%r8\n\t"
        "movq	%%r8, 272(%[r])\n\t"
        "adoxq	%%rcx, %%r9\n\t"
        "# A[35] * B\n\t"
        "mulxq	280(%[a]), %%rax, %%rcx\n\t"
        "movq	%%r10, %%r8\n\t"
        "adcxq	%%rax, %%r9\n\t"
        "movq	%%r9, 280(%[r])\n\t"
        "adoxq	%%rcx, %%r8\n\t"
        "# A[36] * B\n\t"
        "mulxq	288(%[a]), %%rax, %%rcx\n\t"
        "movq	%%r10, %%r9\n\t"
        "adcxq	%%rax, %%r8\n\t"
        "movq	%%r8, 288(%[r])\n\t"
        "adoxq	%%rcx, %%r9\n\t"
        "# A[37] * B\n\t"
        "mulxq	296(%[a]), %%rax, %%rcx\n\t"
        "movq	%%r10, %%r8\n\t"
        "adcxq	%%rax, %%r9\n\t"
        "movq	%%r9, 296(%[r])\n\t"
        "adoxq	%%rcx, %%r8\n\t"
        "# A[38] * B\n\t"
        "mulxq	304(%[a]), %%rax, %%rcx\n\t"
        "movq	%%r10, %%r9\n\t"
        "adcxq	%%rax, %%r8\n\t"
        "movq	%%r8, 304(%[r])\n\t"
        "adoxq	%%rcx, %%r9\n\t"
        "# A[39] * B\n\t"
        "mulxq	312(%[a]), %%rax, %%rcx\n\t"
        "movq	%%r10, %%r8\n\t"
        "adcxq	%%rax, %%r9\n\t"
        "movq	%%r9, 312(%[r])\n\t"
        "adoxq	%%rcx, %%r8\n\t"
        "# A[40] * B\n\t"
        "mulxq	320(%[a]), %%rax, %%rcx\n\t"
        "movq	%%r10, %%r9\n\t"
        "adcxq	%%rax, %%r8\n\t"
        "movq	%%r8, 320(%[r])\n\t"
        "adoxq	%%rcx, %%r9\n\t"
        "# A[41] * B\n\t"
        "mulxq	328(%[a]), %%rax, %%rcx\n\t"
        "movq	%%r10, %%r8\n\t"
        "adcxq	%%rax, %%r9\n\t"
        "movq	%%r9, 328(%[r])\n\t"
        "adoxq	%%rcx, %%r8\n\t"
        "# A[42] * B\n\t"
        "mulxq	336(%[a]), %%rax, %%rcx\n\t"
        "movq	%%r10, %%r9\n\t"
        "adcxq	%%rax, %%r8\n\t"
        "movq	%%r8, 336(%[r])\n\t"
        "adoxq	%%rcx, %%r9\n\t"
        "# A[43] * B\n\t"
        "mulxq	344(%[a]), %%rax, %%rcx\n\t"
        "movq	%%r10, %%r8\n\t"
        "adcxq	%%rax, %%r9\n\t"
        "movq	%%r9, 344(%[r])\n\t"
        "adoxq	%%rcx, %%r8\n\t"
        "# A[44] * B\n\t"
        "mulxq	352(%[a]), %%rax, %%rcx\n\t"
        "movq	%%r10, %%r9\n\t"
        "adcxq	%%rax, %%r8\n\t"
        "movq	%%r8, 352(%[r])\n\t"
        "adoxq	%%rcx, %%r9\n\t"
        "# A[45] * B\n\t"
        "mulxq	360(%[a]), %%rax, %%rcx\n\t"
        "movq	%%r10, %%r8\n\t"
        "adcxq	%%rax, %%r9\n\t"
        "movq	%%r9, 360(%[r])\n\t"
        "adoxq	%%rcx, %%r8\n\t"
        "# A[46] * B\n\t"
        "mulxq	368(%[a]), %%rax, %%rcx\n\t"
        "movq	%%r10, %%r9\n\t"
        "adcxq	%%rax, %%r8\n\t"
        "movq	%%r8, 368(%[r])\n\t"
        "adoxq	%%rcx, %%r9\n\t"
        /* Last step: after both chains have been applied, the remaining CF
         * is folded into the top word (adcx with the zero in r10) before
         * r[47] and r[48] are stored. */
        "# A[47] * B\n\t"
        "mulxq	376(%[a]), %%rax, %%rcx\n\t"
        "movq	%%r10, %%r8\n\t"
        "adcxq	%%rax, %%r9\n\t"
        "adoxq	%%rcx, %%r8\n\t"
        "adcxq	%%r10, %%r8\n\t"
        "movq	%%r9, 376(%[r])\n\t"
        "movq	%%r8, 384(%[r])\n\t"
        :
        : [r] "r" (r), [a] "r" (a), [b] "r" (b)
        : "memory", "rax", "rdx", "rcx", "r8", "r9", "r10"
    );
}
#endif /* HAVE_INTEL_AVX2 */

/* Divide the double width number (d1|d0) by the divisor. (d1|d0 / div)
 *
 * A single divq is issued with d1 in rdx and d0 in rax; the quotient left in
 * rax is returned and the remainder left in rdx is discarded.
 *
 * NOTE(review): divq raises a divide error when div is zero or when the
 * quotient does not fit in one digit (d1 >= div); nothing here guards
 * against that, so callers presumably guarantee d1 < div - confirm.
 *
 * d1   The high order half of the number to divide.
 * d0   The low order half of the number to divide.
 * div  The divisor.
 * returns the quotient of the division.
 */
static sp_digit div_3072_word_48(sp_digit d1, sp_digit d0, sp_digit div)
{
    sp_digit quot = d0;  /* rax: low half in, quotient out */
    sp_digit rem = d1;   /* rdx: high half in, remainder out (unused) */

    __asm__ __volatile__ (
        "divq	%[div]\n\t"
        : "+a" (quot), "+d" (rem)
        : [div] "r" (div)
        : "cc"
    );

    return quot;
}

/* Mask every digit of a with m and write the result to r.
 * (r[i] = a[i] & m for i = 0..47)
 *
 * Digits are processed in ascending order. The small build uses a plain
 * loop; otherwise the loop is unrolled eight digits at a time.
 *
 * r  A single precision integer. Receives 48 digits.
 * a  A single precision integer. 48 digits are read.
 * m  Mask to AND against each digit.
 */
static void sp_3072_mask_48(sp_digit* r, sp_digit* a, sp_digit m)
{
    sp_digit* out = r;
    const sp_digit* in = a;
    const sp_digit* const end = a + 48;

#ifdef WOLFSSL_SP_SMALL
    while (in != end) {
        *out = *in & m;
        out++;
        in++;
    }
#else
    while (in != end) {
        out[0] = in[0] & m;
        out[1] = in[1] & m;
        out[2] = in[2] & m;
        out[3] = in[3] & m;
        out[4] = in[4] & m;
        out[5] = in[5] & m;
        out[6] = in[6] & m;
        out[7] = in[7] & m;
        out += 8;
        in += 8;
    }
#endif
}

/* Compare a with b in constant time.
 *
 * a  A single precision integer.
 * b  A single precision integer.
 * return -ve, 0 or +ve if a is less than, equal to or greater than b
 * respectively.
 */
static int64_t sp_3072_cmp_48(sp_digit* a, sp_digit* b)
{
    sp_digit r = -1;
    sp_digit one = 1;

    __asm__ __volatile__ (
        "xorq	%%rcx, %%rcx\n\t"
        "movq	$-1, %%rdx\n\t"
        "movq	376(%[a]), %%rbx\n\t"
        "movq	376(%[b]), %%r8\n\t"
        "andq	%%rdx, %%rbx\n\t"
        "andq	%%rdx, %%r8\n\t"
        "subq	%%r8, %%rbx\n\t"
        "cmova	%[one], %[r]\n\t"
        "cmovc	%%rdx, %[r]\n\t"
        "cmovnz	%%rcx, %%rdx\n\t"
        "movq	368(%[a]), %%rbx\n\t"
        "movq	368(%[b]), %%r8\n\t"
        "andq	%%rdx, %%rbx\n\t"
        "andq	%%rdx, %%r8\n\t"
        "subq	%%r8, %%rbx\n\t"
        "cmova	%[one], %[r]\n\t"
        "cmovc	%%rdx, %[r]\n\t"
        "cmovnz	%%rcx, %%rdx\n\t"
        "movq	360(%[a]), %%rbx\n\t"
        "movq	360(%[b]), %%r8\n\t"
        "andq	%%rdx, %%rbx\n\t"
        "andq	%%rdx, %%r8\n\t"
        "subq	%%r8, %%rbx\n\t"
        "cmova	%[one], %[r]\n\t"
        "cmovc	%%rdx, %[r]\n\t"
        "cmovnz	%%rcx, %%rdx\n\t"
        "movq	352(%[a]), %%rbx\n\t"
        "movq	352(%[b]), %%r8\n\t"
        "andq	%%rdx, %%rbx\n\t"
        "andq	%%rdx, %%r8\n\t"
        "subq	%%r8, %%rbx\n\t"
        "cmova	%[one], %[r]\n\t"
        "cmovc	%%rdx, %[r]\n\t"
        "cmovnz	%%rcx, %%rdx\n\t"
        "movq	344(%[a]), %%rbx\n\t"
        "movq	344(%[b]), %%r8\n\t"
        "andq	%%rdx, %%rbx\n\t"
        "andq	%%rdx, %%r8\n\t"
        "subq	%%r8, %%rbx\n\t"
        "cmova	%[one], %[r]\n\t"
        "cmovc	%%rdx, %[r]\n\t"
        "cmovnz	%%rcx, %%rdx\n\t"
        "movq	336(%[a]), %%rbx\n\t"
        "movq	336(%[b]), %%r8\n\t"
        "andq	%%rdx, %%rbx\n\t"
        "andq	%%rdx, %%r8\n\t"
        "subq	%%r8, %%rbx\n\t"
        "cmova	%[one], %[r]\n\t"
        "cmovc	%%rdx, %[r]\n\t"
        "cmovnz	%%rcx, %%rdx\n\t"
        "movq	328(%[a]), %%rbx\n\t"
        "movq	328(%[b]), %%r8\n\t"
        "andq	%%rdx, %%rbx\n\t"
        "andq	%%rdx, %%r8\n\t"
        "subq	%%r8, %%rbx\n\t"
        "cmova	%[one], %[r]\n\t"
        "cmovc	%%rdx, %[r]\n\t"
        "cmovnz	%%rcx, %%rdx\n\t"
        "movq	320(%[a]), %%rbx\n\t"
        "movq	320(%[b]), %%r8\n\t"
        "andq	%%rdx, %%rbx\n\t"
        "andq	%%rdx, %%r8\n\t"
        "subq	%%r8, %%rbx\n\t"
        "cmova	%[one], %[r]\n\t"
        "cmovc	%%rdx, %[r]\n\t"
        "cmovnz	%%rcx, %%rdx\n\t"
        "movq	312(%[a]), %%rbx\n\t"
        "movq	312(%[b]), %%r8\n\t"
        "andq	%%rdx, %%rbx\n\t"
        "andq	%%rdx, %%r8\n\t"
        "subq	%%r8, %%rbx\n\t"
        "cmova	%[one], %[r]\n\t"
        "cmovc	%%rdx, %[r]\n\t"
        "cmovnz	%%rcx, %%rdx\n\t"
        "movq	304(%[a]), %%rbx\n\t"
        "movq	304(%[b]), %%r8\n\t"
        "andq	%%rdx, %%rbx\n\t"
        "andq	%%rdx, %%r8\n\t"
        "subq	%%r8, %%rbx\n\t"
        "cmova	%[one], %[r]\n\t"
        "cmovc	%%rdx, %[r]\n\t"
        "cmovnz	%%rcx, %%rdx\n\t"
        "movq	296(%[a]), %%rbx\n\t"
        "movq	296(%[b]), %%r8\n\t"
        "andq	%%rdx, %%rbx\n\t"
        "andq	%%rdx, %%r8\n\t"
        "subq	%%r8, %%rbx\n\t"
        "cmova	%[one], %[r]\n\t"
        "cmovc	%%rdx, %[r]\n\t"
        "cmovnz	%%rcx, %%rdx\n\t"
        "movq	288(%[a]), %%rbx\n\t"
        "movq	288(%[b]), %%r8\n\t"
        "andq	%%rdx, %%rbx\n\t"
        "andq	%%rdx, %%r8\n\t"
        "subq	%%r8, %%rbx\n\t"
        "cmova	%[one], %[r]\n\t"
        "cmovc	%%rdx, %[r]\n\t"
        "cmovnz	%%rcx, %%rdx\n\t"
        "movq	280(%[a]), %%rbx\n\t"
        "movq	280(%[b]), %%r8\n\t"
        "andq	%%rdx, %%rbx\n\t"
        "andq	%%rdx, %%r8\n\t"
        "subq	%%r8, %%rbx\n\t"
        "cmova	%[one], %[r]\n\t"
        "cmovc	%%rdx, %[r]\n\t"
        "cmovnz	%%rcx, %%rdx\n\t"
        "movq	272(%[a]), %%rbx\n\t"
        "movq	272(%[b]), %%r8\n\t"
        "andq	%%rdx, %%rbx\n\t"
        "andq	%%rdx, %%r8\n\t"
        "subq	%%r8, %%rbx\n\t"
        "cmova	%[one], %[r]\n\t"
        "cmovc	%%rdx, %[r]\n\t"
        "cmovnz	%%rcx, %%rdx\n\t"
        "movq	264(%[a]), %%rbx\n\t"
        "movq	264(%[b]), %%r8\n\t"
        "andq	%%rdx, %%rbx\n\t"
        "andq	%%rdx, %%r8\n\t"
        "subq	%%r8, %%rbx\n\t"
        "cmova	%[one], %[r]\n\t"
        "cmovc	%%rdx, %[r]\n\t"
        "cmovnz	%%rcx, %%rdx\n\t"
        "movq	256(%[a]), %%rbx\n\t"
        "movq	256(%[b]), %%r8\n\t"
        "andq	%%rdx, %%rbx\n\t"
        "andq	%%rdx, %%r8\n\t"
        "subq	%%r8, %%rbx\n\t"
        "cmova	%[one], %[r]\n\t"
        "cmovc	%%rdx, %[r]\n\t"
        "cmovnz	%%rcx, %%rdx\n\t"
        "movq	248(%[a]), %%rbx\n\t"
        "movq	248(%[b]), %%r8\n\t"
        "andq	%%rdx, %%rbx\n\t"
        "andq	%%rdx, %%r8\n\t"
        "subq	%%r8, %%rbx\n\t"
        "cmova	%[one], %[r]\n\t"
        "cmovc	%%rdx, %[r]\n\t"
        "cmovnz	%%rcx, %%rdx\n\t"
        "movq	240(%[a]), %%rbx\n\t"
        "movq	240(%[b]), %%r8\n\t"
        "andq	%%rdx, %%rbx\n\t"
        "andq	%%rdx, %%r8\n\t"
        "subq	%%r8, %%rbx\n\t"
        "cmova	%[one], %[r]\n\t"
        "cmovc	%%rdx, %[r]\n\t"
        "cmovnz	%%rcx, %%rdx\n\t"
        "movq	232(%[a]), %%rbx\n\t"
        "movq	232(%[b]), %%r8\n\t"
        "andq	%%rdx, %%rbx\n\t"
        "andq	%%rdx, %%r8\n\t"
        "subq	%%r8, %%rbx\n\t"
        "cmova	%[one], %[r]\n\t"
        "cmovc	%%rdx, %[r]\n\t"
        "cmovnz	%%rcx, %%rdx\n\t"
        "movq	224(%[a]), %%rbx\n\t"
        "movq	224(%[b]), %%r8\n\t"
        "andq	%%rdx, %%rbx\n\t"
        "andq	%%rdx, %%r8\n\t"
        "subq	%%r8, %%rbx\n\t"
        "cmova	%[one], %[r]\n\t"
        "cmovc	%%rdx, %[r]\n\t"
        "cmovnz	%%rcx, %%rdx\n\t"
        "movq	216(%[a]), %%rbx\n\t"
        "movq	216(%[b]), %%r8\n\t"
        "andq	%%rdx, %%rbx\n\t"
        "andq	%%rdx, %%r8\n\t"
        "subq	%%r8, %%rbx\n\t"
        "cmova	%[one], %[r]\n\t"
        "cmovc	%%rdx, %[r]\n\t"
        "cmovnz	%%rcx, %%rdx\n\t"
        "movq	208(%[a]), %%rbx\n\t"
        "movq	208(%[b]), %%r8\n\t"
        "andq	%%rdx, %%rbx\n\t"
        "andq	%%rdx, %%r8\n\t"
        "subq	%%r8, %%rbx\n\t"
        "cmova	%[one], %[r]\n\t"
        "cmovc	%%rdx, %[r]\n\t"
        "cmovnz	%%rcx, %%rdx\n\t"
        "movq	200(%[a]), %%rbx\n\t"
        "movq	200(%[b]), %%r8\n\t"
        "andq	%%rdx, %%rbx\n\t"
        "andq	%%rdx, %%r8\n\t"
        "subq	%%r8, %%rbx\n\t"
        "cmova	%[one], %[r]\n\t"
        "cmovc	%%rdx, %[r]\n\t"
        "cmovnz	%%rcx, %%rdx\n\t"
        "movq	192(%[a]), %%rbx\n\t"
        "movq	192(%[b]), %%r8\n\t"
        "andq	%%rdx, %%rbx\n\t"
        "andq	%%rdx, %%r8\n\t"
        "subq	%%r8, %%rbx\n\t"
        "cmova	%[one], %[r]\n\t"
        "cmovc	%%rdx, %[r]\n\t"
        "cmovnz	%%rcx, %%rdx\n\t"
        "movq	184(%[a]), %%rbx\n\t"
        "movq	184(%[b]), %%r8\n\t"
        "andq	%%rdx, %%rbx\n\t"
        "andq	%%rdx, %%r8\n\t"
        "subq	%%r8, %%rbx\n\t"
        "cmova	%[one], %[r]\n\t"
        "cmovc	%%rdx, %[r]\n\t"
        "cmovnz	%%rcx, %%rdx\n\t"
        "movq	176(%[a]), %%rbx\n\t"
        "movq	176(%[b]), %%r8\n\t"
        "andq	%%rdx, %%rbx\n\t"
        "andq	%%rdx, %%r8\n\t"
        "subq	%%r8, %%rbx\n\t"
        "cmova	%[one], %[r]\n\t"
        "cmovc	%%rdx, %[r]\n\t"
        "cmovnz	%%rcx, %%rdx\n\t"
        "movq	168(%[a]), %%rbx\n\t"
        "movq	168(%[b]), %%r8\n\t"
        "andq	%%rdx, %%rbx\n\t"
        "andq	%%rdx, %%r8\n\t"
        "subq	%%r8, %%rbx\n\t"
        "cmova	%[one], %[r]\n\t"
        "cmovc	%%rdx, %[r]\n\t"
        "cmovnz	%%rcx, %%rdx\n\t"
        "movq	160(%[a]), %%rbx\n\t"
        "movq	160(%[b]), %%r8\n\t"
        "andq	%%rdx, %%rbx\n\t"
        "andq	%%rdx, %%r8\n\t"
        "subq	%%r8, %%rbx\n\t"
        "cmova	%[one], %[r]\n\t"
        "cmovc	%%rdx, %[r]\n\t"
        "cmovnz	%%rcx, %%rdx\n\t"
        "movq	152(%[a]), %%rbx\n\t"
        "movq	152(%[b]), %%r8\n\t"
        "andq	%%rdx, %%rbx\n\t"
        "andq	%%rdx, %%r8\n\t"
        "subq	%%r8, %%rbx\n\t"
        "cmova	%[one], %[r]\n\t"
        "cmovc	%%rdx, %[r]\n\t"
        "cmovnz	%%rcx, %%rdx\n\t"
        "movq	144(%[a]), %%rbx\n\t"
        "movq	144(%[b]), %%r8\n\t"
        "andq	%%rdx, %%rbx\n\t"
        "andq	%%rdx, %%r8\n\t"
        "subq	%%r8, %%rbx\n\t"
        "cmova	%[one], %[r]\n\t"
        "cmovc	%%rdx, %[r]\n\t"
        "cmovnz	%%rcx, %%rdx\n\t"
        "movq	136(%[a]), %%rbx\n\t"
        "movq	136(%[b]), %%r8\n\t"
        "andq	%%rdx, %%rbx\n\t"
        "andq	%%rdx, %%r8\n\t"
        "subq	%%r8, %%rbx\n\t"
        "cmova	%[one], %[r]\n\t"
        "cmovc	%%rdx, %[r]\n\t"
        "cmovnz	%%rcx, %%rdx\n\t"
        "movq	128(%[a]), %%rbx\n\t"
        "movq	128(%[b]), %%r8\n\t"
        "andq	%%rdx, %%rbx\n\t"
        "andq	%%rdx, %%r8\n\t"
        "subq	%%r8, %%rbx\n\t"
        "cmova	%[one], %[r]\n\t"
        "cmovc	%%rdx, %[r]\n\t"
        "cmovnz	%%rcx, %%rdx\n\t"
        "movq	120(%[a]), %%rbx\n\t"
        "movq	120(%[b]), %%r8\n\t"
        "andq	%%rdx, %%rbx\n\t"
        "andq	%%rdx, %%r8\n\t"
        "subq	%%r8, %%rbx\n\t"
        "cmova	%[one], %[r]\n\t"
        "cmovc	%%rdx, %[r]\n\t"
        "cmovnz	%%rcx, %%rdx\n\t"
        "movq	112(%[a]), %%rbx\n\t"
        "movq	112(%[b]), %%r8\n\t"
        "andq	%%rdx, %%rbx\n\t"
        "andq	%%rdx, %%r8\n\t"
        "subq	%%r8, %%rbx\n\t"
        "cmova	%[one], %[r]\n\t"
        "cmovc	%%rdx, %[r]\n\t"
        "cmovnz	%%rcx, %%rdx\n\t"
        "movq	104(%[a]), %%rbx\n\t"
        "movq	104(%[b]), %%r8\n\t"
        "andq	%%rdx, %%rbx\n\t"
        "andq	%%rdx, %%r8\n\t"
        "subq	%%r8, %%rbx\n\t"
        "cmova	%[one], %[r]\n\t"
        "cmovc	%%rdx, %[r]\n\t"
        "cmovnz	%%rcx, %%rdx\n\t"
        "movq	96(%[a]), %%rbx\n\t"
        "movq	96(%[b]), %%r8\n\t"
        "andq	%%rdx, %%rbx\n\t"
        "andq	%%rdx, %%r8\n\t"
        "subq	%%r8, %%rbx\n\t"
        "cmova	%[one], %[r]\n\t"
        "cmovc	%%rdx, %[r]\n\t"
        "cmovnz	%%rcx, %%rdx\n\t"
        "movq	88(%[a]), %%rbx\n\t"
        "movq	88(%[b]), %%r8\n\t"
        "andq	%%rdx, %%rbx\n\t"
        "andq	%%rdx, %%r8\n\t"
        "subq	%%r8, %%rbx\n\t"
        "cmova	%[one], %[r]\n\t"
        "cmovc	%%rdx, %[r]\n\t"
        "cmovnz	%%rcx, %%rdx\n\t"
        "movq	80(%[a]), %%rbx\n\t"
        "movq	80(%[b]), %%r8\n\t"
        "andq	%%rdx, %%rbx\n\t"
        "andq	%%rdx, %%r8\n\t"
        "subq	%%r8, %%rbx\n\t"
        "cmova	%[one], %[r]\n\t"
        "cmovc	%%rdx, %[r]\n\t"
        "cmovnz	%%rcx, %%rdx\n\t"
        "movq	72(%[a]), %%rbx\n\t"
        "movq	72(%[b]), %%r8\n\t"
        "andq	%%rdx, %%rbx\n\t"
        "andq	%%rdx, %%r8\n\t"
        "subq	%%r8, %%rbx\n\t"
        "cmova	%[one], %[r]\n\t"
        "cmovc	%%rdx, %[r]\n\t"
        "cmovnz	%%rcx, %%rdx\n\t"
        "movq	64(%[a]), %%rbx\n\t"
        "movq	64(%[b]), %%r8\n\t"
        "andq	%%rdx, %%rbx\n\t"
        "andq	%%rdx, %%r8\n\t"
        "subq	%%r8, %%rbx\n\t"
        "cmova	%[one], %[r]\n\t"
        "cmovc	%%rdx, %[r]\n\t"
        "cmovnz	%%rcx, %%rdx\n\t"
        "movq	56(%[a]), %%rbx\n\t"
        "movq	56(%[b]), %%r8\n\t"
        "andq	%%rdx, %%rbx\n\t"
        "andq	%%rdx, %%r8\n\t"
        "subq	%%r8, %%rbx\n\t"
        "cmova	%[one], %[r]\n\t"
        "cmovc	%%rdx, %[r]\n\t"
        "cmovnz	%%rcx, %%rdx\n\t"
        "movq	48(%[a]), %%rbx\n\t"
        "movq	48(%[b]), %%r8\n\t"
        "andq	%%rdx, %%rbx\n\t"
        "andq	%%rdx, %%r8\n\t"
        "subq	%%r8, %%rbx\n\t"
        "cmova	%[one], %[r]\n\t"
        "cmovc	%%rdx, %[r]\n\t"
        "cmovnz	%%rcx, %%rdx\n\t"
        "movq	40(%[a]), %%rbx\n\t"
        "movq	40(%[b]), %%r8\n\t"
        "andq	%%rdx, %%rbx\n\t"
        "andq	%%rdx, %%r8\n\t"
        "subq	%%r8, %%rbx\n\t"
        "cmova	%[one], %[r]\n\t"
        "cmovc	%%rdx, %[r]\n\t"
        "cmovnz	%%rcx, %%rdx\n\t"
        "movq	32(%[a]), %%rbx\n\t"
        "movq	32(%[b]), %%r8\n\t"
        "andq	%%rdx, %%rbx\n\t"
        "andq	%%rdx, %%r8\n\t"
        "subq	%%r8, %%rbx\n\t"
        "cmova	%[one], %[r]\n\t"
        "cmovc	%%rdx, %[r]\n\t"
        "cmovnz	%%rcx, %%rdx\n\t"
        "movq	24(%[a]), %%rbx\n\t"
        "movq	24(%[b]), %%r8\n\t"
        "andq	%%rdx, %%rbx\n\t"
        "andq	%%rdx, %%r8\n\t"
        "subq	%%r8, %%rbx\n\t"
        "cmova	%[one], %[r]\n\t"
        "cmovc	%%rdx, %[r]\n\t"
        "cmovnz	%%rcx, %%rdx\n\t"
        "movq	16(%[a]), %%rbx\n\t"
        "movq	16(%[b]), %%r8\n\t"
        "andq	%%rdx, %%rbx\n\t"
        "andq	%%rdx, %%r8\n\t"
        "subq	%%r8, %%rbx\n\t"
        "cmova	%[one], %[r]\n\t"
        "cmovc	%%rdx, %[r]\n\t"
        "cmovnz	%%rcx, %%rdx\n\t"
        "movq	8(%[a]), %%rbx\n\t"
        "movq	8(%[b]), %%r8\n\t"
        "andq	%%rdx, %%rbx\n\t"
        "andq	%%rdx, %%r8\n\t"
        "subq	%%r8, %%rbx\n\t"
        "cmova	%[one], %[r]\n\t"
        "cmovc	%%rdx, %[r]\n\t"
        "cmovnz	%%rcx, %%rdx\n\t"
        "movq	0(%[a]), %%rbx\n\t"
        "movq	0(%[b]), %%r8\n\t"
        "andq	%%rdx, %%rbx\n\t"
        "andq	%%rdx, %%r8\n\t"
        "subq	%%r8, %%rbx\n\t"
        "cmova	%[one], %[r]\n\t"
        "cmovc	%%rdx, %[r]\n\t"
        "cmovnz	%%rcx, %%rdx\n\t"
        "xorq	%%rdx, %[r]\n\t"
        : [r] "+r" (r)
        : [a] "r" (a), [b] "r" (b), [one] "r" (one)
        : "rax", "rdx", "rcx", "rbx", "r8"
    );

    return r;
}

/* Divide d in a and put remainder into r (m*d + r = a)
 * m is not calculated as it is not needed at this time.
 *
 * One quotient digit is estimated per pass from the top two digits of the
 * running remainder and the top digit of d. The estimate times d is
 * subtracted and then two masked add-backs are always performed, so the loop
 * body contains no data dependent branch.
 *
 * a  Number to be divided. 96 digits (2 * 48) are copied from it.
 * d  Number to divide with. 48 digits.
 * m  Multiplier result. Unused.
 * r  Remainder from the division. 48 digits are written.
 * returns MP_OKAY indicating success.
 */
static INLINE int sp_3072_div_48(sp_digit* a, sp_digit* d, sp_digit* m,
        sp_digit* r)
{
    sp_digit t1[96], t2[49];
    sp_digit div, r1;
    int i;
#ifdef HAVE_INTEL_AVX2
    word32 cpuid_flags = cpuid_get_flags();
#endif

    (void)m;

    div = d[47];
    /* Work on a copy so that a is left untouched. */
    XMEMCPY(t1, a, sizeof(*t1) * 2 * 48);
    for (i=47; i>=0; i--) {
        /* Estimate the next quotient digit. */
        r1 = div_3072_word_48(t1[48 + i], t1[48 + i - 1], div);

        /* t2 = d * r1 (49 digits). */
#ifdef HAVE_INTEL_AVX2
        if (IS_INTEL_BMI2(cpuid_flags) && IS_INTEL_ADX(cpuid_flags))
            sp_3072_mul_d_avx2_48(t2, d, r1);
        else
#endif
            sp_3072_mul_d_48(t2, d, r1);
        /* Subtract the product from the window t1[i..i+48]. */
        t1[48 + i] += sp_3072_sub_in_place_48(&t1[i], t2);
        t1[48 + i] -= t2[48];
        /* Two add-backs, each using the top digit as the mask
         * (presumably t2 = d & mask - confirm against sp_3072_mask_48). */
        sp_3072_mask_48(t2, d, t1[48 + i]);
        t1[48 + i] += sp_3072_add_48(&t1[i], &t1[i], t2);
        sp_3072_mask_48(t2, d, t1[48 + i]);
        t1[48 + i] += sp_3072_add_48(&t1[i], &t1[i], t2);
    }

    /* Final correction: when the remainder is still >= d, subtract d.
     * The subtrahend must be the divisor; t2 only holds scratch data from
     * the last loop pass.
     */
    r1 = sp_3072_cmp_48(t1, d) >= 0;
    sp_3072_cond_sub_48(r, t1, d, (sp_digit)0 - r1);

    return MP_OKAY;
}

/* Reduce a modulo m into r. (r = a mod m)
 *
 * Thin wrapper around sp_3072_div_48() that passes NULL for the unused
 * quotient argument.
 *
 * r  A single precision number that is the reduced result.
 * a  A single precision number that is to be reduced. sp_3072_div_48()
 *    copies 96 digits from it, so it must be a double width buffer.
 * m  A single precision number that is the modulus to reduce with.
 * returns MP_OKAY indicating success.
 */
static INLINE int sp_3072_mod_48(sp_digit* r, sp_digit* a, sp_digit* m)
{
    return sp_3072_div_48(a, m, NULL, r);
}

/* Divide d in a and put remainder into r (m*d + r = a)
 * m is not calculated as it is not needed at this time.
 *
 * Same algorithm as sp_3072_div_48() but the add-back corrections are only
 * performed when needed, so the work done depends on the values.
 *
 * a  Number to be divided. 96 digits (2 * 48) are copied from it.
 * d  Number to divide with. 48 digits.
 * m  Multiplier result. Unused.
 * r  Remainder from the division. 48 digits are written.
 * returns MP_OKAY indicating success.
 */
static INLINE int sp_3072_div_48_cond(sp_digit* a, sp_digit* d, sp_digit* m,
        sp_digit* r)
{
    sp_digit t1[96], t2[49];
    sp_digit div, r1;
    int i;
#ifdef HAVE_INTEL_AVX2
    word32 cpuid_flags = cpuid_get_flags();
#endif

    (void)m;

    div = d[47];
    /* Work on a copy so that a is left untouched. */
    XMEMCPY(t1, a, sizeof(*t1) * 2 * 48);
    for (i=47; i>=0; i--) {
        /* Estimate the next quotient digit. */
        r1 = div_3072_word_48(t1[48 + i], t1[48 + i - 1], div);

        /* t2 = d * r1 (49 digits). */
#ifdef HAVE_INTEL_AVX2
        if (IS_INTEL_BMI2(cpuid_flags) && IS_INTEL_ADX(cpuid_flags))
            sp_3072_mul_d_avx2_48(t2, d, r1);
        else
#endif
            sp_3072_mul_d_48(t2, d, r1);
        /* Subtract the product from the window t1[i..i+48]. */
        t1[48 + i] += sp_3072_sub_in_place_48(&t1[i], t2);
        t1[48 + i] -= t2[48];
        /* Add d back, at most twice, while the top digit is non-zero. */
        if (t1[48 + i] != 0) {
            t1[48 + i] += sp_3072_add_48(&t1[i], &t1[i], d);
            if (t1[48 + i] != 0)
                t1[48 + i] += sp_3072_add_48(&t1[i], &t1[i], d);
        }
    }

    /* Final correction: when the remainder is still >= d, subtract d.
     * The subtrahend must be the divisor; t2 still holds the last d * r1
     * product and is not a valid value to subtract.
     */
    r1 = sp_3072_cmp_48(t1, d) >= 0;
    sp_3072_cond_sub_48(r, t1, d, (sp_digit)0 - r1);

    return MP_OKAY;
}

/* Reduce a modulo m into r. (r = a mod m)
 *
 * Thin wrapper around sp_3072_div_48_cond() that passes NULL for the unused
 * quotient argument. That division branches on intermediate values.
 *
 * r  A single precision number that is the reduced result.
 * a  A single precision number that is to be reduced. sp_3072_div_48_cond()
 *    copies 96 digits from it, so it must be a double width buffer.
 * m  A single precision number that is the modulus to reduce with.
 * returns MP_OKAY indicating success.
 */
static INLINE int sp_3072_mod_48_cond(sp_digit* r, sp_digit* a, sp_digit* m)
{
    return sp_3072_div_48_cond(a, m, NULL, r);
}

#if defined(SP_RSA_PRIVATE_EXP_D) || defined(WOLFSSL_HAVE_SP_DH)
/* Modular exponentiation using a fixed 5-bit window. (r = a^e mod m)
 *
 * A table of 32 entries is built from the base and the exponent is consumed
 * five bits at a time starting at the most significant word: five squarings
 * followed by one multiplication by the table entry selected by the window.
 *
 * r        Result of the operation. The upper 48 digits are cleared before
 *          the final reduction, so room for 96 digits is required.
 * a        Number being exponentiated.
 * e        Exponent.
 * bits     The number of bits in the exponent.
 * m        Modulus.
 * reduceA  When non-zero, a is reduced modulo m before it is used.
 * returns 0 on success and MEMORY_E on dynamic memory allocation failure.
 */
static int sp_3072_mod_exp_48(sp_digit* r, sp_digit* a, sp_digit* e,
        int bits, sp_digit* m, int reduceA)
{
#ifdef WOLFSSL_SMALL_STACK
    sp_digit* t[32];
    sp_digit* td;
#else
    sp_digit t[32][96];
#endif
    sp_digit* norm;
    sp_digit mp = 1;
    sp_digit cur;
    sp_digit mask;
    int err = MP_OKAY;
    int i;
    int k;
    int avail;
    int idx;

#ifdef WOLFSSL_SMALL_STACK
    /* One allocation holds all 32 table entries of 96 digits each. */
    td = (sp_digit*)XMALLOC(sizeof(sp_digit) * 32 * 96, NULL,
                            DYNAMIC_TYPE_TMP_BUFFER);
    if (td != NULL) {
        for (i = 0; i < 32; i++)
            t[i] = td + i * 96;
        norm = t[0];
    }
    else {
        err = MEMORY_E;
    }
#else
    norm = t[0];
#endif

    if (err == MP_OKAY) {
        sp_3072_mont_setup(m, &mp);
        /* Entry 0 is filled by sp_3072_mont_norm_48(). */
        sp_3072_mont_norm_48(norm, m);

        /* Entry 1: place the base in the upper 48 digits above 48 zero
         * digits and reduce the 96 digit value modulo m.
         */
        XMEMSET(t[1], 0, sizeof(sp_digit) * 48);
        if (reduceA)
            err = sp_3072_mod_48(t[1] + 48, a, m);
        else
            XMEMCPY(t[1] + 48, a, sizeof(sp_digit) * 48);
        if (err == MP_OKAY)
            err = sp_3072_mod_48(t[1], t[1], m);
    }

    if (err == MP_OKAY) {
        /* Entries 2..31: t[2k] = t[k]^2 and t[2k+1] = t[k+1] * t[k]. */
        for (k = 1; k < 16; k++) {
            sp_3072_mont_sqr_48(t[2 * k], t[k], m, mp);
            sp_3072_mont_mul_48(t[2 * k + 1], t[k + 1], t[k], m, mp);
        }

        /* Start from the top five bits of the most significant word. */
        i = (bits - 1) / 64;
        cur = e[i--];
        idx = cur >> 59;
        cur <<= 5;
        avail = 59;
        XMEMCPY(r, t[idx], sizeof(sp_digit) * 48);
        while (i >= 0 || avail >= 5) {
            if (avail >= 5) {
                /* A whole window is available in the current word. */
                idx = (cur >> 59) & 0x1f;
                cur <<= 5;
                avail -= 5;
            }
            else if (avail == 0) {
                /* Current word used up: start on the next one. */
                cur = e[i--];
                idx = cur >> 59;
                cur <<= 5;
                avail = 59;
            }
            else {
                /* Window straddles two words: join the leftover bits with
                 * the top bits of the next word.
                 */
                idx = cur >> 59;
                cur = e[i--];
                avail = 5 - avail;
                idx |= cur >> (64 - avail);
                cur <<= avail;
                avail = 64 - avail;
            }

            for (k = 0; k < 5; k++)
                sp_3072_mont_sqr_48(r, r, m, mp);

            sp_3072_mont_mul_48(r, r, t[idx], m, mp);
        }
        /* Fewer than five bits remain: finish with a short window. */
        idx = e[0] & ((1 << avail) - 1);
        while (avail > 0) {
            sp_3072_mont_sqr_48(r, r, m, mp);
            avail--;
        }
        sp_3072_mont_mul_48(r, r, t[idx], m, mp);

        /* Clear the upper half and run one more Montgomery reduction. */
        XMEMSET(&r[48], 0, sizeof(sp_digit) * 48);
        sp_3072_mont_reduce_48(r, m, mp);

        /* Subtract m once more when the result compares >= m. */
        mask = 0 - (sp_3072_cmp_48(r, m) >= 0);
        sp_3072_cond_sub_48(r, r, m, mask);
    }

#ifdef WOLFSSL_SMALL_STACK
    if (td != NULL)
        XFREE(td, NULL, DYNAMIC_TYPE_TMP_BUFFER);
#endif

    return err;
}
#endif /* SP_RSA_PRIVATE_EXP_D || WOLFSSL_HAVE_SP_DH */

#ifdef HAVE_INTEL_AVX2
/* Reduce the number back to 3072 bits using Montgomery reduction.
 *
 * Implemented with MULX and the two independent ADCX/ADOX carry chains.
 * The loop runs 48 times; pass i computes mu = a[i] * mp and adds m * mu
 * to a[i..i+47], propagating the carry into a[i+48]. The upper 48 digits
 * then have m conditionally subtracted and are written to the lower 48
 * digits of a.
 *
 * a   A single precision number to reduce in place. Digits 0..95 are
 *     accessed; the result is left in digits 0..47.
 * m   The single precision number representing the modulus.
 * mp  The digit representing the negative inverse of m mod 2^n.
 */
SP_NOINLINE static void sp_3072_mont_reduce_avx2_48(sp_digit* a, sp_digit* m,
        sp_digit mp)
{
    /* Carry out of the top digit, passed from one loop pass to the next. */
    sp_digit ca = 0;

    /* Register use inside the loop:
     *   rcx       loop counter i
     *   r9        zero; the xorq that clears it also clears CF and OF
     *   rdx       mu, the implicit multiplier operand of mulxq
     *   rax:r8    low:high halves of each m[j] * mu product
     *   r10, r11  alternating accumulators for a[i+j]
     *   r12       a[i+1]; kept in a register as the next pass's a[i]
     * The [a] operand is advanced by one digit per pass, so it points at
     * the original a + 48 when the statement ends.
     */
    __asm__ __volatile__ (
        "# i = 0\n\t"
        "movq	0(%[a]), %%r12\n\t"
        "xorq	%%rcx, %%rcx\n\t"
        "\nL_mont_loop_avx2_48:\n\t"
        "xorq	%%r9, %%r9\n\t"
        "movq	%%r12, %%r10\n\t"
        "# mu = a[i] * mp\n\t"
        "movq	%%r12, %%rdx\n\t"
        "mulxq	%[mp], %%rdx, %%r8\n\t"
        "# a[i+0] += m[0] * mu\n\t"
        "mulxq	0(%[m]), %%rax, %%r8\n\t"
        "movq	8(%[a]), %%r12\n\t"
        "adcxq	%%rax, %%r10\n\t"
        "adoxq	%%r8, %%r12\n\t"
        "# a[i+1] += m[1] * mu\n\t"
        "mulxq	8(%[m]), %%rax, %%r8\n\t"
        "movq	16(%[a]), %%r10\n\t"
        "adcxq	%%rax, %%r12\n\t"
        "adoxq	%%r8, %%r10\n\t"
        "# a[i+2] += m[2] * mu\n\t"
        "mulxq	16(%[m]), %%rax, %%r8\n\t"
        "movq	24(%[a]), %%r11\n\t"
        "adcxq	%%rax, %%r10\n\t"
        "adoxq	%%r8, %%r11\n\t"
        "movq	%%r10, 16(%[a])\n\t"
        "# a[i+3] += m[3] * mu\n\t"
        "mulxq	24(%[m]), %%rax, %%r8\n\t"
        "movq	32(%[a]), %%r10\n\t"
        "adcxq	%%rax, %%r11\n\t"
        "adoxq	%%r8, %%r10\n\t"
        "movq	%%r11, 24(%[a])\n\t"
        "# a[i+4] += m[4] * mu\n\t"
        "mulxq	32(%[m]), %%rax, %%r8\n\t"
        "movq	40(%[a]), %%r11\n\t"
        "adcxq	%%rax, %%r10\n\t"
        "adoxq	%%r8, %%r11\n\t"
        "movq	%%r10, 32(%[a])\n\t"
        "# a[i+5] += m[5] * mu\n\t"
        "mulxq	40(%[m]), %%rax, %%r8\n\t"
        "movq	48(%[a]), %%r10\n\t"
        "adcxq	%%rax, %%r11\n\t"
        "adoxq	%%r8, %%r10\n\t"
        "movq	%%r11, 40(%[a])\n\t"
        "# a[i+6] += m[6] * mu\n\t"
        "mulxq	48(%[m]), %%rax, %%r8\n\t"
        "movq	56(%[a]), %%r11\n\t"
        "adcxq	%%rax, %%r10\n\t"
        "adoxq	%%r8, %%r11\n\t"
        "movq	%%r10, 48(%[a])\n\t"
        "# a[i+7] += m[7] * mu\n\t"
        "mulxq	56(%[m]), %%rax, %%r8\n\t"
        "movq	64(%[a]), %%r10\n\t"
        "adcxq	%%rax, %%r11\n\t"
        "adoxq	%%r8, %%r10\n\t"
        "movq	%%r11, 56(%[a])\n\t"
        "# a[i+8] += m[8] * mu\n\t"
        "mulxq	64(%[m]), %%rax, %%r8\n\t"
        "movq	72(%[a]), %%r11\n\t"
        "adcxq	%%rax, %%r10\n\t"
        "adoxq	%%r8, %%r11\n\t"
        "movq	%%r10, 64(%[a])\n\t"
        "# a[i+9] += m[9] * mu\n\t"
        "mulxq	72(%[m]), %%rax, %%r8\n\t"
        "movq	80(%[a]), %%r10\n\t"
        "adcxq	%%rax, %%r11\n\t"
        "adoxq	%%r8, %%r10\n\t"
        "movq	%%r11, 72(%[a])\n\t"
        "# a[i+10] += m[10] * mu\n\t"
        "mulxq	80(%[m]), %%rax, %%r8\n\t"
        "movq	88(%[a]), %%r11\n\t"
        "adcxq	%%rax, %%r10\n\t"
        "adoxq	%%r8, %%r11\n\t"
        "movq	%%r10, 80(%[a])\n\t"
        "# a[i+11] += m[11] * mu\n\t"
        "mulxq	88(%[m]), %%rax, %%r8\n\t"
        "movq	96(%[a]), %%r10\n\t"
        "adcxq	%%rax, %%r11\n\t"
        "adoxq	%%r8, %%r10\n\t"
        "movq	%%r11, 88(%[a])\n\t"
        "# a[i+12] += m[12] * mu\n\t"
        "mulxq	96(%[m]), %%rax, %%r8\n\t"
        "movq	104(%[a]), %%r11\n\t"
        "adcxq	%%rax, %%r10\n\t"
        "adoxq	%%r8, %%r11\n\t"
        "movq	%%r10, 96(%[a])\n\t"
        "# a[i+13] += m[13] * mu\n\t"
        "mulxq	104(%[m]), %%rax, %%r8\n\t"
        "movq	112(%[a]), %%r10\n\t"
        "adcxq	%%rax, %%r11\n\t"
        "adoxq	%%r8, %%r10\n\t"
        "movq	%%r11, 104(%[a])\n\t"
        "# a[i+14] += m[14] * mu\n\t"
        "mulxq	112(%[m]), %%rax, %%r8\n\t"
        "movq	120(%[a]), %%r11\n\t"
        "adcxq	%%rax, %%r10\n\t"
        "adoxq	%%r8, %%r11\n\t"
        "movq	%%r10, 112(%[a])\n\t"
        "# a[i+15] += m[15] * mu\n\t"
        "mulxq	120(%[m]), %%rax, %%r8\n\t"
        "movq	128(%[a]), %%r10\n\t"
        "adcxq	%%rax, %%r11\n\t"
        "adoxq	%%r8, %%r10\n\t"
        "movq	%%r11, 120(%[a])\n\t"
        "# a[i+16] += m[16] * mu\n\t"
        "mulxq	128(%[m]), %%rax, %%r8\n\t"
        "movq	136(%[a]), %%r11\n\t"
        "adcxq	%%rax, %%r10\n\t"
        "adoxq	%%r8, %%r11\n\t"
        "movq	%%r10, 128(%[a])\n\t"
        "# a[i+17] += m[17] * mu\n\t"
        "mulxq	136(%[m]), %%rax, %%r8\n\t"
        "movq	144(%[a]), %%r10\n\t"
        "adcxq	%%rax, %%r11\n\t"
        "adoxq	%%r8, %%r10\n\t"
        "movq	%%r11, 136(%[a])\n\t"
        "# a[i+18] += m[18] * mu\n\t"
        "mulxq	144(%[m]), %%rax, %%r8\n\t"
        "movq	152(%[a]), %%r11\n\t"
        "adcxq	%%rax, %%r10\n\t"
        "adoxq	%%r8, %%r11\n\t"
        "movq	%%r10, 144(%[a])\n\t"
        "# a[i+19] += m[19] * mu\n\t"
        "mulxq	152(%[m]), %%rax, %%r8\n\t"
        "movq	160(%[a]), %%r10\n\t"
        "adcxq	%%rax, %%r11\n\t"
        "adoxq	%%r8, %%r10\n\t"
        "movq	%%r11, 152(%[a])\n\t"
        "# a[i+20] += m[20] * mu\n\t"
        "mulxq	160(%[m]), %%rax, %%r8\n\t"
        "movq	168(%[a]), %%r11\n\t"
        "adcxq	%%rax, %%r10\n\t"
        "adoxq	%%r8, %%r11\n\t"
        "movq	%%r10, 160(%[a])\n\t"
        "# a[i+21] += m[21] * mu\n\t"
        "mulxq	168(%[m]), %%rax, %%r8\n\t"
        "movq	176(%[a]), %%r10\n\t"
        "adcxq	%%rax, %%r11\n\t"
        "adoxq	%%r8, %%r10\n\t"
        "movq	%%r11, 168(%[a])\n\t"
        "# a[i+22] += m[22] * mu\n\t"
        "mulxq	176(%[m]), %%rax, %%r8\n\t"
        "movq	184(%[a]), %%r11\n\t"
        "adcxq	%%rax, %%r10\n\t"
        "adoxq	%%r8, %%r11\n\t"
        "movq	%%r10, 176(%[a])\n\t"
        "# a[i+23] += m[23] * mu\n\t"
        "mulxq	184(%[m]), %%rax, %%r8\n\t"
        "movq	192(%[a]), %%r10\n\t"
        "adcxq	%%rax, %%r11\n\t"
        "adoxq	%%r8, %%r10\n\t"
        "movq	%%r11, 184(%[a])\n\t"
        "# a[i+24] += m[24] * mu\n\t"
        "mulxq	192(%[m]), %%rax, %%r8\n\t"
        "movq	200(%[a]), %%r11\n\t"
        "adcxq	%%rax, %%r10\n\t"
        "adoxq	%%r8, %%r11\n\t"
        "movq	%%r10, 192(%[a])\n\t"
        "# a[i+25] += m[25] * mu\n\t"
        "mulxq	200(%[m]), %%rax, %%r8\n\t"
        "movq	208(%[a]), %%r10\n\t"
        "adcxq	%%rax, %%r11\n\t"
        "adoxq	%%r8, %%r10\n\t"
        "movq	%%r11, 200(%[a])\n\t"
        "# a[i+26] += m[26] * mu\n\t"
        "mulxq	208(%[m]), %%rax, %%r8\n\t"
        "movq	216(%[a]), %%r11\n\t"
        "adcxq	%%rax, %%r10\n\t"
        "adoxq	%%r8, %%r11\n\t"
        "movq	%%r10, 208(%[a])\n\t"
        "# a[i+27] += m[27] * mu\n\t"
        "mulxq	216(%[m]), %%rax, %%r8\n\t"
        "movq	224(%[a]), %%r10\n\t"
        "adcxq	%%rax, %%r11\n\t"
        "adoxq	%%r8, %%r10\n\t"
        "movq	%%r11, 216(%[a])\n\t"
        "# a[i+28] += m[28] * mu\n\t"
        "mulxq	224(%[m]), %%rax, %%r8\n\t"
        "movq	232(%[a]), %%r11\n\t"
        "adcxq	%%rax, %%r10\n\t"
        "adoxq	%%r8, %%r11\n\t"
        "movq	%%r10, 224(%[a])\n\t"
        "# a[i+29] += m[29] * mu\n\t"
        "mulxq	232(%[m]), %%rax, %%r8\n\t"
        "movq	240(%[a]), %%r10\n\t"
        "adcxq	%%rax, %%r11\n\t"
        "adoxq	%%r8, %%r10\n\t"
        "movq	%%r11, 232(%[a])\n\t"
        "# a[i+30] += m[30] * mu\n\t"
        "mulxq	240(%[m]), %%rax, %%r8\n\t"
        "movq	248(%[a]), %%r11\n\t"
        "adcxq	%%rax, %%r10\n\t"
        "adoxq	%%r8, %%r11\n\t"
        "movq	%%r10, 240(%[a])\n\t"
        "# a[i+31] += m[31] * mu\n\t"
        "mulxq	248(%[m]), %%rax, %%r8\n\t"
        "movq	256(%[a]), %%r10\n\t"
        "adcxq	%%rax, %%r11\n\t"
        "adoxq	%%r8, %%r10\n\t"
        "movq	%%r11, 248(%[a])\n\t"
        "# a[i+32] += m[32] * mu\n\t"
        "mulxq	256(%[m]), %%rax, %%r8\n\t"
        "movq	264(%[a]), %%r11\n\t"
        "adcxq	%%rax, %%r10\n\t"
        "adoxq	%%r8, %%r11\n\t"
        "movq	%%r10, 256(%[a])\n\t"
        "# a[i+33] += m[33] * mu\n\t"
        "mulxq	264(%[m]), %%rax, %%r8\n\t"
        "movq	272(%[a]), %%r10\n\t"
        "adcxq	%%rax, %%r11\n\t"
        "adoxq	%%r8, %%r10\n\t"
        "movq	%%r11, 264(%[a])\n\t"
        "# a[i+34] += m[34] * mu\n\t"
        "mulxq	272(%[m]), %%rax, %%r8\n\t"
        "movq	280(%[a]), %%r11\n\t"
        "adcxq	%%rax, %%r10\n\t"
        "adoxq	%%r8, %%r11\n\t"
        "movq	%%r10, 272(%[a])\n\t"
        "# a[i+35] += m[35] * mu\n\t"
        "mulxq	280(%[m]), %%rax, %%r8\n\t"
        "movq	288(%[a]), %%r10\n\t"
        "adcxq	%%rax, %%r11\n\t"
        "adoxq	%%r8, %%r10\n\t"
        "movq	%%r11, 280(%[a])\n\t"
        "# a[i+36] += m[36] * mu\n\t"
        "mulxq	288(%[m]), %%rax, %%r8\n\t"
        "movq	296(%[a]), %%r11\n\t"
        "adcxq	%%rax, %%r10\n\t"
        "adoxq	%%r8, %%r11\n\t"
        "movq	%%r10, 288(%[a])\n\t"
        "# a[i+37] += m[37] * mu\n\t"
        "mulxq	296(%[m]), %%rax, %%r8\n\t"
        "movq	304(%[a]), %%r10\n\t"
        "adcxq	%%rax, %%r11\n\t"
        "adoxq	%%r8, %%r10\n\t"
        "movq	%%r11, 296(%[a])\n\t"
        "# a[i+38] += m[38] * mu\n\t"
        "mulxq	304(%[m]), %%rax, %%r8\n\t"
        "movq	312(%[a]), %%r11\n\t"
        "adcxq	%%rax, %%r10\n\t"
        "adoxq	%%r8, %%r11\n\t"
        "movq	%%r10, 304(%[a])\n\t"
        "# a[i+39] += m[39] * mu\n\t"
        "mulxq	312(%[m]), %%rax, %%r8\n\t"
        "movq	320(%[a]), %%r10\n\t"
        "adcxq	%%rax, %%r11\n\t"
        "adoxq	%%r8, %%r10\n\t"
        "movq	%%r11, 312(%[a])\n\t"
        "# a[i+40] += m[40] * mu\n\t"
        "mulxq	320(%[m]), %%rax, %%r8\n\t"
        "movq	328(%[a]), %%r11\n\t"
        "adcxq	%%rax, %%r10\n\t"
        "adoxq	%%r8, %%r11\n\t"
        "movq	%%r10, 320(%[a])\n\t"
        "# a[i+41] += m[41] * mu\n\t"
        "mulxq	328(%[m]), %%rax, %%r8\n\t"
        "movq	336(%[a]), %%r10\n\t"
        "adcxq	%%rax, %%r11\n\t"
        "adoxq	%%r8, %%r10\n\t"
        "movq	%%r11, 328(%[a])\n\t"
        "# a[i+42] += m[42] * mu\n\t"
        "mulxq	336(%[m]), %%rax, %%r8\n\t"
        "movq	344(%[a]), %%r11\n\t"
        "adcxq	%%rax, %%r10\n\t"
        "adoxq	%%r8, %%r11\n\t"
        "movq	%%r10, 336(%[a])\n\t"
        "# a[i+43] += m[43] * mu\n\t"
        "mulxq	344(%[m]), %%rax, %%r8\n\t"
        "movq	352(%[a]), %%r10\n\t"
        "adcxq	%%rax, %%r11\n\t"
        "adoxq	%%r8, %%r10\n\t"
        "movq	%%r11, 344(%[a])\n\t"
        "# a[i+44] += m[44] * mu\n\t"
        "mulxq	352(%[m]), %%rax, %%r8\n\t"
        "movq	360(%[a]), %%r11\n\t"
        "adcxq	%%rax, %%r10\n\t"
        "adoxq	%%r8, %%r11\n\t"
        "movq	%%r10, 352(%[a])\n\t"
        "# a[i+45] += m[45] * mu\n\t"
        "mulxq	360(%[m]), %%rax, %%r8\n\t"
        "movq	368(%[a]), %%r10\n\t"
        "adcxq	%%rax, %%r11\n\t"
        "adoxq	%%r8, %%r10\n\t"
        "movq	%%r11, 360(%[a])\n\t"
        "# a[i+46] += m[46] * mu\n\t"
        "mulxq	368(%[m]), %%rax, %%r8\n\t"
        "movq	376(%[a]), %%r11\n\t"
        "adcxq	%%rax, %%r10\n\t"
        "adoxq	%%r8, %%r11\n\t"
        "movq	%%r10, 368(%[a])\n\t"
        "# a[i+47] += m[47] * mu\n\t"
        "mulxq	376(%[m]), %%rax, %%r8\n\t"
        "movq	384(%[a]), %%r10\n\t"
        "adcxq	%%rax, %%r11\n\t"
        "adoxq	%%r8, %%r10\n\t"
        "movq	%%r11, 376(%[a])\n\t"
        /* a[i+48] += previous carry; then ca = CF + OF for the next pass. */
        "adcxq	%[ca], %%r10\n\t"
        "movq	%%r9, %[ca]\n\t"
        "adoxq	%%r9, %[ca]\n\t"
        "adcxq	%%r9, %[ca]\n\t"
        "movq	%%r10, 384(%[a])\n\t"
        "# i += 1\n\t"
        "addq	$8, %[a]\n\t"
        "addq	$1, %%rcx\n\t"
        "cmpq	$48, %%rcx\n\t"
        "jl	L_mont_loop_avx2_48\n\t"
        /* Write back the digit still held in r12 (original a[48]). */
        "movq	%%r12, 0(%[a])\n\t"
        : [ca] "+r" (ca), [a] "+r" (a)
        : [m] "r" (m), [mp] "r" (mp)
        : "memory", "rax", "rdx", "rcx", "r8", "r9", "r10", "r11", "r12"
    );

    /* a now points at the upper half. Write it to the lower half,
     * subtracting m when a carry came out of the top digit.
     */
    sp_3072_cond_sub_48(a - 48, a, m, (sp_digit)0 - ca);
}
#endif /* HAVE_INTEL_AVX2 */

#ifdef HAVE_INTEL_AVX2
/* Multiply two Montgomery form numbers mod the modulus (prime).
 * (r = a * b mod m)
 *
 * The product is written to r by sp_3072_mul_avx2_48() and then reduced in
 * place by sp_3072_mont_reduce_avx2_48(), which accesses 96 digits of r.
 *
 * r   Result of multiplication. Must have room for 96 digits.
 * a   First number to multiply in Montgomery form.
 * b   Second number to multiply in Montgomery form.
 * m   Modulus (prime).
 * mp  Montgomery multiplier.
 */
static void sp_3072_mont_mul_avx2_48(sp_digit* r, sp_digit* a, sp_digit* b,
        sp_digit* m, sp_digit mp)
{
    sp_3072_mul_avx2_48(r, a, b);
    sp_3072_mont_reduce_avx2_48(r, m, mp);
}

#endif /* HAVE_INTEL_AVX2 */
#ifdef HAVE_INTEL_AVX2
/* Square the Montgomery form number. (r = a * a mod m)
 *
 * The square is written to r by sp_3072_sqr_avx2_48() and then reduced in
 * place by sp_3072_mont_reduce_avx2_48(), which accesses 96 digits of r.
 *
 * r   Result of squaring. Must have room for 96 digits.
 * a   Number to square in Montgomery form.
 * m   Modulus (prime).
 * mp  Montgomery multiplier.
 */
static void sp_3072_mont_sqr_avx2_48(sp_digit* r, sp_digit* a, sp_digit* m,
        sp_digit mp)
{
    sp_3072_sqr_avx2_48(r, a);
    sp_3072_mont_reduce_avx2_48(r, m, mp);
}

#endif /* HAVE_INTEL_AVX2 */
#if defined(SP_RSA_PRIVATE_EXP_D) || defined(WOLFSSL_HAVE_SP_DH)
#ifdef HAVE_INTEL_AVX2
/* Modular exponentiation using a fixed 5-bit window. (r = a^e mod m)
 * Uses the AVX2 Montgomery multiply, square and reduce routines.
 *
 * A table of 32 entries is built from the base and the exponent is consumed
 * five bits at a time starting at the most significant word: five squarings
 * followed by one multiplication by the table entry selected by the window.
 *
 * r        Result of the operation. The upper 48 digits are cleared before
 *          the final reduction, so room for 96 digits is required.
 * a        Number being exponentiated.
 * e        Exponent.
 * bits     The number of bits in the exponent.
 * m        Modulus.
 * reduceA  When non-zero, a is reduced modulo m before it is used.
 * returns 0 on success and MEMORY_E on dynamic memory allocation failure.
 */
static int sp_3072_mod_exp_avx2_48(sp_digit* r, sp_digit* a, sp_digit* e,
        int bits, sp_digit* m, int reduceA)
{
#ifdef WOLFSSL_SMALL_STACK
    sp_digit* t[32];
    sp_digit* td;
#else
    sp_digit t[32][96];
#endif
    sp_digit* norm;
    sp_digit mp = 1;
    sp_digit cur;
    sp_digit mask;
    int err = MP_OKAY;
    int i;
    int k;
    int avail;
    int idx;

#ifdef WOLFSSL_SMALL_STACK
    /* One allocation holds all 32 table entries of 96 digits each. */
    td = (sp_digit*)XMALLOC(sizeof(sp_digit) * 32 * 96, NULL,
                            DYNAMIC_TYPE_TMP_BUFFER);
    if (td != NULL) {
        for (i = 0; i < 32; i++)
            t[i] = td + i * 96;
        norm = t[0];
    }
    else {
        err = MEMORY_E;
    }
#else
    norm = t[0];
#endif

    if (err == MP_OKAY) {
        sp_3072_mont_setup(m, &mp);
        /* Entry 0 is filled by sp_3072_mont_norm_48(). */
        sp_3072_mont_norm_48(norm, m);

        /* Entry 1: place the base in the upper 48 digits above 48 zero
         * digits and reduce the 96 digit value modulo m.
         */
        XMEMSET(t[1], 0, sizeof(sp_digit) * 48);
        if (reduceA)
            err = sp_3072_mod_48(t[1] + 48, a, m);
        else
            XMEMCPY(t[1] + 48, a, sizeof(sp_digit) * 48);
        if (err == MP_OKAY)
            err = sp_3072_mod_48(t[1], t[1], m);
    }

    if (err == MP_OKAY) {
        /* Entries 2..31: t[2k] = t[k]^2 and t[2k+1] = t[k+1] * t[k]. */
        for (k = 1; k < 16; k++) {
            sp_3072_mont_sqr_avx2_48(t[2 * k], t[k], m, mp);
            sp_3072_mont_mul_avx2_48(t[2 * k + 1], t[k + 1], t[k], m, mp);
        }

        /* Start from the top five bits of the most significant word. */
        i = (bits - 1) / 64;
        cur = e[i--];
        idx = cur >> 59;
        cur <<= 5;
        avail = 59;
        XMEMCPY(r, t[idx], sizeof(sp_digit) * 48);
        while (i >= 0 || avail >= 5) {
            if (avail >= 5) {
                /* A whole window is available in the current word. */
                idx = (cur >> 59) & 0x1f;
                cur <<= 5;
                avail -= 5;
            }
            else if (avail == 0) {
                /* Current word used up: start on the next one. */
                cur = e[i--];
                idx = cur >> 59;
                cur <<= 5;
                avail = 59;
            }
            else {
                /* Window straddles two words: join the leftover bits with
                 * the top bits of the next word.
                 */
                idx = cur >> 59;
                cur = e[i--];
                avail = 5 - avail;
                idx |= cur >> (64 - avail);
                cur <<= avail;
                avail = 64 - avail;
            }

            for (k = 0; k < 5; k++)
                sp_3072_mont_sqr_avx2_48(r, r, m, mp);

            sp_3072_mont_mul_avx2_48(r, r, t[idx], m, mp);
        }
        /* Fewer than five bits remain: finish with a short window. */
        idx = e[0] & ((1 << avail) - 1);
        while (avail > 0) {
            sp_3072_mont_sqr_avx2_48(r, r, m, mp);
            avail--;
        }
        sp_3072_mont_mul_avx2_48(r, r, t[idx], m, mp);

        /* Clear the upper half and run one more Montgomery reduction. */
        XMEMSET(&r[48], 0, sizeof(sp_digit) * 48);
        sp_3072_mont_reduce_avx2_48(r, m, mp);

        /* Subtract m once more when the result compares >= m. */
        mask = 0 - (sp_3072_cmp_48(r, m) >= 0);
        sp_3072_cond_sub_48(r, r, m, mask);
    }

#ifdef WOLFSSL_SMALL_STACK
    if (td != NULL)
        XFREE(td, NULL, DYNAMIC_TYPE_TMP_BUFFER);
#endif

    return err;
}
#endif /* HAVE_INTEL_AVX2 */
#endif /* SP_RSA_PRIVATE_EXP_D || WOLFSSL_HAVE_SP_DH */

#ifdef WOLFSSL_HAVE_SP_RSA
/* RSA public key operation.
 *
 * Computes in ^ em mod mm for a 3072-bit modulus and writes the result as
 * 384 bytes.
 *
 * in      Array of bytes representing the number to exponentiate, base.
 * inLen   Number of bytes in base. Rejected when greater than 384.
 * em      Public exponent. Rejected when longer than 64 bits.
 * mm      Modulus. Rejected unless it is exactly 3072 bits long.
 * out     Buffer to hold big-endian bytes of exponentiation result.
 *         Must be at least 384 bytes long.
 * outLen  On entry the size of out in bytes; set to 384 on success.
 * returns 0 on success, MP_TO_E when the outLen is too small, MP_READ_E when
 * an array is too long or the modulus is not 3072 bits, MP_EXPTMOD_E when the
 * exponent word read from em is zero and MEMORY_E when dynamic memory
 * allocation fails.
 */
int sp_RsaPublic_3072(const byte* in, word32 inLen, mp_int* em, mp_int* mm,
    byte* out, word32* outLen)
{
#if !defined(WOLFSSL_SP_SMALL) && !defined(WOLFSSL_SMALL_STACK)
    sp_digit ad[96], md[48], rd[96];
#else
    sp_digit* d = NULL;
#endif
    sp_digit* a;
    sp_digit *ah;
    sp_digit* m;
    sp_digit* r;
    /* The whole exponent is held in one digit. */
    sp_digit e[1];
    int err = MP_OKAY;
#ifdef HAVE_INTEL_AVX2
    word32 cpuid_flags = cpuid_get_flags();
#endif

    /* Validate sizes before touching any working memory. */
    if (*outLen < 384)
        err = MP_TO_E;
    if (err == MP_OKAY && (mp_count_bits(em) > 64 || inLen > 384 ||
                                                     mp_count_bits(mm) != 3072))
        err = MP_READ_E;

#if defined(WOLFSSL_SP_SMALL) || defined(WOLFSSL_SMALL_STACK)
    if (err == MP_OKAY) {
        /* One allocation of 5 * 48 digits shared by a, r and m. */
        d = (sp_digit*)XMALLOC(sizeof(sp_digit) * 48 * 5, NULL,
                               DYNAMIC_TYPE_TMP_BUFFER);
        if (d == NULL)
            err = MEMORY_E;
    }

    if (err == MP_OKAY) {
        /* Layout: a = 96 digits, r = 96 digits, m = 48 digits. */
        a = d;
        r = a + 48 * 2;
        m = r + 48 * 2;
        ah = a + 48;
    }
#else
    a = ad;
    m = md;
    r = rd;
    ah = a + 48;
#endif

    if (err == MP_OKAY) {
        /* The base is loaded into the upper half of a. */
        sp_3072_from_bin(ah, 48, in, inLen);
#if DIGIT_BIT >= 64
        e[0] = em->dp[0];
#else
        /* Only the two lowest mp digits of the exponent are combined. */
        e[0] = em->dp[0];
        if (em->used > 1)
            e[0] |= ((sp_digit)em->dp[1]) << DIGIT_BIT;
#endif
        if (e[0] == 0)
            err = MP_EXPTMOD_E;
    }
    if (err == MP_OKAY) {
        sp_3072_from_mp(m, 48, mm);

        if (e[0] == 0x3) {
            /* Exponent 3: one square and one multiply, each followed by a
             * reduction modulo m.
             */
#ifdef HAVE_INTEL_AVX2
            if (IS_INTEL_BMI2(cpuid_flags) && IS_INTEL_ADX(cpuid_flags)) {
                if (err == MP_OKAY) {
                    sp_3072_sqr_avx2_48(r, ah);
                    err = sp_3072_mod_48_cond(r, r, m);
                }
                if (err == MP_OKAY) {
                    sp_3072_mul_avx2_48(r, ah, r);
                    err = sp_3072_mod_48_cond(r, r, m);
                }
            }
            else
#endif
            {
                if (err == MP_OKAY) {
                    sp_3072_sqr_48(r, ah);
                    err = sp_3072_mod_48_cond(r, r, m);
                }
                if (err == MP_OKAY) {
                    sp_3072_mul_48(r, ah, r);
                    err = sp_3072_mod_48_cond(r, r, m);
                }
            }
        }
        else {
            int i;
            sp_digit mp;

            sp_3072_mont_setup(m, &mp);

            /* Convert to Montgomery form. */
            /* Zeroing the low half leaves the base in the high half of a,
             * which is then reduced modulo m.
             */
            XMEMSET(a, 0, sizeof(sp_digit) * 48);
            err = sp_3072_mod_48_cond(a, a, m);

            if (err == MP_OKAY) {
                /* Find the index of the top set bit of the exponent; e[0] is
                 * known to be non-zero here.
                 */
                for (i=63; i>=0; i--)
                    if (e[0] >> i)
                        break;

                /* Start from the base; the top bit is consumed by this copy. */
                XMEMCPY(r, a, sizeof(sp_digit) * 48);
#ifdef HAVE_INTEL_AVX2
                if (IS_INTEL_BMI2(cpuid_flags) && IS_INTEL_ADX(cpuid_flags)) {
                    /* Square for every remaining bit, multiply when it is
                     * set.
                     */
                    for (i--; i>=0; i--) {
                        sp_3072_mont_sqr_avx2_48(r, r, m, mp);
                        if (((e[0] >> i) & 1) == 1)
                            sp_3072_mont_mul_avx2_48(r, r, a, m, mp);
                    }
                    /* Clear the high half before the final reduction. */
                    XMEMSET(&r[48], 0, sizeof(sp_digit) * 48);
                    sp_3072_mont_reduce_avx2_48(r, m, mp);
                }
                else
#endif
                {
                    /* Square for every remaining bit, multiply when it is
                     * set.
                     */
                    for (i--; i>=0; i--) {
                        sp_3072_mont_sqr_48(r, r, m, mp);
                        if (((e[0] >> i) & 1) == 1)
                            sp_3072_mont_mul_48(r, r, a, m, mp);
                    }
                    /* Clear the high half before the final reduction. */
                    XMEMSET(&r[48], 0, sizeof(sp_digit) * 48);
                    sp_3072_mont_reduce_48(r, m, mp);
                }

                /* Locate the most significant digit where r and m differ and
                 * subtract m once when r is not smaller than m.
                 */
                for (i = 47; i > 0; i--) {
                    if (r[i] != m[i])
                        break;
                }
                if (r[i] >= m[i])
                    sp_3072_sub_in_place_48(r, m);
            }
        }
    }

    if (err == MP_OKAY) {
        sp_3072_to_bin(r, out);
        *outLen = 384;
    }

#if defined(WOLFSSL_SP_SMALL) || defined(WOLFSSL_SMALL_STACK)
    if (d != NULL)
        XFREE(d, NULL, DYNAMIC_TYPE_TMP_BUFFER);
#endif

    return err;
}

/* RSA private key operation.
 *
 * Uses the CRT values: the base is exponentiated modulo each prime and the
 * two halves are recombined using the inverse of q.
 *
 * in      Array of bytes representing the number to exponentiate, base.
 * inLen   Number of bytes in base. Rejected when greater than 384.
 * dm      Private exponent. Not used by this implementation.
 * pm      First prime.
 * qm      Second prime.
 * dpm     First prime's CRT exponent.
 * dqm     Second prime's CRT exponent.
 * qim     Inverse of second prime mod p.
 * mm      Modulus. Only its bit length (must be 3072) is checked.
 * out     Buffer to hold big-endian bytes of exponentiation result.
 *         Must be at least 384 bytes long.
 * outLen  On entry the size of out in bytes; set to 384 on success.
 * returns 0 on success, MP_TO_E when the outLen is too small, MP_READ_E when
 * an array is too long or the modulus is not 3072 bits and MEMORY_E when
 * dynamic memory allocation fails.
 */
int sp_RsaPrivate_3072(const byte* in, word32 inLen, mp_int* dm,
    mp_int* pm, mp_int* qm, mp_int* dpm, mp_int* dqm, mp_int* qim, mp_int* mm,
    byte* out, word32* outLen)
{
#if !defined(WOLFSSL_SP_SMALL) && !defined(WOLFSSL_SMALL_STACK)
    sp_digit ad[48 * 2];
    sp_digit pd[24], qd[24], dpd[24];
    sp_digit tmpad[48], tmpbd[48];
#else
    sp_digit* t = NULL;
#endif
    sp_digit* a;
    sp_digit* p;
    sp_digit* q;
    sp_digit* dp;
    sp_digit* dq;
    sp_digit* qi;
    sp_digit* tmp;
    sp_digit* tmpa;
    sp_digit* tmpb;
    sp_digit* r;
    sp_digit c;
    int err = MP_OKAY;
#ifdef HAVE_INTEL_AVX2
    word32 cpuid_flags = cpuid_get_flags();
#endif

    (void)dm;
    (void)mm;

    if (*outLen < 384)
        err = MP_TO_E;
    if (err == MP_OKAY && (inLen > 384 || mp_count_bits(mm) != 3072))
        err = MP_READ_E;

#if defined(WOLFSSL_SP_SMALL) || defined(WOLFSSL_SMALL_STACK)
    if (err == MP_OKAY) {
        /* One allocation of 11 * 24 digits shared by all temporaries. */
        t = (sp_digit*)XMALLOC(sizeof(sp_digit) * 24 * 11, NULL,
                               DYNAMIC_TYPE_TMP_BUFFER);
        if (t == NULL)
            err = MEMORY_E;
    }
    if (err == MP_OKAY) {
        /* Layout: a = 96, p = 24, q = 24, dp/dq/qi = 24 (shared),
         * tmpa = 48, tmpb = 48 digits.
         */
        a = t;
        p = a + 48 * 2;
        q = p + 24;
        qi = dq = dp = q + 24;
        tmpa = qi + 24;
        tmpb = tmpa + 48;

        /* tmp and r reuse the storage of a once the base is consumed. */
        tmp = t;
        r = tmp + 48;
    }
#else
    r = a = ad;
    p = pd;
    q = qd;
    /* dp, dq and qi are needed one after another so share one buffer. */
    qi = dq = dp = dpd;
    tmpa = tmpad;
    tmpb = tmpbd;
    /* tmp lives in the upper half of ad. */
    tmp = a + 48;
#endif

    if (err == MP_OKAY) {
        sp_3072_from_bin(a, 48, in, inLen);
        sp_3072_from_mp(p, 24, pm);
        sp_3072_from_mp(q, 24, qm);
        sp_3072_from_mp(dp, 24, dpm);

        /* tmpa = a ^ dp mod p */
#ifdef HAVE_INTEL_AVX2
        if (IS_INTEL_BMI2(cpuid_flags) && IS_INTEL_ADX(cpuid_flags))
            err = sp_3072_mod_exp_avx2_24(tmpa, a, dp, 1536, p, 1);
        else
#endif
            err = sp_3072_mod_exp_24(tmpa, a, dp, 1536, p, 1);
    }
    if (err == MP_OKAY) {
        /* tmpb = a ^ dq mod q */
        sp_3072_from_mp(dq, 24, dqm);
#ifdef HAVE_INTEL_AVX2
        if (IS_INTEL_BMI2(cpuid_flags) && IS_INTEL_ADX(cpuid_flags))
            err = sp_3072_mod_exp_avx2_24(tmpb, a, dq, 1536, q, 1);
        else
#endif
            err = sp_3072_mod_exp_24(tmpb, a, dq, 1536, q, 1);
    }

    if (err == MP_OKAY) {
        /* tmpa = tmpa - tmpb, adding p back when the subtraction borrowed.
         * The masked copy of p is written into tmp.
         */
        c = sp_3072_sub_in_place_24(tmpa, tmpb);
        sp_3072_mask_24(tmp, p, c);
        sp_3072_add_24(tmpa, tmpa, tmp);

        /* tmpa = (tmpa * qi) mod p */
        sp_3072_from_mp(qi, 24, qim);
#ifdef HAVE_INTEL_AVX2
        if (IS_INTEL_BMI2(cpuid_flags) && IS_INTEL_ADX(cpuid_flags))
            sp_3072_mul_avx2_24(tmpa, tmpa, qi);
        else
#endif
            sp_3072_mul_24(tmpa, tmpa, qi);
        err = sp_3072_mod_24(tmpa, tmpa, p);
    }

    if (err == MP_OKAY) {
        /* r = tmpb + q * tmpa */
#ifdef HAVE_INTEL_AVX2
        if (IS_INTEL_BMI2(cpuid_flags) && IS_INTEL_ADX(cpuid_flags))
            sp_3072_mul_avx2_24(tmpa, q, tmpa);
        else
#endif
            sp_3072_mul_24(tmpa, q, tmpa);
        /* Widen tmpb to 48 digits before the full width addition. */
        XMEMSET(&tmpb[24], 0, sizeof(sp_digit) * 24);
        sp_3072_add_48(r, tmpb, tmpa);

        sp_3072_to_bin(r, out);
        *outLen = 384;
    }

    /* Wipe every buffer that held private key material or the result. */
#if defined(WOLFSSL_SP_SMALL) || defined(WOLFSSL_SMALL_STACK)
    if (t != NULL) {
        XMEMSET(t, 0, sizeof(sp_digit) * 24 * 11);
        XFREE(t, NULL, DYNAMIC_TYPE_TMP_BUFFER);
    }
#else
    /* ad holds the result in its low half and the masked prime written
     * through tmp in its high half, so it must be cleared as well.
     */
    XMEMSET(ad, 0, sizeof(ad));
    XMEMSET(tmpad, 0, sizeof(tmpad));
    XMEMSET(tmpbd, 0, sizeof(tmpbd));
    XMEMSET(pd, 0, sizeof(pd));
    XMEMSET(qd, 0, sizeof(qd));
    XMEMSET(dpd, 0, sizeof(dpd));
#endif

    return err;
}
#endif /* WOLFSSL_HAVE_SP_RSA */
#ifdef WOLFSSL_HAVE_SP_DH
/* Convert an array of sp_digit to an mp_int.
 *
 * The 48 words of a are repacked into DIGIT_BIT sized digits of r, least
 * significant first, and r is clamped afterwards.
 *
 * a  A single precision integer.
 * r  A multi-precision integer.
 * returns the error from mp_grow when r cannot be resized, otherwise MP_OKAY.
 */
static int sp_3072_to_mp(sp_digit* a, mp_int* r)
{
    int err;

    err = mp_grow(r, (3072 + DIGIT_BIT - 1) / DIGIT_BIT);
    if (err == MP_OKAY) {
#if DIGIT_BIT == 64
        XMEMCPY(r->dp, a, sizeof(sp_digit) * 48);
        r->used = 48;
        mp_clamp(r);
#elif DIGIT_BIT < 64
        int i, j = 0, s = 0;

        r->dp[0] = 0;
        for (i = 0; i < 48; i++) {
            /* Top up the current digit with the low bits of this word. */
            r->dp[j] |= a[i] << s;
            r->dp[j] &= (1l << DIGIT_BIT) - 1;
            s = DIGIT_BIT - s;
            r->dp[++j] = a[i] >> s;
            while (s + DIGIT_BIT <= 64) {
                s += DIGIT_BIT;
                r->dp[j] &= (1l << DIGIT_BIT) - 1;
                /* When the word is used up exactly (s == 64) a shift by 64
                 * would be undefined: start the next digit at zero instead,
                 * and write nothing past the last digit after the final
                 * word.
                 */
                if (s < 64)
                    r->dp[++j] = a[i] >> s;
                else if (i + 1 < 48)
                    r->dp[++j] = 0;
            }
            s = 64 - s;
        }
        r->used = (3072 + DIGIT_BIT - 1) / DIGIT_BIT;
        mp_clamp(r);
#else
        int i, j = 0, s = 0;

        r->dp[0] = 0;
        for (i = 0; i < 48; i++) {
            r->dp[j] |= ((mp_digit)a[i]) << s;
            if (s + 64 >= DIGIT_BIT) {
    #if DIGIT_BIT < 64
                r->dp[j] &= (1l << DIGIT_BIT) - 1;
    #endif
                s = DIGIT_BIT - s;
                /* s == 64 means the word ended exactly on a digit boundary;
                 * avoid the undefined shift by 64.
                 */
                if (s < 64)
                    r->dp[++j] = a[i] >> s;
                else if (i + 1 < 48)
                    r->dp[++j] = 0;
                s = 64 - s;
            }
            else
                s += 64;
        }
        r->used = (3072 + DIGIT_BIT - 1) / DIGIT_BIT;
        mp_clamp(r);
#endif
    }

    return err;
}

/* Perform the modular exponentiation for Diffie-Hellman.
 *
 * base  Base. MP integer of at most 3072 bits.
 * exp   Exponent. MP integer of at most 3072 bits.
 * mod   Modulus. MP integer of exactly 3072 bits.
 * res   Result. MP integer.
 * returns 0 on success, MP_READ_E if there are too many bytes in an array
 * or the modulus is not 3072 bits, otherwise the error reported by the
 * exponentiation or by the conversion of the result.
 */
int sp_ModExp_3072(mp_int* base, mp_int* exp, mp_int* mod, mp_int* res)
{
    sp_digit baseDigits[96];
    sp_digit expDigits[48];
    sp_digit modDigits[48];
    /* The result is produced in the same buffer that held the base. */
    sp_digit* result = baseDigits;
    int err;
#ifdef HAVE_INTEL_AVX2
    word32 cpuid_flags = cpuid_get_flags();
#endif
    int expBits = mp_count_bits(exp);

    if (mp_count_bits(base) <= 3072 && expBits <= 3072 &&
                                                   mp_count_bits(mod) == 3072) {
        err = MP_OKAY;
    }
    else {
        err = MP_READ_E;
    }

    if (err == MP_OKAY) {
        sp_3072_from_mp(baseDigits, 48, base);
        sp_3072_from_mp(expDigits, 48, exp);
        sp_3072_from_mp(modDigits, 48, mod);

#ifdef HAVE_INTEL_AVX2
        if (IS_INTEL_BMI2(cpuid_flags) && IS_INTEL_ADX(cpuid_flags))
            err = sp_3072_mod_exp_avx2_48(result, baseDigits, expDigits,
                                          expBits, modDigits, 0);
        else
#endif
            err = sp_3072_mod_exp_48(result, baseDigits, expDigits, expBits,
                                     modDigits, 0);

        if (err == MP_OKAY)
            err = sp_3072_to_mp(result, res);
    }

    /* The exponent is the secret value: clear the local copy. */
    XMEMSET(expDigits, 0, sizeof(expDigits));

    return err;
}

/* Perform the modular exponentiation for Diffie-Hellman.
 *
 * The result is written big-endian with leading zero bytes removed.
 *
 * base     Base. At most 3072 bits.
 * exp      Array of bytes that is the exponent.
 * expLen   Length of data, in bytes, in exponent. At most 384.
 * mod      Modulus. Exactly 3072 bits.
 * out      Buffer to hold big-endian bytes of exponentiation result.
 *          Must be at least 384 bytes long.
 * outLen   Length, in bytes, of exponentiation result.
 * returns 0 on success, MP_READ_E if there are too many bytes in an array
 * or the modulus is not 3072 bits, otherwise the error reported by the
 * exponentiation.
 */
int sp_DhExp_3072(mp_int* base, const byte* exp, word32 expLen,
    mp_int* mod, byte* out, word32* outLen)
{
    sp_digit baseDigits[96];
    sp_digit expDigits[48];
    sp_digit modDigits[48];
    /* The result is produced in the same buffer that held the base. */
    sp_digit* result = baseDigits;
    int err;
#ifdef HAVE_INTEL_AVX2
    word32 cpuid_flags = cpuid_get_flags();
#endif

    if (mp_count_bits(base) <= 3072 && expLen <= 384 &&
                                                   mp_count_bits(mod) == 3072) {
        err = MP_OKAY;
    }
    else {
        err = MP_READ_E;
    }

    if (err == MP_OKAY) {
        sp_3072_from_mp(baseDigits, 48, base);
        sp_3072_from_bin(expDigits, 48, exp, expLen);
        sp_3072_from_mp(modDigits, 48, mod);

#ifdef HAVE_INTEL_AVX2
        if (IS_INTEL_BMI2(cpuid_flags) && IS_INTEL_ADX(cpuid_flags))
            err = sp_3072_mod_exp_avx2_48(result, baseDigits, expDigits,
                                          expLen * 8, modDigits, 0);
        else
#endif
            err = sp_3072_mod_exp_48(result, baseDigits, expDigits, expLen * 8,
                                     modDigits, 0);
    }

    if (err == MP_OKAY) {
        word32 lead = 0;

        sp_3072_to_bin(result, out);

        /* Count the leading zero bytes and shift the rest to the front. */
        while (lead < 384 && out[lead] == 0)
            lead++;
        *outLen = 384 - lead;
        XMEMMOVE(out, out + lead, *outLen);
    }

    /* The exponent is the secret value: clear the local copy. */
    XMEMSET(expDigits, 0, sizeof(expDigits));

    return err;
}
#endif /* WOLFSSL_HAVE_SP_DH */

#endif /* WOLFSSL_SP_NO_3072 */

#endif /* WOLFSSL_HAVE_SP_RSA || WOLFSSL_HAVE_SP_DH */
#ifdef WOLFSSL_HAVE_SP_ECC
#ifndef WOLFSSL_SP_NO_256

/* Point structure to use.
 *
 * Each ordinate has room for 2 * 4 digits; the conversion from an ecc_point
 * in this file clears all of them and fills only the low 4.
 */
typedef struct sp_point {
    /* X ordinate. */
    sp_digit x[2 * 4];
    /* Y ordinate. */
    sp_digit y[2 * 4];
    /* Z ordinate. */
    sp_digit z[2 * 4];
    /* Flag for the point at infinity; 0 for the points initialized in this
     * file.
     */
    int infinity;
} sp_point;

/* The modulus (prime) of the curve P256.
 * As with all the constants below, index 0 is the least significant word.
 */
static sp_digit p256_mod[4] = {
    0xffffffffffffffffl,0x00000000ffffffffl,0x0000000000000000l,
    0xffffffff00000001l
};
/* The Montgomery normalizer for modulus of the curve P256. */
static sp_digit p256_norm_mod[4] = {
    0x0000000000000001l,0xffffffff00000000l,0xffffffffffffffffl,
    0x00000000fffffffel
};
/* The Montgomery multiplier for modulus of the curve P256. */
static sp_digit p256_mp_mod = 0x0000000000000001;
#if defined(WOLFSSL_VALIDATE_ECC_KEYGEN) || defined(HAVE_ECC_SIGN) || \
                                            defined(HAVE_ECC_VERIFY)
/* The order of the curve P256. */
static sp_digit p256_order[4] = {
    0xf3b9cac2fc632551l,0xbce6faada7179e84l,0xffffffffffffffffl,
    0xffffffff00000000l
};
#endif
/* The order of the curve P256 minus 2. */
static sp_digit p256_order2[4] = {
    0xf3b9cac2fc63254fl,0xbce6faada7179e84l,0xffffffffffffffffl,
    0xffffffff00000000l
};
#if defined(HAVE_ECC_SIGN) || defined(HAVE_ECC_VERIFY)
/* The Montgomery normalizer for order of the curve P256. */
static sp_digit p256_norm_order[4] = {
    0x0c46353d039cdaafl,0x4319055258e8617bl,0x0000000000000000l,
    0x00000000ffffffffl
};
#endif
#if defined(HAVE_ECC_SIGN) || defined(HAVE_ECC_VERIFY)
/* The Montgomery multiplier for order of the curve P256. */
static sp_digit p256_mp_order = 0xccd1c8aaee00bc4fl;
#endif
#ifdef WOLFSSL_SP_SMALL
/* The base point of curve P256. */
static sp_point p256_base = {
    /* X ordinate */
    {
        0xf4a13945d898c296l,0x77037d812deb33a0l,0xf8bce6e563a440f2l,
        0x6b17d1f2e12c4247l
    },
    /* Y ordinate */
    {
        0xcbb6406837bf51f5l,0x2bce33576b315ecel,0x8ee7eb4a7c0f9e16l,
        0x4fe342e2fe1a7f9bl
    },
    /* Z ordinate */
    {
        0x0000000000000001l,0x0000000000000000l,0x0000000000000000l,
        0x0000000000000000l
    },
    /* infinity */
    0
};
#endif /* WOLFSSL_SP_SMALL */
#if defined(HAVE_ECC_CHECK_KEY) || defined(HAVE_COMP_KEY)
/* Curve constant b of P256.
 * NOTE(review): the value matches the published P-256 'b' coefficient;
 * confirm against the key check and point decompression code that uses it.
 */
static sp_digit p256_b[4] = {
    0x3bce3c3e27d2604bl,0x651d06b0cc53b0f6l,0xb3ebbd55769886bcl,
    0x5ac635d8aa3a93e7l
};
#endif

/* sp_ecc_point_new(heap, sp, p)
 *
 * Make p refer to storage for one sp_point and evaluate to an error code.
 * The whole expansion and every argument are parenthesized so that the macro
 * can be used inside a larger expression.
 *
 * heap  Heap hint passed to the allocator (dynamic variant only).
 * sp    Local sp_point object to point at (stack variant only).
 * p     Pointer that receives the address of the point.
 * evaluates to MEMORY_E when no storage is available, otherwise MP_OKAY.
 */
#if defined(WOLFSSL_SP_SMALL) || defined(WOLFSSL_SMALL_STACK)
/* Allocate memory for point and return error. */
#define sp_ecc_point_new(heap, sp, p)                                       \
    ((((p) = XMALLOC(sizeof(sp_point), (heap), DYNAMIC_TYPE_ECC)) == NULL) ? \
        MEMORY_E : MP_OKAY)
#else
/* Set pointer to data and return no error. */
#define sp_ecc_point_new(heap, sp, p)   \
    ((((p) = &(sp)) == NULL) ? MEMORY_E : MP_OKAY)
#endif

/* sp_ecc_point_free(p, clear, heap)
 *
 * Release a point obtained with sp_ecc_point_new.
 *
 * p      Pointer to the point. May be NULL in the dynamic variant.
 * clear  When non-zero the point data is zeroed first.
 * heap   Heap hint passed to the allocator (dynamic variant only).
 */
#if defined(WOLFSSL_SP_SMALL) || defined(WOLFSSL_SMALL_STACK)
/* If valid pointer then clear point data if requested and free data. */
#define sp_ecc_point_free(p, clear, heap)         \
    do {                                          \
        if ((p) != NULL) {                        \
            if (clear)                            \
                XMEMSET((p), 0, sizeof(*(p)));    \
            XFREE((p), (heap), DYNAMIC_TYPE_ECC); \
        }                                         \
    }                                             \
    while (0)
#else
/* Clear point data if requested. */
#define sp_ecc_point_free(p, clear, heap)  \
    do {                                   \
        if (clear)                         \
            XMEMSET((p), 0, sizeof(*(p))); \
    }                                      \
    while (0)
#endif

/* Multiply a number by Montgomery normalizer mod modulus (prime).
 *
 * The input is split into eight 32-bit pieces, combined with small signed
 * coefficients into eight accumulators, and the carries are then propagated
 * twice before the pieces are packed back into four 64-bit digits.
 *
 * r  The resulting Montgomery form number.
 * a  The number to convert.
 * m  The modulus (prime). Not used: the coefficients are fixed.
 * returns MP_OKAY always.
 */
static int sp_256_mod_mul_norm_4(sp_digit* r, sp_digit* a, sp_digit* m)
{
    int64_t t[8];
    int64_t a32[8];
    int64_t o;

    (void)m;

    /* Split each 64-bit digit into two 32-bit halves. */
    a32[0] = a[0] & 0xffffffff;
    a32[1] = a[0] >> 32;
    a32[2] = a[1] & 0xffffffff;
    a32[3] = a[1] >> 32;
    a32[4] = a[2] & 0xffffffff;
    a32[5] = a[2] >> 32;
    a32[6] = a[3] & 0xffffffff;
    a32[7] = a[3] >> 32;

    /* Each comment row lists the coefficients applied to a32[0..7]. */
    /*  1  1  0 -1 -1 -1 -1  0 */
    t[0] = 0 + a32[0] + a32[1] - a32[3] - a32[4] - a32[5] - a32[6];
    /*  0  1  1  0 -1 -1 -1 -1 */
    t[1] = 0 + a32[1] + a32[2] - a32[4] - a32[5] - a32[6] - a32[7];
    /*  0  0  1  1  0 -1 -1 -1 */
    t[2] = 0 + a32[2] + a32[3] - a32[5] - a32[6] - a32[7];
    /* -1 -1  0  2  2  1  0 -1 */
    t[3] = 0 - a32[0] - a32[1] + 2 * a32[3] + 2 * a32[4] + a32[5] - a32[7];
    /*  0 -1 -1  0  2  2  1  0 */
    t[4] = 0 - a32[1] - a32[2] + 2 * a32[4] + 2 * a32[5] + a32[6];
    /*  0  0 -1 -1  0  2  2  1 */
    t[5] = 0 - a32[2] - a32[3] + 2 * a32[5] + 2 * a32[6] + a32[7];
    /* -1 -1  0  0  0  1  3  2 */
    t[6] = 0 - a32[0] - a32[1] + a32[5] + 3 * a32[6] + 2 * a32[7];
    /*  1  0 -1 -1 -1 -1  0  3 */
    t[7] = 0 + a32[0] - a32[2] - a32[3] - a32[4] - a32[5] + 3 * a32[7];

    /* First carry pass. The accumulators may be negative, so the right
     * shifts rely on the compiler performing an arithmetic shift.
     */
    t[1] += t[0] >> 32; t[0] &= 0xffffffff;
    t[2] += t[1] >> 32; t[1] &= 0xffffffff;
    t[3] += t[2] >> 32; t[2] &= 0xffffffff;
    t[4] += t[3] >> 32; t[3] &= 0xffffffff;
    t[5] += t[4] >> 32; t[4] &= 0xffffffff;
    t[6] += t[5] >> 32; t[5] &= 0xffffffff;
    t[7] += t[6] >> 32; t[6] &= 0xffffffff;
    /* Fold the overflow of the top piece back into the lower pieces. */
    o     = t[7] >> 32; t[7] &= 0xffffffff;
    t[0] += o;
    t[3] -= o;
    t[6] -= o;
    t[7] += o;
    /* Second carry pass. */
    t[1] += t[0] >> 32; t[0] &= 0xffffffff;
    t[2] += t[1] >> 32; t[1] &= 0xffffffff;
    t[3] += t[2] >> 32; t[2] &= 0xffffffff;
    t[4] += t[3] >> 32; t[3] &= 0xffffffff;
    t[5] += t[4] >> 32; t[4] &= 0xffffffff;
    t[6] += t[5] >> 32; t[5] &= 0xffffffff;
    t[7] += t[6] >> 32; t[6] &= 0xffffffff;
    /* Pack pairs of 32-bit pieces into 64-bit digits. The shifts are done on
     * unsigned values: shifting a signed piece with bit 31 set left by 32 is
     * undefined behaviour.
     */
    r[0] = ((uint64_t)t[1] << 32) | (uint64_t)t[0];
    r[1] = ((uint64_t)t[3] << 32) | (uint64_t)t[2];
    r[2] = ((uint64_t)t[5] << 32) | (uint64_t)t[4];
    r[3] = ((uint64_t)t[7] << 32) | (uint64_t)t[6];

    return MP_OKAY;
}

/* Convert an mp_int to an array of sp_digit.
 *
 * At most max digits of r are written; any digits of r not covered by a are
 * set to zero. Bits of a that do not fit in max digits are dropped.
 *
 * r    A single precision integer.
 * max  Number of digits available in r.
 * a    A multi-precision integer.
 */
static void sp_256_from_mp(sp_digit* r, int max, mp_int* a)
{
#if DIGIT_BIT == 64
    int j;
    int n = a->used;

    /* Never copy more digits than r can hold, as in the other branches. */
    if (n > max)
        n = max;
    if (n > 0)
        XMEMCPY(r, a->dp, sizeof(sp_digit) * n);

    for (j = n; j < max; j++)
        r[j] = 0;
#elif DIGIT_BIT > 64
    int i, j = 0, s = 0;

    r[0] = 0;
    for (i = 0; i < a->used && j < max; i++) {
        /* Finish the current word with the low bits of this digit. */
        r[j] |= a->dp[i] << s;
        r[j] &= 0xffffffffffffffffl;
        s = 64 - s;
        if (j + 1 >= max)
            break;
        r[++j] = a->dp[i] >> s;
        /* Emit further whole words while the digit has bits left. */
        while (s + 64 <= DIGIT_BIT) {
            s += 64;
            r[j] &= 0xffffffffffffffffl;
            if (j + 1 >= max)
                break;
            if (s < DIGIT_BIT)
                r[++j] = a->dp[i] >> s;
            else
                r[++j] = 0;
        }
        s = DIGIT_BIT - s;
    }

    for (j++; j < max; j++)
        r[j] = 0;
#else
    int i, j = 0, s = 0;

    r[0] = 0;
    for (i = 0; i < a->used && j < max; i++) {
        r[j] |= ((sp_digit)a->dp[i]) << s;
        if (s + DIGIT_BIT >= 64) {
            /* This digit completes the current word. */
            r[j] &= 0xffffffffffffffffl;
            if (j + 1 >= max)
                break;
            s = 64 - s;
            if (s == DIGIT_BIT) {
                /* Digit ended exactly on the word boundary. */
                r[++j] = 0;
                s = 0;
            }
            else {
                /* Carry the remaining high bits into the next word. */
                r[++j] = a->dp[i] >> s;
                s = DIGIT_BIT - s;
            }
        }
        else
            s += DIGIT_BIT;
    }

    for (j++; j < max; j++)
        r[j] = 0;
#endif
}

/* Convert a point of type ecc_point to type sp_point.
 *
 * Every ordinate buffer is cleared in full before its low 4 digits are
 * loaded, and the point is marked as not being at infinity.
 *
 * p   Point of type sp_point (result).
 * pm  Point of type ecc_point.
 */
static void sp_256_point_from_ecc_point_4(sp_point* p, ecc_point* pm)
{
    /* X ordinate. */
    XMEMSET(p->x, 0, sizeof(p->x));
    sp_256_from_mp(p->x, 4, pm->x);

    /* Y ordinate. */
    XMEMSET(p->y, 0, sizeof(p->y));
    sp_256_from_mp(p->y, 4, pm->y);

    /* Z ordinate. */
    XMEMSET(p->z, 0, sizeof(p->z));
    sp_256_from_mp(p->z, 4, pm->z);

    p->infinity = 0;
}

/* Convert an array of sp_digit to an mp_int.
 *
 * The 4 words of a are repacked into DIGIT_BIT sized digits of r, least
 * significant first, and r is clamped afterwards.
 *
 * a  A single precision integer.
 * r  A multi-precision integer.
 * returns the error from mp_grow when r cannot be resized, otherwise MP_OKAY.
 */
static int sp_256_to_mp(sp_digit* a, mp_int* r)
{
    int err;

    err = mp_grow(r, (256 + DIGIT_BIT - 1) / DIGIT_BIT);
    if (err == MP_OKAY) {
#if DIGIT_BIT == 64
        XMEMCPY(r->dp, a, sizeof(sp_digit) * 4);
        r->used = 4;
        mp_clamp(r);
#elif DIGIT_BIT < 64
        int i, j = 0, s = 0;

        r->dp[0] = 0;
        for (i = 0; i < 4; i++) {
            /* Top up the current digit with the low bits of this word. */
            r->dp[j] |= a[i] << s;
            r->dp[j] &= (1l << DIGIT_BIT) - 1;
            s = DIGIT_BIT - s;
            r->dp[++j] = a[i] >> s;
            while (s + DIGIT_BIT <= 64) {
                s += DIGIT_BIT;
                r->dp[j] &= (1l << DIGIT_BIT) - 1;
                /* When the word is used up exactly (s == 64) a shift by 64
                 * would be undefined: start the next digit at zero instead,
                 * and write nothing past the last digit after the final
                 * word.
                 */
                if (s < 64)
                    r->dp[++j] = a[i] >> s;
                else if (i + 1 < 4)
                    r->dp[++j] = 0;
            }
            s = 64 - s;
        }
        r->used = (256 + DIGIT_BIT - 1) / DIGIT_BIT;
        mp_clamp(r);
#else
        int i, j = 0, s = 0;

        r->dp[0] = 0;
        for (i = 0; i < 4; i++) {
            r->dp[j] |= ((mp_digit)a[i]) << s;
            if (s + 64 >= DIGIT_BIT) {
    #if DIGIT_BIT < 64
                r->dp[j] &= (1l << DIGIT_BIT) - 1;
    #endif
                s = DIGIT_BIT - s;
                /* s == 64 means the word ended exactly on a digit boundary;
                 * avoid the undefined shift by 64.
                 */
                if (s < 64)
                    r->dp[++j] = a[i] >> s;
                else if (i + 1 < 4)
                    r->dp[++j] = 0;
                s = 64 - s;
            }
            else
                s += 64;
        }
        r->used = (256 + DIGIT_BIT - 1) / DIGIT_BIT;
        mp_clamp(r);
#endif
    }

    return err;
}

/* Convert a point of type sp_point to type ecc_point.
 *
 * The ordinates are converted in the order x, y, z and conversion stops at
 * the first failure.
 *
 * p   Point of type sp_point.
 * pm  Point of type ecc_point (result).
 * returns MEMORY_E when allocation of memory in ecc_point fails otherwise
 * MP_OKAY.
 */
static int sp_256_point_to_ecc_point_4(sp_point* p, ecc_point* pm)
{
    int err = sp_256_to_mp(p->x, pm->x);

    if (err != MP_OKAY)
        return err;

    err = sp_256_to_mp(p->y, pm->y);
    if (err != MP_OKAY)
        return err;

    return sp_256_to_mp(p->z, pm->z);
}

/* Conditionally copy a into r using the mask m.
 * m is -1 to copy and 0 when not.
 *
 * Each of the 4 words is updated as r ^= (r ^ a) & m, so the same
 * instructions run whatever the value of the mask.
 *
 * r  A single precision number to copy over.
 * a  A single precision number to copy.
 * m  Mask value to apply.
 */
static void sp_256_cond_copy_4(sp_digit* r, const sp_digit* a, const sp_digit m)
{
    __asm__ __volatile__ (
        /* Load r. */
        "movq	(%[r]), %%rax\n\t"
        "movq	8(%[r]), %%rcx\n\t"
        "movq	16(%[r]), %%rdx\n\t"
        "movq	24(%[r]), %%r8\n\t"
        /* Difference between r and a. */
        "xorq	(%[a]), %%rax\n\t"
        "xorq	8(%[a]), %%rcx\n\t"
        "xorq	16(%[a]), %%rdx\n\t"
        "xorq	24(%[a]), %%r8\n\t"
        /* Keep the difference only when the mask is set. */
        "andq	%[m], %%rax\n\t"
        "andq	%[m], %%rcx\n\t"
        "andq	%[m], %%rdx\n\t"
        "andq	%[m], %%r8\n\t"
        /* Apply the masked difference to r in memory. */
        "xorq	%%rax, (%[r])\n\t"
        "xorq	%%rcx, 8(%[r])\n\t"
        "xorq	%%rdx, 16(%[r])\n\t"
        "xorq	%%r8, 24(%[r])\n\t"
        :
        : [r] "r" (r), [a] "r" (a), [m] "r" (m)
        : "memory", "rax", "rcx", "rdx", "r8"
    );
}

/* Compare a with b in constant time.
 *
 * Words are compared from index 3 down to index 0. A mask (rdx) starts as
 * all ones and is cleared at the first difference, so that every later pair
 * of words is compared as zero against zero and cannot change the result.
 *
 * a  A single precision integer.
 * b  A single precision integer.
 * return -ve, 0 or +ve if a is less than, equal to or greater than b
 * respectively.
 */
static int64_t sp_256_cmp_4(sp_digit* a, sp_digit* b)
{
    sp_digit r = -1;
    sp_digit one = 1;

    __asm__ __volatile__ (
        "xorq	%%rcx, %%rcx\n\t"
        "movq	$-1, %%rdx\n\t"
        /* Word 3 (compared first). */
        "movq	24(%[a]), %%rbx\n\t"
        "movq	24(%[b]), %%r8\n\t"
        "andq	%%rdx, %%rbx\n\t"
        "andq	%%rdx, %%r8\n\t"
        "subq	%%r8, %%rbx\n\t"
        "cmova	%[one], %[r]\n\t"
        "cmovc	%%rdx, %[r]\n\t"
        "cmovnz	%%rcx, %%rdx\n\t"
        /* Word 2. */
        "movq	16(%[a]), %%rbx\n\t"
        "movq	16(%[b]), %%r8\n\t"
        "andq	%%rdx, %%rbx\n\t"
        "andq	%%rdx, %%r8\n\t"
        "subq	%%r8, %%rbx\n\t"
        "cmova	%[one], %[r]\n\t"
        "cmovc	%%rdx, %[r]\n\t"
        "cmovnz	%%rcx, %%rdx\n\t"
        /* Word 1. */
        "movq	8(%[a]), %%rbx\n\t"
        "movq	8(%[b]), %%r8\n\t"
        "andq	%%rdx, %%rbx\n\t"
        "andq	%%rdx, %%r8\n\t"
        "subq	%%r8, %%rbx\n\t"
        "cmova	%[one], %[r]\n\t"
        "cmovc	%%rdx, %[r]\n\t"
        "cmovnz	%%rcx, %%rdx\n\t"
        /* Word 0 (compared last). */
        "movq	0(%[a]), %%rbx\n\t"
        "movq	0(%[b]), %%r8\n\t"
        "andq	%%rdx, %%rbx\n\t"
        "andq	%%rdx, %%r8\n\t"
        "subq	%%r8, %%rbx\n\t"
        "cmova	%[one], %[r]\n\t"
        "cmovc	%%rdx, %[r]\n\t"
        "cmovnz	%%rcx, %%rdx\n\t"
        /* The mask is still all ones only when every word matched: the
         * exclusive or then turns the initial -1 into 0.
         */
        "xorq	%%rdx, %[r]\n\t"
        : [r] "+r" (r)
        : [a] "r" (a), [b] "r" (b), [one] "r" (one)
        /* "memory": the words of a and b are read through the pointers, so
         * earlier stores to them must be completed before the asm runs.
         */
        : "memory", "rax", "rdx", "rcx", "rbx", "r8"
    );

    return r;
}

/* Normalize the values in each word to 64 bits.
 *
 * Expands to nothing in this implementation.
 *
 * a  Array of sp_digit to normalize.
 */
#define sp_256_norm_4(a)

/* Conditionally subtract b from a using the mask m.
 * m is -1 to subtract and 0 when not subtracting.
 *
 * The masked copy of b is first written to a local buffer and then
 * subtracted from a with borrow propagation across the 4 words.
 *
 * r  A single precision number representing condition subtract result.
 * a  A single precision number to subtract from.
 * b  A single precision number to subtract.
 * m  Mask value to apply.
 * returns 0 when the subtraction did not borrow and all ones (-1) when it
 * did.
 */
static sp_digit sp_256_cond_sub_4(sp_digit* r, sp_digit* a, sp_digit* b,
        sp_digit m)
{
    /* Holds b & m. */
    sp_digit t[4];
    sp_digit c = 0;

    __asm__ __volatile__ (
        /* t = b & m */
        "movq	0(%[b]), %%rax\n\t"
        "movq	8(%[b]), %%rcx\n\t"
        "andq	%[m], %%rax\n\t"
        "andq	%[m], %%rcx\n\t"
        "movq	%%rax, 0(%[t])\n\t"
        "movq	%%rcx, 8(%[t])\n\t"
        "movq	16(%[b]), %%rax\n\t"
        "movq	24(%[b]), %%rcx\n\t"
        "andq	%[m], %%rax\n\t"
        "andq	%[m], %%rcx\n\t"
        "movq	%%rax, 16(%[t])\n\t"
        "movq	%%rcx, 24(%[t])\n\t"
        /* r = a - t, word by word; only moves sit between the subtract
         * instructions so the borrow flag is carried along.
         */
        "movq	(%[a]), %%rax\n\t"
        "movq	(%[t]), %%rdx\n\t"
        "subq	%%rdx,%%rax\n\t"
        "movq	8(%[a]), %%rcx\n\t"
        "movq	8(%[t]), %%rdx\n\t"
        "sbbq	%%rdx, %%rcx\n\t"
        "movq	%%rax, 0(%[r])\n\t"
        "movq	16(%[a]), %%rax\n\t"
        "movq	16(%[t]), %%rdx\n\t"
        "sbbq	%%rdx, %%rax\n\t"
        "movq	%%rcx, 8(%[r])\n\t"
        "movq	24(%[a]), %%rcx\n\t"
        "movq	24(%[t]), %%rdx\n\t"
        "sbbq	%%rdx, %%rcx\n\t"
        "movq	%%rax, 16(%[r])\n\t"
        "movq	%%rcx, 24(%[r])\n\t"
        /* c = 0 - borrow */
        "sbbq	$0, %[c]\n\t"
        : [c] "+r" (c)
        : [r] "r" (r), [a] "r" (a), [b] "r" (b), [m] "r" (m), [t] "r" (t)
        : "memory", "rax", "rcx", "rdx"
    );

    return c;
}

/* Sub b from a into r. (r = a - b)
 *
 * The 4 words are subtracted from index 0 upwards with the borrow carried
 * between them.
 *
 * r  A single precision integer.
 * a  A single precision integer.
 * b  A single precision integer.
 * returns 0 when the subtraction did not borrow and all ones (-1) when it
 * did.
 */
SP_NOINLINE static sp_digit sp_256_sub_4(sp_digit* r, const sp_digit* a,
        const sp_digit* b)
{
    sp_digit c = 0;

    __asm__ __volatile__ (
        /* Loads and stores are interleaved with the subtractions; moves do
         * not alter the borrow flag.
         */
        "movq	(%[a]), %%rax\n\t"
        "movq	(%[b]), %%rdx\n\t"
        "subq	%%rdx, %%rax\n\t"
        "movq	8(%[a]), %%rcx\n\t"
        "movq	8(%[b]), %%rdx\n\t"
        "sbbq	%%rdx, %%rcx\n\t"
        "movq	%%rax, 0(%[r])\n\t"
        "movq	16(%[a]), %%rax\n\t"
        "movq	16(%[b]), %%rdx\n\t"
        "sbbq	%%rdx, %%rax\n\t"
        "movq	%%rcx, 8(%[r])\n\t"
        "movq	24(%[a]), %%rcx\n\t"
        "movq	24(%[b]), %%rdx\n\t"
        "sbbq	%%rdx, %%rcx\n\t"
        "movq	%%rax, 16(%[r])\n\t"
        "movq	%%rcx, 24(%[r])\n\t"
        /* c = 0 - borrow */
        "sbbq	$0, %[c]\n\t"
        : [c] "+r" (c)
        : [r] "r" (r), [a] "r" (a), [b] "r" (b)
        : "memory", "rax", "rcx", "rdx"
    );

    return c;
}

/* Reduce the number back to 256 bits using Montgomery reduction.
 *
 * The loop runs 4 times; each pass computes mu = a[i] * mp and adds m * mu
 * to a starting at word i. After the loop the upper 4 words of a hold the
 * value, from which m is subtracted when the additions overflowed, and the
 * result is stored in the lower 4 words of a.
 *
 * a   A single precision number to reduce in place. All 8 words are read
 *     and written.
 * m   The single precision number representing the modulus.
 * mp  The digit representing the negative inverse of m mod 2^n.
 *
 * NOTE(review): the loop label is a plain named label, so this asm block can
 * only appear once in the assembled output - presumably one reason the
 * function is marked SP_NOINLINE.
 */
SP_NOINLINE static void sp_256_mont_reduce_4(sp_digit* a, sp_digit* m,
        sp_digit mp)
{
    __asm__ __volatile__ (
        /* r13 carries the overflow between passes, rcx counts bytes done
         * and r12 points at a[i].
         */
        "# i = 0\n\t"
        "xorq	%%r13, %%r13\n\t"
        "xorq	%%rcx, %%rcx\n\t"
        "movq	%[a], %%r12\n\t"
        "\nL_mont_loop_4:\n\t"
        "# mu = a[i] * mp\n\t"
        "movq	0(%%r12), %%r11\n\t"
        "imulq	%[mp], %%r11\n\t"
        "# a[i+0] += m[0] * mu\n\t"
        "movq	0(%[m]), %%rax\n\t"
        "movq	8(%[m]), %%r9\n\t"
        "mulq	%%r11\n\t"
        "movq	0(%%r12), %%rbx\n\t"
        "addq	%%rax,  %%rbx\n\t"
        "movq	%%rdx, %%r8\n\t"
        "movq	%%rbx, 0(%%r12)\n\t"
        "adcq	$0, %%r8\n\t"
        "# a[i+1] += m[1] * mu\n\t"
        "movq	%%r9, %%rax\n\t"
        "mulq	%%r11\n\t"
        "movq	16(%[m]), %%r9\n\t"
        "movq	8(%%r12), %%rbx\n\t"
        "addq	%%r8, %%rax\n\t"
        "movq	%%rdx, %%r10\n\t"
        "adcq	$0, %%r10\n\t"
        "addq	%%rax,  %%rbx\n\t"
        "movq	%%rbx, 8(%%r12)\n\t"
        "adcq	$0, %%r10\n\t"
        "# a[i+2] += m[2] * mu\n\t"
        "movq	%%r9, %%rax\n\t"
        "mulq	%%r11\n\t"
        "movq	24(%[m]), %%r9\n\t"
        "movq	16(%%r12), %%rbx\n\t"
        "addq	%%r10, %%rax\n\t"
        "movq	%%rdx, %%r8\n\t"
        "adcq	$0, %%r8\n\t"
        "addq	%%rax,  %%rbx\n\t"
        "movq	%%rbx, 16(%%r12)\n\t"
        "adcq	$0, %%r8\n\t"
        /* The last word also absorbs the overflow left by the previous pass
         * (r13) and records the new overflow in r13.
         */
        "# a[i+3] += m[3] * mu\n\t"
        "movq	%%r9, %%rax\n\t"
        "mulq	%%r11\n\t"
        "movq	24(%%r12), %%rbx\n\t"
        "addq	%%r8, %%rax\n\t"
        "adcq	%%r13, %%rdx\n\t"
        "movq	$0, %%r13\n\t"
        "adcq	$0, %%r13\n\t"
        "addq	%%rax, %%rbx\n\t"
        "movq	%%rbx, 24(%%r12)\n\t"
        "adcq	%%rdx, 32(%%r12)\n\t"
        "adcq	$0, %%r13\n\t"
        "# i += 1\n\t"
        "addq	$8, %%r12\n\t"
        "addq	$8, %%rcx\n\t"
        "cmpq	$32, %%rcx\n\t"
        "jl	L_mont_loop_4\n\t"
        /* rax = 0 - overflow: all ones when m must be subtracted. */
        "xorq	%%rax, %%rax\n\t"
        "movq	32(%[a]), %%rdx\n\t"
        "movq	40(%[a]), %%rcx\n\t"
        "movq	48(%[a]), %%rbx\n\t"
        "movq	56(%[a]), %%r8\n\t"
        "subq	%%r13, %%rax\n\t"
        "movq	0(%[m]), %%r9\n\t"
        "movq	8(%[m]), %%r10\n\t"
        "movq	16(%[m]), %%r11\n\t"
        "movq	24(%[m]), %%r12\n\t"
        /* Mask the modulus, then subtract it from the upper half. */
        "andq	%%rax, %%r9\n\t"
        "andq	%%rax, %%r10\n\t"
        "andq	%%rax, %%r11\n\t"
        "andq	%%rax, %%r12\n\t"
        "subq	%%r9, %%rdx\n\t"
        "sbbq	%%r10, %%rcx\n\t"
        "sbbq	%%r11, %%rbx\n\t"
        "sbbq	%%r12, %%r8\n\t"
        /* Store the reduced value in the lower 4 words of a. */
        "movq	%%rdx,   (%[a])\n\t"
        "movq	%%rcx,  8(%[a])\n\t"
        "movq	%%rbx, 16(%[a])\n\t"
        "movq	%%r8, 24(%[a])\n\t"
        :
        : [a] "r" (a), [m] "r" (m), [mp] "r" (mp)
        : "memory", "rax", "rbx", "rdx", "rcx", "r8", "r9", "r10", "r11",
          "r12", "r13"
    );
}

/* Multiply two Montgomery form numbers mod the modulus (prime).
 * (r = a * b mod m)
 *
 * The full 4x4 word product is accumulated in r8-r15 and is then reduced
 * with a fixed sequence of shifts, additions and subtractions. The reduction
 * uses the hard-coded constants 0xffffffff and 0xffffffff00000001 and never
 * dereferences m or uses the value of mp.
 * The registers holding a, b and m are reused as scratch once the operands
 * have been read, which is why they are in/out ("+r") operands.
 *
 * r   Result of multiplication (4 words are written).
 * a   First number to multiply in Montgomery form.
 * b   Second number to multiply in Montgomery form.
 * m   Modulus (prime). Not read by the assembly.
 * mp  Montgomery multiplier. Unused.
 */
SP_NOINLINE static void sp_256_mont_mul_4(sp_digit* r, sp_digit* a, sp_digit* b,
        sp_digit* m, sp_digit mp)
{
    (void)mp;

    __asm__ __volatile__ (
        "#  A[0] * B[0]\n\t"
        "movq	0(%[b]), %%rax\n\t"
        "mulq	0(%[a])\n\t"
        "movq	%%rax, %%r8\n\t"
        "movq	%%rdx, %%r9\n\t"
        "#  A[0] * B[1]\n\t"
        "movq	8(%[b]), %%rax\n\t"
        "mulq	0(%[a])\n\t"
        "xorq	%%r10, %%r10\n\t"
        "addq	%%rax, %%r9\n\t"
        "adcq	%%rdx, %%r10\n\t"
        "#  A[1] * B[0]\n\t"
        "movq	0(%[b]), %%rax\n\t"
        "mulq	8(%[a])\n\t"
        "xorq	%%r11, %%r11\n\t"
        "addq	%%rax, %%r9\n\t"
        "adcq	%%rdx, %%r10\n\t"
        "adcq	$0, %%r11\n\t"
        "#  A[0] * B[2]\n\t"
        "movq	16(%[b]), %%rax\n\t"
        "mulq	0(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "#  A[1] * B[1]\n\t"
        "movq	8(%[b]), %%rax\n\t"
        "mulq	8(%[a])\n\t"
        "xorq	%%r12, %%r12\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[2] * B[0]\n\t"
        "movq	0(%[b]), %%rax\n\t"
        "mulq	16(%[a])\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "adcq	$0, %%r12\n\t"
        "#  A[0] * B[3]\n\t"
        "movq	24(%[b]), %%rax\n\t"
        "mulq	0(%[a])\n\t"
        "xorq	%%r13, %%r13\n\t"
        "addq	%%rax, %%r11\n\t"
        "adcq	%%rdx, %%r12\n\t"
        "adcq	$0, %%r13\n\t"
        "#  A[1] * B[2]\n\t"
        "movq	16(%[b]), %%rax\n\t"
        "mulq	8(%[a])\n\t"
        "addq	%%rax, %%r11\n\t"
        "adcq	%%rdx, %%r12\n\t"
        "adcq	$0, %%r13\n\t"
        "#  A[2] * B[1]\n\t"
        "movq	8(%[b]), %%rax\n\t"
        "mulq	16(%[a])\n\t"
        "addq	%%rax, %%r11\n\t"
        "adcq	%%rdx, %%r12\n\t"
        "adcq	$0, %%r13\n\t"
        "#  A[3] * B[0]\n\t"
        "movq	0(%[b]), %%rax\n\t"
        "mulq	24(%[a])\n\t"
        "addq	%%rax, %%r11\n\t"
        "adcq	%%rdx, %%r12\n\t"
        "adcq	$0, %%r13\n\t"
        "#  A[1] * B[3]\n\t"
        "movq	24(%[b]), %%rax\n\t"
        "mulq	8(%[a])\n\t"
        "xorq	%%r14, %%r14\n\t"
        "addq	%%rax, %%r12\n\t"
        "adcq	%%rdx, %%r13\n\t"
        "adcq	$0, %%r14\n\t"
        "#  A[2] * B[2]\n\t"
        "movq	16(%[b]), %%rax\n\t"
        "mulq	16(%[a])\n\t"
        "addq	%%rax, %%r12\n\t"
        "adcq	%%rdx, %%r13\n\t"
        "adcq	$0, %%r14\n\t"
        "#  A[3] * B[1]\n\t"
        "movq	8(%[b]), %%rax\n\t"
        "mulq	24(%[a])\n\t"
        "addq	%%rax, %%r12\n\t"
        "adcq	%%rdx, %%r13\n\t"
        "adcq	$0, %%r14\n\t"
        "#  A[2] * B[3]\n\t"
        "movq	24(%[b]), %%rax\n\t"
        "mulq	16(%[a])\n\t"
        "xorq	%%r15, %%r15\n\t"
        "addq	%%rax, %%r13\n\t"
        "adcq	%%rdx, %%r14\n\t"
        "adcq	$0, %%r15\n\t"
        "#  A[3] * B[2]\n\t"
        "movq	16(%[b]), %%rax\n\t"
        "mulq	24(%[a])\n\t"
        "addq	%%rax, %%r13\n\t"
        "adcq	%%rdx, %%r14\n\t"
        "adcq	$0, %%r15\n\t"
        "#  A[3] * B[3]\n\t"
        "movq	24(%[b]), %%rax\n\t"
        "mulq	24(%[a])\n\t"
        "addq	%%rax, %%r14\n\t"
        "adcq	%%rdx, %%r15\n\t"
        /* From here on the a and b registers no longer hold pointers: they
         * carry words of mu together with rax and rdx. */
        "# Start Reduction\n\t"
        "movq	%%r8, %%rax\n\t"
        "movq	%%r9, %[a]\n\t"
        "movq	%%r10, %[b]\n\t"
        "movq	%%r11, %%rdx\n\t"
        "# mu = a[0]-a[3] + a[0]-a[2] << 32 << 64 + (a[0] * 2) << 192\n\t"
        "#    - a[0] << 32 << 192\n\t"
        "#   + (a[0] * 2) << 192\n\t"
        "addq	%%r8, %%rdx\n\t"
        "addq	%%r8, %%rdx\n\t"
        "#   a[0]-a[2] << 32\n\t"
        "shlq	$32, %%r8\n\t"
        "shldq	$32, %[a], %%r10\n\t"
        "shldq	$32, %%rax, %%r9\n\t"
        "#   - a[0] << 32 << 192\n\t"
        "subq	%%r8, %%rdx\n\t"
        "#   + a[0]-a[2] << 32 << 64\n\t"
        "addq	%%r8, %[a]\n\t"
        "adcq	%%r9, %[b]\n\t"
        "adcq	%%r10, %%rdx\n\t"
        /* r8 is used below as a running overflow/borrow accumulator for the
         * upper half (r12-r15) of the result. */
        "# a += (mu << 256) - (mu << 224) + (mu << 192) + (mu << 96) - mu\n\t"
        "#   a += mu << 256\n\t"
        "xorq	%%r8, %%r8\n\t"
        "addq	%%rax, %%r12\n\t"
        "adcq	%[a], %%r13\n\t"
        "adcq	%[b], %%r14\n\t"
        "adcq	%%rdx, %%r15\n\t"
        "sbbq	$0, %%r8\n\t"
        "#   a += mu << 192\n\t"
        "addq	%%rax, %%r11\n\t"
        "adcq	%[a], %%r12\n\t"
        "adcq	%[b], %%r13\n\t"
        "adcq	%%rdx, %%r14\n\t"
        "adcq	$0, %%r15\n\t"
        "sbbq	$0, %%r8\n\t"
        /* The m register is taken over as scratch for the bits shifted out
         * of the top word of mu. */
        "# mu <<= 32\n\t"
        "movq	%%rdx, %[m]\n\t"
        "shldq	$32, %[b], %%rdx\n\t"
        "shldq	$32, %[a], %[b]\n\t"
        "shldq	$32, %%rax, %[a]\n\t"
        "shlq	$32, %%rax\n\t"
        "shrq	$32, %[m]\n\t"
        "#   a += (mu << 32) << 64\n\t"
        "addq	%[b], %%r11\n\t"
        "adcq	%%rdx, %%r12\n\t"
        "adcq	%[m], %%r13\n\t"
        "adcq	$0, %%r14\n\t"
        "adcq	$0, %%r15\n\t"
        "sbbq	$0, %%r8\n\t"
        "#   a -= (mu << 32) << 192\n\t"
        "subq	%%rax, %%r11\n\t"
        "movq	$0xffffffff, %%rax\n\t"
        "sbbq	%[a], %%r12\n\t"
        "movq	$0xffffffff00000001, %[a]\n\t"
        "sbbq	%[b], %%r13\n\t"
        "sbbq	%%rdx, %%r14\n\t"
        "sbbq	%[m], %%r15\n\t"
        "adcq	$0, %%r8\n\t"
        /* r8 is used as the mask: the subtracted words are r8,
         * 0xffffffff & r8, 0 and 0xffffffff00000001 & r8. */
        "# mask m and sub from result if overflow\n\t"
        "#  m[0] = -1 & mask = mask\n\t"
        "andq	%%r8, %%rax\n\t"
        "#  m[2] =  0 & mask = 0\n\t"
        "andq	%%r8, %[a]\n\t"
        "subq	%%r8, %%r12\n\t"
        "sbbq	%%rax, %%r13\n\t"
        "sbbq	$0, %%r14\n\t"
        "sbbq	%[a], %%r15\n\t"
        "movq	%%r12, 0(%[r])\n\t"
        "movq	%%r13, 8(%[r])\n\t"
        "movq	%%r14, 16(%[r])\n\t"
        "movq	%%r15, 24(%[r])\n\t"
        : [m] "+r" (m), [a] "+r" (a), [b] "+r" (b)
        : [r] "r" (r)
        : "memory", "rax", "rdx", "r8", "r9", "r10", "r11",
          "r12", "r13", "r14", "r15"
    );
}

/* Square the Montgomery form number mod the modulus (prime). (r = a * a mod m)
 *
 * The off-diagonal products A[i] * A[j] (i < j) are summed once and doubled,
 * then the squares A[i] * A[i] are added in. The 8 word result in r8-r15 is
 * reduced with the same fixed shift/add/subtract sequence and hard-coded
 * constants (0xffffffff, 0xffffffff00000001) as sp_256_mont_mul_4.
 * The registers holding mp, m and (after the operand has been read) a are
 * used purely as scratch; neither the value of mp nor the memory at m is read.
 *
 * r   Result of squaring (4 words are written).
 * a   Number to square in Montgomery form.
 * m   Modulus (prime). Not read by the assembly.
 * mp  Montgomery multiplier. Incoming value is not used.
 */
SP_NOINLINE static void sp_256_mont_sqr_4(sp_digit* r, sp_digit* a, sp_digit* m,
        sp_digit mp)
{
    __asm__ __volatile__ (
        "#  A[0] * A[1]\n\t"
        "movq	0(%[a]), %%rax\n\t"
        "mulq	8(%[a])\n\t"
        "movq	%%rax, %%r9\n\t"
        "movq	%%rdx, %%r10\n\t"
        "#  A[0] * A[2]\n\t"
        "movq	0(%[a]), %%rax\n\t"
        "mulq	16(%[a])\n\t"
        "xorq	%%r11, %%r11\n\t"
        "addq	%%rax, %%r10\n\t"
        "adcq	%%rdx, %%r11\n\t"
        "#  A[0] * A[3]\n\t"
        "movq	0(%[a]), %%rax\n\t"
        "mulq	24(%[a])\n\t"
        "xorq	%%r12, %%r12\n\t"
        "addq	%%rax, %%r11\n\t"
        "adcq	%%rdx, %%r12\n\t"
        "#  A[1] * A[2]\n\t"
        "movq	8(%[a]), %%rax\n\t"
        "mulq	16(%[a])\n\t"
        "xorq	%%r13, %%r13\n\t"
        "addq	%%rax, %%r11\n\t"
        "adcq	%%rdx, %%r12\n\t"
        "adcq	$0, %%r13\n\t"
        "#  A[1] * A[3]\n\t"
        "movq	8(%[a]), %%rax\n\t"
        "mulq	24(%[a])\n\t"
        "addq	%%rax, %%r12\n\t"
        "adcq	%%rdx, %%r13\n\t"
        "#  A[2] * A[3]\n\t"
        "movq	16(%[a]), %%rax\n\t"
        "mulq	24(%[a])\n\t"
        "xorq	%%r14, %%r14\n\t"
        "addq	%%rax, %%r13\n\t"
        "adcq	%%rdx, %%r14\n\t"
        /* r9-r14 hold the sum of the cross products; double it with the
         * carry out going to r15. */
        "# Double\n\t"
        "xorq	%%r15, %%r15\n\t"
        "addq	%%r9, %%r9\n\t"
        "adcq	%%r10, %%r10\n\t"
        "adcq	%%r11, %%r11\n\t"
        "adcq	%%r12, %%r12\n\t"
        "adcq	%%r13, %%r13\n\t"
        "adcq	%%r14, %%r14\n\t"
        "adcq	$0, %%r15\n\t"
        /* The mp register carries the high word of each square into the
         * next pair of result words. */
        "#  A[0] * A[0]\n\t"
        "movq	0(%[a]), %%rax\n\t"
        "mulq	%%rax\n\t"
        "movq	%%rax, %%r8\n\t"
        "movq	%%rdx, %[mp]\n\t"
        "#  A[1] * A[1]\n\t"
        "movq	8(%[a]), %%rax\n\t"
        "mulq	%%rax\n\t"
        "addq	%[mp], %%r9\n\t"
        "adcq	%%rax, %%r10\n\t"
        "adcq	$0, %%rdx\n\t"
        "movq	%%rdx, %[mp]\n\t"
        "#  A[2] * A[2]\n\t"
        "movq	16(%[a]), %%rax\n\t"
        "mulq	%%rax\n\t"
        "addq	%[mp], %%r11\n\t"
        "adcq	%%rax, %%r12\n\t"
        "adcq	$0, %%rdx\n\t"
        "movq	%%rdx, %[mp]\n\t"
        "#  A[3] * A[3]\n\t"
        "movq	24(%[a]), %%rax\n\t"
        "mulq	%%rax\n\t"
        "addq	%%rax, %%r14\n\t"
        "adcq	%%rdx, %%r15\n\t"
        "addq	%[mp], %%r13\n\t"
        "adcq	$0, %%r14\n\t"
        "adcq	$0, %%r15\n\t"
        /* From here on the a and mp registers carry words of mu together
         * with rax and rdx. */
        "# Start Reduction\n\t"
        "movq	%%r8, %%rax\n\t"
        "movq	%%r9, %[a]\n\t"
        "movq	%%r10, %[mp]\n\t"
        "movq	%%r11, %%rdx\n\t"
        "# mu = a[0]-a[3] + a[0]-a[2] << 32 << 64 + (a[0] * 2) << 192\n\t"
        "#    - a[0] << 32 << 192\n\t"
        "#   + (a[0] * 2) << 192\n\t"
        "addq	%%r8, %%rdx\n\t"
        "addq	%%r8, %%rdx\n\t"
        "#   a[0]-a[2] << 32\n\t"
        "shlq	$32, %%r8\n\t"
        "shldq	$32, %[a], %%r10\n\t"
        "shldq	$32, %%rax, %%r9\n\t"
        "#   - a[0] << 32 << 192\n\t"
        "subq	%%r8, %%rdx\n\t"
        "#   + a[0]-a[2] << 32 << 64\n\t"
        "addq	%%r8, %[a]\n\t"
        "adcq	%%r9, %[mp]\n\t"
        "adcq	%%r10, %%rdx\n\t"
        /* r8 is used below as a running overflow/borrow accumulator for the
         * upper half (r12-r15) of the result. */
        "# a += (mu << 256) - (mu << 224) + (mu << 192) + (mu << 96) - mu\n\t"
        "#   a += mu << 256\n\t"
        "xorq	%%r8, %%r8\n\t"
        "addq	%%rax, %%r12\n\t"
        "adcq	%[a], %%r13\n\t"
        "adcq	%[mp], %%r14\n\t"
        "adcq	%%rdx, %%r15\n\t"
        "sbbq	$0, %%r8\n\t"
        "#   a += mu << 192\n\t"
        "addq	%%rax, %%r11\n\t"
        "adcq	%[a], %%r12\n\t"
        "adcq	%[mp], %%r13\n\t"
        "adcq	%%rdx, %%r14\n\t"
        "adcq	$0, %%r15\n\t"
        "sbbq	$0, %%r8\n\t"
        /* The m register is taken over as scratch for the bits shifted out
         * of the top word of mu. */
        "# mu <<= 32\n\t"
        "movq	%%rdx, %[m]\n\t"
        "shldq	$32, %[mp], %%rdx\n\t"
        "shldq	$32, %[a], %[mp]\n\t"
        "shldq	$32, %%rax, %[a]\n\t"
        "shlq	$32, %%rax\n\t"
        "shrq	$32, %[m]\n\t"
        "#   a += (mu << 32) << 64\n\t"
        "addq	%[mp], %%r11\n\t"
        "adcq	%%rdx, %%r12\n\t"
        "adcq	%[m], %%r13\n\t"
        "adcq	$0, %%r14\n\t"
        "adcq	$0, %%r15\n\t"
        "sbbq	$0, %%r8\n\t"
        "#   a -= (mu << 32) << 192\n\t"
        "subq	%%rax, %%r11\n\t"
        "movq	$0xffffffff, %%rax\n\t"
        "sbbq	%[a], %%r12\n\t"
        "movq	$0xffffffff00000001, %[a]\n\t"
        "sbbq	%[mp], %%r13\n\t"
        "sbbq	%%rdx, %%r14\n\t"
        "sbbq	%[m], %%r15\n\t"
        "adcq	$0, %%r8\n\t"
        /* r8 is used as the mask: the subtracted words are r8,
         * 0xffffffff & r8, 0 and 0xffffffff00000001 & r8. */
        "# mask m and sub from result if overflow\n\t"
        "#  m[0] = -1 & mask = mask\n\t"
        "andq	%%r8, %%rax\n\t"
        "#  m[2] =  0 & mask = 0\n\t"
        "andq	%%r8, %[a]\n\t"
        "subq	%%r8, %%r12\n\t"
        "sbbq	%%rax, %%r13\n\t"
        "sbbq	$0, %%r14\n\t"
        "sbbq	%[a], %%r15\n\t"
        "movq	%%r12, 0(%[r])\n\t"
        "movq	%%r13, 8(%[r])\n\t"
        "movq	%%r14, 16(%[r])\n\t"
        "movq	%%r15, 24(%[r])\n\t"
        : [m] "+r" (m), [a] "+r" (a), [mp] "+r" (mp)
        : [r] "r" (r)
        : "memory", "rax", "rdx", "r8", "r9", "r10", "r11",
          "r12", "r13", "r14", "r15"
    );
}

#ifndef WOLFSSL_SP_SMALL
/* Repeatedly square a Montgomery form number. (r = a ^ (2 ^ n) mod m)
 *
 * The first squaring reads a and writes r; every further squaring is done
 * in place on r. One squaring is always performed, even when n is less
 * than 1.
 *
 * r   Result of the repeated squaring.
 * a   Number to square in Montgomery form.
 * n   Number of times to square.
 * m   Modulus (prime).
 * mp  Montgomery multiplier.
 */
static void sp_256_mont_sqr_n_4(sp_digit* r, sp_digit* a, int n,
        sp_digit* m, sp_digit mp)
{
    int done = 1;

    sp_256_mont_sqr_4(r, a, m, mp);
    while (done < n) {
        sp_256_mont_sqr_4(r, r, m, mp);
        done++;
    }
}

#else
/* The P256 curve's modulus minus 2, least significant word first.
 * Used as the exponent when inverting by exponentiation. */
static const uint64_t p256_mod_2[4] = {
    0xfffffffffffffffd,
    0x00000000ffffffff,
    0x0000000000000000,
    0xffffffff00000001
};
#endif /* !WOLFSSL_SP_SMALL */

/* Invert the number, in Montgomery form, modulo the modulus (prime) of the
 * P256 curve. (r = 1 / a mod m)
 *
 * The inverse is computed as a raised to the power (modulus - 2).
 * With WOLFSSL_SP_SMALL this is a bit-by-bit square-and-multiply over
 * p256_mod_2; otherwise a fixed chain of squarings and multiplications
 * builds the exponent from runs of set bits.
 *
 * r   Inverse result.
 * a   Number to invert.
 * td  Temporary data. The small build uses the area at td only; otherwise
 *     areas at td, td + 2*4 and td + 4*4 are used.
 */
static void sp_256_mont_inv_4(sp_digit* r, sp_digit* a, sp_digit* td)
{
#ifdef WOLFSSL_SP_SMALL
    sp_digit* t = td;
    int i;

    /* Starting with t = a accounts for the top bit (bit 255) of the
     * exponent, so the loop only walks bits 254 down to 0. */
    XMEMCPY(t, a, sizeof(sp_digit) * 4);
    for (i=254; i>=0; i--) {
        sp_256_mont_sqr_4(t, t, p256_mod, p256_mp_mod);
        if (p256_mod_2[i / 64] & ((sp_digit)1 << (i % 64)))
            sp_256_mont_mul_4(t, t, a, p256_mod, p256_mp_mod);
    }
    XMEMCPY(r, t, sizeof(sp_digit) * 4);
#else
    sp_digit* t = td;
    sp_digit* t2 = td + 2 * 4;
    sp_digit* t3 = td + 4 * 4;

    /* Exponents in the comments below are in hexadecimal. */
    /* t = a^2 */
    sp_256_mont_sqr_4(t, a, p256_mod, p256_mp_mod);
    /* t = a^3 = t * a */
    sp_256_mont_mul_4(t, t, a, p256_mod, p256_mp_mod);
    /* t2= a^c = t ^ 2 ^ 2 */
    sp_256_mont_sqr_n_4(t2, t, 2, p256_mod, p256_mp_mod);
    /* t3= a^d = t2 * a */
    sp_256_mont_mul_4(t3, t2, a, p256_mod, p256_mp_mod);
    /* t = a^f = t2 * t */
    sp_256_mont_mul_4(t, t2, t, p256_mod, p256_mp_mod);
    /* t2= a^f0 = t ^ 2 ^ 4 */
    sp_256_mont_sqr_n_4(t2, t, 4, p256_mod, p256_mp_mod);
    /* t3= a^fd = t2 * t3 */
    sp_256_mont_mul_4(t3, t2, t3, p256_mod, p256_mp_mod);
    /* t = a^ff = t2 * t */
    sp_256_mont_mul_4(t, t2, t, p256_mod, p256_mp_mod);
    /* t2= a^ff00 = t ^ 2 ^ 8 */
    sp_256_mont_sqr_n_4(t2, t, 8, p256_mod, p256_mp_mod);
    /* t3= a^fffd = t2 * t3 */
    sp_256_mont_mul_4(t3, t2, t3, p256_mod, p256_mp_mod);
    /* t = a^ffff = t2 * t */
    sp_256_mont_mul_4(t, t2, t, p256_mod, p256_mp_mod);
    /* t2= a^ffff0000 = t ^ 2 ^ 16 */
    sp_256_mont_sqr_n_4(t2, t, 16, p256_mod, p256_mp_mod);
    /* t3= a^fffffffd = t2 * t3 */
    sp_256_mont_mul_4(t3, t2, t3, p256_mod, p256_mp_mod);
    /* t = a^ffffffff = t2 * t */
    sp_256_mont_mul_4(t, t2, t, p256_mod, p256_mp_mod);
    /* t2= a^ffffffff00000000 = t ^ 2 ^ 32  */
    sp_256_mont_sqr_n_4(t2, t, 32, p256_mod, p256_mp_mod);
    /* t = a^ffffffffffffffff = t2 * t */
    sp_256_mont_mul_4(t, t2, t, p256_mod, p256_mp_mod);
    /* t2= a^ffffffff00000001 = t2 * a */
    sp_256_mont_mul_4(t2, t2, a, p256_mod, p256_mp_mod);
    /* t2= a^ffffffff000000010000000000000000000000000000000000000000
     *   = t2 ^ 2 ^ 160 */
    sp_256_mont_sqr_n_4(t2, t2, 160, p256_mod, p256_mp_mod);
    /* t2= a^ffffffff00000001000000000000000000000000ffffffffffffffff
     *   = t2 * t */
    sp_256_mont_mul_4(t2, t2, t, p256_mod, p256_mp_mod);
    /* t2= a^ffffffff00000001000000000000000000000000ffffffffffffffff00000000
     *   = t2 ^ 2 ^ 32 */
    sp_256_mont_sqr_n_4(t2, t2, 32, p256_mod, p256_mp_mod);
    /* r = a^ffffffff00000001000000000000000000000000fffffffffffffffffffffffd
     *   = t2 * t3 */
    sp_256_mont_mul_4(r, t2, t3, p256_mod, p256_mp_mod);
#endif /* WOLFSSL_SP_SMALL */
}

/* Convert a Montgomery form projective co-ordinate point into an affine
 * point: x is divided by z^2, y is divided by z^3 and z is set to 1.
 *
 * r  Resulting affine co-ordinate point.
 * p  Montgomery form projective co-ordinate point.
 * t  Temporary ordinate data.
 */
static void sp_256_map_4(sp_point* r, sp_point* p, sp_digit* t)
{
    sp_digit* zInv = t;
    sp_digit* zInv2 = t + 2*4;
    int64_t cmp;

    /* zInv = 1/z - the area starting at t + 2*4 is scratch for inversion. */
    sp_256_mont_inv_4(zInv, p->z, zInv2);

    /* zInv2 = 1/z^2 and then zInv = 1/z^3 */
    sp_256_mont_sqr_4(zInv2, zInv, p256_mod, p256_mp_mod);
    sp_256_mont_mul_4(zInv, zInv2, zInv, p256_mod, p256_mp_mod);

    /* x ordinate: multiply by 1/z^2, zero the upper words and reduce. */
    sp_256_mont_mul_4(r->x, p->x, zInv2, p256_mod, p256_mp_mod);
    XMEMSET(r->x + 4, 0, sizeof(r->x) / 2);
    sp_256_mont_reduce_4(r->x, p256_mod, p256_mp_mod);
    /* Subtract the modulus when x is not less than it. */
    cmp = sp_256_cmp_4(r->x, p256_mod);
    sp_256_cond_sub_4(r->x, r->x, p256_mod, 0 - (cmp >= 0));
    sp_256_norm_4(r->x);

    /* y ordinate: multiply by 1/z^3, zero the upper words and reduce. */
    sp_256_mont_mul_4(r->y, p->y, zInv, p256_mod, p256_mp_mod);
    XMEMSET(r->y + 4, 0, sizeof(r->y) / 2);
    sp_256_mont_reduce_4(r->y, p256_mod, p256_mp_mod);
    /* Subtract the modulus when y is not less than it. */
    cmp = sp_256_cmp_4(r->y, p256_mod);
    sp_256_cond_sub_4(r->y, r->y, p256_mod, 0 - (cmp >= 0));
    sp_256_norm_4(r->y);

    /* z ordinate of an affine point is 1. */
    XMEMSET(r->z, 0, sizeof(r->z));
    r->z[0] = 1;
}

/* Add two Montgomery form numbers (r = a + b % m).
 *
 * The 4 word sum is computed and, only when the addition carried out of the
 * top word, the value built from the hard-coded constants (words: -1,
 * 0xffffffff, 0, 0xffffffff00000001) is subtracted. The subtraction is
 * masked rather than branched on.
 *
 * r   Result of addition.
 * a   First number to add in Montgomery form.
 * b   Second number to add in Montgomery form.
 * m   Modulus (prime). Passed to the assembly but not referenced by it.
 */
static void sp_256_mont_add_4(sp_digit* r, sp_digit* a, sp_digit* b,
        sp_digit* m)
{
    __asm__ __volatile__ (
        "movq	0(%[a]), %%rax\n\t"
        "movq	8(%[a]), %%rcx\n\t"
        "movq	16(%[a]), %%rdx\n\t"
        "movq	24(%[a]), %%r10\n\t"
        "xorq	%%r11, %%r11\n\t"
        "addq	0(%[b]), %%rax\n\t"
        "adcq	8(%[b]), %%rcx\n\t"
        "movq	$0xffffffff, %%r8\n\t"
        "adcq	16(%[b]), %%rdx\n\t"
        "adcq	24(%[b]), %%r10\n\t"
        "movq	$0xffffffff00000001, %%r9\n\t"
        /* r11 = 0 - carry: all ones when the addition overflowed. */
        "sbbq	$0, %%r11\n\t"
        "andq	%%r11, %%r8\n\t"
        "andq	%%r11, %%r9\n\t"
        /* Masked subtract; the stores are interleaved with the borrow chain
         * (movq does not alter the flags). */
        "subq	%%r11, %%rax\n\t"
        "sbbq	%%r8, %%rcx\n\t"
        "movq	%%rax, 0(%[r])\n\t"
        "sbbq	$0, %%rdx\n\t"
        "movq	%%rcx, 8(%[r])\n\t"
        "sbbq	%%r9, %%r10\n\t"
        "movq	%%rdx, 16(%[r])\n\t"
        "movq	%%r10, 24(%[r])\n\t"
        :
        : [r] "r" (r), [a] "r" (a), [b] "r" (b), [m] "r" (m)
        : "memory", "rax", "rcx", "rdx", "r8", "r9", "r10", "r11"
    );
}

/* Double a Montgomery form number (r = a + a % m).
 *
 * The 4 word value is added to itself and, only when that addition carried
 * out of the top word, the value built from the hard-coded constants
 * (words: -1, 0xffffffff, 0, 0xffffffff00000001) is subtracted using a mask.
 *
 * r   Result of doubling.
 * a   Number to double in Montgomery form.
 * m   Modulus (prime). Unused.
 */
static void sp_256_mont_dbl_4(sp_digit* r, sp_digit* a, sp_digit* m)
{
    __asm__ __volatile__ (
        "movq	(%[a]), %%rax\n\t"
        "movq	8(%[a]), %%rcx\n\t"
        "movq	16(%[a]), %%rdx\n\t"
        "movq	24(%[a]), %%r10\n\t"
        "xorq	%%r11, %%r11\n\t"
        "addq	%%rax, %%rax\n\t"
        "adcq	%%rcx, %%rcx\n\t"
        "movq	$0xffffffff, %%r8\n\t"
        "adcq	%%rdx, %%rdx\n\t"
        "movq	$0xffffffff00000001, %%r9\n\t"
        "adcq	%%r10, %%r10\n\t"
        /* r11 = 0 - carry: all ones when the doubling overflowed. */
        "sbbq	$0, %%r11\n\t"
        "andq	%%r11, %%r8\n\t"
        "andq	%%r11, %%r9\n\t"
        /* Masked subtract; the stores are interleaved with the borrow chain
         * (movq does not alter the flags). */
        "subq	%%r11, %%rax\n\t"
        "sbbq	%%r8, %%rcx\n\t"
        "movq	%%rax, 0(%[r])\n\t"
        "sbbq	$0, %%rdx\n\t"
        "movq	%%rcx, 8(%[r])\n\t"
        "sbbq	%%r9, %%r10\n\t"
        "movq	%%rdx, 16(%[r])\n\t"
        "movq	%%r10, 24(%[r])\n\t"
        :
        : [r] "r" (r), [a] "r" (a)
        : "memory", "rax", "rcx", "rdx", "r8", "r9", "r10", "r11"
    );

    (void)m;
}

/* Triple a Montgomery form number (r = a + a + a % m).
 *
 * Performed as a doubling followed by adding a once more. After each of the
 * two steps the value built from the hard-coded constants (words: -1,
 * 0xffffffff, 0, 0xffffffff00000001) is subtracted, using a mask, only when
 * that step carried out of the top word.
 *
 * r   Result of tripling.
 * a   Number to triple in Montgomery form.
 * m   Modulus (prime). Unused.
 */
static void sp_256_mont_tpl_4(sp_digit* r, sp_digit* a, sp_digit* m)
{
    __asm__ __volatile__ (
        /* First step: double a. */
        "movq	(%[a]), %%rax\n\t"
        "movq	8(%[a]), %%rcx\n\t"
        "movq	16(%[a]), %%rdx\n\t"
        "movq	24(%[a]), %%r10\n\t"
        "xorq	%%r11, %%r11\n\t"
        "addq	%%rax, %%rax\n\t"
        "adcq	%%rcx, %%rcx\n\t"
        "movq	$0xffffffff, %%r8\n\t"
        "adcq	%%rdx, %%rdx\n\t"
        "adcq	%%r10, %%r10\n\t"
        "movq	$0xffffffff00000001, %%r9\n\t"
        "sbbq	$0, %%r11\n\t"
        "andq	%%r11, %%r8\n\t"
        "andq	%%r11, %%r9\n\t"
        "subq	%%r11, %%rax\n\t"
        "sbbq	%%r8, %%rcx\n\t"
        "sbbq	$0, %%rdx\n\t"
        "sbbq	%%r9, %%r10\n\t"
        /* Second step: add a to the doubled value, with a fresh mask. */
        "xorq	%%r11, %%r11\n\t"
        "addq	(%[a]), %%rax\n\t"
        "adcq	8(%[a]), %%rcx\n\t"
        "movq	$0xffffffff, %%r8\n\t"
        "adcq	16(%[a]), %%rdx\n\t"
        "adcq	24(%[a]), %%r10\n\t"
        "movq	$0xffffffff00000001, %%r9\n\t"
        "sbbq	$0, %%r11\n\t"
        "andq	%%r11, %%r8\n\t"
        "andq	%%r11, %%r9\n\t"
        "subq	%%r11, %%rax\n\t"
        "sbbq	%%r8, %%rcx\n\t"
        "sbbq	$0, %%rdx\n\t"
        "sbbq	%%r9, %%r10\n\t"
        "movq	%%rax, 0(%[r])\n\t"
        "movq	%%rcx, 8(%[r])\n\t"
        "movq	%%rdx, 16(%[r])\n\t"
        "movq	%%r10, 24(%[r])\n\t"
        :
        : [r] "r" (r), [a] "r" (a)
        : "memory", "rax", "rcx", "rdx", "r8", "r9", "r10", "r11"
    );

    (void)m;
}

/* Subtract two Montgomery form numbers (r = a - b % m).
 *
 * The 4 word difference is computed and, only when the subtraction borrowed
 * out of the top word, the value built from the hard-coded constants
 * (words: -1, 0xffffffff, 0, 0xffffffff00000001) is added back using a mask.
 *
 * r   Result of subtraction.
 * a   Number to subtract from in Montgomery form.
 * b   Number to subtract with in Montgomery form.
 * m   Modulus (prime). Passed to the assembly but not referenced by it.
 */
static void sp_256_mont_sub_4(sp_digit* r, sp_digit* a, sp_digit* b,
        sp_digit* m)
{
    __asm__ __volatile__ (
        "movq	0(%[a]), %%rax\n\t"
        "movq	8(%[a]), %%rcx\n\t"
        "movq	16(%[a]), %%rdx\n\t"
        "movq	24(%[a]), %%r10\n\t"
        "xorq	%%r11, %%r11\n\t"
        "subq	0(%[b]), %%rax\n\t"
        "sbbq	8(%[b]), %%rcx\n\t"
        "movq	$0xffffffff, %%r8\n\t"
        "sbbq	16(%[b]), %%rdx\n\t"
        "sbbq	24(%[b]), %%r10\n\t"
        "movq	$0xffffffff00000001, %%r9\n\t"
        /* r11 = 0 - borrow: all ones when the subtraction underflowed. */
        "sbbq	$0, %%r11\n\t"
        "andq	%%r11, %%r8\n\t"
        "andq	%%r11, %%r9\n\t"
        /* Masked add; the stores are interleaved with the carry chain
         * (movq does not alter the flags). */
        "addq	%%r11, %%rax\n\t"
        "adcq	%%r8, %%rcx\n\t"
        "movq	%%rax, 0(%[r])\n\t"
        "adcq	$0, %%rdx\n\t"
        "movq	%%rcx, 8(%[r])\n\t"
        "adcq	%%r9, %%r10\n\t"
        "movq	%%rdx, 16(%[r])\n\t"
        "movq	%%r10, 24(%[r])\n\t"
        :
        : [r] "r" (r), [a] "r" (a), [b] "r" (b), [m] "r" (m)
        : "memory", "rax", "rcx", "rdx", "r8", "r9", "r10", "r11"
    );
}

/* Divide the number by 2 mod the modulus (prime). (r = a / 2 % m)
 *
 * When a is odd, the value built from the hard-coded constants (words: -1,
 * 0xffffffff, 0, 0xffffffff00000001) is added first so that the sum is even;
 * the 5 word sum (including the carry) is then shifted right by one bit.
 * The choice is made with a mask rather than a branch.
 *
 * r  Result of division by 2.
 * a  Number to divide.
 * m  Modulus (prime). Passed to the assembly but not referenced by it.
 */
SP_NOINLINE static void sp_256_div2_4(sp_digit* r, sp_digit* a, sp_digit* m)
{
    __asm__ __volatile__ (
        "movq	0(%[a]), %%rax\n\t"
        "movq	8(%[a]), %%rdx\n\t"
        "movq	16(%[a]), %%rcx\n\t"
        "movq	24(%[a]), %%r10\n\t"
        "movq	$0xffffffff, %%r8\n\t"
        "movq	$0xffffffff00000001, %%r9\n\t"
        /* r12 = 0 - (a[0] & 1): all ones when a is odd. */
        "xorq	%%r12, %%r12\n\t"
        "movq	%%rax, %%r11\n\t"
        "andq	$1, %%r11\n\t"
        "subq	%%r11, %%r12\n\t"
        "andq	%%r12, %%r8\n\t"
        "andq	%%r12, %%r9\n\t"
        /* Masked add; r11 collects the carry out of the top word. */
        "xorq	%%r11, %%r11\n\t"
        "addq	%%r12, %%rax\n\t"
        "adcq	%%r8, %%rdx\n\t"
        "adcq	$0, %%rcx\n\t"
        "adcq	%%r9, %%r10\n\t"
        "adcq	$0, %%r11\n\t"
        /* Shift right by one bit across r11:r10:rcx:rdx:rax. */
        "shrdq	$1, %%rdx, %%rax\n\t"
        "shrdq	$1, %%rcx, %%rdx\n\t"
        "shrdq	$1, %%r10, %%rcx\n\t"
        "shrdq	$1, %%r11, %%r10\n\t"
        "movq	%%rax, 0(%[r])\n\t"
        "movq	%%rdx, 8(%[r])\n\t"
        "movq	%%rcx, 16(%[r])\n\t"
        "movq	%%r10, 24(%[r])\n\t"
        :
        : [r] "r" (r), [a] "r" (a), [m] "r" (m)
        : "memory", "rax", "rdx", "rcx", "r8", "r9", "r10", "r11", "r12"
    );

}

/* Double the Montgomery form projective point p.
 *
 * When p is the point at infinity the same sequence of field operations is
 * still executed, but on a local dummy point, so that r simply ends up as a
 * copy of p. The dummy point is zeroed first so that the arithmetic never
 * consumes indeterminate stack contents (as is done in
 * sp_256_proj_point_add_4).
 *
 * r  Result of doubling point. May be the same object as p.
 * p  Point to double.
 * t  Temporary ordinate data (areas at t and t + 2*4 are used).
 */
static void sp_256_proj_point_dbl_4(sp_point* r, sp_point* p, sp_digit* t)
{
    sp_point *rp[2];
    sp_point tp;
    sp_digit* t1 = t;
    sp_digit* t2 = t + 2*4;
    sp_digit* x;
    sp_digit* y;
    sp_digit* z;
    int i;

    /* When infinity don't double point passed in - constant time. */
    rp[0] = r;
    rp[1] = &tp;
    /* The dummy point is read by the operations below - give it defined
     * contents. */
    XMEMSET(&tp, 0, sizeof(tp));
    x = rp[p->infinity]->x;
    y = rp[p->infinity]->y;
    z = rp[p->infinity]->z;
    /* Put point to double into result - good for infinity. */
    if (r != p) {
        for (i=0; i<4; i++)
            r->x[i] = p->x[i];
        for (i=0; i<4; i++)
            r->y[i] = p->y[i];
        for (i=0; i<4; i++)
            r->z[i] = p->z[i];
        r->infinity = p->infinity;
    }

    /* T1 = Z * Z */
    sp_256_mont_sqr_4(t1, z, p256_mod, p256_mp_mod);
    /* Z = Y * Z */
    sp_256_mont_mul_4(z, y, z, p256_mod, p256_mp_mod);
    /* Z = 2Z */
    sp_256_mont_dbl_4(z, z, p256_mod);
    /* T2 = X - T1 */
    sp_256_mont_sub_4(t2, x, t1, p256_mod);
    /* T1 = X + T1 */
    sp_256_mont_add_4(t1, x, t1, p256_mod);
    /* T2 = T1 * T2 */
    sp_256_mont_mul_4(t2, t1, t2, p256_mod, p256_mp_mod);
    /* T1 = 3T2 */
    sp_256_mont_tpl_4(t1, t2, p256_mod);
    /* Y = 2Y */
    sp_256_mont_dbl_4(y, y, p256_mod);
    /* Y = Y * Y */
    sp_256_mont_sqr_4(y, y, p256_mod, p256_mp_mod);
    /* T2 = Y * Y */
    sp_256_mont_sqr_4(t2, y, p256_mod, p256_mp_mod);
    /* T2 = T2/2 */
    sp_256_div2_4(t2, t2, p256_mod);
    /* Y = Y * X */
    sp_256_mont_mul_4(y, y, x, p256_mod, p256_mp_mod);
    /* X = T1 * T1 */
    sp_256_mont_mul_4(x, t1, t1, p256_mod, p256_mp_mod);
    /* X = X - Y */
    sp_256_mont_sub_4(x, x, y, p256_mod);
    /* X = X - Y */
    sp_256_mont_sub_4(x, x, y, p256_mod);
    /* Y = Y - X */
    sp_256_mont_sub_4(y, y, x, p256_mod);
    /* Y = Y * T1 */
    sp_256_mont_mul_4(y, y, t1, p256_mod, p256_mp_mod);
    /* Y = Y - T2 */
    sp_256_mont_sub_4(y, y, t2, p256_mod);

}

/* Double the Montgomery form projective point p a number of times.
 *
 * Z^4 is computed once up front and carried from one iteration to the next
 * in W, and Y is kept doubled inside the loop and halved once at the end.
 * When p is the point at infinity the same operations are executed on a
 * local dummy point, so that r ends up as a copy of p. The dummy point is
 * zeroed first so that the arithmetic never consumes indeterminate stack
 * contents (as is done in sp_256_proj_point_add_4).
 *
 * r  Result of repeated doubling of point. May be the same object as p.
 * p  Point to double.
 * n  Number of times to double
 * t  Temporary ordinate data (areas at t up to t + 8*4 are used).
 */
static void sp_256_proj_point_dbl_n_4(sp_point* r, sp_point* p, int n,
        sp_digit* t)
{
    sp_point *rp[2];
    sp_point tp;
    sp_digit* w = t;
    sp_digit* a = t + 2*4;
    sp_digit* b = t + 4*4;
    sp_digit* t1 = t + 6*4;
    sp_digit* t2 = t + 8*4;
    sp_digit* x;
    sp_digit* y;
    sp_digit* z;
    int i;

    /* When infinity, operate on the dummy point instead of the result. */
    rp[0] = r;
    rp[1] = &tp;
    /* The dummy point is read by the operations below - give it defined
     * contents. */
    XMEMSET(&tp, 0, sizeof(tp));
    x = rp[p->infinity]->x;
    y = rp[p->infinity]->y;
    z = rp[p->infinity]->z;
    /* Put point to double into result - good for infinity. */
    if (r != p) {
        for (i=0; i<4; i++)
            r->x[i] = p->x[i];
        for (i=0; i<4; i++)
            r->y[i] = p->y[i];
        for (i=0; i<4; i++)
            r->z[i] = p->z[i];
        r->infinity = p->infinity;
    }

    /* Y = 2*Y */
    sp_256_mont_dbl_4(y, y, p256_mod);
    /* W = Z^4 */
    sp_256_mont_sqr_4(w, z, p256_mod, p256_mp_mod);
    sp_256_mont_sqr_4(w, w, p256_mod, p256_mp_mod);
    while (n--) {
        /* A = 3*(X^2 - W) */
        sp_256_mont_sqr_4(t1, x, p256_mod, p256_mp_mod);
        sp_256_mont_sub_4(t1, t1, w, p256_mod);
        sp_256_mont_tpl_4(a, t1, p256_mod);
        /* B = X*Y^2 */
        sp_256_mont_sqr_4(t2, y, p256_mod, p256_mp_mod);
        sp_256_mont_mul_4(b, t2, x, p256_mod, p256_mp_mod);
        /* X = A^2 - 2B */
        sp_256_mont_sqr_4(x, a, p256_mod, p256_mp_mod);
        sp_256_mont_dbl_4(t1, b, p256_mod);
        sp_256_mont_sub_4(x, x, t1, p256_mod);
        /* Z = Z*Y */
        sp_256_mont_mul_4(z, z, y, p256_mod, p256_mp_mod);
        /* t2 = Y^4 */
        sp_256_mont_sqr_4(t2, t2, p256_mod, p256_mp_mod);
        /* W is not needed after the last doubling. */
        if (n) {
            /* W = W*Y^4 */
            sp_256_mont_mul_4(w, w, t2, p256_mod, p256_mp_mod);
        }
        /* y = 2*A*(B - X) - Y^4 */
        sp_256_mont_sub_4(y, b, x, p256_mod);
        sp_256_mont_mul_4(y, y, a, p256_mod, p256_mp_mod);
        sp_256_mont_dbl_4(y, y, p256_mod);
        sp_256_mont_sub_4(y, y, t2, p256_mod);
    }
    /* Y = Y/2 */
    sp_256_div2_4(y, y, p256_mod);
}

/* Check whether two 4 word numbers are equal.
 * All words are always examined - there is no early exit on a difference.
 *
 * a  First number to compare.
 * b  Second number to compare.
 * returns 1 when equal and 0 otherwise.
 */
static int sp_256_cmp_equal_4(const sp_digit* a, const sp_digit* b)
{
    sp_digit diff = 0;
    int i;

    /* Collect every differing bit across the four words. */
    for (i = 0; i < 4; i++)
        diff |= a[i] ^ b[i];

    return diff == 0;
}

/* Add two Montgomery form projective points.
 *
 * When the x and z ordinates of the two points are equal and the y ordinates
 * are equal or one is the modulus minus the other, the doubling routine is
 * called on p instead.
 * Otherwise, when either point is at infinity, the field operations are
 * carried out on a zeroed local dummy point and r is left holding a copy of
 * q (when p is at infinity) or of p.
 *
 * r  Result of addition.
 * p  First point to add.
 * q  Second point to add.
 * t  Temporary ordinate data (areas at t up to t + 8*4 are used).
 */
static void sp_256_proj_point_add_4(sp_point* r, sp_point* p, sp_point* q,
        sp_digit* t)
{
    sp_point *ap[2];
    sp_point *rp[2];
    sp_point tp;
    sp_digit* t1 = t;
    sp_digit* t2 = t + 2*4;
    sp_digit* t3 = t + 4*4;
    sp_digit* t4 = t + 6*4;
    sp_digit* t5 = t + 8*4;
    sp_digit* x;
    sp_digit* y;
    sp_digit* z;
    int i;

    /* Ensure only the first point is the same as the result. */
    if (q == r) {
        sp_point* a = p;
        p = q;
        q = a;
    }

    /* Check double */
    /* t1 = modulus - q->y, compared against p->y below. */
    sp_256_sub_4(t1, p256_mod, q->y);
    sp_256_norm_4(t1);
    if (sp_256_cmp_equal_4(p->x, q->x) & sp_256_cmp_equal_4(p->z, q->z) &
        (sp_256_cmp_equal_4(p->y, q->y) | sp_256_cmp_equal_4(p->y, t1))) {
        sp_256_proj_point_dbl_4(r, p, t);
    }
    else {
        /* Work on the dummy point when either input is at infinity. */
        rp[0] = r;
        rp[1] = &tp;
        XMEMSET(&tp, 0, sizeof(tp));
        x = rp[p->infinity | q->infinity]->x;
        y = rp[p->infinity | q->infinity]->y;
        z = rp[p->infinity | q->infinity]->z;

        /* Result starts as p, or as q when p is at infinity. */
        ap[0] = p;
        ap[1] = q;
        for (i=0; i<4; i++)
            r->x[i] = ap[p->infinity]->x[i];
        for (i=0; i<4; i++)
            r->y[i] = ap[p->infinity]->y[i];
        for (i=0; i<4; i++)
            r->z[i] = ap[p->infinity]->z[i];
        r->infinity = ap[p->infinity]->infinity;

        /* U1 = X1*Z2^2 */
        sp_256_mont_sqr_4(t1, q->z, p256_mod, p256_mp_mod);
        sp_256_mont_mul_4(t3, t1, q->z, p256_mod, p256_mp_mod);
        sp_256_mont_mul_4(t1, t1, x, p256_mod, p256_mp_mod);
        /* U2 = X2*Z1^2 */
        sp_256_mont_sqr_4(t2, z, p256_mod, p256_mp_mod);
        sp_256_mont_mul_4(t4, t2, z, p256_mod, p256_mp_mod);
        sp_256_mont_mul_4(t2, t2, q->x, p256_mod, p256_mp_mod);
        /* S1 = Y1*Z2^3 */
        sp_256_mont_mul_4(t3, t3, y, p256_mod, p256_mp_mod);
        /* S2 = Y2*Z1^3 */
        sp_256_mont_mul_4(t4, t4, q->y, p256_mod, p256_mp_mod);
        /* H = U2 - U1 */
        sp_256_mont_sub_4(t2, t2, t1, p256_mod);
        /* R = S2 - S1 */
        sp_256_mont_sub_4(t4, t4, t3, p256_mod);
        /* Z3 = H*Z1*Z2 */
        sp_256_mont_mul_4(z, z, q->z, p256_mod, p256_mp_mod);
        sp_256_mont_mul_4(z, z, t2, p256_mod, p256_mp_mod);
        /* X3 = R^2 - H^3 - 2*U1*H^2 */
        sp_256_mont_sqr_4(x, t4, p256_mod, p256_mp_mod);
        sp_256_mont_sqr_4(t5, t2, p256_mod, p256_mp_mod);
        sp_256_mont_mul_4(y, t1, t5, p256_mod, p256_mp_mod);
        sp_256_mont_mul_4(t5, t5, t2, p256_mod, p256_mp_mod);
        sp_256_mont_sub_4(x, x, t5, p256_mod);
        sp_256_mont_dbl_4(t1, y, p256_mod);
        sp_256_mont_sub_4(x, x, t1, p256_mod);
        /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */
        sp_256_mont_sub_4(y, y, x, p256_mod);
        sp_256_mont_mul_4(y, y, t4, p256_mod, p256_mp_mod);
        sp_256_mont_mul_4(t5, t5, t3, p256_mod, p256_mp_mod);
        sp_256_mont_sub_4(y, y, t5, p256_mod);
    }
}

/* Double the Montgomery form projective point p a number of times, storing
 * the result of each doubling into a table of points.
 *
 * The result of the i-th doubling (i = 1..n) is written to r[(1<<i)*m] and
 * that entry is marked as not being at infinity. Unlike
 * sp_256_proj_point_dbl_n_4 there is no handling of p being at infinity.
 *
 * r  Table of points to store the results into.
 * p  Point to double.
 * n  Number of times to double
 * m  Multiplier applied to the power-of-two table index of each result.
 * t  Temporary ordinate data (areas at t up to t + 8*4 are used).
 */
static void sp_256_proj_point_dbl_n_store_4(sp_point* r, sp_point* p,
        int n, int m, sp_digit* t)
{
    sp_digit* w = t;
    sp_digit* a = t + 2*4;
    sp_digit* b = t + 4*4;
    sp_digit* t1 = t + 6*4;
    sp_digit* t2 = t + 8*4;
    /* x and z start in the entry for the first doubling; the running
     * (doubled) y value lives in the y ordinate of the last entry. */
    sp_digit* x = r[2*m].x;
    sp_digit* y = r[(1<<n)*m].y;
    sp_digit* z = r[2*m].z;
    int i;

    for (i=0; i<4; i++)
        x[i] = p->x[i];
    for (i=0; i<4; i++)
        y[i] = p->y[i];
    for (i=0; i<4; i++)
        z[i] = p->z[i];

    /* Y = 2*Y */
    sp_256_mont_dbl_4(y, y, p256_mod);
    /* W = Z^4 */
    sp_256_mont_sqr_4(w, z, p256_mod, p256_mp_mod);
    sp_256_mont_sqr_4(w, w, p256_mod, p256_mp_mod);
    for (i=1; i<=n; i++) {
        /* A = 3*(X^2 - W) */
        sp_256_mont_sqr_4(t1, x, p256_mod, p256_mp_mod);
        sp_256_mont_sub_4(t1, t1, w, p256_mod);
        sp_256_mont_tpl_4(a, t1, p256_mod);
        /* B = X*Y^2 */
        sp_256_mont_sqr_4(t2, y, p256_mod, p256_mp_mod);
        sp_256_mont_mul_4(b, t2, x, p256_mod, p256_mp_mod);
        /* The previous x is no longer needed - switch to this entry's x. */
        x = r[(1<<i)*m].x;
        /* X = A^2 - 2B */
        sp_256_mont_sqr_4(x, a, p256_mod, p256_mp_mod);
        sp_256_mont_dbl_4(t1, b, p256_mod);
        sp_256_mont_sub_4(x, x, t1, p256_mod);
        /* Z = Z*Y */
        sp_256_mont_mul_4(r[(1<<i)*m].z, z, y, p256_mod, p256_mp_mod);
        z = r[(1<<i)*m].z;
        /* t2 = Y^4 */
        sp_256_mont_sqr_4(t2, t2, p256_mod, p256_mp_mod);
        /* W is not needed after the last doubling. */
        if (i != n) {
            /* W = W*Y^4 */
            sp_256_mont_mul_4(w, w, t2, p256_mod, p256_mp_mod);
        }
        /* y = 2*A*(B - X) - Y^4 */
        sp_256_mont_sub_4(y, b, x, p256_mod);
        sp_256_mont_mul_4(y, y, a, p256_mod, p256_mp_mod);
        sp_256_mont_dbl_4(y, y, p256_mod);
        sp_256_mont_sub_4(y, y, t2, p256_mod);

        /* Y = Y/2 */
        /* Stored halved into this entry; the running y stays doubled. */
        sp_256_div2_4(r[(1<<i)*m].y, y, p256_mod);
        r[(1<<i)*m].infinity = 0;
    }
}

/* Add and subtract two Montgomery form projective points, sharing the
 * intermediate values that are common to both calculations.
 *
 * There is no handling of either point being at infinity or of the points
 * being equal: both results are unconditionally marked as not at infinity.
 *
 * ra  Result of addition.
 * rs  Result of subtraction.
 * p   First point to add.
 * q   Second point to add.
 * t   Temporary ordinate data (areas at t up to t + 10*4 are used).
 */
static void sp_256_proj_point_add_sub_4(sp_point* ra, sp_point* rs,
        sp_point* p, sp_point* q, sp_digit* t)
{
    sp_digit* t1 = t;
    sp_digit* t2 = t + 2*4;
    sp_digit* t3 = t + 4*4;
    sp_digit* t4 = t + 6*4;
    sp_digit* t5 = t + 8*4;
    sp_digit* t6 = t + 10*4;
    sp_digit* x = ra->x;
    sp_digit* y = ra->y;
    sp_digit* z = ra->z;
    sp_digit* xs = rs->x;
    sp_digit* ys = rs->y;
    sp_digit* zs = rs->z;


    /* Start the addition result off as a copy of p (half of each ordinate
     * array is copied). */
    XMEMCPY(x, p->x, sizeof(p->x) / 2);
    XMEMCPY(y, p->y, sizeof(p->y) / 2);
    XMEMCPY(z, p->z, sizeof(p->z) / 2);
    ra->infinity = 0;
    rs->infinity = 0;

    /* U1 = X1*Z2^2 */
    sp_256_mont_sqr_4(t1, q->z, p256_mod, p256_mp_mod);
    sp_256_mont_mul_4(t3, t1, q->z, p256_mod, p256_mp_mod);
    sp_256_mont_mul_4(t1, t1, x, p256_mod, p256_mp_mod);
    /* U2 = X2*Z1^2 */
    sp_256_mont_sqr_4(t2, z, p256_mod, p256_mp_mod);
    sp_256_mont_mul_4(t4, t2, z, p256_mod, p256_mp_mod);
    sp_256_mont_mul_4(t2, t2, q->x, p256_mod, p256_mp_mod);
    /* S1 = Y1*Z2^3 */
    sp_256_mont_mul_4(t3, t3, y, p256_mod, p256_mp_mod);
    /* S2 = Y2*Z1^3 */
    sp_256_mont_mul_4(t4, t4, q->y, p256_mod, p256_mp_mod);
    /* H = U2 - U1 */
    sp_256_mont_sub_4(t2, t2, t1, p256_mod);
    /* RS = S2 + S1 */
    sp_256_mont_add_4(t6, t4, t3, p256_mod);
    /* R = S2 - S1 */
    sp_256_mont_sub_4(t4, t4, t3, p256_mod);
    /* Z3 = H*Z1*Z2 */
    /* ZS = H*Z1*Z2 */
    sp_256_mont_mul_4(z, z, q->z, p256_mod, p256_mp_mod);
    sp_256_mont_mul_4(z, z, t2, p256_mod, p256_mp_mod);
    XMEMCPY(zs, z, sizeof(p->z)/2);
    /* X3 = R^2 - H^3 - 2*U1*H^2 */
    /* XS = RS^2 - H^3 - 2*U1*H^2 */
    sp_256_mont_sqr_4(x, t4, p256_mod, p256_mp_mod);
    sp_256_mont_sqr_4(xs, t6, p256_mod, p256_mp_mod);
    sp_256_mont_sqr_4(t5, t2, p256_mod, p256_mp_mod);
    sp_256_mont_mul_4(y, t1, t5, p256_mod, p256_mp_mod);
    sp_256_mont_mul_4(t5, t5, t2, p256_mod, p256_mp_mod);
    sp_256_mont_sub_4(x, x, t5, p256_mod);
    sp_256_mont_sub_4(xs, xs, t5, p256_mod);
    sp_256_mont_dbl_4(t1, y, p256_mod);
    sp_256_mont_sub_4(x, x, t1, p256_mod);
    sp_256_mont_sub_4(xs, xs, t1, p256_mod);
    /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */
    /* YS = -RS*(U1*H^2 - XS) - S1*H^3 */
    /* ys must be formed before y (holding U1*H^2) is overwritten. */
    sp_256_mont_sub_4(ys, y, xs, p256_mod);
    sp_256_mont_sub_4(y, y, x, p256_mod);
    sp_256_mont_mul_4(y, y, t4, p256_mod, p256_mp_mod);
    /* t6 = modulus - RS, i.e. -RS */
    sp_256_sub_4(t6, p256_mod, t6);
    sp_256_mont_mul_4(ys, ys, t6, p256_mod, p256_mp_mod);
    sp_256_mont_mul_4(t5, t5, t3, p256_mod, p256_mp_mod);
    sp_256_mont_sub_4(y, y, t5, p256_mod);
    sp_256_mont_sub_4(ys, ys, t5, p256_mod);
}

/* One entry of a recoded scalar: which pre-computed point to use for a
 * window and whether the negative of that point is to be used. */
typedef struct ecc_recode {
    uint8_t i;      /* Index into pre-computation table. */
    uint8_t neg;    /* Use the negative of the point. */
} ecc_recode;

/* The index into pre-computation table to use.
 * Indexed by a 6-bit window value plus an incoming carry (0..64; one spare
 * entry follows). Values 0..32 map to themselves and values 33..63 map to
 * 64 - value. The table is only ever read, so it is declared const. */
static const uint8_t recode_index_4_6[66] = {
     0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
    16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
    32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17,
    16, 15, 14, 13, 12, 11, 10,  9,  8,  7,  6,  5,  4,  3,  2,  1,
     0,  1,
};

/* Whether to negate y-ordinate.
 *
 * Looked up with a 6-bit window value plus the incoming carry (0..65).
 * Entries 32..63 are 1: for those values the negative of the table point is
 * used. All other entries, including the wrapped values 64 and 65, are 0.
 * Read-only lookup data, hence const.
 */
static const uint8_t recode_neg_4_6[66] = {
     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
     1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
     1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
     0,  0,
};

/* Recode the scalar for multiplication using pre-computed values and
 * subtraction.
 *
 * The scalar is read as 4 digits of 64 bits, least significant digit first,
 * and cut into 43 windows of 6 bits (the last window only has 4 scalar bits
 * left). A window value of 32 or more is replaced by the negative of
 * (64 - value) and a carry of 1 is added to the next window, so every
 * resulting table index is in the range 0..32.
 *
 * The bits are extracted through an unsigned 64-bit local so that the right
 * shifts are always logical shifts, whatever the signedness of sp_digit.
 *
 * k  Scalar to multiply by.
 * v  Vector of operations to perform. 43 entries are written; v[0] is the
 *    least significant window.
 */
static void sp_256_ecc_recode_6_4(sp_digit* k, ecc_recode* v)
{
    int i, j;
    uint8_t y;
    int carry = 0;
    int o;
    uint64_t n;

    j = 0;
    n = (uint64_t)k[j];
    /* o is the number of bits already consumed from the current digit. */
    o = 0;
    for (i=0; i<43; i++) {
        y = (uint8_t)n;
        if (o + 6 < 64) {
            /* Window lies completely inside the current digit. */
            y &= 0x3f;
            n >>= 6;
            o += 6;
        }
        else if (o + 6 == 64) {
            /* Window is exactly the last 6 bits of the digit - n has no
             * other bits left so no masking is needed. Move on to the next
             * digit when there is one. */
            n >>= 6;
            if (++j < 4)
                n = (uint64_t)k[j];
            o = 0;
        }
        else if (++j < 4) {
            /* Window straddles two digits: y holds the (64 - o) bits left
             * over from the previous digit, the rest comes from the bottom
             * of the next one. */
            n = (uint64_t)k[j];
            y |= (uint8_t)((n << (64 - o)) & 0x3f);
            o -= 58;
            n >>= o;
        }
        /* Otherwise the scalar is exhausted and y is the final, short
         * window taken from the bits left in n. */

        y += carry;
        v[i].i = recode_index_4_6[y];
        v[i].neg = recode_neg_4_6[y];
        /* Carry into the next window when the value wrapped to 64 or when
         * a negative entry was chosen. */
        carry = (y >> 6) + v[i].neg;
    }
}

/* Multiply the point by the scalar and return the result.
 * If map is true then convert result to affine co-ordinates.
 *
 * Works on signed 6-bit windows of the scalar: a table of the multiples
 * 0..32 of g is built, the scalar is recoded into 43 (index, negate) pairs
 * and the result is accumulated from the most significant window down by
 * doubling 6 times and adding the selected multiple or its negative.
 *
 * r     Resulting point.
 * g     Point to multiply.
 * k     Scalar to multiply by.
 * map   Indicates whether to convert result to affine.
 * heap  Heap to use for allocation.
 * returns MEMORY_E when memory allocation fails, the error code of
 * sp_ecc_point_new or sp_256_mod_mul_norm_4 when one of those fails and
 * MP_OKAY on success.
 */
static int sp_256_ecc_mulmod_win_add_sub_4(sp_point* r, sp_point* g,
        sp_digit* k, int map, void* heap)
{
#if !defined(WOLFSSL_SP_SMALL) && !defined(WOLFSSL_SMALL_STACK)
    /* Stack storage for the table, the two working points and the
     * temporary ordinate data. */
    sp_point td[33];
    sp_point rtd, pd;
    sp_digit tmpd[2 * 4 * 6];
#endif
    /* Table of multiples of g: t[i] holds i * g, t[0] is infinity. */
    sp_point* t;
    /* Running result. */
    sp_point* rt;
    /* Copy of the table entry being added, possibly negated. */
    sp_point* p = NULL;
    sp_digit* tmp;
    /* Scratch for the negated y-ordinate. */
    sp_digit* negy;
    int i;
    /* Recoded scalar, v[0] is the least significant window. */
    ecc_recode v[43];
    int err;

    (void)heap;

    err = sp_ecc_point_new(heap, rtd, rt);
    if (err == MP_OKAY)
        err = sp_ecc_point_new(heap, pd, p);
#if defined(WOLFSSL_SP_SMALL) || defined(WOLFSSL_SMALL_STACK)
    /* Both allocations are always attempted; either failing sets err and
     * whatever was allocated is released at the end. */
    t = (sp_point*)XMALLOC(sizeof(sp_point) * 33, heap, DYNAMIC_TYPE_ECC);
    if (t == NULL)
        err = MEMORY_E;
    tmp = (sp_digit*)XMALLOC(sizeof(sp_digit) * 2 * 4 * 6, heap,
                             DYNAMIC_TYPE_ECC);
    if (tmp == NULL)
        err = MEMORY_E;
#else
    t = td;
    tmp = tmpd;
#endif


    if (err == MP_OKAY) {
        /* t[0] = all zeros, marked as the point at infinity */
        XMEMSET(&t[0], 0, sizeof(t[0]));
        t[0].infinity = 1;
        /* t[1] = {g->x, g->y, g->z} * norm */
        err = sp_256_mod_mul_norm_4(t[1].x, g->x, p256_mod);
    }
    if (err == MP_OKAY)
        err = sp_256_mod_mul_norm_4(t[1].y, g->y, p256_mod);
    if (err == MP_OKAY)
        err = sp_256_mod_mul_norm_4(t[1].z, g->z, p256_mod);

    if (err == MP_OKAY) {
        t[1].infinity = 0;
        /* t[2] ... t[32]
         * The first call is given t[1] and a count of 5 - presumably it
         * fills t[2], t[4], t[8], t[16] and t[32]. Even entries are then
         * obtained by doubling the entry at half the index and each add_sub
         * call produces the odd neighbours of an even entry: the sum with
         * t[1] goes to the first argument, the difference to the second. */
    sp_256_proj_point_dbl_n_store_4(t, &t[ 1], 5, 1, tmp);
    sp_256_proj_point_add_4(&t[ 3], &t[ 2], &t[ 1], tmp);
    sp_256_proj_point_dbl_4(&t[ 6], &t[ 3], tmp);
    sp_256_proj_point_add_sub_4(&t[ 7], &t[ 5], &t[ 6], &t[ 1], tmp);
    sp_256_proj_point_dbl_4(&t[10], &t[ 5], tmp);
    sp_256_proj_point_add_sub_4(&t[11], &t[ 9], &t[10], &t[ 1], tmp);
    sp_256_proj_point_dbl_4(&t[12], &t[ 6], tmp);
    sp_256_proj_point_dbl_4(&t[14], &t[ 7], tmp);
    sp_256_proj_point_add_sub_4(&t[15], &t[13], &t[14], &t[ 1], tmp);
    sp_256_proj_point_dbl_4(&t[18], &t[ 9], tmp);
    sp_256_proj_point_add_sub_4(&t[19], &t[17], &t[18], &t[ 1], tmp);
    sp_256_proj_point_dbl_4(&t[20], &t[10], tmp);
    sp_256_proj_point_dbl_4(&t[22], &t[11], tmp);
    sp_256_proj_point_add_sub_4(&t[23], &t[21], &t[22], &t[ 1], tmp);
    sp_256_proj_point_dbl_4(&t[24], &t[12], tmp);
    sp_256_proj_point_dbl_4(&t[26], &t[13], tmp);
    sp_256_proj_point_add_sub_4(&t[27], &t[25], &t[26], &t[ 1], tmp);
    sp_256_proj_point_dbl_4(&t[28], &t[14], tmp);
    sp_256_proj_point_dbl_4(&t[30], &t[15], tmp);
    sp_256_proj_point_add_sub_4(&t[31], &t[29], &t[30], &t[ 1], tmp);

        /* The y-ordinate of the infinity entry is reused as scratch space
         * for the negated y, so t[0].y does not stay zero. */
        negy = t[0].y;

        sp_256_ecc_recode_6_4(k, v);

        /* Start with the entry for the most significant window. */
        /* NOTE(review): the table is indexed directly with a value derived
         * from k - confirm this is acceptable for secret scalars. */
        i = 42;
        XMEMCPY(rt, &t[v[i].i], sizeof(sp_point));
        for (--i; i>=0; i--) {
            /* rt = rt * 2^6 */
            sp_256_proj_point_dbl_n_4(rt, rt, 6, tmp);

            /* Take a copy of the table entry, always compute its negated y
             * and copy that in only when the window is negative. */
            XMEMCPY(p, &t[v[i].i], sizeof(sp_point));
            sp_256_sub_4(negy, p256_mod, p->y);
            sp_256_cond_copy_4(p->y, negy, (sp_digit)0 - v[i].neg);
            sp_256_proj_point_add_4(rt, rt, p, tmp);
        }

        if (map)
            sp_256_map_4(r, rt, tmp);
        else
            XMEMCPY(r, rt, sizeof(sp_point));
    }

#if defined(WOLFSSL_SP_SMALL) || defined(WOLFSSL_SMALL_STACK)
    if (t != NULL)
        XFREE(t, heap, DYNAMIC_TYPE_ECC);
    if (tmp != NULL)
        XFREE(tmp, heap, DYNAMIC_TYPE_ECC);
#endif
    sp_ecc_point_free(p, 0, heap);
    sp_ecc_point_free(rt, 0, heap);

    return err;
}

#ifdef HAVE_INTEL_AVX2
/* Multiply two Montgomery form numbers mod the modulus (prime).
 * (r = a * b mod m)
 *
 * Implemented with the mulx, adcx and adox instructions. The reduction is
 * hard-coded for the P-256 prime (see the constants masked and subtracted at
 * the end): the values passed in m and mp are never read. The register
 * holding m, and later those holding a and b, are used as scratch. These
 * are local copies of the pointers so the caller's variables are unaffected.
 *
 * All loads from a and b are done before the four result words are stored,
 * so r may be the same buffer as a or b.
 *
 * r   Result of multiplication (4 digits).
 * a   First number to multiply in Montgomery form.
 * b   Second number to multiply in Montgomery form.
 * m   Modulus (prime). Not read - register is used as scratch.
 * mp  Montgomery multiplier. Not used.
 */
SP_NOINLINE static void sp_256_mont_mul_avx2_4(sp_digit* r, sp_digit* a,
        sp_digit* b, sp_digit* m, sp_digit mp)
{
    (void)mp;

    __asm__ __volatile__ (
        /* 4x4 word product accumulated into r8 (least significant) up to
         * r15 (most significant). adcx and adox run two separate carry
         * chains (carry flag and overflow flag). */
        "#  A[0] * B[0]\n\t"
        "movq   0(%[b]), %%rdx\n\t"
        "mulxq  0(%[a]), %%r8, %%r9\n\t"
        "#  A[2] * B[0]\n\t"
        "mulxq  16(%[a]), %%r10, %%r11\n\t"
        "#  A[1] * B[0]\n\t"
        "mulxq  8(%[a]), %%rax, %[m]\n\t"
        "xorq   %%r15, %%r15\n\t"
        "adcxq  %%rax, %%r9\n\t"
        "#  A[1] * B[3]\n\t"
        "movq   24(%[b]), %%rdx\n\t"
        "mulxq  8(%[a]), %%r12, %%r13\n\t"
        "adcxq  %[m], %%r10\n\t"
        "#  A[0] * B[1]\n\t"
        "movq   8(%[b]), %%rdx\n\t"
        "mulxq  0(%[a]), %%rax, %[m]\n\t"
        "adoxq  %%rax, %%r9\n\t"
        "#  A[2] * B[1]\n\t"
        "mulxq  16(%[a]), %%rax, %%r14\n\t"
        "adoxq  %[m], %%r10\n\t"
        "adcxq  %%rax, %%r11\n\t"
        "#  A[1] * B[2]\n\t"
        "movq   16(%[b]), %%rdx\n\t"
        "mulxq  8(%[a]), %%rax, %[m]\n\t"
        "adcxq  %%r14, %%r12\n\t"
        "adoxq  %%rax, %%r11\n\t"
        "adcxq  %%r15, %%r13\n\t"
        "adoxq  %[m], %%r12\n\t"
        "#  A[0] * B[2]\n\t"
        "mulxq  0(%[a]), %%rax, %[m]\n\t"
        "adoxq  %%r15, %%r13\n\t"
        "xorq   %%r14, %%r14\n\t"
        "adcxq  %%rax, %%r10\n\t"
        "#  A[1] * B[1]\n\t"
        "movq   8(%[b]), %%rdx\n\t"
        "mulxq  8(%[a]), %%rdx, %%rax\n\t"
        "adcxq  %[m], %%r11\n\t"
        "adoxq  %%rdx, %%r10\n\t"
        "#  A[3] * B[1]\n\t"
        "movq   8(%[b]), %%rdx\n\t"
        "adoxq  %%rax, %%r11\n\t"
        "mulxq  24(%[a]), %%rax, %[m]\n\t"
        "adcxq  %%rax, %%r12\n\t"
        "#  A[2] * B[2]\n\t"
        "movq   16(%[b]), %%rdx\n\t"
        "mulxq  16(%[a]), %%rdx, %%rax\n\t"
        "adcxq  %[m], %%r13\n\t"
        "adoxq  %%rdx, %%r12\n\t"
        "#  A[3] * B[3]\n\t"
        "movq   24(%[b]), %%rdx\n\t"
        "adoxq  %%rax, %%r13\n\t"
        "mulxq  24(%[a]), %%rax, %[m]\n\t"
        "adoxq  %%r15, %%r14\n\t"
        "adcxq  %%rax, %%r14\n\t"
        "#  A[0] * B[3]\n\t"
        "mulxq  0(%[a]), %%rdx, %%rax\n\t"
        "adcxq  %[m], %%r15\n\t"
        "xorq   %[m], %[m]\n\t"
        "adcxq  %%rdx, %%r11\n\t"
        "#  A[3] * B[0]\n\t"
        "movq   0(%[b]), %%rdx\n\t"
        "adcxq  %%rax, %%r12\n\t"
        "mulxq  24(%[a]), %%rdx, %%rax\n\t"
        "adoxq  %%rdx, %%r11\n\t"
        "adoxq  %%rax, %%r12\n\t"
        "#  A[2] * B[3]\n\t"
        "movq   24(%[b]), %%rdx\n\t"
        "mulxq  16(%[a]), %%rdx, %%rax\n\t"
        "adcxq  %%rdx, %%r13\n\t"
        "#  A[3] * B[2]\n\t"
        "movq   16(%[b]), %%rdx\n\t"
        "adcxq  %%rax, %%r14\n\t"
        "mulxq  24(%[a]), %%rax, %%rdx\n\t"
        "adcxq  %[m], %%r15\n\t"
        "adoxq  %%rax, %%r13\n\t"
        "adoxq  %%rdx, %%r14\n\t"
        "adoxq  %[m], %%r15\n\t"
        /* From here on the inputs are no longer read: the registers of a
         * and b become scratch. The low four product words are copied to
         * rax, a, b and rdx to build mu. */
        "# Start Reduction\n\t"
        "movq	%%r8, %%rax\n\t"
        "movq	%%r9, %[a]\n\t"
        "movq	%%r10, %[b]\n\t"
        "movq	%%r11, %%rdx\n\t"
        "# mu = a[0]-a[3] + a[0]-a[2] << 32 << 64 + (a[0] * 2) << 192\n\t"
        "#    - a[0] << 32 << 192\n\t"
        "#   + (a[0] * 2) << 192\n\t"
        "addq	%%r8, %%rdx\n\t"
        "addq	%%r8, %%rdx\n\t"
        "#   a[0]-a[2] << 32\n\t"
        "shlq	$32, %%r8\n\t"
        "shldq	$32, %[a], %%r10\n\t"
        "shldq	$32, %%rax, %%r9\n\t"
        "#   - a[0] << 32 << 192\n\t"
        "subq	%%r8, %%rdx\n\t"
        "#   + a[0]-a[2] << 32 << 64\n\t"
        "addq	%%r8, %[a]\n\t"
        "adcq	%%r9, %[b]\n\t"
        "adcq	%%r10, %%rdx\n\t"
        /* r8 is cleared and from now on collects the carries and borrows
         * out of the top word. */
        "# a += (mu << 256) - (mu << 224) + (mu << 192) + (mu << 96) - mu\n\t"
        "#   a += mu << 256\n\t"
        "xorq	%%r8, %%r8\n\t"
        "addq	%%rax, %%r12\n\t"
        "adcq	%[a], %%r13\n\t"
        "adcq	%[b], %%r14\n\t"
        "adcq	%%rdx, %%r15\n\t"
        "sbbq	$0, %%r8\n\t"
        "#   a += mu << 192\n\t"
        "addq	%%rax, %%r11\n\t"
        "adcq	%[a], %%r12\n\t"
        "adcq	%[b], %%r13\n\t"
        "adcq	%%rdx, %%r14\n\t"
        "adcq	$0, %%r15\n\t"
        "sbbq	$0, %%r8\n\t"
        "# mu <<= 32\n\t"
        "movq	%%rdx, %[m]\n\t"
        "shldq	$32, %[b], %%rdx\n\t"
        "shldq	$32, %[a], %[b]\n\t"
        "shldq	$32, %%rax, %[a]\n\t"
        "shlq	$32, %%rax\n\t"
        "shrq	$32, %[m]\n\t"
        "#   a += (mu << 32) << 64\n\t"
        "addq	%[b], %%r11\n\t"
        "adcq	%%rdx, %%r12\n\t"
        "adcq	%[m], %%r13\n\t"
        "adcq	$0, %%r14\n\t"
        "adcq	$0, %%r15\n\t"
        "sbbq	$0, %%r8\n\t"
        /* The two modulus constants are loaded in between the subtraction
         * steps; movq does not alter the flags. */
        "#   a -= (mu << 32) << 192\n\t"
        "subq	%%rax, %%r11\n\t"
        "movq	$0xffffffff, %%rax\n\t"
        "sbbq	%[a], %%r12\n\t"
        "movq	$0xffffffff00000001, %[a]\n\t"
        "sbbq	%[b], %%r13\n\t"
        "sbbq	%%rdx, %%r14\n\t"
        "sbbq	%[m], %%r15\n\t"
        "adcq	$0, %%r8\n\t"
        /* r8 is used as a mask: the modulus words ANDed with it are
         * subtracted from the result held in r12..r15. */
        "# mask m and sub from result if overflow\n\t"
        "#  m[0] = -1 & mask = mask\n\t"
        "andq	%%r8, %%rax\n\t"
        "#  m[2] =  0 & mask = 0\n\t"
        "andq	%%r8, %[a]\n\t"
        "subq	%%r8, %%r12\n\t"
        "sbbq	%%rax, %%r13\n\t"
        "sbbq	$0, %%r14\n\t"
        "sbbq	%[a], %%r15\n\t"
        "movq	%%r12, 0(%[r])\n\t"
        "movq	%%r13, 8(%[r])\n\t"
        "movq	%%r14, 16(%[r])\n\t"
        "movq	%%r15, 24(%[r])\n\t"
        /* m, a and b are read-write operands as their registers are
         * overwritten above. */
        : [m] "+r" (m), [a] "+r" (a), [b] "+r" (b)
        : [r] "r" (r)
        : "memory", "rax", "rdx", "r8", "r9", "r10", "r11",
          "r12", "r13", "r14", "r15"
    );
}

/* Square the Montgomery form number mod the modulus (prime). (r = a * a mod m)
 *
 * Implemented with the mulx, adcx and adox instructions. The reduction is
 * hard-coded for the P-256 prime (see the constants masked and subtracted at
 * the end): the values passed in m and mp are never read, their registers
 * are used as scratch from the first cross product on. The register holding
 * a is reused as scratch once the product is complete. These are local
 * copies so the caller's variables are unaffected.
 *
 * All loads from a are done before the four result words are stored, so r
 * may be the same buffer as a.
 *
 * r   Result of squaring (4 digits).
 * a   Number to square in Montgomery form.
 * m   Modulus (prime). Not read - register is used as scratch.
 * mp  Montgomery multiplier. Not read - register is used as scratch.
 */
SP_NOINLINE static void sp_256_mont_sqr_avx2_4(sp_digit* r, sp_digit* a,
        sp_digit* m, sp_digit mp)
{
    __asm__ __volatile__ (
        /* Sum of the cross products A[i] * A[j], i != j, into r9..r14. */
        "# A[0] * A[1]\n\t"
        "movq   0(%[a]), %%rdx\n\t"
        "mulxq  8(%[a]), %%r9, %%r10\n\t"
        "# A[0] * A[3]\n\t"
        "mulxq  24(%[a]), %%r11, %%r12\n\t"
        "# A[2] * A[1]\n\t"
        "movq   16(%[a]), %%rdx\n\t"
        "mulxq  8(%[a]), %[mp], %[m]\n\t"
        "xorq   %%r15, %%r15\n\t"
        "adoxq  %[mp], %%r11\n\t"
        "# A[2] * A[3]\n\t"
        "mulxq  24(%[a]), %%r13, %%r14\n\t"
        "adoxq  %[m], %%r12\n\t"
        "# A[2] * A[0]\n\t"
        "mulxq  0(%[a]), %[mp], %[m]\n\t"
        "adoxq  %%r15, %%r13\n\t"
        "adcxq  %[mp], %%r10\n\t"
        "adoxq  %%r15, %%r14\n\t"
        "# A[1] * A[3]\n\t"
        "movq   8(%[a]), %%rdx\n\t"
        "mulxq  24(%[a]), %%rax, %%r8\n\t"
        "adcxq  %[m], %%r11\n\t"
        "adcxq  %%rax, %%r12\n\t"
        "adcxq  %%r8, %%r13\n\t"
        "adcxq  %%r15, %%r14\n\t"
        /* The cross products are doubled on the carry flag chain (adcx of
         * a register with itself) while the squares A[i] * A[i] are added
         * in on the overflow flag chain (adox). Result is in r8..r15. */
        "# Double with Carry Flag\n\t"
        "xorq   %%r15, %%r15\n\t"
        "# A[0] * A[0]\n\t"
        "movq   0(%[a]), %%rdx\n\t"
        "mulxq  %%rdx, %%r8, %%rax\n\t"
        "adcxq  %%r9, %%r9\n\t"
        "# A[1] * A[1]\n\t"
        "movq   8(%[a]), %%rdx\n\t"
        "mulxq  %%rdx, %[mp], %[m]\n\t"
        "adcxq  %%r10, %%r10\n\t"
        "adoxq  %%rax, %%r9\n\t"
        "adcxq  %%r11, %%r11\n\t"
        "adoxq  %[mp], %%r10\n\t"
        "# A[2] * A[2]\n\t"
        "movq   16(%[a]), %%rdx\n\t"
        "mulxq  %%rdx, %%rax, %[mp]\n\t"
        "adcxq  %%r12, %%r12\n\t"
        "adoxq  %[m], %%r11\n\t"
        "adcxq  %%r13, %%r13\n\t"
        "adoxq  %%rax, %%r12\n\t"
        "# A[3] * A[3]\n\t"
        "movq   24(%[a]), %%rdx\n\t"
        "mulxq  %%rdx, %%rax, %[m]\n\t"
        "adcxq  %%r14, %%r14\n\t"
        "adoxq  %[mp], %%r13\n\t"
        "adcxq  %%r15, %%r15\n\t"
        "adoxq  %%rax, %%r14\n\t"
        "adoxq  %[m], %%r15\n\t"
        /* From here on the input is no longer read: the register of a
         * becomes scratch. The low four product words are copied to rax,
         * a, mp and rdx to build mu. */
        "# Start Reduction\n\t"
        "movq	%%r8, %%rax\n\t"
        "movq	%%r9, %[a]\n\t"
        "movq	%%r10, %[mp]\n\t"
        "movq	%%r11, %%rdx\n\t"
        "# mu = a[0]-a[3] + a[0]-a[2] << 32 << 64 + (a[0] * 2) << 192\n\t"
        "#    - a[0] << 32 << 192\n\t"
        "#   + (a[0] * 2) << 192\n\t"
        "addq	%%r8, %%rdx\n\t"
        "addq	%%r8, %%rdx\n\t"
        "#   a[0]-a[2] << 32\n\t"
        "shlq	$32, %%r8\n\t"
        "shldq	$32, %[a], %%r10\n\t"
        "shldq	$32, %%rax, %%r9\n\t"
        "#   - a[0] << 32 << 192\n\t"
        "subq	%%r8, %%rdx\n\t"
        "#   + a[0]-a[2] << 32 << 64\n\t"
        "addq	%%r8, %[a]\n\t"
        "adcq	%%r9, %[mp]\n\t"
        "adcq	%%r10, %%rdx\n\t"
        /* r8 is cleared and from now on collects the carries and borrows
         * out of the top word. */
        "# a += (mu << 256) - (mu << 224) + (mu << 192) + (mu << 96) - mu\n\t"
        "#   a += mu << 256\n\t"
        "xorq	%%r8, %%r8\n\t"
        "addq	%%rax, %%r12\n\t"
        "adcq	%[a], %%r13\n\t"
        "adcq	%[mp], %%r14\n\t"
        "adcq	%%rdx, %%r15\n\t"
        "sbbq	$0, %%r8\n\t"
        "#   a += mu << 192\n\t"
        "addq	%%rax, %%r11\n\t"
        "adcq	%[a], %%r12\n\t"
        "adcq	%[mp], %%r13\n\t"
        "adcq	%%rdx, %%r14\n\t"
        "adcq	$0, %%r15\n\t"
        "sbbq	$0, %%r8\n\t"
        "# mu <<= 32\n\t"
        "movq	%%rdx, %[m]\n\t"
        "shldq	$32, %[mp], %%rdx\n\t"
        "shldq	$32, %[a], %[mp]\n\t"
        "shldq	$32, %%rax, %[a]\n\t"
        "shlq	$32, %%rax\n\t"
        "shrq	$32, %[m]\n\t"
        "#   a += (mu << 32) << 64\n\t"
        "addq	%[mp], %%r11\n\t"
        "adcq	%%rdx, %%r12\n\t"
        "adcq	%[m], %%r13\n\t"
        "adcq	$0, %%r14\n\t"
        "adcq	$0, %%r15\n\t"
        "sbbq	$0, %%r8\n\t"
        /* The two modulus constants are loaded in between the subtraction
         * steps; movq does not alter the flags. */
        "#   a -= (mu << 32) << 192\n\t"
        "subq	%%rax, %%r11\n\t"
        "movq	$0xffffffff, %%rax\n\t"
        "sbbq	%[a], %%r12\n\t"
        "movq	$0xffffffff00000001, %[a]\n\t"
        "sbbq	%[mp], %%r13\n\t"
        "sbbq	%%rdx, %%r14\n\t"
        "sbbq	%[m], %%r15\n\t"
        "adcq	$0, %%r8\n\t"
        /* r8 is used as a mask: the modulus words ANDed with it are
         * subtracted from the result held in r12..r15. */
        "# mask m and sub from result if overflow\n\t"
        "#  m[0] = -1 & mask = mask\n\t"
        "andq	%%r8, %%rax\n\t"
        "#  m[2] =  0 & mask = 0\n\t"
        "andq	%%r8, %[a]\n\t"
        "subq	%%r8, %%r12\n\t"
        "sbbq	%%rax, %%r13\n\t"
        "sbbq	$0, %%r14\n\t"
        "sbbq	%[a], %%r15\n\t"
        "movq	%%r12, 0(%[r])\n\t"
        "movq	%%r13, 8(%[r])\n\t"
        "movq	%%r14, 16(%[r])\n\t"
        "movq	%%r15, 24(%[r])\n\t"
        /* m, a and mp are read-write operands as their registers are
         * overwritten above. */
        : [m] "+r" (m), [a] "+r" (a), [mp] "+r" (mp)
        : [r] "r" (r)
        : "memory", "rax", "rdx", "r8", "r9", "r10", "r11",
          "r12", "r13", "r14", "r15"
    );
}

#ifndef WOLFSSL_SP_SMALL
/* Repeatedly square a Montgomery form number. (r = a ^ (2 ^ n) mod m)
 *
 * One squaring is always performed, even when n is less than 1.
 *
 * r   Result of the repeated squaring.
 * a   Number to square in Montgomery form.
 * n   Number of times to square.
 * m   Modulus (prime).
 * mp  Montgomery multiplier.
 */
static void sp_256_mont_sqr_n_avx2_4(sp_digit* r, sp_digit* a, int n,
        sp_digit* m, sp_digit mp)
{
    int left = n;

    /* The first squaring reads a and puts the value into r. */
    sp_256_mont_sqr_avx2_4(r, a, m, mp);
    /* All further squarings are done in place on r. */
    while (left > 1) {
        sp_256_mont_sqr_avx2_4(r, r, m, mp);
        left--;
    }
}

#endif /* !WOLFSSL_SP_SMALL */

/* Invert the number, in Montgomery form, modulo the modulus (prime) of the
 * P256 curve. (r = 1 / a mod m)
 *
 * The inverse is calculated by exponentiation. With WOLFSSL_SP_SMALL a
 * square-and-multiply loop over the bits of p256_mod_2 is used, otherwise a
 * fixed chain of squarings and multiplications builds the exponent
 * ffffffff00000001000000000000000000000000fffffffffffffffffffffffd.
 *
 * r   Inverse result. Only written by the last operation, after all reads
 *     of a.
 * a   Number to invert.
 * td  Temporary data. The fixed chain places values at td, td + 2*4 and
 *     td + 4*4; the small code only uses the first 4 digits.
 */
static void sp_256_mont_inv_avx2_4(sp_digit* r, sp_digit* a, sp_digit* td)
{
#ifdef WOLFSSL_SP_SMALL
    sp_digit* t = td;
    int i;

    /* t starts as a, which accounts for the top bit (bit 255) of the
     * exponent. The remaining bits are processed from bit 254 down. */
    XMEMCPY(t, a, sizeof(sp_digit) * 4);
    for (i=254; i>=0; i--) {
        sp_256_mont_sqr_avx2_4(t, t, p256_mod, p256_mp_mod);
        if (p256_mod_2[i / 64] & ((sp_digit)1 << (i % 64)))
            sp_256_mont_mul_avx2_4(t, t, a, p256_mod, p256_mp_mod);
    }
    XMEMCPY(r, t, sizeof(sp_digit) * 4);
#else
    sp_digit* t = td;
    sp_digit* t2 = td + 2 * 4;
    sp_digit* t3 = td + 4 * 4;

    /* Exponents in the comments below are in hexadecimal. */
    /* t = a^2 */
    sp_256_mont_sqr_avx2_4(t, a, p256_mod, p256_mp_mod);
    /* t = a^3 = t * a */
    sp_256_mont_mul_avx2_4(t, t, a, p256_mod, p256_mp_mod);
    /* t2= a^c = t ^ 2 ^ 2 */
    sp_256_mont_sqr_n_avx2_4(t2, t, 2, p256_mod, p256_mp_mod);
    /* t3= a^d = t2 * a */
    sp_256_mont_mul_avx2_4(t3, t2, a, p256_mod, p256_mp_mod);
    /* t = a^f = t2 * t */
    sp_256_mont_mul_avx2_4(t, t2, t, p256_mod, p256_mp_mod);
    /* t2= a^f0 = t ^ 2 ^ 4 */
    sp_256_mont_sqr_n_avx2_4(t2, t, 4, p256_mod, p256_mp_mod);
    /* t3= a^fd = t2 * t3 */
    sp_256_mont_mul_avx2_4(t3, t2, t3, p256_mod, p256_mp_mod);
    /* t = a^ff = t2 * t */
    sp_256_mont_mul_avx2_4(t, t2, t, p256_mod, p256_mp_mod);
    /* t2= a^ff00 = t ^ 2 ^ 8 */
    sp_256_mont_sqr_n_avx2_4(t2, t, 8, p256_mod, p256_mp_mod);
    /* t3= a^fffd = t2 * t3 */
    sp_256_mont_mul_avx2_4(t3, t2, t3, p256_mod, p256_mp_mod);
    /* t = a^ffff = t2 * t */
    sp_256_mont_mul_avx2_4(t, t2, t, p256_mod, p256_mp_mod);
    /* t2= a^ffff0000 = t ^ 2 ^ 16 */
    sp_256_mont_sqr_n_avx2_4(t2, t, 16, p256_mod, p256_mp_mod);
    /* t3= a^fffffffd = t2 * t3 */
    sp_256_mont_mul_avx2_4(t3, t2, t3, p256_mod, p256_mp_mod);
    /* t = a^ffffffff = t2 * t */
    sp_256_mont_mul_avx2_4(t, t2, t, p256_mod, p256_mp_mod);
    /* t2= a^ffffffff00000000 = t ^ 2 ^ 32  */
    sp_256_mont_sqr_n_avx2_4(t2, t, 32, p256_mod, p256_mp_mod);
    /* t = a^ffffffffffffffff = t2 * t */
    sp_256_mont_mul_avx2_4(t, t2, t, p256_mod, p256_mp_mod);
    /* t2= a^ffffffff00000001 = t2 * a */
    sp_256_mont_mul_avx2_4(t2, t2, a, p256_mod, p256_mp_mod);
    /* t2= a^ffffffff000000010000000000000000000000000000000000000000
     *   = t2 ^ 2 ^ 160 */
    sp_256_mont_sqr_n_avx2_4(t2, t2, 160, p256_mod, p256_mp_mod);
    /* t2= a^ffffffff00000001000000000000000000000000ffffffffffffffff
     *   = t2 * t */
    sp_256_mont_mul_avx2_4(t2, t2, t, p256_mod, p256_mp_mod);
    /* t2= a^ffffffff00000001000000000000000000000000ffffffffffffffff00000000
     *   = t2 ^ 2 ^ 32 */
    sp_256_mont_sqr_n_avx2_4(t2, t2, 32, p256_mod, p256_mp_mod);
    /* r = a^ffffffff00000001000000000000000000000000fffffffffffffffffffffffd
     *   = t2 * t3 */
    sp_256_mont_mul_avx2_4(r, t2, t3, p256_mod, p256_mp_mod);
#endif /* WOLFSSL_SP_SMALL */
}

/* Convert a Montgomery form projective co-ordinate point into an affine
 * point that is no longer in Montgomery form.
 *
 * The x-ordinate is divided by z^2 and the y-ordinate by z^3. Each is then
 * reduced out of Montgomery form and brought below the modulus. The
 * z-ordinate of the result is set to 1.
 *
 * r  Resulting affine co-ordinate point.
 * p  Montgomery form projective co-ordinate point.
 * t  Temporary ordinate data.
 */
static void sp_256_map_avx2_4(sp_point* r, sp_point* p, sp_digit* t)
{
    sp_digit* zinv = t;
    sp_digit* zinv2 = t + 2*4;
    int64_t cmp;

    /* zinv = 1/z - the area from zinv2 on is the scratch of the inversion */
    sp_256_mont_inv_avx2_4(zinv, p->z, zinv2);

    /* zinv2 = 1/z^2 */
    sp_256_mont_sqr_avx2_4(zinv2, zinv, p256_mod, p256_mp_mod);
    /* zinv = 1/z^3 */
    sp_256_mont_mul_avx2_4(zinv, zinv2, zinv, p256_mod, p256_mp_mod);

    /* X ordinate: multiply by 1/z^2, clear top half, Montgomery reduce. */
    sp_256_mont_mul_avx2_4(r->x, p->x, zinv2, p256_mod, p256_mp_mod);
    XMEMSET(r->x + 4, 0, sizeof(r->x) / 2);
    sp_256_mont_reduce_4(r->x, p256_mod, p256_mp_mod);
    /* Subtract the modulus when x is not below it. */
    cmp = sp_256_cmp_4(r->x, p256_mod);
    sp_256_cond_sub_4(r->x, r->x, p256_mod, 0 - (cmp >= 0));
    sp_256_norm_4(r->x);

    /* Y ordinate: multiply by 1/z^3, clear top half, Montgomery reduce. */
    sp_256_mont_mul_avx2_4(r->y, p->y, zinv, p256_mod, p256_mp_mod);
    XMEMSET(r->y + 4, 0, sizeof(r->y) / 2);
    sp_256_mont_reduce_4(r->y, p256_mod, p256_mp_mod);
    /* Subtract the modulus when y is not below it. */
    cmp = sp_256_cmp_4(r->y, p256_mod);
    sp_256_cond_sub_4(r->y, r->y, p256_mod, 0 - (cmp >= 0));
    sp_256_norm_4(r->y);

    /* Z ordinate of an affine point is one. */
    XMEMSET(r->z, 0, sizeof(r->z));
    r->z[0] = 1;

}

/* Double the Montgomery form projective point p.
 *
 * When p is the point at infinity the arithmetic is still performed, but on
 * a local dummy point, so that the same operations are executed either way.
 * The dummy point is zeroed first so that no uninitialised data is read.
 * r then holds a copy of p (or is left untouched when r and p are the same
 * object).
 *
 * r  Result of doubling point.
 * p  Point to double. The infinity field is used as an array index and must
 *    be 0 or 1.
 * t  Temporary ordinate data. Values are placed at t and t + 2*4.
 */
static void sp_256_proj_point_dbl_avx2_4(sp_point* r, sp_point* p, sp_digit* t)
{
    sp_point *rp[2];
    sp_point tp;
    sp_digit* t1 = t;
    sp_digit* t2 = t + 2*4;
    sp_digit* x;
    sp_digit* y;
    sp_digit* z;
    int i;

    /* When infinity don't double point passed in - constant time. */
    rp[0] = r;
    rp[1] = &tp;
    /* The dummy point is operated on when p is infinity - give it defined
     * contents, as is done when adding points. */
    XMEMSET(&tp, 0, sizeof(tp));
    x = rp[p->infinity]->x;
    y = rp[p->infinity]->y;
    z = rp[p->infinity]->z;
    /* Put point to double into result - good for infinity. */
    if (r != p) {
        for (i=0; i<4; i++)
            r->x[i] = p->x[i];
        for (i=0; i<4; i++)
            r->y[i] = p->y[i];
        for (i=0; i<4; i++)
            r->z[i] = p->z[i];
        r->infinity = p->infinity;
    }

    /* T1 = Z * Z */
    sp_256_mont_sqr_avx2_4(t1, z, p256_mod, p256_mp_mod);
    /* Z = Y * Z */
    sp_256_mont_mul_avx2_4(z, y, z, p256_mod, p256_mp_mod);
    /* Z = 2Z */
    sp_256_mont_dbl_4(z, z, p256_mod);
    /* T2 = X - T1 */
    sp_256_mont_sub_4(t2, x, t1, p256_mod);
    /* T1 = X + T1 */
    sp_256_mont_add_4(t1, x, t1, p256_mod);
    /* T2 = T1 * T2 */
    sp_256_mont_mul_avx2_4(t2, t1, t2, p256_mod, p256_mp_mod);
    /* T1 = 3T2 */
    sp_256_mont_tpl_4(t1, t2, p256_mod);
    /* Y = 2Y */
    sp_256_mont_dbl_4(y, y, p256_mod);
    /* Y = Y * Y */
    sp_256_mont_sqr_avx2_4(y, y, p256_mod, p256_mp_mod);
    /* T2 = Y * Y */
    sp_256_mont_sqr_avx2_4(t2, y, p256_mod, p256_mp_mod);
    /* T2 = T2/2 */
    sp_256_div2_4(t2, t2, p256_mod);
    /* Y = Y * X */
    sp_256_mont_mul_avx2_4(y, y, x, p256_mod, p256_mp_mod);
    /* X = T1 * T1 */
    sp_256_mont_mul_avx2_4(x, t1, t1, p256_mod, p256_mp_mod);
    /* X = X - Y */
    sp_256_mont_sub_4(x, x, y, p256_mod);
    /* X = X - Y */
    sp_256_mont_sub_4(x, x, y, p256_mod);
    /* Y = Y - X */
    sp_256_mont_sub_4(y, y, x, p256_mod);
    /* Y = Y * T1 */
    sp_256_mont_mul_avx2_4(y, y, t1, p256_mod, p256_mp_mod);
    /* Y = Y - T2 */
    sp_256_mont_sub_4(y, y, t2, p256_mod);

}

/* Double the Montgomery form projective point p a number of times.
 *
 * Y is kept doubled and W = Z^4 is carried from one doubling to the next,
 * so each iteration needs fewer operations than a stand-alone doubling. Y is
 * halved once at the end.
 *
 * When p is the point at infinity the arithmetic is still performed, but on
 * a local dummy point. The dummy point is zeroed first so that no
 * uninitialised data is read.
 *
 * r  Result of repeated doubling of point.
 * p  Point to double. The infinity field is used as an array index and must
 *    be 0 or 1.
 * n  Number of times to double. Must not be negative.
 * t  Temporary ordinate data. Values are placed at t, t + 2*4, t + 4*4,
 *    t + 6*4 and t + 8*4.
 */
static void sp_256_proj_point_dbl_n_avx2_4(sp_point* r, sp_point* p, int n,
        sp_digit* t)
{
    sp_point *rp[2];
    sp_point tp;
    sp_digit* w = t;
    sp_digit* a = t + 2*4;
    sp_digit* b = t + 4*4;
    sp_digit* t1 = t + 6*4;
    sp_digit* t2 = t + 8*4;
    sp_digit* x;
    sp_digit* y;
    sp_digit* z;
    int i;

    /* When infinity work on the dummy point rather than the result. */
    rp[0] = r;
    rp[1] = &tp;
    /* The dummy point is operated on when p is infinity - give it defined
     * contents, as is done when adding points. */
    XMEMSET(&tp, 0, sizeof(tp));
    x = rp[p->infinity]->x;
    y = rp[p->infinity]->y;
    z = rp[p->infinity]->z;
    /* Put point to double into result - good for infinity. */
    if (r != p) {
        for (i=0; i<4; i++)
            r->x[i] = p->x[i];
        for (i=0; i<4; i++)
            r->y[i] = p->y[i];
        for (i=0; i<4; i++)
            r->z[i] = p->z[i];
        r->infinity = p->infinity;
    }

    /* Y = 2*Y */
    sp_256_mont_dbl_4(y, y, p256_mod);
    /* W = Z^4 */
    sp_256_mont_sqr_avx2_4(w, z, p256_mod, p256_mp_mod);
    sp_256_mont_sqr_avx2_4(w, w, p256_mod, p256_mp_mod);
    while (n--) {
        /* A = 3*(X^2 - W) */
        sp_256_mont_sqr_avx2_4(t1, x, p256_mod, p256_mp_mod);
        sp_256_mont_sub_4(t1, t1, w, p256_mod);
        sp_256_mont_tpl_4(a, t1, p256_mod);
        /* B = X*Y^2 */
        sp_256_mont_sqr_avx2_4(t2, y, p256_mod, p256_mp_mod);
        sp_256_mont_mul_avx2_4(b, t2, x, p256_mod, p256_mp_mod);
        /* X = A^2 - 2B */
        sp_256_mont_sqr_avx2_4(x, a, p256_mod, p256_mp_mod);
        sp_256_mont_dbl_4(t1, b, p256_mod);
        sp_256_mont_sub_4(x, x, t1, p256_mod);
        /* Z = Z*Y */
        sp_256_mont_mul_avx2_4(z, z, y, p256_mod, p256_mp_mod);
        /* t2 = Y^4 */
        sp_256_mont_sqr_avx2_4(t2, t2, p256_mod, p256_mp_mod);
        /* W is not needed after the last doubling. */
        if (n) {
            /* W = W*Y^4 */
            sp_256_mont_mul_avx2_4(w, w, t2, p256_mod, p256_mp_mod);
        }
        /* y = 2*A*(B - X) - Y^4 */
        sp_256_mont_sub_4(y, b, x, p256_mod);
        sp_256_mont_mul_avx2_4(y, y, a, p256_mod, p256_mp_mod);
        sp_256_mont_dbl_4(y, y, p256_mod);
        sp_256_mont_sub_4(y, y, t2, p256_mod);
    }
    /* Y = Y/2 */
    sp_256_div2_4(y, y, p256_mod);
}

/* Add two Montgomery form projective points.
 *
 * When the points have equal x and z ordinates and the y-ordinates are equal
 * or the negative of each other, the doubling routine is called instead.
 * The AVX2 doubling is used so that the whole operation stays on the AVX2
 * code path.
 *
 * When either point is the point at infinity the arithmetic is performed on
 * a zeroed local dummy point and r is a copy of q when p is infinity and a
 * copy of p otherwise.
 *
 * r  Result of addition.
 * p  First point to add. The infinity field is used as an array index and
 *    must be 0 or 1.
 * q  Second point to add. The infinity field must be 0 or 1.
 * t  Temporary ordinate data. Values are placed at t, t + 2*4, t + 4*4,
 *    t + 6*4 and t + 8*4.
 */
static void sp_256_proj_point_add_avx2_4(sp_point* r, sp_point* p, sp_point* q,
        sp_digit* t)
{
    sp_point *ap[2];
    sp_point *rp[2];
    sp_point tp;
    sp_digit* t1 = t;
    sp_digit* t2 = t + 2*4;
    sp_digit* t3 = t + 4*4;
    sp_digit* t4 = t + 6*4;
    sp_digit* t5 = t + 8*4;
    sp_digit* x;
    sp_digit* y;
    sp_digit* z;
    int i;

    /* Ensure only the first point is the same as the result. */
    if (q == r) {
        sp_point* a = p;
        p = q;
        q = a;
    }

    /* Check double */
    /* t1 = -Y2 */
    sp_256_sub_4(t1, p256_mod, q->y);
    sp_256_norm_4(t1);
    if (sp_256_cmp_equal_4(p->x, q->x) & sp_256_cmp_equal_4(p->z, q->z) &
        (sp_256_cmp_equal_4(p->y, q->y) | sp_256_cmp_equal_4(p->y, t1))) {
        sp_256_proj_point_dbl_avx2_4(r, p, t);
    }
    else {
        /* Work on the result unless one of the points is infinity, then
         * work on the zeroed dummy point. */
        rp[0] = r;
        rp[1] = &tp;
        XMEMSET(&tp, 0, sizeof(tp));
        x = rp[p->infinity | q->infinity]->x;
        y = rp[p->infinity | q->infinity]->y;
        z = rp[p->infinity | q->infinity]->z;

        /* Result starts as p, or as q when p is infinity. */
        ap[0] = p;
        ap[1] = q;
        for (i=0; i<4; i++)
            r->x[i] = ap[p->infinity]->x[i];
        for (i=0; i<4; i++)
            r->y[i] = ap[p->infinity]->y[i];
        for (i=0; i<4; i++)
            r->z[i] = ap[p->infinity]->z[i];
        r->infinity = ap[p->infinity]->infinity;

        /* U1 = X1*Z2^2 */
        sp_256_mont_sqr_avx2_4(t1, q->z, p256_mod, p256_mp_mod);
        sp_256_mont_mul_avx2_4(t3, t1, q->z, p256_mod, p256_mp_mod);
        sp_256_mont_mul_avx2_4(t1, t1, x, p256_mod, p256_mp_mod);
        /* U2 = X2*Z1^2 */
        sp_256_mont_sqr_avx2_4(t2, z, p256_mod, p256_mp_mod);
        sp_256_mont_mul_avx2_4(t4, t2, z, p256_mod, p256_mp_mod);
        sp_256_mont_mul_avx2_4(t2, t2, q->x, p256_mod, p256_mp_mod);
        /* S1 = Y1*Z2^3 */
        sp_256_mont_mul_avx2_4(t3, t3, y, p256_mod, p256_mp_mod);
        /* S2 = Y2*Z1^3 */
        sp_256_mont_mul_avx2_4(t4, t4, q->y, p256_mod, p256_mp_mod);
        /* H = U2 - U1 */
        sp_256_mont_sub_4(t2, t2, t1, p256_mod);
        /* R = S2 - S1 */
        sp_256_mont_sub_4(t4, t4, t3, p256_mod);
        /* Z3 = H*Z1*Z2 */
        sp_256_mont_mul_avx2_4(z, z, q->z, p256_mod, p256_mp_mod);
        sp_256_mont_mul_avx2_4(z, z, t2, p256_mod, p256_mp_mod);
        /* X3 = R^2 - H^3 - 2*U1*H^2 */
        sp_256_mont_sqr_avx2_4(x, t4, p256_mod, p256_mp_mod);
        sp_256_mont_sqr_avx2_4(t5, t2, p256_mod, p256_mp_mod);
        sp_256_mont_mul_avx2_4(y, t1, t5, p256_mod, p256_mp_mod);
        sp_256_mont_mul_avx2_4(t5, t5, t2, p256_mod, p256_mp_mod);
        sp_256_mont_sub_4(x, x, t5, p256_mod);
        sp_256_mont_dbl_4(t1, y, p256_mod);
        sp_256_mont_sub_4(x, x, t1, p256_mod);
        /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */
        sp_256_mont_sub_4(y, y, x, p256_mod);
        sp_256_mont_mul_avx2_4(y, y, t4, p256_mod, p256_mp_mod);
        sp_256_mont_mul_avx2_4(t5, t5, t3, p256_mod, p256_mp_mod);
        sp_256_mont_sub_4(y, y, t5, p256_mod);
    }
}

/* Double the Montgomery form projective point p a number of times and keep
 * the result of every doubling.
 *
 * The result of the i-th doubling is put in r[(2^i)*m], for i from 1 to n,
 * and is marked as not being the point at infinity. The doubled working
 * copy of Y lives in the y-ordinate of the last entry, r[(2^n)*m], until
 * that entry itself is stored.
 *
 * r  Array of points to store the results in.
 * p  Point to double.
 * n  Number of times to double.
 * m  Factor that the power of two is multiplied by to get the array index.
 * t  Temporary ordinate data.
 */
static void sp_256_proj_point_dbl_n_store_avx2_4(sp_point* r, sp_point* p,
        int n, int m, sp_digit* t)
{
    sp_digit* z4 = t;
    sp_digit* va = t + 2*4;
    sp_digit* vb = t + 4*4;
    sp_digit* s1 = t + 6*4;
    sp_digit* s2 = t + 8*4;
    sp_digit* cx = r[2*m].x;
    sp_digit* cy = r[(1<<n)*m].y;
    sp_digit* cz = r[2*m].z;
    int j;
    int step;

    /* Working ordinates start out as those of p. */
    for (j = 0; j < 4; j++) {
        cx[j] = p->x[j];
        cy[j] = p->y[j];
        cz[j] = p->z[j];
    }

    /* Y = 2*Y */
    sp_256_mont_dbl_4(cy, cy, p256_mod);
    /* W = Z^4 */
    sp_256_mont_sqr_avx2_4(z4, cz, p256_mod, p256_mp_mod);
    sp_256_mont_sqr_avx2_4(z4, z4, p256_mod, p256_mp_mod);
    for (step = 1; step <= n; step++) {
        /* Entry that receives the result of this doubling. */
        sp_point* out = &r[(1<<step)*m];

        /* A = 3*(X^2 - W) */
        sp_256_mont_sqr_avx2_4(s1, cx, p256_mod, p256_mp_mod);
        sp_256_mont_sub_4(s1, s1, z4, p256_mod);
        sp_256_mont_tpl_4(va, s1, p256_mod);
        /* B = X*Y^2 */
        sp_256_mont_sqr_avx2_4(s2, cy, p256_mod, p256_mp_mod);
        sp_256_mont_mul_avx2_4(vb, s2, cx, p256_mod, p256_mp_mod);
        /* Old X is no longer needed - new X goes straight into the entry. */
        cx = out->x;
        /* X = A^2 - 2B */
        sp_256_mont_sqr_avx2_4(cx, va, p256_mod, p256_mp_mod);
        sp_256_mont_dbl_4(s1, vb, p256_mod);
        sp_256_mont_sub_4(cx, cx, s1, p256_mod);
        /* Z = Z*Y, written into the entry which becomes the next Z */
        sp_256_mont_mul_avx2_4(out->z, cz, cy, p256_mod, p256_mp_mod);
        cz = out->z;
        /* t2 = Y^4 */
        sp_256_mont_sqr_avx2_4(s2, s2, p256_mod, p256_mp_mod);
        /* W is not needed after the last doubling. */
        if (step != n) {
            /* W = W*Y^4 */
            sp_256_mont_mul_avx2_4(z4, z4, s2, p256_mod, p256_mp_mod);
        }
        /* y = 2*A*(B - X) - Y^4 */
        sp_256_mont_sub_4(cy, vb, cx, p256_mod);
        sp_256_mont_mul_avx2_4(cy, cy, va, p256_mod, p256_mp_mod);
        sp_256_mont_dbl_4(cy, cy, p256_mod);
        sp_256_mont_sub_4(cy, cy, s2, p256_mod);

        /* Stored Y is half of the doubled working Y. */
        sp_256_div2_4(out->y, cy, p256_mod);
        out->infinity = 0;
    }
}

/* Calculate both the sum and the difference of two Montgomery form
 * projective points, sharing the work common to the two.
 *
 * Neither infinity nor equal points are checked for; both results are
 * marked as not being the point at infinity.
 *
 * ra  Result of addition (p + q).
 * rs  Result of subtraction (p - q).
 * p   First point.
 * q   Second point.
 * t   Temporary ordinate data. Six values are placed at t, t + 2*4, ...,
 *     t + 10*4.
 */
static void sp_256_proj_point_add_sub_avx2_4(sp_point* ra, sp_point* rs,
        sp_point* p, sp_point* q, sp_digit* t)
{
    sp_digit* w1 = t;
    sp_digit* w2 = t + 2*4;
    sp_digit* w3 = t + 4*4;
    sp_digit* w4 = t + 6*4;
    sp_digit* w5 = t + 8*4;
    sp_digit* w6 = t + 10*4;
    sp_digit* ax = ra->x;
    sp_digit* ay = ra->y;
    sp_digit* az = ra->z;
    sp_digit* sx = rs->x;
    sp_digit* sy = rs->y;
    sp_digit* sz = rs->z;

    ra->infinity = 0;
    rs->infinity = 0;
    /* The sum is calculated in place on a copy of p. */
    XMEMCPY(ax, p->x, sizeof(p->x) / 2);
    XMEMCPY(ay, p->y, sizeof(p->y) / 2);
    XMEMCPY(az, p->z, sizeof(p->z) / 2);

    /* w1 = U1 = X1*Z2^2, w3 = Z2^3 */
    sp_256_mont_sqr_avx2_4(w1, q->z, p256_mod, p256_mp_mod);
    sp_256_mont_mul_avx2_4(w3, w1, q->z, p256_mod, p256_mp_mod);
    sp_256_mont_mul_avx2_4(w1, w1, ax, p256_mod, p256_mp_mod);
    /* w2 = U2 = X2*Z1^2, w4 = Z1^3 */
    sp_256_mont_sqr_avx2_4(w2, az, p256_mod, p256_mp_mod);
    sp_256_mont_mul_avx2_4(w4, w2, az, p256_mod, p256_mp_mod);
    sp_256_mont_mul_avx2_4(w2, w2, q->x, p256_mod, p256_mp_mod);
    /* w3 = S1 = Y1*Z2^3 */
    sp_256_mont_mul_avx2_4(w3, w3, ay, p256_mod, p256_mp_mod);
    /* w4 = S2 = Y2*Z1^3 */
    sp_256_mont_mul_avx2_4(w4, w4, q->y, p256_mod, p256_mp_mod);
    /* w2 = H = U2 - U1 */
    sp_256_mont_sub_4(w2, w2, w1, p256_mod);
    /* w6 = RS = S2 + S1 */
    sp_256_mont_add_4(w6, w4, w3, p256_mod);
    /* w4 = R = S2 - S1 */
    sp_256_mont_sub_4(w4, w4, w3, p256_mod);
    /* Z3 = ZS = H*Z1*Z2 - same for sum and difference */
    sp_256_mont_mul_avx2_4(az, az, q->z, p256_mod, p256_mp_mod);
    sp_256_mont_mul_avx2_4(az, az, w2, p256_mod, p256_mp_mod);
    XMEMCPY(sz, az, sizeof(p->z)/2);
    /* X3 = R^2 - H^3 - 2*U1*H^2
     * XS = RS^2 - H^3 - 2*U1*H^2 */
    sp_256_mont_sqr_avx2_4(ax, w4, p256_mod, p256_mp_mod);
    sp_256_mont_sqr_avx2_4(sx, w6, p256_mod, p256_mp_mod);
    /* w5 = H^2, ay = U1*H^2, w5 = H^3 */
    sp_256_mont_sqr_avx2_4(w5, w2, p256_mod, p256_mp_mod);
    sp_256_mont_mul_avx2_4(ay, w1, w5, p256_mod, p256_mp_mod);
    sp_256_mont_mul_avx2_4(w5, w5, w2, p256_mod, p256_mp_mod);
    sp_256_mont_sub_4(ax, ax, w5, p256_mod);
    sp_256_mont_sub_4(sx, sx, w5, p256_mod);
    /* w1 = 2*U1*H^2 */
    sp_256_mont_dbl_4(w1, ay, p256_mod);
    sp_256_mont_sub_4(ax, ax, w1, p256_mod);
    sp_256_mont_sub_4(sx, sx, w1, p256_mod);
    /* Y3 = R*(U1*H^2 - X3) - S1*H^3
     * YS = -RS*(U1*H^2 - XS) - S1*H^3 */
    sp_256_mont_sub_4(sy, ay, sx, p256_mod);
    sp_256_mont_sub_4(ay, ay, ax, p256_mod);
    sp_256_mont_mul_avx2_4(ay, ay, w4, p256_mod, p256_mp_mod);
    /* w6 = -RS */
    sp_256_sub_4(w6, p256_mod, w6);
    sp_256_mont_mul_avx2_4(sy, sy, w6, p256_mod, p256_mp_mod);
    /* w5 = S1*H^3 */
    sp_256_mont_mul_avx2_4(w5, w5, w3, p256_mod, p256_mp_mod);
    sp_256_mont_sub_4(ay, ay, w5, p256_mod);
    sp_256_mont_sub_4(sy, sy, w5, p256_mod);
}

/* Multiply the point by the scalar and return the result.
 * If map is true then convert result to affine co-ordinates.
 *
 * A table of 33 points is built from g, the scalar is recoded into 43
 * entries (each a table index plus a negate flag) and the result is
 * accumulated from the top entry down: every step calls
 * sp_256_proj_point_dbl_n_avx2_4(.., 6, ..) on the accumulator and then adds
 * the selected, conditionally negated, table point.
 *
 * r     Resulting point.
 * g     Point to multiply.
 * k     Scalar to multiply by.
 * map   Indicates whether to convert result to affine.
 * heap  Heap to use for allocation.
 * returns MEMORY_E when memory allocation fails and MP_OKAY on success.
 * An error from sp_256_mod_mul_norm_4() is returned unchanged.
 */
static int sp_256_ecc_mulmod_win_add_sub_avx2_4(sp_point* r, sp_point* g,
        sp_digit* k, int map, void* heap)
{
#if !defined(WOLFSSL_SP_SMALL) && !defined(WOLFSSL_SMALL_STACK)
    /* Stack backing store; only present when heap allocation is not used. */
    sp_point td[33];
    sp_point rtd, pd;
    sp_digit tmpd[2 * 4 * 6];
#endif
    /* Table of 33 points derived from g. */
    sp_point* t;
    /* Running result. */
    sp_point* rt;
    /* Copy of the table entry added in the current step. */
    sp_point* p = NULL;
    /* Temporary ordinate data handed to the point operations. */
    sp_digit* tmp;
    /* Scratch for the negated y ordinate - aliases t[0].y (see below). */
    sp_digit* negy;
    int i;
    /* Recoded scalar: 43 entries of { table index, negate flag }. */
    ecc_recode v[43];
    int err;

    (void)heap;

    err = sp_ecc_point_new(heap, rtd, rt);
    if (err == MP_OKAY)
        err = sp_ecc_point_new(heap, pd, p);
#if defined(WOLFSSL_SP_SMALL) || defined(WOLFSSL_SMALL_STACK)
    /* Both allocations are always attempted; either failure sets MEMORY_E
     * and both pointers are released at the end of the function. */
    t = (sp_point*)XMALLOC(sizeof(sp_point) * 33, heap, DYNAMIC_TYPE_ECC);
    if (t == NULL)
        err = MEMORY_E;
    tmp = (sp_digit*)XMALLOC(sizeof(sp_digit) * 2 * 4 * 6, heap,
                             DYNAMIC_TYPE_ECC);
    if (tmp == NULL)
        err = MEMORY_E;
#else
    t = td;
    tmp = tmpd;
#endif


    if (err == MP_OKAY) {
        /* t[0] = {0, 0, 1} * norm */
        XMEMSET(&t[0], 0, sizeof(t[0]));
        t[0].infinity = 1;
        /* t[1] = {g->x, g->y, g->z} * norm */
        err = sp_256_mod_mul_norm_4(t[1].x, g->x, p256_mod);
    }
    if (err == MP_OKAY)
        err = sp_256_mod_mul_norm_4(t[1].y, g->y, p256_mod);
    if (err == MP_OKAY)
        err = sp_256_mod_mul_norm_4(t[1].z, g->z, p256_mod);

    if (err == MP_OKAY) {
        t[1].infinity = 0;
        /* t[2] ... t[32]  */
        /* NOTE(review): the index arithmetic below is consistent with t[n]
         * holding n * g (dbl: t[2n] from t[n]; add: t[3] from t[2] and t[1];
         * add_sub: t[n+1] and t[n-1] from t[n] and t[1]) - confirm against
         * the helper definitions. The order of the calls matters: each one
         * reads entries produced by earlier calls. */
    sp_256_proj_point_dbl_n_store_avx2_4(t, &t[ 1], 5, 1, tmp);
    sp_256_proj_point_add_avx2_4(&t[ 3], &t[ 2], &t[ 1], tmp);
    sp_256_proj_point_dbl_avx2_4(&t[ 6], &t[ 3], tmp);
    sp_256_proj_point_add_sub_avx2_4(&t[ 7], &t[ 5], &t[ 6], &t[ 1], tmp);
    sp_256_proj_point_dbl_avx2_4(&t[10], &t[ 5], tmp);
    sp_256_proj_point_add_sub_avx2_4(&t[11], &t[ 9], &t[10], &t[ 1], tmp);
    sp_256_proj_point_dbl_avx2_4(&t[12], &t[ 6], tmp);
    sp_256_proj_point_dbl_avx2_4(&t[14], &t[ 7], tmp);
    sp_256_proj_point_add_sub_avx2_4(&t[15], &t[13], &t[14], &t[ 1], tmp);
    sp_256_proj_point_dbl_avx2_4(&t[18], &t[ 9], tmp);
    sp_256_proj_point_add_sub_avx2_4(&t[19], &t[17], &t[18], &t[ 1], tmp);
    sp_256_proj_point_dbl_avx2_4(&t[20], &t[10], tmp);
    sp_256_proj_point_dbl_avx2_4(&t[22], &t[11], tmp);
    sp_256_proj_point_add_sub_avx2_4(&t[23], &t[21], &t[22], &t[ 1], tmp);
    sp_256_proj_point_dbl_avx2_4(&t[24], &t[12], tmp);
    sp_256_proj_point_dbl_avx2_4(&t[26], &t[13], tmp);
    sp_256_proj_point_add_sub_avx2_4(&t[27], &t[25], &t[26], &t[ 1], tmp);
    sp_256_proj_point_dbl_avx2_4(&t[28], &t[14], tmp);
    sp_256_proj_point_dbl_avx2_4(&t[30], &t[15], tmp);
    sp_256_proj_point_add_sub_avx2_4(&t[31], &t[29], &t[30], &t[ 1], tmp);

        /* The y ordinate of t[0] is reused as scratch space for the negated
         * y value; t[0] keeps its infinity flag, but its y is overwritten
         * on every loop iteration below. */
        negy = t[0].y;

        sp_256_ecc_recode_6_4(k, v);

        /* Start from the table point selected by the top recoded entry. */
        i = 42;
        XMEMCPY(rt, &t[v[i].i], sizeof(sp_point));
        for (--i; i>=0; i--) {
            sp_256_proj_point_dbl_n_avx2_4(rt, rt, 6, tmp);

            /* Work on a copy so the table entry itself is never negated. */
            XMEMCPY(p, &t[v[i].i], sizeof(sp_point));
            /* negy = p256_mod - y; it replaces p->y only when the mask
             * (0 - neg) is non-zero (all ones, assuming neg is 0 or 1). */
            sp_256_sub_4(negy, p256_mod, p->y);
            sp_256_cond_copy_4(p->y, negy, (sp_digit)0 - v[i].neg);
            sp_256_proj_point_add_avx2_4(rt, rt, p, tmp);
        }

        if (map)
            sp_256_map_avx2_4(r, rt, tmp);
        else
            XMEMCPY(r, rt, sizeof(sp_point));
    }

#if defined(WOLFSSL_SP_SMALL) || defined(WOLFSSL_SMALL_STACK)
    if (t != NULL)
        XFREE(t, heap, DYNAMIC_TYPE_ECC);
    if (tmp != NULL)
        XFREE(tmp, heap, DYNAMIC_TYPE_ECC);
#endif
    sp_ecc_point_free(p, 0, heap);
    sp_ecc_point_free(rt, 0, heap);

    return err;
}

#endif /* HAVE_INTEL_AVX2 */
/* A table entry for pre-computed points.
 *
 * Only the x and y ordinates are stored. Code that loads an entry into an
 * sp_point sets the z ordinate to p256_norm_mod itself.
 */
typedef struct sp_table_entry {
    /* X ordinate - 4 digits. */
    sp_digit x[4];
    /* Y ordinate - 4 digits. */
    sp_digit y[4];
    /* Non-zero when the entry is the point at infinity. */
    byte infinity;
} sp_table_entry;

#if defined(FP_ECC) || defined(WOLFSSL_SP_SMALL)
#endif /* FP_ECC || WOLFSSL_SP_SMALL */
/* Add two Montgomery form projective points. The second point is assumed to
 * have a z ordinate of one (the formulas below use X1/Y1/Z1 of p only and
 * never read q->z outside the doubling check).
 * Only the first point can be the same pointer as the result point.
 *
 * When p->x == q->x, p->z == q->z and p->y equals either q->y or
 * p256_mod - q->y, the point p is doubled instead of added.
 *
 * r  Result of addition.
 * p  First point to add.
 * q  Second point to add.
 * t  Temporary ordinate data. Five slices of 2*4 digits are used.
 */
static void sp_256_proj_point_add_qz1_4(sp_point* r, sp_point* p,
        sp_point* q, sp_digit* t)
{
    /* ap selects the point r starts as; rp selects where results are
     * written. Both are indexed by infinity flags rather than branched on. */
    sp_point *ap[2];
    sp_point *rp[2];
    /* Dummy destination used when an input is the point at infinity. */
    sp_point tp;
    sp_digit* t1 = t;
    sp_digit* t2 = t + 2*4;
    sp_digit* t3 = t + 4*4;
    sp_digit* t4 = t + 6*4;
    sp_digit* t5 = t + 8*4;
    sp_digit* x;
    sp_digit* y;
    sp_digit* z;
    int i;

    /* Check double */
    /* t1 = p256_mod - q->y, i.e. the negated y ordinate of q. */
    sp_256_sub_4(t1, p256_mod, q->y);
    sp_256_norm_4(t1);
    if (sp_256_cmp_equal_4(p->x, q->x) & sp_256_cmp_equal_4(p->z, q->z) &
        (sp_256_cmp_equal_4(p->y, q->y) | sp_256_cmp_equal_4(p->y, t1))) {
        sp_256_proj_point_dbl_4(r, p, t);
    }
    else {
        /* x, y and z point into r when neither input is at infinity, and
         * into the scratch point tp otherwise so that the arithmetic below
         * does not disturb r. */
        rp[0] = r;
        rp[1] = &tp;
        XMEMSET(&tp, 0, sizeof(tp));
        x = rp[p->infinity | q->infinity]->x;
        y = rp[p->infinity | q->infinity]->y;
        z = rp[p->infinity | q->infinity]->z;

        /* r starts as a copy of p, or of q when p is at infinity. */
        ap[0] = p;
        ap[1] = q;
        for (i=0; i<4; i++)
            r->x[i] = ap[p->infinity]->x[i];
        for (i=0; i<4; i++)
            r->y[i] = ap[p->infinity]->y[i];
        for (i=0; i<4; i++)
            r->z[i] = ap[p->infinity]->z[i];
        r->infinity = ap[p->infinity]->infinity;

        /* U2 = X2*Z1^2 */
        sp_256_mont_sqr_4(t2, z, p256_mod, p256_mp_mod);
        sp_256_mont_mul_4(t4, t2, z, p256_mod, p256_mp_mod);
        sp_256_mont_mul_4(t2, t2, q->x, p256_mod, p256_mp_mod);
        /* S2 = Y2*Z1^3 */
        sp_256_mont_mul_4(t4, t4, q->y, p256_mod, p256_mp_mod);
        /* H = U2 - X1 */
        sp_256_mont_sub_4(t2, t2, x, p256_mod);
        /* R = S2 - Y1 */
        sp_256_mont_sub_4(t4, t4, y, p256_mod);
        /* Z3 = H*Z1 */
        sp_256_mont_mul_4(z, z, t2, p256_mod, p256_mp_mod);
        /* X3 = R^2 - H^3 - 2*X1*H^2 */
        sp_256_mont_sqr_4(t1, t4, p256_mod, p256_mp_mod);
        sp_256_mont_sqr_4(t5, t2, p256_mod, p256_mp_mod);
        sp_256_mont_mul_4(t3, x, t5, p256_mod, p256_mp_mod);
        sp_256_mont_mul_4(t5, t5, t2, p256_mod, p256_mp_mod);
        sp_256_mont_sub_4(x, t1, t5, p256_mod);
        sp_256_mont_dbl_4(t1, t3, p256_mod);
        sp_256_mont_sub_4(x, x, t1, p256_mod);
        /* Y3 = R*(X1*H^2 - X3) - Y1*H^3 */
        sp_256_mont_sub_4(t3, t3, x, p256_mod);
        sp_256_mont_mul_4(t3, t3, t4, p256_mod, p256_mp_mod);
        sp_256_mont_mul_4(t5, t5, y, p256_mod, p256_mp_mod);
        sp_256_mont_sub_4(y, t3, t5, p256_mod);
    }
}

#ifdef FP_ECC
/* Convert the projective point to affine, in place.
 * Ordinates are in Montgomery form.
 *
 * x is scaled by the square and y by the cube of the value produced by
 * sp_256_mont_inv_4() for z; z is then set to p256_norm_mod.
 *
 * a  Point to convert.
 * t  Temporary data.
 */
static void sp_256_proj_to_affine_4(sp_point* a, sp_digit* t)
{
    /* Holds the inverse of z first, then its cube. */
    sp_digit* zinv    = &t[0];
    /* Holds the square of the inverse of z. */
    sp_digit* zinv2   = &t[2 * 4];
    /* Working space for the inversion. */
    sp_digit* scratch = &t[4 * 4];

    sp_256_mont_inv_4(zinv, a->z, scratch);

    sp_256_mont_sqr_4(zinv2, zinv, p256_mod, p256_mp_mod);
    sp_256_mont_mul_4(zinv, zinv2, zinv, p256_mod, p256_mp_mod);

    sp_256_mont_mul_4(a->x, a->x, zinv2, p256_mod, p256_mp_mod);
    sp_256_mont_mul_4(a->y, a->y, zinv, p256_mod, p256_mp_mod);
    XMEMCPY(a->z, p256_norm_mod, sizeof(p256_norm_mod));
}

/* Generate the pre-computed table of points for the given point.
 *
 * Entry 0 is the point at infinity and entry 1 is the affine form of 'a'.
 * Each power-of-two entry is derived from the previous one with
 * sp_256_proj_point_dbl_n_4(.., 32, ..); every other entry is the sum of the
 * power-of-two entry for its top set bit and the entry for its remaining
 * bits. All 256 entries are stored in affine Montgomery form.
 *
 * a      The point to build the table for.
 * table  Place to store generated point data (256 entries).
 * tmp    Temporary data.
 * heap   Heap to use for allocation.
 * returns the error from point allocation or sp_256_mod_mul_norm_4() and
 * MP_OKAY on success.
 */
static int sp_256_gen_stripe_table_4(sp_point* a,
        sp_table_entry* table, sp_digit* tmp, void* heap)
{
#if !defined(WOLFSSL_SP_SMALL) && !defined(WOLFSSL_SMALL_STACK)
    sp_point accd, based, lowd;
#endif
    sp_point* acc;
    sp_point* base = NULL;
    sp_point* low = NULL;
    int bit;
    int top;
    int off;
    int err;

    (void)heap;

    err = sp_ecc_point_new(heap, accd, acc);
    if (err == MP_OKAY)
        err = sp_ecc_point_new(heap, based, base);
    if (err == MP_OKAY)
        err = sp_ecc_point_new(heap, lowd, low);

    /* Bring the input point into Montgomery form. */
    if (err == MP_OKAY)
        err = sp_256_mod_mul_norm_4(acc->x, a->x, p256_mod);
    if (err == MP_OKAY)
        err = sp_256_mod_mul_norm_4(acc->y, a->y, p256_mod);
    if (err == MP_OKAY)
        err = sp_256_mod_mul_norm_4(acc->z, a->z, p256_mod);

    if (err == MP_OKAY) {
        acc->infinity = 0;
        sp_256_proj_to_affine_4(acc, tmp);

        /* Both addends always have a z ordinate of one and are finite. */
        XMEMCPY(base->z, p256_norm_mod, sizeof(p256_norm_mod));
        base->infinity = 0;
        XMEMCPY(low->z, p256_norm_mod, sizeof(p256_norm_mod));
        low->infinity = 0;

        /* Entry 0: the point at infinity. */
        XMEMSET(&table[0], 0, sizeof(sp_table_entry));
        table[0].infinity = 1;

        /* Entry 1: affine version of 'a' in Montgomery form. */
        XMEMCPY(table[1].x, acc->x, sizeof(table[1].x));
        XMEMCPY(table[1].y, acc->y, sizeof(table[1].y));
        table[1].infinity = 0;

        /* Power-of-two entries: 2, 4, ..., 128. */
        for (bit = 1; bit < 8; bit++) {
            top = 1 << bit;

            sp_256_proj_point_dbl_n_4(acc, acc, 32, tmp);
            sp_256_proj_to_affine_4(acc, tmp);

            XMEMCPY(table[top].x, acc->x, sizeof(table[top].x));
            XMEMCPY(table[top].y, acc->y, sizeof(table[top].y));
            table[top].infinity = 0;
        }

        /* Remaining entries: table[top + off] = table[top] + table[off]. */
        for (bit = 1; bit < 8; bit++) {
            top = 1 << bit;

            XMEMCPY(base->x, table[top].x, sizeof(table[top].x));
            XMEMCPY(base->y, table[top].y, sizeof(table[top].y));

            for (off = 1; off < top; off++) {
                XMEMCPY(low->x, table[off].x, sizeof(table[off].x));
                XMEMCPY(low->y, table[off].y, sizeof(table[off].y));

                sp_256_proj_point_add_qz1_4(acc, base, low, tmp);
                sp_256_proj_to_affine_4(acc, tmp);

                XMEMCPY(table[top + off].x, acc->x, sizeof(table[0].x));
                XMEMCPY(table[top + off].y, acc->y, sizeof(table[0].y));
                table[top + off].infinity = 0;
            }
        }
    }

    sp_ecc_point_free(low, 0, heap);
    sp_ecc_point_free(base, 0, heap);
    sp_ecc_point_free(acc, 0, heap);

    return err;
}

#endif /* FP_ECC */
#if defined(FP_ECC) || defined(WOLFSSL_SP_SMALL)
/* Multiply the point by the scalar using a pre-computed stripe table and
 * return the result.
 * If map is true then convert result to affine co-ordinates.
 *
 * The scalar is read as 8 stripes of 32 bits. For each bit position, from 31
 * down to 0, the bit at that position in every stripe is gathered into an
 * 8-bit table index. The accumulator is doubled once per position and the
 * selected table entry is added.
 *
 * r      Resulting point.
 * g      Point to multiply (unused - the table stands in for it).
 * table  Pre-computed table of points, indexed by the gathered 8 bits.
 * k      Scalar to multiply by.
 * map    Indicates whether to convert result to affine.
 * heap   Heap to use for allocation.
 * returns MEMORY_E when memory allocation fails and MP_OKAY on success.
 */
static int sp_256_ecc_mulmod_stripe_4(sp_point* r, sp_point* g,
        sp_table_entry* table, sp_digit* k, int map, void* heap)
{
#if !defined(WOLFSSL_SP_SMALL) && !defined(WOLFSSL_SMALL_STACK)
    sp_point accd;
    sp_point entd;
    sp_digit scratchd[2 * 4 * 5];
#endif
    sp_point* acc;
    sp_point* ent = NULL;
    sp_digit* scratch;
    int row;
    int stripe;
    int idx;
    int bit;
    int err;

    (void)g;
    (void)heap;

    err = sp_ecc_point_new(heap, accd, acc);
    if (err == MP_OKAY)
        err = sp_ecc_point_new(heap, entd, ent);
#if defined(WOLFSSL_SP_SMALL) || defined(WOLFSSL_SMALL_STACK)
    scratch = (sp_digit*)XMALLOC(sizeof(sp_digit) * 2 * 4 * 5, heap,
                                 DYNAMIC_TYPE_ECC);
    if (scratch == NULL)
        err = MEMORY_E;
#else
    scratch = scratchd;
#endif

    if (err == MP_OKAY) {
        /* Table entries are affine: z is always one (Montgomery form). */
        XMEMCPY(ent->z, p256_norm_mod, sizeof(p256_norm_mod));
        XMEMCPY(acc->z, p256_norm_mod, sizeof(p256_norm_mod));

        for (row = 31; row >= 0; row--) {
            /* Gather bit 'row' of each of the 8 stripes into the index. */
            idx = 0;
            for (stripe = 0; stripe < 8; stripe++) {
                bit = row + (32 * stripe);
                idx |= ((k[bit / 64] >> (bit % 64)) & 1) << stripe;
            }

            if (row == 31) {
                /* Top row: the accumulator starts as the table entry. */
                XMEMCPY(acc->x, table[idx].x, sizeof(table[idx].x));
                XMEMCPY(acc->y, table[idx].y, sizeof(table[idx].y));
                acc->infinity = table[idx].infinity;
            }
            else {
                sp_256_proj_point_dbl_4(acc, acc, scratch);
                XMEMCPY(ent->x, table[idx].x, sizeof(table[idx].x));
                XMEMCPY(ent->y, table[idx].y, sizeof(table[idx].y));
                ent->infinity = table[idx].infinity;
                sp_256_proj_point_add_qz1_4(acc, acc, ent, scratch);
            }
        }

        if (map) {
            sp_256_map_4(r, acc, scratch);
        }
        else {
            XMEMCPY(r, acc, sizeof(sp_point));
        }
    }

#if defined(WOLFSSL_SP_SMALL) || defined(WOLFSSL_SMALL_STACK)
    if (scratch != NULL)
        XFREE(scratch, heap, DYNAMIC_TYPE_ECC);
#endif
    sp_ecc_point_free(ent, 0, heap);
    sp_ecc_point_free(acc, 0, heap);

    return err;
}

#endif /* FP_ECC || WOLFSSL_SP_SMALL */
#ifdef FP_ECC
/* Number of points whose pre-computed tables can be cached at once.
 * May be overridden at build time. */
#ifndef FP_ENTRIES
    #define FP_ENTRIES 16
#endif

/* A cache entry: a point and the pre-computed table generated for it. */
typedef struct sp_cache_t {
    /* X ordinate of the cached point - compared on lookup. */
    sp_digit x[4];
    /* Y ordinate of the cached point - compared on lookup. */
    sp_digit y[4];
    /* Pre-computed table for the point; filled in by the multiply functions
     * once cnt reaches 2. */
    sp_table_entry table[256];
    /* Number of lookups that have matched this entry (starts at 1). */
    uint32_t cnt;
    /* Non-zero when the entry holds a point. */
    int set;
} sp_cache_t;

/* The cache of points and their tables. */
static THREAD_LS_T sp_cache_t sp_cache[FP_ENTRIES];
/* Index of the entry returned by the most recent lookup; -1 before any. */
static THREAD_LS_T int sp_cache_last = -1;
/* Non-zero once the 'set' flags of all entries have been cleared. */
static THREAD_LS_T int sp_cache_inited = 0;

#ifndef HAVE_THREAD_LS
    /* Without thread-local storage the cache is guarded by a mutex.
     * initCacheMutex is non-zero once sp_cache_lock has been initialized. */
    static volatile int initCacheMutex = 0;
    static wolfSSL_Mutex sp_cache_lock;
#endif

/* Find the cache entry for the point, creating one when it is not present.
 *
 * A matching entry has its use count incremented. When there is no match, an
 * unused entry is taken if one exists; otherwise the entry with the lowest
 * use count is evicted (the lowest index wins a tie). A new entry starts
 * with a use count of 1; its table is not generated here.
 *
 * The caller is responsible for any locking of the cache.
 *
 * g      Point to look up. Only the x and y ordinates are compared.
 * cache  On return, points at the entry for g.
 */
static void sp_ecc_get_cache(sp_point* g, sp_cache_t** cache)
{
    int i, j;
    uint32_t least;

    /* First use: mark every entry as empty. */
    if (sp_cache_inited == 0) {
        for (i=0; i<FP_ENTRIES; i++) {
            sp_cache[i].set = 0;
        }
        sp_cache_inited = 1;
    }

    /* Compare point with those in cache. */
    for (i=0; i<FP_ENTRIES; i++) {
        if (!sp_cache[i].set)
            continue;

        if (sp_256_cmp_equal_4(g->x, sp_cache[i].x) &
                           sp_256_cmp_equal_4(g->y, sp_cache[i].y)) {
            sp_cache[i].cnt++;
            break;
        }
    }

    /* No match. */
    if (i == FP_ENTRIES) {
        /* Find empty entry - start after the last used entry and wrap. */
        i = (sp_cache_last + 1) % FP_ENTRIES;
        for (; i != sp_cache_last; i=(i+1)%FP_ENTRIES) {
            if (!sp_cache[i].set) {
                break;
            }
        }

        /* Evict least used. */
        if (i == sp_cache_last) {
            /* Start the scan at entry 0 so that it is chosen when it holds
             * the lowest count. Leaving i at sp_cache_last would evict the
             * most recently used entry in that case. */
            i = 0;
            least = sp_cache[0].cnt;
            for (j=1; j<FP_ENTRIES; j++) {
                if (sp_cache[j].cnt < least) {
                    i = j;
                    least = sp_cache[i].cnt;
                }
            }
        }

        XMEMCPY(sp_cache[i].x, g->x, sizeof(sp_cache[i].x));
        XMEMCPY(sp_cache[i].y, g->y, sizeof(sp_cache[i].y));
        sp_cache[i].set = 1;
        sp_cache[i].cnt = 1;
    }

    *cache = &sp_cache[i];
    sp_cache_last = i;
}
#endif /* FP_ECC */

/* Multiply the point by the scalar and return the result.
 * If map is true then convert result to affine co-ordinates.
 *
 * Without FP_ECC the windowed add/subtract method is always used.
 * With FP_ECC the point is looked up in the cache: the first request for a
 * point uses the windowed method, the second request generates the stripe
 * table for the point, and the stripe method is used from then on.
 *
 * r     Resulting point.
 * g     Point to multiply.
 * k     Scalar to multiply by.
 * map   Indicates whether to convert result to affine.
 * heap  Heap to use for allocation.
 * returns MEMORY_E when memory allocation fails, BAD_MUTEX_E when the cache
 * mutex cannot be initialized or locked and MP_OKAY on success.
 */
static int sp_256_ecc_mulmod_4(sp_point* r, sp_point* g, sp_digit* k,
        int map, void* heap)
{
#ifndef FP_ECC
    return sp_256_ecc_mulmod_win_add_sub_4(r, g, k, map, heap);
#else
    sp_digit tmp[2 * 4 * 5];
    sp_cache_t* cache = NULL;
    int err = MP_OKAY;

#ifndef HAVE_THREAD_LS
    if (initCacheMutex == 0) {
        /* Only record the mutex as ready when initialization succeeded. */
        if (wc_InitMutex(&sp_cache_lock) != 0)
            err = BAD_MUTEX_E;
        else
            initCacheMutex = 1;
    }
    if ((err == MP_OKAY) && (wc_LockMutex(&sp_cache_lock) != 0))
       err = BAD_MUTEX_E;
#endif /* HAVE_THREAD_LS */

    if (err == MP_OKAY) {
        sp_ecc_get_cache(g, &cache);
        if (cache->cnt == 2) {
            err = sp_256_gen_stripe_table_4(g, cache->table, tmp, heap);
            /* A failed generation leaves the table incomplete. Drop the
             * count back so that the table is not used and generation is
             * attempted again on the next request for this point. */
            if (err != MP_OKAY)
                cache->cnt = 1;
        }

#ifndef HAVE_THREAD_LS
        wc_UnLockMutex(&sp_cache_lock);
#endif /* HAVE_THREAD_LS */
    }

    if (err == MP_OKAY) {
        if (cache->cnt < 2) {
            err = sp_256_ecc_mulmod_win_add_sub_4(r, g, k, map, heap);
        }
        else {
            err = sp_256_ecc_mulmod_stripe_4(r, g, cache->table, k,
                    map, heap);
        }
    }

    return err;
#endif
}

#ifdef HAVE_INTEL_AVX2
#if defined(FP_ECC) || defined(WOLFSSL_SP_SMALL)
#endif /* FP_ECC || WOLFSSL_SP_SMALL */
/* Add two Montgomery form projective points. The second point is assumed to
 * have a z ordinate of one (the formulas below use X1/Y1/Z1 of p only and
 * never read q->z outside the doubling check).
 * Only the first point can be the same pointer as the result point.
 *
 * When p->x == q->x, p->z == q->z and p->y equals either q->y or
 * p256_mod - q->y, the point p is doubled instead of added.
 *
 * r  Result of addition.
 * p  First point to add.
 * q  Second point to add.
 * t  Temporary ordinate data. Five slices of 2*4 digits are used.
 */
static void sp_256_proj_point_add_qz1_avx2_4(sp_point* r, sp_point* p,
        sp_point* q, sp_digit* t)
{
    /* ap selects the point r starts as; rp selects where results are
     * written. Both are indexed by infinity flags rather than branched on. */
    sp_point *ap[2];
    sp_point *rp[2];
    /* Dummy destination used when an input is the point at infinity. */
    sp_point tp;
    sp_digit* t1 = t;
    sp_digit* t2 = t + 2*4;
    sp_digit* t3 = t + 4*4;
    sp_digit* t4 = t + 6*4;
    sp_digit* t5 = t + 8*4;
    sp_digit* x;
    sp_digit* y;
    sp_digit* z;
    int i;

    /* Check double */
    /* t1 = p256_mod - q->y, i.e. the negated y ordinate of q. */
    sp_256_sub_4(t1, p256_mod, q->y);
    sp_256_norm_4(t1);
    if (sp_256_cmp_equal_4(p->x, q->x) & sp_256_cmp_equal_4(p->z, q->z) &
        (sp_256_cmp_equal_4(p->y, q->y) | sp_256_cmp_equal_4(p->y, t1))) {
        /* Use the AVX2 doubling to match the rest of this function. */
        sp_256_proj_point_dbl_avx2_4(r, p, t);
    }
    else {
        /* x, y and z point into r when neither input is at infinity, and
         * into the scratch point tp otherwise so that the arithmetic below
         * does not disturb r. */
        rp[0] = r;
        rp[1] = &tp;
        XMEMSET(&tp, 0, sizeof(tp));
        x = rp[p->infinity | q->infinity]->x;
        y = rp[p->infinity | q->infinity]->y;
        z = rp[p->infinity | q->infinity]->z;

        /* r starts as a copy of p, or of q when p is at infinity. */
        ap[0] = p;
        ap[1] = q;
        for (i=0; i<4; i++)
            r->x[i] = ap[p->infinity]->x[i];
        for (i=0; i<4; i++)
            r->y[i] = ap[p->infinity]->y[i];
        for (i=0; i<4; i++)
            r->z[i] = ap[p->infinity]->z[i];
        r->infinity = ap[p->infinity]->infinity;

        /* U2 = X2*Z1^2 */
        sp_256_mont_sqr_avx2_4(t2, z, p256_mod, p256_mp_mod);
        sp_256_mont_mul_avx2_4(t4, t2, z, p256_mod, p256_mp_mod);
        sp_256_mont_mul_avx2_4(t2, t2, q->x, p256_mod, p256_mp_mod);
        /* S2 = Y2*Z1^3 */
        sp_256_mont_mul_avx2_4(t4, t4, q->y, p256_mod, p256_mp_mod);
        /* H = U2 - X1 */
        sp_256_mont_sub_4(t2, t2, x, p256_mod);
        /* R = S2 - Y1 */
        sp_256_mont_sub_4(t4, t4, y, p256_mod);
        /* Z3 = H*Z1 */
        sp_256_mont_mul_avx2_4(z, z, t2, p256_mod, p256_mp_mod);
        /* X3 = R^2 - H^3 - 2*X1*H^2 */
        sp_256_mont_sqr_avx2_4(t1, t4, p256_mod, p256_mp_mod);
        sp_256_mont_sqr_avx2_4(t5, t2, p256_mod, p256_mp_mod);
        sp_256_mont_mul_avx2_4(t3, x, t5, p256_mod, p256_mp_mod);
        sp_256_mont_mul_avx2_4(t5, t5, t2, p256_mod, p256_mp_mod);
        sp_256_mont_sub_4(x, t1, t5, p256_mod);
        sp_256_mont_dbl_4(t1, t3, p256_mod);
        sp_256_mont_sub_4(x, x, t1, p256_mod);
        /* Y3 = R*(X1*H^2 - X3) - Y1*H^3 */
        sp_256_mont_sub_4(t3, t3, x, p256_mod);
        sp_256_mont_mul_avx2_4(t3, t3, t4, p256_mod, p256_mp_mod);
        sp_256_mont_mul_avx2_4(t5, t5, y, p256_mod, p256_mp_mod);
        sp_256_mont_sub_4(y, t3, t5, p256_mod);
    }
}

#ifdef FP_ECC
/* Convert the projective point to affine, in place.
 * Ordinates are in Montgomery form.
 *
 * x is scaled by the square and y by the cube of the value produced by
 * sp_256_mont_inv_avx2_4() for z; z is then set to p256_norm_mod.
 *
 * a  Point to convert.
 * t  Temporary data.
 */
static void sp_256_proj_to_affine_avx2_4(sp_point* a, sp_digit* t)
{
    /* Holds the inverse of z first, then its cube. */
    sp_digit* zinv    = &t[0];
    /* Holds the square of the inverse of z. */
    sp_digit* zinv2   = &t[2 * 4];
    /* Working space for the inversion. */
    sp_digit* scratch = &t[4 * 4];

    sp_256_mont_inv_avx2_4(zinv, a->z, scratch);

    sp_256_mont_sqr_avx2_4(zinv2, zinv, p256_mod, p256_mp_mod);
    sp_256_mont_mul_avx2_4(zinv, zinv2, zinv, p256_mod, p256_mp_mod);

    sp_256_mont_mul_avx2_4(a->x, a->x, zinv2, p256_mod, p256_mp_mod);
    sp_256_mont_mul_avx2_4(a->y, a->y, zinv, p256_mod, p256_mp_mod);
    XMEMCPY(a->z, p256_norm_mod, sizeof(p256_norm_mod));
}

/* Generate the pre-computed table of points for the given point.
 *
 * Entry 0 is the point at infinity and entry 1 is the affine form of 'a'.
 * Each power-of-two entry is derived from the previous one with
 * sp_256_proj_point_dbl_n_avx2_4(.., 32, ..); every other entry is the sum
 * of the power-of-two entry for its top set bit and the entry for its
 * remaining bits. All 256 entries are stored in affine Montgomery form.
 *
 * a      The point to build the table for.
 * table  Place to store generated point data (256 entries).
 * tmp    Temporary data.
 * heap   Heap to use for allocation.
 * returns the error from point allocation or sp_256_mod_mul_norm_4() and
 * MP_OKAY on success.
 */
static int sp_256_gen_stripe_table_avx2_4(sp_point* a,
        sp_table_entry* table, sp_digit* tmp, void* heap)
{
#if !defined(WOLFSSL_SP_SMALL) && !defined(WOLFSSL_SMALL_STACK)
    sp_point accd, based, lowd;
#endif
    sp_point* acc;
    sp_point* base = NULL;
    sp_point* low = NULL;
    int bit;
    int top;
    int off;
    int err;

    (void)heap;

    err = sp_ecc_point_new(heap, accd, acc);
    if (err == MP_OKAY)
        err = sp_ecc_point_new(heap, based, base);
    if (err == MP_OKAY)
        err = sp_ecc_point_new(heap, lowd, low);

    /* Bring the input point into Montgomery form. */
    if (err == MP_OKAY)
        err = sp_256_mod_mul_norm_4(acc->x, a->x, p256_mod);
    if (err == MP_OKAY)
        err = sp_256_mod_mul_norm_4(acc->y, a->y, p256_mod);
    if (err == MP_OKAY)
        err = sp_256_mod_mul_norm_4(acc->z, a->z, p256_mod);

    if (err == MP_OKAY) {
        acc->infinity = 0;
        sp_256_proj_to_affine_avx2_4(acc, tmp);

        /* Both addends always have a z ordinate of one and are finite. */
        XMEMCPY(base->z, p256_norm_mod, sizeof(p256_norm_mod));
        base->infinity = 0;
        XMEMCPY(low->z, p256_norm_mod, sizeof(p256_norm_mod));
        low->infinity = 0;

        /* Entry 0: the point at infinity. */
        XMEMSET(&table[0], 0, sizeof(sp_table_entry));
        table[0].infinity = 1;

        /* Entry 1: affine version of 'a' in Montgomery form. */
        XMEMCPY(table[1].x, acc->x, sizeof(table[1].x));
        XMEMCPY(table[1].y, acc->y, sizeof(table[1].y));
        table[1].infinity = 0;

        /* Power-of-two entries: 2, 4, ..., 128. */
        for (bit = 1; bit < 8; bit++) {
            top = 1 << bit;

            sp_256_proj_point_dbl_n_avx2_4(acc, acc, 32, tmp);
            sp_256_proj_to_affine_avx2_4(acc, tmp);

            XMEMCPY(table[top].x, acc->x, sizeof(table[top].x));
            XMEMCPY(table[top].y, acc->y, sizeof(table[top].y));
            table[top].infinity = 0;
        }

        /* Remaining entries: table[top + off] = table[top] + table[off]. */
        for (bit = 1; bit < 8; bit++) {
            top = 1 << bit;

            XMEMCPY(base->x, table[top].x, sizeof(table[top].x));
            XMEMCPY(base->y, table[top].y, sizeof(table[top].y));

            for (off = 1; off < top; off++) {
                XMEMCPY(low->x, table[off].x, sizeof(table[off].x));
                XMEMCPY(low->y, table[off].y, sizeof(table[off].y));

                sp_256_proj_point_add_qz1_avx2_4(acc, base, low, tmp);
                sp_256_proj_to_affine_avx2_4(acc, tmp);

                XMEMCPY(table[top + off].x, acc->x, sizeof(table[0].x));
                XMEMCPY(table[top + off].y, acc->y, sizeof(table[0].y));
                table[top + off].infinity = 0;
            }
        }
    }

    sp_ecc_point_free(low, 0, heap);
    sp_ecc_point_free(base, 0, heap);
    sp_ecc_point_free(acc, 0, heap);

    return err;
}

#endif /* FP_ECC */
#if defined(FP_ECC) || defined(WOLFSSL_SP_SMALL)
/* Multiply the point by the scalar using a pre-computed stripe table and
 * return the result.
 * If map is true then convert result to affine co-ordinates.
 *
 * The scalar is read as 8 stripes of 32 bits. For each bit position, from 31
 * down to 0, the bit at that position in every stripe is gathered into an
 * 8-bit table index. The accumulator is doubled once per position and the
 * selected table entry is added.
 *
 * r      Resulting point.
 * g      Point to multiply (unused - the table stands in for it).
 * table  Pre-computed table of points, indexed by the gathered 8 bits.
 * k      Scalar to multiply by.
 * map    Indicates whether to convert result to affine.
 * heap   Heap to use for allocation.
 * returns MEMORY_E when memory allocation fails and MP_OKAY on success.
 */
static int sp_256_ecc_mulmod_stripe_avx2_4(sp_point* r, sp_point* g,
        sp_table_entry* table, sp_digit* k, int map, void* heap)
{
#if !defined(WOLFSSL_SP_SMALL) && !defined(WOLFSSL_SMALL_STACK)
    sp_point accd;
    sp_point entd;
    sp_digit scratchd[2 * 4 * 5];
#endif
    sp_point* acc;
    sp_point* ent = NULL;
    sp_digit* scratch;
    int row;
    int stripe;
    int idx;
    int bit;
    int err;

    (void)g;
    (void)heap;

    err = sp_ecc_point_new(heap, accd, acc);
    if (err == MP_OKAY)
        err = sp_ecc_point_new(heap, entd, ent);
#if defined(WOLFSSL_SP_SMALL) || defined(WOLFSSL_SMALL_STACK)
    scratch = (sp_digit*)XMALLOC(sizeof(sp_digit) * 2 * 4 * 5, heap,
                                 DYNAMIC_TYPE_ECC);
    if (scratch == NULL)
        err = MEMORY_E;
#else
    scratch = scratchd;
#endif

    if (err == MP_OKAY) {
        /* Table entries are affine: z is always one (Montgomery form). */
        XMEMCPY(ent->z, p256_norm_mod, sizeof(p256_norm_mod));
        XMEMCPY(acc->z, p256_norm_mod, sizeof(p256_norm_mod));

        for (row = 31; row >= 0; row--) {
            /* Gather bit 'row' of each of the 8 stripes into the index. */
            idx = 0;
            for (stripe = 0; stripe < 8; stripe++) {
                bit = row + (32 * stripe);
                idx |= ((k[bit / 64] >> (bit % 64)) & 1) << stripe;
            }

            if (row == 31) {
                /* Top row: the accumulator starts as the table entry. */
                XMEMCPY(acc->x, table[idx].x, sizeof(table[idx].x));
                XMEMCPY(acc->y, table[idx].y, sizeof(table[idx].y));
                acc->infinity = table[idx].infinity;
            }
            else {
                sp_256_proj_point_dbl_avx2_4(acc, acc, scratch);
                XMEMCPY(ent->x, table[idx].x, sizeof(table[idx].x));
                XMEMCPY(ent->y, table[idx].y, sizeof(table[idx].y));
                ent->infinity = table[idx].infinity;
                sp_256_proj_point_add_qz1_avx2_4(acc, acc, ent, scratch);
            }
        }

        if (map) {
            sp_256_map_avx2_4(r, acc, scratch);
        }
        else {
            XMEMCPY(r, acc, sizeof(sp_point));
        }
    }

#if defined(WOLFSSL_SP_SMALL) || defined(WOLFSSL_SMALL_STACK)
    if (scratch != NULL)
        XFREE(scratch, heap, DYNAMIC_TYPE_ECC);
#endif
    sp_ecc_point_free(ent, 0, heap);
    sp_ecc_point_free(acc, 0, heap);

    return err;
}

#endif /* FP_ECC || WOLFSSL_SP_SMALL */
/* Multiply the point by the scalar and return the result.
 * If map is true then convert result to affine co-ordinates.
 *
 * Without FP_ECC the windowed add/subtract method is always used.
 * With FP_ECC the point is looked up in the cache: the first request for a
 * point uses the windowed method, the second request generates the stripe
 * table for the point, and the stripe method is used from then on.
 *
 * r     Resulting point.
 * g     Point to multiply.
 * k     Scalar to multiply by.
 * map   Indicates whether to convert result to affine.
 * heap  Heap to use for allocation.
 * returns MEMORY_E when memory allocation fails, BAD_MUTEX_E when the cache
 * mutex cannot be initialized or locked and MP_OKAY on success.
 */
static int sp_256_ecc_mulmod_avx2_4(sp_point* r, sp_point* g, sp_digit* k,
        int map, void* heap)
{
#ifndef FP_ECC
    return sp_256_ecc_mulmod_win_add_sub_avx2_4(r, g, k, map, heap);
#else
    sp_digit tmp[2 * 4 * 5];
    sp_cache_t* cache = NULL;
    int err = MP_OKAY;

#ifndef HAVE_THREAD_LS
    if (initCacheMutex == 0) {
        /* Only record the mutex as ready when initialization succeeded. */
        if (wc_InitMutex(&sp_cache_lock) != 0)
            err = BAD_MUTEX_E;
        else
            initCacheMutex = 1;
    }
    if ((err == MP_OKAY) && (wc_LockMutex(&sp_cache_lock) != 0))
       err = BAD_MUTEX_E;
#endif /* HAVE_THREAD_LS */

    if (err == MP_OKAY) {
        sp_ecc_get_cache(g, &cache);
        if (cache->cnt == 2) {
            err = sp_256_gen_stripe_table_avx2_4(g, cache->table, tmp, heap);
            /* A failed generation leaves the table incomplete. Drop the
             * count back so that the table is not used and generation is
             * attempted again on the next request for this point. */
            if (err != MP_OKAY)
                cache->cnt = 1;
        }

#ifndef HAVE_THREAD_LS
        wc_UnLockMutex(&sp_cache_lock);
#endif /* HAVE_THREAD_LS */
    }

    if (err == MP_OKAY) {
        if (cache->cnt < 2) {
            err = sp_256_ecc_mulmod_win_add_sub_avx2_4(r, g, k, map, heap);
        }
        else {
            err = sp_256_ecc_mulmod_stripe_avx2_4(r, g, cache->table, k,
                    map, heap);
        }
    }

    return err;
#endif
}

#endif /* HAVE_INTEL_AVX2 */
/* Multiply the point by the scalar and return the result.
 * If map is true then convert result to affine co-ordinates.
 *
 * The scalar and point are converted to the single-precision form, the
 * AVX2 implementation is chosen when it is compiled in and the CPU reports
 * both BMI2 and ADX, and the product is converted back into r.
 *
 * km    Scalar to multiply by.
 * gm    Point to multiply.
 * r     Resulting point.
 * map   Indicates whether to convert result to affine.
 * heap  Heap to use for allocation.
 * returns MEMORY_E when memory allocation fails and MP_OKAY on success.
 */
int sp_ecc_mulmod_256(mp_int* km, ecc_point* gm, ecc_point* r, int map,
        void* heap)
{
#if !defined(WOLFSSL_SP_SMALL) && !defined(WOLFSSL_SMALL_STACK)
    sp_point pointd;
    sp_digit scalard[4];
#endif
    sp_point* pt;
    sp_digit* scalar = NULL;
    int err;
#ifdef HAVE_INTEL_AVX2
    word32 flags = cpuid_get_flags();
#endif

    err = sp_ecc_point_new(heap, pointd, pt);
#if defined(WOLFSSL_SP_SMALL) || defined(WOLFSSL_SMALL_STACK)
    if (err == MP_OKAY) {
        scalar = (sp_digit*)XMALLOC(sizeof(sp_digit) * 4, heap,
                                    DYNAMIC_TYPE_ECC);
        if (scalar == NULL) {
            err = MEMORY_E;
        }
    }
#else
    scalar = scalard;
#endif

    if (err == MP_OKAY) {
        /* Load the inputs into single-precision form. */
        sp_256_from_mp(scalar, 4, km);
        sp_256_point_from_ecc_point_4(pt, gm);

        /* The product overwrites the input point. */
#ifdef HAVE_INTEL_AVX2
        if (IS_INTEL_BMI2(flags) && IS_INTEL_ADX(flags)) {
            err = sp_256_ecc_mulmod_avx2_4(pt, pt, scalar, map, heap);
        }
        else
#endif
        {
            err = sp_256_ecc_mulmod_4(pt, pt, scalar, map, heap);
        }
    }

    if (err == MP_OKAY) {
        err = sp_256_point_to_ecc_point_4(pt, r);
    }

#if defined(WOLFSSL_SP_SMALL) || defined(WOLFSSL_SMALL_STACK)
    if (scalar != NULL) {
        XFREE(scalar, heap, DYNAMIC_TYPE_ECC);
    }
#endif
    sp_ecc_point_free(pt, 0, heap);

    return err;
}

#ifdef WOLFSSL_SP_SMALL
static sp_table_entry p256_table[256] = {
    /* 0 */
    { { 0x00, 0x00, 0x00, 0x00 },
      { 0x00, 0x00, 0x00, 0x00 },
      1 },
    /* 1 */
    { { 0x79e730d418a9143cl,0x75ba95fc5fedb601l,0x79fb732b77622510l,
        0x18905f76a53755c6l },
      { 0xddf25357ce95560al,0x8b4ab8e4ba19e45cl,0xd2e88688dd21f325l,
        0x8571ff1825885d85l },
      0 },
    /* 2 */
    { { 0x202886024147519al,0xd0981eac26b372f0l,0xa9d4a7caa785ebc8l,
        0xd953c50ddbdf58e9l },
      { 0x9d6361ccfd590f8fl,0x72e9626b44e6c917l,0x7fd9611022eb64cfl,
        0x863ebb7e9eb288f3l },
      0 },
    /* 3 */
    { { 0x7856b6235cdb6485l,0x808f0ea22f0a2f97l,0x3e68d9544f7e300bl,
        0x00076055b5ff80a0l },
      { 0x7634eb9b838d2010l,0x54014fbb3243708al,0xe0e47d39842a6606l,
        0x8308776134373ee0l },
      0 },
    /* 4 */
    { { 0x4f922fc516a0d2bbl,0x0d5cc16c1a623499l,0x9241cf3a57c62c8bl,
        0x2f5e6961fd1b667fl },
      { 0x5c15c70bf5a01797l,0x3d20b44d60956192l,0x04911b37071fdb52l,
        0xf648f9168d6f0f7bl },
      0 },
    /* 5 */
    { { 0x9e566847e137bbbcl,0xe434469e8a6a0becl,0xb1c4276179d73463l,
        0x5abe0285133d0015l },
      { 0x92aa837cc04c7dabl,0x573d9f4c43260c07l,0x0c93156278e6cc37l,
        0x94bb725b6b6f7383l },
      0 },
    /* 6 */
    { { 0xbbf9b48f720f141cl,0x6199b3cd2df5bc74l,0xdc3f6129411045c4l,
        0xcdd6bbcb2f7dc4efl },
      { 0xcca6700beaf436fdl,0x6f647f6db99326bel,0x0c0fa792014f2522l,
        0xa361bebd4bdae5f6l },
      0 },
    /* 7 */
    { { 0x28aa2558597c13c7l,0xc38d635f50b7c3e1l,0x07039aecf3c09d1dl,
        0xba12ca09c4b5292cl },
      { 0x9e408fa459f91dfdl,0x3af43b66ceea07fbl,0x1eceb0899d780b29l,
        0x53ebb99d701fef4bl },
      0 },
    /* 8 */
    { { 0x4fe7ee31b0e63d34l,0xf4600572a9e54fabl,0xc0493334d5e7b5a4l,
        0x8589fb9206d54831l },
      { 0xaa70f5cc6583553al,0x0879094ae25649e5l,0xcc90450710044652l,
        0xebb0696d02541c4fl },
      0 },
    /* 9 */
    { { 0x4616ca15ac1647c5l,0xb8127d47c4cf5799l,0xdc666aa3764dfbacl,
        0xeb2820cbd1b27da3l },
      { 0x9406f8d86a87e008l,0xd87dfa9d922378f3l,0x56ed2e4280ccecb2l,
        0x1f28289b55a7da1dl },
      0 },
    /* 10 */
    { { 0xabbaa0c03b89da99l,0xa6f2d79eb8284022l,0x27847862b81c05e8l,
        0x337a4b5905e54d63l },
      { 0x3c67500d21f7794al,0x207005b77d6d7f61l,0x0a5a378104cfd6e8l,
        0x0d65e0d5f4c2fbd6l },
      0 },
    /* 11 */
    { { 0xd9d09bbeb5275d38l,0x4268a7450be0a358l,0xf0762ff4973eb265l,
        0xc23da24252f4a232l },
      { 0x5da1b84f0b94520cl,0x09666763b05bd78el,0x3a4dcb8694d29ea1l,
        0x19de3b8cc790cff1l },
      0 },
    /* 12 */
    { { 0x183a716c26c5fe04l,0x3b28de0b3bba1bdbl,0x7432c586a4cb712cl,
        0xe34dcbd491fccbfdl },
      { 0xb408d46baaa58403l,0x9a69748682e97a53l,0x9e39012736aaa8afl,
        0xe7641f447b4e0f7fl },
      0 },
    /* 13 */
    { { 0x7d753941df64ba59l,0xd33f10ec0b0242fcl,0x4f06dfc6a1581859l,
        0x4a12df57052a57bfl },
      { 0xbfa6338f9439dbd0l,0xd3c24bd4bde53e1fl,0xfd5e4ffa21f1b314l,
        0x6af5aa93bb5bea46l },
      0 },
    /* 14 */
    { { 0xda10b69910c91999l,0x0a24b4402a580491l,0x3e0094b4b8cc2090l,
        0x5fe3475a66a44013l },
      { 0xb0f8cabdf93e7b4bl,0x292b501a7c23f91al,0x42e889aecd1e6263l,
        0xb544e308ecfea916l },
      0 },
    /* 15 */
    { { 0x6478c6e916ddfdcel,0x2c329166f89179e6l,0x4e8d6e764d4e67e1l,
        0xe0b6b2bda6b0c20bl },
      { 0x0d312df2bb7efb57l,0x1aac0dde790c4007l,0xf90336ad679bc944l,
        0x71c023de25a63774l },
      0 },
    /* 16 */
    { { 0x62a8c244bfe20925l,0x91c19ac38fdce867l,0x5a96a5d5dd387063l,
        0x61d587d421d324f6l },
      { 0xe87673a2a37173eal,0x2384800853778b65l,0x10f8441e05bab43el,
        0xfa11fe124621efbel },
      0 },
    /* 17 */
    { { 0x1c891f2b2cb19ffdl,0x01ba8d5bb1923c23l,0xb6d03d678ac5ca8el,
        0x586eb04c1f13bedcl },
      { 0x0c35c6e527e8ed09l,0x1e81a33c1819ede2l,0x278fd6c056c652fal,
        0x19d5ac0870864f11l },
      0 },
    /* 18 */
    { { 0x1e99f581309a4e1fl,0xab7de71be9270074l,0x26a5ef0befd28d20l,
        0xe7c0073f7f9c563fl },
      { 0x1f6d663a0ef59f76l,0x669b3b5420fcb050l,0xc08c1f7a7a6602d4l,
        0xe08504fec65b3c0al },
      0 },
    /* 19 */
    { { 0xf098f68da031b3cal,0x6d1cab9ee6da6d66l,0x5bfd81fa94f246e8l,
        0x78f018825b0996b4l },
      { 0xb7eefde43a25787fl,0x8016f80d1dccac9bl,0x0cea4877b35bfc36l,
        0x43a773b87e94747al },
      0 },
    /* 20 */
    { { 0x62577734d2b533d5l,0x673b8af6a1bdddc0l,0x577e7c9aa79ec293l,
        0xbb6de651c3b266b1l },
      { 0xe7e9303ab65259b3l,0xd6a0afd3d03a7480l,0xc5ac83d19b3cfc27l,
        0x60b4619a5d18b99bl },
      0 },
    /* 21 */
    { { 0xbd6a38e11ae5aa1cl,0xb8b7652b49e73658l,0x0b130014ee5f87edl,
        0x9d0f27b2aeebffcdl },
      { 0xca9246317a730a55l,0x9c955b2fddbbc83al,0x07c1dfe0ac019a71l,
        0x244a566d356ec48dl },
      0 },
    /* 22 */
    { { 0x6db0394aeacf1f96l,0x9f2122a9024c271cl,0x2626ac1b82cbd3b9l,
        0x45e58c873581ef69l },
      { 0xd3ff479da38f9dbcl,0xa8aaf146e888a040l,0x945adfb246e0bed7l,
        0xc040e21cc1e4b7a4l },
      0 },
    /* 23 */
    { { 0x847af0006f8117b6l,0x651969ff73a35433l,0x482b35761d9475ebl,
        0x1cdf5c97682c6ec7l },
      { 0x7db775b411f04839l,0x7dbeacf448de1698l,0xb2921dd1b70b3219l,
        0x046755f8a92dff3dl },
      0 },
    /* 24 */
    { { 0xcc8ac5d2bce8ffcdl,0x0d53c48b2fe61a82l,0xf6f161727202d6c7l,
        0x046e5e113b83a5f3l },
      { 0xe7b8ff64d8007f01l,0x7fb1ef125af43183l,0x045c5ea635e1a03cl,
        0x6e0106c3303d005bl },
      0 },
    /* 25 */
    { { 0x48c7358488dd73b1l,0x7670708f995ed0d9l,0x38385ea8c56a2ab7l,
        0x442594ede901cf1fl },
      { 0xf8faa2c912d4b65bl,0x94c2343b96c90c37l,0xd326e4a15e978d1fl,
        0xa796fa514c2ee68el },
      0 },
    /* 26 */
    { { 0x359fb604823addd7l,0x9e2a6183e56693b3l,0xf885b78e3cbf3c80l,
        0xe4ad2da9c69766e9l },
      { 0x357f7f428e048a61l,0x082d198cc092d9a0l,0xfc3a1af4c03ed8efl,
        0xc5e94046c37b5143l },
      0 },
    /* 27 */
    { { 0x476a538c2be75f9el,0x6fd1a9e8cb123a78l,0xd85e4df0b109c04bl,
        0x63283dafdb464747l },
      { 0xce728cf7baf2df15l,0xe592c4550ad9a7f4l,0xfab226ade834bcc3l,
        0x68bd19ab1981a938l },
      0 },
    /* 28 */
    { { 0xc08ead511887d659l,0x3374d5f4b359305al,0x96986981cfe74fe3l,
        0x495292f53c6fdfd6l },
      { 0x4a878c9e1acec896l,0xd964b210ec5b4484l,0x6696f7e2664d60a7l,
        0x0ec753