/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 2012,2013,2014,2015,2016, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */
#ifndef _kernelutil_x86_avx_128_fma_single_h_
#define _kernelutil_x86_avx_128_fma_single_h_

#include "config.h"

#include <math.h>

#include <immintrin.h>
#ifdef _MSC_VER
#    include <intrin.h>
#else
#    include <x86intrin.h>
#endif

/* gmx_-prefixed aliases that expand directly to the corresponding intrinsics */
#define gmx_mm_castsi128_ps   _mm_castsi128_ps
#define gmx_mm_extract_epi32  _mm_extract_epi32

/* Masked load/store wrappers. The arguments are parenthesized and forwarded
 * unchanged to the 128-bit (_mm_) and 256-bit (_mm256_) intrinsics.
 */
#define gmx_mm_maskload_ps(mem, mask)       _mm_maskload_ps((mem), (mask))
#define gmx_mm_maskstore_ps(mem, mask, x)    _mm_maskstore_ps((mem), (mask), (x))
#define gmx_mm256_maskload_ps(mem, mask)    _mm256_maskload_ps((mem), (mask))
#define gmx_mm256_maskstore_ps(mem, mask, x) _mm256_maskstore_ps((mem), (mask), (x))

/* Lane-wise sum of four xmm registers, added pairwise: (t0+t1) + (t2+t3) */
#define gmx_mm_sum4_ps(t0, t1, t2, t3)  _mm_add_ps(_mm_add_ps(t0, t1), _mm_add_ps(t2, t3))

/* Lane-wise test of a < b.
 * Returns the 4-bit sign mask of the comparison: bit i is set when
 * a[i] < b[i], so the result is nonzero exactly when at least one lane
 * satisfies a < b.
 */
static gmx_inline int gmx_simdcall
gmx_mm_any_lt(__m128 a, __m128 b)
{
    const __m128 isLess = _mm_cmplt_ps(a, b);

    return _mm_movemask_ps(isLess);
}

/* Return dx*dx + dy*dy + dz*dz for each lane.
 * The dz term is a plain multiply; the dy and dx terms are accumulated
 * on top of it with fused multiply-adds (a*b + c).
 */
static gmx_inline __m128 gmx_simdcall
gmx_mm_calc_rsq_ps(__m128 dx, __m128 dy, __m128 dz)
{
    const __m128 zz   = _mm_mul_ps(dz, dz);
    const __m128 yyzz = _mm_macc_ps(dy, dy, zz);

    return _mm_macc_ps(dx, dx, yyzz);
}

/* Load a single value from 1-4 places, merge into xmm register */

/* Gather one float from each of four addresses into a single register.
 * Result lanes 0..3 hold *ptrA, *ptrB, *ptrC, *ptrD in that order.
 */
static gmx_inline __m128 gmx_simdcall
gmx_mm_load_4real_swizzle_ps(const float * gmx_restrict ptrA,
                             const float * gmx_restrict ptrB,
                             const float * gmx_restrict ptrC,
                             const float * gmx_restrict ptrD)
{
    const __m128 a  = _mm_load_ss(ptrA);
    const __m128 b  = _mm_load_ss(ptrB);
    const __m128 c  = _mm_load_ss(ptrC);
    const __m128 d  = _mm_load_ss(ptrD);
    /* Interleave A with C and B with D, so the final interleave yields A B C D */
    const __m128 ac = _mm_unpacklo_ps(a, c);
    const __m128 bd = _mm_unpacklo_ps(b, d);

    return _mm_unpacklo_ps(ac, bd);
}


/* Scatter the four lanes of xmm1 to four separate addresses:
 * lane 0 -> *ptrA, lane 1 -> *ptrB, lane 2 -> *ptrC, lane 3 -> *ptrD.
 */
static gmx_inline void gmx_simdcall
gmx_mm_store_4real_swizzle_ps(float * gmx_restrict ptrA,
                              float * gmx_restrict ptrB,
                              float * gmx_restrict ptrC,
                              float * gmx_restrict ptrD, __m128 xmm1)
{
    /* Bring each upper lane down to lane 0 so a scalar store can write it */
    const __m128 lane1 = _mm_permute_ps(xmm1, _MM_SHUFFLE(1, 1, 1, 1));
    const __m128 lane2 = _mm_permute_ps(xmm1, _MM_SHUFFLE(2, 2, 2, 2));
    const __m128 lane3 = _mm_permute_ps(xmm1, _MM_SHUFFLE(3, 3, 3, 3));

    _mm_store_ss(ptrA, xmm1);
    _mm_store_ss(ptrB, lane1);
    _mm_store_ss(ptrC, lane2);
    _mm_store_ss(ptrD, lane3);
}


/* Add lane i of xmm1 to the float stored at the i-th pointer
 * (ptrA, ptrB, ptrC, ptrD for lanes 0..3): gather, add, scatter back.
 */
static gmx_inline void gmx_simdcall
gmx_mm_increment_4real_swizzle_ps(float * gmx_restrict ptrA,
                                  float * gmx_restrict ptrB,
                                  float * gmx_restrict ptrC,
                                  float * gmx_restrict ptrD, __m128 xmm1)
{
    const __m128 current = gmx_mm_load_4real_swizzle_ps(ptrA, ptrB, ptrC, ptrD);
    const __m128 updated = _mm_add_ps(current, xmm1);

    gmx_mm_store_4real_swizzle_ps(ptrA, ptrB, ptrC, ptrD, updated);
}


/* Load two consecutive floats from each of four addresses and transpose them.
 * On return lane i of *c6 holds the first float and lane i of *c12 the second
 * float read from the i-th pointer (p1..p4 for lanes 0..3).
 */
static gmx_inline void gmx_simdcall
gmx_mm_load_4pair_swizzle_ps(const float * gmx_restrict p1,
                             const float * gmx_restrict p2,
                             const float * gmx_restrict p3,
                             const float * gmx_restrict p4,
                             __m128 * gmx_restrict c6, __m128 * gmx_restrict c12)
{
    __m128 t1, t2, t3, t4;

    /* 64-bit loads into the low half; the inputs are only read, so the
     * casts keep the const qualifier instead of discarding it.
     */
    t1   = _mm_loadl_pi(_mm_setzero_ps(), (const __m64 *)p1);
    t2   = _mm_loadl_pi(_mm_setzero_ps(), (const __m64 *)p2);
    t3   = _mm_loadl_pi(_mm_setzero_ps(), (const __m64 *)p3);
    t4   = _mm_loadl_pi(_mm_setzero_ps(), (const __m64 *)p4);
    /* Interleave 1 with 3 and 2 with 4 so the last unpack gives order 1 2 3 4 */
    t1   = _mm_unpacklo_ps(t1, t3);
    t2   = _mm_unpacklo_ps(t2, t4);
    *c6  = _mm_unpacklo_ps(t1, t2);
    *c12 = _mm_unpackhi_ps(t1, t2);
}




/* Add the 3-float vector at xyz_shift to the 3-float vector at xyz and
 * broadcast each component of the sum to all four lanes:
 * *x1 = xyz_shift[0]+xyz[0], *y1 = xyz_shift[1]+xyz[1], *z1 = xyz_shift[2]+xyz[2].
 * Exactly three floats are read from each input.
 */
static gmx_inline void gmx_simdcall
gmx_mm_load_shift_and_1rvec_broadcast_ps(const float * gmx_restrict xyz_shift,
                                         const float * gmx_restrict xyz,
                                         __m128 * gmx_restrict      x1,
                                         __m128 * gmx_restrict      y1,
                                         __m128 * gmx_restrict      z1)
{
    __m128 t1, t2, t3, t4;

    /* x and y come from one 64-bit load, z from a scalar load. The inputs
     * are only read, so the casts keep the const qualifier.
     */
    t1   = _mm_loadl_pi(_mm_setzero_ps(), (const __m64 *)xyz_shift);
    t2   = _mm_loadl_pi(_mm_setzero_ps(), (const __m64 *)xyz);
    t3   = _mm_load_ss(xyz_shift+2);
    t4   = _mm_load_ss(xyz+2);
    t1   = _mm_add_ps(t1, t2);
    t3   = _mm_add_ss(t3, t4);

    *x1  = _mm_permute_ps(t1, _MM_SHUFFLE(0, 0, 0, 0));
    *y1  = _mm_permute_ps(t1, _MM_SHUFFLE(1, 1, 1, 1));
    *z1  = _mm_permute_ps(t3, _MM_SHUFFLE(0, 0, 0, 0));
}


/* Load a 3-float shift vector and three consecutive xyz triplets (9 floats
 * at xyz), add the shift to every triplet, and broadcast each of the nine
 * shifted components to all four lanes of its own output register.
 */
static gmx_inline void gmx_simdcall
gmx_mm_load_shift_and_3rvec_broadcast_ps(const float * gmx_restrict xyz_shift,
                                         const float * gmx_restrict xyz,
                                         __m128 * gmx_restrict x1, __m128 * gmx_restrict y1, __m128 * gmx_restrict z1,
                                         __m128 * gmx_restrict x2, __m128 * gmx_restrict y2, __m128 * gmx_restrict z2,
                                         __m128 * gmx_restrict x3, __m128 * gmx_restrict y3, __m128 * gmx_restrict z3)
{
    __m128 tA, tB;
    __m128 t1, t2, t3, t4, t5, t6;

    /* The shift is only read, so the cast keeps the const qualifier */
    tA   = _mm_loadl_pi(_mm_setzero_ps(), (const __m64 *)xyz_shift);
    tB   = _mm_load_ss(xyz_shift+2);

    /* Memory order: x1 y1 z1 x2 | y2 z2 x3 y3 | z3 */
    t1   = _mm_loadu_ps(xyz);
    t2   = _mm_loadu_ps(xyz+4);
    t3   = _mm_load_ss(xyz+8);

    /* tA = sx sy sz 0 (lane 0 first); rotate it to line up with each load */
    tA   = _mm_movelh_ps(tA, tB);
    t4   = _mm_permute_ps(tA, _MM_SHUFFLE(0, 2, 1, 0)); /* sx sy sz sx */
    t5   = _mm_permute_ps(tA, _MM_SHUFFLE(1, 0, 2, 1)); /* sy sz sx sy */
    t6   = _mm_permute_ps(tA, _MM_SHUFFLE(2, 1, 0, 2)); /* sz in lane 0 */

    t1   = _mm_add_ps(t1, t4);
    t2   = _mm_add_ps(t2, t5);
    t3   = _mm_add_ss(t3, t6);

    *x1  = _mm_permute_ps(t1, _MM_SHUFFLE(0, 0, 0, 0));
    *y1  = _mm_permute_ps(t1, _MM_SHUFFLE(1, 1, 1, 1));
    *z1  = _mm_permute_ps(t1, _MM_SHUFFLE(2, 2, 2, 2));
    *x2  = _mm_permute_ps(t1, _MM_SHUFFLE(3, 3, 3, 3));
    *y2  = _mm_permute_ps(t2, _MM_SHUFFLE(0, 0, 0, 0));
    *z2  = _mm_permute_ps(t2, _MM_SHUFFLE(1, 1, 1, 1));
    *x3  = _mm_permute_ps(t2, _MM_SHUFFLE(2, 2, 2, 2));
    *y3  = _mm_permute_ps(t2, _MM_SHUFFLE(3, 3, 3, 3));
    *z3  = _mm_permute_ps(t3, _MM_SHUFFLE(0, 0, 0, 0));
}


/* Load a 3-float shift vector and four consecutive xyz triplets (12 floats
 * at xyz), add the shift to every triplet, and broadcast each of the twelve
 * shifted components to all four lanes of its own output register.
 */
static gmx_inline void gmx_simdcall
gmx_mm_load_shift_and_4rvec_broadcast_ps(const float * gmx_restrict xyz_shift,
                                         const float * gmx_restrict xyz,
                                         __m128 * gmx_restrict x1, __m128 * gmx_restrict y1, __m128 * gmx_restrict z1,
                                         __m128 * gmx_restrict x2, __m128 * gmx_restrict y2, __m128 * gmx_restrict z2,
                                         __m128 * gmx_restrict x3, __m128 * gmx_restrict y3, __m128 * gmx_restrict z3,
                                         __m128 * gmx_restrict x4, __m128 * gmx_restrict y4, __m128 * gmx_restrict z4)
{
    __m128 tA, tB;
    __m128 t1, t2, t3, t4, t5, t6;

    /* The shift is only read, so the cast keeps the const qualifier */
    tA   = _mm_loadl_pi(_mm_setzero_ps(), (const __m64 *)xyz_shift);
    tB   = _mm_load_ss(xyz_shift+2);

    /* Memory order: x1 y1 z1 x2 | y2 z2 x3 y3 | z3 x4 y4 z4 */
    t1   = _mm_loadu_ps(xyz);
    t2   = _mm_loadu_ps(xyz+4);
    t3   = _mm_loadu_ps(xyz+8);

    /* tA = sx sy sz 0 (lane 0 first); rotate it to line up with each load */
    tA   = _mm_movelh_ps(tA, tB);
    t4   = _mm_permute_ps(tA, _MM_SHUFFLE(0, 2, 1, 0)); /* sx sy sz sx */
    t5   = _mm_permute_ps(tA, _MM_SHUFFLE(1, 0, 2, 1)); /* sy sz sx sy */
    t6   = _mm_permute_ps(tA, _MM_SHUFFLE(2, 1, 0, 2)); /* sz sx sy sz */

    t1   = _mm_add_ps(t1, t4);
    t2   = _mm_add_ps(t2, t5);
    t3   = _mm_add_ps(t3, t6);

    *x1  = _mm_permute_ps(t1, _MM_SHUFFLE(0, 0, 0, 0));
    *y1  = _mm_permute_ps(t1, _MM_SHUFFLE(1, 1, 1, 1));
    *z1  = _mm_permute_ps(t1, _MM_SHUFFLE(2, 2, 2, 2));
    *x2  = _mm_permute_ps(t1, _MM_SHUFFLE(3, 3, 3, 3));
    *y2  = _mm_permute_ps(t2, _MM_SHUFFLE(0, 0, 0, 0));
    *z2  = _mm_permute_ps(t2, _MM_SHUFFLE(1, 1, 1, 1));
    *x3  = _mm_permute_ps(t2, _MM_SHUFFLE(2, 2, 2, 2));
    *y3  = _mm_permute_ps(t2, _MM_SHUFFLE(3, 3, 3, 3));
    *z3  = _mm_permute_ps(t3, _MM_SHUFFLE(0, 0, 0, 0));
    *x4  = _mm_permute_ps(t3, _MM_SHUFFLE(1, 1, 1, 1));
    *y4  = _mm_permute_ps(t3, _MM_SHUFFLE(2, 2, 2, 2));
    *z4  = _mm_permute_ps(t3, _MM_SHUFFLE(3, 3, 3, 3));
}


/* Load one xyz triplet from each of four addresses and transpose.
 * On return lane i of *x1, *y1, *z1 holds floats 0, 1, 2 read from the
 * i-th pointer (ptrA..ptrD for lanes 0..3).
 */
static gmx_inline void gmx_simdcall
gmx_mm_load_1rvec_4ptr_swizzle_ps(const float * gmx_restrict ptrA, const float * gmx_restrict ptrB,
                                  const float * gmx_restrict ptrC, const float * gmx_restrict ptrD,
                                  __m128 * gmx_restrict x1, __m128 * gmx_restrict y1, __m128 * gmx_restrict z1)
{
    /* Lanes 0-2 enabled, lane 3 disabled: the masked load fetches three floats */
    const __m128i xyzMask = _mm_set_epi32(0, -1, -1, -1);
    __m128        rowA    = gmx_mm_maskload_ps(ptrA, xyzMask);
    __m128        rowB    = gmx_mm_maskload_ps(ptrB, xyzMask);
    __m128        rowC    = gmx_mm_maskload_ps(ptrC, xyzMask);
    __m128        rowD    = gmx_mm_maskload_ps(ptrD, xyzMask);

    /* Rows become columns; the fourth column (rowD afterwards) is not used */
    _MM_TRANSPOSE4_PS(rowA, rowB, rowC, rowD);
    *x1 = rowA;
    *y1 = rowB;
    *z1 = rowC;
}


/* Load three consecutive xyz triplets (9 floats) from each of four
 * addresses and transpose, so that lane i of every output register holds
 * the component read from the i-th pointer (ptrA..ptrD for lanes 0..3).
 * Per-pointer memory order is x1 y1 z1 x2 y2 z2 x3 y3 z3.
 */
static gmx_inline void gmx_simdcall
gmx_mm_load_3rvec_4ptr_swizzle_ps(const float * gmx_restrict ptrA, const float * gmx_restrict ptrB,
                                  const float * gmx_restrict ptrC, const float * gmx_restrict ptrD,
                                  __m128 * gmx_restrict x1, __m128 * gmx_restrict y1, __m128 * gmx_restrict z1,
                                  __m128 * gmx_restrict x2, __m128 * gmx_restrict y2, __m128 * gmx_restrict z2,
                                  __m128 * gmx_restrict x3, __m128 * gmx_restrict y3, __m128 * gmx_restrict z3)
{
    __m128 rowA, rowB, rowC, rowD;

    /* Floats 0-3 of each pointer: x1 y1 z1 x2 */
    rowA = _mm_loadu_ps(ptrA);
    rowB = _mm_loadu_ps(ptrB);
    rowC = _mm_loadu_ps(ptrC);
    rowD = _mm_loadu_ps(ptrD);
    _MM_TRANSPOSE4_PS(rowA, rowB, rowC, rowD);
    *x1 = rowA;
    *y1 = rowB;
    *z1 = rowC;
    *x2 = rowD;

    /* Floats 4-7 of each pointer: y2 z2 x3 y3 */
    rowA = _mm_loadu_ps(ptrA+4);
    rowB = _mm_loadu_ps(ptrB+4);
    rowC = _mm_loadu_ps(ptrC+4);
    rowD = _mm_loadu_ps(ptrD+4);
    _MM_TRANSPOSE4_PS(rowA, rowB, rowC, rowD);
    *y2 = rowA;
    *z2 = rowB;
    *x3 = rowC;
    *y3 = rowD;

    /* Float 8 of each pointer (z3): scalar loads merged as A C / B D -> A B C D */
    rowA = _mm_unpacklo_ps(_mm_load_ss(ptrA+8), _mm_load_ss(ptrC+8));
    rowB = _mm_unpacklo_ps(_mm_load_ss(ptrB+8), _mm_load_ss(ptrD+8));
    *z3  = _mm_unpacklo_ps(rowA, rowB);
}


/* Load four consecutive xyz triplets (12 floats) from each of four
 * addresses and transpose, so that lane i of every output register holds
 * the component read from the i-th pointer (ptrA..ptrD for lanes 0..3).
 * Per-pointer memory order is x1 y1 z1 x2 y2 z2 x3 y3 z3 x4 y4 z4.
 */
static gmx_inline void gmx_simdcall
gmx_mm_load_4rvec_4ptr_swizzle_ps(const float * gmx_restrict ptrA, const float * gmx_restrict ptrB,
                                  const float * gmx_restrict ptrC, const float * gmx_restrict ptrD,
                                  __m128 * gmx_restrict x1, __m128 * gmx_restrict y1, __m128 * gmx_restrict z1,
                                  __m128 * gmx_restrict x2, __m128 * gmx_restrict y2, __m128 * gmx_restrict z2,
                                  __m128 * gmx_restrict x3, __m128 * gmx_restrict y3, __m128 * gmx_restrict z3,
                                  __m128 * gmx_restrict x4, __m128 * gmx_restrict y4, __m128 * gmx_restrict z4)
{
    __m128 rowA, rowB, rowC, rowD;

    /* Floats 0-3 of each pointer: x1 y1 z1 x2 */
    rowA = _mm_loadu_ps(ptrA);
    rowB = _mm_loadu_ps(ptrB);
    rowC = _mm_loadu_ps(ptrC);
    rowD = _mm_loadu_ps(ptrD);
    _MM_TRANSPOSE4_PS(rowA, rowB, rowC, rowD);
    *x1 = rowA;
    *y1 = rowB;
    *z1 = rowC;
    *x2 = rowD;

    /* Floats 4-7 of each pointer: y2 z2 x3 y3 */
    rowA = _mm_loadu_ps(ptrA+4);
    rowB = _mm_loadu_ps(ptrB+4);
    rowC = _mm_loadu_ps(ptrC+4);
    rowD = _mm_loadu_ps(ptrD+4);
    _MM_TRANSPOSE4_PS(rowA, rowB, rowC, rowD);
    *y2 = rowA;
    *z2 = rowB;
    *x3 = rowC;
    *y3 = rowD;

    /* Floats 8-11 of each pointer: z3 x4 y4 z4 */
    rowA = _mm_loadu_ps(ptrA+8);
    rowB = _mm_loadu_ps(ptrB+8);
    rowC = _mm_loadu_ps(ptrC+8);
    rowD = _mm_loadu_ps(ptrD+8);
    _MM_TRANSPOSE4_PS(rowA, rowB, rowC, rowD);
    *z3 = rowA;
    *x4 = rowB;
    *y4 = rowC;
    *z4 = rowD;
}


/* Subtract one xyz triplet from three consecutive floats at each of four
 * addresses. Lane i of x1/y1/z1 is subtracted from floats 0/1/2 at the
 * i-th pointer (ptrA..ptrD for lanes 0..3). Exactly three floats are read
 * and written per pointer.
 */
static gmx_inline void gmx_simdcall
gmx_mm_decrement_1rvec_4ptr_swizzle_ps(float * gmx_restrict ptrA, float * gmx_restrict ptrB,
                                       float * gmx_restrict ptrC, float * gmx_restrict ptrD,
                                       __m128 x1, __m128 y1, __m128 z1)
{
    __m128 t1, t2, t3, t4, t5, t6, t7, t8, t9, t10;

    /* Interleave y and z: t5 = yA zA yB zB, t6 = yC zC yD zD (lane 0 first) */
    t5          = _mm_unpacklo_ps(y1, z1);
    t6          = _mm_unpackhi_ps(y1, z1);
    /* Build one register per pointer laid out as x - y z; lane 1 is a
     * don't-care because it is never stored.
     */
    t7          = _mm_shuffle_ps(x1, t5, _MM_SHUFFLE(1, 0, 0, 0));
    t8          = _mm_shuffle_ps(x1, t5, _MM_SHUFFLE(3, 2, 0, 1));
    t9          = _mm_shuffle_ps(x1, t6, _MM_SHUFFLE(1, 0, 0, 2));
    t10         = _mm_shuffle_ps(x1, t6, _MM_SHUFFLE(3, 2, 0, 3));
    /* Per pointer: x via scalar load/store in lane 0, y and z via a 64-bit
     * load/store in the upper half, so no fourth float is touched.
     */
    t1          = _mm_load_ss(ptrA);
    t1          = _mm_loadh_pi(t1, (__m64 *)(ptrA+1));
    t1          = _mm_sub_ps(t1, t7);
    _mm_store_ss(ptrA, t1);
    _mm_storeh_pi((__m64 *)(ptrA+1), t1);
    t2          = _mm_load_ss(ptrB);
    t2          = _mm_loadh_pi(t2, (__m64 *)(ptrB+1));
    t2          = _mm_sub_ps(t2, t8);
    _mm_store_ss(ptrB, t2);
    _mm_storeh_pi((__m64 *)(ptrB+1), t2);
    t3          = _mm_load_ss(ptrC);
    t3          = _mm_loadh_pi(t3, (__m64 *)(ptrC+1));
    t3          = _mm_sub_ps(t3, t9);
    _mm_store_ss(ptrC, t3);
    _mm_storeh_pi((__m64 *)(ptrC+1), t3);
    t4          = _mm_load_ss(ptrD);
    t4          = _mm_loadh_pi(t4, (__m64 *)(ptrD+1));
    t4          = _mm_sub_ps(t4, t10);
    _mm_store_ss(ptrD, t4);
    _mm_storeh_pi((__m64 *)(ptrD+1), t4);
}


/* Subtract three xyz triplets from nine consecutive floats at each of four
 * addresses. Lane i of every input register is subtracted at the i-th
 * pointer (ptrA..ptrD for lanes 0..3); per-pointer memory order is
 * x1 y1 z1 x2 y2 z2 x3 y3 z3. The by-value parameters x1, z1, y2 and x3
 * are reused as scratch registers during the transpose.
 */
static gmx_inline void gmx_simdcall
gmx_mm_decrement_3rvec_4ptr_swizzle_ps(float * gmx_restrict ptrA, float * gmx_restrict ptrB,
                                       float * gmx_restrict ptrC, float * gmx_restrict ptrD,
                                       __m128 x1, __m128 y1, __m128 z1,
                                       __m128 x2, __m128 y2, __m128 z2,
                                       __m128 x3, __m128 y3, __m128 z3)
{
    __m128 t1, t2, t3, t4, t5, t6, t7, t8, t9, t10;
    __m128 t11, t12, t13, t14, t15, t16, t17, t18, t19;
    __m128 t20, t21, t22, t23, t24, t25;
    /* Interleave neighbouring components: unpacklo covers lanes 0/1
     * (pointers A/B), unpackhi covers lanes 2/3 (pointers C/D).
     */
    t13         = _mm_unpackhi_ps(x1, y1);
    x1          = _mm_unpacklo_ps(x1, y1);
    t14         = _mm_unpackhi_ps(z1, x2);
    z1          = _mm_unpacklo_ps(z1, x2);
    t15         = _mm_unpackhi_ps(y2, z2);
    y2          = _mm_unpacklo_ps(y2, z2);
    t16         = _mm_unpackhi_ps(x3, y3);
    x3          = _mm_unpacklo_ps(x3, y3);
    /* Bring z3 lanes 1, 2, 3 down to lane 0 (t17, t18, t19) for scalar subtracts */
    t17         = _mm_permute_ps(z3, _MM_SHUFFLE(0, 0, 0, 1));
    t18         = _mm_movehl_ps(z3, z3);
    t19         = _mm_permute_ps(t18, _MM_SHUFFLE(0, 0, 0, 1));
    /* Rows x1 y1 z1 x2 for pointers A, B, C, D: t20, t21, t22, t14 */
    t20         = _mm_movelh_ps(x1, z1);
    t21         = _mm_movehl_ps(z1, x1);
    t22         = _mm_movelh_ps(t13, t14);
    t14         = _mm_movehl_ps(t14, t13);
    /* Rows y2 z2 x3 y3 for pointers A, B, C, D: t23, t24, t25, t16 */
    t23         = _mm_movelh_ps(y2, x3);
    t24         = _mm_movehl_ps(x3, y2);
    t25         = _mm_movelh_ps(t15, t16);
    t16         = _mm_movehl_ps(t16, t15);
    /* Pointer A: load 4+4+1 floats, subtract, store back */
    t1          = _mm_loadu_ps(ptrA);
    t2          = _mm_loadu_ps(ptrA+4);
    t3          = _mm_load_ss(ptrA+8);
    t1          = _mm_sub_ps(t1, t20);
    t2          = _mm_sub_ps(t2, t23);
    t3          = _mm_sub_ss(t3, z3);
    _mm_storeu_ps(ptrA, t1);
    _mm_storeu_ps(ptrA+4, t2);
    _mm_store_ss(ptrA+8, t3);
    /* Pointer B */
    t4          = _mm_loadu_ps(ptrB);
    t5          = _mm_loadu_ps(ptrB+4);
    t6          = _mm_load_ss(ptrB+8);
    t4          = _mm_sub_ps(t4, t21);
    t5          = _mm_sub_ps(t5, t24);
    t6          = _mm_sub_ss(t6, t17);
    _mm_storeu_ps(ptrB, t4);
    _mm_storeu_ps(ptrB+4, t5);
    _mm_store_ss(ptrB+8, t6);
    /* Pointer C */
    t7          = _mm_loadu_ps(ptrC);
    t8          = _mm_loadu_ps(ptrC+4);
    t9          = _mm_load_ss(ptrC+8);
    t7          = _mm_sub_ps(t7, t22);
    t8          = _mm_sub_ps(t8, t25);
    t9          = _mm_sub_ss(t9, t18);
    _mm_storeu_ps(ptrC, t7);
    _mm_storeu_ps(ptrC+4, t8);
    _mm_store_ss(ptrC+8, t9);
    /* Pointer D */
    t10         = _mm_loadu_ps(ptrD);
    t11         = _mm_loadu_ps(ptrD+4);
    t12         = _mm_load_ss(ptrD+8);
    t10         = _mm_sub_ps(t10, t14);
    t11         = _mm_sub_ps(t11, t16);
    t12         = _mm_sub_ss(t12, t19);
    _mm_storeu_ps(ptrD, t10);
    _mm_storeu_ps(ptrD+4, t11);
    _mm_store_ss(ptrD+8, t12);
}


/* Subtract four xyz triplets from twelve consecutive floats at each of four
 * addresses. Lane i of every input register is subtracted at the i-th
 * pointer (ptrA..ptrD for lanes 0..3); per-pointer memory order is
 * x1 y1 z1 x2 y2 z2 x3 y3 z3 x4 y4 z4. Several by-value parameters
 * (x1, z1, y2, x3, z3, y4) are reused as scratch registers.
 */
static gmx_inline void gmx_simdcall
gmx_mm_decrement_4rvec_4ptr_swizzle_ps(float * gmx_restrict ptrA, float * gmx_restrict ptrB,
                                       float * gmx_restrict ptrC, float * gmx_restrict ptrD,
                                       __m128 x1, __m128 y1, __m128 z1,
                                       __m128 x2, __m128 y2, __m128 z2,
                                       __m128 x3, __m128 y3, __m128 z3,
                                       __m128 x4, __m128 y4, __m128 z4)
{
    __m128 t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11;
    __m128 t12, t13, t14, t15, t16, t17, t18, t19, t20, t21, t22;
    __m128 t23, t24;
    /* Interleave neighbouring components: unpacklo covers lanes 0/1
     * (pointers A/B), unpackhi covers lanes 2/3 (pointers C/D).
     */
    t13         = _mm_unpackhi_ps(x1, y1);
    x1          = _mm_unpacklo_ps(x1, y1);
    t14         = _mm_unpackhi_ps(z1, x2);
    z1          = _mm_unpacklo_ps(z1, x2);
    t15         = _mm_unpackhi_ps(y2, z2);
    y2          = _mm_unpacklo_ps(y2, z2);
    t16         = _mm_unpackhi_ps(x3, y3);
    x3          = _mm_unpacklo_ps(x3, y3);
    t17         = _mm_unpackhi_ps(z3, x4);
    z3          = _mm_unpacklo_ps(z3, x4);
    t18         = _mm_unpackhi_ps(y4, z4);
    y4          = _mm_unpacklo_ps(y4, z4);
    /* Rows x1 y1 z1 x2 for pointers A, B, C, D: t19, z1, t20, t14 */
    t19         = _mm_movelh_ps(x1, z1);
    z1          = _mm_movehl_ps(z1, x1);
    t20         = _mm_movelh_ps(t13, t14);
    t14         = _mm_movehl_ps(t14, t13);
    /* Rows y2 z2 x3 y3 for pointers A, B, C, D: t21, x3, t22, t16 */
    t21         = _mm_movelh_ps(y2, x3);
    x3          = _mm_movehl_ps(x3, y2);
    t22         = _mm_movelh_ps(t15, t16);
    t16         = _mm_movehl_ps(t16, t15);
    /* Rows z3 x4 y4 z4 for pointers A, B, C, D: t23, y4, t24, t18 */
    t23         = _mm_movelh_ps(z3, y4);
    y4          = _mm_movehl_ps(y4, z3);
    t24         = _mm_movelh_ps(t17, t18);
    t18         = _mm_movehl_ps(t18, t17);
    /* Pointer A: load 12 floats, subtract, store back */
    t1          = _mm_loadu_ps(ptrA);
    t2          = _mm_loadu_ps(ptrA+4);
    t3          = _mm_loadu_ps(ptrA+8);
    t1          = _mm_sub_ps(t1, t19);
    t2          = _mm_sub_ps(t2, t21);
    t3          = _mm_sub_ps(t3, t23);
    _mm_storeu_ps(ptrA, t1);
    _mm_storeu_ps(ptrA+4, t2);
    _mm_storeu_ps(ptrA+8, t3);
    /* Pointer B */
    t4          = _mm_loadu_ps(ptrB);
    t5          = _mm_loadu_ps(ptrB+4);
    t6          = _mm_loadu_ps(ptrB+8);
    t4          = _mm_sub_ps(t4, z1);
    t5          = _mm_sub_ps(t5, x3);
    t6          = _mm_sub_ps(t6, y4);
    _mm_storeu_ps(ptrB, t4);
    _mm_storeu_ps(ptrB+4, t5);
    _mm_storeu_ps(ptrB+8, t6);
    /* Pointer C */
    t7          = _mm_loadu_ps(ptrC);
    t8          = _mm_loadu_ps(ptrC+4);
    t9          = _mm_loadu_ps(ptrC+8);
    t7          = _mm_sub_ps(t7, t20);
    t8          = _mm_sub_ps(t8, t22);
    t9          = _mm_sub_ps(t9, t24);
    _mm_storeu_ps(ptrC, t7);
    _mm_storeu_ps(ptrC+4, t8);
    _mm_storeu_ps(ptrC+8, t9);
    /* Pointer D */
    t10         = _mm_loadu_ps(ptrD);
    t11         = _mm_loadu_ps(ptrD+4);
    t12         = _mm_loadu_ps(ptrD+8);
    t10         = _mm_sub_ps(t10, t14);
    t11         = _mm_sub_ps(t11, t16);
    t12         = _mm_sub_ps(t12, t18);
    _mm_storeu_ps(ptrD, t10);
    _mm_storeu_ps(ptrD+4, t11);
    _mm_storeu_ps(ptrD+8, t12);
}


/* Reduce fix1, fiy1 and fiz1 over their four lanes and add the three sums
 * to fptr[0..2] and also to fshiftptr[0..2]. Exactly three floats are
 * read and written at each destination.
 */
static gmx_inline void gmx_simdcall
gmx_mm_update_iforce_1atom_swizzle_ps(__m128 fix1, __m128 fiy1, __m128 fiz1,
                                      float * gmx_restrict fptr,
                                      float * gmx_restrict fshiftptr)
{
    __m128 sums, fAcc, shiftAcc;

    /* Two rounds of horizontal adds; result lanes (lane 0 first): x x y z */
    sums = _mm_hadd_ps(_mm_hadd_ps(fix1, fix1), _mm_hadd_ps(fiy1, fiz1));

    /* Lane 0 holds element 0, the upper half holds elements 1 and 2;
     * lane 1 is never stored.
     */
    fAcc     = _mm_load_ss(fptr);
    fAcc     = _mm_loadh_pi(fAcc, (__m64 *)(fptr+1));
    shiftAcc = _mm_load_ss(fshiftptr);
    shiftAcc = _mm_loadh_pi(shiftAcc, (__m64 *)(fshiftptr+1));

    fAcc     = _mm_add_ps(fAcc, sums);
    shiftAcc = _mm_add_ps(shiftAcc, sums);

    _mm_store_ss(fptr, fAcc);
    _mm_storeh_pi((__m64 *)(fptr+1), fAcc);
    _mm_store_ss(fshiftptr, shiftAcc);
    _mm_storeh_pi((__m64 *)(fshiftptr+1), shiftAcc);
}


/* Reduce each of the nine input registers over its four lanes, add the
 * nine sums to fptr[0..8] (order x1 y1 z1 x2 y2 z2 x3 y3 z3), and add the
 * per-component totals over the three triplets to fshiftptr[0..2].
 */
static gmx_inline void gmx_simdcall
gmx_mm_update_iforce_3atom_swizzle_ps(__m128 fix1, __m128 fiy1, __m128 fiz1,
                                      __m128 fix2, __m128 fiy2, __m128 fiz2,
                                      __m128 fix3, __m128 fiy3, __m128 fiz3,
                                      float * gmx_restrict fptr,
                                      float * gmx_restrict fshiftptr)
{
    __m128 lo, mid, hi;
    __m128 shiftOld, partA, partB, partC;

    /* Horizontal reductions; lane comments list lane 3 first, lane 0 last */
    lo  = _mm_hadd_ps(_mm_hadd_ps(fix1, fiy1), _mm_hadd_ps(fiz1, fix2)); /* fix2 fiz1 fiy1 fix1 */
    mid = _mm_hadd_ps(_mm_hadd_ps(fiy2, fiz2), _mm_hadd_ps(fix3, fiy3)); /* fiy3 fix3 fiz2 fiy2 */
    hi  = _mm_hadd_ps(fiz3, fiz3);
    hi  = _mm_hadd_ps(hi, hi);                                           /*  -    -    -   fiz3 */

    _mm_storeu_ps(fptr,   _mm_add_ps(lo,  _mm_loadu_ps(fptr)));
    _mm_storeu_ps(fptr+4, _mm_add_ps(mid, _mm_loadu_ps(fptr+4)));
    _mm_store_ss(fptr+8,  _mm_add_ss(hi,  _mm_load_ss(fptr+8)));

    /* Current shift values arranged as y x - z to match the partial sums below */
    shiftOld = _mm_load_ss(fshiftptr+2);
    shiftOld = _mm_loadh_pi(shiftOld, (__m64 *)(fshiftptr));

    partA = _mm_shuffle_ps(hi, lo, _MM_SHUFFLE(1, 0, 0, 0));   /* fiy1 fix1  -   fiz3 */
    partB = _mm_shuffle_ps(lo, mid, _MM_SHUFFLE(3, 2, 2, 2));  /* fiy3 fix3  -   fiz1 */
    partC = _mm_shuffle_ps(mid, lo, _MM_SHUFFLE(3, 3, 0, 1));  /* fix2 fix2 fiy2 fiz2 */
    partC = _mm_permute_ps(partC, _MM_SHUFFLE(1, 2, 0, 0));    /* fiy2 fix2  -   fiz2 */

    /* Same association as before: (A + B) + (C + old) */
    partA = _mm_add_ps(partA, partB);
    partC = _mm_add_ps(partC, shiftOld);
    partA = _mm_add_ps(partA, partC); /* y x - z */

    _mm_store_ss(fshiftptr+2, partA);
    _mm_storeh_pi((__m64 *)(fshiftptr), partA);
}


/* Reduce each of the twelve input registers over its four lanes, add the
 * twelve sums to fptr[0..11] (order x1 y1 z1 x2 y2 z2 x3 y3 z3 x4 y4 z4),
 * and add the per-component totals over the four triplets to
 * fshiftptr[0..2].
 */
static gmx_inline void gmx_simdcall
gmx_mm_update_iforce_4atom_swizzle_ps(__m128 fix1, __m128 fiy1, __m128 fiz1,
                                      __m128 fix2, __m128 fiy2, __m128 fiz2,
                                      __m128 fix3, __m128 fiy3, __m128 fiz3,
                                      __m128 fix4, __m128 fiy4, __m128 fiz4,
                                      float * gmx_restrict fptr,
                                      float * gmx_restrict fshiftptr)
{
    __m128 lo, mid, hi;
    __m128 shiftAcc, partA, partB, partC, partD;

    /* Horizontal reductions; lane comments list lane 3 first, lane 0 last */
    lo  = _mm_hadd_ps(_mm_hadd_ps(fix1, fiy1), _mm_hadd_ps(fiz1, fix2)); /* fix2 fiz1 fiy1 fix1 */
    mid = _mm_hadd_ps(_mm_hadd_ps(fiy2, fiz2), _mm_hadd_ps(fix3, fiy3)); /* fiy3 fix3 fiz2 fiy2 */
    hi  = _mm_hadd_ps(_mm_hadd_ps(fiz3, fix4), _mm_hadd_ps(fiy4, fiz4)); /* fiz4 fiy4 fix4 fiz3 */

    _mm_storeu_ps(fptr,   _mm_add_ps(lo,  _mm_loadu_ps(fptr)));
    _mm_storeu_ps(fptr+4, _mm_add_ps(mid, _mm_loadu_ps(fptr+4)));
    _mm_storeu_ps(fptr+8, _mm_add_ps(hi,  _mm_loadu_ps(fptr+8)));

    /* Current shift values arranged as y x - z to match the partial sums below */
    shiftAcc = _mm_load_ss(fshiftptr+2);
    shiftAcc = _mm_loadh_pi(shiftAcc, (__m64 *)(fshiftptr));

    partA = _mm_permute_ps(lo,  _MM_SHUFFLE(1, 0, 2, 2));      /* fiy1 fix1  -   fiz1 */
    partB = _mm_permute_ps(mid, _MM_SHUFFLE(3, 2, 1, 1));      /* fiy3 fix3  -   fiz2 */
    partC = _mm_permute_ps(hi,  _MM_SHUFFLE(2, 1, 0, 0));      /* fiy4 fix4  -   fiz3 */
    partD = _mm_shuffle_ps(lo, mid, _MM_SHUFFLE(0, 0, 3, 3));  /* fiy2 fiy2 fix2 fix2 */
    partD = _mm_shuffle_ps(hi, partD, _MM_SHUFFLE(2, 0, 3, 3)); /* fiy2 fix2  -   fiz4 */

    /* Same association as before: old + ((A + B) + (C + D)) */
    partA    = _mm_add_ps(partA, partB);
    partC    = _mm_add_ps(partC, partD);
    partA    = _mm_add_ps(partA, partC);
    shiftAcc = _mm_add_ps(shiftAcc, partA);

    _mm_store_ss(fshiftptr+2, shiftAcc);
    _mm_storeh_pi((__m64 *)(fshiftptr), shiftAcc);
}


/* Sum the four lanes of pot1 and add the total to the float at ptrA */
static gmx_inline void gmx_simdcall
gmx_mm_update_1pot_ps(__m128 pot1, float * gmx_restrict ptrA)
{
    /* Two horizontal adds leave the full four-lane sum in every lane */
    const __m128 pairSums = _mm_hadd_ps(pot1, pot1);
    const __m128 total    = _mm_hadd_ps(pairSums, pairSums);

    _mm_store_ss(ptrA, _mm_add_ss(total, _mm_load_ss(ptrA)));
}

/* Sum the four lanes of pot1 and of pot2 and add the totals to the floats
 * at ptrA and ptrB respectively.
 */
static gmx_inline void gmx_simdcall
gmx_mm_update_2pot_ps(__m128 pot1, float * gmx_restrict ptrA,
                      __m128 pot2, float * gmx_restrict ptrB)
{
    /* After both horizontal adds: lane 0 = sum(pot1), lane 1 = sum(pot2) */
    const __m128 pairSums = _mm_hadd_ps(pot1, pot2);
    const __m128 totals   = _mm_hadd_ps(pairSums, pairSums);
    /* Move lane 1 down to lane 0 for the scalar update of ptrB */
    const __m128 total2   = _mm_permute_ps(totals, _MM_SHUFFLE(0, 0, 0, 1));

    _mm_store_ss(ptrA, _mm_add_ss(totals, _mm_load_ss(ptrA)));
    _mm_store_ss(ptrB, _mm_add_ss(total2, _mm_load_ss(ptrB)));
}

/* Float constant -0.0f, i.e. only the sign bit set. The PGI branch produces
 * the same bit pattern (0x80000000) by reading an int/float union inside a
 * statement expression instead of writing the literal.
 * NOTE(review): presumably a workaround for PGI mishandling the -0.0f
 * literal - confirm.
 */
#ifdef __PGI
#    define AVX128FMA_FLOAT_NEGZERO   ({ const union { int  fi; float f; } _gmx_fzero = {-2147483648}; _gmx_fzero.f; })
#else
#    define AVX128FMA_FLOAT_NEGZERO  (-0.0f)
#endif

/* Build a float whose exponent field is round(x) + 127 and whose sign and
 * mantissa bits are zero, i.e. 2^round(x) while the biased exponent stays
 * within the valid range. The float-to-int conversion uses the current
 * rounding mode.
 */
static gmx_inline __m128 gmx_simdcall
avx128fma_set_exponent_f(__m128 x)
{
    const __m128i bias    = _mm_set1_epi32(127);
    const __m128i rounded = _mm_cvtps_epi32(x);
    const __m128i biased  = _mm_add_epi32(rounded, bias);

    /* Shift the biased exponent into bits 23..30 of the float pattern */
    return _mm_castsi128_ps(_mm_slli_epi32(biased, 23));
}


/* Single-precision exp(x), evaluated independently in each lane.
 *
 * With y = x*log2(e) and n = round(y), the result is 2^n * exp(r), where
 * r = x - n*ln(2). The remainder r is formed in two fused steps with ln(2)
 * split into a high part (invargscale0) and a small correction
 * (invargscale1). exp(r) is approximated by the polynomial
 * 1 + r + r^2*(CC0 + CC1*r + CC2*r^2 + CC3*r^3 + CC4*r^4).
 *
 * Lanes with |y| > 126 have the power-of-two factor masked to zero before
 * the final multiplication.
 */
static gmx_inline __m128 gmx_simdcall
avx128fma_exp_f(__m128 x)
{
    const __m128  argscale     = _mm_set1_ps(1.44269504088896341f);
    /* Limit on |y|: disallow numbers that would lead to an IEEE fp exponent reaching +-127. */
    const __m128  arglimit     = _mm_set1_ps(126.0f);
    const __m128  invargscale0 = _mm_set1_ps(-0.693145751953125f);
    const __m128  invargscale1 = _mm_set1_ps(-1.428606765330187045e-06f);
    const __m128  CC4          = _mm_set1_ps(0.00136324646882712841033936f);
    const __m128  CC3          = _mm_set1_ps(0.00836596917361021041870117f);
    const __m128  CC2          = _mm_set1_ps(0.0416710823774337768554688f);
    const __m128  CC1          = _mm_set1_ps(0.166665524244308471679688f);
    const __m128  CC0          = _mm_set1_ps(0.499999850988388061523438f);
    const __m128  one          = _mm_set1_ps(1.0f);
    __m128        fexppart;
    __m128        intpart;
    __m128        y, p;
    __m128        valuemask;

    y         = _mm_mul_ps(x, argscale);
    /* 2^round(y), assembled directly in the exponent bits */
    fexppart  = avx128fma_set_exponent_f(y);
    /* round(y) as a float, via a float->int->float round trip */
    intpart   = _mm_cvtepi32_ps(_mm_cvtps_epi32(y));
    /* andnot with -0.0f clears the sign bit, giving |y| for the range test */
    valuemask = _mm_cmple_ps(_mm_andnot_ps(_mm_set1_ps(AVX128FMA_FLOAT_NEGZERO), y), arglimit);
    fexppart  = _mm_and_ps(fexppart, valuemask);

    /* Extended precision arithmetics */
    x         = _mm_macc_ps(invargscale0, intpart, x);
    x         = _mm_macc_ps(invargscale1, intpart, x);

    /* Horner evaluation of the polynomial in the reduced argument */
    p         = _mm_macc_ps(CC4, x, CC3);
    p         = _mm_macc_ps(p, x, CC2);
    p         = _mm_macc_ps(p, x, CC1);
    p         = _mm_macc_ps(p, x, CC0);
    p         = _mm_macc_ps(_mm_mul_ps(x, x), p, x);
    p         = _mm_add_ps(p, one);
    x         = _mm_mul_ps(p, fexppart);
    return x;
}

/* 1/sqrt(x) per lane: hardware reciprocal-sqrt estimate lu refined by one
 * Newton-Raphson step, lu + (1 - x*lu*lu) * (0.5*lu).
 */
static gmx_inline __m128 gmx_simdcall
avx128fma_invsqrt_f(__m128 x)
{
    const __m128 one      = _mm_set1_ps(1.0f);
    const __m128 half     = _mm_set1_ps(0.5f);
    const __m128 lu       = _mm_rsqrt_ps(x);
    /* nmacc(a, b, c) = c - a*b, so this is the residual 1 - x*lu^2 */
    const __m128 residual = _mm_nmacc_ps(x, _mm_mul_ps(lu, lu), one);
    const __m128 halfLu   = _mm_mul_ps(lu, half);

    return _mm_macc_ps(residual, halfLu, lu);
}

/* 1/x per lane: hardware reciprocal estimate lu refined by one
 * Newton-Raphson step, lu * (2 - lu*x).
 */
static gmx_inline __m128 gmx_simdcall
avx128fma_inv_f(__m128 x)
{
    const __m128 two        = _mm_set1_ps(2.0f);
    const __m128 lu         = _mm_rcp_ps(x);
    /* nmacc(a, b, c) = c - a*b */
    const __m128 correction = _mm_nmacc_ps(lu, x, two);

    return _mm_mul_ps(lu, correction);
}

/* Evaluate a rational polynomial N(z2)/D(z2) per lane, with N of degree 6
 * (coefficients FN0..FN6) and D of degree 4 (FD0..FD4) in z2. Even and odd
 * powers are accumulated separately in z4 = z2*z2 and then combined.
 * NOTE(review): presumably the PME force correction named by the function;
 * the valid argument range is not visible here - confirm against callers.
 */
static gmx_inline __m128 gmx_simdcall
avx128fma_pmecorrF_f(__m128 z2)
{
    const __m128  FN6      = _mm_set1_ps(-1.7357322914161492954e-8f);
    const __m128  FN5      = _mm_set1_ps(1.4703624142580877519e-6f);
    const __m128  FN4      = _mm_set1_ps(-0.000053401640219807709149f);
    const __m128  FN3      = _mm_set1_ps(0.0010054721316683106153f);
    const __m128  FN2      = _mm_set1_ps(-0.019278317264888380590f);
    const __m128  FN1      = _mm_set1_ps(0.069670166153766424023f);
    const __m128  FN0      = _mm_set1_ps(-0.75225204789749321333f);

    const __m128  FD4      = _mm_set1_ps(0.0011193462567257629232f);
    const __m128  FD3      = _mm_set1_ps(0.014866955030185295499f);
    const __m128  FD2      = _mm_set1_ps(0.11583842382862377919f);
    const __m128  FD1      = _mm_set1_ps(0.50736591960530292870f);
    const __m128  FD0      = _mm_set1_ps(1.0f);

    const __m128  z4       = _mm_mul_ps(z2, z2);
    __m128        numEven, numOdd, denEven, denOdd, invDen;

    /* Denominator: even terms FD0 + FD2*z4 + FD4*z4^2, odd terms (FD1 + FD3*z4)*z2 */
    denEven = _mm_macc_ps(FD4, z4, FD2);
    denEven = _mm_macc_ps(denEven, z4, FD0);
    denOdd  = _mm_macc_ps(FD3, z4, FD1);
    denEven = _mm_macc_ps(denOdd, z2, denEven);

    invDen  = avx128fma_inv_f(denEven);

    /* Numerator: even terms FN0 + FN2*z4 + FN4*z4^2 + FN6*z4^3,
     * odd terms (FN1 + FN3*z4 + FN5*z4^2)*z2
     */
    numEven = _mm_macc_ps(FN6, z4, FN4);
    numEven = _mm_macc_ps(numEven, z4, FN2);
    numEven = _mm_macc_ps(numEven, z4, FN0);
    numOdd  = _mm_macc_ps(FN5, z4, FN3);
    numOdd  = _mm_macc_ps(numOdd, z4, FN1);
    numEven = _mm_macc_ps(numOdd, z2, numEven);

    return _mm_mul_ps(numEven, invDen);
}

/* Evaluate a rational polynomial N(z2)/D(z2) per lane, with N of degree 6
 * (coefficients VN0..VN6) and D of degree 3 (VD0..VD3) in z2. Even and odd
 * powers are accumulated separately in z4 = z2*z2 and then combined.
 * NOTE(review): presumably the PME potential correction named by the
 * function; the valid argument range is not visible here - confirm against
 * callers.
 */
static gmx_inline __m128 gmx_simdcall
avx128fma_pmecorrV_f(__m128 z2)
{
    const __m128  VN6      = _mm_set1_ps(1.9296833005951166339e-8f);
    const __m128  VN5      = _mm_set1_ps(-1.4213390571557850962e-6f);
    const __m128  VN4      = _mm_set1_ps(0.000041603292906656984871f);
    const __m128  VN3      = _mm_set1_ps(-0.00013134036773265025626f);
    const __m128  VN2      = _mm_set1_ps(0.038657983986041781264f);
    const __m128  VN1      = _mm_set1_ps(0.11285044772717598220f);
    const __m128  VN0      = _mm_set1_ps(1.1283802385263030286f);

    const __m128  VD3      = _mm_set1_ps(0.0066752224023576045451f);
    const __m128  VD2      = _mm_set1_ps(0.078647795836373922256f);
    const __m128  VD1      = _mm_set1_ps(0.43336185284710920150f);
    const __m128  VD0      = _mm_set1_ps(1.0f);

    const __m128  z4       = _mm_mul_ps(z2, z2);
    __m128        numEven, numOdd, denEven, denOdd, invDen;

    /* Denominator: even terms VD0 + VD2*z4, odd terms (VD1 + VD3*z4)*z2 */
    denOdd  = _mm_macc_ps(VD3, z4, VD1);
    denEven = _mm_macc_ps(VD2, z4, VD0);
    denEven = _mm_macc_ps(denOdd, z2, denEven);

    invDen  = avx128fma_inv_f(denEven);

    /* Numerator: even terms VN0 + VN2*z4 + VN4*z4^2 + VN6*z4^3,
     * odd terms (VN1 + VN3*z4 + VN5*z4^2)*z2
     */
    numEven = _mm_macc_ps(VN6, z4, VN4);
    numEven = _mm_macc_ps(numEven, z4, VN2);
    numEven = _mm_macc_ps(numEven, z4, VN0);
    numOdd  = _mm_macc_ps(VN5, z4, VN3);
    numOdd  = _mm_macc_ps(numOdd, z4, VN1);
    numEven = _mm_macc_ps(numOdd, z2, numEven);

    return _mm_mul_ps(numEven, invDen);
}

#endif /* _kernelutil_x86_avx_128_fma_single_h_ */
