/*  This file is part of the Vc library.

    Copyright (C) 2010-2012 Matthias Kretz <kretz@kde.org>

    Vc is free software: you can redistribute it and/or modify
    it under the terms of the GNU Lesser General Public License as
    published by the Free Software Foundation, either version 3 of
    the License, or (at your option) any later version.

    Vc is distributed in the hope that it will be useful, but
    WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU Lesser General Public License for more details.

    You should have received a copy of the GNU Lesser General Public
    License along with Vc.  If not, see <http://www.gnu.org/licenses/>.

*/

#include "limits.h"
#include "../common/bitscanintrinsics.h"
#include "macros.h"

namespace ROOT {
namespace Vc
{
ALIGN(64) extern unsigned int RandomState[16];

namespace SSE
{

template<typename T, int Size> static Vc_ALWAYS_INLINE Vc_CONST const T *_IndexesFromZero() {
    if (Size == 4) {
        return reinterpret_cast<const T *>(_IndexesFromZero4);
    } else if (Size == 8) {
        return reinterpret_cast<const T *>(_IndexesFromZero8);
    } else if (Size == 16) {
        return reinterpret_cast<const T *>(_IndexesFromZero16);
    }
    return 0;
}

///////////////////////////////////////////////////////////////////////////////////////////
// constants {{{1
template<typename T> Vc_INTRINSIC Vector<T>::Vector(VectorSpecialInitializerZero::ZEnum)
    : d(VectorHelper<VectorType>::zero())
{
}

template<typename T> Vc_INTRINSIC Vector<T>::Vector(VectorSpecialInitializerOne::OEnum)
    : d(VectorHelper<T>::one())
{
}

template<typename T> Vc_INTRINSIC Vector<T>::Vector(VectorSpecialInitializerIndexesFromZero::IEnum)
    : d(VectorHelper<VectorType>::load(_IndexesFromZero<EntryType, Size>(), Aligned))
{
}

template<typename T> Vc_INTRINSIC Vc_CONST Vector<T> Vector<T>::Zero()
{
    return VectorHelper<VectorType>::zero();
}

template<typename T> Vc_INTRINSIC Vc_CONST Vector<T> Vector<T>::One()
{
    return VectorHelper<T>::one();
}

template<typename T> Vc_INTRINSIC Vc_CONST Vector<T> Vector<T>::IndexesFromZero()
{
    return VectorHelper<VectorType>::load(_IndexesFromZero<EntryType, Size>(), Aligned);
}

// conversion/casts {{{1
template<typename T> template<typename OtherT> Vc_INTRINSIC Vector<T>::Vector(const Vector<OtherT> &x)
    : d(StaticCastHelper<OtherT, T>::cast(x.data()))
{
}

template<> template<> Vc_INTRINSIC short_v &Vector<short>::operator=(const ushort_v &x) {
    data() = StaticCastHelper<unsigned short, short>::cast(x.data()); return *this;
}
template<> template<> Vc_INTRINSIC ushort_v &Vector<unsigned short>::operator=(const short_v &x) {
    data() = StaticCastHelper<short, unsigned short>::cast(x.data()); return *this;
}
template<> template<> Vc_INTRINSIC int_v &Vector<int>::operator=(const uint_v &x) {
    data() = StaticCastHelper<unsigned int, int>::cast(x.data()); return *this;
}
template<> template<> Vc_INTRINSIC uint_v &Vector<unsigned int>::operator=(const int_v &x) {
    data() = StaticCastHelper<int, unsigned int>::cast(x.data()); return *this;
}

// broadcasts {{{1
template<typename T> Vc_INTRINSIC Vector<T>::Vector(EntryType a)
    : d(VectorHelper<T>::set(a))
{
}

///////////////////////////////////////////////////////////////////////////////////////////
// load ctors {{{1
template<typename T> Vc_ALWAYS_INLINE Vector<T>::Vector(const EntryType *x) { load(x); }
template<typename T> template<typename A> Vc_ALWAYS_INLINE Vector<T>::Vector(const EntryType *x, A a) { load(x, a); }
template<typename T> template<typename OtherT> Vc_ALWAYS_INLINE Vector<T>::Vector(const OtherT *x) { load(x); }
template<typename T> template<typename OtherT, typename A> Vc_ALWAYS_INLINE Vector<T>::Vector(const OtherT *x, A a) { load(x, a); }

///////////////////////////////////////////////////////////////////////////////////////////
// load member functions {{{1
template<typename T> Vc_INTRINSIC void Vector<T>::load(const EntryType *mem)
{
    load(mem, Aligned);
}
template<typename T> template<typename A> Vc_INTRINSIC void Vector<T>::load(const EntryType *mem, A align)
{
    d.v() = VectorHelper<VectorType>::load(mem, align);
}
template<typename T> template<typename OtherT> Vc_INTRINSIC void Vector<T>::load(const OtherT *mem)
{
    load(mem, Aligned);
}
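// Example (sketch): how the load constructors and load() members above are used from
// application code. This assumes the public Vc API (float_v and the Vc::Aligned /
// Vc::Unaligned load policies); the array name is illustrative only.
//
//     float data[1024] = {};                               // plain storage, alignment unknown
//     ROOT::Vc::float_v a(&data[0], ROOT::Vc::Unaligned);  // explicitly unaligned load
//     a.load(&data[4], ROOT::Vc::Unaligned);               // the same through the member function
//     ROOT::Vc::float_v b(&data[8]);                       // the default policy is Aligned and
//                                                          // requires a 16-byte aligned address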
// float8: simply use the float implementation twice {{{2
template<> template<typename OtherT, typename A> Vc_INTRINSIC void Vector<float8>::load(const OtherT *x, A a)
{
    d.v() = M256::create(
            Vector<float>(&x[0], a).data(),
            Vector<float>(&x[4], a).data()
            );
}

// LoadHelper {{{2
template<typename DstT, typename SrcT, typename Flags> struct LoadHelper;

// float {{{2
template<typename Flags> struct LoadHelper<float, double, Flags> {
    static Vc_ALWAYS_INLINE Vc_PURE __m128 load(const double *mem, Flags f)
    {
        return _mm_movelh_ps(_mm_cvtpd_ps(VectorHelper<__m128d>::load(&mem[0], f)),
                             _mm_cvtpd_ps(VectorHelper<__m128d>::load(&mem[2], f)));
    }
};
template<typename Flags> struct LoadHelper<float, unsigned int, Flags> {
    static Vc_ALWAYS_INLINE Vc_PURE __m128 load(const unsigned int *mem, Flags f)
    {
        return StaticCastHelper<unsigned int, float>::cast(VectorHelper<__m128i>::load(mem, f));
    }
};
template<typename Flags> struct LoadHelper<float, int, Flags> {
    static Vc_ALWAYS_INLINE Vc_PURE __m128 load(const int *mem, Flags f)
    {
        return StaticCastHelper<int, float>::cast(VectorHelper<__m128i>::load(mem, f));
    }
};
template<typename Flags> struct LoadHelper<float, unsigned short, Flags> {
    static Vc_ALWAYS_INLINE Vc_PURE __m128 load(const unsigned short *mem, Flags f)
    {
        return _mm_cvtepi32_ps(LoadHelper<int, unsigned short, Flags>::load(mem, f));
    }
};
template<typename Flags> struct LoadHelper<float, short, Flags> {
    static Vc_ALWAYS_INLINE Vc_PURE __m128 load(const short *mem, Flags f)
    {
        return _mm_cvtepi32_ps(LoadHelper<int, short, Flags>::load(mem, f));
    }
};
template<typename Flags> struct LoadHelper<float, unsigned char, Flags> {
    static Vc_ALWAYS_INLINE Vc_PURE __m128 load(const unsigned char *mem, Flags f)
    {
        return _mm_cvtepi32_ps(LoadHelper<int, unsigned char, Flags>::load(mem, f));
    }
};
template<typename Flags> struct LoadHelper<float, signed char, Flags> {
    static Vc_ALWAYS_INLINE Vc_PURE __m128 load(const signed char *mem, Flags f)
    {
        return _mm_cvtepi32_ps(LoadHelper<int, signed char, Flags>::load(mem, f));
    }
};

// int {{{2
template<typename Flags> struct LoadHelper<int, unsigned int, Flags> {
    static Vc_ALWAYS_INLINE Vc_PURE __m128i load(const unsigned int *mem, Flags f)
    {
        return VectorHelper<__m128i>::load(mem, f);
    }
};
// no difference between streaming and alignment, because the
// 32/64 bit loads are not available as streaming loads, and can always be unaligned
template<typename Flags> struct LoadHelper<int, unsigned short, Flags> {
    static Vc_ALWAYS_INLINE Vc_PURE __m128i load(const unsigned short *mem, Flags)
    {
        return _mm_cvtepu16_epi32(_mm_loadl_epi64(reinterpret_cast<const __m128i *>(mem)));
    }
};
template<typename Flags> struct LoadHelper<int, short, Flags> {
    static Vc_ALWAYS_INLINE Vc_PURE __m128i load(const short *mem, Flags)
    {
        return _mm_cvtepi16_epi32(_mm_loadl_epi64(reinterpret_cast<const __m128i *>(mem)));
    }
};
template<typename Flags> struct LoadHelper<int, unsigned char, Flags> {
    static Vc_ALWAYS_INLINE Vc_PURE __m128i load(const unsigned char *mem, Flags)
    {
        return _mm_cvtepu8_epi32(_mm_cvtsi32_si128(*reinterpret_cast<const int *>(mem)));
    }
};
template<typename Flags> struct LoadHelper<int, signed char, Flags> {
    static Vc_ALWAYS_INLINE Vc_PURE __m128i load(const signed char *mem, Flags)
    {
        return _mm_cvtepi8_epi32(_mm_cvtsi32_si128(*reinterpret_cast<const int *>(mem)));
    }
};

// unsigned int {{{2
template<typename Flags> struct LoadHelper<unsigned int, unsigned short, Flags> {
    static Vc_ALWAYS_INLINE Vc_PURE __m128i load(const unsigned short *mem, Flags)
    {
        return _mm_cvtepu16_epi32(_mm_loadl_epi64(reinterpret_cast<const __m128i *>(mem)));
    }
};
template<typename Flags> struct LoadHelper<unsigned int, unsigned char, Flags> {
    static Vc_ALWAYS_INLINE Vc_PURE __m128i load(const unsigned char *mem, Flags)
    {
        return _mm_cvtepu8_epi32(_mm_cvtsi32_si128(*reinterpret_cast<const int *>(mem)));
    }
};

// short {{{2
template<typename Flags> struct LoadHelper<short, unsigned short, Flags> {
    static Vc_ALWAYS_INLINE Vc_PURE __m128i load(const unsigned short *mem, Flags f)
    {
        return VectorHelper<__m128i>::load(mem, f);
    }
};
template<typename Flags> struct LoadHelper<short, unsigned char, Flags> {
    static Vc_ALWAYS_INLINE Vc_PURE __m128i load(const unsigned char *mem, Flags)
    {
        return _mm_cvtepu8_epi16(_mm_loadl_epi64(reinterpret_cast<const __m128i *>(mem)));
    }
};
template<typename Flags> struct LoadHelper<short, signed char, Flags> {
    static Vc_ALWAYS_INLINE Vc_PURE __m128i load(const signed char *mem, Flags)
    {
        return _mm_cvtepi8_epi16(_mm_loadl_epi64(reinterpret_cast<const __m128i *>(mem)));
    }
};

// unsigned short {{{2
template<typename Flags> struct LoadHelper<unsigned short, unsigned char, Flags> {
    static Vc_ALWAYS_INLINE Vc_PURE __m128i load(const unsigned
char *mem, Flags) { return _mm_cvtepu8_epi16(_mm_loadl_epi64(reinterpret_cast(mem))); } }; // general load, implemented via LoadHelper {{{2 template template Vc_INTRINSIC void Vector::load(const SrcT *x, Flags f) { d.v() = LoadHelper::load(x, f); } /////////////////////////////////////////////////////////////////////////////////////////// // expand/combine {{{1 template Vc_INTRINSIC Vector::Vector(const Vector::Type> *a) : d(VectorHelper::concat(a[0].data(), a[1].data())) { } template inline void Vector::expand(Vector::Type> *x) const { if (Size == 8u) { x[0].data() = VectorHelper::expand0(data()); x[1].data() = VectorHelper::expand1(data()); } } /////////////////////////////////////////////////////////////////////////////////////////// // zeroing {{{1 template Vc_INTRINSIC void Vector::setZero() { data() = VectorHelper::zero(); } template Vc_INTRINSIC void Vector::setZero(const Mask &k) { data() = VectorHelper::andnot_(mm128_reinterpret_cast(k.data()), data()); } template<> Vc_INTRINSIC void Vector::setQnan() { data() = _mm_setallone_pd(); } template<> Vc_INTRINSIC void Vector::setQnan(Mask::Argument k) { data() = _mm_or_pd(data(), k.dataD()); } template<> Vc_INTRINSIC void Vector::setQnan() { data() = _mm_setallone_ps(); } template<> Vc_INTRINSIC void Vector::setQnan(Mask::Argument k) { data() = _mm_or_ps(data(), k.data()); } template<> Vc_INTRINSIC void Vector::setQnan() { d.v()[0] = _mm_setallone_ps(); d.v()[1] = _mm_setallone_ps(); } template<> Vc_INTRINSIC void Vector::setQnan(Mask::Argument k) { d.v()[0] = _mm_or_ps(d.v()[0], k.data()[0]); d.v()[1] = _mm_or_ps(d.v()[1], k.data()[1]); } /////////////////////////////////////////////////////////////////////////////////////////// // stores {{{1 template Vc_INTRINSIC void Vector::store(EntryType *mem) const { VectorHelper::store(mem, data(), Aligned); } template Vc_INTRINSIC void Vector::store(EntryType *mem, const Mask &mask) const { VectorHelper::store(mem, data(), mm128_reinterpret_cast(mask.data()), Aligned); } template template Vc_INTRINSIC void Vector::store(EntryType *mem, A align) const { VectorHelper::store(mem, data(), align); } template template Vc_INTRINSIC void Vector::store(EntryType *mem, const Mask &mask, A align) const { HV::store(mem, data(), mm128_reinterpret_cast(mask.data()), align); } /////////////////////////////////////////////////////////////////////////////////////////// // division {{{1 template Vc_INTRINSIC Vector &WriteMaskedVector::operator/=(const Vector &x) { return operator=(*vec / x); } template<> Vc_INTRINSIC int_v &WriteMaskedVector::operator/=(const int_v &x) { Vc_foreach_bit (int i, mask) { vec->d.m(i) /= x.d.m(i); } return *vec; } template<> Vc_INTRINSIC uint_v &WriteMaskedVector::operator/=(const uint_v &x) { Vc_foreach_bit (int i, mask) { vec->d.m(i) /= x.d.m(i); } return *vec; } template<> Vc_INTRINSIC short_v &WriteMaskedVector::operator/=(const short_v &x) { Vc_foreach_bit (int i, mask) { vec->d.m(i) /= x.d.m(i); } return *vec; } template<> Vc_INTRINSIC ushort_v &WriteMaskedVector::operator/=(const ushort_v &x) { Vc_foreach_bit (int i, mask) { vec->d.m(i) /= x.d.m(i); } return *vec; } template inline Vector &Vector::operator/=(EntryType x) { if (VectorTraits::HasVectorDivision) { return operator/=(Vector(x)); } for_all_vector_entries(i, d.m(i) /= x; ); return *this; } template template Vc_INTRINSIC Vc_PURE VC_EXACT_TYPE(TT, typename DetermineEntryType::Type, Vector) Vector::operator/(TT x) const { if (VectorTraits::HasVectorDivision) { return operator/(Vector(x)); } Vector r; 
for_all_vector_entries(i, r.d.m(i) = d.m(i) / x; ); return r; } template inline Vector &Vector::operator/=(const Vector &x) { for_all_vector_entries(i, d.m(i) /= x.d.m(i); ); return *this; } template inline Vc_PURE Vector Vector::operator/(const Vector &x) const { Vector r; for_all_vector_entries(i, r.d.m(i) = d.m(i) / x.d.m(i); ); return r; } template<> inline Vector &Vector::operator/=(const Vector &x) { __m128 lo = _mm_cvtepi32_ps(VectorHelper::expand0(d.v())); __m128 hi = _mm_cvtepi32_ps(VectorHelper::expand1(d.v())); lo = _mm_div_ps(lo, _mm_cvtepi32_ps(VectorHelper::expand0(x.d.v()))); hi = _mm_div_ps(hi, _mm_cvtepi32_ps(VectorHelper::expand1(x.d.v()))); d.v() = _mm_packs_epi32(_mm_cvtps_epi32(lo), _mm_cvtps_epi32(hi)); return *this; } template<> inline Vc_PURE Vector Vector::operator/(const Vector &x) const { __m128 lo = _mm_cvtepi32_ps(VectorHelper::expand0(d.v())); __m128 hi = _mm_cvtepi32_ps(VectorHelper::expand1(d.v())); lo = _mm_div_ps(lo, _mm_cvtepi32_ps(VectorHelper::expand0(x.d.v()))); hi = _mm_div_ps(hi, _mm_cvtepi32_ps(VectorHelper::expand1(x.d.v()))); return _mm_packs_epi32(_mm_cvtps_epi32(lo), _mm_cvtps_epi32(hi)); } template<> inline Vector &Vector::operator/=(const Vector &x) { __m128 lo = _mm_cvtepi32_ps(VectorHelper::expand0(d.v())); __m128 hi = _mm_cvtepi32_ps(VectorHelper::expand1(d.v())); lo = _mm_div_ps(lo, _mm_cvtepi32_ps(VectorHelper::expand0(x.d.v()))); hi = _mm_div_ps(hi, _mm_cvtepi32_ps(VectorHelper::expand1(x.d.v()))); d.v() = _mm_packs_epi32(_mm_cvtps_epi32(lo), _mm_cvtps_epi32(hi)); return *this; } template<> Vc_ALWAYS_INLINE Vc_PURE Vector Vector::operator/(const Vector &x) const { __m128 lo = _mm_cvtepi32_ps(VectorHelper::expand0(d.v())); __m128 hi = _mm_cvtepi32_ps(VectorHelper::expand1(d.v())); lo = _mm_div_ps(lo, _mm_cvtepi32_ps(VectorHelper::expand0(x.d.v()))); hi = _mm_div_ps(hi, _mm_cvtepi32_ps(VectorHelper::expand1(x.d.v()))); return _mm_packs_epi32(_mm_cvtps_epi32(lo), _mm_cvtps_epi32(hi)); } template<> Vc_ALWAYS_INLINE Vector &Vector::operator/=(const Vector &x) { d.v() = _mm_div_ps(d.v(), x.d.v()); return *this; } template<> Vc_ALWAYS_INLINE Vc_PURE Vector Vector::operator/(const Vector &x) const { return _mm_div_ps(d.v(), x.d.v()); } template<> Vc_ALWAYS_INLINE Vector &Vector::operator/=(const Vector &x) { d.v()[0] = _mm_div_ps(d.v()[0], x.d.v()[0]); d.v()[1] = _mm_div_ps(d.v()[1], x.d.v()[1]); return *this; } template<> Vc_ALWAYS_INLINE Vc_PURE Vector Vector::operator/(const Vector &x) const { Vector r; r.d.v()[0] = _mm_div_ps(d.v()[0], x.d.v()[0]); r.d.v()[1] = _mm_div_ps(d.v()[1], x.d.v()[1]); return r; } template<> Vc_ALWAYS_INLINE Vector &Vector::operator/=(const Vector &x) { d.v() = _mm_div_pd(d.v(), x.d.v()); return *this; } template<> Vc_ALWAYS_INLINE Vc_PURE Vector Vector::operator/(const Vector &x) const { return _mm_div_pd(d.v(), x.d.v()); } /////////////////////////////////////////////////////////////////////////////////////////// // operator- {{{1 template<> Vc_ALWAYS_INLINE Vector Vc_PURE Vc_FLATTEN Vector::operator-() const { return _mm_xor_pd(d.v(), _mm_setsignmask_pd()); } template<> Vc_ALWAYS_INLINE Vector Vc_PURE Vc_FLATTEN Vector::operator-() const { return _mm_xor_ps(d.v(), _mm_setsignmask_ps()); } template<> Vc_ALWAYS_INLINE Vector Vc_PURE Vc_FLATTEN Vector::operator-() const { return M256::create( _mm_xor_ps(d.v()[0], _mm_setsignmask_ps()), _mm_xor_ps(d.v()[1], _mm_setsignmask_ps())); } template<> Vc_ALWAYS_INLINE Vector Vc_PURE Vc_FLATTEN Vector::operator-() const { #ifdef VC_IMPL_SSSE3 return _mm_sign_epi32(d.v(), 
_mm_setallone_si128()); #else return _mm_add_epi32(_mm_xor_si128(d.v(), _mm_setallone_si128()), _mm_setone_epi32()); #endif } template<> Vc_ALWAYS_INLINE Vector Vc_PURE Vc_FLATTEN Vector::operator-() const { #ifdef VC_IMPL_SSSE3 return _mm_sign_epi32(d.v(), _mm_setallone_si128()); #else return _mm_add_epi32(_mm_xor_si128(d.v(), _mm_setallone_si128()), _mm_setone_epi32()); #endif } template<> Vc_ALWAYS_INLINE Vector Vc_PURE Vc_FLATTEN Vector::operator-() const { #ifdef VC_IMPL_SSSE3 return _mm_sign_epi16(d.v(), _mm_setallone_si128()); #else return _mm_mullo_epi16(d.v(), _mm_setallone_si128()); #endif } template<> Vc_ALWAYS_INLINE Vector Vc_PURE Vc_FLATTEN Vector::operator-() const { #ifdef VC_IMPL_SSSE3 return _mm_sign_epi16(d.v(), _mm_setallone_si128()); #else return _mm_mullo_epi16(d.v(), _mm_setallone_si128()); #endif } /////////////////////////////////////////////////////////////////////////////////////////// // integer ops {{{1 #define OP_IMPL(T, symbol, fun) \ template<> Vc_ALWAYS_INLINE Vector &Vector::operator symbol##=(const Vector &x) \ { \ d.v() = VectorHelper::fun(d.v(), x.d.v()); \ return *this; \ } \ template<> Vc_ALWAYS_INLINE Vc_PURE Vector Vector::operator symbol(const Vector &x) const \ { \ return VectorHelper::fun(d.v(), x.d.v()); \ } OP_IMPL(int, &, and_) OP_IMPL(int, |, or_) OP_IMPL(int, ^, xor_) OP_IMPL(unsigned int, &, and_) OP_IMPL(unsigned int, |, or_) OP_IMPL(unsigned int, ^, xor_) OP_IMPL(short, &, and_) OP_IMPL(short, |, or_) OP_IMPL(short, ^, xor_) OP_IMPL(unsigned short, &, and_) OP_IMPL(unsigned short, |, or_) OP_IMPL(unsigned short, ^, xor_) OP_IMPL(float, &, and_) OP_IMPL(float, |, or_) OP_IMPL(float, ^, xor_) OP_IMPL(float8, &, and_) OP_IMPL(float8, |, or_) OP_IMPL(float8, ^, xor_) OP_IMPL(double, &, and_) OP_IMPL(double, |, or_) OP_IMPL(double, ^, xor_) #undef OP_IMPL #ifdef VC_IMPL_XOP static Vc_INTRINSIC Vc_CONST __m128i shiftLeft (const int_v &value, const int_v &count) { return _mm_sha_epi32(value.data(), count.data()); } static Vc_INTRINSIC Vc_CONST __m128i shiftLeft (const uint_v &value, const uint_v &count) { return _mm_shl_epi32(value.data(), count.data()); } static Vc_INTRINSIC Vc_CONST __m128i shiftLeft (const short_v &value, const short_v &count) { return _mm_sha_epi16(value.data(), count.data()); } static Vc_INTRINSIC Vc_CONST __m128i shiftLeft (const ushort_v &value, const ushort_v &count) { return _mm_shl_epi16(value.data(), count.data()); } static Vc_INTRINSIC Vc_CONST __m128i shiftRight(const int_v &value, const int_v &count) { return shiftLeft(value, -count ); } static Vc_INTRINSIC Vc_CONST __m128i shiftRight(const uint_v &value, const uint_v &count) { return shiftLeft(value, uint_v(-count)); } static Vc_INTRINSIC Vc_CONST __m128i shiftRight(const short_v &value, const short_v &count) { return shiftLeft(value, -count ); } static Vc_INTRINSIC Vc_CONST __m128i shiftRight(const ushort_v &value, const ushort_v &count) { return shiftLeft(value, ushort_v(-count)); } #define _VC_OP(T, symbol, impl) \ template<> Vc_INTRINSIC T &T::operator symbol##=(T::AsArg shift) \ { \ d.v() = impl(*this, shift); \ return *this; \ } \ template<> Vc_INTRINSIC Vc_PURE T T::operator symbol (T::AsArg shift) const \ { \ return impl(*this, shift); \ } VC_APPLY_2(VC_LIST_INT_VECTOR_TYPES, _VC_OP, <<, shiftLeft) VC_APPLY_2(VC_LIST_INT_VECTOR_TYPES, _VC_OP, >>, shiftRight) #undef _VC_OP #else #if defined(VC_GCC) && VC_GCC == 0x40600 && defined(VC_IMPL_XOP) #define VC_WORKAROUND __attribute__((optimize("no-tree-vectorize"),weak)) #else #define VC_WORKAROUND Vc_INTRINSIC 
#endif #define OP_IMPL(T, symbol) \ template<> VC_WORKAROUND Vector &Vector::operator symbol##=(Vector::AsArg x) \ { \ for_all_vector_entries(i, \ d.m(i) symbol##= x.d.m(i); \ ); \ return *this; \ } \ template<> inline Vc_PURE Vector Vector::operator symbol(Vector::AsArg x) const \ { \ Vector r; \ for_all_vector_entries(i, \ r.d.m(i) = d.m(i) symbol x.d.m(i); \ ); \ return r; \ } OP_IMPL(int, <<) OP_IMPL(int, >>) OP_IMPL(unsigned int, <<) OP_IMPL(unsigned int, >>) OP_IMPL(short, <<) OP_IMPL(short, >>) OP_IMPL(unsigned short, <<) OP_IMPL(unsigned short, >>) #undef OP_IMPL #undef VC_WORKAROUND #endif template Vc_ALWAYS_INLINE Vector &Vector::operator>>=(int shift) { d.v() = VectorHelper::shiftRight(d.v(), shift); return *this; } template Vc_ALWAYS_INLINE Vc_PURE Vector Vector::operator>>(int shift) const { return VectorHelper::shiftRight(d.v(), shift); } template Vc_ALWAYS_INLINE Vector &Vector::operator<<=(int shift) { d.v() = VectorHelper::shiftLeft(d.v(), shift); return *this; } template Vc_ALWAYS_INLINE Vc_PURE Vector Vector::operator<<(int shift) const { return VectorHelper::shiftLeft(d.v(), shift); } /////////////////////////////////////////////////////////////////////////////////////////// // swizzles {{{1 template Vc_INTRINSIC Vc_PURE const Vector &Vector::abcd() const { return *this; } template Vc_INTRINSIC Vc_PURE const Vector Vector::cdab() const { return Mem::permute(data()); } template Vc_INTRINSIC Vc_PURE const Vector Vector::badc() const { return Mem::permute(data()); } template Vc_INTRINSIC Vc_PURE const Vector Vector::aaaa() const { return Mem::permute(data()); } template Vc_INTRINSIC Vc_PURE const Vector Vector::bbbb() const { return Mem::permute(data()); } template Vc_INTRINSIC Vc_PURE const Vector Vector::cccc() const { return Mem::permute(data()); } template Vc_INTRINSIC Vc_PURE const Vector Vector::dddd() const { return Mem::permute(data()); } template Vc_INTRINSIC Vc_PURE const Vector Vector::bcad() const { return Mem::permute(data()); } template Vc_INTRINSIC Vc_PURE const Vector Vector::bcda() const { return Mem::permute(data()); } template Vc_INTRINSIC Vc_PURE const Vector Vector::dabc() const { return Mem::permute(data()); } template Vc_INTRINSIC Vc_PURE const Vector Vector::acbd() const { return Mem::permute(data()); } template Vc_INTRINSIC Vc_PURE const Vector Vector::dbca() const { return Mem::permute(data()); } template Vc_INTRINSIC Vc_PURE const Vector Vector::dcba() const { return Mem::permute(data()); } template<> Vc_INTRINSIC Vc_PURE const sfloat_v Vector::cdab() const { return M256::create(Mem::permute(d.v()[0]), Mem::permute(d.v()[1])); } template<> Vc_INTRINSIC Vc_PURE const sfloat_v Vector::badc() const { return M256::create(Mem::permute(d.v()[0]), Mem::permute(d.v()[1])); } template<> Vc_INTRINSIC Vc_PURE const sfloat_v Vector::aaaa() const { return M256::create(Mem::permute(d.v()[0]), Mem::permute(d.v()[1])); } template<> Vc_INTRINSIC Vc_PURE const sfloat_v Vector::bbbb() const { return M256::create(Mem::permute(d.v()[0]), Mem::permute(d.v()[1])); } template<> Vc_INTRINSIC Vc_PURE const sfloat_v Vector::cccc() const { return M256::create(Mem::permute(d.v()[0]), Mem::permute(d.v()[1])); } template<> Vc_INTRINSIC Vc_PURE const sfloat_v Vector::dddd() const { return M256::create(Mem::permute(d.v()[0]), Mem::permute(d.v()[1])); } template<> Vc_INTRINSIC Vc_PURE const sfloat_v Vector::bcad() const { return M256::create(Mem::permute(d.v()[0]), Mem::permute(d.v()[1])); } template<> Vc_INTRINSIC Vc_PURE const sfloat_v Vector::bcda() const { return 
M256::create(Mem::permute(d.v()[0]), Mem::permute(d.v()[1])); } template<> Vc_INTRINSIC Vc_PURE const sfloat_v Vector::dabc() const { return M256::create(Mem::permute(d.v()[0]), Mem::permute(d.v()[1])); } template<> Vc_INTRINSIC Vc_PURE const sfloat_v Vector::acbd() const { return M256::create(Mem::permute(d.v()[0]), Mem::permute(d.v()[1])); } template<> Vc_INTRINSIC Vc_PURE const sfloat_v Vector::dbca() const { return M256::create(Mem::permute(d.v()[0]), Mem::permute(d.v()[1])); } template<> Vc_INTRINSIC Vc_PURE const sfloat_v Vector::dcba() const { return M256::create(Mem::permute(d.v()[0]), Mem::permute(d.v()[1])); } #define VC_SWIZZLES_16BIT_IMPL(T) \ template<> Vc_INTRINSIC Vc_PURE const Vector Vector::cdab() const { return Mem::permute(data()); } \ template<> Vc_INTRINSIC Vc_PURE const Vector Vector::badc() const { return Mem::permute(data()); } \ template<> Vc_INTRINSIC Vc_PURE const Vector Vector::aaaa() const { return Mem::permute(data()); } \ template<> Vc_INTRINSIC Vc_PURE const Vector Vector::bbbb() const { return Mem::permute(data()); } \ template<> Vc_INTRINSIC Vc_PURE const Vector Vector::cccc() const { return Mem::permute(data()); } \ template<> Vc_INTRINSIC Vc_PURE const Vector Vector::dddd() const { return Mem::permute(data()); } \ template<> Vc_INTRINSIC Vc_PURE const Vector Vector::bcad() const { return Mem::permute(data()); } \ template<> Vc_INTRINSIC Vc_PURE const Vector Vector::bcda() const { return Mem::permute(data()); } \ template<> Vc_INTRINSIC Vc_PURE const Vector Vector::dabc() const { return Mem::permute(data()); } \ template<> Vc_INTRINSIC Vc_PURE const Vector Vector::acbd() const { return Mem::permute(data()); } \ template<> Vc_INTRINSIC Vc_PURE const Vector Vector::dbca() const { return Mem::permute(data()); } \ template<> Vc_INTRINSIC Vc_PURE const Vector Vector::dcba() const { return Mem::permute(data()); } VC_SWIZZLES_16BIT_IMPL(short) VC_SWIZZLES_16BIT_IMPL(unsigned short) #undef VC_SWIZZLES_16BIT_IMPL // operators {{{1 #include "../common/operators.h" // isNegative {{{1 template<> Vc_INTRINSIC Vc_PURE float_m float_v::isNegative() const { return sse_cast<__m128>(_mm_srai_epi32(sse_cast<__m128i>(_mm_and_ps(_mm_setsignmask_ps(), d.v())), 31)); } template<> Vc_INTRINSIC Vc_PURE sfloat_m sfloat_v::isNegative() const { return M256::create( sse_cast<__m128>(_mm_srai_epi32(sse_cast<__m128i>(_mm_and_ps(_mm_setsignmask_ps(), d.v()[0])), 31)), sse_cast<__m128>(_mm_srai_epi32(sse_cast<__m128i>(_mm_and_ps(_mm_setsignmask_ps(), d.v()[1])), 31)) ); } template<> Vc_INTRINSIC Vc_PURE double_m double_v::isNegative() const { return Mem::permute(sse_cast<__m128>( _mm_srai_epi32(sse_cast<__m128i>(_mm_and_pd(_mm_setsignmask_pd(), d.v())), 31) )); } // gathers {{{1 template template Vc_ALWAYS_INLINE Vector::Vector(const EntryType *mem, const IndexT *indexes) { gather(mem, indexes); } template template Vc_ALWAYS_INLINE Vector::Vector(const EntryType *mem, VC_ALIGNED_PARAMETER(Vector) indexes) { gather(mem, indexes); } template template Vc_ALWAYS_INLINE Vector::Vector(const EntryType *mem, const IndexT *indexes, MaskArg mask) : d(HT::zero()) { gather(mem, indexes, mask); } template template Vc_ALWAYS_INLINE Vector::Vector(const EntryType *mem, VC_ALIGNED_PARAMETER(Vector) indexes, MaskArg mask) : d(HT::zero()) { gather(mem, indexes, mask); } template template Vc_ALWAYS_INLINE Vector::Vector(const S1 *array, const EntryType S1::* member1, VC_ALIGNED_PARAMETER(IT) indexes) { gather(array, member1, indexes); } template template Vc_ALWAYS_INLINE Vector::Vector(const S1 *array, 
const EntryType S1::* member1, VC_ALIGNED_PARAMETER(IT) indexes, MaskArg mask) : d(HT::zero()) { gather(array, member1, indexes, mask); } template template Vc_ALWAYS_INLINE Vector::Vector(const S1 *array, const S2 S1::* member1, const EntryType S2::* member2, VC_ALIGNED_PARAMETER(IT) indexes) { gather(array, member1, member2, indexes); } template template Vc_ALWAYS_INLINE Vector::Vector(const S1 *array, const S2 S1::* member1, const EntryType S2::* member2, VC_ALIGNED_PARAMETER(IT) indexes, MaskArg mask) : d(HT::zero()) { gather(array, member1, member2, indexes, mask); } template template Vc_ALWAYS_INLINE Vector::Vector(const S1 *array, const EntryType *const S1::* ptrMember1, VC_ALIGNED_PARAMETER(IT1) outerIndexes, VC_ALIGNED_PARAMETER(IT2) innerIndexes) { gather(array, ptrMember1, outerIndexes, innerIndexes); } template template Vc_ALWAYS_INLINE Vector::Vector(const S1 *array, const EntryType *const S1::* ptrMember1, VC_ALIGNED_PARAMETER(IT1) outerIndexes, VC_ALIGNED_PARAMETER(IT2) innerIndexes, MaskArg mask) : d(HT::zero()) { gather(array, ptrMember1, outerIndexes, innerIndexes, mask); } template struct IndexSizeChecker { static void check() {} }; template struct IndexSizeChecker, Size> { static void check() { VC_STATIC_ASSERT(Vector::Size >= Size, IndexVector_must_have_greater_or_equal_number_of_entries); } }; template<> template Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::gather(const EntryType *mem, VC_ALIGNED_PARAMETER(Index) indexes) { IndexSizeChecker::check(); d.v() = _mm_setr_pd(mem[indexes[0]], mem[indexes[1]]); } template<> template Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::gather(const EntryType *mem, VC_ALIGNED_PARAMETER(Index) indexes) { IndexSizeChecker::check(); d.v() = _mm_setr_ps(mem[indexes[0]], mem[indexes[1]], mem[indexes[2]], mem[indexes[3]]); } template<> template Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::gather(const EntryType *mem, VC_ALIGNED_PARAMETER(Index) indexes) { IndexSizeChecker::check(); d.v()[0] = _mm_setr_ps(mem[indexes[0]], mem[indexes[1]], mem[indexes[2]], mem[indexes[3]]); d.v()[1] = _mm_setr_ps(mem[indexes[4]], mem[indexes[5]], mem[indexes[6]], mem[indexes[7]]); } template<> template Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::gather(const EntryType *mem, VC_ALIGNED_PARAMETER(Index) indexes) { IndexSizeChecker::check(); d.v() = _mm_setr_epi32(mem[indexes[0]], mem[indexes[1]], mem[indexes[2]], mem[indexes[3]]); } template<> template Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::gather(const EntryType *mem, VC_ALIGNED_PARAMETER(Index) indexes) { IndexSizeChecker::check(); d.v() = _mm_setr_epi32(mem[indexes[0]], mem[indexes[1]], mem[indexes[2]], mem[indexes[3]]); } template<> template Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::gather(const EntryType *mem, VC_ALIGNED_PARAMETER(Index) indexes) { IndexSizeChecker::check(); d.v() = _mm_setr_epi16(mem[indexes[0]], mem[indexes[1]], mem[indexes[2]], mem[indexes[3]], mem[indexes[4]], mem[indexes[5]], mem[indexes[6]], mem[indexes[7]]); } template<> template Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::gather(const EntryType *mem, VC_ALIGNED_PARAMETER(Index) indexes) { IndexSizeChecker::check(); d.v() = _mm_setr_epi16(mem[indexes[0]], mem[indexes[1]], mem[indexes[2]], mem[indexes[3]], mem[indexes[4]], mem[indexes[5]], mem[indexes[6]], mem[indexes[7]]); } #ifdef VC_USE_SET_GATHERS template template Vc_ALWAYS_INLINE void Vector::gather(const EntryType *mem, VC_ALIGNED_PARAMETER(Vector) indexes, MaskArg mask) { IndexSizeChecker, Size>::check(); Vector indexesTmp = indexes; indexesTmp.setZero(!static_cast::Mask>(mask)); (*this)(mask) 
= Vector(mem, indexesTmp); } #endif #ifdef VC_USE_BSF_GATHERS #define VC_MASKED_GATHER \ int bits = mask.toInt(); \ while (bits) { \ const int i = _bit_scan_forward(bits); \ bits &= ~(1 << i); /* btr? */ \ d.m(i) = ith_value(i); \ } #elif defined(VC_USE_POPCNT_BSF_GATHERS) #define VC_MASKED_GATHER \ unsigned int bits = mask.toInt(); \ unsigned int low, high = 0; \ switch (mask.count()) { \ case 8: \ high = _bit_scan_reverse(bits); \ d.m(high) = ith_value(high); \ high = (1 << high); \ case 7: \ low = _bit_scan_forward(bits); \ bits ^= high | (1 << low); \ d.m(low) = ith_value(low); \ case 6: \ high = _bit_scan_reverse(bits); \ d.m(high) = ith_value(high); \ high = (1 << high); \ case 5: \ low = _bit_scan_forward(bits); \ bits ^= high | (1 << low); \ d.m(low) = ith_value(low); \ case 4: \ high = _bit_scan_reverse(bits); \ d.m(high) = ith_value(high); \ high = (1 << high); \ case 3: \ low = _bit_scan_forward(bits); \ bits ^= high | (1 << low); \ d.m(low) = ith_value(low); \ case 2: \ high = _bit_scan_reverse(bits); \ d.m(high) = ith_value(high); \ case 1: \ low = _bit_scan_forward(bits); \ d.m(low) = ith_value(low); \ case 0: \ break; \ } #else #define VC_MASKED_GATHER \ if (mask.isEmpty()) { \ return; \ } \ for_all_vector_entries(i, \ if (mask[i]) d.m(i) = ith_value(i); \ ); #endif template template Vc_INTRINSIC void Vector::gather(const EntryType *mem, VC_ALIGNED_PARAMETER(Index) indexes, MaskArg mask) { IndexSizeChecker::check(); #define ith_value(_i_) (mem[indexes[_i_]]) VC_MASKED_GATHER #undef ith_value } template<> template Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::gather(const S1 *array, const EntryType S1::* member1, VC_ALIGNED_PARAMETER(IT) indexes) { IndexSizeChecker::check(); d.v() = _mm_setr_pd(array[indexes[0]].*(member1), array[indexes[1]].*(member1)); } template<> template Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::gather(const S1 *array, const EntryType S1::* member1, VC_ALIGNED_PARAMETER(IT) indexes) { IndexSizeChecker::check(); d.v() = _mm_setr_ps(array[indexes[0]].*(member1), array[indexes[1]].*(member1), array[indexes[2]].*(member1), array[indexes[3]].*(member1)); } template<> template Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::gather(const S1 *array, const EntryType S1::* member1, VC_ALIGNED_PARAMETER(IT) indexes) { IndexSizeChecker::check(); d.v()[0] = _mm_setr_ps(array[indexes[0]].*(member1), array[indexes[1]].*(member1), array[indexes[2]].*(member1), array[indexes[3]].*(member1)); d.v()[1] = _mm_setr_ps(array[indexes[4]].*(member1), array[indexes[5]].*(member1), array[indexes[6]].*(member1), array[indexes[7]].*(member1)); } template<> template Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::gather(const S1 *array, const EntryType S1::* member1, VC_ALIGNED_PARAMETER(IT) indexes) { IndexSizeChecker::check(); d.v() = _mm_setr_epi32(array[indexes[0]].*(member1), array[indexes[1]].*(member1), array[indexes[2]].*(member1), array[indexes[3]].*(member1)); } template<> template Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::gather(const S1 *array, const EntryType S1::* member1, VC_ALIGNED_PARAMETER(IT) indexes) { IndexSizeChecker::check(); d.v() = _mm_setr_epi32(array[indexes[0]].*(member1), array[indexes[1]].*(member1), array[indexes[2]].*(member1), array[indexes[3]].*(member1)); } template<> template Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::gather(const S1 *array, const EntryType S1::* member1, VC_ALIGNED_PARAMETER(IT) indexes) { IndexSizeChecker::check(); d.v() = _mm_setr_epi16(array[indexes[0]].*(member1), array[indexes[1]].*(member1), array[indexes[2]].*(member1), 
array[indexes[3]].*(member1), array[indexes[4]].*(member1), array[indexes[5]].*(member1), array[indexes[6]].*(member1), array[indexes[7]].*(member1)); } template<> template Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::gather(const S1 *array, const EntryType S1::* member1, VC_ALIGNED_PARAMETER(IT) indexes) { IndexSizeChecker::check(); d.v() = _mm_setr_epi16(array[indexes[0]].*(member1), array[indexes[1]].*(member1), array[indexes[2]].*(member1), array[indexes[3]].*(member1), array[indexes[4]].*(member1), array[indexes[5]].*(member1), array[indexes[6]].*(member1), array[indexes[7]].*(member1)); } template template Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::gather(const S1 *array, const EntryType S1::* member1, VC_ALIGNED_PARAMETER(IT) indexes, MaskArg mask) { IndexSizeChecker::check(); #define ith_value(_i_) (array[indexes[_i_]].*(member1)) VC_MASKED_GATHER #undef ith_value } template<> template Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::gather(const S1 *array, const S2 S1::* member1, const EntryType S2::* member2, VC_ALIGNED_PARAMETER(IT) indexes) { IndexSizeChecker::check(); d.v() = _mm_setr_pd(array[indexes[0]].*(member1).*(member2), array[indexes[1]].*(member1).*(member2)); } template<> template Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::gather(const S1 *array, const S2 S1::* member1, const EntryType S2::* member2, VC_ALIGNED_PARAMETER(IT) indexes) { IndexSizeChecker::check(); d.v() = _mm_setr_ps(array[indexes[0]].*(member1).*(member2), array[indexes[1]].*(member1).*(member2), array[indexes[2]].*(member1).*(member2), array[indexes[3]].*(member1).*(member2)); } template<> template Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::gather(const S1 *array, const S2 S1::* member1, const EntryType S2::* member2, VC_ALIGNED_PARAMETER(IT) indexes) { IndexSizeChecker::check(); d.v()[0] = _mm_setr_ps(array[indexes[0]].*(member1).*(member2), array[indexes[1]].*(member1).*(member2), array[indexes[2]].*(member1).*(member2), array[indexes[3]].*(member1).*(member2)); d.v()[1] = _mm_setr_ps(array[indexes[4]].*(member1).*(member2), array[indexes[5]].*(member1).*(member2), array[indexes[6]].*(member1).*(member2), array[indexes[7]].*(member1).*(member2)); } template<> template Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::gather(const S1 *array, const S2 S1::* member1, const EntryType S2::* member2, VC_ALIGNED_PARAMETER(IT) indexes) { IndexSizeChecker::check(); d.v() = _mm_setr_epi32(array[indexes[0]].*(member1).*(member2), array[indexes[1]].*(member1).*(member2), array[indexes[2]].*(member1).*(member2), array[indexes[3]].*(member1).*(member2)); } template<> template Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::gather(const S1 *array, const S2 S1::* member1, const EntryType S2::* member2, VC_ALIGNED_PARAMETER(IT) indexes) { IndexSizeChecker::check(); d.v() = _mm_setr_epi32(array[indexes[0]].*(member1).*(member2), array[indexes[1]].*(member1).*(member2), array[indexes[2]].*(member1).*(member2), array[indexes[3]].*(member1).*(member2)); } template<> template Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::gather(const S1 *array, const S2 S1::* member1, const EntryType S2::* member2, VC_ALIGNED_PARAMETER(IT) indexes) { IndexSizeChecker::check(); d.v() = _mm_setr_epi16(array[indexes[0]].*(member1).*(member2), array[indexes[1]].*(member1).*(member2), array[indexes[2]].*(member1).*(member2), array[indexes[3]].*(member1).*(member2), array[indexes[4]].*(member1).*(member2), array[indexes[5]].*(member1).*(member2), array[indexes[6]].*(member1).*(member2), array[indexes[7]].*(member1).*(member2)); } template<> template Vc_ALWAYS_INLINE void 
Vc_FLATTEN Vector::gather(const S1 *array, const S2 S1::* member1, const EntryType S2::* member2, VC_ALIGNED_PARAMETER(IT) indexes) { IndexSizeChecker::check(); d.v() = _mm_setr_epi16(array[indexes[0]].*(member1).*(member2), array[indexes[1]].*(member1).*(member2), array[indexes[2]].*(member1).*(member2), array[indexes[3]].*(member1).*(member2), array[indexes[4]].*(member1).*(member2), array[indexes[5]].*(member1).*(member2), array[indexes[6]].*(member1).*(member2), array[indexes[7]].*(member1).*(member2)); } template template Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::gather(const S1 *array, const S2 S1::* member1, const EntryType S2::* member2, VC_ALIGNED_PARAMETER(IT) indexes, MaskArg mask) { IndexSizeChecker::check(); #define ith_value(_i_) (array[indexes[_i_]].*(member1).*(member2)) VC_MASKED_GATHER #undef ith_value } template<> template Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::gather(const S1 *array, const EntryType *const S1::* ptrMember1, VC_ALIGNED_PARAMETER(IT1) outerIndexes, VC_ALIGNED_PARAMETER(IT2) innerIndexes) { IndexSizeChecker::check(); IndexSizeChecker::check(); d.v() = _mm_setr_pd((array[outerIndexes[0]].*(ptrMember1))[innerIndexes[0]], (array[outerIndexes[1]].*(ptrMember1))[innerIndexes[1]]); } template<> template Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::gather(const S1 *array, const EntryType *const S1::* ptrMember1, VC_ALIGNED_PARAMETER(IT1) outerIndexes, VC_ALIGNED_PARAMETER(IT2) innerIndexes) { IndexSizeChecker::check(); IndexSizeChecker::check(); d.v() = _mm_setr_ps((array[outerIndexes[0]].*(ptrMember1))[innerIndexes[0]], (array[outerIndexes[1]].*(ptrMember1))[innerIndexes[1]], (array[outerIndexes[2]].*(ptrMember1))[innerIndexes[2]], (array[outerIndexes[3]].*(ptrMember1))[innerIndexes[3]]); } template<> template Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::gather(const S1 *array, const EntryType *const S1::* ptrMember1, VC_ALIGNED_PARAMETER(IT1) outerIndexes, VC_ALIGNED_PARAMETER(IT2) innerIndexes) { IndexSizeChecker::check(); IndexSizeChecker::check(); d.v()[0] = _mm_setr_ps((array[outerIndexes[0]].*(ptrMember1))[innerIndexes[0]], (array[outerIndexes[1]].*(ptrMember1))[innerIndexes[1]], (array[outerIndexes[2]].*(ptrMember1))[innerIndexes[2]], (array[outerIndexes[3]].*(ptrMember1))[innerIndexes[3]]); d.v()[1] = _mm_setr_ps((array[outerIndexes[4]].*(ptrMember1))[innerIndexes[4]], (array[outerIndexes[5]].*(ptrMember1))[innerIndexes[5]], (array[outerIndexes[6]].*(ptrMember1))[innerIndexes[6]], (array[outerIndexes[7]].*(ptrMember1))[innerIndexes[7]]); } template<> template Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::gather(const S1 *array, const EntryType *const S1::* ptrMember1, VC_ALIGNED_PARAMETER(IT1) outerIndexes, VC_ALIGNED_PARAMETER(IT2) innerIndexes) { IndexSizeChecker::check(); IndexSizeChecker::check(); d.v() = _mm_setr_epi32((array[outerIndexes[0]].*(ptrMember1))[innerIndexes[0]], (array[outerIndexes[1]].*(ptrMember1))[innerIndexes[1]], (array[outerIndexes[2]].*(ptrMember1))[innerIndexes[2]], (array[outerIndexes[3]].*(ptrMember1))[innerIndexes[3]]); } template<> template Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::gather(const S1 *array, const EntryType *const S1::* ptrMember1, VC_ALIGNED_PARAMETER(IT1) outerIndexes, VC_ALIGNED_PARAMETER(IT2) innerIndexes) { IndexSizeChecker::check(); IndexSizeChecker::check(); d.v() = _mm_setr_epi32((array[outerIndexes[0]].*(ptrMember1))[innerIndexes[0]], (array[outerIndexes[1]].*(ptrMember1))[innerIndexes[1]], (array[outerIndexes[2]].*(ptrMember1))[innerIndexes[2]], (array[outerIndexes[3]].*(ptrMember1))[innerIndexes[3]]); } 
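// Example (sketch): how the struct-member gather overloads in this section (and the
// matching scatter overloads further down) are meant to be used. This assumes the
// public Vc API (float_v, uint_v); the struct and variable names are illustrative only.
//
//     struct Particle { float x, y, z; };
//     Particle particles[1024];
//     ROOT::Vc::uint_v idx = ROOT::Vc::uint_v::IndexesFromZero();
//     ROOT::Vc::float_v px(particles, &Particle::x, idx);  // px[i] = particles[idx[i]].x
//     px += 1.f;
//     px.scatter(particles, &Particle::x, idx);            // particles[idx[i]].x = px[i]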
template<> template Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::gather(const S1 *array, const EntryType *const S1::* ptrMember1, VC_ALIGNED_PARAMETER(IT1) outerIndexes, VC_ALIGNED_PARAMETER(IT2) innerIndexes) { IndexSizeChecker::check(); IndexSizeChecker::check(); d.v() = _mm_setr_epi16((array[outerIndexes[0]].*(ptrMember1))[innerIndexes[0]], (array[outerIndexes[1]].*(ptrMember1))[innerIndexes[1]], (array[outerIndexes[2]].*(ptrMember1))[innerIndexes[2]], (array[outerIndexes[3]].*(ptrMember1))[innerIndexes[3]], (array[outerIndexes[4]].*(ptrMember1))[innerIndexes[4]], (array[outerIndexes[5]].*(ptrMember1))[innerIndexes[5]], (array[outerIndexes[6]].*(ptrMember1))[innerIndexes[6]], (array[outerIndexes[7]].*(ptrMember1))[innerIndexes[7]]); } template<> template Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::gather(const S1 *array, const EntryType *const S1::* ptrMember1, VC_ALIGNED_PARAMETER(IT1) outerIndexes, VC_ALIGNED_PARAMETER(IT2) innerIndexes) { IndexSizeChecker::check(); IndexSizeChecker::check(); d.v() = _mm_setr_epi16((array[outerIndexes[0]].*(ptrMember1))[innerIndexes[0]], (array[outerIndexes[1]].*(ptrMember1))[innerIndexes[1]], (array[outerIndexes[2]].*(ptrMember1))[innerIndexes[2]], (array[outerIndexes[3]].*(ptrMember1))[innerIndexes[3]], (array[outerIndexes[4]].*(ptrMember1))[innerIndexes[4]], (array[outerIndexes[5]].*(ptrMember1))[innerIndexes[5]], (array[outerIndexes[6]].*(ptrMember1))[innerIndexes[6]], (array[outerIndexes[7]].*(ptrMember1))[innerIndexes[7]]); } template template Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::gather(const S1 *array, const EntryType *const S1::* ptrMember1, VC_ALIGNED_PARAMETER(IT1) outerIndexes, VC_ALIGNED_PARAMETER(IT2) innerIndexes, MaskArg mask) { IndexSizeChecker::check(); IndexSizeChecker::check(); #define ith_value(_i_) (array[outerIndexes[_i_]].*(ptrMember1))[innerIndexes[_i_]] VC_MASKED_GATHER #undef ith_value } // scatters {{{1 #undef VC_MASKED_GATHER #ifdef VC_USE_BSF_SCATTERS #define VC_MASKED_SCATTER \ int bits = mask.toInt(); \ while (bits) { \ const int i = _bit_scan_forward(bits); \ bits ^= (1 << i); /* btr? 
*/ \ ith_value(i) = d.m(i); \ } #elif defined(VC_USE_POPCNT_BSF_SCATTERS) #define VC_MASKED_SCATTER \ unsigned int bits = mask.toInt(); \ unsigned int low, high = 0; \ switch (mask.count()) { \ case 8: \ high = _bit_scan_reverse(bits); \ ith_value(high) = d.m(high); \ high = (1 << high); \ case 7: \ low = _bit_scan_forward(bits); \ bits ^= high | (1 << low); \ ith_value(low) = d.m(low); \ case 6: \ high = _bit_scan_reverse(bits); \ ith_value(high) = d.m(high); \ high = (1 << high); \ case 5: \ low = _bit_scan_forward(bits); \ bits ^= high | (1 << low); \ ith_value(low) = d.m(low); \ case 4: \ high = _bit_scan_reverse(bits); \ ith_value(high) = d.m(high); \ high = (1 << high); \ case 3: \ low = _bit_scan_forward(bits); \ bits ^= high | (1 << low); \ ith_value(low) = d.m(low); \ case 2: \ high = _bit_scan_reverse(bits); \ ith_value(high) = d.m(high); \ case 1: \ low = _bit_scan_forward(bits); \ ith_value(low) = d.m(low); \ case 0: \ break; \ } #else #define VC_MASKED_SCATTER \ if (mask.isEmpty()) { \ return; \ } \ for_all_vector_entries(i, \ if (mask[i]) ith_value(i) = d.m(i); \ ); #endif template template Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::scatter(EntryType *mem, VC_ALIGNED_PARAMETER(Index) indexes) const { for_all_vector_entries(i, mem[indexes[i]] = d.m(i); ); } template template Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::scatter(EntryType *mem, VC_ALIGNED_PARAMETER(Index) indexes, MaskArg mask) const { #define ith_value(_i_) mem[indexes[_i_]] VC_MASKED_SCATTER #undef ith_value } template template Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::scatter(S1 *array, EntryType S1::* member1, VC_ALIGNED_PARAMETER(IT) indexes) const { for_all_vector_entries(i, array[indexes[i]].*(member1) = d.m(i); ); } template template Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::scatter(S1 *array, EntryType S1::* member1, VC_ALIGNED_PARAMETER(IT) indexes, MaskArg mask) const { #define ith_value(_i_) array[indexes[_i_]].*(member1) VC_MASKED_SCATTER #undef ith_value } template template Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::scatter(S1 *array, S2 S1::* member1, EntryType S2::* member2, VC_ALIGNED_PARAMETER(IT) indexes) const { for_all_vector_entries(i, array[indexes[i]].*(member1).*(member2) = d.m(i); ); } template template Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::scatter(S1 *array, S2 S1::* member1, EntryType S2::* member2, VC_ALIGNED_PARAMETER(IT) indexes, MaskArg mask) const { #define ith_value(_i_) array[indexes[_i_]].*(member1).*(member2) VC_MASKED_SCATTER #undef ith_value } template template Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::scatter(S1 *array, EntryType *S1::* ptrMember1, VC_ALIGNED_PARAMETER(IT1) outerIndexes, VC_ALIGNED_PARAMETER(IT2) innerIndexes) const { for_all_vector_entries(i, (array[innerIndexes[i]].*(ptrMember1))[outerIndexes[i]] = d.m(i); ); } template template Vc_ALWAYS_INLINE void Vc_FLATTEN Vector::scatter(S1 *array, EntryType *S1::* ptrMember1, VC_ALIGNED_PARAMETER(IT1) outerIndexes, VC_ALIGNED_PARAMETER(IT2) innerIndexes, MaskArg mask) const { #define ith_value(_i_) (array[outerIndexes[_i_]].*(ptrMember1))[innerIndexes[_i_]] VC_MASKED_SCATTER #undef ith_value } /////////////////////////////////////////////////////////////////////////////////////////// // operator[] {{{1 template Vc_INTRINSIC typename Vector::EntryType Vc_PURE Vector::operator[](size_t index) const { return d.m(index); } #ifdef VC_GCC template<> Vc_INTRINSIC double Vc_PURE Vector::operator[](size_t index) const { if (__builtin_constant_p(index)) { return extract_double_imm(d.v(), index); } return d.m(index); } template<> 
Vc_INTRINSIC float Vc_PURE Vector::operator[](size_t index) const { return extract_float(d.v(), index); } template<> Vc_INTRINSIC float Vc_PURE Vector::operator[](size_t index) const { if (__builtin_constant_p(index)) { if (index < 4) { return extract_float_imm(d.v()[0], index); } return extract_float_imm(d.v()[1], index - 4); } return d.m(index); } template<> Vc_INTRINSIC int Vc_PURE Vector::operator[](size_t index) const { if (__builtin_constant_p(index)) { #if VC_GCC >= 0x40601 || !defined(VC_USE_VEX_CODING) // GCC < 4.6.1 incorrectly uses vmovq instead of movq for the following #ifdef __x86_64__ if (index == 0) return _mm_cvtsi128_si64(d.v()) & 0xFFFFFFFFull; if (index == 1) return _mm_cvtsi128_si64(d.v()) >> 32; #else if (index == 0) return _mm_cvtsi128_si32(d.v()); #endif #endif #ifdef VC_IMPL_SSE4_1 return _mm_extract_epi32(d.v(), index); #else return _mm_cvtsi128_si32(_mm_srli_si128(d.v(), index * 4)); #endif } return d.m(index); } template<> Vc_INTRINSIC unsigned int Vc_PURE Vector::operator[](size_t index) const { if (__builtin_constant_p(index)) { #if VC_GCC >= 0x40601 || !defined(VC_USE_VEX_CODING) // GCC < 4.6.1 incorrectly uses vmovq instead of movq for the following #ifdef __x86_64__ if (index == 0) return _mm_cvtsi128_si64(d.v()) & 0xFFFFFFFFull; if (index == 1) return _mm_cvtsi128_si64(d.v()) >> 32; #else if (index == 0) return _mm_cvtsi128_si32(d.v()); #endif #endif #ifdef VC_IMPL_SSE4_1 return _mm_extract_epi32(d.v(), index); #else return _mm_cvtsi128_si32(_mm_srli_si128(d.v(), index * 4)); #endif } return d.m(index); } template<> Vc_INTRINSIC short Vc_PURE Vector::operator[](size_t index) const { if (__builtin_constant_p(index)) { return _mm_extract_epi16(d.v(), index); } return d.m(index); } template<> Vc_INTRINSIC unsigned short Vc_PURE Vector::operator[](size_t index) const { if (__builtin_constant_p(index)) { return _mm_extract_epi16(d.v(), index); } return d.m(index); } #endif // GCC /////////////////////////////////////////////////////////////////////////////////////////// // horizontal ops {{{1 #ifndef VC_IMPL_SSE4_1 // without SSE4.1 integer multiplication is slow and we rather multiply the scalars template<> Vc_INTRINSIC Vc_PURE int Vector::product() const { return (d.m(0) * d.m(1)) * (d.m(2) * d.m(3)); } template<> Vc_INTRINSIC Vc_PURE unsigned int Vector::product() const { return (d.m(0) * d.m(1)) * (d.m(2) * d.m(3)); } #endif template Vc_ALWAYS_INLINE Vc_PURE typename Vector::EntryType Vector::min(MaskArg m) const { Vector tmp = std::numeric_limits >::max(); tmp(m) = *this; return tmp.min(); } template Vc_ALWAYS_INLINE Vc_PURE typename Vector::EntryType Vector::max(MaskArg m) const { Vector tmp = std::numeric_limits >::min(); tmp(m) = *this; return tmp.max(); } template Vc_ALWAYS_INLINE Vc_PURE typename Vector::EntryType Vector::product(MaskArg m) const { Vector tmp(VectorSpecialInitializerOne::One); tmp(m) = *this; return tmp.product(); } template Vc_ALWAYS_INLINE Vc_PURE typename Vector::EntryType Vector::sum(MaskArg m) const { Vector tmp(VectorSpecialInitializerZero::Zero); tmp(m) = *this; return tmp.sum(); } /////////////////////////////////////////////////////////////////////////////////////////// // copySign {{{1 template<> Vc_INTRINSIC Vc_PURE Vector Vector::copySign(Vector::AsArg reference) const { return _mm_or_ps( _mm_and_ps(reference.d.v(), _mm_setsignmask_ps()), _mm_and_ps(d.v(), _mm_setabsmask_ps()) ); } template<> Vc_INTRINSIC Vc_PURE Vector Vector::copySign(Vector::AsArg reference) const { return M256::create( _mm_or_ps( 
_mm_and_ps(reference.d.v()[0], _mm_setsignmask_ps()), _mm_and_ps(d.v()[0], _mm_setabsmask_ps()) ), _mm_or_ps( _mm_and_ps(reference.d.v()[1], _mm_setsignmask_ps()), _mm_and_ps(d.v()[1], _mm_setabsmask_ps()) ) ); } template<> Vc_INTRINSIC Vc_PURE Vector Vector::copySign(Vector::AsArg reference) const { return _mm_or_pd( _mm_and_pd(reference.d.v(), _mm_setsignmask_pd()), _mm_and_pd(d.v(), _mm_setabsmask_pd()) ); }//}}}1 // exponent {{{1 template<> Vc_INTRINSIC Vc_PURE Vector Vector::exponent() const { VC_ASSERT((*this >= 0.f).isFull()); return Internal::exponent(d.v()); } template<> Vc_INTRINSIC Vc_PURE Vector Vector::exponent() const { VC_ASSERT((*this >= 0.f).isFull()); return Internal::exponent(d.v()); } template<> Vc_INTRINSIC Vc_PURE Vector Vector::exponent() const { VC_ASSERT((*this >= 0.).isFull()); return Internal::exponent(d.v()); } // }}}1 // Random {{{1 static void _doRandomStep(Vector &state0, Vector &state1) { state0.load(&Vc::RandomState[0]); state1.load(&Vc::RandomState[uint_v::Size]); (state1 * 0xdeece66du + 11).store(&Vc::RandomState[uint_v::Size]); uint_v(_mm_xor_si128((state0 * 0xdeece66du + 11).data(), _mm_srli_epi32(state1.data(), 16))).store(&Vc::RandomState[0]); } template Vc_ALWAYS_INLINE Vector Vector::Random() { Vector state0, state1; _doRandomStep(state0, state1); return state0.reinterpretCast >(); } template<> Vc_ALWAYS_INLINE Vector Vector::Random() { Vector state0, state1; _doRandomStep(state0, state1); return _mm_sub_ps(_mm_or_ps(_mm_castsi128_ps(_mm_srli_epi32(state0.data(), 2)), HT::one()), HT::one()); } template<> Vc_ALWAYS_INLINE Vector Vector::Random() { Vector state0, state1; _doRandomStep(state0, state1); state1 ^= state0 >> 16; return M256::create( _mm_sub_ps(_mm_or_ps(_mm_castsi128_ps(_mm_srli_epi32(state0.data(), 2)), VectorHelper::one()), VectorHelper::one()), _mm_sub_ps(_mm_or_ps(_mm_castsi128_ps(_mm_srli_epi32(state1.data(), 2)), VectorHelper::one()), VectorHelper::one()) ); } template<> Vc_ALWAYS_INLINE Vector Vector::Random() { typedef unsigned long long uint64 Vc_MAY_ALIAS; uint64 state0 = *reinterpret_cast(&Vc::RandomState[8]); uint64 state1 = *reinterpret_cast(&Vc::RandomState[10]); const __m128i state = _mm_load_si128(reinterpret_cast(&Vc::RandomState[8])); *reinterpret_cast(&Vc::RandomState[ 8]) = (state0 * 0x5deece66dull + 11); *reinterpret_cast(&Vc::RandomState[10]) = (state1 * 0x5deece66dull + 11); return (Vector(_mm_castsi128_pd(_mm_srli_epi64(state, 12))) | One()) - One(); } // shifted / rotated {{{1 template Vc_INTRINSIC Vc_PURE Vector Vector::shifted(int amount) const { switch (amount) { case 0: return *this; case 1: return mm128_reinterpret_cast(_mm_srli_si128(mm128_reinterpret_cast<__m128i>(d.v()), 1 * sizeof(EntryType))); case 2: return mm128_reinterpret_cast(_mm_srli_si128(mm128_reinterpret_cast<__m128i>(d.v()), 2 * sizeof(EntryType))); case 3: return mm128_reinterpret_cast(_mm_srli_si128(mm128_reinterpret_cast<__m128i>(d.v()), 3 * sizeof(EntryType))); case 4: return mm128_reinterpret_cast(_mm_srli_si128(mm128_reinterpret_cast<__m128i>(d.v()), 4 * sizeof(EntryType))); case 5: return mm128_reinterpret_cast(_mm_srli_si128(mm128_reinterpret_cast<__m128i>(d.v()), 5 * sizeof(EntryType))); case 6: return mm128_reinterpret_cast(_mm_srli_si128(mm128_reinterpret_cast<__m128i>(d.v()), 6 * sizeof(EntryType))); case 7: return mm128_reinterpret_cast(_mm_srli_si128(mm128_reinterpret_cast<__m128i>(d.v()), 7 * sizeof(EntryType))); case 8: return mm128_reinterpret_cast(_mm_srli_si128(mm128_reinterpret_cast<__m128i>(d.v()), 8 * sizeof(EntryType))); 
case -1: return mm128_reinterpret_cast(_mm_slli_si128(mm128_reinterpret_cast<__m128i>(d.v()), 1 * sizeof(EntryType))); case -2: return mm128_reinterpret_cast(_mm_slli_si128(mm128_reinterpret_cast<__m128i>(d.v()), 2 * sizeof(EntryType))); case -3: return mm128_reinterpret_cast(_mm_slli_si128(mm128_reinterpret_cast<__m128i>(d.v()), 3 * sizeof(EntryType))); case -4: return mm128_reinterpret_cast(_mm_slli_si128(mm128_reinterpret_cast<__m128i>(d.v()), 4 * sizeof(EntryType))); case -5: return mm128_reinterpret_cast(_mm_slli_si128(mm128_reinterpret_cast<__m128i>(d.v()), 5 * sizeof(EntryType))); case -6: return mm128_reinterpret_cast(_mm_slli_si128(mm128_reinterpret_cast<__m128i>(d.v()), 6 * sizeof(EntryType))); case -7: return mm128_reinterpret_cast(_mm_slli_si128(mm128_reinterpret_cast<__m128i>(d.v()), 7 * sizeof(EntryType))); case -8: return mm128_reinterpret_cast(_mm_slli_si128(mm128_reinterpret_cast<__m128i>(d.v()), 8 * sizeof(EntryType))); } return Zero(); } template<> Vc_INTRINSIC Vc_PURE sfloat_v sfloat_v::shifted(int amount) const { switch (amount) { case -7: return M256::create(_mm_setzero_ps(), _mm_castsi128_ps(_mm_slli_si128(_mm_castps_si128(d.v()[0]), 3 * sizeof(EntryType)))); case -6: return M256::create(_mm_setzero_ps(), _mm_castsi128_ps(_mm_slli_si128(_mm_castps_si128(d.v()[0]), 2 * sizeof(EntryType)))); case -5: return M256::create(_mm_setzero_ps(), _mm_castsi128_ps(_mm_slli_si128(_mm_castps_si128(d.v()[0]), 1 * sizeof(EntryType)))); case -4: return M256::create(_mm_setzero_ps(), d.v()[0]); case -3: return M256::create(_mm_castsi128_ps(_mm_slli_si128(_mm_castps_si128(d.v()[0]), 3 * sizeof(EntryType))), _mm_castsi128_ps(_mm_alignr_epi8(_mm_castps_si128(d.v()[1]), _mm_castps_si128(d.v()[0]), 1 * sizeof(EntryType)))); case -2: return M256::create(_mm_castsi128_ps(_mm_slli_si128(_mm_castps_si128(d.v()[0]), 2 * sizeof(EntryType))), _mm_castsi128_ps(_mm_alignr_epi8(_mm_castps_si128(d.v()[1]), _mm_castps_si128(d.v()[0]), 2 * sizeof(EntryType)))); case -1: return M256::create(_mm_castsi128_ps(_mm_slli_si128(_mm_castps_si128(d.v()[0]), 1 * sizeof(EntryType))), _mm_castsi128_ps(_mm_alignr_epi8(_mm_castps_si128(d.v()[1]), _mm_castps_si128(d.v()[0]), 3 * sizeof(EntryType)))); case 0: return *this; case 1: return M256::create(_mm_castsi128_ps(_mm_alignr_epi8(_mm_castps_si128(d.v()[1]), _mm_castps_si128(d.v()[0]), 1 * sizeof(EntryType))), _mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(d.v()[1]), 1 * sizeof(EntryType)))); case 2: return M256::create(_mm_castsi128_ps(_mm_alignr_epi8(_mm_castps_si128(d.v()[1]), _mm_castps_si128(d.v()[0]), 2 * sizeof(EntryType))), _mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(d.v()[1]), 2 * sizeof(EntryType)))); case 3: return M256::create(_mm_castsi128_ps(_mm_alignr_epi8(_mm_castps_si128(d.v()[1]), _mm_castps_si128(d.v()[0]), 3 * sizeof(EntryType))), _mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(d.v()[1]), 3 * sizeof(EntryType)))); case 4: return M256::create(d.v()[1], _mm_setzero_ps()); case 5: return M256::create(_mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(d.v()[1]), 1 * sizeof(EntryType))), _mm_setzero_ps()); case 6: return M256::create(_mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(d.v()[1]), 2 * sizeof(EntryType))), _mm_setzero_ps()); case 7: return M256::create(_mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(d.v()[1]), 3 * sizeof(EntryType))), _mm_setzero_ps()); } return Zero(); } template Vc_INTRINSIC Vc_PURE Vector Vector::rotated(int amount) const { const __m128i v = mm128_reinterpret_cast<__m128i>(d.v()); switch (static_cast(amount) % 
Size) { case 0: return *this; case 1: return mm128_reinterpret_cast(_mm_alignr_epi8(v, v, 1 * sizeof(EntryType))); case 2: return mm128_reinterpret_cast(_mm_alignr_epi8(v, v, 2 * sizeof(EntryType))); case 3: return mm128_reinterpret_cast(_mm_alignr_epi8(v, v, 3 * sizeof(EntryType))); // warning "Immediate parameter to intrinsic call too large" disabled in VcMacros.cmake. // ICC fails to see that the modulo operation (Size == sizeof(VectorType) / sizeof(EntryType)) // disables the following four calls unless sizeof(EntryType) == 2. case 4: return mm128_reinterpret_cast(_mm_alignr_epi8(v, v, 4 * sizeof(EntryType))); case 5: return mm128_reinterpret_cast(_mm_alignr_epi8(v, v, 5 * sizeof(EntryType))); case 6: return mm128_reinterpret_cast(_mm_alignr_epi8(v, v, 6 * sizeof(EntryType))); case 7: return mm128_reinterpret_cast(_mm_alignr_epi8(v, v, 7 * sizeof(EntryType))); } return Zero(); } template<> Vc_INTRINSIC Vc_PURE sfloat_v sfloat_v::rotated(int amount) const { const __m128i v0 = sse_cast<__m128i>(d.v()[0]); const __m128i v1 = sse_cast<__m128i>(d.v()[1]); switch (static_cast(amount) % Size) { case 0: return *this; case 1: return M256::create(sse_cast<__m128>(_mm_alignr_epi8(v1, v0, 1 * sizeof(EntryType))), sse_cast<__m128>(_mm_alignr_epi8(v0, v1, 1 * sizeof(EntryType)))); case 2: return M256::create(sse_cast<__m128>(_mm_alignr_epi8(v1, v0, 2 * sizeof(EntryType))), sse_cast<__m128>(_mm_alignr_epi8(v0, v1, 2 * sizeof(EntryType)))); case 3: return M256::create(sse_cast<__m128>(_mm_alignr_epi8(v1, v0, 3 * sizeof(EntryType))), sse_cast<__m128>(_mm_alignr_epi8(v0, v1, 3 * sizeof(EntryType)))); case 4: return M256::create(d.v()[1], d.v()[0]); case 5: return M256::create(sse_cast<__m128>(_mm_alignr_epi8(v0, v1, 1 * sizeof(EntryType))), sse_cast<__m128>(_mm_alignr_epi8(v1, v0, 1 * sizeof(EntryType)))); case 6: return M256::create(sse_cast<__m128>(_mm_alignr_epi8(v0, v1, 2 * sizeof(EntryType))), sse_cast<__m128>(_mm_alignr_epi8(v1, v0, 2 * sizeof(EntryType)))); case 7: return M256::create(sse_cast<__m128>(_mm_alignr_epi8(v0, v1, 3 * sizeof(EntryType))), sse_cast<__m128>(_mm_alignr_epi8(v1, v0, 3 * sizeof(EntryType)))); } return Zero(); } // }}}1 // sorted specializations {{{1 template<> inline Vc_PURE uint_v uint_v::sorted() const { __m128i x = data(); __m128i y = _mm_shuffle_epi32(x, _MM_SHUFFLE(2, 3, 0, 1)); __m128i l = _mm_min_epu32(x, y); __m128i h = _mm_max_epu32(x, y); x = _mm_unpacklo_epi32(l, h); y = _mm_unpackhi_epi32(h, l); // sort quads l = _mm_min_epu32(x, y); h = _mm_max_epu32(x, y); x = _mm_unpacklo_epi32(l, h); y = _mm_unpackhi_epi64(x, x); l = _mm_min_epu32(x, y); h = _mm_max_epu32(x, y); return _mm_unpacklo_epi32(l, h); } template<> inline Vc_PURE ushort_v ushort_v::sorted() const { __m128i lo, hi, y, x = data(); // sort pairs y = Mem::permute(x); lo = _mm_min_epu16(x, y); hi = _mm_max_epu16(x, y); x = _mm_blend_epi16(lo, hi, 0xaa); // merge left and right quads y = Mem::permute(x); lo = _mm_min_epu16(x, y); hi = _mm_max_epu16(x, y); x = _mm_blend_epi16(lo, hi, 0xcc); y = _mm_srli_si128(x, 2); lo = _mm_min_epu16(x, y); hi = _mm_max_epu16(x, y); x = _mm_blend_epi16(lo, _mm_slli_si128(hi, 2), 0xaa); // merge quads into octs y = _mm_shuffle_epi32(x, _MM_SHUFFLE(1, 0, 3, 2)); y = _mm_shufflelo_epi16(y, _MM_SHUFFLE(0, 1, 2, 3)); lo = _mm_min_epu16(x, y); hi = _mm_max_epu16(x, y); x = _mm_unpacklo_epi16(lo, hi); y = _mm_srli_si128(x, 8); lo = _mm_min_epu16(x, y); hi = _mm_max_epu16(x, y); x = _mm_unpacklo_epi16(lo, hi); y = _mm_srli_si128(x, 8); lo = _mm_min_epu16(x, y); hi = 
_mm_max_epu16(x, y);
    return _mm_unpacklo_epi16(lo, hi);
}
// }}}1

} // namespace SSE
} // namespace Vc
} // namespace ROOT

#include "undomacros.h"

// vim: foldmethod=marker
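// Example (sketch): a few of the operations implemented in this file, as seen from user
// code. This assumes the public Vc API headers are included; the values noted in the
// comments follow from the implementations above.
//
//     ROOT::Vc::float_v  r = ROOT::Vc::float_v::Random();   // pseudo-random values in [0, 1)
//     ROOT::Vc::ushort_v u = ROOT::Vc::ushort_v(7) - ROOT::Vc::ushort_v::IndexesFromZero(); // 7 6 5 ... 0
//     ROOT::Vc::ushort_v s = u.sorted();                    // 0 1 2 ... 7
//     ROOT::Vc::int_v    w = ROOT::Vc::int_v::IndexesFromZero().rotated(1);                 // 1 2 3 0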