/* This file is part of the Vc library. {{{ Copyright (C) 2012 Matthias Kretz Vc is free software: you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. Vc is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with Vc. If not, see . }}}*/ #ifndef VC_COMMON_INTERLEAVEDMEMORY_H #define VC_COMMON_INTERLEAVEDMEMORY_H #include "macros.h" namespace ROOT { namespace Vc { namespace Common { /** * \internal */ template struct InterleavedMemoryAccessBase { typedef typename V::EntryType T; typedef typename V::IndexType I; typedef typename V::AsArg VArg; typedef T Ta Vc_MAY_ALIAS; const I m_indexes; Ta *const m_data; Vc_ALWAYS_INLINE InterleavedMemoryAccessBase(typename I::AsArg indexes, Ta *data) : m_indexes(indexes), m_data(data) { } // implementations of the following are in {scalar,sse,avx}/interleavedmemory.tcc void deinterleave(V &v0, V &v1) const; void deinterleave(V &v0, V &v1, V &v2) const; void deinterleave(V &v0, V &v1, V &v2, V &v3) const; void deinterleave(V &v0, V &v1, V &v2, V &v3, V &v4) const; void deinterleave(V &v0, V &v1, V &v2, V &v3, V &v4, V &v5) const; void deinterleave(V &v0, V &v1, V &v2, V &v3, V &v4, V &v5, V &v6) const; void deinterleave(V &v0, V &v1, V &v2, V &v3, V &v4, V &v5, V &v6, V &v7) const; void interleave(VArg v0, VArg v1); void interleave(VArg v0, VArg v1, VArg v2); void interleave(VArg v0, VArg v1, VArg v2, VArg v3); void interleave(VArg v0, VArg v1, VArg v2, VArg v3, VArg v4); void interleave(VArg v0, VArg v1, VArg v2, VArg v3, VArg v4, VArg v5); void interleave(VArg v0, VArg v1, VArg v2, VArg v3, VArg v4, VArg v5, VArg v6); void interleave(VArg v0, VArg v1, VArg v2, VArg v3, VArg v4, VArg v5, VArg v6, VArg v7); }; /** * \internal */ // delay execution of the deinterleaving gather until operator= template struct InterleavedMemoryReadAccess : public InterleavedMemoryAccessBase { typedef InterleavedMemoryAccessBase Base; typedef typename Base::Ta Ta; typedef typename Base::I I; Vc_ALWAYS_INLINE InterleavedMemoryReadAccess(Ta *data, typename I::AsArg indexes) : Base(indexes * I(StructSize), data) { } }; /** * \internal */ template struct InterleavedMemoryAccess : public InterleavedMemoryReadAccess { typedef InterleavedMemoryAccessBase Base; typedef typename Base::Ta Ta; typedef typename Base::I I; Vc_ALWAYS_INLINE InterleavedMemoryAccess(Ta *data, typename I::AsArg indexes) : InterleavedMemoryReadAccess(data, indexes) { } #define _VC_SCATTER_ASSIGNMENT(LENGTH, parameters) \ Vc_ALWAYS_INLINE void operator=(const VectorTuple &rhs) \ { \ VC_STATIC_ASSERT(LENGTH <= StructSize, You_are_trying_to_scatter_more_data_into_the_struct_than_it_has); \ this->interleave parameters ; \ } \ Vc_ALWAYS_INLINE void operator=(const VectorTuple &rhs) \ { \ VC_STATIC_ASSERT(LENGTH <= StructSize, You_are_trying_to_scatter_more_data_into_the_struct_than_it_has); \ checkIndexesUnique(); \ this->interleave parameters ; \ } _VC_SCATTER_ASSIGNMENT(2, (rhs.l, rhs.r)) _VC_SCATTER_ASSIGNMENT(3, (rhs.l.l, rhs.l.r, rhs.r)); _VC_SCATTER_ASSIGNMENT(4, (rhs.l.l.l, rhs.l.l.r, rhs.l.r, rhs.r)); _VC_SCATTER_ASSIGNMENT(5, (rhs.l.l.l.l, rhs.l.l.l.r, rhs.l.l.r, rhs.l.r, rhs.r)); _VC_SCATTER_ASSIGNMENT(6, (rhs.l.l.l.l.l, rhs.l.l.l.l.r, rhs.l.l.l.r, rhs.l.l.r, rhs.l.r, rhs.r)); _VC_SCATTER_ASSIGNMENT(7, (rhs.l.l.l.l.l.l, rhs.l.l.l.l.l.r, rhs.l.l.l.l.r, rhs.l.l.l.r, rhs.l.l.r, rhs.l.r, rhs.r)); _VC_SCATTER_ASSIGNMENT(8, (rhs.l.l.l.l.l.l.l, rhs.l.l.l.l.l.l.r, rhs.l.l.l.l.l.r, rhs.l.l.l.l.r, rhs.l.l.l.r, rhs.l.l.r, rhs.l.r, rhs.r)); #undef _VC_SCATTER_ASSIGNMENT private: #ifdef NDEBUG Vc_ALWAYS_INLINE void checkIndexesUnique() const {} #else void checkIndexesUnique() const { const I test = Base::m_indexes.sorted(); VC_ASSERT(I::Size == 1 || (test == test.rotated(1)).isEmpty()) } #endif }; #ifdef DOXYGEN } // namespace Common // in doxygen InterleavedMemoryWrapper should appear in the Vc namespace (see the using statement // below) #endif /** * Wraps a pointer to memory with convenience functions to access it via vectors. * * \param S The type of the struct. * \param V The type of the vector to be returned when read. This should reflect the type of the * members inside the struct. * * \see operator[] * \ingroup Utilities * \headerfile interleavedmemory.h */ template class InterleavedMemoryWrapper { typedef typename V::EntryType T; typedef typename V::IndexType I; typedef typename V::AsArg VArg; typedef typename I::AsArg IndexType; typedef InterleavedMemoryAccess Access; typedef InterleavedMemoryReadAccess ReadAccess; typedef T Ta Vc_MAY_ALIAS; Ta *const m_data; VC_STATIC_ASSERT((sizeof(S) / sizeof(T)) * sizeof(T) == sizeof(S), InterleavedMemoryAccess_does_not_support_packed_structs); public: /** * Constructs the wrapper object. * * \param s A pointer to a C-array. */ Vc_ALWAYS_INLINE InterleavedMemoryWrapper(S *s) : m_data(reinterpret_cast(s)) { } /** * Interleaved scatter/gather access. * * Assuming you have a struct of floats and a vector of \p indexes into the array, this function * can be used to access the struct entries as vectors using the minimal number of store or load * instructions. * * \param indexes Vector of indexes that determine the gather locations. * * \return A special (magic) object that executes the loads and deinterleave on assignment to a * vector tuple. * * Example: * \code * struct Foo { * float x, y, z; * }; * * void fillWithBar(Foo *_data, uint_v indexes) * { * Vc::InterleavedMemoryWrapper data(_data); * const float_v x = bar(1); * const float_v y = bar(2); * const float_v z = bar(3); * data[indexes] = (x, y, z); * // it's also possible to just store a subset at the front of the struct: * data[indexes] = (x, y); * // if you want to store a single entry, use scatter: * z.scatter(_data, &Foo::x, indexes); * } * * float_v normalizeStuff(Foo *_data, uint_v indexes) * { * Vc::InterleavedMemoryWrapper data(_data); * float_v x, y, z; * (x, y, z) = data[indexes]; * // it is also possible to just load a subset from the front of the struct: * // (x, y) = data[indexes]; * return Vc::sqrt(x * x + y * y + z * z); * } * \endcode * * You may think of the gather operation (or scatter as the inverse) like this: \verbatim Memory: {x0 y0 z0 x1 y1 z1 x2 y2 z2 x3 y3 z3 x4 y4 z4 x5 y5 z5 x6 y6 z6 x7 y7 z7 x8 y8 z8} indexes: [5, 0, 1, 7] Result in (x, y, z): ({x5 x0 x1 x7}, {y5 y0 y1 y7}, {z5 z0 z1 z7}) \endverbatim * * \warning If \p indexes contains non-unique entries on scatter, the result is undefined. If * \c NDEBUG is not defined the implementation will assert that the \p indexes entries are unique. */ Vc_ALWAYS_INLINE Access operator[](IndexType indexes) { return Access(m_data, indexes); } /// const overload (gathers only) of the above function Vc_ALWAYS_INLINE ReadAccess operator[](IndexType indexes) const { return ReadAccess(m_data, indexes); } /// alias of the above function Vc_ALWAYS_INLINE ReadAccess gather(IndexType indexes) const { return operator[](indexes); } //Vc_ALWAYS_INLINE Access scatter(I indexes, VArg v0, VArg v1); }; #ifndef DOXYGEN } // namespace Common using Common::InterleavedMemoryWrapper; #endif } // namespace Vc } // namespace ROOT #include "undomacros.h" #endif // VC_COMMON_INTERLEAVEDMEMORY_H