From 49f750b2c46f38c606c33a325cb068c50e829d0c Mon Sep 17 00:00:00 2001 From: Stuart Archibald Date: Tue, 13 Mar 2018 23:16:48 +0000 Subject: [PATCH 3/3] Rewrite inlining This rewrites inlining of the allocators to work with GCC and defines __assume_aligned for GCC --- numpy/core/src/mkl_defs/aligned_alloc.c | 139 +++----------------------------- numpy/core/src/mkl_defs/aligned_alloc.h | 98 ++++++++++++++++++++-- numpy/core/src/umath/loops.c.src | 31 ++++--- 3 files changed, 123 insertions(+), 145 deletions(-) diff --git a/numpy/core/src/mkl_defs/aligned_alloc.c b/numpy/core/src/mkl_defs/aligned_alloc.c index 667432d..37eb172 100644 --- a/numpy/core/src/mkl_defs/aligned_alloc.c +++ b/numpy/core/src/mkl_defs/aligned_alloc.c @@ -1,19 +1,6 @@ -#include "mkl.h" -#include -#include -#ifndef Py_PYTHON_H -# include "Python.h" -#endif -#include "numpy/npy_common.h" - -#define ALIGNMENT 64 -#define __THRESHOLD 524288 -#define __UNIT_STRIDE 1 -#define __NULL_STRIDE 0 -#define __8BYTES_ALIGNMENT_OFFSET(ptr) (((size_t) (ptr)) & 0x7) -#define MKL_INT_MAX ((size_t) (~((MKL_UINT) 0) >> 1)) - -static int is_tbb_enabled(void) { +#include "aligned_alloc.h" +#if !defined(_MSC_VER) +int is_tbb_enabled(void) { static int TBB_ENABLED = -1; if (TBB_ENABLED == -1) { char* mkl_threading = getenv("MKL_THREADING_LAYER"); @@ -22,70 +9,9 @@ static int is_tbb_enabled(void) { return TBB_ENABLED; } -static NPY_INLINE void call_dcopy_chunked(size_t size, double* src, double* dest) { - while (size > MKL_INT_MAX) { - cblas_dcopy(MKL_INT_MAX, src , __NULL_STRIDE, dest, __UNIT_STRIDE); - size -= MKL_INT_MAX; - dest += MKL_INT_MAX; - } - if (size > 0) { - if (size >= __THRESHOLD) { - cblas_dcopy(size, src , __NULL_STRIDE, dest, __UNIT_STRIDE); - } else { - memset(dest, 0, size * sizeof(double)); - } - } -} - -inline void *_aligned_alloc(size_t size) { - /* Only available for Linux and OSX (has been explicitly disabled on Windows : see aligned_alloc.h) - * With Windows, we would run into composability issues with modules like h5py which allocate - * memory using libc functions in another library, like hdf5 for instance - */ - size = (size > 0) ? size : 1; - void* data = NULL; - int ret_code = posix_memalign(&data, ALIGNMENT, size); - if (ret_code == 0) { - return data; - } - return NULL; -} - - -#ifdef WITH_ALIGNED_CALLOC -inline void *_aligned_calloc(size_t nelem, size_t elsize) -{ - size_t size = nelem * elsize; - void *data = _aligned_alloc(size); - char *memory = NULL; - - if (data != NULL) { - memory = (char*) data; - if((size > __THRESHOLD) && !is_tbb_enabled()) { - size_t offset = __8BYTES_ALIGNMENT_OFFSET(8 - __8BYTES_ALIGNMENT_OFFSET(memory)); - size_t rem_size, ch_size, n_ch = (size - offset) / sizeof(double); - double init = 0; - if (offset) { - memset(memory, 0, offset); - } - - call_dcopy_chunked(n_ch, &init, (double*) (memory+offset)); - - ch_size = offset + n_ch * sizeof(double); - rem_size = size - ch_size; - if(rem_size > 0) { - memset(memory + ch_size, 0, rem_size); - } - } else { - memset(memory, 0, size); - } - } - return data; -} -#endif #if PY_VERSION_HEX >= 0x03040000 -static int is_tracemalloc_enabled(void) { +int is_tracemalloc_enabled(void) { static int TRACEMALLOC_PRESENT = -1; if (TRACEMALLOC_PRESENT == -1) { TRACEMALLOC_PRESENT = (getenv("PYTHONTRACEMALLOC")) ? 1 : 0; @@ -94,53 +20,12 @@ static int is_tracemalloc_enabled(void) { } #endif -inline void* call_aligned_malloc(size_t size) { -#if PY_VERSION_HEX >= 0x03040000 - if(is_tracemalloc_enabled()){ - return PyMem_RawMalloc(size); - } else -#endif - { - return _aligned_alloc(size); - } -} - -inline void* call_aligned_realloc(void* input, size_t size) { -#if PY_VERSION_HEX >= 0x03040000 - if(is_tracemalloc_enabled()){ - return PyMem_RawRealloc(input, size); - } else -#endif - { - if (input) { - return realloc(input, size ? size : 1); - } - return _aligned_alloc(size); - } -} - -inline void* call_aligned_calloc(size_t num, size_t size) { -#if PY_VERSION_HEX >= 0x03040000 - if(is_tracemalloc_enabled()){ - return PyMem_RawCalloc(num, size); - } else -#endif - { -#ifdef WITH_ALIGNED_CALLOC - return _aligned_calloc(num, size); -#else - return calloc(num, size); -#endif - } -} +// C99 inlining model +extern void* call_aligned_malloc(size_t); +extern void* _aligned_alloc(size_t); +extern void* call_aligned_calloc(size_t, size_t); +extern void* call_aligned_realloc(void*, size_t); +extern void call_free(void*); +#endif // !defined(_MSC_VER) +static int _prevent_empty_translation_unit; -inline void call_free(void* ptr) { -#if PY_VERSION_HEX >= 0x03040000 - if(is_tracemalloc_enabled()){ - PyMem_RawFree(ptr); - } else -#endif - { - free(ptr); - } -} diff --git a/numpy/core/src/mkl_defs/aligned_alloc.h b/numpy/core/src/mkl_defs/aligned_alloc.h index edd0080..ba19bfe 100644 --- a/numpy/core/src/mkl_defs/aligned_alloc.h +++ b/numpy/core/src/mkl_defs/aligned_alloc.h @@ -1,11 +1,95 @@ +// Not on windows #if !defined(ALIGNED_ALLOC_H) && !defined(_MSC_VER) # define ALIGNED_ALLOC_H -# include - inline void* call_aligned_malloc(size_t); - inline void* call_aligned_realloc(void*, size_t); - inline void* call_aligned_calloc(size_t, size_t); - inline void call_free(void*); -# define PyArray_malloc call_aligned_malloc + + +#ifndef Py_PYTHON_H +# include "Python.h" +#endif + +#include +#include + +#include "numpy/npy_common.h" + +#define ALIGNMENT 64 +#define __THRESHOLD 524288 +#define __UNIT_STRIDE 1 +#define __NULL_STRIDE 0 +#define __8BYTES_ALIGNMENT_OFFSET(ptr) (((size_t) (ptr)) & 0x7) +#define MKL_INT_MAX ((size_t) (~((MKL_UINT) 0) >> 1)) + +// prototypes +int is_tbb_enabled(void); +#if PY_VERSION_HEX >= 0x03040000 +int is_tracemalloc_enabled(void); +#endif + + +// inlinables + +NPY_INLINE void* _aligned_alloc(size_t size) { + /* Only available for Linux and OSX (has been explicitly disabled on Windows : see aligned_alloc.h) + * With Windows, we would run into composability issues with modules like h5py which allocate + * memory using libc functions in another library, like hdf5 for instance + */ + size = (size > 0) ? size : 1; + void* data = NULL; + int ret_code = posix_memalign(&data, ALIGNMENT, size); + if (ret_code == 0) { + return data; + } + return NULL; +} + +NPY_INLINE void* call_aligned_malloc(size_t size) { +#if PY_VERSION_HEX >= 0x03040000 + if(is_tracemalloc_enabled()){ + return PyMem_RawMalloc(size); + } else +#endif + { + return _aligned_alloc(size); + } +} + +NPY_INLINE void* call_aligned_realloc(void* input, size_t size) { +#if PY_VERSION_HEX >= 0x03040000 + if(is_tracemalloc_enabled()){ + return PyMem_RawRealloc(input, size); + } else +#endif + { + if (input) { + return realloc(input, size ? size : 1); + } + return _aligned_alloc(size); + } +} + +NPY_INLINE void* call_aligned_calloc(size_t num, size_t size) { +#if PY_VERSION_HEX >= 0x03040000 + if(is_tracemalloc_enabled()){ + return PyMem_RawCalloc(num, size); + } else +#endif + { + return calloc(num, size); + } +} + +NPY_INLINE void call_free(void* ptr) { +#if PY_VERSION_HEX >= 0x03040000 + if(is_tracemalloc_enabled()){ + PyMem_RawFree(ptr); + } else +#endif + { + free(ptr); + } +} + +# define PyArray_malloc call_aligned_malloc # define PyArray_free call_free -# define PyArray_realloc call_aligned_realloc +# define PyArray_realloc call_aligned_realloc #endif diff --git a/numpy/core/src/umath/loops.c.src b/numpy/core/src/umath/loops.c.src index c5b897f..7f8e456 100644 --- a/numpy/core/src/umath/loops.c.src +++ b/numpy/core/src/umath/loops.c.src @@ -35,6 +35,15 @@ #define VML_ASM_THRESHOLD 100000 #define VML_D_THRESHOLD 8000 +// Fix the definition of __assume_aligned for GCC/clang toolchain +#ifndef _MSC_VER +#define __assume_aligned(var, align) (var=(__typeof__(var))\ + __builtin_assume_aligned((void *)var, align)); +#else +// and hide it on windows +#define __assume_aligned(var, align) +#endif + #define MKL_INT_MAX ((npy_intp) ((~((MKL_UINT) 0)) >> 1)) #define CHUNKED_VML_CALL2(vml_func, n, type, in1, op1) \ @@ -155,7 +164,7 @@ npy_intp is1 = steps[0], os1 = steps[1];\ npy_intp n = dimensions[0];\ npy_intp i;\ - _Pragma("simd")\ + \ for(i = 0; i < n; i++, ip1 += is1, op1 += os1) #define UNARY_LOOP_DISPATCH(cond, body)\ @@ -2545,7 +2554,7 @@ NPY_NO_EXPORT void if( DISJOINT_OR_SAME(op1_shifted, ip1_aligned, j_max, 1) && DISJOINT_OR_SAME(op1_shifted, ip2_shifted, j_max, 1) ) { __assume_aligned(ip1_aligned, 64); - _Pragma("simd") + for(j = 0; j < j_max; j++) { op1_shifted[j] = ip1_aligned[j] @OP@ ip2_shifted[j]; } @@ -2714,7 +2723,7 @@ NPY_NO_EXPORT void if( DISJOINT_OR_SAME(op1_shifted, ip1_aligned, j_max, 1) && DISJOINT_OR_SAME(op1_shifted, ip2_shifted, j_max, 1) ) { __assume_aligned(ip1_aligned, 64); - _Pragma("simd") + for(j = 0; j < j_max; j++) { op1_shifted[j] = ip1_aligned[j] @OP@ ip2_shifted[j]; } @@ -2883,7 +2892,7 @@ NPY_NO_EXPORT void if( DISJOINT_OR_SAME(op1_shifted, ip1_aligned, j_max, 1) && DISJOINT_OR_SAME(op1_shifted, ip2_shifted, j_max, 1) ) { __assume_aligned(ip1_aligned, 64); - _Pragma("simd") + for(j = 0; j < j_max; j++) { op1_shifted[j] = ip1_aligned[j] @OP@ ip2_shifted[j]; } @@ -3038,7 +3047,7 @@ NPY_NO_EXPORT void const npy_intp blocked_end = npy_blocked_end(peel, sizeof(@type@), vsize, n); npy_intp i; - _Pragma("novector") + for(i = 0; i < peel; i++) { op1[i] = ip1[i] @OP@ ip2[i]; } @@ -3051,7 +3060,7 @@ NPY_NO_EXPORT void if( DISJOINT_OR_SAME(op1_shifted, ip1_aligned, j_max, 1) && DISJOINT_OR_SAME(op1_shifted, ip2_shifted, j_max, 1) ) { __assume_aligned(ip1_aligned, 64); - _Pragma("simd") + for(j = 0; j < j_max; j++) { op1_shifted[j] = ip1_aligned[j] @OP@ ip2_shifted[j]; } @@ -3066,7 +3075,7 @@ NPY_NO_EXPORT void } } - _Pragma("novector") + for(; i < n; i++) { op1[i] = ip1[i] @OP@ ip2[i]; } @@ -3082,7 +3091,7 @@ NPY_NO_EXPORT void npy_intp i; const @type@ ip1c = ip1[0]; - _Pragma("novector") + for(i = 0; i < peel; i++) { op1[i] = ip1c @OP@ ip2[i]; } @@ -3101,7 +3110,7 @@ NPY_NO_EXPORT void } } - _Pragma("novector") + for(; i < n; i++) { op1[i] = ip1c @OP@ ip2[i]; } @@ -3116,7 +3125,7 @@ NPY_NO_EXPORT void npy_intp i; const @type@ ip2c = ip2[0]; - _Pragma("novector") + for(i = 0; i < peel; i++) { op1[i] = ip1[i] @OP@ ip2c; } @@ -3135,7 +3144,7 @@ NPY_NO_EXPORT void } } - _Pragma("novector") + for(; i < n; i++) { op1[i] = ip1[i] @OP@ ip2c; } -- 1.8.3.1