diff --git a/3rdparty/opencv/include/opencv2/core/detail/async_promise.hpp b/3rdparty/opencv/include/opencv2/core/detail/async_promise.hpp
new file mode 100644
index 00000000..6eb3fb52
--- /dev/null
+++ b/3rdparty/opencv/include/opencv2/core/detail/async_promise.hpp
@@ -0,0 +1,71 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_CORE_ASYNC_PROMISE_HPP
+#define OPENCV_CORE_ASYNC_PROMISE_HPP
+
+#include "../async.hpp"
+
+#include "exception_ptr.hpp"
+
+namespace cv {
+
+/** @addtogroup core_async
+@{
+*/
+
+
+/** @brief Provides result of asynchronous operations
+
+*/
+class CV_EXPORTS AsyncPromise
+{
+public:
+    ~AsyncPromise() CV_NOEXCEPT;
+    AsyncPromise() CV_NOEXCEPT;
+    explicit AsyncPromise(const AsyncPromise& o) CV_NOEXCEPT;
+    AsyncPromise& operator=(const AsyncPromise& o) CV_NOEXCEPT;
+    void release() CV_NOEXCEPT;
+
+    /** Returns associated AsyncArray
+    @note Can be called once
+    */
+    AsyncArray getArrayResult();
+
+    /** Stores asynchronous result.
+    @param[in] value result
+    */
+    void setValue(InputArray value);
+
+    // TODO "move" setters
+
+#if CV__EXCEPTION_PTR
+    /** Stores exception.
+    @param[in] exception exception to be raised in AsyncArray
+    */
+    void setException(std::exception_ptr exception);
+#endif
+
+    /** Stores exception.
+    @param[in] exception exception to be raised in AsyncArray
+    */
+    void setException(const cv::Exception& exception);
+
+#ifdef CV_CXX11
+    explicit AsyncPromise(AsyncPromise&& o) { p = o.p; o.p = NULL; }
+    AsyncPromise& operator=(AsyncPromise&& o) CV_NOEXCEPT { std::swap(p, o.p); return *this; }
+#endif
+
+
+    // PImpl
+    typedef struct AsyncArray::Impl Impl; friend struct AsyncArray::Impl;
+    inline void* _getImpl() const CV_NOEXCEPT { return p; }
+protected:
+    Impl* p;
+};
+
+
+//! @}
+} // namespace
+#endif // OPENCV_CORE_ASYNC_PROMISE_HPP
diff --git a/3rdparty/opencv/include/opencv2/core/detail/exception_ptr.hpp b/3rdparty/opencv/include/opencv2/core/detail/exception_ptr.hpp
new file mode 100644
index 00000000..d98ffc40
--- /dev/null
+++ b/3rdparty/opencv/include/opencv2/core/detail/exception_ptr.hpp
@@ -0,0 +1,27 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
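For context, the AsyncPromise added above is the producer side of cv::AsyncArray: the promise is handed to a worker, the associated array to the caller. A minimal usage sketch follows, assuming standard OpenCV core headers; launchClone and the detached-thread wrapper are illustrative only and not part of this patch.

#include <opencv2/core.hpp>
#include <opencv2/core/detail/async_promise.hpp>  // internal API added in the hunk above
#include <thread>

cv::AsyncArray launchClone(const cv::Mat& input)         // hypothetical helper
{
    cv::AsyncPromise promise;
    cv::AsyncArray result = promise.getArrayResult();    // may be called only once
    std::thread([promise, input]() mutable {
        try {
            promise.setValue(input.clone());             // becomes visible to result.get()
        } catch (const cv::Exception& e) {
            promise.setException(e);                     // re-raised on the consumer side
        }
    }).detach();
    return result;
}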
+
+#ifndef OPENCV_CORE_DETAILS_EXCEPTION_PTR_H
+#define OPENCV_CORE_DETAILS_EXCEPTION_PTR_H
+
+#ifndef CV__EXCEPTION_PTR
+# if defined(__ANDROID__) && defined(ATOMIC_INT_LOCK_FREE) && ATOMIC_INT_LOCK_FREE < 2
+#   define CV__EXCEPTION_PTR 0 // Not supported, details: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=58938
+# elif defined(CV_CXX11)
+#   define CV__EXCEPTION_PTR 1
+# elif defined(_MSC_VER)
+#   define CV__EXCEPTION_PTR (_MSC_VER >= 1600)
+# elif defined(__clang__)
+#   define CV__EXCEPTION_PTR 0 // C++11 only (see above)
+# elif defined(__GNUC__) && defined(__GXX_EXPERIMENTAL_CXX0X__)
+#   define CV__EXCEPTION_PTR (__GXX_EXPERIMENTAL_CXX0X__ > 0)
+# endif
+#endif
+#ifndef CV__EXCEPTION_PTR
+# define CV__EXCEPTION_PTR 0
+#elif CV__EXCEPTION_PTR
+# include <exception>  // std::exception_ptr
+#endif
+
+#endif // OPENCV_CORE_DETAILS_EXCEPTION_PTR_H
diff --git a/3rdparty/opencv/include/opencv2/core/hal/intrin_avx512.hpp b/3rdparty/opencv/include/opencv2/core/hal/intrin_avx512.hpp
new file mode 100644
index 00000000..75a3bd4b
--- /dev/null
+++ b/3rdparty/opencv/include/opencv2/core/hal/intrin_avx512.hpp
@@ -0,0 +1,3078 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html
+
+#ifndef OPENCV_HAL_INTRIN_AVX512_HPP
+#define OPENCV_HAL_INTRIN_AVX512_HPP
+
+#if defined(_MSC_VER) && (_MSC_VER < 1920/*MSVS2019*/)
+# pragma warning(disable:4146) // unary minus operator applied to unsigned type, result still unsigned
+# pragma warning(disable:4309) // 'argument': truncation of constant value
+# pragma warning(disable:4310) // cast truncates constant value
+#endif
+
+#define CVT_ROUND_MODES_IMPLEMENTED 0
+
+#define CV_SIMD512 1
+#define CV_SIMD512_64F 1
+#define CV_SIMD512_FP16 0 // no native operations with FP16 type.
Only load/store from float32x8 are available (if CV_FP16 == 1) + +#define _v512_set_epu64(a7, a6, a5, a4, a3, a2, a1, a0) _mm512_set_epi64((int64)(a7),(int64)(a6),(int64)(a5),(int64)(a4),(int64)(a3),(int64)(a2),(int64)(a1),(int64)(a0)) +#define _v512_set_epu32(a15, a14, a13, a12, a11, a10, a9, a8, a7, a6, a5, a4, a3, a2, a1, a0) \ + _mm512_set_epi64(((int64)(a15)<<32)|(int64)(a14), ((int64)(a13)<<32)|(int64)(a12), ((int64)(a11)<<32)|(int64)(a10), ((int64)( a9)<<32)|(int64)( a8), \ + ((int64)( a7)<<32)|(int64)( a6), ((int64)( a5)<<32)|(int64)( a4), ((int64)( a3)<<32)|(int64)( a2), ((int64)( a1)<<32)|(int64)( a0)) +#define _v512_set_epu16(a31, a30, a29, a28, a27, a26, a25, a24, a23, a22, a21, a20, a19, a18, a17, a16, \ + a15, a14, a13, a12, a11, a10, a9, a8, a7, a6, a5, a4, a3, a2, a1, a0) \ + _v512_set_epu32(((unsigned)(a31)<<16)|(unsigned)(a30), ((unsigned)(a29)<<16)|(unsigned)(a28), ((unsigned)(a27)<<16)|(unsigned)(a26), ((unsigned)(a25)<<16)|(unsigned)(a24), \ + ((unsigned)(a23)<<16)|(unsigned)(a22), ((unsigned)(a21)<<16)|(unsigned)(a20), ((unsigned)(a19)<<16)|(unsigned)(a18), ((unsigned)(a17)<<16)|(unsigned)(a16), \ + ((unsigned)(a15)<<16)|(unsigned)(a14), ((unsigned)(a13)<<16)|(unsigned)(a12), ((unsigned)(a11)<<16)|(unsigned)(a10), ((unsigned)( a9)<<16)|(unsigned)( a8), \ + ((unsigned)( a7)<<16)|(unsigned)( a6), ((unsigned)( a5)<<16)|(unsigned)( a4), ((unsigned)( a3)<<16)|(unsigned)( a2), ((unsigned)( a1)<<16)|(unsigned)( a0)) +#define _v512_set_epu8(a63, a62, a61, a60, a59, a58, a57, a56, a55, a54, a53, a52, a51, a50, a49, a48, \ + a47, a46, a45, a44, a43, a42, a41, a40, a39, a38, a37, a36, a35, a34, a33, a32, \ + a31, a30, a29, a28, a27, a26, a25, a24, a23, a22, a21, a20, a19, a18, a17, a16, \ + a15, a14, a13, a12, a11, a10, a9, a8, a7, a6, a5, a4, a3, a2, a1, a0) \ + _v512_set_epu32(((unsigned)(a63)<<24)|((unsigned)(a62)<<16)|((unsigned)(a61)<<8)|(unsigned)(a60),((unsigned)(a59)<<24)|((unsigned)(a58)<<16)|((unsigned)(a57)<<8)|(unsigned)(a56), \ + ((unsigned)(a55)<<24)|((unsigned)(a54)<<16)|((unsigned)(a53)<<8)|(unsigned)(a52),((unsigned)(a51)<<24)|((unsigned)(a50)<<16)|((unsigned)(a49)<<8)|(unsigned)(a48), \ + ((unsigned)(a47)<<24)|((unsigned)(a46)<<16)|((unsigned)(a45)<<8)|(unsigned)(a44),((unsigned)(a43)<<24)|((unsigned)(a42)<<16)|((unsigned)(a41)<<8)|(unsigned)(a40), \ + ((unsigned)(a39)<<24)|((unsigned)(a38)<<16)|((unsigned)(a37)<<8)|(unsigned)(a36),((unsigned)(a35)<<24)|((unsigned)(a34)<<16)|((unsigned)(a33)<<8)|(unsigned)(a32), \ + ((unsigned)(a31)<<24)|((unsigned)(a30)<<16)|((unsigned)(a29)<<8)|(unsigned)(a28),((unsigned)(a27)<<24)|((unsigned)(a26)<<16)|((unsigned)(a25)<<8)|(unsigned)(a24), \ + ((unsigned)(a23)<<24)|((unsigned)(a22)<<16)|((unsigned)(a21)<<8)|(unsigned)(a20),((unsigned)(a19)<<24)|((unsigned)(a18)<<16)|((unsigned)(a17)<<8)|(unsigned)(a16), \ + ((unsigned)(a15)<<24)|((unsigned)(a14)<<16)|((unsigned)(a13)<<8)|(unsigned)(a12),((unsigned)(a11)<<24)|((unsigned)(a10)<<16)|((unsigned)( a9)<<8)|(unsigned)( a8), \ + ((unsigned)( a7)<<24)|((unsigned)( a6)<<16)|((unsigned)( a5)<<8)|(unsigned)( a4),((unsigned)( a3)<<24)|((unsigned)( a2)<<16)|((unsigned)( a1)<<8)|(unsigned)( a0)) +#define _v512_set_epi8(a63, a62, a61, a60, a59, a58, a57, a56, a55, a54, a53, a52, a51, a50, a49, a48, \ + a47, a46, a45, a44, a43, a42, a41, a40, a39, a38, a37, a36, a35, a34, a33, a32, \ + a31, a30, a29, a28, a27, a26, a25, a24, a23, a22, a21, a20, a19, a18, a17, a16, \ + a15, a14, a13, a12, a11, a10, a9, a8, a7, a6, a5, a4, a3, a2, a1, a0) \ + _v512_set_epu8((uchar)(a63), (uchar)(a62), 
(uchar)(a61), (uchar)(a60), (uchar)(a59), (uchar)(a58), (uchar)(a57), (uchar)(a56), \ + (uchar)(a55), (uchar)(a54), (uchar)(a53), (uchar)(a52), (uchar)(a51), (uchar)(a50), (uchar)(a49), (uchar)(a48), \ + (uchar)(a47), (uchar)(a46), (uchar)(a45), (uchar)(a44), (uchar)(a43), (uchar)(a42), (uchar)(a41), (uchar)(a40), \ + (uchar)(a39), (uchar)(a38), (uchar)(a37), (uchar)(a36), (uchar)(a35), (uchar)(a34), (uchar)(a33), (uchar)(a32), \ + (uchar)(a31), (uchar)(a30), (uchar)(a29), (uchar)(a28), (uchar)(a27), (uchar)(a26), (uchar)(a25), (uchar)(a24), \ + (uchar)(a23), (uchar)(a22), (uchar)(a21), (uchar)(a20), (uchar)(a19), (uchar)(a18), (uchar)(a17), (uchar)(a16), \ + (uchar)(a15), (uchar)(a14), (uchar)(a13), (uchar)(a12), (uchar)(a11), (uchar)(a10), (uchar)( a9), (uchar)( a8), \ + (uchar)( a7), (uchar)( a6), (uchar)( a5), (uchar)( a4), (uchar)( a3), (uchar)( a2), (uchar)( a1), (uchar)( a0)) + +#ifndef _mm512_cvtpd_pslo +#ifdef _mm512_zextsi256_si512 +#define _mm512_cvtpd_pslo(a) _mm512_zextps256_ps512(_mm512_cvtpd_ps(a)) +#else +//if preferred way to extend with zeros is unavailable +#define _mm512_cvtpd_pslo(a) _mm512_castps256_ps512(_mm512_cvtpd_ps(a)) +#endif +#endif +///////// Utils //////////// + +namespace +{ + +inline __m512i _v512_combine(const __m256i& lo, const __m256i& hi) +{ return _mm512_inserti32x8(_mm512_castsi256_si512(lo), hi, 1); } + +inline __m512 _v512_combine(const __m256& lo, const __m256& hi) +{ return _mm512_insertf32x8(_mm512_castps256_ps512(lo), hi, 1); } + +inline __m512d _v512_combine(const __m256d& lo, const __m256d& hi) +{ return _mm512_insertf64x4(_mm512_castpd256_pd512(lo), hi, 1); } + +inline int _v_cvtsi512_si32(const __m512i& a) +{ return _mm_cvtsi128_si32(_mm512_castsi512_si128(a)); } + +inline __m256i _v512_extract_high(const __m512i& v) +{ return _mm512_extracti32x8_epi32(v, 1); } + +inline __m256 _v512_extract_high(const __m512& v) +{ return _mm512_extractf32x8_ps(v, 1); } + +inline __m256d _v512_extract_high(const __m512d& v) +{ return _mm512_extractf64x4_pd(v, 1); } + +inline __m256i _v512_extract_low(const __m512i& v) +{ return _mm512_castsi512_si256(v); } + +inline __m256 _v512_extract_low(const __m512& v) +{ return _mm512_castps512_ps256(v); } + +inline __m256d _v512_extract_low(const __m512d& v) +{ return _mm512_castpd512_pd256(v); } + +inline __m512i _v512_insert(const __m512i& a, const __m256i& b) +{ return _mm512_inserti32x8(a, b, 0); } + +inline __m512 _v512_insert(const __m512& a, const __m256& b) +{ return _mm512_insertf32x8(a, b, 0); } + +inline __m512d _v512_insert(const __m512d& a, const __m256d& b) +{ return _mm512_insertf64x4(a, b, 0); } + +} + +namespace cv +{ + +//! 
@cond IGNORED + +CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN + +///////// Types //////////// + +struct v_uint8x64 +{ + typedef uchar lane_type; + enum { nlanes = 64 }; + __m512i val; + + explicit v_uint8x64(__m512i v) : val(v) {} + v_uint8x64(uchar v0, uchar v1, uchar v2, uchar v3, + uchar v4, uchar v5, uchar v6, uchar v7, + uchar v8, uchar v9, uchar v10, uchar v11, + uchar v12, uchar v13, uchar v14, uchar v15, + uchar v16, uchar v17, uchar v18, uchar v19, + uchar v20, uchar v21, uchar v22, uchar v23, + uchar v24, uchar v25, uchar v26, uchar v27, + uchar v28, uchar v29, uchar v30, uchar v31, + uchar v32, uchar v33, uchar v34, uchar v35, + uchar v36, uchar v37, uchar v38, uchar v39, + uchar v40, uchar v41, uchar v42, uchar v43, + uchar v44, uchar v45, uchar v46, uchar v47, + uchar v48, uchar v49, uchar v50, uchar v51, + uchar v52, uchar v53, uchar v54, uchar v55, + uchar v56, uchar v57, uchar v58, uchar v59, + uchar v60, uchar v61, uchar v62, uchar v63) + { + val = _v512_set_epu8(v63, v62, v61, v60, v59, v58, v57, v56, v55, v54, v53, v52, v51, v50, v49, v48, + v47, v46, v45, v44, v43, v42, v41, v40, v39, v38, v37, v36, v35, v34, v33, v32, + v31, v30, v29, v28, v27, v26, v25, v24, v23, v22, v21, v20, v19, v18, v17, v16, + v15, v14, v13, v12, v11, v10, v9, v8, v7, v6, v5, v4, v3, v2, v1, v0); + } + v_uint8x64() {} + + static inline v_uint8x64 zero() { return v_uint8x64(_mm512_setzero_si512()); } + + uchar get0() const { return (uchar)_v_cvtsi512_si32(val); } +}; + +struct v_int8x64 +{ + typedef schar lane_type; + enum { nlanes = 64 }; + __m512i val; + + explicit v_int8x64(__m512i v) : val(v) {} + v_int8x64(schar v0, schar v1, schar v2, schar v3, + schar v4, schar v5, schar v6, schar v7, + schar v8, schar v9, schar v10, schar v11, + schar v12, schar v13, schar v14, schar v15, + schar v16, schar v17, schar v18, schar v19, + schar v20, schar v21, schar v22, schar v23, + schar v24, schar v25, schar v26, schar v27, + schar v28, schar v29, schar v30, schar v31, + schar v32, schar v33, schar v34, schar v35, + schar v36, schar v37, schar v38, schar v39, + schar v40, schar v41, schar v42, schar v43, + schar v44, schar v45, schar v46, schar v47, + schar v48, schar v49, schar v50, schar v51, + schar v52, schar v53, schar v54, schar v55, + schar v56, schar v57, schar v58, schar v59, + schar v60, schar v61, schar v62, schar v63) + { + val = _v512_set_epi8(v63, v62, v61, v60, v59, v58, v57, v56, v55, v54, v53, v52, v51, v50, v49, v48, + v47, v46, v45, v44, v43, v42, v41, v40, v39, v38, v37, v36, v35, v34, v33, v32, + v31, v30, v29, v28, v27, v26, v25, v24, v23, v22, v21, v20, v19, v18, v17, v16, + v15, v14, v13, v12, v11, v10, v9, v8, v7, v6, v5, v4, v3, v2, v1, v0); + } + v_int8x64() {} + + static inline v_int8x64 zero() { return v_int8x64(_mm512_setzero_si512()); } + + schar get0() const { return (schar)_v_cvtsi512_si32(val); } +}; + +struct v_uint16x32 +{ + typedef ushort lane_type; + enum { nlanes = 32 }; + __m512i val; + + explicit v_uint16x32(__m512i v) : val(v) {} + v_uint16x32(ushort v0, ushort v1, ushort v2, ushort v3, + ushort v4, ushort v5, ushort v6, ushort v7, + ushort v8, ushort v9, ushort v10, ushort v11, + ushort v12, ushort v13, ushort v14, ushort v15, + ushort v16, ushort v17, ushort v18, ushort v19, + ushort v20, ushort v21, ushort v22, ushort v23, + ushort v24, ushort v25, ushort v26, ushort v27, + ushort v28, ushort v29, ushort v30, ushort v31) + { + val = _v512_set_epu16(v31, v30, v29, v28, v27, v26, v25, v24, v23, v22, v21, v20, v19, v18, v17, v16, + v15, v14, v13, v12, v11, v10, v9, v8, 
v7, v6, v5, v4, v3, v2, v1, v0); + } + v_uint16x32() {} + + static inline v_uint16x32 zero() { return v_uint16x32(_mm512_setzero_si512()); } + + ushort get0() const { return (ushort)_v_cvtsi512_si32(val); } +}; + +struct v_int16x32 +{ + typedef short lane_type; + enum { nlanes = 32 }; + __m512i val; + + explicit v_int16x32(__m512i v) : val(v) {} + v_int16x32(short v0, short v1, short v2, short v3, short v4, short v5, short v6, short v7, + short v8, short v9, short v10, short v11, short v12, short v13, short v14, short v15, + short v16, short v17, short v18, short v19, short v20, short v21, short v22, short v23, + short v24, short v25, short v26, short v27, short v28, short v29, short v30, short v31) + { + val = _v512_set_epu16((ushort)v31, (ushort)v30, (ushort)v29, (ushort)v28, (ushort)v27, (ushort)v26, (ushort)v25, (ushort)v24, + (ushort)v23, (ushort)v22, (ushort)v21, (ushort)v20, (ushort)v19, (ushort)v18, (ushort)v17, (ushort)v16, + (ushort)v15, (ushort)v14, (ushort)v13, (ushort)v12, (ushort)v11, (ushort)v10, (ushort)v9 , (ushort)v8, + (ushort)v7 , (ushort)v6 , (ushort)v5 , (ushort)v4 , (ushort)v3 , (ushort)v2 , (ushort)v1 , (ushort)v0); + } + v_int16x32() {} + + static inline v_int16x32 zero() { return v_int16x32(_mm512_setzero_si512()); } + + short get0() const { return (short)_v_cvtsi512_si32(val); } +}; + +struct v_uint32x16 +{ + typedef unsigned lane_type; + enum { nlanes = 16 }; + __m512i val; + + explicit v_uint32x16(__m512i v) : val(v) {} + v_uint32x16(unsigned v0, unsigned v1, unsigned v2, unsigned v3, + unsigned v4, unsigned v5, unsigned v6, unsigned v7, + unsigned v8, unsigned v9, unsigned v10, unsigned v11, + unsigned v12, unsigned v13, unsigned v14, unsigned v15) + { + val = _mm512_setr_epi32((int)v0, (int)v1, (int)v2, (int)v3, (int)v4, (int)v5, (int)v6, (int)v7, + (int)v8, (int)v9, (int)v10, (int)v11, (int)v12, (int)v13, (int)v14, (int)v15); + } + v_uint32x16() {} + + static inline v_uint32x16 zero() { return v_uint32x16(_mm512_setzero_si512()); } + + unsigned get0() const { return (unsigned)_v_cvtsi512_si32(val); } +}; + +struct v_int32x16 +{ + typedef int lane_type; + enum { nlanes = 16 }; + __m512i val; + + explicit v_int32x16(__m512i v) : val(v) {} + v_int32x16(int v0, int v1, int v2, int v3, int v4, int v5, int v6, int v7, + int v8, int v9, int v10, int v11, int v12, int v13, int v14, int v15) + { + val = _mm512_setr_epi32(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15); + } + v_int32x16() {} + + static inline v_int32x16 zero() { return v_int32x16(_mm512_setzero_si512()); } + + int get0() const { return _v_cvtsi512_si32(val); } +}; + +struct v_float32x16 +{ + typedef float lane_type; + enum { nlanes = 16 }; + __m512 val; + + explicit v_float32x16(__m512 v) : val(v) {} + v_float32x16(float v0, float v1, float v2, float v3, float v4, float v5, float v6, float v7, + float v8, float v9, float v10, float v11, float v12, float v13, float v14, float v15) + { + val = _mm512_setr_ps(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15); + } + v_float32x16() {} + + static inline v_float32x16 zero() { return v_float32x16(_mm512_setzero_ps()); } + + float get0() const { return _mm_cvtss_f32(_mm512_castps512_ps128(val)); } +}; + +struct v_uint64x8 +{ + typedef uint64 lane_type; + enum { nlanes = 8 }; + __m512i val; + + explicit v_uint64x8(__m512i v) : val(v) {} + v_uint64x8(uint64 v0, uint64 v1, uint64 v2, uint64 v3, uint64 v4, uint64 v5, uint64 v6, uint64 v7) + { val = _mm512_setr_epi64((int64)v0, (int64)v1, (int64)v2, (int64)v3, (int64)v4, 
(int64)v5, (int64)v6, (int64)v7); } + v_uint64x8() {} + + static inline v_uint64x8 zero() { return v_uint64x8(_mm512_setzero_si512()); } + + uint64 get0() const + { + #if defined __x86_64__ || defined _M_X64 + return (uint64)_mm_cvtsi128_si64(_mm512_castsi512_si128(val)); + #else + int a = _mm_cvtsi128_si32(_mm512_castsi512_si128(val)); + int b = _mm_cvtsi128_si32(_mm512_castsi512_si128(_mm512_srli_epi64(val, 32))); + return (unsigned)a | ((uint64)(unsigned)b << 32); + #endif + } +}; + +struct v_int64x8 +{ + typedef int64 lane_type; + enum { nlanes = 8 }; + __m512i val; + + explicit v_int64x8(__m512i v) : val(v) {} + v_int64x8(int64 v0, int64 v1, int64 v2, int64 v3, int64 v4, int64 v5, int64 v6, int64 v7) + { val = _mm512_setr_epi64(v0, v1, v2, v3, v4, v5, v6, v7); } + v_int64x8() {} + + static inline v_int64x8 zero() { return v_int64x8(_mm512_setzero_si512()); } + + int64 get0() const + { + #if defined __x86_64__ || defined _M_X64 + return (int64)_mm_cvtsi128_si64(_mm512_castsi512_si128(val)); + #else + int a = _mm_cvtsi128_si32(_mm512_castsi512_si128(val)); + int b = _mm_cvtsi128_si32(_mm512_castsi512_si128(_mm512_srli_epi64(val, 32))); + return (int64)((unsigned)a | ((uint64)(unsigned)b << 32)); + #endif + } +}; + +struct v_float64x8 +{ + typedef double lane_type; + enum { nlanes = 8 }; + __m512d val; + + explicit v_float64x8(__m512d v) : val(v) {} + v_float64x8(double v0, double v1, double v2, double v3, double v4, double v5, double v6, double v7) + { val = _mm512_setr_pd(v0, v1, v2, v3, v4, v5, v6, v7); } + v_float64x8() {} + + static inline v_float64x8 zero() { return v_float64x8(_mm512_setzero_pd()); } + + double get0() const { return _mm_cvtsd_f64(_mm512_castpd512_pd128(val)); } +}; + +//////////////// Load and store operations /////////////// + +#define OPENCV_HAL_IMPL_AVX512_LOADSTORE(_Tpvec, _Tp) \ + inline _Tpvec v512_load(const _Tp* ptr) \ + { return _Tpvec(_mm512_loadu_si512((const __m512i*)ptr)); } \ + inline _Tpvec v512_load_aligned(const _Tp* ptr) \ + { return _Tpvec(_mm512_load_si512((const __m512i*)ptr)); } \ + inline _Tpvec v512_load_low(const _Tp* ptr) \ + { \ + __m256i v256 = _mm256_loadu_si256((const __m256i*)ptr); \ + return _Tpvec(_mm512_castsi256_si512(v256)); \ + } \ + inline _Tpvec v512_load_halves(const _Tp* ptr0, const _Tp* ptr1) \ + { \ + __m256i vlo = _mm256_loadu_si256((const __m256i*)ptr0); \ + __m256i vhi = _mm256_loadu_si256((const __m256i*)ptr1); \ + return _Tpvec(_v512_combine(vlo, vhi)); \ + } \ + inline void v_store(_Tp* ptr, const _Tpvec& a) \ + { _mm512_storeu_si512((__m512i*)ptr, a.val); } \ + inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \ + { _mm512_store_si512((__m512i*)ptr, a.val); } \ + inline void v_store_aligned_nocache(_Tp* ptr, const _Tpvec& a) \ + { _mm512_stream_si512((__m512i*)ptr, a.val); } \ + inline void v_store(_Tp* ptr, const _Tpvec& a, hal::StoreMode mode) \ + { \ + if( mode == hal::STORE_UNALIGNED ) \ + _mm512_storeu_si512((__m512i*)ptr, a.val); \ + else if( mode == hal::STORE_ALIGNED_NOCACHE ) \ + _mm512_stream_si512((__m512i*)ptr, a.val); \ + else \ + _mm512_store_si512((__m512i*)ptr, a.val); \ + } \ + inline void v_store_low(_Tp* ptr, const _Tpvec& a) \ + { _mm256_storeu_si256((__m256i*)ptr, _v512_extract_low(a.val)); } \ + inline void v_store_high(_Tp* ptr, const _Tpvec& a) \ + { _mm256_storeu_si256((__m256i*)ptr, _v512_extract_high(a.val)); } + +OPENCV_HAL_IMPL_AVX512_LOADSTORE(v_uint8x64, uchar) +OPENCV_HAL_IMPL_AVX512_LOADSTORE(v_int8x64, schar) +OPENCV_HAL_IMPL_AVX512_LOADSTORE(v_uint16x32, ushort) 
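The loads and stores generated by OPENCV_HAL_IMPL_AVX512_LOADSTORE (its remaining instantiations continue directly below) follow the usual universal-intrinsics pattern. A minimal sketch, assuming the header's types are in scope and that len is a multiple of v_uint8x64::nlanes; add_u8_demo is a hypothetical helper, and the saturating operator+ it uses is defined later in this file.

inline void add_u8_demo(const uchar* a, const uchar* b, uchar* dst, int len)
{
    for (int i = 0; i < len; i += v_uint8x64::nlanes)
    {
        v_uint8x64 va = v512_load(a + i);   // unaligned 64-byte load
        v_uint8x64 vb = v512_load(b + i);
        v_store(dst + i, va + vb);          // saturating uchar add, unaligned store
    }
}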
+OPENCV_HAL_IMPL_AVX512_LOADSTORE(v_int16x32, short) +OPENCV_HAL_IMPL_AVX512_LOADSTORE(v_uint32x16, unsigned) +OPENCV_HAL_IMPL_AVX512_LOADSTORE(v_int32x16, int) +OPENCV_HAL_IMPL_AVX512_LOADSTORE(v_uint64x8, uint64) +OPENCV_HAL_IMPL_AVX512_LOADSTORE(v_int64x8, int64) + +#define OPENCV_HAL_IMPL_AVX512_LOADSTORE_FLT(_Tpvec, _Tp, suffix, halfreg) \ + inline _Tpvec v512_load(const _Tp* ptr) \ + { return _Tpvec(_mm512_loadu_##suffix(ptr)); } \ + inline _Tpvec v512_load_aligned(const _Tp* ptr) \ + { return _Tpvec(_mm512_load_##suffix(ptr)); } \ + inline _Tpvec v512_load_low(const _Tp* ptr) \ + { \ + return _Tpvec(_mm512_cast##suffix##256_##suffix##512 \ + (_mm256_loadu_##suffix(ptr))); \ + } \ + inline _Tpvec v512_load_halves(const _Tp* ptr0, const _Tp* ptr1) \ + { \ + halfreg vlo = _mm256_loadu_##suffix(ptr0); \ + halfreg vhi = _mm256_loadu_##suffix(ptr1); \ + return _Tpvec(_v512_combine(vlo, vhi)); \ + } \ + inline void v_store(_Tp* ptr, const _Tpvec& a) \ + { _mm512_storeu_##suffix(ptr, a.val); } \ + inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \ + { _mm512_store_##suffix(ptr, a.val); } \ + inline void v_store_aligned_nocache(_Tp* ptr, const _Tpvec& a) \ + { _mm512_stream_##suffix(ptr, a.val); } \ + inline void v_store(_Tp* ptr, const _Tpvec& a, hal::StoreMode mode) \ + { \ + if( mode == hal::STORE_UNALIGNED ) \ + _mm512_storeu_##suffix(ptr, a.val); \ + else if( mode == hal::STORE_ALIGNED_NOCACHE ) \ + _mm512_stream_##suffix(ptr, a.val); \ + else \ + _mm512_store_##suffix(ptr, a.val); \ + } \ + inline void v_store_low(_Tp* ptr, const _Tpvec& a) \ + { _mm256_storeu_##suffix(ptr, _v512_extract_low(a.val)); } \ + inline void v_store_high(_Tp* ptr, const _Tpvec& a) \ + { _mm256_storeu_##suffix(ptr, _v512_extract_high(a.val)); } + +OPENCV_HAL_IMPL_AVX512_LOADSTORE_FLT(v_float32x16, float, ps, __m256) +OPENCV_HAL_IMPL_AVX512_LOADSTORE_FLT(v_float64x8, double, pd, __m256d) + +#define OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, _Tpvecf, suffix, cast) \ + inline _Tpvec v_reinterpret_as_##suffix(const _Tpvecf& a) \ + { return _Tpvec(cast(a.val)); } + +#define OPENCV_HAL_IMPL_AVX512_INIT(_Tpvec, _Tp, suffix, ssuffix, ctype_s) \ + inline _Tpvec v512_setzero_##suffix() \ + { return _Tpvec(_mm512_setzero_si512()); } \ + inline _Tpvec v512_setall_##suffix(_Tp v) \ + { return _Tpvec(_mm512_set1_##ssuffix((ctype_s)v)); } \ + OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_uint8x64, suffix, OPENCV_HAL_NOP) \ + OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_int8x64, suffix, OPENCV_HAL_NOP) \ + OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_uint16x32, suffix, OPENCV_HAL_NOP) \ + OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_int16x32, suffix, OPENCV_HAL_NOP) \ + OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_uint32x16, suffix, OPENCV_HAL_NOP) \ + OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_int32x16, suffix, OPENCV_HAL_NOP) \ + OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_uint64x8, suffix, OPENCV_HAL_NOP) \ + OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_int64x8, suffix, OPENCV_HAL_NOP) \ + OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_float32x16, suffix, _mm512_castps_si512) \ + OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_float64x8, suffix, _mm512_castpd_si512) + +OPENCV_HAL_IMPL_AVX512_INIT(v_uint8x64, uchar, u8, epi8, char) +OPENCV_HAL_IMPL_AVX512_INIT(v_int8x64, schar, s8, epi8, char) +OPENCV_HAL_IMPL_AVX512_INIT(v_uint16x32, ushort, u16, epi16, short) +OPENCV_HAL_IMPL_AVX512_INIT(v_int16x32, short, s16, epi16, short) +OPENCV_HAL_IMPL_AVX512_INIT(v_uint32x16, unsigned, u32, epi32, int) +OPENCV_HAL_IMPL_AVX512_INIT(v_int32x16, int, s32, epi32, int) +OPENCV_HAL_IMPL_AVX512_INIT(v_uint64x8, 
uint64, u64, epi64, int64) +OPENCV_HAL_IMPL_AVX512_INIT(v_int64x8, int64, s64, epi64, int64) + +#define OPENCV_HAL_IMPL_AVX512_INIT_FLT(_Tpvec, _Tp, suffix, zsuffix, cast) \ + inline _Tpvec v512_setzero_##suffix() \ + { return _Tpvec(_mm512_setzero_##zsuffix()); } \ + inline _Tpvec v512_setall_##suffix(_Tp v) \ + { return _Tpvec(_mm512_set1_##zsuffix(v)); } \ + OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_uint8x64, suffix, cast) \ + OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_int8x64, suffix, cast) \ + OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_uint16x32, suffix, cast) \ + OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_int16x32, suffix, cast) \ + OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_uint32x16, suffix, cast) \ + OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_int32x16, suffix, cast) \ + OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_uint64x8, suffix, cast) \ + OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_int64x8, suffix, cast) + +OPENCV_HAL_IMPL_AVX512_INIT_FLT(v_float32x16, float, f32, ps, _mm512_castsi512_ps) +OPENCV_HAL_IMPL_AVX512_INIT_FLT(v_float64x8, double, f64, pd, _mm512_castsi512_pd) + +inline v_float32x16 v_reinterpret_as_f32(const v_float32x16& a) +{ return a; } +inline v_float32x16 v_reinterpret_as_f32(const v_float64x8& a) +{ return v_float32x16(_mm512_castpd_ps(a.val)); } + +inline v_float64x8 v_reinterpret_as_f64(const v_float64x8& a) +{ return a; } +inline v_float64x8 v_reinterpret_as_f64(const v_float32x16& a) +{ return v_float64x8(_mm512_castps_pd(a.val)); } + +// FP16 +inline v_float32x16 v512_load_expand(const float16_t* ptr) +{ + return v_float32x16(_mm512_cvtph_ps(_mm256_loadu_si256((const __m256i*)ptr))); +} + +inline void v_pack_store(float16_t* ptr, const v_float32x16& a) +{ + __m256i ah = _mm512_cvtps_ph(a.val, 0); + _mm256_storeu_si256((__m256i*)ptr, ah); +} + +/* Recombine & ZIP */ +inline void v_zip(const v_int8x64& a, const v_int8x64& b, v_int8x64& ab0, v_int8x64& ab1) +{ +#if CV_AVX_512VBMI + __m512i mask0 = _v512_set_epu8( 95, 31, 94, 30, 93, 29, 92, 28, 91, 27, 90, 26, 89, 25, 88, 24, + 87, 23, 86, 22, 85, 21, 84, 20, 83, 19, 82, 18, 81, 17, 80, 16, + 79, 15, 78, 14, 77, 13, 76, 12, 75, 11, 74, 10, 73, 9, 72, 8, + 71, 7, 70, 6, 69, 5, 68, 4, 67, 3, 66, 2, 65, 1, 64, 0); + ab0 = v_int8x64(_mm512_permutex2var_epi8(a.val, mask0, b.val)); + __m512i mask1 = _v512_set_epu8(127, 63, 126, 62, 125, 61, 124, 60, 123, 59, 122, 58, 121, 57, 120, 56, + 119, 55, 118, 54, 117, 53, 116, 52, 115, 51, 114, 50, 113, 49, 112, 48, + 111, 47, 110, 46, 109, 45, 108, 44, 107, 43, 106, 42, 105, 41, 104, 40, + 103, 39, 102, 38, 101, 37, 100, 36, 99, 35, 98, 34, 97, 33, 96, 32); + ab1 = v_int8x64(_mm512_permutex2var_epi8(a.val, mask1, b.val)); +#else + __m512i low = _mm512_unpacklo_epi8(a.val, b.val); + __m512i high = _mm512_unpackhi_epi8(a.val, b.val); + ab0 = v_int8x64(_mm512_permutex2var_epi64(low, _v512_set_epu64(11, 10, 3, 2, 9, 8, 1, 0), high)); + ab1 = v_int8x64(_mm512_permutex2var_epi64(low, _v512_set_epu64(15, 14, 7, 6, 13, 12, 5, 4), high)); +#endif +} +inline void v_zip(const v_int16x32& a, const v_int16x32& b, v_int16x32& ab0, v_int16x32& ab1) +{ + __m512i mask0 = _v512_set_epu16(47, 15, 46, 14, 45, 13, 44, 12, 43, 11, 42, 10, 41, 9, 40, 8, + 39, 7, 38, 6, 37, 5, 36, 4, 35, 3, 34, 2, 33, 1, 32, 0); + ab0 = v_int16x32(_mm512_permutex2var_epi16(a.val, mask0, b.val)); + __m512i mask1 = _v512_set_epu16(63, 31, 62, 30, 61, 29, 60, 28, 59, 27, 58, 26, 57, 25, 56, 24, + 55, 23, 54, 22, 53, 21, 52, 20, 51, 19, 50, 18, 49, 17, 48, 16); + ab1 = v_int16x32(_mm512_permutex2var_epi16(a.val, mask1, b.val)); +} +inline void v_zip(const 
v_int32x16& a, const v_int32x16& b, v_int32x16& ab0, v_int32x16& ab1) +{ + __m512i mask0 = _v512_set_epu32(23, 7, 22, 6, 21, 5, 20, 4, 19, 3, 18, 2, 17, 1, 16, 0); + ab0 = v_int32x16(_mm512_permutex2var_epi32(a.val, mask0, b.val)); + __m512i mask1 = _v512_set_epu32(31, 15, 30, 14, 29, 13, 28, 12, 27, 11, 26, 10, 25, 9, 24, 8); + ab1 = v_int32x16(_mm512_permutex2var_epi32(a.val, mask1, b.val)); +} +inline void v_zip(const v_int64x8& a, const v_int64x8& b, v_int64x8& ab0, v_int64x8& ab1) +{ + __m512i mask0 = _v512_set_epu64(11, 3, 10, 2, 9, 1, 8, 0); + ab0 = v_int64x8(_mm512_permutex2var_epi64(a.val, mask0, b.val)); + __m512i mask1 = _v512_set_epu64(15, 7, 14, 6, 13, 5, 12, 4); + ab1 = v_int64x8(_mm512_permutex2var_epi64(a.val, mask1, b.val)); +} + +inline void v_zip(const v_uint8x64& a, const v_uint8x64& b, v_uint8x64& ab0, v_uint8x64& ab1) +{ + v_int8x64 i0, i1; + v_zip(v_reinterpret_as_s8(a), v_reinterpret_as_s8(b), i0, i1); + ab0 = v_reinterpret_as_u8(i0); + ab1 = v_reinterpret_as_u8(i1); +} +inline void v_zip(const v_uint16x32& a, const v_uint16x32& b, v_uint16x32& ab0, v_uint16x32& ab1) +{ + v_int16x32 i0, i1; + v_zip(v_reinterpret_as_s16(a), v_reinterpret_as_s16(b), i0, i1); + ab0 = v_reinterpret_as_u16(i0); + ab1 = v_reinterpret_as_u16(i1); +} +inline void v_zip(const v_uint32x16& a, const v_uint32x16& b, v_uint32x16& ab0, v_uint32x16& ab1) +{ + v_int32x16 i0, i1; + v_zip(v_reinterpret_as_s32(a), v_reinterpret_as_s32(b), i0, i1); + ab0 = v_reinterpret_as_u32(i0); + ab1 = v_reinterpret_as_u32(i1); +} +inline void v_zip(const v_uint64x8& a, const v_uint64x8& b, v_uint64x8& ab0, v_uint64x8& ab1) +{ + v_int64x8 i0, i1; + v_zip(v_reinterpret_as_s64(a), v_reinterpret_as_s64(b), i0, i1); + ab0 = v_reinterpret_as_u64(i0); + ab1 = v_reinterpret_as_u64(i1); +} +inline void v_zip(const v_float32x16& a, const v_float32x16& b, v_float32x16& ab0, v_float32x16& ab1) +{ + v_int32x16 i0, i1; + v_zip(v_reinterpret_as_s32(a), v_reinterpret_as_s32(b), i0, i1); + ab0 = v_reinterpret_as_f32(i0); + ab1 = v_reinterpret_as_f32(i1); +} +inline void v_zip(const v_float64x8& a, const v_float64x8& b, v_float64x8& ab0, v_float64x8& ab1) +{ + v_int64x8 i0, i1; + v_zip(v_reinterpret_as_s64(a), v_reinterpret_as_s64(b), i0, i1); + ab0 = v_reinterpret_as_f64(i0); + ab1 = v_reinterpret_as_f64(i1); +} + +#define OPENCV_HAL_IMPL_AVX512_COMBINE(_Tpvec, suffix) \ + inline _Tpvec v_combine_low(const _Tpvec& a, const _Tpvec& b) \ + { return _Tpvec(_v512_combine(_v512_extract_low(a.val), _v512_extract_low(b.val))); } \ + inline _Tpvec v_combine_high(const _Tpvec& a, const _Tpvec& b) \ + { return _Tpvec(_v512_insert(b.val, _v512_extract_high(a.val))); } \ + inline void v_recombine(const _Tpvec& a, const _Tpvec& b, \ + _Tpvec& c, _Tpvec& d) \ + { \ + c.val = _v512_combine(_v512_extract_low(a.val),_v512_extract_low(b.val)); \ + d.val = _v512_insert(b.val,_v512_extract_high(a.val)); \ + } + + +OPENCV_HAL_IMPL_AVX512_COMBINE(v_uint8x64, epi8) +OPENCV_HAL_IMPL_AVX512_COMBINE(v_int8x64, epi8) +OPENCV_HAL_IMPL_AVX512_COMBINE(v_uint16x32, epi16) +OPENCV_HAL_IMPL_AVX512_COMBINE(v_int16x32, epi16) +OPENCV_HAL_IMPL_AVX512_COMBINE(v_uint32x16, epi32) +OPENCV_HAL_IMPL_AVX512_COMBINE(v_int32x16, epi32) +OPENCV_HAL_IMPL_AVX512_COMBINE(v_uint64x8, epi64) +OPENCV_HAL_IMPL_AVX512_COMBINE(v_int64x8, epi64) +OPENCV_HAL_IMPL_AVX512_COMBINE(v_float32x16, ps) +OPENCV_HAL_IMPL_AVX512_COMBINE(v_float64x8, pd) + +////////// Arithmetic, bitwise and comparison operations ///////// + +/* Element-wise binary and unary operations */ + +/** Non-saturating 
arithmetics **/ +#define OPENCV_HAL_IMPL_AVX512_BIN_FUNC(func, _Tpvec, intrin) \ + inline _Tpvec func(const _Tpvec& a, const _Tpvec& b) \ + { return _Tpvec(intrin(a.val, b.val)); } + +OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_add_wrap, v_uint8x64, _mm512_add_epi8) +OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_add_wrap, v_int8x64, _mm512_add_epi8) +OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_add_wrap, v_uint16x32, _mm512_add_epi16) +OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_add_wrap, v_int16x32, _mm512_add_epi16) +OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_sub_wrap, v_uint8x64, _mm512_sub_epi8) +OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_sub_wrap, v_int8x64, _mm512_sub_epi8) +OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_sub_wrap, v_uint16x32, _mm512_sub_epi16) +OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_sub_wrap, v_int16x32, _mm512_sub_epi16) +OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_mul_wrap, v_uint16x32, _mm512_mullo_epi16) +OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_mul_wrap, v_int16x32, _mm512_mullo_epi16) + +inline v_uint8x64 v_mul_wrap(const v_uint8x64& a, const v_uint8x64& b) +{ + __m512i ad = _mm512_srai_epi16(a.val, 8); + __m512i bd = _mm512_srai_epi16(b.val, 8); + __m512i p0 = _mm512_mullo_epi16(a.val, b.val); // even + __m512i p1 = _mm512_slli_epi16(_mm512_mullo_epi16(ad, bd), 8); // odd + return v_uint8x64(_mm512_mask_blend_epi8(0xAAAAAAAAAAAAAAAA, p0, p1)); +} +inline v_int8x64 v_mul_wrap(const v_int8x64& a, const v_int8x64& b) +{ + return v_reinterpret_as_s8(v_mul_wrap(v_reinterpret_as_u8(a), v_reinterpret_as_u8(b))); +} + +#define OPENCV_HAL_IMPL_AVX512_BIN_OP(bin_op, _Tpvec, intrin) \ + inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \ + { return _Tpvec(intrin(a.val, b.val)); } \ + inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b) \ + { a.val = intrin(a.val, b.val); return a; } + +OPENCV_HAL_IMPL_AVX512_BIN_OP(+, v_uint32x16, _mm512_add_epi32) +OPENCV_HAL_IMPL_AVX512_BIN_OP(-, v_uint32x16, _mm512_sub_epi32) +OPENCV_HAL_IMPL_AVX512_BIN_OP(+, v_int32x16, _mm512_add_epi32) +OPENCV_HAL_IMPL_AVX512_BIN_OP(-, v_int32x16, _mm512_sub_epi32) +OPENCV_HAL_IMPL_AVX512_BIN_OP(+, v_uint64x8, _mm512_add_epi64) +OPENCV_HAL_IMPL_AVX512_BIN_OP(-, v_uint64x8, _mm512_sub_epi64) +OPENCV_HAL_IMPL_AVX512_BIN_OP(+, v_int64x8, _mm512_add_epi64) +OPENCV_HAL_IMPL_AVX512_BIN_OP(-, v_int64x8, _mm512_sub_epi64) + +OPENCV_HAL_IMPL_AVX512_BIN_OP(*, v_uint32x16, _mm512_mullo_epi32) +OPENCV_HAL_IMPL_AVX512_BIN_OP(*, v_int32x16, _mm512_mullo_epi32) +OPENCV_HAL_IMPL_AVX512_BIN_OP(*, v_uint64x8, _mm512_mullo_epi64) +OPENCV_HAL_IMPL_AVX512_BIN_OP(*, v_int64x8, _mm512_mullo_epi64) + +/** Saturating arithmetics **/ +OPENCV_HAL_IMPL_AVX512_BIN_OP(+, v_uint8x64, _mm512_adds_epu8) +OPENCV_HAL_IMPL_AVX512_BIN_OP(-, v_uint8x64, _mm512_subs_epu8) +OPENCV_HAL_IMPL_AVX512_BIN_OP(+, v_int8x64, _mm512_adds_epi8) +OPENCV_HAL_IMPL_AVX512_BIN_OP(-, v_int8x64, _mm512_subs_epi8) +OPENCV_HAL_IMPL_AVX512_BIN_OP(+, v_uint16x32, _mm512_adds_epu16) +OPENCV_HAL_IMPL_AVX512_BIN_OP(-, v_uint16x32, _mm512_subs_epu16) +OPENCV_HAL_IMPL_AVX512_BIN_OP(+, v_int16x32, _mm512_adds_epi16) +OPENCV_HAL_IMPL_AVX512_BIN_OP(-, v_int16x32, _mm512_subs_epi16) + +OPENCV_HAL_IMPL_AVX512_BIN_OP(+, v_float32x16, _mm512_add_ps) +OPENCV_HAL_IMPL_AVX512_BIN_OP(-, v_float32x16, _mm512_sub_ps) +OPENCV_HAL_IMPL_AVX512_BIN_OP(*, v_float32x16, _mm512_mul_ps) +OPENCV_HAL_IMPL_AVX512_BIN_OP(/, v_float32x16, _mm512_div_ps) +OPENCV_HAL_IMPL_AVX512_BIN_OP(+, v_float64x8, _mm512_add_pd) +OPENCV_HAL_IMPL_AVX512_BIN_OP(-, v_float64x8, _mm512_sub_pd) +OPENCV_HAL_IMPL_AVX512_BIN_OP(*, v_float64x8, _mm512_mul_pd) 
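The wrap (v_*_wrap) and saturating (operator) families above differ only in how overflow is handled; a minimal sketch with values chosen to overflow, assuming the header's types are in scope (the float64 division overload of the operator macro continues right after this sketch). wrap_vs_saturate_demo is illustrative only.

inline void wrap_vs_saturate_demo()
{
    v_uint8x64 x = v512_setall_u8(250);
    v_uint8x64 y = v512_setall_u8(10);
    v_uint8x64 wrapped   = v_add_wrap(x, y);  // modulo 256: every lane becomes 4
    v_uint8x64 saturated = x + y;             // _mm512_adds_epu8: every lane clamps to 255
    (void)wrapped; (void)saturated;
}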
+OPENCV_HAL_IMPL_AVX512_BIN_OP(/, v_float64x8, _mm512_div_pd) + +// saturating multiply +inline v_uint8x64 operator * (const v_uint8x64& a, const v_uint8x64& b) +{ + v_uint16x32 c, d; + v_mul_expand(a, b, c, d); + return v_pack(c, d); +} +inline v_int8x64 operator * (const v_int8x64& a, const v_int8x64& b) +{ + v_int16x32 c, d; + v_mul_expand(a, b, c, d); + return v_pack(c, d); +} +inline v_uint16x32 operator * (const v_uint16x32& a, const v_uint16x32& b) +{ + __m512i pl = _mm512_mullo_epi16(a.val, b.val); + __m512i ph = _mm512_mulhi_epu16(a.val, b.val); + __m512i p0 = _mm512_unpacklo_epi16(pl, ph); + __m512i p1 = _mm512_unpackhi_epi16(pl, ph); + + const __m512i m = _mm512_set1_epi32(65535); + return v_uint16x32(_mm512_packus_epi32(_mm512_min_epu32(p0, m), _mm512_min_epu32(p1, m))); +} +inline v_int16x32 operator * (const v_int16x32& a, const v_int16x32& b) +{ + __m512i pl = _mm512_mullo_epi16(a.val, b.val); + __m512i ph = _mm512_mulhi_epi16(a.val, b.val); + __m512i p0 = _mm512_unpacklo_epi16(pl, ph); + __m512i p1 = _mm512_unpackhi_epi16(pl, ph); + return v_int16x32(_mm512_packs_epi32(p0, p1)); +} + +inline v_uint8x64& operator *= (v_uint8x64& a, const v_uint8x64& b) +{ a = a * b; return a; } +inline v_int8x64& operator *= (v_int8x64& a, const v_int8x64& b) +{ a = a * b; return a; } +inline v_uint16x32& operator *= (v_uint16x32& a, const v_uint16x32& b) +{ a = a * b; return a; } +inline v_int16x32& operator *= (v_int16x32& a, const v_int16x32& b) +{ a = a * b; return a; } + +inline v_int16x32 v_mul_hi(const v_int16x32& a, const v_int16x32& b) { return v_int16x32(_mm512_mulhi_epi16(a.val, b.val)); } +inline v_uint16x32 v_mul_hi(const v_uint16x32& a, const v_uint16x32& b) { return v_uint16x32(_mm512_mulhi_epu16(a.val, b.val)); } + +// Multiply and expand +inline void v_mul_expand(const v_uint8x64& a, const v_uint8x64& b, + v_uint16x32& c, v_uint16x32& d) +{ + v_uint16x32 a0, a1, b0, b1; + v_expand(a, a0, a1); + v_expand(b, b0, b1); + c = v_mul_wrap(a0, b0); + d = v_mul_wrap(a1, b1); +} + +inline void v_mul_expand(const v_int8x64& a, const v_int8x64& b, + v_int16x32& c, v_int16x32& d) +{ + v_int16x32 a0, a1, b0, b1; + v_expand(a, a0, a1); + v_expand(b, b0, b1); + c = v_mul_wrap(a0, b0); + d = v_mul_wrap(a1, b1); +} + +inline void v_mul_expand(const v_int16x32& a, const v_int16x32& b, + v_int32x16& c, v_int32x16& d) +{ + v_int16x32 v0, v1; + v_zip(v_mul_wrap(a, b), v_mul_hi(a, b), v0, v1); + + c = v_reinterpret_as_s32(v0); + d = v_reinterpret_as_s32(v1); +} + +inline void v_mul_expand(const v_uint16x32& a, const v_uint16x32& b, + v_uint32x16& c, v_uint32x16& d) +{ + v_uint16x32 v0, v1; + v_zip(v_mul_wrap(a, b), v_mul_hi(a, b), v0, v1); + + c = v_reinterpret_as_u32(v0); + d = v_reinterpret_as_u32(v1); +} + +inline void v_mul_expand(const v_uint32x16& a, const v_uint32x16& b, + v_uint64x8& c, v_uint64x8& d) +{ + v_zip(v_uint64x8(_mm512_mul_epu32(a.val, b.val)), + v_uint64x8(_mm512_mul_epu32(_mm512_srli_epi64(a.val, 32), _mm512_srli_epi64(b.val, 32))), c, d); +} + +inline void v_mul_expand(const v_int32x16& a, const v_int32x16& b, + v_int64x8& c, v_int64x8& d) +{ + v_zip(v_int64x8(_mm512_mul_epi32(a.val, b.val)), + v_int64x8(_mm512_mul_epi32(_mm512_srli_epi64(a.val, 32), _mm512_srli_epi64(b.val, 32))), c, d); +} + +/** Bitwise shifts **/ +#define OPENCV_HAL_IMPL_AVX512_SHIFT_OP(_Tpuvec, _Tpsvec, suffix) \ + inline _Tpuvec operator << (const _Tpuvec& a, int imm) \ + { return _Tpuvec(_mm512_slli_##suffix(a.val, imm)); } \ + inline _Tpsvec operator << (const _Tpsvec& a, int imm) \ + { return 
_Tpsvec(_mm512_slli_##suffix(a.val, imm)); } \ + inline _Tpuvec operator >> (const _Tpuvec& a, int imm) \ + { return _Tpuvec(_mm512_srli_##suffix(a.val, imm)); } \ + inline _Tpsvec operator >> (const _Tpsvec& a, int imm) \ + { return _Tpsvec(_mm512_srai_##suffix(a.val, imm)); } \ + template \ + inline _Tpuvec v_shl(const _Tpuvec& a) \ + { return _Tpuvec(_mm512_slli_##suffix(a.val, imm)); } \ + template \ + inline _Tpsvec v_shl(const _Tpsvec& a) \ + { return _Tpsvec(_mm512_slli_##suffix(a.val, imm)); } \ + template \ + inline _Tpuvec v_shr(const _Tpuvec& a) \ + { return _Tpuvec(_mm512_srli_##suffix(a.val, imm)); } \ + template \ + inline _Tpsvec v_shr(const _Tpsvec& a) \ + { return _Tpsvec(_mm512_srai_##suffix(a.val, imm)); } + +OPENCV_HAL_IMPL_AVX512_SHIFT_OP(v_uint16x32, v_int16x32, epi16) +OPENCV_HAL_IMPL_AVX512_SHIFT_OP(v_uint32x16, v_int32x16, epi32) +OPENCV_HAL_IMPL_AVX512_SHIFT_OP(v_uint64x8, v_int64x8, epi64) + + +/** Bitwise logic **/ +#define OPENCV_HAL_IMPL_AVX512_LOGIC_OP(_Tpvec, suffix, not_const) \ + OPENCV_HAL_IMPL_AVX512_BIN_OP(&, _Tpvec, _mm512_and_##suffix) \ + OPENCV_HAL_IMPL_AVX512_BIN_OP(|, _Tpvec, _mm512_or_##suffix) \ + OPENCV_HAL_IMPL_AVX512_BIN_OP(^, _Tpvec, _mm512_xor_##suffix) \ + inline _Tpvec operator ~ (const _Tpvec& a) \ + { return _Tpvec(_mm512_xor_##suffix(a.val, not_const)); } + +OPENCV_HAL_IMPL_AVX512_LOGIC_OP(v_uint8x64, si512, _mm512_set1_epi32(-1)) +OPENCV_HAL_IMPL_AVX512_LOGIC_OP(v_int8x64, si512, _mm512_set1_epi32(-1)) +OPENCV_HAL_IMPL_AVX512_LOGIC_OP(v_uint16x32, si512, _mm512_set1_epi32(-1)) +OPENCV_HAL_IMPL_AVX512_LOGIC_OP(v_int16x32, si512, _mm512_set1_epi32(-1)) +OPENCV_HAL_IMPL_AVX512_LOGIC_OP(v_uint32x16, si512, _mm512_set1_epi32(-1)) +OPENCV_HAL_IMPL_AVX512_LOGIC_OP(v_int32x16, si512, _mm512_set1_epi32(-1)) +OPENCV_HAL_IMPL_AVX512_LOGIC_OP(v_uint64x8, si512, _mm512_set1_epi64(-1)) +OPENCV_HAL_IMPL_AVX512_LOGIC_OP(v_int64x8, si512, _mm512_set1_epi64(-1)) +OPENCV_HAL_IMPL_AVX512_LOGIC_OP(v_float32x16, ps, _mm512_castsi512_ps(_mm512_set1_epi32(-1))) +OPENCV_HAL_IMPL_AVX512_LOGIC_OP(v_float64x8, pd, _mm512_castsi512_pd(_mm512_set1_epi32(-1))) + +/** Select **/ +#define OPENCV_HAL_IMPL_AVX512_SELECT(_Tpvec, suffix, zsuf) \ + inline _Tpvec v_select(const _Tpvec& mask, const _Tpvec& a, const _Tpvec& b) \ + { return _Tpvec(_mm512_mask_blend_##suffix(_mm512_cmp_##suffix##_mask(mask.val, _mm512_setzero_##zsuf(), _MM_CMPINT_EQ), a.val, b.val)); } + +OPENCV_HAL_IMPL_AVX512_SELECT(v_uint8x64, epi8, si512) +OPENCV_HAL_IMPL_AVX512_SELECT(v_int8x64, epi8, si512) +OPENCV_HAL_IMPL_AVX512_SELECT(v_uint16x32, epi16, si512) +OPENCV_HAL_IMPL_AVX512_SELECT(v_int16x32, epi16, si512) +OPENCV_HAL_IMPL_AVX512_SELECT(v_uint32x16, epi32, si512) +OPENCV_HAL_IMPL_AVX512_SELECT(v_int32x16, epi32, si512) +OPENCV_HAL_IMPL_AVX512_SELECT(v_uint64x8, epi64, si512) +OPENCV_HAL_IMPL_AVX512_SELECT(v_int64x8, epi64, si512) +OPENCV_HAL_IMPL_AVX512_SELECT(v_float32x16, ps, ps) +OPENCV_HAL_IMPL_AVX512_SELECT(v_float64x8, pd, pd) + +/** Comparison **/ +#define OPENCV_HAL_IMPL_AVX512_CMP_INT(bin_op, imm8, _Tpvec, sufcmp, sufset, tval) \ + inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \ + { return _Tpvec(_mm512_maskz_set1_##sufset(_mm512_cmp_##sufcmp##_mask(a.val, b.val, imm8), tval)); } + +#define OPENCV_HAL_IMPL_AVX512_CMP_OP_INT(_Tpvec, sufcmp, sufset, tval) \ + OPENCV_HAL_IMPL_AVX512_CMP_INT(==, _MM_CMPINT_EQ, _Tpvec, sufcmp, sufset, tval) \ + OPENCV_HAL_IMPL_AVX512_CMP_INT(!=, _MM_CMPINT_NE, _Tpvec, sufcmp, sufset, tval) \ + OPENCV_HAL_IMPL_AVX512_CMP_INT(<, 
_MM_CMPINT_LT, _Tpvec, sufcmp, sufset, tval) \ + OPENCV_HAL_IMPL_AVX512_CMP_INT(>, _MM_CMPINT_NLE, _Tpvec, sufcmp, sufset, tval) \ + OPENCV_HAL_IMPL_AVX512_CMP_INT(<=, _MM_CMPINT_LE, _Tpvec, sufcmp, sufset, tval) \ + OPENCV_HAL_IMPL_AVX512_CMP_INT(>=, _MM_CMPINT_NLT, _Tpvec, sufcmp, sufset, tval) + +OPENCV_HAL_IMPL_AVX512_CMP_OP_INT(v_uint8x64, epu8, epi8, (char)-1) +OPENCV_HAL_IMPL_AVX512_CMP_OP_INT(v_int8x64, epi8, epi8, (char)-1) +OPENCV_HAL_IMPL_AVX512_CMP_OP_INT(v_uint16x32, epu16, epi16, (short)-1) +OPENCV_HAL_IMPL_AVX512_CMP_OP_INT(v_int16x32, epi16, epi16, (short)-1) +OPENCV_HAL_IMPL_AVX512_CMP_OP_INT(v_uint32x16, epu32, epi32, (int)-1) +OPENCV_HAL_IMPL_AVX512_CMP_OP_INT(v_int32x16, epi32, epi32, (int)-1) +OPENCV_HAL_IMPL_AVX512_CMP_OP_INT(v_uint64x8, epu64, epi64, (int64)-1) +OPENCV_HAL_IMPL_AVX512_CMP_OP_INT(v_int64x8, epi64, epi64, (int64)-1) + +#define OPENCV_HAL_IMPL_AVX512_CMP_FLT(bin_op, imm8, _Tpvec, sufcmp, sufset, tval) \ + inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \ + { return _Tpvec(_mm512_castsi512_##sufcmp(_mm512_maskz_set1_##sufset(_mm512_cmp_##sufcmp##_mask(a.val, b.val, imm8), tval))); } + +#define OPENCV_HAL_IMPL_AVX512_CMP_OP_FLT(_Tpvec, sufcmp, sufset, tval) \ + OPENCV_HAL_IMPL_AVX512_CMP_FLT(==, _CMP_EQ_OQ, _Tpvec, sufcmp, sufset, tval) \ + OPENCV_HAL_IMPL_AVX512_CMP_FLT(!=, _CMP_NEQ_OQ, _Tpvec, sufcmp, sufset, tval) \ + OPENCV_HAL_IMPL_AVX512_CMP_FLT(<, _CMP_LT_OQ, _Tpvec, sufcmp, sufset, tval) \ + OPENCV_HAL_IMPL_AVX512_CMP_FLT(>, _CMP_GT_OQ, _Tpvec, sufcmp, sufset, tval) \ + OPENCV_HAL_IMPL_AVX512_CMP_FLT(<=, _CMP_LE_OQ, _Tpvec, sufcmp, sufset, tval) \ + OPENCV_HAL_IMPL_AVX512_CMP_FLT(>=, _CMP_GE_OQ, _Tpvec, sufcmp, sufset, tval) + +OPENCV_HAL_IMPL_AVX512_CMP_OP_FLT(v_float32x16, ps, epi32, (int)-1) +OPENCV_HAL_IMPL_AVX512_CMP_OP_FLT(v_float64x8, pd, epi64, (int64)-1) + +inline v_float32x16 v_not_nan(const v_float32x16& a) +{ return v_float32x16(_mm512_castsi512_ps(_mm512_maskz_set1_epi32(_mm512_cmp_ps_mask(a.val, a.val, _CMP_ORD_Q), (int)-1))); } +inline v_float64x8 v_not_nan(const v_float64x8& a) +{ return v_float64x8(_mm512_castsi512_pd(_mm512_maskz_set1_epi64(_mm512_cmp_pd_mask(a.val, a.val, _CMP_ORD_Q), (int64)-1))); } + +/** min/max **/ +OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_min, v_uint8x64, _mm512_min_epu8) +OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_max, v_uint8x64, _mm512_max_epu8) +OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_min, v_int8x64, _mm512_min_epi8) +OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_max, v_int8x64, _mm512_max_epi8) +OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_min, v_uint16x32, _mm512_min_epu16) +OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_max, v_uint16x32, _mm512_max_epu16) +OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_min, v_int16x32, _mm512_min_epi16) +OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_max, v_int16x32, _mm512_max_epi16) +OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_min, v_uint32x16, _mm512_min_epu32) +OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_max, v_uint32x16, _mm512_max_epu32) +OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_min, v_int32x16, _mm512_min_epi32) +OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_max, v_int32x16, _mm512_max_epi32) +OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_min, v_uint64x8, _mm512_min_epu64) +OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_max, v_uint64x8, _mm512_max_epu64) +OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_min, v_int64x8, _mm512_min_epi64) +OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_max, v_int64x8, _mm512_max_epi64) +OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_min, v_float32x16, _mm512_min_ps) +OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_max, v_float32x16, _mm512_max_ps) +OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_min, v_float64x8, 
_mm512_min_pd) +OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_max, v_float64x8, _mm512_max_pd) + +/** Rotate **/ +namespace { + template + struct _v_rotate_right { static inline v_int8x64 eval(const v_int8x64&, const v_int8x64&) { return v_int8x64(); }}; + template + struct _v_rotate_right { static inline v_int8x64 eval(const v_int8x64& a, const v_int8x64& b) + { + return v_int8x64(_mm512_or_si512(_mm512_srli_epi32(_mm512_alignr_epi32(b.val, a.val, imm32 ), imm4 *8), + _mm512_slli_epi32(_mm512_alignr_epi32(b.val, a.val, imm32 + 1), (4-imm4)*8))); + }}; + template + struct _v_rotate_right { static inline v_int8x64 eval(const v_int8x64& a, const v_int8x64& b) + { + return v_int8x64(_mm512_or_si512(_mm512_srli_epi32(_mm512_alignr_epi32(b.val, a.val, 15), imm4 *8), + _mm512_slli_epi32( b.val, (4-imm4)*8))); + }}; + template + struct _v_rotate_right { static inline v_int8x64 eval(const v_int8x64&, const v_int8x64& b) + { + return v_int8x64(_mm512_or_si512(_mm512_srli_epi32(_mm512_alignr_epi32(_mm512_setzero_si512(), b.val, imm32 - 16), imm4 *8), + _mm512_slli_epi32(_mm512_alignr_epi32(_mm512_setzero_si512(), b.val, imm32 - 15), (4-imm4)*8))); + }}; + template + struct _v_rotate_right { static inline v_int8x64 eval(const v_int8x64&, const v_int8x64& b) + { return v_int8x64(_mm512_srli_epi32(_mm512_alignr_epi32(_mm512_setzero_si512(), b.val, 15), imm4*8)); }}; + template + struct _v_rotate_right { static inline v_int8x64 eval(const v_int8x64& a, const v_int8x64& b) + { return v_int8x64(_mm512_alignr_epi32(b.val, a.val, imm32)); }}; + template<> + struct _v_rotate_right { static inline v_int8x64 eval(const v_int8x64& a, const v_int8x64&) { return a; }}; + template + struct _v_rotate_right { static inline v_int8x64 eval(const v_int8x64&, const v_int8x64& b) + { return v_int8x64(_mm512_alignr_epi32(_mm512_setzero_si512(), b.val, imm32 - 16)); }}; + template<> + struct _v_rotate_right { static inline v_int8x64 eval(const v_int8x64&, const v_int8x64& b) { return b; }}; + template<> + struct _v_rotate_right { static inline v_int8x64 eval(const v_int8x64&, const v_int8x64&) { return v_int8x64(); }}; +} +template inline v_int8x64 v_rotate_right(const v_int8x64& a, const v_int8x64& b) +{ + return imm >= 128 ? 
v_int8x64() : +#if CV_AVX_512VBMI + v_int8x64(_mm512_permutex2var_epi8(a.val, + _v512_set_epu8(0x3f + imm, 0x3e + imm, 0x3d + imm, 0x3c + imm, 0x3b + imm, 0x3a + imm, 0x39 + imm, 0x38 + imm, + 0x37 + imm, 0x36 + imm, 0x35 + imm, 0x34 + imm, 0x33 + imm, 0x32 + imm, 0x31 + imm, 0x30 + imm, + 0x2f + imm, 0x2e + imm, 0x2d + imm, 0x2c + imm, 0x2b + imm, 0x2a + imm, 0x29 + imm, 0x28 + imm, + 0x27 + imm, 0x26 + imm, 0x25 + imm, 0x24 + imm, 0x23 + imm, 0x22 + imm, 0x21 + imm, 0x20 + imm, + 0x1f + imm, 0x1e + imm, 0x1d + imm, 0x1c + imm, 0x1b + imm, 0x1a + imm, 0x19 + imm, 0x18 + imm, + 0x17 + imm, 0x16 + imm, 0x15 + imm, 0x14 + imm, 0x13 + imm, 0x12 + imm, 0x11 + imm, 0x10 + imm, + 0x0f + imm, 0x0e + imm, 0x0d + imm, 0x0c + imm, 0x0b + imm, 0x0a + imm, 0x09 + imm, 0x08 + imm, + 0x07 + imm, 0x06 + imm, 0x05 + imm, 0x04 + imm, 0x03 + imm, 0x02 + imm, 0x01 + imm, 0x00 + imm), b.val)); +#else + _v_rotate_right 15), imm/4>::eval(a, b); +#endif +} +template +inline v_int8x64 v_rotate_left(const v_int8x64& a, const v_int8x64& b) +{ + if (imm == 0) return a; + if (imm == 64) return b; + if (imm >= 128) return v_int8x64(); +#if CV_AVX_512VBMI + return v_int8x64(_mm512_permutex2var_epi8(b.val, + _v512_set_epi8(0x7f - imm,0x7e - imm,0x7d - imm,0x7c - imm,0x7b - imm,0x7a - imm,0x79 - imm,0x78 - imm, + 0x77 - imm,0x76 - imm,0x75 - imm,0x74 - imm,0x73 - imm,0x72 - imm,0x71 - imm,0x70 - imm, + 0x6f - imm,0x6e - imm,0x6d - imm,0x6c - imm,0x6b - imm,0x6a - imm,0x69 - imm,0x68 - imm, + 0x67 - imm,0x66 - imm,0x65 - imm,0x64 - imm,0x63 - imm,0x62 - imm,0x61 - imm,0x60 - imm, + 0x5f - imm,0x5e - imm,0x5d - imm,0x5c - imm,0x5b - imm,0x5a - imm,0x59 - imm,0x58 - imm, + 0x57 - imm,0x56 - imm,0x55 - imm,0x54 - imm,0x53 - imm,0x52 - imm,0x51 - imm,0x50 - imm, + 0x4f - imm,0x4e - imm,0x4d - imm,0x4c - imm,0x4b - imm,0x4a - imm,0x49 - imm,0x48 - imm, + 0x47 - imm,0x46 - imm,0x45 - imm,0x44 - imm,0x43 - imm,0x42 - imm,0x41 - imm,0x40 - imm), a.val)); +#else + return imm < 64 ? 
v_rotate_right<64 - imm>(b, a) : v_rotate_right<128 - imm>(v512_setzero_s8(), b); +#endif +} +template +inline v_int8x64 v_rotate_right(const v_int8x64& a) +{ + if (imm == 0) return a; + if (imm >= 64) return v_int8x64(); +#if CV_AVX_512VBMI + return v_int8x64(_mm512_maskz_permutexvar_epi8(0xFFFFFFFFFFFFFFFF >> imm, + _v512_set_epu8(0x3f + imm,0x3e + imm,0x3d + imm,0x3c + imm,0x3b + imm,0x3a + imm,0x39 + imm,0x38 + imm, + 0x37 + imm,0x36 + imm,0x35 + imm,0x34 + imm,0x33 + imm,0x32 + imm,0x31 + imm,0x30 + imm, + 0x2f + imm,0x2e + imm,0x2d + imm,0x2c + imm,0x2b + imm,0x2a + imm,0x29 + imm,0x28 + imm, + 0x27 + imm,0x26 + imm,0x25 + imm,0x24 + imm,0x23 + imm,0x22 + imm,0x21 + imm,0x20 + imm, + 0x1f + imm,0x1e + imm,0x1d + imm,0x1c + imm,0x1b + imm,0x1a + imm,0x19 + imm,0x18 + imm, + 0x17 + imm,0x16 + imm,0x15 + imm,0x14 + imm,0x13 + imm,0x12 + imm,0x11 + imm,0x10 + imm, + 0x0f + imm,0x0e + imm,0x0d + imm,0x0c + imm,0x0b + imm,0x0a + imm,0x09 + imm,0x08 + imm, + 0x07 + imm,0x06 + imm,0x05 + imm,0x04 + imm,0x03 + imm,0x02 + imm,0x01 + imm,0x00 + imm), a.val)); +#else + return v_rotate_right(a, v512_setzero_s8()); +#endif +} +template +inline v_int8x64 v_rotate_left(const v_int8x64& a) +{ + if (imm == 0) return a; + if (imm >= 64) return v_int8x64(); +#if CV_AVX_512VBMI + return v_int8x64(_mm512_maskz_permutexvar_epi8(0xFFFFFFFFFFFFFFFF << imm, + _v512_set_epi8(0x3f - imm,0x3e - imm,0x3d - imm,0x3c - imm,0x3b - imm,0x3a - imm,0x39 - imm,0x38 - imm, + 0x37 - imm,0x36 - imm,0x35 - imm,0x34 - imm,0x33 - imm,0x32 - imm,0x31 - imm,0x30 - imm, + 0x2f - imm,0x2e - imm,0x2d - imm,0x2c - imm,0x2b - imm,0x2a - imm,0x29 - imm,0x28 - imm, + 0x27 - imm,0x26 - imm,0x25 - imm,0x24 - imm,0x23 - imm,0x22 - imm,0x21 - imm,0x20 - imm, + 0x1f - imm,0x1e - imm,0x1d - imm,0x1c - imm,0x1b - imm,0x1a - imm,0x19 - imm,0x18 - imm, + 0x17 - imm,0x16 - imm,0x15 - imm,0x14 - imm,0x13 - imm,0x12 - imm,0x11 - imm,0x10 - imm, + 0x0f - imm,0x0e - imm,0x0d - imm,0x0c - imm,0x0b - imm,0x0a - imm,0x09 - imm,0x08 - imm, + 0x07 - imm,0x06 - imm,0x05 - imm,0x04 - imm,0x03 - imm,0x02 - imm,0x01 - imm,0x00 - imm), a.val)); +#else + return v_rotate_right<64 - imm>(v512_setzero_s8(), a); +#endif +} + +#define OPENCV_HAL_IMPL_AVX512_ROTATE_PM(_Tpvec, suffix) \ +template inline _Tpvec v_rotate_left(const _Tpvec& a, const _Tpvec& b) \ +{ return v_reinterpret_as_##suffix(v_rotate_left(v_reinterpret_as_s8(a), v_reinterpret_as_s8(b))); } \ +template inline _Tpvec v_rotate_right(const _Tpvec& a, const _Tpvec& b) \ +{ return v_reinterpret_as_##suffix(v_rotate_right(v_reinterpret_as_s8(a), v_reinterpret_as_s8(b))); } \ +template inline _Tpvec v_rotate_left(const _Tpvec& a) \ +{ return v_reinterpret_as_##suffix(v_rotate_left(v_reinterpret_as_s8(a))); } \ +template inline _Tpvec v_rotate_right(const _Tpvec& a) \ +{ return v_reinterpret_as_##suffix(v_rotate_right(v_reinterpret_as_s8(a))); } + +#define OPENCV_HAL_IMPL_AVX512_ROTATE_EC(_Tpvec, suffix) \ +template \ +inline _Tpvec v_rotate_left(const _Tpvec& a, const _Tpvec& b) \ +{ \ + enum { SHIFT2 = (_Tpvec::nlanes - imm) }; \ + enum { MASK = ((1 << _Tpvec::nlanes) - 1) }; \ + if (imm == 0) return a; \ + if (imm == _Tpvec::nlanes) return b; \ + if (imm >= 2*_Tpvec::nlanes) return _Tpvec::zero(); \ + return _Tpvec(_mm512_mask_expand_##suffix(_mm512_maskz_compress_##suffix((MASK << SHIFT2)&MASK, b.val), (MASK << (imm))&MASK, a.val)); \ +} \ +template \ +inline _Tpvec v_rotate_right(const _Tpvec& a, const _Tpvec& b) \ +{ \ + enum { SHIFT2 = (_Tpvec::nlanes - imm) }; \ + enum { MASK = ((1 << 
_Tpvec::nlanes) - 1) }; \ + if (imm == 0) return a; \ + if (imm == _Tpvec::nlanes) return b; \ + if (imm >= 2*_Tpvec::nlanes) return _Tpvec::zero(); \ + return _Tpvec(_mm512_mask_expand_##suffix(_mm512_maskz_compress_##suffix((MASK << (imm))&MASK, a.val), (MASK << SHIFT2)&MASK, b.val)); \ +} \ +template \ +inline _Tpvec v_rotate_left(const _Tpvec& a) \ +{ \ + if (imm == 0) return a; \ + if (imm >= _Tpvec::nlanes) return _Tpvec::zero(); \ + return _Tpvec(_mm512_maskz_expand_##suffix((1 << _Tpvec::nlanes) - (1 << (imm)), a.val)); \ +} \ +template \ +inline _Tpvec v_rotate_right(const _Tpvec& a) \ +{ \ + if (imm == 0) return a; \ + if (imm >= _Tpvec::nlanes) return _Tpvec::zero(); \ + return _Tpvec(_mm512_maskz_compress_##suffix((1 << _Tpvec::nlanes) - (1 << (imm)), a.val)); \ +} + +OPENCV_HAL_IMPL_AVX512_ROTATE_PM(v_uint8x64, u8) +OPENCV_HAL_IMPL_AVX512_ROTATE_PM(v_uint16x32, u16) +OPENCV_HAL_IMPL_AVX512_ROTATE_PM(v_int16x32, s16) +OPENCV_HAL_IMPL_AVX512_ROTATE_EC(v_uint32x16, epi32) +OPENCV_HAL_IMPL_AVX512_ROTATE_EC(v_int32x16, epi32) +OPENCV_HAL_IMPL_AVX512_ROTATE_EC(v_uint64x8, epi64) +OPENCV_HAL_IMPL_AVX512_ROTATE_EC(v_int64x8, epi64) +OPENCV_HAL_IMPL_AVX512_ROTATE_EC(v_float32x16, ps) +OPENCV_HAL_IMPL_AVX512_ROTATE_EC(v_float64x8, pd) + +/** Reverse **/ +inline v_uint8x64 v_reverse(const v_uint8x64 &a) +{ +#if CV_AVX_512VBMI + static const __m512i perm = _mm512_set_epi32( + 0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f, + 0x10111213, 0x14151617, 0x18191a1b, 0x1c1d1e1f, + 0x20212223, 0x24252627, 0x28292a2b, 0x2c2d2e2f, + 0x30313233, 0x34353637, 0x38393a3b, 0x3c3d3e3f); + return v_uint8x64(_mm512_permutexvar_epi8(perm, a.val)); +#else + static const __m512i shuf = _mm512_set_epi32( + 0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f, + 0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f, + 0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f, + 0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f); + static const __m512i perm = _mm512_set_epi64(1, 0, 3, 2, 5, 4, 7, 6); + __m512i vec = _mm512_shuffle_epi8(a.val, shuf); + return v_uint8x64(_mm512_permutexvar_epi64(perm, vec)); +#endif +} + +inline v_int8x64 v_reverse(const v_int8x64 &a) +{ return v_reinterpret_as_s8(v_reverse(v_reinterpret_as_u8(a))); } + +inline v_uint16x32 v_reverse(const v_uint16x32 &a) +{ +#if CV_AVX_512VBMI + static const __m512i perm = _mm512_set_epi32( + 0x00000001, 0x00020003, 0x00040005, 0x00060007, + 0x00080009, 0x000a000b, 0x000c000d, 0x000e000f, + 0x00100011, 0x00120013, 0x00140015, 0x00160017, + 0x00180019, 0x001a001b, 0x001c001d, 0x001e001f); + return v_uint16x32(_mm512_permutexvar_epi16(perm, a.val)); +#else + static const __m512i shuf = _mm512_set_epi32( + 0x01000302, 0x05040706, 0x09080b0a, 0x0d0c0f0e, + 0x01000302, 0x05040706, 0x09080b0a, 0x0d0c0f0e, + 0x01000302, 0x05040706, 0x09080b0a, 0x0d0c0f0e, + 0x01000302, 0x05040706, 0x09080b0a, 0x0d0c0f0e); + static const __m512i perm = _mm512_set_epi64(1, 0, 3, 2, 5, 4, 7, 6); + __m512i vec = _mm512_shuffle_epi8(a.val, shuf); + return v_uint16x32(_mm512_permutexvar_epi64(perm, vec)); +#endif +} + +inline v_int16x32 v_reverse(const v_int16x32 &a) +{ return v_reinterpret_as_s16(v_reverse(v_reinterpret_as_u16(a))); } + +inline v_uint32x16 v_reverse(const v_uint32x16 &a) +{ + static const __m512i perm = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,14, 15); + return v_uint32x16(_mm512_permutexvar_epi32(perm, a.val)); +} + +inline v_int32x16 v_reverse(const v_int32x16 &a) +{ return v_reinterpret_as_s32(v_reverse(v_reinterpret_as_u32(a))); } + +inline v_float32x16 
v_reverse(const v_float32x16 &a) +{ return v_reinterpret_as_f32(v_reverse(v_reinterpret_as_u32(a))); } + +inline v_uint64x8 v_reverse(const v_uint64x8 &a) +{ + static const __m512i perm = _mm512_set_epi64(0, 1, 2, 3, 4, 5, 6, 7); + return v_uint64x8(_mm512_permutexvar_epi64(perm, a.val)); +} + +inline v_int64x8 v_reverse(const v_int64x8 &a) +{ return v_reinterpret_as_s64(v_reverse(v_reinterpret_as_u64(a))); } + +inline v_float64x8 v_reverse(const v_float64x8 &a) +{ return v_reinterpret_as_f64(v_reverse(v_reinterpret_as_u64(a))); } + +////////// Reduce ///////// + +/** Reduce **/ +#define OPENCV_HAL_IMPL_AVX512_REDUCE_ADD64(a, b) a + b +#define OPENCV_HAL_IMPL_AVX512_REDUCE_8(sctype, func, _Tpvec, ifunc, scop) \ + inline sctype v_reduce_##func(const _Tpvec& a) \ + { __m256i half = _mm256_##ifunc(_v512_extract_low(a.val), _v512_extract_high(a.val)); \ + sctype CV_DECL_ALIGNED(64) idx[2]; \ + _mm_store_si128((__m128i*)idx, _mm_##ifunc(_mm256_castsi256_si128(half), _mm256_extracti128_si256(half, 1))); \ + return scop(idx[0], idx[1]); } +OPENCV_HAL_IMPL_AVX512_REDUCE_8(uint64, min, v_uint64x8, min_epu64, min) +OPENCV_HAL_IMPL_AVX512_REDUCE_8(uint64, max, v_uint64x8, max_epu64, max) +OPENCV_HAL_IMPL_AVX512_REDUCE_8(uint64, sum, v_uint64x8, add_epi64, OPENCV_HAL_IMPL_AVX512_REDUCE_ADD64) +OPENCV_HAL_IMPL_AVX512_REDUCE_8(int64, min, v_int64x8, min_epi64, min) +OPENCV_HAL_IMPL_AVX512_REDUCE_8(int64, max, v_int64x8, max_epi64, max) +OPENCV_HAL_IMPL_AVX512_REDUCE_8(int64, sum, v_int64x8, add_epi64, OPENCV_HAL_IMPL_AVX512_REDUCE_ADD64) + +#define OPENCV_HAL_IMPL_AVX512_REDUCE_8F(func, ifunc, scop) \ + inline double v_reduce_##func(const v_float64x8& a) \ + { __m256d half = _mm256_##ifunc(_v512_extract_low(a.val), _v512_extract_high(a.val)); \ + double CV_DECL_ALIGNED(64) idx[2]; \ + _mm_store_pd(idx, _mm_##ifunc(_mm256_castpd256_pd128(half), _mm256_extractf128_pd(half, 1))); \ + return scop(idx[0], idx[1]); } +OPENCV_HAL_IMPL_AVX512_REDUCE_8F(min, min_pd, min) +OPENCV_HAL_IMPL_AVX512_REDUCE_8F(max, max_pd, max) +OPENCV_HAL_IMPL_AVX512_REDUCE_8F(sum, add_pd, OPENCV_HAL_IMPL_AVX512_REDUCE_ADD64) + +#define OPENCV_HAL_IMPL_AVX512_REDUCE_16(sctype, func, _Tpvec, ifunc) \ + inline sctype v_reduce_##func(const _Tpvec& a) \ + { __m256i half = _mm256_##ifunc(_v512_extract_low(a.val), _v512_extract_high(a.val)); \ + __m128i quarter = _mm_##ifunc(_mm256_castsi256_si128(half), _mm256_extracti128_si256(half, 1)); \ + quarter = _mm_##ifunc(quarter, _mm_srli_si128(quarter, 8)); \ + quarter = _mm_##ifunc(quarter, _mm_srli_si128(quarter, 4)); \ + return (sctype)_mm_cvtsi128_si32(quarter); } +OPENCV_HAL_IMPL_AVX512_REDUCE_16(uint, min, v_uint32x16, min_epu32) +OPENCV_HAL_IMPL_AVX512_REDUCE_16(uint, max, v_uint32x16, max_epu32) +OPENCV_HAL_IMPL_AVX512_REDUCE_16(int, min, v_int32x16, min_epi32) +OPENCV_HAL_IMPL_AVX512_REDUCE_16(int, max, v_int32x16, max_epi32) + +#define OPENCV_HAL_IMPL_AVX512_REDUCE_16F(func, ifunc) \ + inline float v_reduce_##func(const v_float32x16& a) \ + { __m256 half = _mm256_##ifunc(_v512_extract_low(a.val), _v512_extract_high(a.val)); \ + __m128 quarter = _mm_##ifunc(_mm256_castps256_ps128(half), _mm256_extractf128_ps(half, 1)); \ + quarter = _mm_##ifunc(quarter, _mm_permute_ps(quarter, _MM_SHUFFLE(0, 0, 3, 2))); \ + quarter = _mm_##ifunc(quarter, _mm_permute_ps(quarter, _MM_SHUFFLE(0, 0, 0, 1))); \ + return _mm_cvtss_f32(quarter); } +OPENCV_HAL_IMPL_AVX512_REDUCE_16F(min, min_ps) +OPENCV_HAL_IMPL_AVX512_REDUCE_16F(max, max_ps) + +inline float v_reduce_sum(const v_float32x16& a) +{ + __m256 half 
= _mm256_add_ps(_v512_extract_low(a.val), _v512_extract_high(a.val)); + __m128 quarter = _mm_add_ps(_mm256_castps256_ps128(half), _mm256_extractf128_ps(half, 1)); + quarter = _mm_hadd_ps(quarter, quarter); + return _mm_cvtss_f32(_mm_hadd_ps(quarter, quarter)); +} +inline int v_reduce_sum(const v_int32x16& a) +{ + __m256i half = _mm256_add_epi32(_v512_extract_low(a.val), _v512_extract_high(a.val)); + __m128i quarter = _mm_add_epi32(_mm256_castsi256_si128(half), _mm256_extracti128_si256(half, 1)); + quarter = _mm_hadd_epi32(quarter, quarter); + return _mm_cvtsi128_si32(_mm_hadd_epi32(quarter, quarter)); +} +inline uint v_reduce_sum(const v_uint32x16& a) +{ return (uint)v_reduce_sum(v_reinterpret_as_s32(a)); } + +#define OPENCV_HAL_IMPL_AVX512_REDUCE_32(sctype, func, _Tpvec, ifunc) \ + inline sctype v_reduce_##func(const _Tpvec& a) \ + { __m256i half = _mm256_##ifunc(_v512_extract_low(a.val), _v512_extract_high(a.val)); \ + __m128i quarter = _mm_##ifunc(_mm256_castsi256_si128(half), _mm256_extracti128_si256(half, 1)); \ + quarter = _mm_##ifunc(quarter, _mm_srli_si128(quarter, 8)); \ + quarter = _mm_##ifunc(quarter, _mm_srli_si128(quarter, 4)); \ + quarter = _mm_##ifunc(quarter, _mm_srli_si128(quarter, 2)); \ + return (sctype)_mm_cvtsi128_si32(quarter); } +OPENCV_HAL_IMPL_AVX512_REDUCE_32(ushort, min, v_uint16x32, min_epu16) +OPENCV_HAL_IMPL_AVX512_REDUCE_32(ushort, max, v_uint16x32, max_epu16) +OPENCV_HAL_IMPL_AVX512_REDUCE_32(short, min, v_int16x32, min_epi16) +OPENCV_HAL_IMPL_AVX512_REDUCE_32(short, max, v_int16x32, max_epi16) + +inline int v_reduce_sum(const v_int16x32& a) +{ return v_reduce_sum(v_expand_low(a) + v_expand_high(a)); } +inline uint v_reduce_sum(const v_uint16x32& a) +{ return v_reduce_sum(v_expand_low(a) + v_expand_high(a)); } + +#define OPENCV_HAL_IMPL_AVX512_REDUCE_64(sctype, func, _Tpvec, ifunc) \ + inline sctype v_reduce_##func(const _Tpvec& a) \ + { __m256i half = _mm256_##ifunc(_v512_extract_low(a.val), _v512_extract_high(a.val)); \ + __m128i quarter = _mm_##ifunc(_mm256_castsi256_si128(half), _mm256_extracti128_si256(half, 1)); \ + quarter = _mm_##ifunc(quarter, _mm_srli_si128(quarter, 8)); \ + quarter = _mm_##ifunc(quarter, _mm_srli_si128(quarter, 4)); \ + quarter = _mm_##ifunc(quarter, _mm_srli_si128(quarter, 2)); \ + quarter = _mm_##ifunc(quarter, _mm_srli_si128(quarter, 1)); \ + return (sctype)_mm_cvtsi128_si32(quarter); } +OPENCV_HAL_IMPL_AVX512_REDUCE_64(uchar, min, v_uint8x64, min_epu8) +OPENCV_HAL_IMPL_AVX512_REDUCE_64(uchar, max, v_uint8x64, max_epu8) +OPENCV_HAL_IMPL_AVX512_REDUCE_64(schar, min, v_int8x64, min_epi8) +OPENCV_HAL_IMPL_AVX512_REDUCE_64(schar, max, v_int8x64, max_epi8) + +#define OPENCV_HAL_IMPL_AVX512_REDUCE_64_SUM(sctype, _Tpvec, suffix) \ + inline sctype v_reduce_sum(const _Tpvec& a) \ + { __m512i a16 = _mm512_add_epi16(_mm512_cvt##suffix##_epi16(_v512_extract_low(a.val)), \ + _mm512_cvt##suffix##_epi16(_v512_extract_high(a.val))); \ + a16 = _mm512_cvtepi16_epi32(_mm256_add_epi16(_v512_extract_low(a16), _v512_extract_high(a16))); \ + __m256i a8 = _mm256_add_epi32(_v512_extract_low(a16), _v512_extract_high(a16)); \ + __m128i a4 = _mm_add_epi32(_mm256_castsi256_si128(a8), _mm256_extracti128_si256(a8, 1)); \ + a4 = _mm_hadd_epi32(a4, a4); \ + return (sctype)_mm_cvtsi128_si32(_mm_hadd_epi32(a4, a4)); } +OPENCV_HAL_IMPL_AVX512_REDUCE_64_SUM(uint, v_uint8x64, epu8) +OPENCV_HAL_IMPL_AVX512_REDUCE_64_SUM(int, v_int8x64, epi8) + +inline v_float32x16 v_reduce_sum4(const v_float32x16& a, const v_float32x16& b, + const v_float32x16& c, const 
v_float32x16& d) +{ + __m256 abl = _mm256_hadd_ps(_v512_extract_low(a.val), _v512_extract_low(b.val)); + __m256 abh = _mm256_hadd_ps(_v512_extract_high(a.val), _v512_extract_high(b.val)); + __m256 cdl = _mm256_hadd_ps(_v512_extract_low(c.val), _v512_extract_low(d.val)); + __m256 cdh = _mm256_hadd_ps(_v512_extract_high(c.val), _v512_extract_high(d.val)); + return v_float32x16(_v512_combine(_mm256_hadd_ps(abl, cdl), _mm256_hadd_ps(abh, cdh))); +} + +inline unsigned v_reduce_sad(const v_uint8x64& a, const v_uint8x64& b) +{ + __m512i val = _mm512_sad_epu8(a.val, b.val); + __m256i half = _mm256_add_epi32(_v512_extract_low(val), _v512_extract_high(val)); + __m128i quarter = _mm_add_epi32(_mm256_castsi256_si128(half), _mm256_extracti128_si256(half, 1)); + return (unsigned)_mm_cvtsi128_si32(_mm_add_epi32(quarter, _mm_unpackhi_epi64(quarter, quarter))); +} +inline unsigned v_reduce_sad(const v_int8x64& a, const v_int8x64& b) +{ + __m512i val = _mm512_set1_epi8(-128); + val = _mm512_sad_epu8(_mm512_add_epi8(a.val, val), _mm512_add_epi8(b.val, val)); + __m256i half = _mm256_add_epi32(_v512_extract_low(val), _v512_extract_high(val)); + __m128i quarter = _mm_add_epi32(_mm256_castsi256_si128(half), _mm256_extracti128_si256(half, 1)); + return (unsigned)_mm_cvtsi128_si32(_mm_add_epi32(quarter, _mm_unpackhi_epi64(quarter, quarter))); +} +inline unsigned v_reduce_sad(const v_uint16x32& a, const v_uint16x32& b) +{ return v_reduce_sum(v_add_wrap(a - b, b - a)); } +inline unsigned v_reduce_sad(const v_int16x32& a, const v_int16x32& b) +{ return v_reduce_sum(v_reinterpret_as_u16(v_sub_wrap(v_max(a, b), v_min(a, b)))); } +inline unsigned v_reduce_sad(const v_uint32x16& a, const v_uint32x16& b) +{ return v_reduce_sum(v_max(a, b) - v_min(a, b)); } +inline unsigned v_reduce_sad(const v_int32x16& a, const v_int32x16& b) +{ return v_reduce_sum(v_reinterpret_as_u32(v_max(a, b) - v_min(a, b))); } +inline float v_reduce_sad(const v_float32x16& a, const v_float32x16& b) +{ return v_reduce_sum((a - b) & v_float32x16(_mm512_castsi512_ps(_mm512_set1_epi32(0x7fffffff)))); } +inline double v_reduce_sad(const v_float64x8& a, const v_float64x8& b) +{ return v_reduce_sum((a - b) & v_float64x8(_mm512_castsi512_pd(_mm512_set1_epi64(0x7fffffffffffffff)))); } + +/** Popcount **/ +inline v_uint8x64 v_popcount(const v_int8x64& a) +{ +#if CV_AVX_512BITALG + return v_uint8x64(_mm512_popcnt_epi8(a.val)); +#elif CV_AVX_512VBMI + __m512i _popcnt_table0 = _v512_set_epu8(7, 6, 6, 5, 6, 5, 5, 4, 6, 5, 5, 4, 5, 4, 4, 3, + 5, 4, 4, 3, 4, 3, 3, 2, 4, 3, 3, 2, 3, 2, 2, 1, + 5, 4, 4, 3, 4, 3, 3, 2, 4, 3, 3, 2, 3, 2, 2, 1, + 4, 3, 3, 2, 3, 2, 2, 1, 3, 2, 2, 1, 2, 1, 1, 0); + __m512i _popcnt_table1 = _v512_set_epu8(7, 6, 6, 5, 6, 5, 5, 4, 6, 5, 5, 4, 5, 4, 4, 3, + 6, 5, 5, 4, 5, 4, 4, 3, 5, 4, 4, 3, 4, 3, 3, 2, + 6, 5, 5, 4, 5, 4, 4, 3, 5, 4, 4, 3, 4, 3, 3, 2, + 5, 4, 4, 3, 4, 3, 3, 2, 4, 3, 3, 2, 3, 2, 2, 1); + return v_uint8x64(_mm512_sub_epi8(_mm512_permutex2var_epi8(_popcnt_table0, a.val, _popcnt_table1), _mm512_movm_epi8(_mm512_movepi8_mask(a.val)))); +#else + __m512i _popcnt_table = _mm512_set4_epi32(0x04030302, 0x03020201, 0x03020201, 0x02010100); + __m512i _popcnt_mask = _mm512_set1_epi8(0x0F); + + return v_uint8x64(_mm512_add_epi8(_mm512_shuffle_epi8(_popcnt_table, _mm512_and_si512( a.val, _popcnt_mask)), + _mm512_shuffle_epi8(_popcnt_table, _mm512_and_si512(_mm512_srli_epi16(a.val, 4), _popcnt_mask)))); +#endif +} +inline v_uint16x32 v_popcount(const v_int16x32& a) +{ +#if CV_AVX_512BITALG + return v_uint16x32(_mm512_popcnt_epi16(a.val)); 
+#elif CV_AVX_512VPOPCNTDQ + __m512i zero = _mm512_setzero_si512(); + return v_uint16x32(_mm512_packs_epi32(_mm512_popcnt_epi32(_mm512_unpacklo_epi16(a.val, zero)), + _mm512_popcnt_epi32(_mm512_unpackhi_epi16(a.val, zero)))); +#else + v_uint8x64 p = v_popcount(v_reinterpret_as_s8(a)); + p += v_rotate_right<1>(p); + return v_reinterpret_as_u16(p) & v512_setall_u16(0x00ff); +#endif +} +inline v_uint32x16 v_popcount(const v_int32x16& a) +{ +#if CV_AVX_512VPOPCNTDQ + return v_uint32x16(_mm512_popcnt_epi32(a.val)); +#else + v_uint8x64 p = v_popcount(v_reinterpret_as_s8(a)); + p += v_rotate_right<1>(p); + p += v_rotate_right<2>(p); + return v_reinterpret_as_u32(p) & v512_setall_u32(0x000000ff); +#endif +} +inline v_uint64x8 v_popcount(const v_int64x8& a) +{ +#if CV_AVX_512VPOPCNTDQ + return v_uint64x8(_mm512_popcnt_epi64(a.val)); +#else + return v_uint64x8(_mm512_sad_epu8(v_popcount(v_reinterpret_as_s8(a)).val, _mm512_setzero_si512())); +#endif +} + + +inline v_uint8x64 v_popcount(const v_uint8x64& a) { return v_popcount(v_reinterpret_as_s8 (a)); } +inline v_uint16x32 v_popcount(const v_uint16x32& a) { return v_popcount(v_reinterpret_as_s16(a)); } +inline v_uint32x16 v_popcount(const v_uint32x16& a) { return v_popcount(v_reinterpret_as_s32(a)); } +inline v_uint64x8 v_popcount(const v_uint64x8& a) { return v_popcount(v_reinterpret_as_s64(a)); } + + +////////// Other math ///////// + +/** Some frequent operations **/ +#define OPENCV_HAL_IMPL_AVX512_MULADD(_Tpvec, suffix) \ + inline _Tpvec v_fma(const _Tpvec& a, const _Tpvec& b, const _Tpvec& c) \ + { return _Tpvec(_mm512_fmadd_##suffix(a.val, b.val, c.val)); } \ + inline _Tpvec v_muladd(const _Tpvec& a, const _Tpvec& b, const _Tpvec& c) \ + { return _Tpvec(_mm512_fmadd_##suffix(a.val, b.val, c.val)); } \ + inline _Tpvec v_sqrt(const _Tpvec& x) \ + { return _Tpvec(_mm512_sqrt_##suffix(x.val)); } \ + inline _Tpvec v_sqr_magnitude(const _Tpvec& a, const _Tpvec& b) \ + { return v_fma(a, a, b * b); } \ + inline _Tpvec v_magnitude(const _Tpvec& a, const _Tpvec& b) \ + { return v_sqrt(v_fma(a, a, b * b)); } + +OPENCV_HAL_IMPL_AVX512_MULADD(v_float32x16, ps) +OPENCV_HAL_IMPL_AVX512_MULADD(v_float64x8, pd) + +inline v_int32x16 v_fma(const v_int32x16& a, const v_int32x16& b, const v_int32x16& c) +{ return a * b + c; } +inline v_int32x16 v_muladd(const v_int32x16& a, const v_int32x16& b, const v_int32x16& c) +{ return v_fma(a, b, c); } + +inline v_float32x16 v_invsqrt(const v_float32x16& x) +{ +#if CV_AVX_512ER + return v_float32x16(_mm512_rsqrt28_ps(x.val)); +#else + v_float32x16 half = x * v512_setall_f32(0.5); + v_float32x16 t = v_float32x16(_mm512_rsqrt14_ps(x.val)); + t *= v512_setall_f32(1.5) - ((t * t) * half); + return t; +#endif +} + +inline v_float64x8 v_invsqrt(const v_float64x8& x) +{ +#if CV_AVX_512ER + return v_float64x8(_mm512_rsqrt28_pd(x.val)); +#else + return v512_setall_f64(1.) 
/ v_sqrt(x); +// v_float64x8 half = x * v512_setall_f64(0.5); +// v_float64x8 t = v_float64x8(_mm512_rsqrt14_pd(x.val)); +// t *= v512_setall_f64(1.5) - ((t * t) * half); +// t *= v512_setall_f64(1.5) - ((t * t) * half); +// return t; +#endif +} + +/** Absolute values **/ +#define OPENCV_HAL_IMPL_AVX512_ABS(_Tpvec, _Tpuvec, suffix) \ + inline _Tpuvec v_abs(const _Tpvec& x) \ + { return _Tpuvec(_mm512_abs_##suffix(x.val)); } + +OPENCV_HAL_IMPL_AVX512_ABS(v_int8x64, v_uint8x64, epi8) +OPENCV_HAL_IMPL_AVX512_ABS(v_int16x32, v_uint16x32, epi16) +OPENCV_HAL_IMPL_AVX512_ABS(v_int32x16, v_uint32x16, epi32) +OPENCV_HAL_IMPL_AVX512_ABS(v_int64x8, v_uint64x8, epi64) + +inline v_float32x16 v_abs(const v_float32x16& x) +{ +#ifdef _mm512_abs_pd + return v_float32x16(_mm512_abs_ps(x.val)); +#else + return v_float32x16(_mm512_castsi512_ps(_mm512_and_si512(_mm512_castps_si512(x.val), + _v512_set_epu64(0x7FFFFFFF7FFFFFFF, 0x7FFFFFFF7FFFFFFF, 0x7FFFFFFF7FFFFFFF, 0x7FFFFFFF7FFFFFFF, + 0x7FFFFFFF7FFFFFFF, 0x7FFFFFFF7FFFFFFF, 0x7FFFFFFF7FFFFFFF, 0x7FFFFFFF7FFFFFFF)))); +#endif +} + +inline v_float64x8 v_abs(const v_float64x8& x) +{ +#ifdef _mm512_abs_pd + #if defined __GNUC__ && (__GNUC__ < 7 || (__GNUC__ == 7 && __GNUC_MINOR__ <= 3) || (__GNUC__ == 8 && __GNUC_MINOR__ <= 2)) + // Workaround for https://gcc.gnu.org/bugzilla/show_bug.cgi?id=87476 + return v_float64x8(_mm512_abs_pd(_mm512_castpd_ps(x.val))); + #else + return v_float64x8(_mm512_abs_pd(x.val)); + #endif +#else + return v_float64x8(_mm512_castsi512_pd(_mm512_and_si512(_mm512_castpd_si512(x.val), + _v512_set_epu64(0x7FFFFFFFFFFFFFFF, 0x7FFFFFFFFFFFFFFF, 0x7FFFFFFFFFFFFFFF, 0x7FFFFFFFFFFFFFFF, + 0x7FFFFFFFFFFFFFFF, 0x7FFFFFFFFFFFFFFF, 0x7FFFFFFFFFFFFFFF, 0x7FFFFFFFFFFFFFFF)))); +#endif +} + +/** Absolute difference **/ +inline v_uint8x64 v_absdiff(const v_uint8x64& a, const v_uint8x64& b) +{ return v_add_wrap(a - b, b - a); } +inline v_uint16x32 v_absdiff(const v_uint16x32& a, const v_uint16x32& b) +{ return v_add_wrap(a - b, b - a); } +inline v_uint32x16 v_absdiff(const v_uint32x16& a, const v_uint32x16& b) +{ return v_max(a, b) - v_min(a, b); } + +inline v_uint8x64 v_absdiff(const v_int8x64& a, const v_int8x64& b) +{ + v_int8x64 d = v_sub_wrap(a, b); + v_int8x64 m = a < b; + return v_reinterpret_as_u8(v_sub_wrap(d ^ m, m)); +} + +inline v_uint16x32 v_absdiff(const v_int16x32& a, const v_int16x32& b) +{ return v_reinterpret_as_u16(v_sub_wrap(v_max(a, b), v_min(a, b))); } + +inline v_uint32x16 v_absdiff(const v_int32x16& a, const v_int32x16& b) +{ + v_int32x16 d = a - b; + v_int32x16 m = a < b; + return v_reinterpret_as_u32((d ^ m) - m); +} + +inline v_float32x16 v_absdiff(const v_float32x16& a, const v_float32x16& b) +{ return v_abs(a - b); } + +inline v_float64x8 v_absdiff(const v_float64x8& a, const v_float64x8& b) +{ return v_abs(a - b); } + +/** Saturating absolute difference **/ +inline v_int8x64 v_absdiffs(const v_int8x64& a, const v_int8x64& b) +{ + v_int8x64 d = a - b; + v_int8x64 m = a < b; + return (d ^ m) - m; +} +inline v_int16x32 v_absdiffs(const v_int16x32& a, const v_int16x32& b) +{ return v_max(a, b) - v_min(a, b); } + +////////// Conversions ///////// + +/** Rounding **/ +inline v_int32x16 v_round(const v_float32x16& a) +{ return v_int32x16(_mm512_cvtps_epi32(a.val)); } + +inline v_int32x16 v_round(const v_float64x8& a) +{ return v_int32x16(_mm512_castsi256_si512(_mm512_cvtpd_epi32(a.val))); } + +inline v_int32x16 v_round(const v_float64x8& a, const v_float64x8& b) +{ return v_int32x16(_v512_combine(_mm512_cvtpd_epi32(a.val), 
_mm512_cvtpd_epi32(b.val))); } + +inline v_int32x16 v_trunc(const v_float32x16& a) +{ return v_int32x16(_mm512_cvttps_epi32(a.val)); } + +inline v_int32x16 v_trunc(const v_float64x8& a) +{ return v_int32x16(_mm512_castsi256_si512(_mm512_cvttpd_epi32(a.val))); } + +#if CVT_ROUND_MODES_IMPLEMENTED +inline v_int32x16 v_floor(const v_float32x16& a) +{ return v_int32x16(_mm512_cvt_roundps_epi32(a.val, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC)); } + +inline v_int32x16 v_floor(const v_float64x8& a) +{ return v_int32x16(_mm512_castsi256_si512(_mm512_cvt_roundpd_epi32(a.val, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC))); } + +inline v_int32x16 v_ceil(const v_float32x16& a) +{ return v_int32x16(_mm512_cvt_roundps_epi32(a.val, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC)); } + +inline v_int32x16 v_ceil(const v_float64x8& a) +{ return v_int32x16(_mm512_castsi256_si512(_mm512_cvt_roundpd_epi32(a.val, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC))); } +#else +inline v_int32x16 v_floor(const v_float32x16& a) +{ return v_int32x16(_mm512_cvtps_epi32(_mm512_roundscale_ps(a.val, 1))); } + +inline v_int32x16 v_floor(const v_float64x8& a) +{ return v_int32x16(_mm512_castsi256_si512(_mm512_cvtpd_epi32(_mm512_roundscale_pd(a.val, 1)))); } + +inline v_int32x16 v_ceil(const v_float32x16& a) +{ return v_int32x16(_mm512_cvtps_epi32(_mm512_roundscale_ps(a.val, 2))); } + +inline v_int32x16 v_ceil(const v_float64x8& a) +{ return v_int32x16(_mm512_castsi256_si512(_mm512_cvtpd_epi32(_mm512_roundscale_pd(a.val, 2)))); } +#endif + +/** To float **/ +inline v_float32x16 v_cvt_f32(const v_int32x16& a) +{ return v_float32x16(_mm512_cvtepi32_ps(a.val)); } + +inline v_float32x16 v_cvt_f32(const v_float64x8& a) +{ return v_float32x16(_mm512_cvtpd_pslo(a.val)); } + +inline v_float32x16 v_cvt_f32(const v_float64x8& a, const v_float64x8& b) +{ return v_float32x16(_v512_combine(_mm512_cvtpd_ps(a.val), _mm512_cvtpd_ps(b.val))); } + +inline v_float64x8 v_cvt_f64(const v_int32x16& a) +{ return v_float64x8(_mm512_cvtepi32_pd(_v512_extract_low(a.val))); } + +inline v_float64x8 v_cvt_f64_high(const v_int32x16& a) +{ return v_float64x8(_mm512_cvtepi32_pd(_v512_extract_high(a.val))); } + +inline v_float64x8 v_cvt_f64(const v_float32x16& a) +{ return v_float64x8(_mm512_cvtps_pd(_v512_extract_low(a.val))); } + +inline v_float64x8 v_cvt_f64_high(const v_float32x16& a) +{ return v_float64x8(_mm512_cvtps_pd(_v512_extract_high(a.val))); } + +// from (Mysticial and wim) https://stackoverflow.com/q/41144668 +inline v_float64x8 v_cvt_f64(const v_int64x8& v) +{ +#if CV_AVX_512DQ + return v_float64x8(_mm512_cvtepi64_pd(v.val)); +#else + // constants encoded as floating-point + __m512i magic_i_lo = _mm512_set1_epi64(0x4330000000000000); // 2^52 + __m512i magic_i_hi32 = _mm512_set1_epi64(0x4530000080000000); // 2^84 + 2^63 + __m512i magic_i_all = _mm512_set1_epi64(0x4530000080100000); // 2^84 + 2^63 + 2^52 + __m512d magic_d_all = _mm512_castsi512_pd(magic_i_all); + + // Blend the 32 lowest significant bits of v with magic_int_lo + __m512i v_lo = _mm512_mask_blend_epi32(0x5555, magic_i_lo, v.val); + // Extract the 32 most significant bits of v + __m512i v_hi = _mm512_srli_epi64(v.val, 32); + // Flip the msb of v_hi and blend with 0x45300000 + v_hi = _mm512_xor_si512(v_hi, magic_i_hi32); + // Compute in double precision + __m512d v_hi_dbl = _mm512_sub_pd(_mm512_castsi512_pd(v_hi), magic_d_all); + // (v_hi - magic_d_all) + v_lo Do not assume associativity of floating point addition + __m512d result = _mm512_add_pd(v_hi_dbl, _mm512_castsi512_pd(v_lo)); + return 
v_float64x8(result); +#endif +} + +////////////// Lookup table access //////////////////// + +inline v_int8x64 v512_lut(const schar* tab, const int* idx) +{ + __m128i p0 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_mm512_loadu_si512((const __m512i*)idx ), (const int *)tab, 1)); + __m128i p1 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_mm512_loadu_si512((const __m512i*)idx + 1), (const int *)tab, 1)); + __m128i p2 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_mm512_loadu_si512((const __m512i*)idx + 2), (const int *)tab, 1)); + __m128i p3 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_mm512_loadu_si512((const __m512i*)idx + 3), (const int *)tab, 1)); + return v_int8x64(_mm512_inserti32x4(_mm512_inserti32x4(_mm512_inserti32x4(_mm512_castsi128_si512(p0), p1, 1), p2, 2), p3, 3)); +} +inline v_int8x64 v512_lut_pairs(const schar* tab, const int* idx) +{ + __m256i p0 = _mm512_cvtepi32_epi16(_mm512_i32gather_epi32(_mm512_loadu_si512((const __m512i*)idx ), (const int *)tab, 1)); + __m256i p1 = _mm512_cvtepi32_epi16(_mm512_i32gather_epi32(_mm512_loadu_si512((const __m512i*)idx + 1), (const int *)tab, 1)); + return v_int8x64(_v512_combine(p0, p1)); +} +inline v_int8x64 v512_lut_quads(const schar* tab, const int* idx) +{ + return v_int8x64(_mm512_i32gather_epi32(_mm512_loadu_si512((const __m512i*)idx), (const int *)tab, 1)); +} +inline v_uint8x64 v512_lut(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v512_lut((const schar *)tab, idx)); } +inline v_uint8x64 v512_lut_pairs(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v512_lut_pairs((const schar *)tab, idx)); } +inline v_uint8x64 v512_lut_quads(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v512_lut_quads((const schar *)tab, idx)); } + +inline v_int16x32 v512_lut(const short* tab, const int* idx) +{ + __m256i p0 = _mm512_cvtepi32_epi16(_mm512_i32gather_epi32(_mm512_loadu_si512((const __m512i*)idx ), (const int *)tab, 2)); + __m256i p1 = _mm512_cvtepi32_epi16(_mm512_i32gather_epi32(_mm512_loadu_si512((const __m512i*)idx + 1), (const int *)tab, 2)); + return v_int16x32(_v512_combine(p0, p1)); +} +inline v_int16x32 v512_lut_pairs(const short* tab, const int* idx) +{ + return v_int16x32(_mm512_i32gather_epi32(_mm512_loadu_si512((const __m512i*)idx), (const int *)tab, 2)); +} +inline v_int16x32 v512_lut_quads(const short* tab, const int* idx) +{ +#if defined(__GNUC__) + return v_int16x32(_mm512_i32gather_epi64(_mm256_loadu_si256((const __m256i*)idx), (const long long int*)tab, 2)); +#else + return v_int16x32(_mm512_i32gather_epi64(_mm256_loadu_si256((const __m256i*)idx), (const int64*)tab, 2)); +#endif +} +inline v_uint16x32 v512_lut(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v512_lut((const short *)tab, idx)); } +inline v_uint16x32 v512_lut_pairs(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v512_lut_pairs((const short *)tab, idx)); } +inline v_uint16x32 v512_lut_quads(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v512_lut_quads((const short *)tab, idx)); } + +inline v_int32x16 v512_lut(const int* tab, const int* idx) +{ + return v_int32x16(_mm512_i32gather_epi32(_mm512_loadu_si512((const __m512i*)idx), tab, 4)); +} +inline v_int32x16 v512_lut_pairs(const int* tab, const int* idx) +{ +#if defined(__GNUC__) + return v_int32x16(_mm512_i32gather_epi64(_mm256_loadu_si256((const __m256i*)idx), (const long long int*)tab, 4)); +#else + return v_int32x16(_mm512_i32gather_epi64(_mm256_loadu_si256((const __m256i*)idx), (const int64*)tab, 4)); +#endif +} 
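// --- Editorial illustration (not part of the OpenCV patch itself) ---------------------------
// The v512_lut* wrappers above hide AVX-512 gather instructions behind OpenCV's universal
// intrinsics LUT API, and the v_reduce_* helpers earlier in this header provide the horizontal
// folds. A minimal usage sketch, assuming a build in which CV_SIMD512 is enabled and this
// header is reached through <opencv2/core/hal/intrin.hpp>; sum_table_entries() is a
// hypothetical helper written for this note only, not an OpenCV function:
//
//     #include <opencv2/core/hal/intrin.hpp>
//
//     #if CV_SIMD512
//     static int sum_table_entries(const int* tab, const int* idx /* at least 16 indices */)
//     {
//         using namespace cv;
//         v_int32x16 gathered = v512_lut(tab, idx);   // one 16-lane gather of tab[idx[i]]
//         return v_reduce_sum(gathered);              // horizontal add defined earlier in this header
//     }
//     #endif
// ---------------------------------------------------------------------------------------------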
+inline v_int32x16 v512_lut_quads(const int* tab, const int* idx) +{ + return v_int32x16(_mm512_inserti32x4(_mm512_inserti32x4(_mm512_inserti32x4(_mm512_castsi128_si512( + _mm_loadu_si128((const __m128i*)(tab + idx[0]))), + _mm_loadu_si128((const __m128i*)(tab + idx[1])), 1), + _mm_loadu_si128((const __m128i*)(tab + idx[2])), 2), + _mm_loadu_si128((const __m128i*)(tab + idx[3])), 3)); +} +inline v_uint32x16 v512_lut(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v512_lut((const int *)tab, idx)); } +inline v_uint32x16 v512_lut_pairs(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v512_lut_pairs((const int *)tab, idx)); } +inline v_uint32x16 v512_lut_quads(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v512_lut_quads((const int *)tab, idx)); } + +inline v_int64x8 v512_lut(const int64* tab, const int* idx) +{ +#if defined(__GNUC__) + return v_int64x8(_mm512_i32gather_epi64(_mm256_loadu_si256((const __m256i*)idx), (const long long int*)tab, 8)); +#else + return v_int64x8(_mm512_i32gather_epi64(_mm256_loadu_si256((const __m256i*)idx), tab , 8)); +#endif +} +inline v_int64x8 v512_lut_pairs(const int64* tab, const int* idx) +{ + return v_int64x8(_mm512_inserti32x4(_mm512_inserti32x4(_mm512_inserti32x4(_mm512_castsi128_si512( + _mm_loadu_si128((const __m128i*)(tab + idx[0]))), + _mm_loadu_si128((const __m128i*)(tab + idx[1])), 1), + _mm_loadu_si128((const __m128i*)(tab + idx[2])), 2), + _mm_loadu_si128((const __m128i*)(tab + idx[3])), 3)); +} +inline v_uint64x8 v512_lut(const uint64* tab, const int* idx) { return v_reinterpret_as_u64(v512_lut((const int64 *)tab, idx)); } +inline v_uint64x8 v512_lut_pairs(const uint64* tab, const int* idx) { return v_reinterpret_as_u64(v512_lut_pairs((const int64 *)tab, idx)); } + +inline v_float32x16 v512_lut(const float* tab, const int* idx) +{ + return v_float32x16(_mm512_i32gather_ps(_mm512_loadu_si512((const __m512i*)idx), tab, 4)); +} +inline v_float32x16 v512_lut_pairs(const float* tab, const int* idx) { return v_reinterpret_as_f32(v512_lut_pairs((const int *)tab, idx)); } +inline v_float32x16 v512_lut_quads(const float* tab, const int* idx) { return v_reinterpret_as_f32(v512_lut_quads((const int *)tab, idx)); } + +inline v_float64x8 v512_lut(const double* tab, const int* idx) +{ + return v_float64x8(_mm512_i32gather_pd(_mm256_loadu_si256((const __m256i*)idx), tab, 8)); +} +inline v_float64x8 v512_lut_pairs(const double* tab, const int* idx) +{ + return v_float64x8(_mm512_insertf64x2(_mm512_insertf64x2(_mm512_insertf64x2(_mm512_castpd128_pd512( + _mm_loadu_pd(tab + idx[0])), + _mm_loadu_pd(tab + idx[1]), 1), + _mm_loadu_pd(tab + idx[2]), 2), + _mm_loadu_pd(tab + idx[3]), 3)); +} + +inline v_int32x16 v_lut(const int* tab, const v_int32x16& idxvec) +{ + return v_int32x16(_mm512_i32gather_epi32(idxvec.val, tab, 4)); +} + +inline v_uint32x16 v_lut(const unsigned* tab, const v_int32x16& idxvec) +{ + return v_reinterpret_as_u32(v_lut((const int *)tab, idxvec)); +} + +inline v_float32x16 v_lut(const float* tab, const v_int32x16& idxvec) +{ + return v_float32x16(_mm512_i32gather_ps(idxvec.val, tab, 4)); +} + +inline v_float64x8 v_lut(const double* tab, const v_int32x16& idxvec) +{ + return v_float64x8(_mm512_i32gather_pd(_v512_extract_low(idxvec.val), tab, 8)); +} + +inline void v_lut_deinterleave(const float* tab, const v_int32x16& idxvec, v_float32x16& x, v_float32x16& y) +{ + x.val = _mm512_i32gather_ps(idxvec.val, tab, 4); + y.val = _mm512_i32gather_ps(idxvec.val, &tab[1], 4); +} + +inline void 
v_lut_deinterleave(const double* tab, const v_int32x16& idxvec, v_float64x8& x, v_float64x8& y) +{ + x.val = _mm512_i32gather_pd(_v512_extract_low(idxvec.val), tab, 8); + y.val = _mm512_i32gather_pd(_v512_extract_low(idxvec.val), &tab[1], 8); +} + +inline v_int8x64 v_interleave_pairs(const v_int8x64& vec) +{ + return v_int8x64(_mm512_shuffle_epi8(vec.val, _mm512_set4_epi32(0x0f0d0e0c, 0x0b090a08, 0x07050604, 0x03010200))); +} +inline v_uint8x64 v_interleave_pairs(const v_uint8x64& vec) { return v_reinterpret_as_u8(v_interleave_pairs(v_reinterpret_as_s8(vec))); } +inline v_int8x64 v_interleave_quads(const v_int8x64& vec) +{ + return v_int8x64(_mm512_shuffle_epi8(vec.val, _mm512_set4_epi32(0x0f0b0e0a, 0x0d090c08, 0x07030602, 0x05010400))); +} +inline v_uint8x64 v_interleave_quads(const v_uint8x64& vec) { return v_reinterpret_as_u8(v_interleave_quads(v_reinterpret_as_s8(vec))); } + +inline v_int16x32 v_interleave_pairs(const v_int16x32& vec) +{ + return v_int16x32(_mm512_shuffle_epi8(vec.val, _mm512_set4_epi32(0x0f0e0b0a, 0x0d0c0908, 0x07060302, 0x05040100))); +} +inline v_uint16x32 v_interleave_pairs(const v_uint16x32& vec) { return v_reinterpret_as_u16(v_interleave_pairs(v_reinterpret_as_s16(vec))); } +inline v_int16x32 v_interleave_quads(const v_int16x32& vec) +{ + return v_int16x32(_mm512_shuffle_epi8(vec.val, _mm512_set4_epi32(0x0f0e0706, 0x0d0c0504, 0x0b0a0302, 0x09080100))); +} +inline v_uint16x32 v_interleave_quads(const v_uint16x32& vec) { return v_reinterpret_as_u16(v_interleave_quads(v_reinterpret_as_s16(vec))); } + +inline v_int32x16 v_interleave_pairs(const v_int32x16& vec) +{ + return v_int32x16(_mm512_shuffle_epi32(vec.val, _MM_PERM_ACBD)); +} +inline v_uint32x16 v_interleave_pairs(const v_uint32x16& vec) { return v_reinterpret_as_u32(v_interleave_pairs(v_reinterpret_as_s32(vec))); } +inline v_float32x16 v_interleave_pairs(const v_float32x16& vec) { return v_reinterpret_as_f32(v_interleave_pairs(v_reinterpret_as_s32(vec))); } + +inline v_int8x64 v_pack_triplets(const v_int8x64& vec) +{ + return v_int8x64(_mm512_permutexvar_epi32(_v512_set_epu64(0x0000000f0000000f, 0x0000000f0000000f, 0x0000000e0000000d, 0x0000000c0000000a, + 0x0000000900000008, 0x0000000600000005, 0x0000000400000002, 0x0000000100000000), + _mm512_shuffle_epi8(vec.val, _mm512_set4_epi32(0xffffff0f, 0x0e0d0c0a, 0x09080605, 0x04020100)))); +} +inline v_uint8x64 v_pack_triplets(const v_uint8x64& vec) { return v_reinterpret_as_u8(v_pack_triplets(v_reinterpret_as_s8(vec))); } + +inline v_int16x32 v_pack_triplets(const v_int16x32& vec) +{ + return v_int16x32(_mm512_permutexvar_epi16(_v512_set_epu64(0x001f001f001f001f, 0x001f001f001f001f, 0x001e001d001c001a, 0x0019001800160015, + 0x0014001200110010, 0x000e000d000c000a, 0x0009000800060005, 0x0004000200010000), vec.val)); +} +inline v_uint16x32 v_pack_triplets(const v_uint16x32& vec) { return v_reinterpret_as_u16(v_pack_triplets(v_reinterpret_as_s16(vec))); } + +inline v_int32x16 v_pack_triplets(const v_int32x16& vec) +{ + return v_int32x16(_mm512_permutexvar_epi32(_v512_set_epu64(0x0000000f0000000f, 0x0000000f0000000f, 0x0000000e0000000d, 0x0000000c0000000a, + 0x0000000900000008, 0x0000000600000005, 0x0000000400000002, 0x0000000100000000), vec.val)); +} +inline v_uint32x16 v_pack_triplets(const v_uint32x16& vec) { return v_reinterpret_as_u32(v_pack_triplets(v_reinterpret_as_s32(vec))); } +inline v_float32x16 v_pack_triplets(const v_float32x16& vec) +{ + return v_float32x16(_mm512_permutexvar_ps(_v512_set_epu64(0x0000000f0000000f, 0x0000000f0000000f, 0x0000000e0000000d, 
0x0000000c0000000a, + 0x0000000900000008, 0x0000000600000005, 0x0000000400000002, 0x0000000100000000), vec.val)); +} + +////////// Matrix operations ///////// + +//////// Dot Product //////// + +// 16 >> 32 +inline v_int32x16 v_dotprod(const v_int16x32& a, const v_int16x32& b) +{ return v_int32x16(_mm512_madd_epi16(a.val, b.val)); } +inline v_int32x16 v_dotprod(const v_int16x32& a, const v_int16x32& b, const v_int32x16& c) +{ return v_dotprod(a, b) + c; } + +// 32 >> 64 +inline v_int64x8 v_dotprod(const v_int32x16& a, const v_int32x16& b) +{ + __m512i even = _mm512_mul_epi32(a.val, b.val); + __m512i odd = _mm512_mul_epi32(_mm512_srli_epi64(a.val, 32), _mm512_srli_epi64(b.val, 32)); + return v_int64x8(_mm512_add_epi64(even, odd)); +} +inline v_int64x8 v_dotprod(const v_int32x16& a, const v_int32x16& b, const v_int64x8& c) +{ return v_dotprod(a, b) + c; } + +// 8 >> 32 +inline v_uint32x16 v_dotprod_expand(const v_uint8x64& a, const v_uint8x64& b) +{ + __m512i even_a = _mm512_mask_blend_epi8(0xAAAAAAAAAAAAAAAA, a.val, _mm512_setzero_si512()); + __m512i odd_a = _mm512_srli_epi16(a.val, 8); + + __m512i even_b = _mm512_mask_blend_epi8(0xAAAAAAAAAAAAAAAA, b.val, _mm512_setzero_si512()); + __m512i odd_b = _mm512_srli_epi16(b.val, 8); + + __m512i prod0 = _mm512_madd_epi16(even_a, even_b); + __m512i prod1 = _mm512_madd_epi16(odd_a, odd_b); + return v_uint32x16(_mm512_add_epi32(prod0, prod1)); +} +inline v_uint32x16 v_dotprod_expand(const v_uint8x64& a, const v_uint8x64& b, const v_uint32x16& c) +{ return v_dotprod_expand(a, b) + c; } + +inline v_int32x16 v_dotprod_expand(const v_int8x64& a, const v_int8x64& b) +{ + __m512i even_a = _mm512_srai_epi16(_mm512_bslli_epi128(a.val, 1), 8); + __m512i odd_a = _mm512_srai_epi16(a.val, 8); + + __m512i even_b = _mm512_srai_epi16(_mm512_bslli_epi128(b.val, 1), 8); + __m512i odd_b = _mm512_srai_epi16(b.val, 8); + + __m512i prod0 = _mm512_madd_epi16(even_a, even_b); + __m512i prod1 = _mm512_madd_epi16(odd_a, odd_b); + return v_int32x16(_mm512_add_epi32(prod0, prod1)); +} +inline v_int32x16 v_dotprod_expand(const v_int8x64& a, const v_int8x64& b, const v_int32x16& c) +{ return v_dotprod_expand(a, b) + c; } + +// 16 >> 64 +inline v_uint64x8 v_dotprod_expand(const v_uint16x32& a, const v_uint16x32& b) +{ + __m512i mullo = _mm512_mullo_epi16(a.val, b.val); + __m512i mulhi = _mm512_mulhi_epu16(a.val, b.val); + __m512i mul0 = _mm512_unpacklo_epi16(mullo, mulhi); + __m512i mul1 = _mm512_unpackhi_epi16(mullo, mulhi); + + __m512i p02 = _mm512_mask_blend_epi32(0xAAAA, mul0, _mm512_setzero_si512()); + __m512i p13 = _mm512_srli_epi64(mul0, 32); + __m512i p46 = _mm512_mask_blend_epi32(0xAAAA, mul1, _mm512_setzero_si512()); + __m512i p57 = _mm512_srli_epi64(mul1, 32); + + __m512i p15_ = _mm512_add_epi64(p02, p13); + __m512i p9d_ = _mm512_add_epi64(p46, p57); + + return v_uint64x8(_mm512_add_epi64( + _mm512_unpacklo_epi64(p15_, p9d_), + _mm512_unpackhi_epi64(p15_, p9d_) + )); +} +inline v_uint64x8 v_dotprod_expand(const v_uint16x32& a, const v_uint16x32& b, const v_uint64x8& c) +{ return v_dotprod_expand(a, b) + c; } + +inline v_int64x8 v_dotprod_expand(const v_int16x32& a, const v_int16x32& b) +{ + __m512i prod = _mm512_madd_epi16(a.val, b.val); + __m512i even = _mm512_srai_epi64(_mm512_bslli_epi128(prod, 4), 32); + __m512i odd = _mm512_srai_epi64(prod, 32); + return v_int64x8(_mm512_add_epi64(even, odd)); +} +inline v_int64x8 v_dotprod_expand(const v_int16x32& a, const v_int16x32& b, const v_int64x8& c) +{ return v_dotprod_expand(a, b) + c; } + +// 32 >> 64f +inline 
v_float64x8 v_dotprod_expand(const v_int32x16& a, const v_int32x16& b) +{ return v_cvt_f64(v_dotprod(a, b)); } +inline v_float64x8 v_dotprod_expand(const v_int32x16& a, const v_int32x16& b, const v_float64x8& c) +{ return v_dotprod_expand(a, b) + c; } + +//////// Fast Dot Product //////// + +// 16 >> 32 +inline v_int32x16 v_dotprod_fast(const v_int16x32& a, const v_int16x32& b) +{ return v_dotprod(a, b); } +inline v_int32x16 v_dotprod_fast(const v_int16x32& a, const v_int16x32& b, const v_int32x16& c) +{ return v_dotprod(a, b, c); } + +// 32 >> 64 +inline v_int64x8 v_dotprod_fast(const v_int32x16& a, const v_int32x16& b) +{ return v_dotprod(a, b); } +inline v_int64x8 v_dotprod_fast(const v_int32x16& a, const v_int32x16& b, const v_int64x8& c) +{ return v_dotprod(a, b, c); } + +// 8 >> 32 +inline v_uint32x16 v_dotprod_expand_fast(const v_uint8x64& a, const v_uint8x64& b) +{ return v_dotprod_expand(a, b); } +inline v_uint32x16 v_dotprod_expand_fast(const v_uint8x64& a, const v_uint8x64& b, const v_uint32x16& c) +{ return v_dotprod_expand(a, b, c); } + +inline v_int32x16 v_dotprod_expand_fast(const v_int8x64& a, const v_int8x64& b) +{ return v_dotprod_expand(a, b); } +inline v_int32x16 v_dotprod_expand_fast(const v_int8x64& a, const v_int8x64& b, const v_int32x16& c) +{ return v_dotprod_expand(a, b, c); } + +// 16 >> 64 +inline v_uint64x8 v_dotprod_expand_fast(const v_uint16x32& a, const v_uint16x32& b) +{ + __m512i mullo = _mm512_mullo_epi16(a.val, b.val); + __m512i mulhi = _mm512_mulhi_epu16(a.val, b.val); + __m512i mul0 = _mm512_unpacklo_epi16(mullo, mulhi); + __m512i mul1 = _mm512_unpackhi_epi16(mullo, mulhi); + + __m512i p02 = _mm512_mask_blend_epi32(0xAAAA, mul0, _mm512_setzero_si512()); + __m512i p13 = _mm512_srli_epi64(mul0, 32); + __m512i p46 = _mm512_mask_blend_epi32(0xAAAA, mul1, _mm512_setzero_si512()); + __m512i p57 = _mm512_srli_epi64(mul1, 32); + + __m512i p15_ = _mm512_add_epi64(p02, p13); + __m512i p9d_ = _mm512_add_epi64(p46, p57); + return v_uint64x8(_mm512_add_epi64(p15_, p9d_)); +} +inline v_uint64x8 v_dotprod_expand_fast(const v_uint16x32& a, const v_uint16x32& b, const v_uint64x8& c) +{ return v_dotprod_expand_fast(a, b) + c; } + +inline v_int64x8 v_dotprod_expand_fast(const v_int16x32& a, const v_int16x32& b) +{ return v_dotprod_expand(a, b); } +inline v_int64x8 v_dotprod_expand_fast(const v_int16x32& a, const v_int16x32& b, const v_int64x8& c) +{ return v_dotprod_expand(a, b, c); } + +// 32 >> 64f +inline v_float64x8 v_dotprod_expand_fast(const v_int32x16& a, const v_int32x16& b) +{ return v_dotprod_expand(a, b); } +inline v_float64x8 v_dotprod_expand_fast(const v_int32x16& a, const v_int32x16& b, const v_float64x8& c) +{ return v_dotprod_expand(a, b) + c; } + + +#define OPENCV_HAL_AVX512_SPLAT2_PS(a, im) \ + v_float32x16(_mm512_permute_ps(a.val, _MM_SHUFFLE(im, im, im, im))) + +inline v_float32x16 v_matmul(const v_float32x16& v, + const v_float32x16& m0, const v_float32x16& m1, + const v_float32x16& m2, const v_float32x16& m3) +{ + v_float32x16 v04 = OPENCV_HAL_AVX512_SPLAT2_PS(v, 0); + v_float32x16 v15 = OPENCV_HAL_AVX512_SPLAT2_PS(v, 1); + v_float32x16 v26 = OPENCV_HAL_AVX512_SPLAT2_PS(v, 2); + v_float32x16 v37 = OPENCV_HAL_AVX512_SPLAT2_PS(v, 3); + return v_fma(v04, m0, v_fma(v15, m1, v_fma(v26, m2, v37 * m3))); +} + +inline v_float32x16 v_matmuladd(const v_float32x16& v, + const v_float32x16& m0, const v_float32x16& m1, + const v_float32x16& m2, const v_float32x16& a) +{ + v_float32x16 v04 = OPENCV_HAL_AVX512_SPLAT2_PS(v, 0); + v_float32x16 v15 = 
OPENCV_HAL_AVX512_SPLAT2_PS(v, 1); + v_float32x16 v26 = OPENCV_HAL_AVX512_SPLAT2_PS(v, 2); + return v_fma(v04, m0, v_fma(v15, m1, v_fma(v26, m2, a))); +} + +#define OPENCV_HAL_IMPL_AVX512_TRANSPOSE4x4(_Tpvec, suffix, cast_from, cast_to) \ + inline void v_transpose4x4(const _Tpvec& a0, const _Tpvec& a1, \ + const _Tpvec& a2, const _Tpvec& a3, \ + _Tpvec& b0, _Tpvec& b1, _Tpvec& b2, _Tpvec& b3) \ + { \ + __m512i t0 = cast_from(_mm512_unpacklo_##suffix(a0.val, a1.val)); \ + __m512i t1 = cast_from(_mm512_unpacklo_##suffix(a2.val, a3.val)); \ + __m512i t2 = cast_from(_mm512_unpackhi_##suffix(a0.val, a1.val)); \ + __m512i t3 = cast_from(_mm512_unpackhi_##suffix(a2.val, a3.val)); \ + b0.val = cast_to(_mm512_unpacklo_epi64(t0, t1)); \ + b1.val = cast_to(_mm512_unpackhi_epi64(t0, t1)); \ + b2.val = cast_to(_mm512_unpacklo_epi64(t2, t3)); \ + b3.val = cast_to(_mm512_unpackhi_epi64(t2, t3)); \ + } + +OPENCV_HAL_IMPL_AVX512_TRANSPOSE4x4(v_uint32x16, epi32, OPENCV_HAL_NOP, OPENCV_HAL_NOP) +OPENCV_HAL_IMPL_AVX512_TRANSPOSE4x4(v_int32x16, epi32, OPENCV_HAL_NOP, OPENCV_HAL_NOP) +OPENCV_HAL_IMPL_AVX512_TRANSPOSE4x4(v_float32x16, ps, _mm512_castps_si512, _mm512_castsi512_ps) + +//////////////// Value reordering /////////////// + +/* Expand */ +#define OPENCV_HAL_IMPL_AVX512_EXPAND(_Tpvec, _Tpwvec, _Tp, intrin) \ + inline void v_expand(const _Tpvec& a, _Tpwvec& b0, _Tpwvec& b1) \ + { \ + b0.val = intrin(_v512_extract_low(a.val)); \ + b1.val = intrin(_v512_extract_high(a.val)); \ + } \ + inline _Tpwvec v_expand_low(const _Tpvec& a) \ + { return _Tpwvec(intrin(_v512_extract_low(a.val))); } \ + inline _Tpwvec v_expand_high(const _Tpvec& a) \ + { return _Tpwvec(intrin(_v512_extract_high(a.val))); } \ + inline _Tpwvec v512_load_expand(const _Tp* ptr) \ + { \ + __m256i a = _mm256_loadu_si256((const __m256i*)ptr); \ + return _Tpwvec(intrin(a)); \ + } + +OPENCV_HAL_IMPL_AVX512_EXPAND(v_uint8x64, v_uint16x32, uchar, _mm512_cvtepu8_epi16) +OPENCV_HAL_IMPL_AVX512_EXPAND(v_int8x64, v_int16x32, schar, _mm512_cvtepi8_epi16) +OPENCV_HAL_IMPL_AVX512_EXPAND(v_uint16x32, v_uint32x16, ushort, _mm512_cvtepu16_epi32) +OPENCV_HAL_IMPL_AVX512_EXPAND(v_int16x32, v_int32x16, short, _mm512_cvtepi16_epi32) +OPENCV_HAL_IMPL_AVX512_EXPAND(v_uint32x16, v_uint64x8, unsigned, _mm512_cvtepu32_epi64) +OPENCV_HAL_IMPL_AVX512_EXPAND(v_int32x16, v_int64x8, int, _mm512_cvtepi32_epi64) + +#define OPENCV_HAL_IMPL_AVX512_EXPAND_Q(_Tpvec, _Tp, intrin) \ + inline _Tpvec v512_load_expand_q(const _Tp* ptr) \ + { \ + __m128i a = _mm_loadu_si128((const __m128i*)ptr); \ + return _Tpvec(intrin(a)); \ + } + +OPENCV_HAL_IMPL_AVX512_EXPAND_Q(v_uint32x16, uchar, _mm512_cvtepu8_epi32) +OPENCV_HAL_IMPL_AVX512_EXPAND_Q(v_int32x16, schar, _mm512_cvtepi8_epi32) + +/* pack */ +// 16 +inline v_int8x64 v_pack(const v_int16x32& a, const v_int16x32& b) +{ return v_int8x64(_mm512_permutexvar_epi64(_v512_set_epu64(7, 5, 3, 1, 6, 4, 2, 0), _mm512_packs_epi16(a.val, b.val))); } + +inline v_uint8x64 v_pack(const v_uint16x32& a, const v_uint16x32& b) +{ + const __m512i t = _mm512_set1_epi16(255); + return v_uint8x64(_v512_combine(_mm512_cvtepi16_epi8(_mm512_min_epu16(a.val, t)), _mm512_cvtepi16_epi8(_mm512_min_epu16(b.val, t)))); +} + +inline v_uint8x64 v_pack_u(const v_int16x32& a, const v_int16x32& b) +{ + return v_uint8x64(_mm512_permutexvar_epi64(_v512_set_epu64(7, 5, 3, 1, 6, 4, 2, 0), _mm512_packus_epi16(a.val, b.val))); +} + +inline void v_pack_store(schar* ptr, const v_int16x32& a) +{ v_store_low(ptr, v_pack(a, a)); } + +inline void v_pack_store(uchar* ptr, const 
v_uint16x32& a)
+{
+    const __m512i m = _mm512_set1_epi16(255);
+    _mm256_storeu_si256((__m256i*)ptr, _mm512_cvtepi16_epi8(_mm512_min_epu16(a.val, m)));
+}
+
+inline void v_pack_u_store(uchar* ptr, const v_int16x32& a)
+{ v_store_low(ptr, v_pack_u(a, a)); }
+
+template<int n> inline
+v_uint8x64 v_rshr_pack(const v_uint16x32& a, const v_uint16x32& b)
+{
+    // we assume that n > 0, and so the shifted 16-bit values can be treated as signed numbers.
+    v_uint16x32 delta = v512_setall_u16((short)(1 << (n-1)));
+    return v_pack_u(v_reinterpret_as_s16((a + delta) >> n),
+                    v_reinterpret_as_s16((b + delta) >> n));
+}
+
+template<int n> inline
+void v_rshr_pack_store(uchar* ptr, const v_uint16x32& a)
+{
+    v_uint16x32 delta = v512_setall_u16((short)(1 << (n-1)));
+    v_pack_u_store(ptr, v_reinterpret_as_s16((a + delta) >> n));
+}
+
+template<int n> inline
+v_uint8x64 v_rshr_pack_u(const v_int16x32& a, const v_int16x32& b)
+{
+    v_int16x32 delta = v512_setall_s16((short)(1 << (n-1)));
+    return v_pack_u((a + delta) >> n, (b + delta) >> n);
+}
+
+template<int n> inline
+void v_rshr_pack_u_store(uchar* ptr, const v_int16x32& a)
+{
+    v_int16x32 delta = v512_setall_s16((short)(1 << (n-1)));
+    v_pack_u_store(ptr, (a + delta) >> n);
+}
+
+template<int n> inline
+v_int8x64 v_rshr_pack(const v_int16x32& a, const v_int16x32& b)
+{
+    v_int16x32 delta = v512_setall_s16((short)(1 << (n-1)));
+    return v_pack((a + delta) >> n, (b + delta) >> n);
+}
+
+template<int n> inline
+void v_rshr_pack_store(schar* ptr, const v_int16x32& a)
+{
+    v_int16x32 delta = v512_setall_s16((short)(1 << (n-1)));
+    v_pack_store(ptr, (a + delta) >> n);
+}
+
+// 32
+inline v_int16x32 v_pack(const v_int32x16& a, const v_int32x16& b)
+{ return v_int16x32(_mm512_permutexvar_epi64(_v512_set_epu64(7, 5, 3, 1, 6, 4, 2, 0), _mm512_packs_epi32(a.val, b.val))); }
+
+inline v_uint16x32 v_pack(const v_uint32x16& a, const v_uint32x16& b)
+{
+    const __m512i m = _mm512_set1_epi32(65535);
+    return v_uint16x32(_v512_combine(_mm512_cvtepi32_epi16(_mm512_min_epu32(a.val, m)), _mm512_cvtepi32_epi16(_mm512_min_epu32(b.val, m))));
+}
+
+inline v_uint16x32 v_pack_u(const v_int32x16& a, const v_int32x16& b)
+{ return v_uint16x32(_mm512_permutexvar_epi64(_v512_set_epu64(7, 5, 3, 1, 6, 4, 2, 0), _mm512_packus_epi32(a.val, b.val))); }
+
+inline void v_pack_store(short* ptr, const v_int32x16& a)
+{ v_store_low(ptr, v_pack(a, a)); }
+
+inline void v_pack_store(ushort* ptr, const v_uint32x16& a)
+{
+    const __m512i m = _mm512_set1_epi32(65535);
+    _mm256_storeu_si256((__m256i*)ptr, _mm512_cvtepi32_epi16(_mm512_min_epu32(a.val, m)));
+}
+
+inline void v_pack_u_store(ushort* ptr, const v_int32x16& a)
+{ v_store_low(ptr, v_pack_u(a, a)); }
+
+
+template<int n> inline
+v_uint16x32 v_rshr_pack(const v_uint32x16& a, const v_uint32x16& b)
+{
+    v_uint32x16 delta = v512_setall_u32(1 << (n-1));
+    return v_pack_u(v_reinterpret_as_s32((a + delta) >> n),
+                    v_reinterpret_as_s32((b + delta) >> n));
+}
+
+template<int n> inline
+void v_rshr_pack_store(ushort* ptr, const v_uint32x16& a)
+{
+    v_uint32x16 delta = v512_setall_u32(1 << (n-1));
+    v_pack_u_store(ptr, v_reinterpret_as_s32((a + delta) >> n));
+}
+
+template<int n> inline
+v_uint16x32 v_rshr_pack_u(const v_int32x16& a, const v_int32x16& b)
+{
+    v_int32x16 delta = v512_setall_s32(1 << (n-1));
+    return v_pack_u((a + delta) >> n, (b + delta) >> n);
+}
+
+template<int n> inline
+void v_rshr_pack_u_store(ushort* ptr, const v_int32x16& a)
+{
+    v_int32x16 delta = v512_setall_s32(1 << (n-1));
+    v_pack_u_store(ptr, (a + delta) >> n);
+}
+
+template<int n> inline
+v_int16x32 v_rshr_pack(const v_int32x16& a, const v_int32x16& b)
+{
+    v_int32x16 delta = v512_setall_s32(1 << (n-1));
+    return v_pack((a + delta) >> n, (b + delta) >> n);
+}
+
+template<int n> inline
+void v_rshr_pack_store(short* ptr, const v_int32x16& a)
+{
+    v_int32x16 delta = v512_setall_s32(1 << (n-1));
+    v_pack_store(ptr, (a + delta) >> n);
+}
+
+// 64
+// Non-saturating pack
+inline v_uint32x16 v_pack(const v_uint64x8& a, const v_uint64x8& b)
+{ return v_uint32x16(_v512_combine(_mm512_cvtepi64_epi32(a.val), _mm512_cvtepi64_epi32(b.val))); }
+
+inline v_int32x16 v_pack(const v_int64x8& a, const v_int64x8& b)
+{ return v_reinterpret_as_s32(v_pack(v_reinterpret_as_u64(a), v_reinterpret_as_u64(b))); }
+
+inline void v_pack_store(unsigned* ptr, const v_uint64x8& a)
+{ _mm256_storeu_si256((__m256i*)ptr, _mm512_cvtepi64_epi32(a.val)); }
+
+inline void v_pack_store(int* ptr, const v_int64x8& b)
+{ v_pack_store((unsigned*)ptr, v_reinterpret_as_u64(b)); }
+
+template<int n> inline
+v_uint32x16 v_rshr_pack(const v_uint64x8& a, const v_uint64x8& b)
+{
+    v_uint64x8 delta = v512_setall_u64((uint64)1 << (n-1));
+    return v_pack((a + delta) >> n, (b + delta) >> n);
+}
+
+template<int n> inline
+void v_rshr_pack_store(unsigned* ptr, const v_uint64x8& a)
+{
+    v_uint64x8 delta = v512_setall_u64((uint64)1 << (n-1));
+    v_pack_store(ptr, (a + delta) >> n);
+}
+
+template<int n> inline
+v_int32x16 v_rshr_pack(const v_int64x8& a, const v_int64x8& b)
+{
+    v_int64x8 delta = v512_setall_s64((int64)1 << (n-1));
+    return v_pack((a + delta) >> n, (b + delta) >> n);
+}
+
+template<int n> inline
+void v_rshr_pack_store(int* ptr, const v_int64x8& a)
+{
+    v_int64x8 delta = v512_setall_s64((int64)1 << (n-1));
+    v_pack_store(ptr, (a + delta) >> n);
+}
+
+// pack boolean
+inline v_uint8x64 v_pack_b(const v_uint16x32& a, const v_uint16x32& b)
+{ return v_uint8x64(_mm512_permutexvar_epi64(_v512_set_epu64(7, 5, 3, 1, 6, 4, 2, 0), _mm512_packs_epi16(a.val, b.val))); }
+
+inline v_uint8x64 v_pack_b(const v_uint32x16& a, const v_uint32x16& b,
+                           const v_uint32x16& c, const v_uint32x16& d)
+{
+    __m512i ab = _mm512_packs_epi32(a.val, b.val);
+    __m512i cd = _mm512_packs_epi32(c.val, d.val);
+
+    return v_uint8x64(_mm512_permutexvar_epi32(_v512_set_epu32(15, 11, 7, 3, 14, 10, 6, 2, 13, 9, 5, 1, 12, 8, 4, 0), _mm512_packs_epi16(ab, cd)));
+}
+
+inline v_uint8x64 v_pack_b(const v_uint64x8& a, const v_uint64x8& b, const v_uint64x8& c,
+                           const v_uint64x8& d, const v_uint64x8& e, const v_uint64x8& f,
+                           const v_uint64x8& g, const v_uint64x8& h)
+{
+    __m512i ab = _mm512_packs_epi32(a.val, b.val);
+    __m512i cd = _mm512_packs_epi32(c.val, d.val);
+    __m512i ef = _mm512_packs_epi32(e.val, f.val);
+    __m512i gh = _mm512_packs_epi32(g.val, h.val);
+
+    __m512i abcd = _mm512_packs_epi32(ab, cd);
+    __m512i efgh = _mm512_packs_epi32(ef, gh);
+
+    return v_uint8x64(_mm512_permutexvar_epi16(_v512_set_epu16(31, 23, 15, 7, 30, 22, 14, 6, 29, 21, 13, 5, 28, 20, 12, 4,
+                                                               27, 19, 11, 3, 26, 18, 10, 2, 25, 17, 9, 1, 24, 16, 8, 0), _mm512_packs_epi16(abcd, efgh)));
+}
+
+/* Recombine */
+// it's up there with load and store operations
+
+/* Extract */
+#define OPENCV_HAL_IMPL_AVX512_EXTRACT(_Tpvec) \
+    template<int s> \
+    inline _Tpvec v_extract(const _Tpvec& a, const _Tpvec& b) \
+    { return v_rotate_right<s>(a, b); }
+
+OPENCV_HAL_IMPL_AVX512_EXTRACT(v_uint8x64)
+OPENCV_HAL_IMPL_AVX512_EXTRACT(v_int8x64)
+OPENCV_HAL_IMPL_AVX512_EXTRACT(v_uint16x32)
+OPENCV_HAL_IMPL_AVX512_EXTRACT(v_int16x32)
+OPENCV_HAL_IMPL_AVX512_EXTRACT(v_uint32x16)
+OPENCV_HAL_IMPL_AVX512_EXTRACT(v_int32x16)
+OPENCV_HAL_IMPL_AVX512_EXTRACT(v_uint64x8)
+OPENCV_HAL_IMPL_AVX512_EXTRACT(v_int64x8)
+OPENCV_HAL_IMPL_AVX512_EXTRACT(v_float32x16)
+OPENCV_HAL_IMPL_AVX512_EXTRACT(v_float64x8)
+
+#define OPENCV_HAL_IMPL_AVX512_EXTRACT_N(_Tpvec, _Tp) \
+template<int i> inline _Tp v_extract_n(_Tpvec v) { return v_rotate_right<i>(v).get0(); }
+
+OPENCV_HAL_IMPL_AVX512_EXTRACT_N(v_uint8x64, uchar)
+OPENCV_HAL_IMPL_AVX512_EXTRACT_N(v_int8x64, schar)
+OPENCV_HAL_IMPL_AVX512_EXTRACT_N(v_uint16x32, ushort)
+OPENCV_HAL_IMPL_AVX512_EXTRACT_N(v_int16x32, short)
+OPENCV_HAL_IMPL_AVX512_EXTRACT_N(v_uint32x16, uint)
+OPENCV_HAL_IMPL_AVX512_EXTRACT_N(v_int32x16, int)
+OPENCV_HAL_IMPL_AVX512_EXTRACT_N(v_uint64x8, uint64)
+OPENCV_HAL_IMPL_AVX512_EXTRACT_N(v_int64x8, int64)
+OPENCV_HAL_IMPL_AVX512_EXTRACT_N(v_float32x16, float)
+OPENCV_HAL_IMPL_AVX512_EXTRACT_N(v_float64x8, double)
+
+template<int i>
+inline v_uint32x16 v_broadcast_element(v_uint32x16 a)
+{
+    static const __m512i perm = _mm512_set1_epi32((char)i);
+    return v_uint32x16(_mm512_permutexvar_epi32(perm, a.val));
+}
+
+template<int i>
+inline v_int32x16 v_broadcast_element(const v_int32x16 &a)
+{ return v_reinterpret_as_s32(v_broadcast_element<i>(v_reinterpret_as_u32(a))); }
+
+template<int i>
+inline v_float32x16 v_broadcast_element(const v_float32x16 &a)
+{ return v_reinterpret_as_f32(v_broadcast_element<i>(v_reinterpret_as_u32(a))); }
+
+
+///////////////////// load deinterleave /////////////////////////////
+
+inline void v_load_deinterleave( const uchar* ptr, v_uint8x64& a, v_uint8x64& b )
+{
+    __m512i ab0 = _mm512_loadu_si512((const __m512i*)ptr);
+    __m512i ab1 = _mm512_loadu_si512((const __m512i*)(ptr + 64));
+#if CV_AVX_512VBMI
+    __m512i mask0 = _v512_set_epu8(126, 124, 122, 120, 118, 116, 114, 112, 110, 108, 106, 104, 102, 100, 98, 96,
+                                   94, 92, 90, 88, 86, 84, 82, 80, 78, 76, 74, 72, 70, 68, 66, 64,
+                                   62, 60, 58, 56, 54, 52, 50, 48, 46, 44, 42, 40, 38, 36, 34, 32,
+                                   30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
+    __m512i mask1 = _v512_set_epu8(127, 125, 123, 121, 119, 117, 115, 113, 111, 109, 107, 105, 103, 101, 99, 97,
+                                   95, 93, 91, 89, 87, 85, 83, 81, 79, 77, 75, 73, 71, 69, 67, 65,
+                                   63, 61, 59, 57, 55, 53, 51, 49, 47, 45, 43, 41, 39, 37, 35, 33,
+                                   31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
+    a = v_uint8x64(_mm512_permutex2var_epi8(ab0, mask0, ab1));
+    b = v_uint8x64(_mm512_permutex2var_epi8(ab0, mask1, ab1));
+#else
+    __m512i mask0 = _mm512_set4_epi32(0x0f0d0b09, 0x07050301, 0x0e0c0a08, 0x06040200);
+    __m512i a0b0 = _mm512_shuffle_epi8(ab0, mask0);
+    __m512i a1b1 = _mm512_shuffle_epi8(ab1, mask0);
+    __m512i mask1 = _v512_set_epu64(14, 12, 10, 8, 6, 4, 2, 0);
+    __m512i mask2 = _v512_set_epu64(15, 13, 11, 9, 7, 5, 3, 1);
+    a = v_uint8x64(_mm512_permutex2var_epi64(a0b0, mask1, a1b1));
+    b = v_uint8x64(_mm512_permutex2var_epi64(a0b0, mask2, a1b1));
+#endif
+}
+
+inline void v_load_deinterleave( const ushort* ptr, v_uint16x32& a, v_uint16x32& b )
+{
+    __m512i ab0 = _mm512_loadu_si512((const __m512i*)ptr);
+    __m512i ab1 = _mm512_loadu_si512((const __m512i*)(ptr + 32));
+    __m512i mask0 = _v512_set_epu16(62, 60, 58, 56, 54, 52, 50, 48, 46, 44, 42, 40, 38, 36, 34, 32,
+                                    30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
+    __m512i mask1 = _v512_set_epu16(63, 61, 59, 57, 55, 53, 51, 49, 47, 45, 43, 41, 39, 37, 35, 33,
+                                    31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
+    a = v_uint16x32(_mm512_permutex2var_epi16(ab0, mask0, ab1));
+    b = v_uint16x32(_mm512_permutex2var_epi16(ab0, mask1, ab1));
+}
+
+inline void v_load_deinterleave( const unsigned* ptr, v_uint32x16& a, v_uint32x16& b )
+{
+
__m512i ab0 = _mm512_loadu_si512((const __m512i*)ptr); + __m512i ab1 = _mm512_loadu_si512((const __m512i*)(ptr + 16)); + __m512i mask0 = _v512_set_epu32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0); + __m512i mask1 = _v512_set_epu32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1); + a = v_uint32x16(_mm512_permutex2var_epi32(ab0, mask0, ab1)); + b = v_uint32x16(_mm512_permutex2var_epi32(ab0, mask1, ab1)); +} + +inline void v_load_deinterleave( const uint64* ptr, v_uint64x8& a, v_uint64x8& b ) +{ + __m512i ab0 = _mm512_loadu_si512((const __m512i*)ptr); + __m512i ab1 = _mm512_loadu_si512((const __m512i*)(ptr + 8)); + __m512i mask0 = _v512_set_epu64(14, 12, 10, 8, 6, 4, 2, 0); + __m512i mask1 = _v512_set_epu64(15, 13, 11, 9, 7, 5, 3, 1); + a = v_uint64x8(_mm512_permutex2var_epi64(ab0, mask0, ab1)); + b = v_uint64x8(_mm512_permutex2var_epi64(ab0, mask1, ab1)); +} + +inline void v_load_deinterleave( const uchar* ptr, v_uint8x64& a, v_uint8x64& b, v_uint8x64& c ) +{ + __m512i bgr0 = _mm512_loadu_si512((const __m512i*)ptr); + __m512i bgr1 = _mm512_loadu_si512((const __m512i*)(ptr + 64)); + __m512i bgr2 = _mm512_loadu_si512((const __m512i*)(ptr + 128)); + +#if CV_AVX_512VBMI2 + __m512i mask0 = _v512_set_epu8(126, 123, 120, 117, 114, 111, 108, 105, 102, 99, 96, 93, 90, 87, 84, 81, + 78, 75, 72, 69, 66, 63, 60, 57, 54, 51, 48, 45, 42, 39, 36, 33, + 30, 27, 24, 21, 18, 15, 12, 9, 6, 3, 0, 62, 59, 56, 53, 50, + 47, 44, 41, 38, 35, 32, 29, 26, 23, 20, 17, 14, 11, 8, 5, 2); + __m512i r0b01 = _mm512_permutex2var_epi8(bgr0, mask0, bgr1); + __m512i b1g12 = _mm512_permutex2var_epi8(bgr1, mask0, bgr2); + __m512i r12b2 = _mm512_permutex2var_epi8(bgr1, + _v512_set_epu8(125, 122, 119, 116, 113, 110, 107, 104, 101, 98, 95, 92, 89, 86, 83, 80, + 77, 74, 71, 68, 65, 127, 124, 121, 118, 115, 112, 109, 106, 103, 100, 97, + 94, 91, 88, 85, 82, 79, 76, 73, 70, 67, 64, 61, 58, 55, 52, 49, + 46, 43, 40, 37, 34, 31, 28, 25, 22, 19, 16, 13, 10, 7, 4, 1), bgr2); + a = v_uint8x64(_mm512_mask_compress_epi8(r12b2, 0xffffffffffe00000, r0b01)); + b = v_uint8x64(_mm512_mask_compress_epi8(b1g12, 0x2492492492492492, bgr0)); + c = v_uint8x64(_mm512_mask_expand_epi8(r0b01, 0xffffffffffe00000, r12b2)); +#elif CV_AVX_512VBMI + __m512i b0g0b1 = _mm512_mask_blend_epi8(0xb6db6db6db6db6db, bgr1, bgr0); + __m512i g1r1g2 = _mm512_mask_blend_epi8(0xb6db6db6db6db6db, bgr2, bgr1); + __m512i r2b2r0 = _mm512_mask_blend_epi8(0xb6db6db6db6db6db, bgr0, bgr2); + a = v_uint8x64(_mm512_permutex2var_epi8(b0g0b1, _v512_set_epu8(125, 122, 119, 116, 113, 110, 107, 104, 101, 98, 95, 92, 89, 86, 83, 80, + 77, 74, 71, 68, 65, 63, 61, 60, 58, 57, 55, 54, 52, 51, 49, 48, + 46, 45, 43, 42, 40, 39, 37, 36, 34, 33, 31, 30, 28, 27, 25, 24, + 23, 21, 20, 18, 17, 15, 14, 12, 11, 9, 8, 6, 5, 3, 2, 0), bgr2)); + b = v_uint8x64(_mm512_permutex2var_epi8(g1r1g2, _v512_set_epu8( 63, 61, 60, 58, 57, 55, 54, 52, 51, 49, 48, 46, 45, 43, 42, 40, + 39, 37, 36, 34, 33, 31, 30, 28, 27, 25, 24, 23, 21, 20, 18, 17, + 15, 14, 12, 11, 9, 8, 6, 5, 3, 2, 0, 126, 123, 120, 117, 114, + 111, 108, 105, 102, 99, 96, 93, 90, 87, 84, 81, 78, 75, 72, 69, 66), bgr0)); + c = v_uint8x64(_mm512_permutex2var_epi8(r2b2r0, _v512_set_epu8( 63, 60, 57, 54, 51, 48, 45, 42, 39, 36, 33, 30, 27, 24, 21, 18, + 15, 12, 9, 6, 3, 0, 125, 122, 119, 116, 113, 110, 107, 104, 101, 98, + 95, 92, 89, 86, 83, 80, 77, 74, 71, 68, 65, 62, 59, 56, 53, 50, + 47, 44, 41, 38, 35, 32, 29, 26, 23, 20, 17, 14, 11, 8, 5, 2), bgr1)); +#else + __m512i mask0 = _v512_set_epu16(61, 58, 55, 52, 49, 46, 43, 
40, 37, 34, 63, 60, 57, 54, 51, 48, + 45, 42, 39, 36, 33, 30, 27, 24, 21, 18, 15, 12, 9, 6, 3, 0); + __m512i b01g1 = _mm512_permutex2var_epi16(bgr0, mask0, bgr1); + __m512i r12b2 = _mm512_permutex2var_epi16(bgr1, mask0, bgr2); + __m512i g20r0 = _mm512_permutex2var_epi16(bgr2, mask0, bgr0); + + __m512i b0g0 = _mm512_mask_blend_epi32(0xf800, b01g1, r12b2); + __m512i r0b1 = _mm512_permutex2var_epi16(bgr1, _v512_set_epu16(42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 29, 26, 23, 20, 17, + 14, 11, 8, 5, 2, 53, 52, 51, 50, 49, 48, 47, 46, 45, 44, 43), g20r0); + __m512i g1r1 = _mm512_alignr_epi32(r12b2, g20r0, 11); + a = v_uint8x64(_mm512_mask_blend_epi8(0xAAAAAAAAAAAAAAAA, b0g0, r0b1)); + c = v_uint8x64(_mm512_mask_blend_epi8(0xAAAAAAAAAAAAAAAA, r0b1, g1r1)); + b = v_uint8x64(_mm512_shuffle_epi8(_mm512_mask_blend_epi8(0xAAAAAAAAAAAAAAAA, g1r1, b0g0), _mm512_set4_epi32(0x0e0f0c0d, 0x0a0b0809, 0x06070405, 0x02030001))); +#endif +} + +inline void v_load_deinterleave( const ushort* ptr, v_uint16x32& a, v_uint16x32& b, v_uint16x32& c ) +{ + __m512i bgr0 = _mm512_loadu_si512((const __m512i*)ptr); + __m512i bgr1 = _mm512_loadu_si512((const __m512i*)(ptr + 32)); + __m512i bgr2 = _mm512_loadu_si512((const __m512i*)(ptr + 64)); + + __m512i mask0 = _v512_set_epu16(61, 58, 55, 52, 49, 46, 43, 40, 37, 34, 63, 60, 57, 54, 51, 48, + 45, 42, 39, 36, 33, 30, 27, 24, 21, 18, 15, 12, 9, 6, 3, 0); + __m512i b01g1 = _mm512_permutex2var_epi16(bgr0, mask0, bgr1); + __m512i r12b2 = _mm512_permutex2var_epi16(bgr1, mask0, bgr2); + __m512i g20r0 = _mm512_permutex2var_epi16(bgr2, mask0, bgr0); + + a = v_uint16x32(_mm512_mask_blend_epi32(0xf800, b01g1, r12b2)); + b = v_uint16x32(_mm512_permutex2var_epi16(bgr1, _v512_set_epu16(42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 29, 26, 23, 20, 17, + 14, 11, 8, 5, 2, 53, 52, 51, 50, 49, 48, 47, 46, 45, 44, 43), g20r0)); + c = v_uint16x32(_mm512_alignr_epi32(r12b2, g20r0, 11)); +} + +inline void v_load_deinterleave( const unsigned* ptr, v_uint32x16& a, v_uint32x16& b, v_uint32x16& c ) +{ + __m512i bgr0 = _mm512_loadu_si512((const __m512i*)ptr); + __m512i bgr1 = _mm512_loadu_si512((const __m512i*)(ptr + 16)); + __m512i bgr2 = _mm512_loadu_si512((const __m512i*)(ptr + 32)); + + __m512i mask0 = _v512_set_epu32(29, 26, 23, 20, 17, 30, 27, 24, 21, 18, 15, 12, 9, 6, 3, 0); + __m512i b01r1 = _mm512_permutex2var_epi32(bgr0, mask0, bgr1); + __m512i g12b2 = _mm512_permutex2var_epi32(bgr1, mask0, bgr2); + __m512i r20g0 = _mm512_permutex2var_epi32(bgr2, mask0, bgr0); + + a = v_uint32x16(_mm512_mask_blend_epi32(0xf800, b01r1, g12b2)); + b = v_uint32x16(_mm512_alignr_epi32(g12b2, r20g0, 11)); + c = v_uint32x16(_mm512_permutex2var_epi32(bgr1, _v512_set_epu32(21, 20, 19, 18, 17, 16, 13, 10, 7, 4, 1, 26, 25, 24, 23, 22), r20g0)); +} + +inline void v_load_deinterleave( const uint64* ptr, v_uint64x8& a, v_uint64x8& b, v_uint64x8& c ) +{ + __m512i bgr0 = _mm512_loadu_si512((const __m512i*)ptr); + __m512i bgr1 = _mm512_loadu_si512((const __m512i*)(ptr + 8)); + __m512i bgr2 = _mm512_loadu_si512((const __m512i*)(ptr + 16)); + + __m512i mask0 = _v512_set_epu64(13, 10, 15, 12, 9, 6, 3, 0); + __m512i b01g1 = _mm512_permutex2var_epi64(bgr0, mask0, bgr1); + __m512i r12b2 = _mm512_permutex2var_epi64(bgr1, mask0, bgr2); + __m512i g20r0 = _mm512_permutex2var_epi64(bgr2, mask0, bgr0); + + a = v_uint64x8(_mm512_mask_blend_epi64(0xc0, b01g1, r12b2)); + c = v_uint64x8(_mm512_alignr_epi64(r12b2, g20r0, 6)); + b = v_uint64x8(_mm512_permutex2var_epi64(bgr1, _v512_set_epu64(10, 9, 8, 5, 2, 13, 12, 11), g20r0)); +} + +inline 
void v_load_deinterleave( const uchar* ptr, v_uint8x64& a, v_uint8x64& b, v_uint8x64& c, v_uint8x64& d ) +{ + __m512i bgra0 = _mm512_loadu_si512((const __m512i*)ptr); + __m512i bgra1 = _mm512_loadu_si512((const __m512i*)(ptr + 64)); + __m512i bgra2 = _mm512_loadu_si512((const __m512i*)(ptr + 128)); + __m512i bgra3 = _mm512_loadu_si512((const __m512i*)(ptr + 192)); + +#if CV_AVX_512VBMI + __m512i mask0 = _v512_set_epu8(126, 124, 122, 120, 118, 116, 114, 112, 110, 108, 106, 104, 102, 100, 98, 96, + 94, 92, 90, 88, 86, 84, 82, 80, 78, 76, 74, 72, 70, 68, 66, 64, + 62, 60, 58, 56, 54, 52, 50, 48, 46, 44, 42, 40, 38, 36, 34, 32, + 30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0); + __m512i mask1 = _v512_set_epu8(127, 125, 123, 121, 119, 117, 115, 113, 111, 109, 107, 105, 103, 101, 99, 97, + 95, 93, 91, 89, 87, 85, 83, 81, 79, 77, 75, 73, 71, 69, 67, 65, + 63, 61, 59, 57, 55, 53, 51, 49, 47, 45, 43, 41, 39, 37, 35, 33, + 31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1); + + __m512i br01 = _mm512_permutex2var_epi8(bgra0, mask0, bgra1); + __m512i ga01 = _mm512_permutex2var_epi8(bgra0, mask1, bgra1); + __m512i br23 = _mm512_permutex2var_epi8(bgra2, mask0, bgra3); + __m512i ga23 = _mm512_permutex2var_epi8(bgra2, mask1, bgra3); + + a = v_uint8x64(_mm512_permutex2var_epi8(br01, mask0, br23)); + c = v_uint8x64(_mm512_permutex2var_epi8(br01, mask1, br23)); + b = v_uint8x64(_mm512_permutex2var_epi8(ga01, mask0, ga23)); + d = v_uint8x64(_mm512_permutex2var_epi8(ga01, mask1, ga23)); +#else + __m512i mask = _mm512_set4_epi32(0x0f0b0703, 0x0e0a0602, 0x0d090501, 0x0c080400); + __m512i b0g0r0a0 = _mm512_shuffle_epi8(bgra0, mask); + __m512i b1g1r1a1 = _mm512_shuffle_epi8(bgra1, mask); + __m512i b2g2r2a2 = _mm512_shuffle_epi8(bgra2, mask); + __m512i b3g3r3a3 = _mm512_shuffle_epi8(bgra3, mask); + + __m512i mask0 = _v512_set_epu32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0); + __m512i mask1 = _v512_set_epu32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1); + + __m512i br01 = _mm512_permutex2var_epi32(b0g0r0a0, mask0, b1g1r1a1); + __m512i ga01 = _mm512_permutex2var_epi32(b0g0r0a0, mask1, b1g1r1a1); + __m512i br23 = _mm512_permutex2var_epi32(b2g2r2a2, mask0, b3g3r3a3); + __m512i ga23 = _mm512_permutex2var_epi32(b2g2r2a2, mask1, b3g3r3a3); + + a = v_uint8x64(_mm512_permutex2var_epi32(br01, mask0, br23)); + c = v_uint8x64(_mm512_permutex2var_epi32(br01, mask1, br23)); + b = v_uint8x64(_mm512_permutex2var_epi32(ga01, mask0, ga23)); + d = v_uint8x64(_mm512_permutex2var_epi32(ga01, mask1, ga23)); +#endif +} + +inline void v_load_deinterleave( const ushort* ptr, v_uint16x32& a, v_uint16x32& b, v_uint16x32& c, v_uint16x32& d ) +{ + __m512i bgra0 = _mm512_loadu_si512((const __m512i*)ptr); + __m512i bgra1 = _mm512_loadu_si512((const __m512i*)(ptr + 32)); + __m512i bgra2 = _mm512_loadu_si512((const __m512i*)(ptr + 64)); + __m512i bgra3 = _mm512_loadu_si512((const __m512i*)(ptr + 96)); + + __m512i mask0 = _v512_set_epu16(62, 60, 58, 56, 54, 52, 50, 48, 46, 44, 42, 40, 38, 36, 34, 32, + 30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0); + __m512i mask1 = _v512_set_epu16(63, 61, 59, 57, 55, 53, 51, 49, 47, 45, 43, 41, 39, 37, 35, 33, + 31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1); + + __m512i br01 = _mm512_permutex2var_epi16(bgra0, mask0, bgra1); + __m512i ga01 = _mm512_permutex2var_epi16(bgra0, mask1, bgra1); + __m512i br23 = _mm512_permutex2var_epi16(bgra2, mask0, bgra3); + __m512i ga23 = _mm512_permutex2var_epi16(bgra2, mask1, bgra3); + + a = 
v_uint16x32(_mm512_permutex2var_epi16(br01, mask0, br23)); + c = v_uint16x32(_mm512_permutex2var_epi16(br01, mask1, br23)); + b = v_uint16x32(_mm512_permutex2var_epi16(ga01, mask0, ga23)); + d = v_uint16x32(_mm512_permutex2var_epi16(ga01, mask1, ga23)); +} + +inline void v_load_deinterleave( const unsigned* ptr, v_uint32x16& a, v_uint32x16& b, v_uint32x16& c, v_uint32x16& d ) +{ + __m512i bgra0 = _mm512_loadu_si512((const __m512i*)ptr); + __m512i bgra1 = _mm512_loadu_si512((const __m512i*)(ptr + 16)); + __m512i bgra2 = _mm512_loadu_si512((const __m512i*)(ptr + 32)); + __m512i bgra3 = _mm512_loadu_si512((const __m512i*)(ptr + 48)); + + __m512i mask0 = _v512_set_epu32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0); + __m512i mask1 = _v512_set_epu32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1); + + __m512i br01 = _mm512_permutex2var_epi32(bgra0, mask0, bgra1); + __m512i ga01 = _mm512_permutex2var_epi32(bgra0, mask1, bgra1); + __m512i br23 = _mm512_permutex2var_epi32(bgra2, mask0, bgra3); + __m512i ga23 = _mm512_permutex2var_epi32(bgra2, mask1, bgra3); + + a = v_uint32x16(_mm512_permutex2var_epi32(br01, mask0, br23)); + c = v_uint32x16(_mm512_permutex2var_epi32(br01, mask1, br23)); + b = v_uint32x16(_mm512_permutex2var_epi32(ga01, mask0, ga23)); + d = v_uint32x16(_mm512_permutex2var_epi32(ga01, mask1, ga23)); +} + +inline void v_load_deinterleave( const uint64* ptr, v_uint64x8& a, v_uint64x8& b, v_uint64x8& c, v_uint64x8& d ) +{ + __m512i bgra0 = _mm512_loadu_si512((const __m512i*)ptr); + __m512i bgra1 = _mm512_loadu_si512((const __m512i*)(ptr + 8)); + __m512i bgra2 = _mm512_loadu_si512((const __m512i*)(ptr + 16)); + __m512i bgra3 = _mm512_loadu_si512((const __m512i*)(ptr + 24)); + + __m512i mask0 = _v512_set_epu64(14, 12, 10, 8, 6, 4, 2, 0); + __m512i mask1 = _v512_set_epu64(15, 13, 11, 9, 7, 5, 3, 1); + + __m512i br01 = _mm512_permutex2var_epi64(bgra0, mask0, bgra1); + __m512i ga01 = _mm512_permutex2var_epi64(bgra0, mask1, bgra1); + __m512i br23 = _mm512_permutex2var_epi64(bgra2, mask0, bgra3); + __m512i ga23 = _mm512_permutex2var_epi64(bgra2, mask1, bgra3); + + a = v_uint64x8(_mm512_permutex2var_epi64(br01, mask0, br23)); + c = v_uint64x8(_mm512_permutex2var_epi64(br01, mask1, br23)); + b = v_uint64x8(_mm512_permutex2var_epi64(ga01, mask0, ga23)); + d = v_uint64x8(_mm512_permutex2var_epi64(ga01, mask1, ga23)); +} + +///////////////////////////// store interleave ///////////////////////////////////// + +inline void v_store_interleave( uchar* ptr, const v_uint8x64& x, const v_uint8x64& y, + hal::StoreMode mode=hal::STORE_UNALIGNED ) +{ + v_uint8x64 low, high; + v_zip(x, y, low, high); + if( mode == hal::STORE_ALIGNED_NOCACHE ) + { + _mm512_stream_si512((__m512i*)ptr, low.val); + _mm512_stream_si512((__m512i*)(ptr + 64), high.val); + } + else if( mode == hal::STORE_ALIGNED ) + { + _mm512_store_si512((__m512i*)ptr, low.val); + _mm512_store_si512((__m512i*)(ptr + 64), high.val); + } + else + { + _mm512_storeu_si512((__m512i*)ptr, low.val); + _mm512_storeu_si512((__m512i*)(ptr + 64), high.val); + } +} + +inline void v_store_interleave( ushort* ptr, const v_uint16x32& x, const v_uint16x32& y, + hal::StoreMode mode=hal::STORE_UNALIGNED ) +{ + v_uint16x32 low, high; + v_zip(x, y, low, high); + if( mode == hal::STORE_ALIGNED_NOCACHE ) + { + _mm512_stream_si512((__m512i*)ptr, low.val); + _mm512_stream_si512((__m512i*)(ptr + 32), high.val); + } + else if( mode == hal::STORE_ALIGNED ) + { + _mm512_store_si512((__m512i*)ptr, low.val); + _mm512_store_si512((__m512i*)(ptr + 32), 
high.val); + } + else + { + _mm512_storeu_si512((__m512i*)ptr, low.val); + _mm512_storeu_si512((__m512i*)(ptr + 32), high.val); + } +} + +inline void v_store_interleave( unsigned* ptr, const v_uint32x16& x, const v_uint32x16& y, + hal::StoreMode mode=hal::STORE_UNALIGNED ) +{ + v_uint32x16 low, high; + v_zip(x, y, low, high); + if( mode == hal::STORE_ALIGNED_NOCACHE ) + { + _mm512_stream_si512((__m512i*)ptr, low.val); + _mm512_stream_si512((__m512i*)(ptr + 16), high.val); + } + else if( mode == hal::STORE_ALIGNED ) + { + _mm512_store_si512((__m512i*)ptr, low.val); + _mm512_store_si512((__m512i*)(ptr + 16), high.val); + } + else + { + _mm512_storeu_si512((__m512i*)ptr, low.val); + _mm512_storeu_si512((__m512i*)(ptr + 16), high.val); + } +} + +inline void v_store_interleave( uint64* ptr, const v_uint64x8& x, const v_uint64x8& y, + hal::StoreMode mode=hal::STORE_UNALIGNED ) +{ + v_uint64x8 low, high; + v_zip(x, y, low, high); + if( mode == hal::STORE_ALIGNED_NOCACHE ) + { + _mm512_stream_si512((__m512i*)ptr, low.val); + _mm512_stream_si512((__m512i*)(ptr + 8), high.val); + } + else if( mode == hal::STORE_ALIGNED ) + { + _mm512_store_si512((__m512i*)ptr, low.val); + _mm512_store_si512((__m512i*)(ptr + 8), high.val); + } + else + { + _mm512_storeu_si512((__m512i*)ptr, low.val); + _mm512_storeu_si512((__m512i*)(ptr + 8), high.val); + } +} + +inline void v_store_interleave( uchar* ptr, const v_uint8x64& a, const v_uint8x64& b, const v_uint8x64& c, + hal::StoreMode mode=hal::STORE_UNALIGNED ) +{ +#if CV_AVX_512VBMI + __m512i mask0 = _v512_set_epu8(127, 84, 20, 126, 83, 19, 125, 82, 18, 124, 81, 17, 123, 80, 16, 122, + 79, 15, 121, 78, 14, 120, 77, 13, 119, 76, 12, 118, 75, 11, 117, 74, + 10, 116, 73, 9, 115, 72, 8, 114, 71, 7, 113, 70, 6, 112, 69, 5, + 111, 68, 4, 110, 67, 3, 109, 66, 2, 108, 65, 1, 107, 64, 0, 106); + __m512i mask1 = _v512_set_epu8( 21, 42, 105, 20, 41, 104, 19, 40, 103, 18, 39, 102, 17, 38, 101, 16, + 37, 100, 15, 36, 99, 14, 35, 98, 13, 34, 97, 12, 33, 96, 11, 32, + 95, 10, 31, 94, 9, 30, 93, 8, 29, 92, 7, 28, 91, 6, 27, 90, + 5, 26, 89, 4, 25, 88, 3, 24, 87, 2, 23, 86, 1, 22, 85, 0); + __m512i mask2 = _v512_set_epu8(106, 127, 63, 105, 126, 62, 104, 125, 61, 103, 124, 60, 102, 123, 59, 101, + 122, 58, 100, 121, 57, 99, 120, 56, 98, 119, 55, 97, 118, 54, 96, 117, + 53, 95, 116, 52, 94, 115, 51, 93, 114, 50, 92, 113, 49, 91, 112, 48, + 90, 111, 47, 89, 110, 46, 88, 109, 45, 87, 108, 44, 86, 107, 43, 85); + __m512i r2g0r0 = _mm512_permutex2var_epi8(b.val, mask0, c.val); + __m512i b0r1b1 = _mm512_permutex2var_epi8(a.val, mask1, c.val); + __m512i g1b2g2 = _mm512_permutex2var_epi8(a.val, mask2, b.val); + + __m512i bgr0 = _mm512_mask_blend_epi8(0x9249249249249249, r2g0r0, b0r1b1); + __m512i bgr1 = _mm512_mask_blend_epi8(0x9249249249249249, b0r1b1, g1b2g2); + __m512i bgr2 = _mm512_mask_blend_epi8(0x9249249249249249, g1b2g2, r2g0r0); +#else + __m512i g1g0 = _mm512_shuffle_epi8(b.val, _mm512_set4_epi32(0x0e0f0c0d, 0x0a0b0809, 0x06070405, 0x02030001)); + __m512i b0g0 = _mm512_mask_blend_epi8(0xAAAAAAAAAAAAAAAA, a.val, g1g0); + __m512i r0b1 = _mm512_mask_blend_epi8(0xAAAAAAAAAAAAAAAA, c.val, a.val); + __m512i g1r1 = _mm512_mask_blend_epi8(0xAAAAAAAAAAAAAAAA, g1g0, c.val); + + __m512i mask0 = _v512_set_epu16(42, 10, 31, 41, 9, 30, 40, 8, 29, 39, 7, 28, 38, 6, 27, 37, + 5, 26, 36, 4, 25, 35, 3, 24, 34, 2, 23, 33, 1, 22, 32, 0); + __m512i mask1 = _v512_set_epu16(21, 52, 41, 20, 51, 40, 19, 50, 39, 18, 49, 38, 17, 48, 37, 16, + 47, 36, 15, 46, 35, 14, 45, 34, 13, 44, 33, 12, 43, 32, 11, 42); + 
__m512i mask2 = _v512_set_epu16(63, 31, 20, 62, 30, 19, 61, 29, 18, 60, 28, 17, 59, 27, 16, 58, + 26, 15, 57, 25, 14, 56, 24, 13, 55, 23, 12, 54, 22, 11, 53, 21); + __m512i b0g0b2 = _mm512_permutex2var_epi16(b0g0, mask0, r0b1); + __m512i r1b1r0 = _mm512_permutex2var_epi16(b0g0, mask1, g1r1); + __m512i g2r2g1 = _mm512_permutex2var_epi16(r0b1, mask2, g1r1); + + __m512i bgr0 = _mm512_mask_blend_epi16(0x24924924, b0g0b2, r1b1r0); + __m512i bgr1 = _mm512_mask_blend_epi16(0x24924924, r1b1r0, g2r2g1); + __m512i bgr2 = _mm512_mask_blend_epi16(0x24924924, g2r2g1, b0g0b2); +#endif + + if( mode == hal::STORE_ALIGNED_NOCACHE ) + { + _mm512_stream_si512((__m512i*)ptr, bgr0); + _mm512_stream_si512((__m512i*)(ptr + 64), bgr1); + _mm512_stream_si512((__m512i*)(ptr + 128), bgr2); + } + else if( mode == hal::STORE_ALIGNED ) + { + _mm512_store_si512((__m512i*)ptr, bgr0); + _mm512_store_si512((__m512i*)(ptr + 64), bgr1); + _mm512_store_si512((__m512i*)(ptr + 128), bgr2); + } + else + { + _mm512_storeu_si512((__m512i*)ptr, bgr0); + _mm512_storeu_si512((__m512i*)(ptr + 64), bgr1); + _mm512_storeu_si512((__m512i*)(ptr + 128), bgr2); + } +} + +inline void v_store_interleave( ushort* ptr, const v_uint16x32& a, const v_uint16x32& b, const v_uint16x32& c, + hal::StoreMode mode=hal::STORE_UNALIGNED ) +{ + __m512i mask0 = _v512_set_epu16(42, 10, 31, 41, 9, 30, 40, 8, 29, 39, 7, 28, 38, 6, 27, 37, + 5, 26, 36, 4, 25, 35, 3, 24, 34, 2, 23, 33, 1, 22, 32, 0); + __m512i mask1 = _v512_set_epu16(21, 52, 41, 20, 51, 40, 19, 50, 39, 18, 49, 38, 17, 48, 37, 16, + 47, 36, 15, 46, 35, 14, 45, 34, 13, 44, 33, 12, 43, 32, 11, 42); + __m512i mask2 = _v512_set_epu16(63, 31, 20, 62, 30, 19, 61, 29, 18, 60, 28, 17, 59, 27, 16, 58, + 26, 15, 57, 25, 14, 56, 24, 13, 55, 23, 12, 54, 22, 11, 53, 21); + __m512i b0g0b2 = _mm512_permutex2var_epi16(a.val, mask0, b.val); + __m512i r1b1r0 = _mm512_permutex2var_epi16(a.val, mask1, c.val); + __m512i g2r2g1 = _mm512_permutex2var_epi16(b.val, mask2, c.val); + + __m512i bgr0 = _mm512_mask_blend_epi16(0x24924924, b0g0b2, r1b1r0); + __m512i bgr1 = _mm512_mask_blend_epi16(0x24924924, r1b1r0, g2r2g1); + __m512i bgr2 = _mm512_mask_blend_epi16(0x24924924, g2r2g1, b0g0b2); + + if( mode == hal::STORE_ALIGNED_NOCACHE ) + { + _mm512_stream_si512((__m512i*)ptr, bgr0); + _mm512_stream_si512((__m512i*)(ptr + 32), bgr1); + _mm512_stream_si512((__m512i*)(ptr + 64), bgr2); + } + else if( mode == hal::STORE_ALIGNED ) + { + _mm512_store_si512((__m512i*)ptr, bgr0); + _mm512_store_si512((__m512i*)(ptr + 32), bgr1); + _mm512_store_si512((__m512i*)(ptr + 64), bgr2); + } + else + { + _mm512_storeu_si512((__m512i*)ptr, bgr0); + _mm512_storeu_si512((__m512i*)(ptr + 32), bgr1); + _mm512_storeu_si512((__m512i*)(ptr + 64), bgr2); + } +} + +inline void v_store_interleave( unsigned* ptr, const v_uint32x16& a, const v_uint32x16& b, const v_uint32x16& c, + hal::StoreMode mode=hal::STORE_UNALIGNED ) +{ + __m512i mask0 = _v512_set_epu32(26, 31, 15, 25, 30, 14, 24, 29, 13, 23, 28, 12, 22, 27, 11, 21); + __m512i mask1 = _v512_set_epu32(31, 10, 25, 30, 9, 24, 29, 8, 23, 28, 7, 22, 27, 6, 21, 26); + __m512i g1b2g2 = _mm512_permutex2var_epi32(a.val, mask0, b.val); + __m512i r2r1b1 = _mm512_permutex2var_epi32(a.val, mask1, c.val); + + __m512i bgr0 = _mm512_mask_expand_epi32(_mm512_mask_expand_epi32(_mm512_maskz_expand_epi32(0x9249, a.val), 0x2492, b.val), 0x4924, c.val); + __m512i bgr1 = _mm512_mask_blend_epi32(0x9249, r2r1b1, g1b2g2); + __m512i bgr2 = _mm512_mask_blend_epi32(0x9249, g1b2g2, r2r1b1); + + if( mode == 
hal::STORE_ALIGNED_NOCACHE ) + { + _mm512_stream_si512((__m512i*)ptr, bgr0); + _mm512_stream_si512((__m512i*)(ptr + 16), bgr1); + _mm512_stream_si512((__m512i*)(ptr + 32), bgr2); + } + else if( mode == hal::STORE_ALIGNED ) + { + _mm512_store_si512((__m512i*)ptr, bgr0); + _mm512_store_si512((__m512i*)(ptr + 16), bgr1); + _mm512_store_si512((__m512i*)(ptr + 32), bgr2); + } + else + { + _mm512_storeu_si512((__m512i*)ptr, bgr0); + _mm512_storeu_si512((__m512i*)(ptr + 16), bgr1); + _mm512_storeu_si512((__m512i*)(ptr + 32), bgr2); + } +} + +inline void v_store_interleave( uint64* ptr, const v_uint64x8& a, const v_uint64x8& b, const v_uint64x8& c, + hal::StoreMode mode=hal::STORE_UNALIGNED ) +{ + __m512i mask0 = _v512_set_epu64( 5, 12, 7, 4, 11, 6, 3, 10); + __m512i mask1 = _v512_set_epu64(15, 7, 4, 14, 6, 3, 13, 5); + __m512i r1b1b2 = _mm512_permutex2var_epi64(a.val, mask0, c.val); + __m512i g2r2g1 = _mm512_permutex2var_epi64(b.val, mask1, c.val); + + __m512i bgr0 = _mm512_mask_expand_epi64(_mm512_mask_expand_epi64(_mm512_maskz_expand_epi64(0x49, a.val), 0x92, b.val), 0x24, c.val); + __m512i bgr1 = _mm512_mask_blend_epi64(0xdb, g2r2g1, r1b1b2); + __m512i bgr2 = _mm512_mask_blend_epi64(0xdb, r1b1b2, g2r2g1); + + if( mode == hal::STORE_ALIGNED_NOCACHE ) + { + _mm512_stream_si512((__m512i*)ptr, bgr0); + _mm512_stream_si512((__m512i*)(ptr + 8), bgr1); + _mm512_stream_si512((__m512i*)(ptr + 16), bgr2); + } + else if( mode == hal::STORE_ALIGNED ) + { + _mm512_store_si512((__m512i*)ptr, bgr0); + _mm512_store_si512((__m512i*)(ptr + 8), bgr1); + _mm512_store_si512((__m512i*)(ptr + 16), bgr2); + } + else + { + _mm512_storeu_si512((__m512i*)ptr, bgr0); + _mm512_storeu_si512((__m512i*)(ptr + 8), bgr1); + _mm512_storeu_si512((__m512i*)(ptr + 16), bgr2); + } +} + +inline void v_store_interleave( uchar* ptr, const v_uint8x64& a, const v_uint8x64& b, + const v_uint8x64& c, const v_uint8x64& d, + hal::StoreMode mode=hal::STORE_UNALIGNED ) +{ + v_uint8x64 br01, br23, ga01, ga23; + v_zip(a, c, br01, br23); + v_zip(b, d, ga01, ga23); + v_uint8x64 bgra0, bgra1, bgra2, bgra3; + v_zip(br01, ga01, bgra0, bgra1); + v_zip(br23, ga23, bgra2, bgra3); + + if( mode == hal::STORE_ALIGNED_NOCACHE ) + { + _mm512_stream_si512((__m512i*)ptr, bgra0.val); + _mm512_stream_si512((__m512i*)(ptr + 64), bgra1.val); + _mm512_stream_si512((__m512i*)(ptr + 128), bgra2.val); + _mm512_stream_si512((__m512i*)(ptr + 192), bgra3.val); + } + else if( mode == hal::STORE_ALIGNED ) + { + _mm512_store_si512((__m512i*)ptr, bgra0.val); + _mm512_store_si512((__m512i*)(ptr + 64), bgra1.val); + _mm512_store_si512((__m512i*)(ptr + 128), bgra2.val); + _mm512_store_si512((__m512i*)(ptr + 192), bgra3.val); + } + else + { + _mm512_storeu_si512((__m512i*)ptr, bgra0.val); + _mm512_storeu_si512((__m512i*)(ptr + 64), bgra1.val); + _mm512_storeu_si512((__m512i*)(ptr + 128), bgra2.val); + _mm512_storeu_si512((__m512i*)(ptr + 192), bgra3.val); + } +} + +inline void v_store_interleave( ushort* ptr, const v_uint16x32& a, const v_uint16x32& b, + const v_uint16x32& c, const v_uint16x32& d, + hal::StoreMode mode=hal::STORE_UNALIGNED ) +{ + v_uint16x32 br01, br23, ga01, ga23; + v_zip(a, c, br01, br23); + v_zip(b, d, ga01, ga23); + v_uint16x32 bgra0, bgra1, bgra2, bgra3; + v_zip(br01, ga01, bgra0, bgra1); + v_zip(br23, ga23, bgra2, bgra3); + + if( mode == hal::STORE_ALIGNED_NOCACHE ) + { + _mm512_stream_si512((__m512i*)ptr, bgra0.val); + _mm512_stream_si512((__m512i*)(ptr + 32), bgra1.val); + _mm512_stream_si512((__m512i*)(ptr + 64), bgra2.val); + 
_mm512_stream_si512((__m512i*)(ptr + 96), bgra3.val); + } + else if( mode == hal::STORE_ALIGNED ) + { + _mm512_store_si512((__m512i*)ptr, bgra0.val); + _mm512_store_si512((__m512i*)(ptr + 32), bgra1.val); + _mm512_store_si512((__m512i*)(ptr + 64), bgra2.val); + _mm512_store_si512((__m512i*)(ptr + 96), bgra3.val); + } + else + { + _mm512_storeu_si512((__m512i*)ptr, bgra0.val); + _mm512_storeu_si512((__m512i*)(ptr + 32), bgra1.val); + _mm512_storeu_si512((__m512i*)(ptr + 64), bgra2.val); + _mm512_storeu_si512((__m512i*)(ptr + 96), bgra3.val); + } +} + +inline void v_store_interleave( unsigned* ptr, const v_uint32x16& a, const v_uint32x16& b, + const v_uint32x16& c, const v_uint32x16& d, + hal::StoreMode mode=hal::STORE_UNALIGNED ) +{ + v_uint32x16 br01, br23, ga01, ga23; + v_zip(a, c, br01, br23); + v_zip(b, d, ga01, ga23); + v_uint32x16 bgra0, bgra1, bgra2, bgra3; + v_zip(br01, ga01, bgra0, bgra1); + v_zip(br23, ga23, bgra2, bgra3); + + if( mode == hal::STORE_ALIGNED_NOCACHE ) + { + _mm512_stream_si512((__m512i*)ptr, bgra0.val); + _mm512_stream_si512((__m512i*)(ptr + 16), bgra1.val); + _mm512_stream_si512((__m512i*)(ptr + 32), bgra2.val); + _mm512_stream_si512((__m512i*)(ptr + 48), bgra3.val); + } + else if( mode == hal::STORE_ALIGNED ) + { + _mm512_store_si512((__m512i*)ptr, bgra0.val); + _mm512_store_si512((__m512i*)(ptr + 16), bgra1.val); + _mm512_store_si512((__m512i*)(ptr + 32), bgra2.val); + _mm512_store_si512((__m512i*)(ptr + 48), bgra3.val); + } + else + { + _mm512_storeu_si512((__m512i*)ptr, bgra0.val); + _mm512_storeu_si512((__m512i*)(ptr + 16), bgra1.val); + _mm512_storeu_si512((__m512i*)(ptr + 32), bgra2.val); + _mm512_storeu_si512((__m512i*)(ptr + 48), bgra3.val); + } +} + +inline void v_store_interleave( uint64* ptr, const v_uint64x8& a, const v_uint64x8& b, + const v_uint64x8& c, const v_uint64x8& d, + hal::StoreMode mode=hal::STORE_UNALIGNED ) +{ + v_uint64x8 br01, br23, ga01, ga23; + v_zip(a, c, br01, br23); + v_zip(b, d, ga01, ga23); + v_uint64x8 bgra0, bgra1, bgra2, bgra3; + v_zip(br01, ga01, bgra0, bgra1); + v_zip(br23, ga23, bgra2, bgra3); + + if( mode == hal::STORE_ALIGNED_NOCACHE ) + { + _mm512_stream_si512((__m512i*)ptr, bgra0.val); + _mm512_stream_si512((__m512i*)(ptr + 8), bgra1.val); + _mm512_stream_si512((__m512i*)(ptr + 16), bgra2.val); + _mm512_stream_si512((__m512i*)(ptr + 24), bgra3.val); + } + else if( mode == hal::STORE_ALIGNED ) + { + _mm512_store_si512((__m512i*)ptr, bgra0.val); + _mm512_store_si512((__m512i*)(ptr + 8), bgra1.val); + _mm512_store_si512((__m512i*)(ptr + 16), bgra2.val); + _mm512_store_si512((__m512i*)(ptr + 24), bgra3.val); + } + else + { + _mm512_storeu_si512((__m512i*)ptr, bgra0.val); + _mm512_storeu_si512((__m512i*)(ptr + 8), bgra1.val); + _mm512_storeu_si512((__m512i*)(ptr + 16), bgra2.val); + _mm512_storeu_si512((__m512i*)(ptr + 24), bgra3.val); + } +} + +#define OPENCV_HAL_IMPL_AVX512_LOADSTORE_INTERLEAVE(_Tpvec0, _Tp0, suffix0, _Tpvec1, _Tp1, suffix1) \ +inline void v_load_deinterleave( const _Tp0* ptr, _Tpvec0& a0, _Tpvec0& b0 ) \ +{ \ + _Tpvec1 a1, b1; \ + v_load_deinterleave((const _Tp1*)ptr, a1, b1); \ + a0 = v_reinterpret_as_##suffix0(a1); \ + b0 = v_reinterpret_as_##suffix0(b1); \ +} \ +inline void v_load_deinterleave( const _Tp0* ptr, _Tpvec0& a0, _Tpvec0& b0, _Tpvec0& c0 ) \ +{ \ + _Tpvec1 a1, b1, c1; \ + v_load_deinterleave((const _Tp1*)ptr, a1, b1, c1); \ + a0 = v_reinterpret_as_##suffix0(a1); \ + b0 = v_reinterpret_as_##suffix0(b1); \ + c0 = v_reinterpret_as_##suffix0(c1); \ +} \ +inline void v_load_deinterleave( const 
_Tp0* ptr, _Tpvec0& a0, _Tpvec0& b0, _Tpvec0& c0, _Tpvec0& d0 ) \ +{ \ + _Tpvec1 a1, b1, c1, d1; \ + v_load_deinterleave((const _Tp1*)ptr, a1, b1, c1, d1); \ + a0 = v_reinterpret_as_##suffix0(a1); \ + b0 = v_reinterpret_as_##suffix0(b1); \ + c0 = v_reinterpret_as_##suffix0(c1); \ + d0 = v_reinterpret_as_##suffix0(d1); \ +} \ +inline void v_store_interleave( _Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0, \ + hal::StoreMode mode=hal::STORE_UNALIGNED ) \ +{ \ + _Tpvec1 a1 = v_reinterpret_as_##suffix1(a0); \ + _Tpvec1 b1 = v_reinterpret_as_##suffix1(b0); \ + v_store_interleave((_Tp1*)ptr, a1, b1, mode); \ +} \ +inline void v_store_interleave( _Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0, const _Tpvec0& c0, \ + hal::StoreMode mode=hal::STORE_UNALIGNED ) \ +{ \ + _Tpvec1 a1 = v_reinterpret_as_##suffix1(a0); \ + _Tpvec1 b1 = v_reinterpret_as_##suffix1(b0); \ + _Tpvec1 c1 = v_reinterpret_as_##suffix1(c0); \ + v_store_interleave((_Tp1*)ptr, a1, b1, c1, mode); \ +} \ +inline void v_store_interleave( _Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0, \ + const _Tpvec0& c0, const _Tpvec0& d0, \ + hal::StoreMode mode=hal::STORE_UNALIGNED ) \ +{ \ + _Tpvec1 a1 = v_reinterpret_as_##suffix1(a0); \ + _Tpvec1 b1 = v_reinterpret_as_##suffix1(b0); \ + _Tpvec1 c1 = v_reinterpret_as_##suffix1(c0); \ + _Tpvec1 d1 = v_reinterpret_as_##suffix1(d0); \ + v_store_interleave((_Tp1*)ptr, a1, b1, c1, d1, mode); \ +} + +OPENCV_HAL_IMPL_AVX512_LOADSTORE_INTERLEAVE(v_int8x64, schar, s8, v_uint8x64, uchar, u8) +OPENCV_HAL_IMPL_AVX512_LOADSTORE_INTERLEAVE(v_int16x32, short, s16, v_uint16x32, ushort, u16) +OPENCV_HAL_IMPL_AVX512_LOADSTORE_INTERLEAVE(v_int32x16, int, s32, v_uint32x16, unsigned, u32) +OPENCV_HAL_IMPL_AVX512_LOADSTORE_INTERLEAVE(v_float32x16, float, f32, v_uint32x16, unsigned, u32) +OPENCV_HAL_IMPL_AVX512_LOADSTORE_INTERLEAVE(v_int64x8, int64, s64, v_uint64x8, uint64, u64) +OPENCV_HAL_IMPL_AVX512_LOADSTORE_INTERLEAVE(v_float64x8, double, f64, v_uint64x8, uint64, u64) + +////////// Mask and checks ///////// + +/** Mask **/ +inline int64 v_signmask(const v_int8x64& a) { return (int64)_mm512_movepi8_mask(a.val); } +inline int v_signmask(const v_int16x32& a) { return (int)_mm512_cmp_epi16_mask(a.val, _mm512_setzero_si512(), _MM_CMPINT_LT); } +inline int v_signmask(const v_int32x16& a) { return (int)_mm512_cmp_epi32_mask(a.val, _mm512_setzero_si512(), _MM_CMPINT_LT); } +inline int v_signmask(const v_int64x8& a) { return (int)_mm512_cmp_epi64_mask(a.val, _mm512_setzero_si512(), _MM_CMPINT_LT); } + +inline int64 v_signmask(const v_uint8x64& a) { return v_signmask(v_reinterpret_as_s8(a)); } +inline int v_signmask(const v_uint16x32& a) { return v_signmask(v_reinterpret_as_s16(a)); } +inline int v_signmask(const v_uint32x16& a) { return v_signmask(v_reinterpret_as_s32(a)); } +inline int v_signmask(const v_uint64x8& a) { return v_signmask(v_reinterpret_as_s64(a)); } +inline int v_signmask(const v_float32x16& a) { return v_signmask(v_reinterpret_as_s32(a)); } +inline int v_signmask(const v_float64x8& a) { return v_signmask(v_reinterpret_as_s64(a)); } + +/** Checks **/ +inline bool v_check_all(const v_int8x64& a) { return !(bool)_mm512_cmp_epi8_mask(a.val, _mm512_setzero_si512(), _MM_CMPINT_NLT); } +inline bool v_check_any(const v_int8x64& a) { return (bool)_mm512_movepi8_mask(a.val); } +inline bool v_check_all(const v_int16x32& a) { return !(bool)_mm512_cmp_epi16_mask(a.val, _mm512_setzero_si512(), _MM_CMPINT_NLT); } +inline bool v_check_any(const v_int16x32& a) { return (bool)_mm512_cmp_epi16_mask(a.val, 
_mm512_setzero_si512(), _MM_CMPINT_LT); } +inline bool v_check_all(const v_int32x16& a) { return !(bool)_mm512_cmp_epi32_mask(a.val, _mm512_setzero_si512(), _MM_CMPINT_NLT); } +inline bool v_check_any(const v_int32x16& a) { return (bool)_mm512_cmp_epi32_mask(a.val, _mm512_setzero_si512(), _MM_CMPINT_LT); } +inline bool v_check_all(const v_int64x8& a) { return !(bool)_mm512_cmp_epi64_mask(a.val, _mm512_setzero_si512(), _MM_CMPINT_NLT); } +inline bool v_check_any(const v_int64x8& a) { return (bool)_mm512_cmp_epi64_mask(a.val, _mm512_setzero_si512(), _MM_CMPINT_LT); } + +inline bool v_check_all(const v_float32x16& a) { return v_check_all(v_reinterpret_as_s32(a)); } +inline bool v_check_any(const v_float32x16& a) { return v_check_any(v_reinterpret_as_s32(a)); } +inline bool v_check_all(const v_float64x8& a) { return v_check_all(v_reinterpret_as_s64(a)); } +inline bool v_check_any(const v_float64x8& a) { return v_check_any(v_reinterpret_as_s64(a)); } +inline bool v_check_all(const v_uint8x64& a) { return v_check_all(v_reinterpret_as_s8(a)); } +inline bool v_check_all(const v_uint16x32& a) { return v_check_all(v_reinterpret_as_s16(a)); } +inline bool v_check_all(const v_uint32x16& a) { return v_check_all(v_reinterpret_as_s32(a)); } +inline bool v_check_all(const v_uint64x8& a) { return v_check_all(v_reinterpret_as_s64(a)); } +inline bool v_check_any(const v_uint8x64& a) { return v_check_any(v_reinterpret_as_s8(a)); } +inline bool v_check_any(const v_uint16x32& a) { return v_check_any(v_reinterpret_as_s16(a)); } +inline bool v_check_any(const v_uint32x16& a) { return v_check_any(v_reinterpret_as_s32(a)); } +inline bool v_check_any(const v_uint64x8& a) { return v_check_any(v_reinterpret_as_s64(a)); } + +inline int v_scan_forward(const v_int8x64& a) +{ + int64 mask = _mm512_movepi8_mask(a.val); + int mask32 = (int)mask; + return mask != 0 ? mask32 != 0 ? trailingZeros32(mask32) : 32 + trailingZeros32((int)(mask >> 32)) : 0; +} +inline int v_scan_forward(const v_uint8x64& a) { return v_scan_forward(v_reinterpret_as_s8(a)); } +inline int v_scan_forward(const v_int16x32& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s16(a))); } +inline int v_scan_forward(const v_uint16x32& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s16(a))); } +inline int v_scan_forward(const v_int32x16& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s16(a))) / 2; } +inline int v_scan_forward(const v_uint32x16& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s16(a))) / 2; } +inline int v_scan_forward(const v_float32x16& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s16(a))) / 2; } +inline int v_scan_forward(const v_int64x8& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s16(a))) / 4; } +inline int v_scan_forward(const v_uint64x8& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s16(a))) / 4; } +inline int v_scan_forward(const v_float64x8& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s16(a))) / 4; } + +inline void v512_cleanup() { _mm256_zeroall(); } + +CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END + +//! @endcond + +} // cv:: + +#endif // OPENCV_HAL_INTRIN_AVX_HPP diff --git a/3rdparty/opencv/include/opencv2/core/hal/intrin_msa.hpp b/3rdparty/opencv/include/opencv2/core/hal/intrin_msa.hpp new file mode 100644 index 00000000..a1fbb093 --- /dev/null +++ b/3rdparty/opencv/include/opencv2/core/hal/intrin_msa.hpp @@ -0,0 +1,1887 @@ +// This file is part of OpenCV project. 
+// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. + +#ifndef OPENCV_HAL_INTRIN_MSA_HPP +#define OPENCV_HAL_INTRIN_MSA_HPP + +#include +#include "opencv2/core/utility.hpp" + +namespace cv +{ + +//! @cond IGNORED +CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN + +#define CV_SIMD128 1 + +//MSA implements 128-bit wide vector registers shared with the 64-bit wide floating-point unit registers. +//MSA and FPU can not be both present, unless the FPU has 64-bit floating-point registers. +#define CV_SIMD128_64F 1 + +struct v_uint8x16 +{ + typedef uchar lane_type; + enum { nlanes = 16 }; + + v_uint8x16() {} + explicit v_uint8x16(v16u8 v) : val(v) {} + v_uint8x16(uchar v0, uchar v1, uchar v2, uchar v3, uchar v4, uchar v5, uchar v6, uchar v7, + uchar v8, uchar v9, uchar v10, uchar v11, uchar v12, uchar v13, uchar v14, uchar v15) + { + uchar v[] = {v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15}; + val = msa_ld1q_u8(v); + } + + uchar get0() const + { + return msa_getq_lane_u8(val, 0); + } + + v16u8 val; +}; + +struct v_int8x16 +{ + typedef schar lane_type; + enum { nlanes = 16 }; + + v_int8x16() {} + explicit v_int8x16(v16i8 v) : val(v) {} + v_int8x16(schar v0, schar v1, schar v2, schar v3, schar v4, schar v5, schar v6, schar v7, + schar v8, schar v9, schar v10, schar v11, schar v12, schar v13, schar v14, schar v15) + { + schar v[] = {v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15}; + val = msa_ld1q_s8(v); + } + + schar get0() const + { + return msa_getq_lane_s8(val, 0); + } + + v16i8 val; +}; + +struct v_uint16x8 +{ + typedef ushort lane_type; + enum { nlanes = 8 }; + + v_uint16x8() {} + explicit v_uint16x8(v8u16 v) : val(v) {} + v_uint16x8(ushort v0, ushort v1, ushort v2, ushort v3, ushort v4, ushort v5, ushort v6, ushort v7) + { + ushort v[] = {v0, v1, v2, v3, v4, v5, v6, v7}; + val = msa_ld1q_u16(v); + } + + ushort get0() const + { + return msa_getq_lane_u16(val, 0); + } + + v8u16 val; +}; + +struct v_int16x8 +{ + typedef short lane_type; + enum { nlanes = 8 }; + + v_int16x8() {} + explicit v_int16x8(v8i16 v) : val(v) {} + v_int16x8(short v0, short v1, short v2, short v3, short v4, short v5, short v6, short v7) + { + short v[] = {v0, v1, v2, v3, v4, v5, v6, v7}; + val = msa_ld1q_s16(v); + } + + short get0() const + { + return msa_getq_lane_s16(val, 0); + } + + v8i16 val; +}; + +struct v_uint32x4 +{ + typedef unsigned int lane_type; + enum { nlanes = 4 }; + + v_uint32x4() {} + explicit v_uint32x4(v4u32 v) : val(v) {} + v_uint32x4(unsigned int v0, unsigned int v1, unsigned int v2, unsigned int v3) + { + unsigned int v[] = {v0, v1, v2, v3}; + val = msa_ld1q_u32(v); + } + + unsigned int get0() const + { + return msa_getq_lane_u32(val, 0); + } + + v4u32 val; +}; + +struct v_int32x4 +{ + typedef int lane_type; + enum { nlanes = 4 }; + + v_int32x4() {} + explicit v_int32x4(v4i32 v) : val(v) {} + v_int32x4(int v0, int v1, int v2, int v3) + { + int v[] = {v0, v1, v2, v3}; + val = msa_ld1q_s32(v); + } + + int get0() const + { + return msa_getq_lane_s32(val, 0); + } + + v4i32 val; +}; + +struct v_float32x4 +{ + typedef float lane_type; + enum { nlanes = 4 }; + + v_float32x4() {} + explicit v_float32x4(v4f32 v) : val(v) {} + v_float32x4(float v0, float v1, float v2, float v3) + { + float v[] = {v0, v1, v2, v3}; + val = msa_ld1q_f32(v); + } + + float get0() const + { + return msa_getq_lane_f32(val, 0); + } + + v4f32 val; +}; + +struct v_uint64x2 +{ + typedef uint64 
lane_type; + enum { nlanes = 2 }; + + v_uint64x2() {} + explicit v_uint64x2(v2u64 v) : val(v) {} + v_uint64x2(uint64 v0, uint64 v1) + { + uint64 v[] = {v0, v1}; + val = msa_ld1q_u64(v); + } + + uint64 get0() const + { + return msa_getq_lane_u64(val, 0); + } + + v2u64 val; +}; + +struct v_int64x2 +{ + typedef int64 lane_type; + enum { nlanes = 2 }; + + v_int64x2() {} + explicit v_int64x2(v2i64 v) : val(v) {} + v_int64x2(int64 v0, int64 v1) + { + int64 v[] = {v0, v1}; + val = msa_ld1q_s64(v); + } + + int64 get0() const + { + return msa_getq_lane_s64(val, 0); + } + + v2i64 val; +}; + +struct v_float64x2 +{ + typedef double lane_type; + enum { nlanes = 2 }; + + v_float64x2() {} + explicit v_float64x2(v2f64 v) : val(v) {} + v_float64x2(double v0, double v1) + { + double v[] = {v0, v1}; + val = msa_ld1q_f64(v); + } + + double get0() const + { + return msa_getq_lane_f64(val, 0); + } + + v2f64 val; +}; + +#define OPENCV_HAL_IMPL_MSA_INIT(_Tpv, _Tp, suffix) \ +inline v_##_Tpv v_setzero_##suffix() { return v_##_Tpv(msa_dupq_n_##suffix((_Tp)0)); } \ +inline v_##_Tpv v_setall_##suffix(_Tp v) { return v_##_Tpv(msa_dupq_n_##suffix(v)); } \ +inline v_uint8x16 v_reinterpret_as_u8(const v_##_Tpv& v) { return v_uint8x16(MSA_TPV_REINTERPRET(v16u8, v.val)); } \ +inline v_int8x16 v_reinterpret_as_s8(const v_##_Tpv& v) { return v_int8x16(MSA_TPV_REINTERPRET(v16i8, v.val)); } \ +inline v_uint16x8 v_reinterpret_as_u16(const v_##_Tpv& v) { return v_uint16x8(MSA_TPV_REINTERPRET(v8u16, v.val)); } \ +inline v_int16x8 v_reinterpret_as_s16(const v_##_Tpv& v) { return v_int16x8(MSA_TPV_REINTERPRET(v8i16, v.val)); } \ +inline v_uint32x4 v_reinterpret_as_u32(const v_##_Tpv& v) { return v_uint32x4(MSA_TPV_REINTERPRET(v4u32, v.val)); } \ +inline v_int32x4 v_reinterpret_as_s32(const v_##_Tpv& v) { return v_int32x4(MSA_TPV_REINTERPRET(v4i32, v.val)); } \ +inline v_uint64x2 v_reinterpret_as_u64(const v_##_Tpv& v) { return v_uint64x2(MSA_TPV_REINTERPRET(v2u64, v.val)); } \ +inline v_int64x2 v_reinterpret_as_s64(const v_##_Tpv& v) { return v_int64x2(MSA_TPV_REINTERPRET(v2i64, v.val)); } \ +inline v_float32x4 v_reinterpret_as_f32(const v_##_Tpv& v) { return v_float32x4(MSA_TPV_REINTERPRET(v4f32, v.val)); } \ +inline v_float64x2 v_reinterpret_as_f64(const v_##_Tpv& v) { return v_float64x2(MSA_TPV_REINTERPRET(v2f64, v.val)); } + +OPENCV_HAL_IMPL_MSA_INIT(uint8x16, uchar, u8) +OPENCV_HAL_IMPL_MSA_INIT(int8x16, schar, s8) +OPENCV_HAL_IMPL_MSA_INIT(uint16x8, ushort, u16) +OPENCV_HAL_IMPL_MSA_INIT(int16x8, short, s16) +OPENCV_HAL_IMPL_MSA_INIT(uint32x4, unsigned int, u32) +OPENCV_HAL_IMPL_MSA_INIT(int32x4, int, s32) +OPENCV_HAL_IMPL_MSA_INIT(uint64x2, uint64, u64) +OPENCV_HAL_IMPL_MSA_INIT(int64x2, int64, s64) +OPENCV_HAL_IMPL_MSA_INIT(float32x4, float, f32) +OPENCV_HAL_IMPL_MSA_INIT(float64x2, double, f64) + +#define OPENCV_HAL_IMPL_MSA_PACK(_Tpvec, _Tpwvec, pack, mov, rshr) \ +inline _Tpvec v_##pack(const _Tpwvec& a, const _Tpwvec& b) \ +{ \ + return _Tpvec(mov(a.val, b.val)); \ +} \ +template<int n> inline \ +_Tpvec v_rshr_##pack(const _Tpwvec& a, const _Tpwvec& b) \ +{ \ + return _Tpvec(rshr(a.val, b.val, n)); \ +} + +OPENCV_HAL_IMPL_MSA_PACK(v_uint8x16, v_uint16x8, pack, msa_qpack_u16, msa_qrpackr_u16) +OPENCV_HAL_IMPL_MSA_PACK(v_int8x16, v_int16x8, pack, msa_qpack_s16, msa_qrpackr_s16) +OPENCV_HAL_IMPL_MSA_PACK(v_uint16x8, v_uint32x4, pack, msa_qpack_u32, msa_qrpackr_u32) +OPENCV_HAL_IMPL_MSA_PACK(v_int16x8, v_int32x4, pack, msa_qpack_s32, msa_qrpackr_s32) +OPENCV_HAL_IMPL_MSA_PACK(v_uint32x4, v_uint64x2, pack, msa_pack_u64,
msa_rpackr_u64) +OPENCV_HAL_IMPL_MSA_PACK(v_int32x4, v_int64x2, pack, msa_pack_s64, msa_rpackr_s64) +OPENCV_HAL_IMPL_MSA_PACK(v_uint8x16, v_int16x8, pack_u, msa_qpacku_s16, msa_qrpackru_s16) +OPENCV_HAL_IMPL_MSA_PACK(v_uint16x8, v_int32x4, pack_u, msa_qpacku_s32, msa_qrpackru_s32) + +#define OPENCV_HAL_IMPL_MSA_PACK_STORE(_Tpvec, _Tp, hreg, suffix, _Tpwvec, pack, mov, rshr) \ +inline void v_##pack##_store(_Tp* ptr, const _Tpwvec& a) \ +{ \ + hreg a1 = mov(a.val); \ + msa_st1_##suffix(ptr, a1); \ +} \ +template<int n> inline \ +void v_rshr_##pack##_store(_Tp* ptr, const _Tpwvec& a) \ +{ \ + hreg a1 = rshr(a.val, n); \ + msa_st1_##suffix(ptr, a1); \ +} + +OPENCV_HAL_IMPL_MSA_PACK_STORE(v_uint8x16, uchar, v8u8, u8, v_uint16x8, pack, msa_qmovn_u16, msa_qrshrn_n_u16) +OPENCV_HAL_IMPL_MSA_PACK_STORE(v_int8x16, schar, v8i8, s8, v_int16x8, pack, msa_qmovn_s16, msa_qrshrn_n_s16) +OPENCV_HAL_IMPL_MSA_PACK_STORE(v_uint16x8, ushort, v4u16, u16, v_uint32x4, pack, msa_qmovn_u32, msa_qrshrn_n_u32) +OPENCV_HAL_IMPL_MSA_PACK_STORE(v_int16x8, short, v4i16, s16, v_int32x4, pack, msa_qmovn_s32, msa_qrshrn_n_s32) +OPENCV_HAL_IMPL_MSA_PACK_STORE(v_uint32x4, unsigned, v2u32, u32, v_uint64x2, pack, msa_movn_u64, msa_rshrn_n_u64) +OPENCV_HAL_IMPL_MSA_PACK_STORE(v_int32x4, int, v2i32, s32, v_int64x2, pack, msa_movn_s64, msa_rshrn_n_s64) +OPENCV_HAL_IMPL_MSA_PACK_STORE(v_uint8x16, uchar, v8u8, u8, v_int16x8, pack_u, msa_qmovun_s16, msa_qrshrun_n_s16) +OPENCV_HAL_IMPL_MSA_PACK_STORE(v_uint16x8, ushort, v4u16, u16, v_int32x4, pack_u, msa_qmovun_s32, msa_qrshrun_n_s32) + +// pack boolean +inline v_uint8x16 v_pack_b(const v_uint16x8& a, const v_uint16x8& b) +{ + return v_uint8x16(msa_pack_u16(a.val, b.val)); +} + +inline v_uint8x16 v_pack_b(const v_uint32x4& a, const v_uint32x4& b, + const v_uint32x4& c, const v_uint32x4& d) +{ + return v_uint8x16(msa_pack_u16(msa_pack_u32(a.val, b.val), msa_pack_u32(c.val, d.val))); +} + +inline v_uint8x16 v_pack_b(const v_uint64x2& a, const v_uint64x2& b, const v_uint64x2& c, + const v_uint64x2& d, const v_uint64x2& e, const v_uint64x2& f, + const v_uint64x2& g, const v_uint64x2& h) +{ + v8u16 abcd = msa_pack_u32(msa_pack_u64(a.val, b.val), msa_pack_u64(c.val, d.val)); + v8u16 efgh = msa_pack_u32(msa_pack_u64(e.val, f.val), msa_pack_u64(g.val, h.val)); + return v_uint8x16(msa_pack_u16(abcd, efgh)); +} + +inline v_float32x4 v_matmul(const v_float32x4& v, const v_float32x4& m0, + const v_float32x4& m1, const v_float32x4& m2, + const v_float32x4& m3) +{ + v4f32 v0 = v.val; + v4f32 res = msa_mulq_lane_f32(m0.val, v0, 0); + res = msa_mlaq_lane_f32(res, m1.val, v0, 1); + res = msa_mlaq_lane_f32(res, m2.val, v0, 2); + res = msa_mlaq_lane_f32(res, m3.val, v0, 3); + return v_float32x4(res); +} + +inline v_float32x4 v_matmuladd(const v_float32x4& v, const v_float32x4& m0, + const v_float32x4& m1, const v_float32x4& m2, + const v_float32x4& a) +{ + v4f32 v0 = v.val; + v4f32 res = msa_mulq_lane_f32(m0.val, v0, 0); + res = msa_mlaq_lane_f32(res, m1.val, v0, 1); + res = msa_mlaq_lane_f32(res, m2.val, v0, 2); + res = msa_addq_f32(res, a.val); + return v_float32x4(res); +} + +#define OPENCV_HAL_IMPL_MSA_BIN_OP(bin_op, _Tpvec, intrin) \ +inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \ +{ \ + return _Tpvec(intrin(a.val, b.val)); \ +} \ +inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b) \ +{ \ + a.val = intrin(a.val, b.val); \ + return a; \ +} + +OPENCV_HAL_IMPL_MSA_BIN_OP(+, v_uint8x16, msa_qaddq_u8) +OPENCV_HAL_IMPL_MSA_BIN_OP(-, v_uint8x16, msa_qsubq_u8)
+OPENCV_HAL_IMPL_MSA_BIN_OP(+, v_int8x16, msa_qaddq_s8) +OPENCV_HAL_IMPL_MSA_BIN_OP(-, v_int8x16, msa_qsubq_s8) +OPENCV_HAL_IMPL_MSA_BIN_OP(+, v_uint16x8, msa_qaddq_u16) +OPENCV_HAL_IMPL_MSA_BIN_OP(-, v_uint16x8, msa_qsubq_u16) +OPENCV_HAL_IMPL_MSA_BIN_OP(+, v_int16x8, msa_qaddq_s16) +OPENCV_HAL_IMPL_MSA_BIN_OP(-, v_int16x8, msa_qsubq_s16) +OPENCV_HAL_IMPL_MSA_BIN_OP(+, v_int32x4, msa_addq_s32) +OPENCV_HAL_IMPL_MSA_BIN_OP(-, v_int32x4, msa_subq_s32) +OPENCV_HAL_IMPL_MSA_BIN_OP(*, v_int32x4, msa_mulq_s32) +OPENCV_HAL_IMPL_MSA_BIN_OP(+, v_uint32x4, msa_addq_u32) +OPENCV_HAL_IMPL_MSA_BIN_OP(-, v_uint32x4, msa_subq_u32) +OPENCV_HAL_IMPL_MSA_BIN_OP(*, v_uint32x4, msa_mulq_u32) +OPENCV_HAL_IMPL_MSA_BIN_OP(+, v_float32x4, msa_addq_f32) +OPENCV_HAL_IMPL_MSA_BIN_OP(-, v_float32x4, msa_subq_f32) +OPENCV_HAL_IMPL_MSA_BIN_OP(*, v_float32x4, msa_mulq_f32) +OPENCV_HAL_IMPL_MSA_BIN_OP(+, v_int64x2, msa_addq_s64) +OPENCV_HAL_IMPL_MSA_BIN_OP(-, v_int64x2, msa_subq_s64) +OPENCV_HAL_IMPL_MSA_BIN_OP(+, v_uint64x2, msa_addq_u64) +OPENCV_HAL_IMPL_MSA_BIN_OP(-, v_uint64x2, msa_subq_u64) +OPENCV_HAL_IMPL_MSA_BIN_OP(/, v_float32x4, msa_divq_f32) +OPENCV_HAL_IMPL_MSA_BIN_OP(+, v_float64x2, msa_addq_f64) +OPENCV_HAL_IMPL_MSA_BIN_OP(-, v_float64x2, msa_subq_f64) +OPENCV_HAL_IMPL_MSA_BIN_OP(*, v_float64x2, msa_mulq_f64) +OPENCV_HAL_IMPL_MSA_BIN_OP(/, v_float64x2, msa_divq_f64) + +// saturating multiply 8-bit, 16-bit +#define OPENCV_HAL_IMPL_MSA_MUL_SAT(_Tpvec, _Tpwvec) \ +inline _Tpvec operator * (const _Tpvec& a, const _Tpvec& b) \ +{ \ + _Tpwvec c, d; \ + v_mul_expand(a, b, c, d); \ + return v_pack(c, d); \ +} \ +inline _Tpvec& operator *= (_Tpvec& a, const _Tpvec& b) \ +{a = a * b; return a; } + +OPENCV_HAL_IMPL_MSA_MUL_SAT(v_int8x16, v_int16x8) +OPENCV_HAL_IMPL_MSA_MUL_SAT(v_uint8x16, v_uint16x8) +OPENCV_HAL_IMPL_MSA_MUL_SAT(v_int16x8, v_int32x4) +OPENCV_HAL_IMPL_MSA_MUL_SAT(v_uint16x8, v_uint32x4) + +// Multiply and expand +inline void v_mul_expand(const v_int8x16& a, const v_int8x16& b, + v_int16x8& c, v_int16x8& d) +{ + v16i8 a_lo, a_hi, b_lo, b_hi; + + ILVRL_B2_SB(a.val, msa_dupq_n_s8(0), a_lo, a_hi); + ILVRL_B2_SB(b.val, msa_dupq_n_s8(0), b_lo, b_hi); + c.val = msa_mulq_s16(msa_paddlq_s8(a_lo), msa_paddlq_s8(b_lo)); + d.val = msa_mulq_s16(msa_paddlq_s8(a_hi), msa_paddlq_s8(b_hi)); +} + +inline void v_mul_expand(const v_uint8x16& a, const v_uint8x16& b, + v_uint16x8& c, v_uint16x8& d) +{ + v16u8 a_lo, a_hi, b_lo, b_hi; + + ILVRL_B2_UB(a.val, msa_dupq_n_u8(0), a_lo, a_hi); + ILVRL_B2_UB(b.val, msa_dupq_n_u8(0), b_lo, b_hi); + c.val = msa_mulq_u16(msa_paddlq_u8(a_lo), msa_paddlq_u8(b_lo)); + d.val = msa_mulq_u16(msa_paddlq_u8(a_hi), msa_paddlq_u8(b_hi)); +} + +inline void v_mul_expand(const v_int16x8& a, const v_int16x8& b, + v_int32x4& c, v_int32x4& d) +{ + v8i16 a_lo, a_hi, b_lo, b_hi; + + ILVRL_H2_SH(a.val, msa_dupq_n_s16(0), a_lo, a_hi); + ILVRL_H2_SH(b.val, msa_dupq_n_s16(0), b_lo, b_hi); + c.val = msa_mulq_s32(msa_paddlq_s16(a_lo), msa_paddlq_s16(b_lo)); + d.val = msa_mulq_s32(msa_paddlq_s16(a_hi), msa_paddlq_s16(b_hi)); +} + +inline void v_mul_expand(const v_uint16x8& a, const v_uint16x8& b, + v_uint32x4& c, v_uint32x4& d) +{ + v8u16 a_lo, a_hi, b_lo, b_hi; + + ILVRL_H2_UH(a.val, msa_dupq_n_u16(0), a_lo, a_hi); + ILVRL_H2_UH(b.val, msa_dupq_n_u16(0), b_lo, b_hi); + c.val = msa_mulq_u32(msa_paddlq_u16(a_lo), msa_paddlq_u16(b_lo)); + d.val = msa_mulq_u32(msa_paddlq_u16(a_hi), msa_paddlq_u16(b_hi)); +} + +inline void v_mul_expand(const v_uint32x4& a, const v_uint32x4& b, + v_uint64x2& c, v_uint64x2& d) +{ + 
v4u32 a_lo, a_hi, b_lo, b_hi; + + ILVRL_W2_UW(a.val, msa_dupq_n_u32(0), a_lo, a_hi); + ILVRL_W2_UW(b.val, msa_dupq_n_u32(0), b_lo, b_hi); + c.val = msa_mulq_u64(msa_paddlq_u32(a_lo), msa_paddlq_u32(b_lo)); + d.val = msa_mulq_u64(msa_paddlq_u32(a_hi), msa_paddlq_u32(b_hi)); +} + +inline v_int16x8 v_mul_hi(const v_int16x8& a, const v_int16x8& b) +{ + v8i16 a_lo, a_hi, b_lo, b_hi; + + ILVRL_H2_SH(a.val, msa_dupq_n_s16(0), a_lo, a_hi); + ILVRL_H2_SH(b.val, msa_dupq_n_s16(0), b_lo, b_hi); + + return v_int16x8(msa_packr_s32(msa_mulq_s32(msa_paddlq_s16(a_lo), msa_paddlq_s16(b_lo)), + msa_mulq_s32(msa_paddlq_s16(a_hi), msa_paddlq_s16(b_hi)), 16)); +} + +inline v_uint16x8 v_mul_hi(const v_uint16x8& a, const v_uint16x8& b) +{ + v8u16 a_lo, a_hi, b_lo, b_hi; + + ILVRL_H2_UH(a.val, msa_dupq_n_u16(0), a_lo, a_hi); + ILVRL_H2_UH(b.val, msa_dupq_n_u16(0), b_lo, b_hi); + + return v_uint16x8(msa_packr_u32(msa_mulq_u32(msa_paddlq_u16(a_lo), msa_paddlq_u16(b_lo)), + msa_mulq_u32(msa_paddlq_u16(a_hi), msa_paddlq_u16(b_hi)), 16)); +} + +//////// Dot Product //////// + +// 16 >> 32 +inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b) +{ return v_int32x4(msa_dotp_s_w(a.val, b.val)); } +inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c) +{ return v_int32x4(msa_dpadd_s_w(c.val , a.val, b.val)); } + +// 32 >> 64 +inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b) +{ return v_int64x2(msa_dotp_s_d(a.val, b.val)); } +inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b, const v_int64x2& c) +{ return v_int64x2(msa_dpadd_s_d(c.val , a.val, b.val)); } + +// 8 >> 32 +inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b) +{ + v8u16 even_a = msa_shrq_n_u16(msa_shlq_n_u16(MSA_TPV_REINTERPRET(v8u16, a.val), 8), 8); + v8u16 odd_a = msa_shrq_n_u16(MSA_TPV_REINTERPRET(v8u16, a.val), 8); + v8u16 even_b = msa_shrq_n_u16(msa_shlq_n_u16(MSA_TPV_REINTERPRET(v8u16, b.val), 8), 8); + v8u16 odd_b = msa_shrq_n_u16(MSA_TPV_REINTERPRET(v8u16, b.val), 8); + v4u32 prod = msa_dotp_u_w(even_a, even_b); + return v_uint32x4(msa_dpadd_u_w(prod, odd_a, odd_b)); +} +inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b, const v_uint32x4& c) +{ + v8u16 even_a = msa_shrq_n_u16(msa_shlq_n_u16(MSA_TPV_REINTERPRET(v8u16, a.val), 8), 8); + v8u16 odd_a = msa_shrq_n_u16(MSA_TPV_REINTERPRET(v8u16, a.val), 8); + v8u16 even_b = msa_shrq_n_u16(msa_shlq_n_u16(MSA_TPV_REINTERPRET(v8u16, b.val), 8), 8); + v8u16 odd_b = msa_shrq_n_u16(MSA_TPV_REINTERPRET(v8u16, b.val), 8); + v4u32 prod = msa_dpadd_u_w(c.val, even_a, even_b); + return v_uint32x4(msa_dpadd_u_w(prod, odd_a, odd_b)); +} + +inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b) +{ + v8i16 prod = msa_dotp_s_h(a.val, b.val); + return v_int32x4(msa_hadd_s32(prod, prod)); +} +inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b, + const v_int32x4& c) +{ return v_dotprod_expand(a, b) + c; } + +// 16 >> 64 +inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b) +{ + v4u32 even_a = msa_shrq_n_u32(msa_shlq_n_u32(MSA_TPV_REINTERPRET(v4u32, a.val), 16), 16); + v4u32 odd_a = msa_shrq_n_u32(MSA_TPV_REINTERPRET(v4u32, a.val), 16); + v4u32 even_b = msa_shrq_n_u32(msa_shlq_n_u32(MSA_TPV_REINTERPRET(v4u32, b.val), 16), 16); + v4u32 odd_b = msa_shrq_n_u32(MSA_TPV_REINTERPRET(v4u32, b.val), 16); + v2u64 prod = msa_dotp_u_d(even_a, even_b); + return v_uint64x2(msa_dpadd_u_d(prod, odd_a, odd_b)); +} +inline v_uint64x2 
v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b, + const v_uint64x2& c) +{ + v4u32 even_a = msa_shrq_n_u32(msa_shlq_n_u32(MSA_TPV_REINTERPRET(v4u32, a.val), 16), 16); + v4u32 odd_a = msa_shrq_n_u32(MSA_TPV_REINTERPRET(v4u32, a.val), 16); + v4u32 even_b = msa_shrq_n_u32(msa_shlq_n_u32(MSA_TPV_REINTERPRET(v4u32, b.val), 16), 16); + v4u32 odd_b = msa_shrq_n_u32(MSA_TPV_REINTERPRET(v4u32, b.val), 16); + v2u64 prod = msa_dpadd_u_d(c.val, even_a, even_b); + return v_uint64x2(msa_dpadd_u_d(prod, odd_a, odd_b)); +} + +inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b) +{ + v4i32 prod = msa_dotp_s_w(a.val, b.val); + return v_int64x2(msa_hadd_s64(prod, prod)); +} +inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b, const v_int64x2& c) +{ return v_dotprod_expand(a, b) + c; } + +// 32 >> 64f +inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b) +{ return v_cvt_f64(v_dotprod(a, b)); } +inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b, const v_float64x2& c) +{ return v_dotprod_expand(a, b) + c; } + + +//////// Fast Dot Product //////// + +// 16 >> 32 +inline v_int32x4 v_dotprod_fast(const v_int16x8& a, const v_int16x8& b) +{ return v_dotprod(a, b); } +inline v_int32x4 v_dotprod_fast(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c) +{ return v_dotprod(a, b, c); } + +// 32 >> 64 +inline v_int64x2 v_dotprod_fast(const v_int32x4& a, const v_int32x4& b) +{ return v_dotprod(a, b); } +inline v_int64x2 v_dotprod_fast(const v_int32x4& a, const v_int32x4& b, const v_int64x2& c) +{ return v_dotprod(a, b, c); } + +// 8 >> 32 +inline v_uint32x4 v_dotprod_expand_fast(const v_uint8x16& a, const v_uint8x16& b) +{ return v_dotprod_expand(a, b); } +inline v_uint32x4 v_dotprod_expand_fast(const v_uint8x16& a, const v_uint8x16& b, const v_uint32x4& c) +{ return v_dotprod_expand(a, b, c); } +inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b) +{ return v_dotprod_expand(a, b); } +inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b, const v_int32x4& c) +{ return v_dotprod_expand(a, b, c); } + +// 16 >> 64 +inline v_uint64x2 v_dotprod_expand_fast(const v_uint16x8& a, const v_uint16x8& b) +{ return v_dotprod_expand(a, b); } +inline v_uint64x2 v_dotprod_expand_fast(const v_uint16x8& a, const v_uint16x8& b, const v_uint64x2& c) +{ return v_dotprod_expand(a, b, c); } +inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b) +{ return v_dotprod_expand(a, b); } +inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b, const v_int64x2& c) +{ return v_dotprod_expand(a, b, c); } + +// 32 >> 64f +inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b) +{ return v_dotprod_expand(a, b); } +inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b, const v_float64x2& c) +{ return v_dotprod_expand(a, b, c); } + +#define OPENCV_HAL_IMPL_MSA_LOGIC_OP(_Tpvec, _Tpv, suffix) \ +OPENCV_HAL_IMPL_MSA_BIN_OP(&, _Tpvec, msa_andq_##suffix) \ +OPENCV_HAL_IMPL_MSA_BIN_OP(|, _Tpvec, msa_orrq_##suffix) \ +OPENCV_HAL_IMPL_MSA_BIN_OP(^, _Tpvec, msa_eorq_##suffix) \ +inline _Tpvec operator ~ (const _Tpvec& a) \ +{ \ + return _Tpvec(MSA_TPV_REINTERPRET(_Tpv, msa_mvnq_u8(MSA_TPV_REINTERPRET(v16u8, a.val)))); \ +} + +OPENCV_HAL_IMPL_MSA_LOGIC_OP(v_uint8x16, v16u8, u8) +OPENCV_HAL_IMPL_MSA_LOGIC_OP(v_int8x16, v16i8, s8) +OPENCV_HAL_IMPL_MSA_LOGIC_OP(v_uint16x8, v8u16, u16) 
+OPENCV_HAL_IMPL_MSA_LOGIC_OP(v_int16x8, v8i16, s16) +OPENCV_HAL_IMPL_MSA_LOGIC_OP(v_uint32x4, v4u32, u32) +OPENCV_HAL_IMPL_MSA_LOGIC_OP(v_int32x4, v4i32, s32) +OPENCV_HAL_IMPL_MSA_LOGIC_OP(v_uint64x2, v2u64, u64) +OPENCV_HAL_IMPL_MSA_LOGIC_OP(v_int64x2, v2i64, s64) + +#define OPENCV_HAL_IMPL_MSA_FLT_BIT_OP(bin_op, intrin) \ +inline v_float32x4 operator bin_op (const v_float32x4& a, const v_float32x4& b) \ +{ \ + return v_float32x4(MSA_TPV_REINTERPRET(v4f32, intrin(MSA_TPV_REINTERPRET(v4i32, a.val), MSA_TPV_REINTERPRET(v4i32, b.val)))); \ +} \ +inline v_float32x4& operator bin_op##= (v_float32x4& a, const v_float32x4& b) \ +{ \ + a.val = MSA_TPV_REINTERPRET(v4f32, intrin(MSA_TPV_REINTERPRET(v4i32, a.val), MSA_TPV_REINTERPRET(v4i32, b.val))); \ + return a; \ +} + +OPENCV_HAL_IMPL_MSA_FLT_BIT_OP(&, msa_andq_s32) +OPENCV_HAL_IMPL_MSA_FLT_BIT_OP(|, msa_orrq_s32) +OPENCV_HAL_IMPL_MSA_FLT_BIT_OP(^, msa_eorq_s32) + +inline v_float32x4 operator ~ (const v_float32x4& a) +{ + return v_float32x4(MSA_TPV_REINTERPRET(v4f32, msa_mvnq_s32(MSA_TPV_REINTERPRET(v4i32, a.val)))); +} + +/* v_abs */ +#define OPENCV_HAL_IMPL_MSA_ABS(_Tpuvec, _Tpsvec, usuffix, ssuffix) \ +inline _Tpuvec v_abs(const _Tpsvec& a) \ +{ \ + return v_reinterpret_as_##usuffix(_Tpsvec(msa_absq_##ssuffix(a.val))); \ +} + +OPENCV_HAL_IMPL_MSA_ABS(v_uint8x16, v_int8x16, u8, s8) +OPENCV_HAL_IMPL_MSA_ABS(v_uint16x8, v_int16x8, u16, s16) +OPENCV_HAL_IMPL_MSA_ABS(v_uint32x4, v_int32x4, u32, s32) + +/* v_abs(float), v_sqrt, v_invsqrt */ +#define OPENCV_HAL_IMPL_MSA_BASIC_FUNC(_Tpvec, func, intrin) \ +inline _Tpvec func(const _Tpvec& a) \ +{ \ + return _Tpvec(intrin(a.val)); \ +} + +OPENCV_HAL_IMPL_MSA_BASIC_FUNC(v_float32x4, v_abs, msa_absq_f32) +OPENCV_HAL_IMPL_MSA_BASIC_FUNC(v_float64x2, v_abs, msa_absq_f64) +OPENCV_HAL_IMPL_MSA_BASIC_FUNC(v_float32x4, v_sqrt, msa_sqrtq_f32) +OPENCV_HAL_IMPL_MSA_BASIC_FUNC(v_float32x4, v_invsqrt, msa_rsqrtq_f32) +OPENCV_HAL_IMPL_MSA_BASIC_FUNC(v_float64x2, v_sqrt, msa_sqrtq_f64) +OPENCV_HAL_IMPL_MSA_BASIC_FUNC(v_float64x2, v_invsqrt, msa_rsqrtq_f64) + +#define OPENCV_HAL_IMPL_MSA_DBL_BIT_OP(bin_op, intrin) \ +inline v_float64x2 operator bin_op (const v_float64x2& a, const v_float64x2& b) \ +{ \ + return v_float64x2(MSA_TPV_REINTERPRET(v2f64, intrin(MSA_TPV_REINTERPRET(v2i64, a.val), MSA_TPV_REINTERPRET(v2i64, b.val)))); \ +} \ +inline v_float64x2& operator bin_op##= (v_float64x2& a, const v_float64x2& b) \ +{ \ + a.val = MSA_TPV_REINTERPRET(v2f64, intrin(MSA_TPV_REINTERPRET(v2i64, a.val), MSA_TPV_REINTERPRET(v2i64, b.val))); \ + return a; \ +} + +OPENCV_HAL_IMPL_MSA_DBL_BIT_OP(&, msa_andq_s64) +OPENCV_HAL_IMPL_MSA_DBL_BIT_OP(|, msa_orrq_s64) +OPENCV_HAL_IMPL_MSA_DBL_BIT_OP(^, msa_eorq_s64) + +inline v_float64x2 operator ~ (const v_float64x2& a) +{ + return v_float64x2(MSA_TPV_REINTERPRET(v2f64, msa_mvnq_s32(MSA_TPV_REINTERPRET(v4i32, a.val)))); +} + +// TODO: exp, log, sin, cos + +#define OPENCV_HAL_IMPL_MSA_BIN_FUNC(_Tpvec, func, intrin) \ +inline _Tpvec func(const _Tpvec& a, const _Tpvec& b) \ +{ \ + return _Tpvec(intrin(a.val, b.val)); \ +} + +OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_uint8x16, v_min, msa_minq_u8) +OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_uint8x16, v_max, msa_maxq_u8) +OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_int8x16, v_min, msa_minq_s8) +OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_int8x16, v_max, msa_maxq_s8) +OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_uint16x8, v_min, msa_minq_u16) +OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_uint16x8, v_max, msa_maxq_u16) +OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_int16x8, v_min, msa_minq_s16) 
+OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_int16x8, v_max, msa_maxq_s16) +OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_uint32x4, v_min, msa_minq_u32) +OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_uint32x4, v_max, msa_maxq_u32) +OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_int32x4, v_min, msa_minq_s32) +OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_int32x4, v_max, msa_maxq_s32) +OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_float32x4, v_min, msa_minq_f32) +OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_float32x4, v_max, msa_maxq_f32) +OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_float64x2, v_min, msa_minq_f64) +OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_float64x2, v_max, msa_maxq_f64) + +#define OPENCV_HAL_IMPL_MSA_INT_CMP_OP(_Tpvec, _Tpv, suffix, not_suffix) \ +inline _Tpvec operator == (const _Tpvec& a, const _Tpvec& b) \ +{ return _Tpvec(MSA_TPV_REINTERPRET(_Tpv, msa_ceqq_##suffix(a.val, b.val))); } \ +inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \ +{ return _Tpvec(MSA_TPV_REINTERPRET(_Tpv, msa_mvnq_##not_suffix(msa_ceqq_##suffix(a.val, b.val)))); } \ +inline _Tpvec operator < (const _Tpvec& a, const _Tpvec& b) \ +{ return _Tpvec(MSA_TPV_REINTERPRET(_Tpv, msa_cltq_##suffix(a.val, b.val))); } \ +inline _Tpvec operator > (const _Tpvec& a, const _Tpvec& b) \ +{ return _Tpvec(MSA_TPV_REINTERPRET(_Tpv, msa_cgtq_##suffix(a.val, b.val))); } \ +inline _Tpvec operator <= (const _Tpvec& a, const _Tpvec& b) \ +{ return _Tpvec(MSA_TPV_REINTERPRET(_Tpv, msa_cleq_##suffix(a.val, b.val))); } \ +inline _Tpvec operator >= (const _Tpvec& a, const _Tpvec& b) \ +{ return _Tpvec(MSA_TPV_REINTERPRET(_Tpv, msa_cgeq_##suffix(a.val, b.val))); } + +OPENCV_HAL_IMPL_MSA_INT_CMP_OP(v_uint8x16, v16u8, u8, u8) +OPENCV_HAL_IMPL_MSA_INT_CMP_OP(v_int8x16, v16i8, s8, u8) +OPENCV_HAL_IMPL_MSA_INT_CMP_OP(v_uint16x8, v8u16, u16, u16) +OPENCV_HAL_IMPL_MSA_INT_CMP_OP(v_int16x8, v8i16, s16, u16) +OPENCV_HAL_IMPL_MSA_INT_CMP_OP(v_uint32x4, v4u32, u32, u32) +OPENCV_HAL_IMPL_MSA_INT_CMP_OP(v_int32x4, v4i32, s32, u32) +OPENCV_HAL_IMPL_MSA_INT_CMP_OP(v_float32x4, v4f32, f32, u32) +OPENCV_HAL_IMPL_MSA_INT_CMP_OP(v_uint64x2, v2u64, u64, u64) +OPENCV_HAL_IMPL_MSA_INT_CMP_OP(v_int64x2, v2i64, s64, u64) +OPENCV_HAL_IMPL_MSA_INT_CMP_OP(v_float64x2, v2f64, f64, u64) + +inline v_float32x4 v_not_nan(const v_float32x4& a) +{ return v_float32x4(MSA_TPV_REINTERPRET(v4f32, msa_ceqq_f32(a.val, a.val))); } +inline v_float64x2 v_not_nan(const v_float64x2& a) +{ return v_float64x2(MSA_TPV_REINTERPRET(v2f64, msa_ceqq_f64(a.val, a.val))); } + +OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_uint8x16, v_add_wrap, msa_addq_u8) +OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_int8x16, v_add_wrap, msa_addq_s8) +OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_uint16x8, v_add_wrap, msa_addq_u16) +OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_int16x8, v_add_wrap, msa_addq_s16) +OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_uint8x16, v_sub_wrap, msa_subq_u8) +OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_int8x16, v_sub_wrap, msa_subq_s8) +OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_uint16x8, v_sub_wrap, msa_subq_u16) +OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_int16x8, v_sub_wrap, msa_subq_s16) +OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_uint8x16, v_mul_wrap, msa_mulq_u8) +OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_int8x16, v_mul_wrap, msa_mulq_s8) +OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_uint16x8, v_mul_wrap, msa_mulq_u16) +OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_int16x8, v_mul_wrap, msa_mulq_s16) + +OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_uint8x16, v_absdiff, msa_abdq_u8) +OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_uint16x8, v_absdiff, msa_abdq_u16) +OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_uint32x4, v_absdiff, msa_abdq_u32) +OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_float32x4, v_absdiff, msa_abdq_f32) +OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_float64x2, v_absdiff, 
msa_abdq_f64) + +/** Saturating absolute difference **/ +OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_int8x16, v_absdiffs, msa_qabdq_s8) +OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_int16x8, v_absdiffs, msa_qabdq_s16) + +#define OPENCV_HAL_IMPL_MSA_BIN_FUNC2(_Tpvec, _Tpvec2, _Tpv, func, intrin) \ +inline _Tpvec2 func(const _Tpvec& a, const _Tpvec& b) \ +{ \ + return _Tpvec2(MSA_TPV_REINTERPRET(_Tpv, intrin(a.val, b.val))); \ +} + +OPENCV_HAL_IMPL_MSA_BIN_FUNC2(v_int8x16, v_uint8x16, v16u8, v_absdiff, msa_abdq_s8) +OPENCV_HAL_IMPL_MSA_BIN_FUNC2(v_int16x8, v_uint16x8, v8u16, v_absdiff, msa_abdq_s16) +OPENCV_HAL_IMPL_MSA_BIN_FUNC2(v_int32x4, v_uint32x4, v4u32, v_absdiff, msa_abdq_s32) + +/* v_magnitude, v_sqr_magnitude, v_fma, v_muladd */ +inline v_float32x4 v_magnitude(const v_float32x4& a, const v_float32x4& b) +{ + v_float32x4 x(msa_mlaq_f32(msa_mulq_f32(a.val, a.val), b.val, b.val)); + return v_sqrt(x); +} + +inline v_float32x4 v_sqr_magnitude(const v_float32x4& a, const v_float32x4& b) +{ + return v_float32x4(msa_mlaq_f32(msa_mulq_f32(a.val, a.val), b.val, b.val)); +} + +inline v_float32x4 v_fma(const v_float32x4& a, const v_float32x4& b, const v_float32x4& c) +{ + return v_float32x4(msa_mlaq_f32(c.val, a.val, b.val)); +} + +inline v_int32x4 v_fma(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c) +{ + return v_int32x4(msa_mlaq_s32(c.val, a.val, b.val)); +} + +inline v_float32x4 v_muladd(const v_float32x4& a, const v_float32x4& b, const v_float32x4& c) +{ + return v_fma(a, b, c); +} + +inline v_int32x4 v_muladd(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c) +{ + return v_fma(a, b, c); +} + +inline v_float64x2 v_magnitude(const v_float64x2& a, const v_float64x2& b) +{ + v_float64x2 x(msa_mlaq_f64(msa_mulq_f64(a.val, a.val), b.val, b.val)); + return v_sqrt(x); +} + +inline v_float64x2 v_sqr_magnitude(const v_float64x2& a, const v_float64x2& b) +{ + return v_float64x2(msa_mlaq_f64(msa_mulq_f64(a.val, a.val), b.val, b.val)); +} + +inline v_float64x2 v_fma(const v_float64x2& a, const v_float64x2& b, const v_float64x2& c) +{ + return v_float64x2(msa_mlaq_f64(c.val, a.val, b.val)); +} + +inline v_float64x2 v_muladd(const v_float64x2& a, const v_float64x2& b, const v_float64x2& c) +{ + return v_fma(a, b, c); +} + +// trade efficiency for convenience +#define OPENCV_HAL_IMPL_MSA_SHIFT_OP(_Tpvec, suffix, _Tps, ssuffix) \ +inline _Tpvec operator << (const _Tpvec& a, int n) \ +{ return _Tpvec(msa_shlq_##suffix(a.val, msa_dupq_n_##ssuffix((_Tps)n))); } \ +inline _Tpvec operator >> (const _Tpvec& a, int n) \ +{ return _Tpvec(msa_shrq_##suffix(a.val, msa_dupq_n_##ssuffix((_Tps)n))); } \ +template inline _Tpvec v_shl(const _Tpvec& a) \ +{ return _Tpvec(msa_shlq_n_##suffix(a.val, n)); } \ +template inline _Tpvec v_shr(const _Tpvec& a) \ +{ return _Tpvec(msa_shrq_n_##suffix(a.val, n)); } \ +template inline _Tpvec v_rshr(const _Tpvec& a) \ +{ return _Tpvec(msa_rshrq_n_##suffix(a.val, n)); } + +OPENCV_HAL_IMPL_MSA_SHIFT_OP(v_uint8x16, u8, schar, s8) +OPENCV_HAL_IMPL_MSA_SHIFT_OP(v_int8x16, s8, schar, s8) +OPENCV_HAL_IMPL_MSA_SHIFT_OP(v_uint16x8, u16, short, s16) +OPENCV_HAL_IMPL_MSA_SHIFT_OP(v_int16x8, s16, short, s16) +OPENCV_HAL_IMPL_MSA_SHIFT_OP(v_uint32x4, u32, int, s32) +OPENCV_HAL_IMPL_MSA_SHIFT_OP(v_int32x4, s32, int, s32) +OPENCV_HAL_IMPL_MSA_SHIFT_OP(v_uint64x2, u64, int64, s64) +OPENCV_HAL_IMPL_MSA_SHIFT_OP(v_int64x2, s64, int64, s64) + +/* v_rotate_right, v_rotate_left */ +#define OPENCV_HAL_IMPL_MSA_ROTATE_OP(_Tpvec, _Tpv, _Tpvs, suffix) \ +template inline _Tpvec v_rotate_right(const _Tpvec& a) \ +{ 
\ + return _Tpvec(MSA_TPV_REINTERPRET(_Tpv, msa_extq_##suffix(MSA_TPV_REINTERPRET(_Tpvs, a.val), msa_dupq_n_##suffix(0), n))); \ +} \ +template inline _Tpvec v_rotate_left(const _Tpvec& a) \ +{ \ + return _Tpvec(MSA_TPV_REINTERPRET(_Tpv, msa_extq_##suffix(msa_dupq_n_##suffix(0), MSA_TPV_REINTERPRET(_Tpvs, a.val), _Tpvec::nlanes - n))); \ +} \ +template<> inline _Tpvec v_rotate_left<0>(const _Tpvec& a) \ +{ \ + return a; \ +} \ +template inline _Tpvec v_rotate_right(const _Tpvec& a, const _Tpvec& b) \ +{ \ + return _Tpvec(MSA_TPV_REINTERPRET(_Tpv, msa_extq_##suffix(MSA_TPV_REINTERPRET(_Tpvs, a.val), MSA_TPV_REINTERPRET(_Tpvs, b.val), n))); \ +} \ +template inline _Tpvec v_rotate_left(const _Tpvec& a, const _Tpvec& b) \ +{ \ + return _Tpvec(MSA_TPV_REINTERPRET(_Tpv, msa_extq_##suffix(MSA_TPV_REINTERPRET(_Tpvs, b.val), MSA_TPV_REINTERPRET(_Tpvs, a.val), _Tpvec::nlanes - n))); \ +} \ +template<> inline _Tpvec v_rotate_left<0>(const _Tpvec& a, const _Tpvec& b) \ +{ \ + CV_UNUSED(b); \ + return a; \ +} + +OPENCV_HAL_IMPL_MSA_ROTATE_OP(v_uint8x16, v16u8, v16i8, s8) +OPENCV_HAL_IMPL_MSA_ROTATE_OP(v_int8x16, v16i8, v16i8, s8) +OPENCV_HAL_IMPL_MSA_ROTATE_OP(v_uint16x8, v8u16, v8i16, s16) +OPENCV_HAL_IMPL_MSA_ROTATE_OP(v_int16x8, v8i16, v8i16, s16) +OPENCV_HAL_IMPL_MSA_ROTATE_OP(v_uint32x4, v4u32, v4i32, s32) +OPENCV_HAL_IMPL_MSA_ROTATE_OP(v_int32x4, v4i32, v4i32, s32) +OPENCV_HAL_IMPL_MSA_ROTATE_OP(v_float32x4, v4f32, v4i32, s32) +OPENCV_HAL_IMPL_MSA_ROTATE_OP(v_uint64x2, v2u64, v2i64, s64) +OPENCV_HAL_IMPL_MSA_ROTATE_OP(v_int64x2, v2i64, v2i64, s64) +OPENCV_HAL_IMPL_MSA_ROTATE_OP(v_float64x2, v2f64, v2i64, s64) + +#define OPENCV_HAL_IMPL_MSA_LOADSTORE_OP(_Tpvec, _Tp, suffix) \ +inline _Tpvec v_load(const _Tp* ptr) \ +{ return _Tpvec(msa_ld1q_##suffix(ptr)); } \ +inline _Tpvec v_load_aligned(const _Tp* ptr) \ +{ return _Tpvec(msa_ld1q_##suffix(ptr)); } \ +inline _Tpvec v_load_low(const _Tp* ptr) \ +{ return _Tpvec(msa_combine_##suffix(msa_ld1_##suffix(ptr), msa_dup_n_##suffix((_Tp)0))); } \ +inline _Tpvec v_load_halves(const _Tp* ptr0, const _Tp* ptr1) \ +{ return _Tpvec(msa_combine_##suffix(msa_ld1_##suffix(ptr0), msa_ld1_##suffix(ptr1))); } \ +inline void v_store(_Tp* ptr, const _Tpvec& a) \ +{ msa_st1q_##suffix(ptr, a.val); } \ +inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \ +{ msa_st1q_##suffix(ptr, a.val); } \ +inline void v_store_aligned_nocache(_Tp* ptr, const _Tpvec& a) \ +{ msa_st1q_##suffix(ptr, a.val); } \ +inline void v_store(_Tp* ptr, const _Tpvec& a, hal::StoreMode /*mode*/) \ +{ msa_st1q_##suffix(ptr, a.val); } \ +inline void v_store_low(_Tp* ptr, const _Tpvec& a) \ +{ \ + int n = _Tpvec::nlanes; \ + for( int i = 0; i < (n/2); i++ ) \ + ptr[i] = a.val[i]; \ +} \ +inline void v_store_high(_Tp* ptr, const _Tpvec& a) \ +{ \ + int n = _Tpvec::nlanes; \ + for( int i = 0; i < (n/2); i++ ) \ + ptr[i] = a.val[i+(n/2)]; \ +} + +OPENCV_HAL_IMPL_MSA_LOADSTORE_OP(v_uint8x16, uchar, u8) +OPENCV_HAL_IMPL_MSA_LOADSTORE_OP(v_int8x16, schar, s8) +OPENCV_HAL_IMPL_MSA_LOADSTORE_OP(v_uint16x8, ushort, u16) +OPENCV_HAL_IMPL_MSA_LOADSTORE_OP(v_int16x8, short, s16) +OPENCV_HAL_IMPL_MSA_LOADSTORE_OP(v_uint32x4, unsigned, u32) +OPENCV_HAL_IMPL_MSA_LOADSTORE_OP(v_int32x4, int, s32) +OPENCV_HAL_IMPL_MSA_LOADSTORE_OP(v_uint64x2, uint64, u64) +OPENCV_HAL_IMPL_MSA_LOADSTORE_OP(v_int64x2, int64, s64) +OPENCV_HAL_IMPL_MSA_LOADSTORE_OP(v_float32x4, float, f32) +OPENCV_HAL_IMPL_MSA_LOADSTORE_OP(v_float64x2, double, f64) + + +/** Reverse **/ +inline v_uint8x16 v_reverse(const v_uint8x16 &a) +{ + v_uint8x16 c 
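+    // (editor's note, not in the original header) The v2i64 constant below is a
+    // byte-index table {15,14,...,1,0}: MSA's VSHF.B writes input byte 15-i into
+    // lane i, so the lane order of 'a' is reversed with a single shuffle.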
= v_uint8x16((v16u8)__builtin_msa_vshf_b((v16i8)((v2i64){0x08090A0B0C0D0E0F, 0x0001020304050607}), msa_dupq_n_s8(0), (v16i8)a.val)); + return c; +} + +inline v_int8x16 v_reverse(const v_int8x16 &a) +{ return v_reinterpret_as_s8(v_reverse(v_reinterpret_as_u8(a))); } + +inline v_uint16x8 v_reverse(const v_uint16x8 &a) +{ + v_uint16x8 c = v_uint16x8((v8u16)__builtin_msa_vshf_h((v8i16)((v2i64){0x0004000500060007, 0x0000000100020003}), msa_dupq_n_s16(0), (v8i16)a.val)); + return c; +} + +inline v_int16x8 v_reverse(const v_int16x8 &a) +{ return v_reinterpret_as_s16(v_reverse(v_reinterpret_as_u16(a))); } + +inline v_uint32x4 v_reverse(const v_uint32x4 &a) +{ + v_uint32x4 c; + c.val[0] = a.val[3]; + c.val[1] = a.val[2]; + c.val[2] = a.val[1]; + c.val[3] = a.val[0]; + return c; +} + +inline v_int32x4 v_reverse(const v_int32x4 &a) +{ return v_reinterpret_as_s32(v_reverse(v_reinterpret_as_u32(a))); } + +inline v_float32x4 v_reverse(const v_float32x4 &a) +{ return v_reinterpret_as_f32(v_reverse(v_reinterpret_as_u32(a))); } + +inline v_uint64x2 v_reverse(const v_uint64x2 &a) +{ + v_uint64x2 c; + c.val[0] = a.val[1]; + c.val[1] = a.val[0]; + return c; +} + +inline v_int64x2 v_reverse(const v_int64x2 &a) +{ return v_reinterpret_as_s64(v_reverse(v_reinterpret_as_u64(a))); } + +inline v_float64x2 v_reverse(const v_float64x2 &a) +{ return v_reinterpret_as_f64(v_reverse(v_reinterpret_as_u64(a))); } + + +#define OPENCV_HAL_IMPL_MSA_REDUCE_OP_8U(func, cfunc) \ +inline unsigned short v_reduce_##func(const v_uint16x8& a) \ +{ \ + v8u16 a_lo, a_hi; \ + ILVRL_H2_UH(a.val, msa_dupq_n_u16(0), a_lo, a_hi); \ + v4u32 b = msa_##func##q_u32(msa_paddlq_u16(a_lo), msa_paddlq_u16(a_hi)); \ + v4u32 b_lo, b_hi; \ + ILVRL_W2_UW(b, msa_dupq_n_u32(0), b_lo, b_hi); \ + v2u64 c = msa_##func##q_u64(msa_paddlq_u32(b_lo), msa_paddlq_u32(b_hi)); \ + return (unsigned short)cfunc(c[0], c[1]); \ +} + +OPENCV_HAL_IMPL_MSA_REDUCE_OP_8U(max, std::max) +OPENCV_HAL_IMPL_MSA_REDUCE_OP_8U(min, std::min) + +#define OPENCV_HAL_IMPL_MSA_REDUCE_OP_8S(func, cfunc) \ +inline short v_reduce_##func(const v_int16x8& a) \ +{ \ + v8i16 a_lo, a_hi; \ + ILVRL_H2_SH(a.val, msa_dupq_n_s16(0), a_lo, a_hi); \ + v4i32 b = msa_##func##q_s32(msa_paddlq_s16(a_lo), msa_paddlq_s16(a_hi)); \ + v4i32 b_lo, b_hi; \ + ILVRL_W2_SW(b, msa_dupq_n_s32(0), b_lo, b_hi); \ + v2i64 c = msa_##func##q_s64(msa_paddlq_s32(b_lo), msa_paddlq_s32(b_hi)); \ + return (short)cfunc(c[0], c[1]); \ +} + +OPENCV_HAL_IMPL_MSA_REDUCE_OP_8S(max, std::max) +OPENCV_HAL_IMPL_MSA_REDUCE_OP_8S(min, std::min) + +#define OPENCV_HAL_IMPL_MSA_REDUCE_OP_4(_Tpvec, scalartype, func, cfunc) \ +inline scalartype v_reduce_##func(const _Tpvec& a) \ +{ \ + return (scalartype)cfunc(cfunc(a.val[0], a.val[1]), cfunc(a.val[2], a.val[3])); \ +} + +OPENCV_HAL_IMPL_MSA_REDUCE_OP_4(v_uint32x4, unsigned, max, std::max) +OPENCV_HAL_IMPL_MSA_REDUCE_OP_4(v_uint32x4, unsigned, min, std::min) +OPENCV_HAL_IMPL_MSA_REDUCE_OP_4(v_int32x4, int, max, std::max) +OPENCV_HAL_IMPL_MSA_REDUCE_OP_4(v_int32x4, int, min, std::min) +OPENCV_HAL_IMPL_MSA_REDUCE_OP_4(v_float32x4, float, max, std::max) +OPENCV_HAL_IMPL_MSA_REDUCE_OP_4(v_float32x4, float, min, std::min) + + +#define OPENCV_HAL_IMPL_MSA_REDUCE_OP_16(_Tpvec, scalartype, _Tpvec2, func) \ +inline scalartype v_reduce_##func(const _Tpvec& a) \ +{ \ + _Tpvec2 a1, a2; \ + v_expand(a, a1, a2); \ + return (scalartype)v_reduce_##func(v_##func(a1, a2)); \ +} + +OPENCV_HAL_IMPL_MSA_REDUCE_OP_16(v_uint8x16, uchar, v_uint16x8, min) +OPENCV_HAL_IMPL_MSA_REDUCE_OP_16(v_uint8x16, uchar, 
v_uint16x8, max) +OPENCV_HAL_IMPL_MSA_REDUCE_OP_16(v_int8x16, char, v_int16x8, min) +OPENCV_HAL_IMPL_MSA_REDUCE_OP_16(v_int8x16, char, v_int16x8, max) + + + +#define OPENCV_HAL_IMPL_MSA_REDUCE_SUM(_Tpvec, scalartype, suffix) \ +inline scalartype v_reduce_sum(const _Tpvec& a) \ +{ \ + return (scalartype)msa_sum_##suffix(a.val); \ +} + +OPENCV_HAL_IMPL_MSA_REDUCE_SUM(v_uint8x16, unsigned char, u8) +OPENCV_HAL_IMPL_MSA_REDUCE_SUM(v_int8x16, char, s8) +OPENCV_HAL_IMPL_MSA_REDUCE_SUM(v_uint16x8, unsigned short, u16) +OPENCV_HAL_IMPL_MSA_REDUCE_SUM(v_int16x8, short, s16) +OPENCV_HAL_IMPL_MSA_REDUCE_SUM(v_uint32x4, unsigned, u32) +OPENCV_HAL_IMPL_MSA_REDUCE_SUM(v_int32x4, int, s32) +OPENCV_HAL_IMPL_MSA_REDUCE_SUM(v_float32x4, float, f32) + +inline uint64 v_reduce_sum(const v_uint64x2& a) +{ return (uint64)(msa_getq_lane_u64(a.val, 0) + msa_getq_lane_u64(a.val, 1)); } +inline int64 v_reduce_sum(const v_int64x2& a) +{ return (int64)(msa_getq_lane_s64(a.val, 0) + msa_getq_lane_s64(a.val, 1)); } +inline double v_reduce_sum(const v_float64x2& a) +{ + return msa_getq_lane_f64(a.val, 0) + msa_getq_lane_f64(a.val, 1); +} + +/* v_reduce_sum4, v_reduce_sad */ +inline v_float32x4 v_reduce_sum4(const v_float32x4& a, const v_float32x4& b, + const v_float32x4& c, const v_float32x4& d) +{ + v4f32 u0 = msa_addq_f32(MSA_TPV_REINTERPRET(v4f32, msa_ilvevq_s32(MSA_TPV_REINTERPRET(v4i32, b.val), MSA_TPV_REINTERPRET(v4i32, a.val))), + MSA_TPV_REINTERPRET(v4f32, msa_ilvodq_s32(MSA_TPV_REINTERPRET(v4i32, b.val), MSA_TPV_REINTERPRET(v4i32, a.val)))); // a0+a1 b0+b1 a2+a3 b2+b3 + v4f32 u1 = msa_addq_f32(MSA_TPV_REINTERPRET(v4f32, msa_ilvevq_s32(MSA_TPV_REINTERPRET(v4i32, d.val), MSA_TPV_REINTERPRET(v4i32, c.val))), + MSA_TPV_REINTERPRET(v4f32, msa_ilvodq_s32(MSA_TPV_REINTERPRET(v4i32, d.val), MSA_TPV_REINTERPRET(v4i32, c.val)))); // c0+c1 d0+d1 c2+c3 d2+d3 + + return v_float32x4(msa_addq_f32(MSA_TPV_REINTERPRET(v4f32, msa_ilvrq_s64(MSA_TPV_REINTERPRET(v2i64, u1), MSA_TPV_REINTERPRET(v2i64, u0))), + MSA_TPV_REINTERPRET(v4f32, msa_ilvlq_s64(MSA_TPV_REINTERPRET(v2i64, u1), MSA_TPV_REINTERPRET(v2i64, u0))))); +} + +inline unsigned v_reduce_sad(const v_uint8x16& a, const v_uint8x16& b) +{ + v16u8 t0 = msa_abdq_u8(a.val, b.val); + v8u16 t1 = msa_paddlq_u8(t0); + v4u32 t2 = msa_paddlq_u16(t1); + return msa_sum_u32(t2); +} +inline unsigned v_reduce_sad(const v_int8x16& a, const v_int8x16& b) +{ + v16u8 t0 = MSA_TPV_REINTERPRET(v16u8, msa_abdq_s8(a.val, b.val)); + v8u16 t1 = msa_paddlq_u8(t0); + v4u32 t2 = msa_paddlq_u16(t1); + return msa_sum_u32(t2); +} +inline unsigned v_reduce_sad(const v_uint16x8& a, const v_uint16x8& b) +{ + v8u16 t0 = msa_abdq_u16(a.val, b.val); + v4u32 t1 = msa_paddlq_u16(t0); + return msa_sum_u32(t1); +} +inline unsigned v_reduce_sad(const v_int16x8& a, const v_int16x8& b) +{ + v8u16 t0 = MSA_TPV_REINTERPRET(v8u16, msa_abdq_s16(a.val, b.val)); + v4u32 t1 = msa_paddlq_u16(t0); + return msa_sum_u32(t1); +} +inline unsigned v_reduce_sad(const v_uint32x4& a, const v_uint32x4& b) +{ + v4u32 t0 = msa_abdq_u32(a.val, b.val); + return msa_sum_u32(t0); +} +inline unsigned v_reduce_sad(const v_int32x4& a, const v_int32x4& b) +{ + v4u32 t0 = MSA_TPV_REINTERPRET(v4u32, msa_abdq_s32(a.val, b.val)); + return msa_sum_u32(t0); +} +inline float v_reduce_sad(const v_float32x4& a, const v_float32x4& b) +{ + v4f32 t0 = msa_abdq_f32(a.val, b.val); + return msa_sum_f32(t0); +} + +/* v_popcount */ +#define OPENCV_HAL_IMPL_MSA_POPCOUNT_SIZE8(_Tpvec) \ +inline v_uint8x16 v_popcount(const _Tpvec& a) \ +{ \ + v16u8 t = 
MSA_TPV_REINTERPRET(v16u8, msa_cntq_s8(MSA_TPV_REINTERPRET(v16i8, a.val))); \ + return v_uint8x16(t); \ +} +OPENCV_HAL_IMPL_MSA_POPCOUNT_SIZE8(v_uint8x16) +OPENCV_HAL_IMPL_MSA_POPCOUNT_SIZE8(v_int8x16) + +#define OPENCV_HAL_IMPL_MSA_POPCOUNT_SIZE16(_Tpvec) \ +inline v_uint16x8 v_popcount(const _Tpvec& a) \ +{ \ + v8u16 t = MSA_TPV_REINTERPRET(v8u16, msa_cntq_s16(MSA_TPV_REINTERPRET(v8i16, a.val))); \ + return v_uint16x8(t); \ +} +OPENCV_HAL_IMPL_MSA_POPCOUNT_SIZE16(v_uint16x8) +OPENCV_HAL_IMPL_MSA_POPCOUNT_SIZE16(v_int16x8) + +#define OPENCV_HAL_IMPL_MSA_POPCOUNT_SIZE32(_Tpvec) \ +inline v_uint32x4 v_popcount(const _Tpvec& a) \ +{ \ + v4u32 t = MSA_TPV_REINTERPRET(v4u32, msa_cntq_s32(MSA_TPV_REINTERPRET(v4i32, a.val))); \ + return v_uint32x4(t); \ +} +OPENCV_HAL_IMPL_MSA_POPCOUNT_SIZE32(v_uint32x4) +OPENCV_HAL_IMPL_MSA_POPCOUNT_SIZE32(v_int32x4) + +#define OPENCV_HAL_IMPL_MSA_POPCOUNT_SIZE64(_Tpvec) \ +inline v_uint64x2 v_popcount(const _Tpvec& a) \ +{ \ + v2u64 t = MSA_TPV_REINTERPRET(v2u64, msa_cntq_s64(MSA_TPV_REINTERPRET(v2i64, a.val))); \ + return v_uint64x2(t); \ +} +OPENCV_HAL_IMPL_MSA_POPCOUNT_SIZE64(v_uint64x2) +OPENCV_HAL_IMPL_MSA_POPCOUNT_SIZE64(v_int64x2) + +inline int v_signmask(const v_uint8x16& a) +{ + v8i8 m0 = msa_create_s8(CV_BIG_UINT(0x0706050403020100)); + v16u8 v0 = msa_shlq_u8(msa_shrq_n_u8(a.val, 7), msa_combine_s8(m0, m0)); + v8u16 v1 = msa_paddlq_u8(v0); + v4u32 v2 = msa_paddlq_u16(v1); + v2u64 v3 = msa_paddlq_u32(v2); + return (int)msa_getq_lane_u64(v3, 0) + ((int)msa_getq_lane_u64(v3, 1) << 8); +} +inline int v_signmask(const v_int8x16& a) +{ return v_signmask(v_reinterpret_as_u8(a)); } + +inline int v_signmask(const v_uint16x8& a) +{ + v4i16 m0 = msa_create_s16(CV_BIG_UINT(0x0003000200010000)); + v8u16 v0 = msa_shlq_u16(msa_shrq_n_u16(a.val, 15), msa_combine_s16(m0, m0)); + v4u32 v1 = msa_paddlq_u16(v0); + v2u64 v2 = msa_paddlq_u32(v1); + return (int)msa_getq_lane_u64(v2, 0) + ((int)msa_getq_lane_u64(v2, 1) << 4); +} +inline int v_signmask(const v_int16x8& a) +{ return v_signmask(v_reinterpret_as_u16(a)); } + +inline int v_signmask(const v_uint32x4& a) +{ + v2i32 m0 = msa_create_s32(CV_BIG_UINT(0x0000000100000000)); + v4u32 v0 = msa_shlq_u32(msa_shrq_n_u32(a.val, 31), msa_combine_s32(m0, m0)); + v2u64 v1 = msa_paddlq_u32(v0); + return (int)msa_getq_lane_u64(v1, 0) + ((int)msa_getq_lane_u64(v1, 1) << 2); +} +inline int v_signmask(const v_int32x4& a) +{ return v_signmask(v_reinterpret_as_u32(a)); } +inline int v_signmask(const v_float32x4& a) +{ return v_signmask(v_reinterpret_as_u32(a)); } + +inline int v_signmask(const v_uint64x2& a) +{ + v2u64 v0 = msa_shrq_n_u64(a.val, 63); + return (int)msa_getq_lane_u64(v0, 0) + ((int)msa_getq_lane_u64(v0, 1) << 1); +} +inline int v_signmask(const v_int64x2& a) +{ return v_signmask(v_reinterpret_as_u64(a)); } +inline int v_signmask(const v_float64x2& a) +{ return v_signmask(v_reinterpret_as_u64(a)); } + +inline int v_scan_forward(const v_int8x16& a) { return trailingZeros32(v_signmask(a)); } +inline int v_scan_forward(const v_uint8x16& a) { return trailingZeros32(v_signmask(a)); } +inline int v_scan_forward(const v_int16x8& a) { return trailingZeros32(v_signmask(a)); } +inline int v_scan_forward(const v_uint16x8& a) { return trailingZeros32(v_signmask(a)); } +inline int v_scan_forward(const v_int32x4& a) { return trailingZeros32(v_signmask(a)); } +inline int v_scan_forward(const v_uint32x4& a) { return trailingZeros32(v_signmask(a)); } +inline int v_scan_forward(const v_float32x4& a) { return trailingZeros32(v_signmask(a)); 
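+// (editor's note, illustrative only; not part of the original header)
+// v_signmask packs the most significant bit of every lane into an integer with
+// lane 0 in bit 0, and v_scan_forward returns the index of the first lane whose
+// MSB is set (typically guarded by v_check_any, since an all-zero mask yields an
+// undefined trailing-zero count). A hedged usage sketch, assuming 'a' and 'b'
+// are v_uint8x16 values:
+//     v_uint8x16 m = (a == b);        // per-lane 0xFF / 0x00 mask
+//     int bits  = v_signmask(m);      // one bit per lane, lane 0 = LSB
+//     int first = v_scan_forward(m);  // index of the first matching lane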
} +inline int v_scan_forward(const v_int64x2& a) { return trailingZeros32(v_signmask(a)); } +inline int v_scan_forward(const v_uint64x2& a) { return trailingZeros32(v_signmask(a)); } +inline int v_scan_forward(const v_float64x2& a) { return trailingZeros32(v_signmask(a)); } + +#define OPENCV_HAL_IMPL_MSA_CHECK_ALLANY(_Tpvec, _Tpvec2, suffix, shift) \ +inline bool v_check_all(const v_##_Tpvec& a) \ +{ \ + _Tpvec2 v0 = msa_shrq_n_##suffix(msa_mvnq_##suffix(a.val), shift); \ + v2u64 v1 = MSA_TPV_REINTERPRET(v2u64, v0); \ + return (msa_getq_lane_u64(v1, 0) | msa_getq_lane_u64(v1, 1)) == 0; \ +} \ +inline bool v_check_any(const v_##_Tpvec& a) \ +{ \ + _Tpvec2 v0 = msa_shrq_n_##suffix(a.val, shift); \ + v2u64 v1 = MSA_TPV_REINTERPRET(v2u64, v0); \ + return (msa_getq_lane_u64(v1, 0) | msa_getq_lane_u64(v1, 1)) != 0; \ +} + +OPENCV_HAL_IMPL_MSA_CHECK_ALLANY(uint8x16, v16u8, u8, 7) +OPENCV_HAL_IMPL_MSA_CHECK_ALLANY(uint16x8, v8u16, u16, 15) +OPENCV_HAL_IMPL_MSA_CHECK_ALLANY(uint32x4, v4u32, u32, 31) +OPENCV_HAL_IMPL_MSA_CHECK_ALLANY(uint64x2, v2u64, u64, 63) + +inline bool v_check_all(const v_int8x16& a) +{ return v_check_all(v_reinterpret_as_u8(a)); } +inline bool v_check_all(const v_int16x8& a) +{ return v_check_all(v_reinterpret_as_u16(a)); } +inline bool v_check_all(const v_int32x4& a) +{ return v_check_all(v_reinterpret_as_u32(a)); } +inline bool v_check_all(const v_float32x4& a) +{ return v_check_all(v_reinterpret_as_u32(a)); } + +inline bool v_check_any(const v_int8x16& a) +{ return v_check_any(v_reinterpret_as_u8(a)); } +inline bool v_check_any(const v_int16x8& a) +{ return v_check_any(v_reinterpret_as_u16(a)); } +inline bool v_check_any(const v_int32x4& a) +{ return v_check_any(v_reinterpret_as_u32(a)); } +inline bool v_check_any(const v_float32x4& a) +{ return v_check_any(v_reinterpret_as_u32(a)); } + +inline bool v_check_all(const v_int64x2& a) +{ return v_check_all(v_reinterpret_as_u64(a)); } +inline bool v_check_all(const v_float64x2& a) +{ return v_check_all(v_reinterpret_as_u64(a)); } +inline bool v_check_any(const v_int64x2& a) +{ return v_check_any(v_reinterpret_as_u64(a)); } +inline bool v_check_any(const v_float64x2& a) +{ return v_check_any(v_reinterpret_as_u64(a)); } + +/* v_select */ +#define OPENCV_HAL_IMPL_MSA_SELECT(_Tpvec, _Tpv, _Tpvu) \ +inline _Tpvec v_select(const _Tpvec& mask, const _Tpvec& a, const _Tpvec& b) \ +{ \ + return _Tpvec(MSA_TPV_REINTERPRET(_Tpv, msa_bslq_u8(MSA_TPV_REINTERPRET(_Tpvu, mask.val), \ + MSA_TPV_REINTERPRET(_Tpvu, b.val), MSA_TPV_REINTERPRET(_Tpvu, a.val)))); \ +} + +OPENCV_HAL_IMPL_MSA_SELECT(v_uint8x16, v16u8, v16u8) +OPENCV_HAL_IMPL_MSA_SELECT(v_int8x16, v16i8, v16u8) +OPENCV_HAL_IMPL_MSA_SELECT(v_uint16x8, v8u16, v16u8) +OPENCV_HAL_IMPL_MSA_SELECT(v_int16x8, v8i16, v16u8) +OPENCV_HAL_IMPL_MSA_SELECT(v_uint32x4, v4u32, v16u8) +OPENCV_HAL_IMPL_MSA_SELECT(v_int32x4, v4i32, v16u8) +OPENCV_HAL_IMPL_MSA_SELECT(v_float32x4, v4f32, v16u8) +OPENCV_HAL_IMPL_MSA_SELECT(v_float64x2, v2f64, v16u8) + +#define OPENCV_HAL_IMPL_MSA_EXPAND(_Tpvec, _Tpwvec, _Tp, suffix, ssuffix, _Tpv, _Tpvs) \ +inline void v_expand(const _Tpvec& a, _Tpwvec& b0, _Tpwvec& b1) \ +{ \ + _Tpv a_lo = MSA_TPV_REINTERPRET(_Tpv, msa_ilvrq_##ssuffix(MSA_TPV_REINTERPRET(_Tpvs, a.val), msa_dupq_n_##ssuffix(0))); \ + _Tpv a_hi = MSA_TPV_REINTERPRET(_Tpv, msa_ilvlq_##ssuffix(MSA_TPV_REINTERPRET(_Tpvs, a.val), msa_dupq_n_##ssuffix(0))); \ + b0.val = msa_paddlq_##suffix(a_lo); \ + b1.val = msa_paddlq_##suffix(a_hi); \ +} \ +inline _Tpwvec v_expand_low(const _Tpvec& a) \ +{ \ + _Tpv a_lo = 
MSA_TPV_REINTERPRET(_Tpv, msa_ilvrq_##ssuffix(MSA_TPV_REINTERPRET(_Tpvs, a.val), msa_dupq_n_##ssuffix(0))); \ + return _Tpwvec(msa_paddlq_##suffix(a_lo)); \ +} \ +inline _Tpwvec v_expand_high(const _Tpvec& a) \ +{ \ + _Tpv a_hi = MSA_TPV_REINTERPRET(_Tpv, msa_ilvlq_##ssuffix(MSA_TPV_REINTERPRET(_Tpvs, a.val), msa_dupq_n_##ssuffix(0))); \ + return _Tpwvec(msa_paddlq_##suffix(a_hi)); \ +} \ +inline _Tpwvec v_load_expand(const _Tp* ptr) \ +{ \ + return _Tpwvec(msa_movl_##suffix(msa_ld1_##suffix(ptr))); \ +} + +OPENCV_HAL_IMPL_MSA_EXPAND(v_uint8x16, v_uint16x8, uchar, u8, s8, v16u8, v16i8) +OPENCV_HAL_IMPL_MSA_EXPAND(v_int8x16, v_int16x8, schar, s8, s8, v16i8, v16i8) +OPENCV_HAL_IMPL_MSA_EXPAND(v_uint16x8, v_uint32x4, ushort, u16, s16, v8u16, v8i16) +OPENCV_HAL_IMPL_MSA_EXPAND(v_int16x8, v_int32x4, short, s16, s16, v8i16, v8i16) +OPENCV_HAL_IMPL_MSA_EXPAND(v_uint32x4, v_uint64x2, uint, u32, s32, v4u32, v4i32) +OPENCV_HAL_IMPL_MSA_EXPAND(v_int32x4, v_int64x2, int, s32, s32, v4i32, v4i32) + +inline v_uint32x4 v_load_expand_q(const uchar* ptr) +{ + return v_uint32x4((v4u32){ptr[0], ptr[1], ptr[2], ptr[3]}); +} + +inline v_int32x4 v_load_expand_q(const schar* ptr) +{ + return v_int32x4((v4i32){ptr[0], ptr[1], ptr[2], ptr[3]}); +} + +/* v_zip, v_combine_low, v_combine_high, v_recombine */ +#define OPENCV_HAL_IMPL_MSA_UNPACKS(_Tpvec, _Tpv, _Tpvs, ssuffix) \ +inline void v_zip(const _Tpvec& a0, const _Tpvec& a1, _Tpvec& b0, _Tpvec& b1) \ +{ \ + b0.val = MSA_TPV_REINTERPRET(_Tpv, msa_ilvrq_##ssuffix(MSA_TPV_REINTERPRET(_Tpvs, a1.val), MSA_TPV_REINTERPRET(_Tpvs, a0.val))); \ + b1.val = MSA_TPV_REINTERPRET(_Tpv, msa_ilvlq_##ssuffix(MSA_TPV_REINTERPRET(_Tpvs, a1.val), MSA_TPV_REINTERPRET(_Tpvs, a0.val))); \ +} \ +inline _Tpvec v_combine_low(const _Tpvec& a, const _Tpvec& b) \ +{ \ + return _Tpvec(MSA_TPV_REINTERPRET(_Tpv, msa_ilvrq_s64(MSA_TPV_REINTERPRET(v2i64, b.val), MSA_TPV_REINTERPRET(v2i64, a.val)))); \ +} \ +inline _Tpvec v_combine_high(const _Tpvec& a, const _Tpvec& b) \ +{ \ + return _Tpvec(MSA_TPV_REINTERPRET(_Tpv, msa_ilvlq_s64(MSA_TPV_REINTERPRET(v2i64, b.val), MSA_TPV_REINTERPRET(v2i64, a.val)))); \ +} \ +inline void v_recombine(const _Tpvec& a, const _Tpvec& b, _Tpvec& c, _Tpvec& d) \ +{ \ + c.val = MSA_TPV_REINTERPRET(_Tpv, msa_ilvrq_s64(MSA_TPV_REINTERPRET(v2i64, b.val), MSA_TPV_REINTERPRET(v2i64, a.val))); \ + d.val = MSA_TPV_REINTERPRET(_Tpv, msa_ilvlq_s64(MSA_TPV_REINTERPRET(v2i64, b.val), MSA_TPV_REINTERPRET(v2i64, a.val))); \ +} + +OPENCV_HAL_IMPL_MSA_UNPACKS(v_uint8x16, v16u8, v16i8, s8) +OPENCV_HAL_IMPL_MSA_UNPACKS(v_int8x16, v16i8, v16i8, s8) +OPENCV_HAL_IMPL_MSA_UNPACKS(v_uint16x8, v8u16, v8i16, s16) +OPENCV_HAL_IMPL_MSA_UNPACKS(v_int16x8, v8i16, v8i16, s16) +OPENCV_HAL_IMPL_MSA_UNPACKS(v_uint32x4, v4u32, v4i32, s32) +OPENCV_HAL_IMPL_MSA_UNPACKS(v_int32x4, v4i32, v4i32, s32) +OPENCV_HAL_IMPL_MSA_UNPACKS(v_float32x4, v4f32, v4i32, s32) +OPENCV_HAL_IMPL_MSA_UNPACKS(v_float64x2, v2f64, v2i64, s64) + +/* v_extract */ +#define OPENCV_HAL_IMPL_MSA_EXTRACT(_Tpvec, _Tpv, _Tpvs, suffix) \ +template \ +inline _Tpvec v_extract(const _Tpvec& a, const _Tpvec& b) \ +{ \ + return _Tpvec(MSA_TPV_REINTERPRET(_Tpv, msa_extq_##suffix(MSA_TPV_REINTERPRET(_Tpvs, a.val), MSA_TPV_REINTERPRET(_Tpvs, b.val), s))); \ +} + +OPENCV_HAL_IMPL_MSA_EXTRACT(v_uint8x16, v16u8, v16i8, s8) +OPENCV_HAL_IMPL_MSA_EXTRACT(v_int8x16, v16i8, v16i8, s8) +OPENCV_HAL_IMPL_MSA_EXTRACT(v_uint16x8, v8u16, v8i16, s16) +OPENCV_HAL_IMPL_MSA_EXTRACT(v_int16x8, v8i16, v8i16, s16) +OPENCV_HAL_IMPL_MSA_EXTRACT(v_uint32x4, v4u32, 
v4i32, s32) +OPENCV_HAL_IMPL_MSA_EXTRACT(v_int32x4, v4i32, v4i32, s32) +OPENCV_HAL_IMPL_MSA_EXTRACT(v_uint64x2, v2u64, v2i64, s64) +OPENCV_HAL_IMPL_MSA_EXTRACT(v_int64x2, v2i64, v2i64, s64) +OPENCV_HAL_IMPL_MSA_EXTRACT(v_float32x4, v4f32, v4i32, s32) +OPENCV_HAL_IMPL_MSA_EXTRACT(v_float64x2, v2f64, v2i64, s64) + +/* v_round, v_floor, v_ceil, v_trunc */ +inline v_int32x4 v_round(const v_float32x4& a) +{ + return v_int32x4(msa_cvttintq_s32_f32(a.val)); +} + +inline v_int32x4 v_floor(const v_float32x4& a) +{ + v4i32 a1 = msa_cvttintq_s32_f32(a.val); + return v_int32x4(msa_addq_s32(a1, MSA_TPV_REINTERPRET(v4i32, msa_cgtq_f32(msa_cvtfintq_f32_s32(a1), a.val)))); +} + +inline v_int32x4 v_ceil(const v_float32x4& a) +{ + v4i32 a1 = msa_cvttintq_s32_f32(a.val); + return v_int32x4(msa_subq_s32(a1, MSA_TPV_REINTERPRET(v4i32, msa_cgtq_f32(a.val, msa_cvtfintq_f32_s32(a1))))); +} + +inline v_int32x4 v_trunc(const v_float32x4& a) +{ + return v_int32x4(msa_cvttruncq_s32_f32(a.val)); +} + +inline v_int32x4 v_round(const v_float64x2& a) +{ + return v_int32x4(msa_pack_s64(msa_cvttintq_s64_f64(a.val), msa_dupq_n_s64(0))); +} + +inline v_int32x4 v_round(const v_float64x2& a, const v_float64x2& b) +{ + return v_int32x4(msa_pack_s64(msa_cvttintq_s64_f64(a.val), msa_cvttintq_s64_f64(b.val))); +} + +inline v_int32x4 v_floor(const v_float64x2& a) +{ + v2f64 a1 = msa_cvtrintq_f64(a.val); + return v_int32x4(msa_pack_s64(msa_addq_s64(msa_cvttruncq_s64_f64(a1), MSA_TPV_REINTERPRET(v2i64, msa_cgtq_f64(a1, a.val))), msa_dupq_n_s64(0))); +} + +inline v_int32x4 v_ceil(const v_float64x2& a) +{ + v2f64 a1 = msa_cvtrintq_f64(a.val); + return v_int32x4(msa_pack_s64(msa_subq_s64(msa_cvttruncq_s64_f64(a1), MSA_TPV_REINTERPRET(v2i64, msa_cgtq_f64(a.val, a1))), msa_dupq_n_s64(0))); +} + +inline v_int32x4 v_trunc(const v_float64x2& a) +{ + return v_int32x4(msa_pack_s64(msa_cvttruncq_s64_f64(a.val), msa_dupq_n_s64(0))); +} + +#define OPENCV_HAL_IMPL_MSA_TRANSPOSE4x4(_Tpvec, _Tpv, _Tpvs, ssuffix) \ +inline void v_transpose4x4(const _Tpvec& a0, const _Tpvec& a1, \ + const _Tpvec& a2, const _Tpvec& a3, \ + _Tpvec& b0, _Tpvec& b1, \ + _Tpvec& b2, _Tpvec& b3) \ +{ \ + _Tpv t00 = MSA_TPV_REINTERPRET(_Tpv, msa_ilvrq_##ssuffix(MSA_TPV_REINTERPRET(_Tpvs, a1.val), MSA_TPV_REINTERPRET(_Tpvs, a0.val))); \ + _Tpv t01 = MSA_TPV_REINTERPRET(_Tpv, msa_ilvlq_##ssuffix(MSA_TPV_REINTERPRET(_Tpvs, a1.val), MSA_TPV_REINTERPRET(_Tpvs, a0.val))); \ + _Tpv t10 = MSA_TPV_REINTERPRET(_Tpv, msa_ilvrq_##ssuffix(MSA_TPV_REINTERPRET(_Tpvs, a3.val), MSA_TPV_REINTERPRET(_Tpvs, a2.val))); \ + _Tpv t11 = MSA_TPV_REINTERPRET(_Tpv, msa_ilvlq_##ssuffix(MSA_TPV_REINTERPRET(_Tpvs, a3.val), MSA_TPV_REINTERPRET(_Tpvs, a2.val))); \ + b0.val = MSA_TPV_REINTERPRET(_Tpv, msa_ilvrq_s64(MSA_TPV_REINTERPRET(v2i64, t10), MSA_TPV_REINTERPRET(v2i64, t00))); \ + b1.val = MSA_TPV_REINTERPRET(_Tpv, msa_ilvlq_s64(MSA_TPV_REINTERPRET(v2i64, t10), MSA_TPV_REINTERPRET(v2i64, t00))); \ + b2.val = MSA_TPV_REINTERPRET(_Tpv, msa_ilvrq_s64(MSA_TPV_REINTERPRET(v2i64, t11), MSA_TPV_REINTERPRET(v2i64, t01))); \ + b3.val = MSA_TPV_REINTERPRET(_Tpv, msa_ilvlq_s64(MSA_TPV_REINTERPRET(v2i64, t11), MSA_TPV_REINTERPRET(v2i64, t01))); \ +} + +OPENCV_HAL_IMPL_MSA_TRANSPOSE4x4(v_uint32x4, v4u32, v4i32, s32) +OPENCV_HAL_IMPL_MSA_TRANSPOSE4x4(v_int32x4, v4i32, v4i32, s32) +OPENCV_HAL_IMPL_MSA_TRANSPOSE4x4(v_float32x4, v4f32, v4i32, s32) + +#define OPENCV_HAL_IMPL_MSA_INTERLEAVED(_Tpvec, _Tp, suffix) \ +inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec& a, v_##_Tpvec& b) \ +{ \ + 
msa_ld2q_##suffix(ptr, &a.val, &b.val); \ +} \ +inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec& a, v_##_Tpvec& b, v_##_Tpvec& c) \ +{ \ + msa_ld3q_##suffix(ptr, &a.val, &b.val, &c.val); \ +} \ +inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec& a, v_##_Tpvec& b, \ + v_##_Tpvec& c, v_##_Tpvec& d) \ +{ \ + msa_ld4q_##suffix(ptr, &a.val, &b.val, &c.val, &d.val); \ +} \ +inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec& b, \ + hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) \ +{ \ + msa_st2q_##suffix(ptr, a.val, b.val); \ +} \ +inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec& b, \ + const v_##_Tpvec& c, hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) \ +{ \ + msa_st3q_##suffix(ptr, a.val, b.val, c.val); \ +} \ +inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec& b, \ + const v_##_Tpvec& c, const v_##_Tpvec& d, \ + hal::StoreMode /*mode*/=hal::STORE_UNALIGNED ) \ +{ \ + msa_st4q_##suffix(ptr, a.val, b.val, c.val, d.val); \ +} + +OPENCV_HAL_IMPL_MSA_INTERLEAVED(uint8x16, uchar, u8) +OPENCV_HAL_IMPL_MSA_INTERLEAVED(int8x16, schar, s8) +OPENCV_HAL_IMPL_MSA_INTERLEAVED(uint16x8, ushort, u16) +OPENCV_HAL_IMPL_MSA_INTERLEAVED(int16x8, short, s16) +OPENCV_HAL_IMPL_MSA_INTERLEAVED(uint32x4, unsigned, u32) +OPENCV_HAL_IMPL_MSA_INTERLEAVED(int32x4, int, s32) +OPENCV_HAL_IMPL_MSA_INTERLEAVED(float32x4, float, f32) +OPENCV_HAL_IMPL_MSA_INTERLEAVED(uint64x2, uint64, u64) +OPENCV_HAL_IMPL_MSA_INTERLEAVED(int64x2, int64, s64) +OPENCV_HAL_IMPL_MSA_INTERLEAVED(float64x2, double, f64) + +/* v_cvt_f32, v_cvt_f64, v_cvt_f64_high */ +inline v_float32x4 v_cvt_f32(const v_int32x4& a) +{ + return v_float32x4(msa_cvtfintq_f32_s32(a.val)); +} + +inline v_float32x4 v_cvt_f32(const v_float64x2& a) +{ + return v_float32x4(msa_cvtfq_f32_f64(a.val, msa_dupq_n_f64(0.0f))); +} + +inline v_float32x4 v_cvt_f32(const v_float64x2& a, const v_float64x2& b) +{ + return v_float32x4(msa_cvtfq_f32_f64(a.val, b.val)); +} + +inline v_float64x2 v_cvt_f64(const v_int32x4& a) +{ + return v_float64x2(msa_cvtflq_f64_f32(msa_cvtfintq_f32_s32(a.val))); +} + +inline v_float64x2 v_cvt_f64_high(const v_int32x4& a) +{ + return v_float64x2(msa_cvtfhq_f64_f32(msa_cvtfintq_f32_s32(a.val))); +} + +inline v_float64x2 v_cvt_f64(const v_float32x4& a) +{ + return v_float64x2(msa_cvtflq_f64_f32(a.val)); +} + +inline v_float64x2 v_cvt_f64_high(const v_float32x4& a) +{ + return v_float64x2(msa_cvtfhq_f64_f32(a.val)); +} + +inline v_float64x2 v_cvt_f64(const v_int64x2& a) +{ + return v_float64x2(msa_cvtfintq_f64_s64(a.val)); +} + +////////////// Lookup table access //////////////////// +inline v_int8x16 v_lut(const schar* tab, const int* idx) +{ + schar CV_DECL_ALIGNED(32) elems[16] = + { + tab[idx[ 0]], + tab[idx[ 1]], + tab[idx[ 2]], + tab[idx[ 3]], + tab[idx[ 4]], + tab[idx[ 5]], + tab[idx[ 6]], + tab[idx[ 7]], + tab[idx[ 8]], + tab[idx[ 9]], + tab[idx[10]], + tab[idx[11]], + tab[idx[12]], + tab[idx[13]], + tab[idx[14]], + tab[idx[15]] + }; + return v_int8x16(msa_ld1q_s8(elems)); +} +inline v_int8x16 v_lut_pairs(const schar* tab, const int* idx) +{ + schar CV_DECL_ALIGNED(32) elems[16] = + { + tab[idx[0]], + tab[idx[0] + 1], + tab[idx[1]], + tab[idx[1] + 1], + tab[idx[2]], + tab[idx[2] + 1], + tab[idx[3]], + tab[idx[3] + 1], + tab[idx[4]], + tab[idx[4] + 1], + tab[idx[5]], + tab[idx[5] + 1], + tab[idx[6]], + tab[idx[6] + 1], + tab[idx[7]], + tab[idx[7] + 1] + }; + return v_int8x16(msa_ld1q_s8(elems)); +} +inline v_int8x16 v_lut_quads(const schar* 
tab, const int* idx) +{ + schar CV_DECL_ALIGNED(32) elems[16] = + { + tab[idx[0]], + tab[idx[0] + 1], + tab[idx[0] + 2], + tab[idx[0] + 3], + tab[idx[1]], + tab[idx[1] + 1], + tab[idx[1] + 2], + tab[idx[1] + 3], + tab[idx[2]], + tab[idx[2] + 1], + tab[idx[2] + 2], + tab[idx[2] + 3], + tab[idx[3]], + tab[idx[3] + 1], + tab[idx[3] + 2], + tab[idx[3] + 3] + }; + return v_int8x16(msa_ld1q_s8(elems)); +} +inline v_uint8x16 v_lut(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut((schar*)tab, idx)); } +inline v_uint8x16 v_lut_pairs(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut_pairs((schar*)tab, idx)); } +inline v_uint8x16 v_lut_quads(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut_quads((schar*)tab, idx)); } + + +inline v_int16x8 v_lut(const short* tab, const int* idx) +{ + short CV_DECL_ALIGNED(32) elems[8] = + { + tab[idx[0]], + tab[idx[1]], + tab[idx[2]], + tab[idx[3]], + tab[idx[4]], + tab[idx[5]], + tab[idx[6]], + tab[idx[7]] + }; + return v_int16x8(msa_ld1q_s16(elems)); +} +inline v_int16x8 v_lut_pairs(const short* tab, const int* idx) +{ + short CV_DECL_ALIGNED(32) elems[8] = + { + tab[idx[0]], + tab[idx[0] + 1], + tab[idx[1]], + tab[idx[1] + 1], + tab[idx[2]], + tab[idx[2] + 1], + tab[idx[3]], + tab[idx[3] + 1] + }; + return v_int16x8(msa_ld1q_s16(elems)); +} +inline v_int16x8 v_lut_quads(const short* tab, const int* idx) +{ + return v_int16x8(msa_combine_s16(msa_ld1_s16(tab + idx[0]), msa_ld1_s16(tab + idx[1]))); +} +inline v_uint16x8 v_lut(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut((short*)tab, idx)); } +inline v_uint16x8 v_lut_pairs(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut_pairs((short*)tab, idx)); } +inline v_uint16x8 v_lut_quads(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut_quads((short*)tab, idx)); } + +inline v_int32x4 v_lut(const int* tab, const int* idx) +{ + int CV_DECL_ALIGNED(32) elems[4] = + { + tab[idx[0]], + tab[idx[1]], + tab[idx[2]], + tab[idx[3]] + }; + return v_int32x4(msa_ld1q_s32(elems)); +} +inline v_int32x4 v_lut_pairs(const int* tab, const int* idx) +{ + return v_int32x4(msa_combine_s32(msa_ld1_s32(tab + idx[0]), msa_ld1_s32(tab + idx[1]))); +} +inline v_int32x4 v_lut_quads(const int* tab, const int* idx) +{ + return v_int32x4(msa_ld1q_s32(tab + idx[0])); +} +inline v_uint32x4 v_lut(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut((int*)tab, idx)); } +inline v_uint32x4 v_lut_pairs(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut_pairs((int*)tab, idx)); } +inline v_uint32x4 v_lut_quads(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut_quads((int*)tab, idx)); } + +inline v_int64x2 v_lut(const int64_t* tab, const int* idx) +{ + return v_int64x2(msa_combine_s64(msa_create_s64(tab[idx[0]]), msa_create_s64(tab[idx[1]]))); +} +inline v_int64x2 v_lut_pairs(const int64_t* tab, const int* idx) +{ + return v_int64x2(msa_ld1q_s64(tab + idx[0])); +} +inline v_uint64x2 v_lut(const uint64_t* tab, const int* idx) { return v_reinterpret_as_u64(v_lut((const int64_t *)tab, idx)); } +inline v_uint64x2 v_lut_pairs(const uint64_t* tab, const int* idx) { return v_reinterpret_as_u64(v_lut_pairs((const int64_t *)tab, idx)); } + +inline v_float32x4 v_lut(const float* tab, const int* idx) +{ + float CV_DECL_ALIGNED(32) elems[4] = + { + tab[idx[0]], + tab[idx[1]], + tab[idx[2]], + tab[idx[3]] + }; + return v_float32x4(msa_ld1q_f32(elems)); +} +inline v_float32x4 
v_lut_pairs(const float* tab, const int* idx) +{ + uint64 CV_DECL_ALIGNED(32) elems[2] = + { + *(uint64*)(tab + idx[0]), + *(uint64*)(tab + idx[1]) + }; + return v_float32x4(MSA_TPV_REINTERPRET(v4f32, msa_ld1q_u64(elems))); +} +inline v_float32x4 v_lut_quads(const float* tab, const int* idx) +{ + return v_float32x4(msa_ld1q_f32(tab + idx[0])); +} + +inline v_int32x4 v_lut(const int* tab, const v_int32x4& idxvec) +{ + int CV_DECL_ALIGNED(32) idx[4]; + v_store_aligned(idx, idxvec); + + return v_int32x4(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]); +} + +inline v_uint32x4 v_lut(const unsigned* tab, const v_int32x4& idxvec) +{ + unsigned CV_DECL_ALIGNED(32) elems[4] = + { + tab[msa_getq_lane_s32(idxvec.val, 0)], + tab[msa_getq_lane_s32(idxvec.val, 1)], + tab[msa_getq_lane_s32(idxvec.val, 2)], + tab[msa_getq_lane_s32(idxvec.val, 3)] + }; + return v_uint32x4(msa_ld1q_u32(elems)); +} + +inline v_float32x4 v_lut(const float* tab, const v_int32x4& idxvec) +{ + int CV_DECL_ALIGNED(32) idx[4]; + v_store_aligned(idx, idxvec); + + return v_float32x4(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]); +} + +inline void v_lut_deinterleave(const float* tab, const v_int32x4& idxvec, v_float32x4& x, v_float32x4& y) +{ + int CV_DECL_ALIGNED(32) idx[4]; + v_store_aligned(idx, idxvec); + + v4f32 xy02 = msa_combine_f32(msa_ld1_f32(tab + idx[0]), msa_ld1_f32(tab + idx[2])); + v4f32 xy13 = msa_combine_f32(msa_ld1_f32(tab + idx[1]), msa_ld1_f32(tab + idx[3])); + x = v_float32x4(MSA_TPV_REINTERPRET(v4f32, msa_ilvevq_s32(MSA_TPV_REINTERPRET(v4i32, xy13), MSA_TPV_REINTERPRET(v4i32, xy02)))); + y = v_float32x4(MSA_TPV_REINTERPRET(v4f32, msa_ilvodq_s32(MSA_TPV_REINTERPRET(v4i32, xy13), MSA_TPV_REINTERPRET(v4i32, xy02)))); +} + +inline v_int8x16 v_interleave_pairs(const v_int8x16& vec) +{ + v_int8x16 c = v_int8x16(__builtin_msa_vshf_b((v16i8)((v2i64){0x0705060403010200, 0x0F0D0E0C0B090A08}), msa_dupq_n_s8(0), vec.val)); + return c; +} +inline v_uint8x16 v_interleave_pairs(const v_uint8x16& vec) +{ return v_reinterpret_as_u8(v_interleave_pairs(v_reinterpret_as_s8(vec))); } +inline v_int8x16 v_interleave_quads(const v_int8x16& vec) +{ + v_int8x16 c = v_int8x16(__builtin_msa_vshf_b((v16i8)((v2i64){0x0703060205010400, 0x0F0B0E0A0D090C08}), msa_dupq_n_s8(0), vec.val)); + return c; +} +inline v_uint8x16 v_interleave_quads(const v_uint8x16& vec) { return v_reinterpret_as_u8(v_interleave_quads(v_reinterpret_as_s8(vec))); } + +inline v_int16x8 v_interleave_pairs(const v_int16x8& vec) +{ + v_int16x8 c = v_int16x8(__builtin_msa_vshf_h((v8i16)((v2i64){0x0003000100020000, 0x0007000500060004}), msa_dupq_n_s16(0), vec.val)); + return c; +} + +inline v_uint16x8 v_interleave_pairs(const v_uint16x8& vec) { return v_reinterpret_as_u16(v_interleave_pairs(v_reinterpret_as_s16(vec))); } + +inline v_int16x8 v_interleave_quads(const v_int16x8& vec) +{ + v_int16x8 c = v_int16x8(__builtin_msa_vshf_h((v8i16)((v2i64){0x0005000100040000, 0x0007000300060002}), msa_dupq_n_s16(0), vec.val)); + return c; +} + +inline v_uint16x8 v_interleave_quads(const v_uint16x8& vec) { return v_reinterpret_as_u16(v_interleave_quads(v_reinterpret_as_s16(vec))); } + +inline v_int32x4 v_interleave_pairs(const v_int32x4& vec) +{ + v_int32x4 c; + c.val[0] = vec.val[0]; + c.val[1] = vec.val[2]; + c.val[2] = vec.val[1]; + c.val[3] = vec.val[3]; + return c; +} + +inline v_uint32x4 v_interleave_pairs(const v_uint32x4& vec) { return v_reinterpret_as_u32(v_interleave_pairs(v_reinterpret_as_s32(vec))); } +inline v_float32x4 v_interleave_pairs(const v_float32x4& vec) { 
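+    // (editor's note, not in the original header) This float overload simply
+    // reuses the s32 lane permutation (v0,v1,v2,v3) -> (v0,v2,v1,v3) through
+    // bit-exact reinterpret casts, so no floating-point arithmetic is involved.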
return v_reinterpret_as_f32(v_interleave_pairs(v_reinterpret_as_s32(vec))); } + +inline v_int8x16 v_pack_triplets(const v_int8x16& vec) +{ + v_int8x16 c = v_int8x16(__builtin_msa_vshf_b((v16i8)((v2i64){0x0908060504020100, 0x131211100E0D0C0A}), msa_dupq_n_s8(0), vec.val)); + return c; +} + +inline v_uint8x16 v_pack_triplets(const v_uint8x16& vec) { return v_reinterpret_as_u8(v_pack_triplets(v_reinterpret_as_s8(vec))); } + +inline v_int16x8 v_pack_triplets(const v_int16x8& vec) +{ + v_int16x8 c = v_int16x8(__builtin_msa_vshf_h((v8i16)((v2i64){0x0004000200010000, 0x0009000800060005}), msa_dupq_n_s16(0), vec.val)); + return c; +} + +inline v_uint16x8 v_pack_triplets(const v_uint16x8& vec) { return v_reinterpret_as_u16(v_pack_triplets(v_reinterpret_as_s16(vec))); } +inline v_int32x4 v_pack_triplets(const v_int32x4& vec) { return vec; } +inline v_uint32x4 v_pack_triplets(const v_uint32x4& vec) { return vec; } +inline v_float32x4 v_pack_triplets(const v_float32x4& vec) { return vec; } + +inline v_float64x2 v_lut(const double* tab, const int* idx) +{ + double CV_DECL_ALIGNED(32) elems[2] = + { + tab[idx[0]], + tab[idx[1]] + }; + return v_float64x2(msa_ld1q_f64(elems)); +} + +inline v_float64x2 v_lut_pairs(const double* tab, const int* idx) +{ + return v_float64x2(msa_ld1q_f64(tab + idx[0])); +} + +inline v_float64x2 v_lut(const double* tab, const v_int32x4& idxvec) +{ + int CV_DECL_ALIGNED(32) idx[4]; + v_store_aligned(idx, idxvec); + + return v_float64x2(tab[idx[0]], tab[idx[1]]); +} + +inline void v_lut_deinterleave(const double* tab, const v_int32x4& idxvec, v_float64x2& x, v_float64x2& y) +{ + int CV_DECL_ALIGNED(32) idx[4]; + v_store_aligned(idx, idxvec); + + v2f64 xy0 = msa_ld1q_f64(tab + idx[0]); + v2f64 xy1 = msa_ld1q_f64(tab + idx[1]); + x = v_float64x2(MSA_TPV_REINTERPRET(v2f64, msa_ilvevq_s64(MSA_TPV_REINTERPRET(v2i64, xy1), MSA_TPV_REINTERPRET(v2i64, xy0)))); + y = v_float64x2(MSA_TPV_REINTERPRET(v2f64, msa_ilvodq_s64(MSA_TPV_REINTERPRET(v2i64, xy1), MSA_TPV_REINTERPRET(v2i64, xy0)))); +} + +template +inline typename _Tp::lane_type v_extract_n(const _Tp& a) +{ + return v_rotate_right(a).get0(); +} + +template +inline v_uint32x4 v_broadcast_element(const v_uint32x4& a) +{ + return v_setall_u32(v_extract_n(a)); +} +template +inline v_int32x4 v_broadcast_element(const v_int32x4& a) +{ + return v_setall_s32(v_extract_n(a)); +} +template +inline v_float32x4 v_broadcast_element(const v_float32x4& a) +{ + return v_setall_f32(v_extract_n(a)); +} + +////// FP16 support /////// +#if CV_FP16 +inline v_float32x4 v_load_expand(const float16_t* ptr) +{ +#ifndef msa_ld1_f16 + v4f16 v = (v4f16)msa_ld1_s16((const short*)ptr); +#else + v4f16 v = msa_ld1_f16((const __fp16*)ptr); +#endif + return v_float32x4(msa_cvt_f32_f16(v)); +} + +inline void v_pack_store(float16_t* ptr, const v_float32x4& v) +{ + v4f16 hv = msa_cvt_f16_f32(v.val); + +#ifndef msa_st1_f16 + msa_st1_s16((short*)ptr, (int16x4_t)hv); +#else + msa_st1_f16((__fp16*)ptr, hv); +#endif +} +#else +inline v_float32x4 v_load_expand(const float16_t* ptr) +{ + float buf[4]; + for( int i = 0; i < 4; i++ ) + buf[i] = (float)ptr[i]; + return v_load(buf); +} + +inline void v_pack_store(float16_t* ptr, const v_float32x4& v) +{ + float buf[4]; + v_store(buf, v); + for( int i = 0; i < 4; i++ ) + ptr[i] = (float16_t)buf[i]; +} +#endif + +inline void v_cleanup() {} + +CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END + +//! 
@endcond + +} + +#endif diff --git a/3rdparty/opencv/include/opencv2/core/hal/intrin_wasm.hpp b/3rdparty/opencv/include/opencv2/core/hal/intrin_wasm.hpp new file mode 100644 index 00000000..b4178af8 --- /dev/null +++ b/3rdparty/opencv/include/opencv2/core/hal/intrin_wasm.hpp @@ -0,0 +1,2782 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. + +#ifndef OPENCV_HAL_INTRIN_WASM_HPP +#define OPENCV_HAL_INTRIN_WASM_HPP + +#include +#include +#include +#include "opencv2/core/saturate.hpp" + +#define CV_SIMD128 1 +#define CV_SIMD128_64F 0 // Now all implementation of f64 use fallback, so disable it. +#define CV_SIMD128_FP16 0 + +namespace cv +{ + +//! @cond IGNORED + +CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN + +#if (__EMSCRIPTEN_major__ * 1000000 + __EMSCRIPTEN_minor__ * 1000 + __EMSCRIPTEN_tiny__) < (1038046) +// handle renames: https://github.com/emscripten-core/emscripten/pull/9440 (https://github.com/emscripten-core/emscripten/commit/755d5b46cb84d0aa120c10981b11d05646c29673) +#define wasm_i32x4_trunc_saturate_f32x4 wasm_trunc_saturate_i32x4_f32x4 +#define wasm_u32x4_trunc_saturate_f32x4 wasm_trunc_saturate_u32x4_f32x4 +#define wasm_i64x2_trunc_saturate_f64x2 wasm_trunc_saturate_i64x2_f64x2 +#define wasm_u64x2_trunc_saturate_f64x2 wasm_trunc_saturate_u64x2_f64x2 +#define wasm_f32x4_convert_i32x4 wasm_convert_f32x4_i32x4 +#define wasm_f32x4_convert_u32x4 wasm_convert_f32x4_u32x4 +#define wasm_f64x2_convert_i64x2 wasm_convert_f64x2_i64x2 +#define wasm_f64x2_convert_u64x2 wasm_convert_f64x2_u64x2 +#endif // COMPATIBILITY: <1.38.46 + +///////// Types /////////// + +struct v_uint8x16 +{ + typedef uchar lane_type; + typedef v128_t vector_type; + enum { nlanes = 16 }; + + v_uint8x16() {} + explicit v_uint8x16(v128_t v) : val(v) {} + v_uint8x16(uchar v0, uchar v1, uchar v2, uchar v3, uchar v4, uchar v5, uchar v6, uchar v7, + uchar v8, uchar v9, uchar v10, uchar v11, uchar v12, uchar v13, uchar v14, uchar v15) + { + uchar v[] = {v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15}; + val = wasm_v128_load(v); + } + + uchar get0() const + { + return (uchar)wasm_i8x16_extract_lane(val, 0); + } + + v128_t val; +}; + +struct v_int8x16 +{ + typedef schar lane_type; + typedef v128_t vector_type; + enum { nlanes = 16 }; + + v_int8x16() {} + explicit v_int8x16(v128_t v) : val(v) {} + v_int8x16(schar v0, schar v1, schar v2, schar v3, schar v4, schar v5, schar v6, schar v7, + schar v8, schar v9, schar v10, schar v11, schar v12, schar v13, schar v14, schar v15) + { + schar v[] = {v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15}; + val = wasm_v128_load(v); + } + + schar get0() const + { + return wasm_i8x16_extract_lane(val, 0); + } + + v128_t val; +}; + +struct v_uint16x8 +{ + typedef ushort lane_type; + typedef v128_t vector_type; + enum { nlanes = 8 }; + + v_uint16x8() {} + explicit v_uint16x8(v128_t v) : val(v) {} + v_uint16x8(ushort v0, ushort v1, ushort v2, ushort v3, ushort v4, ushort v5, ushort v6, ushort v7) + { + ushort v[] = {v0, v1, v2, v3, v4, v5, v6, v7}; + val = wasm_v128_load(v); + } + + ushort get0() const + { + return (ushort)wasm_i16x8_extract_lane(val, 0); // wasm_u16x8_extract_lane() unimplemented yet + } + + v128_t val; +}; + +struct v_int16x8 +{ + typedef short lane_type; + typedef v128_t vector_type; + enum { nlanes = 8 }; + + v_int16x8() {} + explicit v_int16x8(v128_t v) : val(v) {} + v_int16x8(short v0, short 
v1, short v2, short v3, short v4, short v5, short v6, short v7) + { + short v[] = {v0, v1, v2, v3, v4, v5, v6, v7}; + val = wasm_v128_load(v); + } + + short get0() const + { + return wasm_i16x8_extract_lane(val, 0); + } + + v128_t val; +}; + +struct v_uint32x4 +{ + typedef unsigned lane_type; + typedef v128_t vector_type; + enum { nlanes = 4 }; + + v_uint32x4() {} + explicit v_uint32x4(v128_t v) : val(v) {} + v_uint32x4(unsigned v0, unsigned v1, unsigned v2, unsigned v3) + { + unsigned v[] = {v0, v1, v2, v3}; + val = wasm_v128_load(v); + } + + unsigned get0() const + { + return (unsigned)wasm_i32x4_extract_lane(val, 0); + } + + v128_t val; +}; + +struct v_int32x4 +{ + typedef int lane_type; + typedef v128_t vector_type; + enum { nlanes = 4 }; + + v_int32x4() {} + explicit v_int32x4(v128_t v) : val(v) {} + v_int32x4(int v0, int v1, int v2, int v3) + { + int v[] = {v0, v1, v2, v3}; + val = wasm_v128_load(v); + } + + int get0() const + { + return wasm_i32x4_extract_lane(val, 0); + } + + v128_t val; +}; + +struct v_float32x4 +{ + typedef float lane_type; + typedef v128_t vector_type; + enum { nlanes = 4 }; + + v_float32x4() {} + explicit v_float32x4(v128_t v) : val(v) {} + v_float32x4(float v0, float v1, float v2, float v3) + { + float v[] = {v0, v1, v2, v3}; + val = wasm_v128_load(v); + } + + float get0() const + { + return wasm_f32x4_extract_lane(val, 0); + } + + v128_t val; +}; + +struct v_uint64x2 +{ + typedef uint64 lane_type; + typedef v128_t vector_type; + enum { nlanes = 2 }; + + v_uint64x2() {} + explicit v_uint64x2(v128_t v) : val(v) {} + v_uint64x2(uint64 v0, uint64 v1) + { + uint64 v[] = {v0, v1}; + val = wasm_v128_load(v); + } + + uint64 get0() const + { + return (uint64)wasm_i64x2_extract_lane(val, 0); + } + + v128_t val; +}; + +struct v_int64x2 +{ + typedef int64 lane_type; + typedef v128_t vector_type; + enum { nlanes = 2 }; + + v_int64x2() {} + explicit v_int64x2(v128_t v) : val(v) {} + v_int64x2(int64 v0, int64 v1) + { + int64 v[] = {v0, v1}; + val = wasm_v128_load(v); + } + + int64 get0() const + { + return wasm_i64x2_extract_lane(val, 0); + } + + v128_t val; +}; + +struct v_float64x2 +{ + typedef double lane_type; + typedef v128_t vector_type; + enum { nlanes = 2 }; + + v_float64x2() {} + explicit v_float64x2(v128_t v) : val(v) {} + v_float64x2(double v0, double v1) + { + double v[] = {v0, v1}; + val = wasm_v128_load(v); + } + + double get0() const + { + return wasm_f64x2_extract_lane(val, 0); + } + + v128_t val; +}; + +namespace +{ +#define OPENCV_HAL_IMPL_REINTERPRET_INT(ft, tt) \ +inline tt reinterpret_int(ft x) { union { ft l; tt i; } v; v.l = x; return v.i; } +OPENCV_HAL_IMPL_REINTERPRET_INT(uchar, schar) +OPENCV_HAL_IMPL_REINTERPRET_INT(schar, schar) +OPENCV_HAL_IMPL_REINTERPRET_INT(ushort, short) +OPENCV_HAL_IMPL_REINTERPRET_INT(short, short) +OPENCV_HAL_IMPL_REINTERPRET_INT(unsigned, int) +OPENCV_HAL_IMPL_REINTERPRET_INT(int, int) +OPENCV_HAL_IMPL_REINTERPRET_INT(float, int) +OPENCV_HAL_IMPL_REINTERPRET_INT(uint64, int64) +OPENCV_HAL_IMPL_REINTERPRET_INT(int64, int64) +OPENCV_HAL_IMPL_REINTERPRET_INT(double, int64) + +static const unsigned char popCountTable[] = +{ + 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4, + 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, + 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, + 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, + 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, + 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, + 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, + 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, + 1, 2, 
2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, + 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, + 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, + 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, + 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, + 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, + 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, + 4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8, +}; +} // namespace + +static v128_t wasm_unpacklo_i8x16(v128_t a, v128_t b) { + return wasm_v8x16_shuffle(a, b, 0,16,1,17,2,18,3,19,4,20,5,21,6,22,7,23); +} + +static v128_t wasm_unpacklo_i16x8(v128_t a, v128_t b) { + return wasm_v8x16_shuffle(a, b, 0,1,16,17,2,3,18,19,4,5,20,21,6,7,22,23); +} + +static v128_t wasm_unpacklo_i32x4(v128_t a, v128_t b) { + return wasm_v8x16_shuffle(a, b, 0,1,2,3,16,17,18,19,4,5,6,7,20,21,22,23); +} + +static v128_t wasm_unpacklo_i64x2(v128_t a, v128_t b) { + return wasm_v8x16_shuffle(a, b, 0,1,2,3,4,5,6,7,16,17,18,19,20,21,22,23); +} + +static v128_t wasm_unpackhi_i8x16(v128_t a, v128_t b) { + return wasm_v8x16_shuffle(a, b, 8,24,9,25,10,26,11,27,12,28,13,29,14,30,15,31); +} + +static v128_t wasm_unpackhi_i16x8(v128_t a, v128_t b) { + return wasm_v8x16_shuffle(a, b, 8,9,24,25,10,11,26,27,12,13,28,29,14,15,30,31); +} + +static v128_t wasm_unpackhi_i32x4(v128_t a, v128_t b) { + return wasm_v8x16_shuffle(a, b, 8,9,10,11,24,25,26,27,12,13,14,15,28,29,30,31); +} + +static v128_t wasm_unpackhi_i64x2(v128_t a, v128_t b) { + return wasm_v8x16_shuffle(a, b, 8,9,10,11,12,13,14,15,24,25,26,27,28,29,30,31); +} + +/** Convert **/ +// 8 >> 16 +inline v128_t v128_cvtu8x16_i16x8(const v128_t& a) +{ + const v128_t z = wasm_i8x16_splat(0); + return wasm_unpacklo_i8x16(a, z); +} +inline v128_t v128_cvti8x16_i16x8(const v128_t& a) +{ return wasm_i16x8_shr(wasm_unpacklo_i8x16(a, a), 8); } +// 8 >> 32 +inline v128_t v128_cvtu8x16_i32x4(const v128_t& a) +{ + const v128_t z = wasm_i8x16_splat(0); + return wasm_unpacklo_i16x8(wasm_unpacklo_i8x16(a, z), z); +} +inline v128_t v128_cvti8x16_i32x4(const v128_t& a) +{ + v128_t r = wasm_unpacklo_i8x16(a, a); + r = wasm_unpacklo_i8x16(r, r); + return wasm_i32x4_shr(r, 24); +} +// 16 >> 32 +inline v128_t v128_cvtu16x8_i32x4(const v128_t& a) +{ + const v128_t z = wasm_i8x16_splat(0); + return wasm_unpacklo_i16x8(a, z); +} +inline v128_t v128_cvti16x8_i32x4(const v128_t& a) +{ return wasm_i32x4_shr(wasm_unpacklo_i16x8(a, a), 16); } +// 32 >> 64 +inline v128_t v128_cvtu32x4_i64x2(const v128_t& a) +{ + const v128_t z = wasm_i8x16_splat(0); + return wasm_unpacklo_i32x4(a, z); +} +inline v128_t v128_cvti32x4_i64x2(const v128_t& a) +{ return wasm_unpacklo_i32x4(a, wasm_i32x4_shr(a, 31)); } + +// 16 << 8 +inline v128_t v128_cvtu8x16_i16x8_high(const v128_t& a) +{ + const v128_t z = wasm_i8x16_splat(0); + return wasm_unpackhi_i8x16(a, z); +} +inline v128_t v128_cvti8x16_i16x8_high(const v128_t& a) +{ return wasm_i16x8_shr(wasm_unpackhi_i8x16(a, a), 8); } +// 32 << 16 +inline v128_t v128_cvtu16x8_i32x4_high(const v128_t& a) +{ + const v128_t z = wasm_i8x16_splat(0); + return wasm_unpackhi_i16x8(a, z); +} +inline v128_t v128_cvti16x8_i32x4_high(const v128_t& a) +{ return wasm_i32x4_shr(wasm_unpackhi_i16x8(a, a), 16); } +// 64 << 32 +inline v128_t v128_cvtu32x4_i64x2_high(const v128_t& a) +{ + const v128_t z = wasm_i8x16_splat(0); + return wasm_unpackhi_i32x4(a, z); +} +inline v128_t v128_cvti32x4_i64x2_high(const v128_t& a) +{ return wasm_unpackhi_i32x4(a, wasm_i32x4_shr(a, 31)); } + +#define OPENCV_HAL_IMPL_WASM_INITVEC(_Tpvec, _Tp, suffix, zsuffix, _Tps) \ +inline 
_Tpvec v_setzero_##suffix() { return _Tpvec(wasm_##zsuffix##_splat((_Tps)0)); } \ +inline _Tpvec v_setall_##suffix(_Tp v) { return _Tpvec(wasm_##zsuffix##_splat((_Tps)v)); } \ +template inline _Tpvec v_reinterpret_as_##suffix(const _Tpvec0& a) \ +{ return _Tpvec(a.val); } + +OPENCV_HAL_IMPL_WASM_INITVEC(v_uint8x16, uchar, u8, i8x16, schar) +OPENCV_HAL_IMPL_WASM_INITVEC(v_int8x16, schar, s8, i8x16, schar) +OPENCV_HAL_IMPL_WASM_INITVEC(v_uint16x8, ushort, u16, i16x8, short) +OPENCV_HAL_IMPL_WASM_INITVEC(v_int16x8, short, s16, i16x8, short) +OPENCV_HAL_IMPL_WASM_INITVEC(v_uint32x4, unsigned, u32, i32x4, int) +OPENCV_HAL_IMPL_WASM_INITVEC(v_int32x4, int, s32, i32x4, int) +OPENCV_HAL_IMPL_WASM_INITVEC(v_float32x4, float, f32, f32x4, float) +OPENCV_HAL_IMPL_WASM_INITVEC(v_uint64x2, uint64, u64, i64x2, int64) +OPENCV_HAL_IMPL_WASM_INITVEC(v_int64x2, int64, s64, i64x2, int64) +OPENCV_HAL_IMPL_WASM_INITVEC(v_float64x2, double, f64, f64x2, double) + +//////////////// PACK /////////////// +inline v_uint8x16 v_pack(const v_uint16x8& a, const v_uint16x8& b) +{ + v128_t maxval = wasm_i16x8_splat(255); + v128_t a1 = wasm_v128_bitselect(maxval, a.val, wasm_u16x8_gt(a.val, maxval)); + v128_t b1 = wasm_v128_bitselect(maxval, b.val, wasm_u16x8_gt(b.val, maxval)); + return v_uint8x16(wasm_v8x16_shuffle(a1, b1, 0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30)); +} +inline v_int8x16 v_pack(const v_int16x8& a, const v_int16x8& b) +{ + v128_t maxval = wasm_i16x8_splat(127); + v128_t minval = wasm_i16x8_splat(-128); + v128_t a1 = wasm_v128_bitselect(maxval, a.val, wasm_i16x8_gt(a.val, maxval)); + v128_t b1 = wasm_v128_bitselect(maxval, b.val, wasm_i16x8_gt(b.val, maxval)); + v128_t a2 = wasm_v128_bitselect(minval, a1, wasm_i16x8_lt(a1, minval)); + v128_t b2 = wasm_v128_bitselect(minval, b1, wasm_i16x8_lt(b1, minval)); + return v_int8x16(wasm_v8x16_shuffle(a2, b2, 0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30)); +} +inline v_uint16x8 v_pack(const v_uint32x4& a, const v_uint32x4& b) +{ + v128_t maxval = wasm_i32x4_splat(65535); + v128_t a1 = wasm_v128_bitselect(maxval, a.val, wasm_u32x4_gt(a.val, maxval)); + v128_t b1 = wasm_v128_bitselect(maxval, b.val, wasm_u32x4_gt(b.val, maxval)); + return v_uint16x8(wasm_v8x16_shuffle(a1, b1, 0,1,4,5,8,9,12,13,16,17,20,21,24,25,28,29)); +} +inline v_int16x8 v_pack(const v_int32x4& a, const v_int32x4& b) +{ + v128_t maxval = wasm_i32x4_splat(32767); + v128_t minval = wasm_i32x4_splat(-32768); + v128_t a1 = wasm_v128_bitselect(maxval, a.val, wasm_i32x4_gt(a.val, maxval)); + v128_t b1 = wasm_v128_bitselect(maxval, b.val, wasm_i32x4_gt(b.val, maxval)); + v128_t a2 = wasm_v128_bitselect(minval, a1, wasm_i32x4_lt(a1, minval)); + v128_t b2 = wasm_v128_bitselect(minval, b1, wasm_i32x4_lt(b1, minval)); + return v_int16x8(wasm_v8x16_shuffle(a2, b2, 0,1,4,5,8,9,12,13,16,17,20,21,24,25,28,29)); +} +inline v_uint32x4 v_pack(const v_uint64x2& a, const v_uint64x2& b) +{ + return v_uint32x4(wasm_v8x16_shuffle(a.val, b.val, 0,1,2,3,8,9,10,11,16,17,18,19,24,25,26,27)); +} +inline v_int32x4 v_pack(const v_int64x2& a, const v_int64x2& b) +{ + return v_int32x4(wasm_v8x16_shuffle(a.val, b.val, 0,1,2,3,8,9,10,11,16,17,18,19,24,25,26,27)); +} +inline v_uint8x16 v_pack_u(const v_int16x8& a, const v_int16x8& b) +{ + v128_t maxval = wasm_i16x8_splat(255); + v128_t minval = wasm_i16x8_splat(0); + v128_t a1 = wasm_v128_bitselect(maxval, a.val, wasm_i16x8_gt(a.val, maxval)); + v128_t b1 = wasm_v128_bitselect(maxval, b.val, wasm_i16x8_gt(b.val, maxval)); + v128_t a2 = wasm_v128_bitselect(minval, a1, 
wasm_i16x8_lt(a1, minval));
+    v128_t b2 = wasm_v128_bitselect(minval, b1, wasm_i16x8_lt(b1, minval));
+    return v_uint8x16(wasm_v8x16_shuffle(a2, b2, 0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30));
+}
+inline v_uint16x8 v_pack_u(const v_int32x4& a, const v_int32x4& b)
+{
+    v128_t maxval = wasm_i32x4_splat(65535);
+    v128_t minval = wasm_i32x4_splat(0);
+    v128_t a1 = wasm_v128_bitselect(maxval, a.val, wasm_i32x4_gt(a.val, maxval));
+    v128_t b1 = wasm_v128_bitselect(maxval, b.val, wasm_i32x4_gt(b.val, maxval));
+    v128_t a2 = wasm_v128_bitselect(minval, a1, wasm_i32x4_lt(a1, minval));
+    v128_t b2 = wasm_v128_bitselect(minval, b1, wasm_i32x4_lt(b1, minval));
+    return v_uint16x8(wasm_v8x16_shuffle(a2, b2, 0,1,4,5,8,9,12,13,16,17,20,21,24,25,28,29));
+}
+
+template<int n>
+inline v_uint8x16 v_rshr_pack(const v_uint16x8& a, const v_uint16x8& b)
+{
+    v128_t delta = wasm_i16x8_splat(((short)1 << (n-1)));
+    v128_t a1 = wasm_u16x8_shr(wasm_i16x8_add(a.val, delta), n);
+    v128_t b1 = wasm_u16x8_shr(wasm_i16x8_add(b.val, delta), n);
+    v128_t maxval = wasm_i16x8_splat(255);
+    v128_t a2 = wasm_v128_bitselect(maxval, a1, wasm_u16x8_gt(a1, maxval));
+    v128_t b2 = wasm_v128_bitselect(maxval, b1, wasm_u16x8_gt(b1, maxval));
+    return v_uint8x16(wasm_v8x16_shuffle(a2, b2, 0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30));
+}
+template<int n>
+inline v_int8x16 v_rshr_pack(const v_int16x8& a, const v_int16x8& b)
+{
+    v128_t delta = wasm_i16x8_splat(((short)1 << (n-1)));
+    v128_t a1 = wasm_i16x8_shr(wasm_i16x8_add(a.val, delta), n);
+    v128_t b1 = wasm_i16x8_shr(wasm_i16x8_add(b.val, delta), n);
+    v128_t maxval = wasm_i16x8_splat(127);
+    v128_t minval = wasm_i16x8_splat(-128);
+    v128_t a2 = wasm_v128_bitselect(maxval, a1, wasm_i16x8_gt(a1, maxval));
+    v128_t b2 = wasm_v128_bitselect(maxval, b1, wasm_i16x8_gt(b1, maxval));
+    v128_t a3 = wasm_v128_bitselect(minval, a2, wasm_i16x8_lt(a1, minval));
+    v128_t b3 = wasm_v128_bitselect(minval, b2, wasm_i16x8_lt(b1, minval));
+    return v_int8x16(wasm_v8x16_shuffle(a3, b3, 0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30));
+}
+template<int n>
+inline v_uint16x8 v_rshr_pack(const v_uint32x4& a, const v_uint32x4& b)
+{
+    v128_t delta = wasm_i32x4_splat(((int)1 << (n-1)));
+    v128_t a1 = wasm_u32x4_shr(wasm_i32x4_add(a.val, delta), n);
+    v128_t b1 = wasm_u32x4_shr(wasm_i32x4_add(b.val, delta), n);
+    v128_t maxval = wasm_i32x4_splat(65535);
+    v128_t a2 = wasm_v128_bitselect(maxval, a1, wasm_u32x4_gt(a1, maxval));
+    v128_t b2 = wasm_v128_bitselect(maxval, b1, wasm_u32x4_gt(b1, maxval));
+    return v_uint16x8(wasm_v8x16_shuffle(a2, b2, 0,1,4,5,8,9,12,13,16,17,20,21,24,25,28,29));
+}
+template<int n>
+inline v_int16x8 v_rshr_pack(const v_int32x4& a, const v_int32x4& b)
+{
+    v128_t delta = wasm_i32x4_splat(((int)1 << (n-1)));
+    v128_t a1 = wasm_i32x4_shr(wasm_i32x4_add(a.val, delta), n);
+    v128_t b1 = wasm_i32x4_shr(wasm_i32x4_add(b.val, delta), n);
+    v128_t maxval = wasm_i32x4_splat(32767);
+    v128_t minval = wasm_i32x4_splat(-32768);
+    v128_t a2 = wasm_v128_bitselect(maxval, a1, wasm_i32x4_gt(a1, maxval));
+    v128_t b2 = wasm_v128_bitselect(maxval, b1, wasm_i32x4_gt(b1, maxval));
+    v128_t a3 = wasm_v128_bitselect(minval, a2, wasm_i32x4_lt(a1, minval));
+    v128_t b3 = wasm_v128_bitselect(minval, b2, wasm_i32x4_lt(b1, minval));
+    return v_int16x8(wasm_v8x16_shuffle(a3, b3, 0,1,4,5,8,9,12,13,16,17,20,21,24,25,28,29));
+}
+template<int n>
+inline v_uint32x4 v_rshr_pack(const v_uint64x2& a, const v_uint64x2& b)
+{
+    v128_t delta = wasm_i64x2_splat(((int64)1 << (n-1)));
+    v128_t a1 = wasm_u64x2_shr(wasm_i64x2_add(a.val, delta), n);
+    v128_t b1 = wasm_u64x2_shr(wasm_i64x2_add(b.val, delta), n);
+    return v_uint32x4(wasm_v8x16_shuffle(a1, b1, 0,1,2,3,8,9,10,11,16,17,18,19,24,25,26,27));
+}
+template<int n>
+inline v_int32x4 v_rshr_pack(const v_int64x2& a, const v_int64x2& b)
+{
+    v128_t delta = wasm_i64x2_splat(((int64)1 << (n-1)));
+    v128_t a1 = wasm_i64x2_shr(wasm_i64x2_add(a.val, delta), n);
+    v128_t b1 = wasm_i64x2_shr(wasm_i64x2_add(b.val, delta), n);
+    return v_int32x4(wasm_v8x16_shuffle(a1, b1, 0,1,2,3,8,9,10,11,16,17,18,19,24,25,26,27));
+}
+template<int n>
+inline v_uint8x16 v_rshr_pack_u(const v_int16x8& a, const v_int16x8& b)
+{
+    v128_t delta = wasm_i16x8_splat(((short)1 << (n-1)));
+    v128_t a1 = wasm_i16x8_shr(wasm_i16x8_add(a.val, delta), n);
+    v128_t b1 = wasm_i16x8_shr(wasm_i16x8_add(b.val, delta), n);
+    v128_t maxval = wasm_i16x8_splat(255);
+    v128_t minval = wasm_i16x8_splat(0);
+    v128_t a2 = wasm_v128_bitselect(maxval, a1, wasm_i16x8_gt(a1, maxval));
+    v128_t b2 = wasm_v128_bitselect(maxval, b1, wasm_i16x8_gt(b1, maxval));
+    v128_t a3 = wasm_v128_bitselect(minval, a2, wasm_i16x8_lt(a1, minval));
+    v128_t b3 = wasm_v128_bitselect(minval, b2, wasm_i16x8_lt(b1, minval));
+    return v_uint8x16(wasm_v8x16_shuffle(a3, b3, 0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30));
+}
+template<int n>
+inline v_uint16x8 v_rshr_pack_u(const v_int32x4& a, const v_int32x4& b)
+{
+    v128_t delta = wasm_i32x4_splat(((int)1 << (n-1)));
+    v128_t a1 = wasm_i32x4_shr(wasm_i32x4_add(a.val, delta), n);
+    v128_t b1 = wasm_i32x4_shr(wasm_i32x4_add(b.val, delta), n);
+    v128_t maxval = wasm_i32x4_splat(65535);
+    v128_t minval = wasm_i32x4_splat(0);
+    v128_t a2 = wasm_v128_bitselect(maxval, a1, wasm_i32x4_gt(a1, maxval));
+    v128_t b2 = wasm_v128_bitselect(maxval, b1, wasm_i32x4_gt(b1, maxval));
+    v128_t a3 = wasm_v128_bitselect(minval, a2, wasm_i32x4_lt(a1, minval));
+    v128_t b3 = wasm_v128_bitselect(minval, b2, wasm_i32x4_lt(b1, minval));
+    return v_uint16x8(wasm_v8x16_shuffle(a3, b3, 0,1,4,5,8,9,12,13,16,17,20,21,24,25,28,29));
+}
+
+inline void v_pack_store(uchar* ptr, const v_uint16x8& a)
+{
+    v128_t maxval = wasm_i16x8_splat(255);
+    v128_t a1 = wasm_v128_bitselect(maxval, a.val, wasm_u16x8_gt(a.val, maxval));
+    v128_t r = wasm_v8x16_shuffle(a1, a1, 0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14);
+    uchar t_ptr[16];
+    wasm_v128_store(t_ptr, r);
+    for (int i=0; i<8; ++i) {
+        ptr[i] = t_ptr[i];
+    }
+}
+inline void v_pack_store(schar* ptr, const v_int16x8& a)
+{
+    v128_t maxval = wasm_i16x8_splat(127);
+    v128_t minval = wasm_i16x8_splat(-128);
+    v128_t a1 = wasm_v128_bitselect(maxval, a.val, wasm_i16x8_gt(a.val, maxval));
+    v128_t a2 = wasm_v128_bitselect(minval, a1, wasm_i16x8_lt(a1, minval));
+    v128_t r = wasm_v8x16_shuffle(a2, a2, 0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14);
+    schar t_ptr[16];
+    wasm_v128_store(t_ptr, r);
+    for (int i=0; i<8; ++i) {
+        ptr[i] = t_ptr[i];
+    }
+}
+inline void v_pack_store(ushort* ptr, const v_uint32x4& a)
+{
+    v128_t maxval = wasm_i32x4_splat(65535);
+    v128_t a1 = wasm_v128_bitselect(maxval, a.val, wasm_u32x4_gt(a.val, maxval));
+    v128_t r = wasm_v8x16_shuffle(a1, a1, 0,1,4,5,8,9,12,13,0,1,4,5,8,9,12,13);
+    ushort t_ptr[8];
+    wasm_v128_store(t_ptr, r);
+    for (int i=0; i<4; ++i) {
+        ptr[i] = t_ptr[i];
+    }
+}
+inline void v_pack_store(short* ptr, const v_int32x4& a)
+{
+    v128_t maxval = wasm_i32x4_splat(32767);
+    v128_t minval = wasm_i32x4_splat(-32768);
+    v128_t a1 = wasm_v128_bitselect(maxval, a.val, wasm_i32x4_gt(a.val, maxval));
+    v128_t a2 = wasm_v128_bitselect(minval, a1, wasm_i32x4_lt(a1, minval));
+    v128_t r = wasm_v8x16_shuffle(a2, a2, 0,1,4,5,8,9,12,13,0,1,4,5,8,9,12,13);
+    short t_ptr[8];
+    wasm_v128_store(t_ptr, r);
+    for (int i=0; i<4; ++i) {
+        ptr[i] = t_ptr[i];
+    }
+}
+inline void v_pack_store(unsigned* ptr, const v_uint64x2& a)
+{
+    v128_t r = wasm_v8x16_shuffle(a.val, a.val, 0,1,2,3,8,9,10,11,0,1,2,3,8,9,10,11);
+    unsigned t_ptr[4];
+    wasm_v128_store(t_ptr, r);
+    for (int i=0; i<2; ++i) {
+        ptr[i] = t_ptr[i];
+    }
+}
+inline void v_pack_store(int* ptr, const v_int64x2& a)
+{
+    v128_t r = wasm_v8x16_shuffle(a.val, a.val, 0,1,2,3,8,9,10,11,0,1,2,3,8,9,10,11);
+    int t_ptr[4];
+    wasm_v128_store(t_ptr, r);
+    for (int i=0; i<2; ++i) {
+        ptr[i] = t_ptr[i];
+    }
+}
+inline void v_pack_u_store(uchar* ptr, const v_int16x8& a)
+{
+    v128_t maxval = wasm_i16x8_splat(255);
+    v128_t minval = wasm_i16x8_splat(0);
+    v128_t a1 = wasm_v128_bitselect(maxval, a.val, wasm_i16x8_gt(a.val, maxval));
+    v128_t a2 = wasm_v128_bitselect(minval, a1, wasm_i16x8_lt(a1, minval));
+    v128_t r = wasm_v8x16_shuffle(a2, a2, 0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14);
+    uchar t_ptr[16];
+    wasm_v128_store(t_ptr, r);
+    for (int i=0; i<8; ++i) {
+        ptr[i] = t_ptr[i];
+    }
+}
+inline void v_pack_u_store(ushort* ptr, const v_int32x4& a)
+{
+    v128_t maxval = wasm_i32x4_splat(65535);
+    v128_t minval = wasm_i32x4_splat(0);
+    v128_t a1 = wasm_v128_bitselect(maxval, a.val, wasm_i32x4_gt(a.val, maxval));
+    v128_t a2 = wasm_v128_bitselect(minval, a1, wasm_i32x4_lt(a1, minval));
+    v128_t r = wasm_v8x16_shuffle(a2, a2, 0,1,4,5,8,9,12,13,0,1,4,5,8,9,12,13);
+    ushort t_ptr[8];
+    wasm_v128_store(t_ptr, r);
+    for (int i=0; i<4; ++i) {
+        ptr[i] = t_ptr[i];
+    }
+}
+
+template<int n>
+inline void v_rshr_pack_store(uchar* ptr, const v_uint16x8& a)
+{
+    v128_t delta = wasm_i16x8_splat((short)(1 << (n-1)));
+    v128_t a1 = wasm_u16x8_shr(wasm_i16x8_add(a.val, delta), n);
+    v128_t maxval = wasm_i16x8_splat(255);
+    v128_t a2 = wasm_v128_bitselect(maxval, a1, wasm_u16x8_gt(a1, maxval));
+    v128_t r = wasm_v8x16_shuffle(a2, a2, 0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14);
+    uchar t_ptr[16];
+    wasm_v128_store(t_ptr, r);
+    for (int i=0; i<8; ++i) {
+        ptr[i] = t_ptr[i];
+    }
+}
+template<int n>
+inline void v_rshr_pack_store(schar* ptr, const v_int16x8& a)
+{
+    v128_t delta = wasm_i16x8_splat(((short)1 << (n-1)));
+    v128_t a1 = wasm_i16x8_shr(wasm_i16x8_add(a.val, delta), n);
+    v128_t maxval = wasm_i16x8_splat(127);
+    v128_t minval = wasm_i16x8_splat(-128);
+    v128_t a2 = wasm_v128_bitselect(maxval, a1, wasm_i16x8_gt(a1, maxval));
+    v128_t a3 = wasm_v128_bitselect(minval, a2, wasm_i16x8_lt(a1, minval));
+    v128_t r = wasm_v8x16_shuffle(a3, a3, 0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14);
+    schar t_ptr[16];
+    wasm_v128_store(t_ptr, r);
+    for (int i=0; i<8; ++i) {
+        ptr[i] = t_ptr[i];
+    }
+}
+template<int n>
+inline void v_rshr_pack_store(ushort* ptr, const v_uint32x4& a)
+{
+    v128_t delta = wasm_i32x4_splat(((int)1 << (n-1)));
+    v128_t a1 = wasm_u32x4_shr(wasm_i32x4_add(a.val, delta), n);
+    v128_t maxval = wasm_i32x4_splat(65535);
+    v128_t a2 = wasm_v128_bitselect(maxval, a1, wasm_u32x4_gt(a1, maxval));
+    v128_t r = wasm_v8x16_shuffle(a2, a2, 0,1,4,5,8,9,12,13,0,1,4,5,8,9,12,13);
+    ushort t_ptr[8];
+    wasm_v128_store(t_ptr, r);
+    for (int i=0; i<4; ++i) {
+        ptr[i] = t_ptr[i];
+    }
+}
+template<int n>
+inline void v_rshr_pack_store(short* ptr, const v_int32x4& a)
+{
+    v128_t delta = wasm_i32x4_splat(((int)1 << (n-1)));
+    v128_t a1 = wasm_i32x4_shr(wasm_i32x4_add(a.val, delta), n);
+    v128_t maxval = wasm_i32x4_splat(32767);
+    v128_t minval = wasm_i32x4_splat(-32768);
+    v128_t a2 = wasm_v128_bitselect(maxval, a1, wasm_i32x4_gt(a1, maxval));
+    v128_t a3 = wasm_v128_bitselect(minval, a2, wasm_i32x4_lt(a1, minval));
+    v128_t r = wasm_v8x16_shuffle(a3, a3, 0,1,4,5,8,9,12,13,0,1,4,5,8,9,12,13);
+    short t_ptr[8];
+    wasm_v128_store(t_ptr, r);
+    for (int i=0; i<4; ++i) {
+        ptr[i] = t_ptr[i];
+    }
+}
+template<int n>
+inline void v_rshr_pack_store(unsigned* ptr, const v_uint64x2& a)
+{
+    v128_t delta = wasm_i64x2_splat(((int64)1 << (n-1)));
+    v128_t a1 = wasm_u64x2_shr(wasm_i64x2_add(a.val, delta), n);
+    v128_t r = wasm_v8x16_shuffle(a1, a1, 0,1,2,3,8,9,10,11,0,1,2,3,8,9,10,11);
+    unsigned t_ptr[4];
+    wasm_v128_store(t_ptr, r);
+    for (int i=0; i<2; ++i) {
+        ptr[i] = t_ptr[i];
+    }
+}
+template<int n>
+inline void v_rshr_pack_store(int* ptr, const v_int64x2& a)
+{
+    v128_t delta = wasm_i64x2_splat(((int64)1 << (n-1)));
+    v128_t a1 = wasm_i64x2_shr(wasm_i64x2_add(a.val, delta), n);
+    v128_t r = wasm_v8x16_shuffle(a1, a1, 0,1,2,3,8,9,10,11,0,1,2,3,8,9,10,11);
+    int t_ptr[4];
+    wasm_v128_store(t_ptr, r);
+    for (int i=0; i<2; ++i) {
+        ptr[i] = t_ptr[i];
+    }
+}
+template<int n>
+inline void v_rshr_pack_u_store(uchar* ptr, const v_int16x8& a)
+{
+    v128_t delta = wasm_i16x8_splat(((short)1 << (n-1)));
+    v128_t a1 = wasm_i16x8_shr(wasm_i16x8_add(a.val, delta), n);
+    v128_t maxval = wasm_i16x8_splat(255);
+    v128_t minval = wasm_i16x8_splat(0);
+    v128_t a2 = wasm_v128_bitselect(maxval, a1, wasm_i16x8_gt(a1, maxval));
+    v128_t a3 = wasm_v128_bitselect(minval, a2, wasm_i16x8_lt(a1, minval));
+    v128_t r = wasm_v8x16_shuffle(a3, a3, 0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14);
+    uchar t_ptr[16];
+    wasm_v128_store(t_ptr, r);
+    for (int i=0; i<8; ++i) {
+        ptr[i] = t_ptr[i];
+    }
+}
+template<int n>
+inline void v_rshr_pack_u_store(ushort* ptr, const v_int32x4& a)
+{
+    v128_t delta = wasm_i32x4_splat(((int)1 << (n-1)));
+    v128_t a1 = wasm_i32x4_shr(wasm_i32x4_add(a.val, delta), n);
+    v128_t maxval = wasm_i32x4_splat(65535);
+    v128_t minval = wasm_i32x4_splat(0);
+    v128_t a2 = wasm_v128_bitselect(maxval, a1, wasm_i32x4_gt(a1, maxval));
+    v128_t a3 = wasm_v128_bitselect(minval, a2, wasm_i32x4_lt(a1, minval));
+    v128_t r = wasm_v8x16_shuffle(a3, a3, 0,1,4,5,8,9,12,13,0,1,4,5,8,9,12,13);
+    ushort t_ptr[8];
+    wasm_v128_store(t_ptr, r);
+    for (int i=0; i<4; ++i) {
+        ptr[i] = t_ptr[i];
+    }
+}
+
+inline v_uint8x16 v_pack_b(const v_uint16x8& a, const v_uint16x8& b)
+{
+    v128_t maxval = wasm_i16x8_splat(255);
+    v128_t a1 = wasm_v128_bitselect(maxval, a.val, wasm_u16x8_gt(a.val, maxval));
+    v128_t b1 = wasm_v128_bitselect(maxval, b.val, wasm_u16x8_gt(b.val, maxval));
+    return v_uint8x16(wasm_v8x16_shuffle(a1, b1, 0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30));
+}
+
+inline v_uint8x16 v_pack_b(const v_uint32x4& a, const v_uint32x4& b,
+                           const v_uint32x4& c, const v_uint32x4& d)
+{
+    v128_t maxval = wasm_i32x4_splat(255);
+    v128_t a1 = wasm_v128_bitselect(maxval, a.val, wasm_u32x4_gt(a.val, maxval));
+    v128_t b1 = wasm_v128_bitselect(maxval, b.val, wasm_u32x4_gt(b.val, maxval));
+    v128_t c1 = wasm_v128_bitselect(maxval, c.val, wasm_u32x4_gt(c.val, maxval));
+    v128_t d1 = wasm_v128_bitselect(maxval, d.val, wasm_u32x4_gt(d.val, maxval));
+    v128_t ab = wasm_v8x16_shuffle(a1, b1, 0,4,8,12,16,20,24,28,0,4,8,12,16,20,24,28);
+    v128_t cd = wasm_v8x16_shuffle(c1, d1, 0,4,8,12,16,20,24,28,0,4,8,12,16,20,24,28);
+    return v_uint8x16(wasm_v8x16_shuffle(ab, cd, 0,1,2,3,4,5,6,7,16,17,18,19,20,21,22,23));
+}
+
+inline v_uint8x16 v_pack_b(const v_uint64x2& a, const v_uint64x2& b, const v_uint64x2& c,
+                           const v_uint64x2& d, const v_uint64x2& e, const v_uint64x2& f,
+                           const v_uint64x2& g, const v_uint64x2& h)
+{ + v128_t maxval = wasm_i32x4_splat(255); + v128_t a1 = wasm_v128_bitselect(maxval, a.val, ((__u64x2)(a.val) > (__u64x2)maxval)); + v128_t b1 = wasm_v128_bitselect(maxval, b.val, ((__u64x2)(b.val) > (__u64x2)maxval)); + v128_t c1 = wasm_v128_bitselect(maxval, c.val, ((__u64x2)(c.val) > (__u64x2)maxval)); + v128_t d1 = wasm_v128_bitselect(maxval, d.val, ((__u64x2)(d.val) > (__u64x2)maxval)); + v128_t e1 = wasm_v128_bitselect(maxval, e.val, ((__u64x2)(e.val) > (__u64x2)maxval)); + v128_t f1 = wasm_v128_bitselect(maxval, f.val, ((__u64x2)(f.val) > (__u64x2)maxval)); + v128_t g1 = wasm_v128_bitselect(maxval, g.val, ((__u64x2)(g.val) > (__u64x2)maxval)); + v128_t h1 = wasm_v128_bitselect(maxval, h.val, ((__u64x2)(h.val) > (__u64x2)maxval)); + v128_t ab = wasm_v8x16_shuffle(a1, b1, 0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24); + v128_t cd = wasm_v8x16_shuffle(c1, d1, 0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24); + v128_t ef = wasm_v8x16_shuffle(e1, f1, 0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24); + v128_t gh = wasm_v8x16_shuffle(g1, h1, 0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24); + v128_t abcd = wasm_v8x16_shuffle(ab, cd, 0,1,2,3,16,17,18,19,0,1,2,3,16,17,18,19); + v128_t efgh = wasm_v8x16_shuffle(ef, gh, 0,1,2,3,16,17,18,19,0,1,2,3,16,17,18,19); + return v_uint8x16(wasm_v8x16_shuffle(abcd, efgh, 0,1,2,3,4,5,6,7,16,17,18,19,20,21,22,23)); +} + +inline v_float32x4 v_matmul(const v_float32x4& v, const v_float32x4& m0, + const v_float32x4& m1, const v_float32x4& m2, + const v_float32x4& m3) +{ + v128_t v0 = wasm_f32x4_splat(wasm_f32x4_extract_lane(v.val, 0)); + v128_t v1 = wasm_f32x4_splat(wasm_f32x4_extract_lane(v.val, 1)); + v128_t v2 = wasm_f32x4_splat(wasm_f32x4_extract_lane(v.val, 2)); + v128_t v3 = wasm_f32x4_splat(wasm_f32x4_extract_lane(v.val, 3)); + v0 = wasm_f32x4_mul(v0, m0.val); + v1 = wasm_f32x4_mul(v1, m1.val); + v2 = wasm_f32x4_mul(v2, m2.val); + v3 = wasm_f32x4_mul(v3, m3.val); + + return v_float32x4(wasm_f32x4_add(wasm_f32x4_add(v0, v1), wasm_f32x4_add(v2, v3))); +} + +inline v_float32x4 v_matmuladd(const v_float32x4& v, const v_float32x4& m0, + const v_float32x4& m1, const v_float32x4& m2, + const v_float32x4& a) +{ + v128_t v0 = wasm_f32x4_splat(wasm_f32x4_extract_lane(v.val, 0)); + v128_t v1 = wasm_f32x4_splat(wasm_f32x4_extract_lane(v.val, 1)); + v128_t v2 = wasm_f32x4_splat(wasm_f32x4_extract_lane(v.val, 2)); + v0 = wasm_f32x4_mul(v0, m0.val); + v1 = wasm_f32x4_mul(v1, m1.val); + v2 = wasm_f32x4_mul(v2, m2.val); + + return v_float32x4(wasm_f32x4_add(wasm_f32x4_add(v0, v1), wasm_f32x4_add(v2, a.val))); +} + +#define OPENCV_HAL_IMPL_WASM_BIN_OP(bin_op, _Tpvec, intrin) \ +inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \ +{ \ + return _Tpvec(intrin(a.val, b.val)); \ +} \ +inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b) \ +{ \ + a.val = intrin(a.val, b.val); \ + return a; \ +} + +OPENCV_HAL_IMPL_WASM_BIN_OP(+, v_uint8x16, wasm_u8x16_add_saturate) +OPENCV_HAL_IMPL_WASM_BIN_OP(-, v_uint8x16, wasm_u8x16_sub_saturate) +OPENCV_HAL_IMPL_WASM_BIN_OP(+, v_int8x16, wasm_i8x16_add_saturate) +OPENCV_HAL_IMPL_WASM_BIN_OP(-, v_int8x16, wasm_i8x16_sub_saturate) +OPENCV_HAL_IMPL_WASM_BIN_OP(+, v_uint16x8, wasm_u16x8_add_saturate) +OPENCV_HAL_IMPL_WASM_BIN_OP(-, v_uint16x8, wasm_u16x8_sub_saturate) +OPENCV_HAL_IMPL_WASM_BIN_OP(+, v_int16x8, wasm_i16x8_add_saturate) +OPENCV_HAL_IMPL_WASM_BIN_OP(-, v_int16x8, wasm_i16x8_sub_saturate) +OPENCV_HAL_IMPL_WASM_BIN_OP(+, v_uint32x4, wasm_i32x4_add) +OPENCV_HAL_IMPL_WASM_BIN_OP(-, v_uint32x4, wasm_i32x4_sub) 
+OPENCV_HAL_IMPL_WASM_BIN_OP(*, v_uint32x4, wasm_i32x4_mul) +OPENCV_HAL_IMPL_WASM_BIN_OP(+, v_int32x4, wasm_i32x4_add) +OPENCV_HAL_IMPL_WASM_BIN_OP(-, v_int32x4, wasm_i32x4_sub) +OPENCV_HAL_IMPL_WASM_BIN_OP(*, v_int32x4, wasm_i32x4_mul) +OPENCV_HAL_IMPL_WASM_BIN_OP(+, v_float32x4, wasm_f32x4_add) +OPENCV_HAL_IMPL_WASM_BIN_OP(-, v_float32x4, wasm_f32x4_sub) +OPENCV_HAL_IMPL_WASM_BIN_OP(*, v_float32x4, wasm_f32x4_mul) +OPENCV_HAL_IMPL_WASM_BIN_OP(/, v_float32x4, wasm_f32x4_div) +OPENCV_HAL_IMPL_WASM_BIN_OP(+, v_uint64x2, wasm_i64x2_add) +OPENCV_HAL_IMPL_WASM_BIN_OP(-, v_uint64x2, wasm_i64x2_sub) +OPENCV_HAL_IMPL_WASM_BIN_OP(+, v_int64x2, wasm_i64x2_add) +OPENCV_HAL_IMPL_WASM_BIN_OP(-, v_int64x2, wasm_i64x2_sub) +OPENCV_HAL_IMPL_WASM_BIN_OP(+, v_float64x2, wasm_f64x2_add) +OPENCV_HAL_IMPL_WASM_BIN_OP(-, v_float64x2, wasm_f64x2_sub) +OPENCV_HAL_IMPL_WASM_BIN_OP(*, v_float64x2, wasm_f64x2_mul) +OPENCV_HAL_IMPL_WASM_BIN_OP(/, v_float64x2, wasm_f64x2_div) + +// saturating multiply 8-bit, 16-bit +#define OPENCV_HAL_IMPL_WASM_MUL_SAT(_Tpvec, _Tpwvec) \ +inline _Tpvec operator * (const _Tpvec& a, const _Tpvec& b) \ +{ \ + _Tpwvec c, d; \ + v_mul_expand(a, b, c, d); \ + return v_pack(c, d); \ +} \ +inline _Tpvec& operator *= (_Tpvec& a, const _Tpvec& b) \ +{ a = a * b; return a; } + +OPENCV_HAL_IMPL_WASM_MUL_SAT(v_uint8x16, v_uint16x8) +OPENCV_HAL_IMPL_WASM_MUL_SAT(v_int8x16, v_int16x8) +OPENCV_HAL_IMPL_WASM_MUL_SAT(v_uint16x8, v_uint32x4) +OPENCV_HAL_IMPL_WASM_MUL_SAT(v_int16x8, v_int32x4) + +// Multiply and expand +inline void v_mul_expand(const v_uint8x16& a, const v_uint8x16& b, + v_uint16x8& c, v_uint16x8& d) +{ + v_uint16x8 a0, a1, b0, b1; + v_expand(a, a0, a1); + v_expand(b, b0, b1); + c = v_mul_wrap(a0, b0); + d = v_mul_wrap(a1, b1); +} + +inline void v_mul_expand(const v_int8x16& a, const v_int8x16& b, + v_int16x8& c, v_int16x8& d) +{ + v_int16x8 a0, a1, b0, b1; + v_expand(a, a0, a1); + v_expand(b, b0, b1); + c = v_mul_wrap(a0, b0); + d = v_mul_wrap(a1, b1); +} + +inline void v_mul_expand(const v_int16x8& a, const v_int16x8& b, + v_int32x4& c, v_int32x4& d) +{ + v_int32x4 a0, a1, b0, b1; + v_expand(a, a0, a1); + v_expand(b, b0, b1); + c.val = wasm_i32x4_mul(a0.val, b0.val); + d.val = wasm_i32x4_mul(a1.val, b1.val); +} + +inline void v_mul_expand(const v_uint16x8& a, const v_uint16x8& b, + v_uint32x4& c, v_uint32x4& d) +{ + v_uint32x4 a0, a1, b0, b1; + v_expand(a, a0, a1); + v_expand(b, b0, b1); + c.val = wasm_i32x4_mul(a0.val, b0.val); + d.val = wasm_i32x4_mul(a1.val, b1.val); +} + +inline void v_mul_expand(const v_uint32x4& a, const v_uint32x4& b, + v_uint64x2& c, v_uint64x2& d) +{ + v_uint64x2 a0, a1, b0, b1; + v_expand(a, a0, a1); + v_expand(b, b0, b1); + c.val = ((__u64x2)(a0.val) * (__u64x2)(b0.val)); + d.val = ((__u64x2)(a1.val) * (__u64x2)(b1.val)); +} + +inline v_int16x8 v_mul_hi(const v_int16x8& a, const v_int16x8& b) +{ + v_int32x4 a0, a1, b0, b1; + v_expand(a, a0, a1); + v_expand(b, b0, b1); + v128_t c = wasm_i32x4_mul(a0.val, b0.val); + v128_t d = wasm_i32x4_mul(a1.val, b1.val); + return v_int16x8(wasm_v8x16_shuffle(c, d, 2,3,6,7,10,11,14,15,18,19,22,23,26,27,30,31)); +} +inline v_uint16x8 v_mul_hi(const v_uint16x8& a, const v_uint16x8& b) +{ + v_uint32x4 a0, a1, b0, b1; + v_expand(a, a0, a1); + v_expand(b, b0, b1); + v128_t c = wasm_i32x4_mul(a0.val, b0.val); + v128_t d = wasm_i32x4_mul(a1.val, b1.val); + return v_uint16x8(wasm_v8x16_shuffle(c, d, 2,3,6,7,10,11,14,15,18,19,22,23,26,27,30,31)); +} + +//////// Dot Product //////// + +inline v_int32x4 v_dotprod(const v_int16x8& a, 
const v_int16x8& b) +{ + v128_t a0 = wasm_i32x4_shr(wasm_i32x4_shl(a.val, 16), 16); + v128_t a1 = wasm_i32x4_shr(a.val, 16); + v128_t b0 = wasm_i32x4_shr(wasm_i32x4_shl(b.val, 16), 16); + v128_t b1 = wasm_i32x4_shr(b.val, 16); + v128_t c = wasm_i32x4_mul(a0, b0); + v128_t d = wasm_i32x4_mul(a1, b1); + return v_int32x4(wasm_i32x4_add(c, d)); +} + +inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c) +{ return v_dotprod(a, b) + c; } + +inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b) +{ + v128_t a0 = wasm_i64x2_shr(wasm_i64x2_shl(a.val, 32), 32); + v128_t a1 = wasm_i64x2_shr(a.val, 32); + v128_t b0 = wasm_i64x2_shr(wasm_i64x2_shl(b.val, 32), 32); + v128_t b1 = wasm_i64x2_shr(b.val, 32); + v128_t c = (v128_t)((__i64x2)a0 * (__i64x2)b0); + v128_t d = (v128_t)((__i64x2)a1 * (__i64x2)b1); + return v_int64x2(wasm_i64x2_add(c, d)); +} +inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b, const v_int64x2& c) +{ + return v_dotprod(a, b) + c; +} + +// 8 >> 32 +inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b) +{ + v128_t a0 = wasm_u16x8_shr(wasm_i16x8_shl(a.val, 8), 8); + v128_t a1 = wasm_u16x8_shr(a.val, 8); + v128_t b0 = wasm_u16x8_shr(wasm_i16x8_shl(b.val, 8), 8); + v128_t b1 = wasm_u16x8_shr(b.val, 8); + return v_uint32x4(( + v_dotprod(v_int16x8(a0), v_int16x8(b0)) + + v_dotprod(v_int16x8(a1), v_int16x8(b1))).val + ); +} +inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b, const v_uint32x4& c) +{ return v_dotprod_expand(a, b) + c; } + +inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b) +{ + v128_t a0 = wasm_i16x8_shr(wasm_i16x8_shl(a.val, 8), 8); + v128_t a1 = wasm_i16x8_shr(a.val, 8); + v128_t b0 = wasm_i16x8_shr(wasm_i16x8_shl(b.val, 8), 8); + v128_t b1 = wasm_i16x8_shr(b.val, 8); + return v_int32x4( + v_dotprod(v_int16x8(a0), v_int16x8(b0)) + + v_dotprod(v_int16x8(a1), v_int16x8(b1)) + ); +} +inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b, const v_int32x4& c) +{ return v_dotprod_expand(a, b) + c; } + +// 16 >> 64 +inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b) +{ + v128_t a0 = wasm_u32x4_shr(wasm_i32x4_shl(a.val, 16), 16); + v128_t a1 = wasm_u32x4_shr(a.val, 16); + v128_t b0 = wasm_u32x4_shr(wasm_i32x4_shl(b.val, 16), 16); + v128_t b1 = wasm_u32x4_shr(b.val, 16); + return v_uint64x2(( + v_dotprod(v_int32x4(a0), v_int32x4(b0)) + + v_dotprod(v_int32x4(a1), v_int32x4(b1))).val + ); +} +inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b, const v_uint64x2& c) +{ return v_dotprod_expand(a, b) + c; } + +inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b) +{ + v128_t a0 = wasm_i32x4_shr(wasm_i32x4_shl(a.val, 16), 16); + v128_t a1 = wasm_i32x4_shr(a.val, 16); + v128_t b0 = wasm_i32x4_shr(wasm_i32x4_shl(b.val, 16), 16); + v128_t b1 = wasm_i32x4_shr(b.val, 16); + return v_int64x2(( + v_dotprod(v_int32x4(a0), v_int32x4(b0)) + + v_dotprod(v_int32x4(a1), v_int32x4(b1))) + ); +} + +inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b, const v_int64x2& c) +{ return v_dotprod_expand(a, b) + c; } + +// 32 >> 64f +inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b) +{ return v_cvt_f64(v_dotprod(a, b)); } +inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b, const v_float64x2& c) +{ return v_dotprod_expand(a, b) + c; } + +//////// Fast Dot Product //////// + +// 16 >> 32 +inline v_int32x4 
v_dotprod_fast(const v_int16x8& a, const v_int16x8& b) +{ return v_dotprod(a, b); } +inline v_int32x4 v_dotprod_fast(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c) +{ return v_dotprod(a, b, c); } + +// 32 >> 64 +inline v_int64x2 v_dotprod_fast(const v_int32x4& a, const v_int32x4& b) +{ return v_dotprod(a, b); } +inline v_int64x2 v_dotprod_fast(const v_int32x4& a, const v_int32x4& b, const v_int64x2& c) +{ return v_dotprod(a, b, c); } + +// 8 >> 32 +inline v_uint32x4 v_dotprod_expand_fast(const v_uint8x16& a, const v_uint8x16& b) +{ return v_dotprod_expand(a, b); } +inline v_uint32x4 v_dotprod_expand_fast(const v_uint8x16& a, const v_uint8x16& b, const v_uint32x4& c) +{ return v_dotprod_expand(a, b, c); } +inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b) +{ return v_dotprod_expand(a, b); } +inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b, const v_int32x4& c) +{ return v_dotprod_expand(a, b, c); } + +// 16 >> 64 +inline v_uint64x2 v_dotprod_expand_fast(const v_uint16x8& a, const v_uint16x8& b) +{ return v_dotprod_expand(a, b); } +inline v_uint64x2 v_dotprod_expand_fast(const v_uint16x8& a, const v_uint16x8& b, const v_uint64x2& c) +{ return v_dotprod_expand(a, b, c); } +inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b) +{ return v_dotprod_expand(a, b); } +inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b, const v_int64x2& c) +{ return v_dotprod_expand(a, b, c); } + +// 32 >> 64f +inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b) +{ return v_dotprod_expand(a, b); } +inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b, const v_float64x2& c) +{ return v_dotprod_expand(a, b, c); } + +#define OPENCV_HAL_IMPL_WASM_LOGIC_OP(_Tpvec) \ +OPENCV_HAL_IMPL_WASM_BIN_OP(&, _Tpvec, wasm_v128_and) \ +OPENCV_HAL_IMPL_WASM_BIN_OP(|, _Tpvec, wasm_v128_or) \ +OPENCV_HAL_IMPL_WASM_BIN_OP(^, _Tpvec, wasm_v128_xor) \ +inline _Tpvec operator ~ (const _Tpvec& a) \ +{ \ + return _Tpvec(wasm_v128_not(a.val)); \ +} + +OPENCV_HAL_IMPL_WASM_LOGIC_OP(v_uint8x16) +OPENCV_HAL_IMPL_WASM_LOGIC_OP(v_int8x16) +OPENCV_HAL_IMPL_WASM_LOGIC_OP(v_uint16x8) +OPENCV_HAL_IMPL_WASM_LOGIC_OP(v_int16x8) +OPENCV_HAL_IMPL_WASM_LOGIC_OP(v_uint32x4) +OPENCV_HAL_IMPL_WASM_LOGIC_OP(v_int32x4) +OPENCV_HAL_IMPL_WASM_LOGIC_OP(v_uint64x2) +OPENCV_HAL_IMPL_WASM_LOGIC_OP(v_int64x2) +OPENCV_HAL_IMPL_WASM_LOGIC_OP(v_float32x4) +OPENCV_HAL_IMPL_WASM_LOGIC_OP(v_float64x2) + +inline v_float32x4 v_sqrt(const v_float32x4& x) +{ + return v_float32x4(wasm_f32x4_sqrt(x.val)); +} + +inline v_float32x4 v_invsqrt(const v_float32x4& x) +{ + const v128_t _1_0 = wasm_f32x4_splat(1.0); + return v_float32x4(wasm_f32x4_div(_1_0, wasm_f32x4_sqrt(x.val))); +} + +inline v_float64x2 v_sqrt(const v_float64x2& x) +{ + return v_float64x2(wasm_f64x2_sqrt(x.val)); +} + +inline v_float64x2 v_invsqrt(const v_float64x2& x) +{ + const v128_t _1_0 = wasm_f64x2_splat(1.0); + return v_float64x2(wasm_f64x2_div(_1_0, wasm_f64x2_sqrt(x.val))); +} + +#define OPENCV_HAL_IMPL_WASM_ABS_INT_FUNC(_Tpuvec, _Tpsvec, suffix, zsuffix, shiftWidth) \ +inline _Tpuvec v_abs(const _Tpsvec& x) \ +{ \ + v128_t s = wasm_##suffix##_shr(x.val, shiftWidth); \ + v128_t f = wasm_##zsuffix##_shr(x.val, shiftWidth); \ + return _Tpuvec(wasm_##zsuffix##_add(wasm_v128_xor(x.val, f), s)); \ +} + +OPENCV_HAL_IMPL_WASM_ABS_INT_FUNC(v_uint8x16, v_int8x16, u8x16, i8x16, 7) +OPENCV_HAL_IMPL_WASM_ABS_INT_FUNC(v_uint16x8, v_int16x8, 
u16x8, i16x8, 15) +OPENCV_HAL_IMPL_WASM_ABS_INT_FUNC(v_uint32x4, v_int32x4, u32x4, i32x4, 31) + +inline v_float32x4 v_abs(const v_float32x4& x) +{ return v_float32x4(wasm_f32x4_abs(x.val)); } +inline v_float64x2 v_abs(const v_float64x2& x) +{ + return v_float64x2(wasm_f64x2_abs(x.val)); +} + +// TODO: exp, log, sin, cos + +#define OPENCV_HAL_IMPL_WASM_BIN_FUNC(_Tpvec, func, intrin) \ +inline _Tpvec func(const _Tpvec& a, const _Tpvec& b) \ +{ \ + return _Tpvec(intrin(a.val, b.val)); \ +} + +OPENCV_HAL_IMPL_WASM_BIN_FUNC(v_float32x4, v_min, wasm_f32x4_min) +OPENCV_HAL_IMPL_WASM_BIN_FUNC(v_float32x4, v_max, wasm_f32x4_max) +OPENCV_HAL_IMPL_WASM_BIN_FUNC(v_float64x2, v_min, wasm_f64x2_min) +OPENCV_HAL_IMPL_WASM_BIN_FUNC(v_float64x2, v_max, wasm_f64x2_max) + +#define OPENCV_HAL_IMPL_WASM_MINMAX_S_INIT_FUNC(_Tpvec, suffix) \ +inline _Tpvec v_min(const _Tpvec& a, const _Tpvec& b) \ +{ \ + return _Tpvec(wasm_v128_bitselect(b.val, a.val, wasm_##suffix##_gt(a.val, b.val))); \ +} \ +inline _Tpvec v_max(const _Tpvec& a, const _Tpvec& b) \ +{ \ + return _Tpvec(wasm_v128_bitselect(a.val, b.val, wasm_##suffix##_gt(a.val, b.val))); \ +} + +OPENCV_HAL_IMPL_WASM_MINMAX_S_INIT_FUNC(v_int8x16, i8x16) +OPENCV_HAL_IMPL_WASM_MINMAX_S_INIT_FUNC(v_int16x8, i16x8) +OPENCV_HAL_IMPL_WASM_MINMAX_S_INIT_FUNC(v_int32x4, i32x4) + +#define OPENCV_HAL_IMPL_WASM_MINMAX_U_INIT_FUNC(_Tpvec, suffix, deltaNum) \ +inline _Tpvec v_min(const _Tpvec& a, const _Tpvec& b) \ +{ \ + v128_t delta = wasm_##suffix##_splat(deltaNum); \ + v128_t mask = wasm_##suffix##_gt(wasm_v128_xor(a.val, delta), wasm_v128_xor(b.val, delta)); \ + return _Tpvec(wasm_v128_bitselect(b.val, a.val, mask)); \ +} \ +inline _Tpvec v_max(const _Tpvec& a, const _Tpvec& b) \ +{ \ + v128_t delta = wasm_##suffix##_splat(deltaNum); \ + v128_t mask = wasm_##suffix##_gt(wasm_v128_xor(a.val, delta), wasm_v128_xor(b.val, delta)); \ + return _Tpvec(wasm_v128_bitselect(a.val, b.val, mask)); \ +} + +OPENCV_HAL_IMPL_WASM_MINMAX_U_INIT_FUNC(v_uint8x16, i8x16, (schar)0x80) +OPENCV_HAL_IMPL_WASM_MINMAX_U_INIT_FUNC(v_uint16x8, i16x8, (short)0x8000) +OPENCV_HAL_IMPL_WASM_MINMAX_U_INIT_FUNC(v_uint32x4, i32x4, (int)0x80000000) + +#define OPENCV_HAL_IMPL_WASM_INIT_CMP_OP(_Tpvec, suffix, esuffix) \ +inline _Tpvec operator == (const _Tpvec& a, const _Tpvec& b) \ +{ return _Tpvec(wasm_##esuffix##_eq(a.val, b.val)); } \ +inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \ +{ return _Tpvec(wasm_##esuffix##_ne(a.val, b.val)); } \ +inline _Tpvec operator < (const _Tpvec& a, const _Tpvec& b) \ +{ return _Tpvec(wasm_##suffix##_lt(a.val, b.val)); } \ +inline _Tpvec operator > (const _Tpvec& a, const _Tpvec& b) \ +{ return _Tpvec(wasm_##suffix##_gt(a.val, b.val)); } \ +inline _Tpvec operator <= (const _Tpvec& a, const _Tpvec& b) \ +{ return _Tpvec(wasm_##suffix##_le(a.val, b.val)); } \ +inline _Tpvec operator >= (const _Tpvec& a, const _Tpvec& b) \ +{ return _Tpvec(wasm_##suffix##_ge(a.val, b.val)); } + +OPENCV_HAL_IMPL_WASM_INIT_CMP_OP(v_uint8x16, u8x16, i8x16) +OPENCV_HAL_IMPL_WASM_INIT_CMP_OP(v_int8x16, i8x16, i8x16) +OPENCV_HAL_IMPL_WASM_INIT_CMP_OP(v_uint16x8, u16x8, i16x8) +OPENCV_HAL_IMPL_WASM_INIT_CMP_OP(v_int16x8, i16x8, i16x8) +OPENCV_HAL_IMPL_WASM_INIT_CMP_OP(v_uint32x4, u32x4, i32x4) +OPENCV_HAL_IMPL_WASM_INIT_CMP_OP(v_int32x4, i32x4, i32x4) +OPENCV_HAL_IMPL_WASM_INIT_CMP_OP(v_float32x4, f32x4, f32x4) +OPENCV_HAL_IMPL_WASM_INIT_CMP_OP(v_float64x2, f64x2, f64x2) + +#define OPENCV_HAL_IMPL_WASM_64BIT_CMP_OP(_Tpvec, cast) \ +inline _Tpvec operator == (const _Tpvec& a, const 
_Tpvec& b) \ +{ return cast(v_reinterpret_as_f64(a) == v_reinterpret_as_f64(b)); } \ +inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \ +{ return cast(v_reinterpret_as_f64(a) != v_reinterpret_as_f64(b)); } + +OPENCV_HAL_IMPL_WASM_64BIT_CMP_OP(v_uint64x2, v_reinterpret_as_u64) +OPENCV_HAL_IMPL_WASM_64BIT_CMP_OP(v_int64x2, v_reinterpret_as_s64) + +inline v_float32x4 v_not_nan(const v_float32x4& a) +{ + v128_t z = wasm_i32x4_splat(0x7fffffff); + v128_t t = wasm_i32x4_splat(0x7f800000); + return v_float32x4(wasm_u32x4_lt(wasm_v128_and(a.val, z), t)); +} +inline v_float64x2 v_not_nan(const v_float64x2& a) +{ + v128_t z = wasm_i64x2_splat(0x7fffffffffffffff); + v128_t t = wasm_i64x2_splat(0x7ff0000000000000); + return v_float64x2((__u64x2)(wasm_v128_and(a.val, z)) < (__u64x2)t); +} + +OPENCV_HAL_IMPL_WASM_BIN_FUNC(v_uint8x16, v_add_wrap, wasm_i8x16_add) +OPENCV_HAL_IMPL_WASM_BIN_FUNC(v_int8x16, v_add_wrap, wasm_i8x16_add) +OPENCV_HAL_IMPL_WASM_BIN_FUNC(v_uint16x8, v_add_wrap, wasm_i16x8_add) +OPENCV_HAL_IMPL_WASM_BIN_FUNC(v_int16x8, v_add_wrap, wasm_i16x8_add) +OPENCV_HAL_IMPL_WASM_BIN_FUNC(v_uint8x16, v_sub_wrap, wasm_i8x16_sub) +OPENCV_HAL_IMPL_WASM_BIN_FUNC(v_int8x16, v_sub_wrap, wasm_i8x16_sub) +OPENCV_HAL_IMPL_WASM_BIN_FUNC(v_uint16x8, v_sub_wrap, wasm_i16x8_sub) +OPENCV_HAL_IMPL_WASM_BIN_FUNC(v_int16x8, v_sub_wrap, wasm_i16x8_sub) +#if (__EMSCRIPTEN_major__ * 1000000 + __EMSCRIPTEN_minor__ * 1000 + __EMSCRIPTEN_tiny__) >= (1039012) +// details: https://github.com/opencv/opencv/issues/18097 ( https://github.com/emscripten-core/emscripten/issues/12018 ) +// 1.39.12: https://github.com/emscripten-core/emscripten/commit/cd801d0f110facfd694212a3c8b2ed2ffcd630e2 +inline v_uint8x16 v_mul_wrap(const v_uint8x16& a, const v_uint8x16& b) +{ + uchar a_[16], b_[16]; + wasm_v128_store(a_, a.val); + wasm_v128_store(b_, b.val); + for (int i = 0; i < 16; i++) + a_[i] = (uchar)(a_[i] * b_[i]); + return v_uint8x16(wasm_v128_load(a_)); +} +inline v_int8x16 v_mul_wrap(const v_int8x16& a, const v_int8x16& b) +{ + schar a_[16], b_[16]; + wasm_v128_store(a_, a.val); + wasm_v128_store(b_, b.val); + for (int i = 0; i < 16; i++) + a_[i] = (schar)(a_[i] * b_[i]); + return v_int8x16(wasm_v128_load(a_)); +} +#else +OPENCV_HAL_IMPL_WASM_BIN_FUNC(v_uint8x16, v_mul_wrap, wasm_i8x16_mul) +OPENCV_HAL_IMPL_WASM_BIN_FUNC(v_int8x16, v_mul_wrap, wasm_i8x16_mul) +#endif +OPENCV_HAL_IMPL_WASM_BIN_FUNC(v_uint16x8, v_mul_wrap, wasm_i16x8_mul) +OPENCV_HAL_IMPL_WASM_BIN_FUNC(v_int16x8, v_mul_wrap, wasm_i16x8_mul) + + +/** Absolute difference **/ + +inline v_uint8x16 v_absdiff(const v_uint8x16& a, const v_uint8x16& b) +{ return v_add_wrap(a - b, b - a); } +inline v_uint16x8 v_absdiff(const v_uint16x8& a, const v_uint16x8& b) +{ return v_add_wrap(a - b, b - a); } +inline v_uint32x4 v_absdiff(const v_uint32x4& a, const v_uint32x4& b) +{ return v_max(a, b) - v_min(a, b); } + +inline v_uint8x16 v_absdiff(const v_int8x16& a, const v_int8x16& b) +{ + v_int8x16 d = v_sub_wrap(a, b); + v_int8x16 m = a < b; + return v_reinterpret_as_u8(v_sub_wrap(d ^ m, m)); +} +inline v_uint16x8 v_absdiff(const v_int16x8& a, const v_int16x8& b) +{ + return v_reinterpret_as_u16(v_sub_wrap(v_max(a, b), v_min(a, b))); +} +inline v_uint32x4 v_absdiff(const v_int32x4& a, const v_int32x4& b) +{ + v_int32x4 d = a - b; + v_int32x4 m = a < b; + return v_reinterpret_as_u32((d ^ m) - m); +} + +/** Saturating absolute difference **/ +inline v_int8x16 v_absdiffs(const v_int8x16& a, const v_int8x16& b) +{ + v_int8x16 d = a - b; + v_int8x16 m = a < b; + return 
(d ^ m) - m;
+}
+inline v_int16x8 v_absdiffs(const v_int16x8& a, const v_int16x8& b)
+{ return v_max(a, b) - v_min(a, b); }
+
+
+inline v_int32x4 v_fma(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c)
+{
+    return a * b + c;
+}
+
+inline v_int32x4 v_muladd(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c)
+{
+    return v_fma(a, b, c);
+}
+
+inline v_float32x4 v_fma(const v_float32x4& a, const v_float32x4& b, const v_float32x4& c)
+{
+    return a * b + c;
+}
+
+inline v_float64x2 v_fma(const v_float64x2& a, const v_float64x2& b, const v_float64x2& c)
+{
+    return a * b + c;
+}
+
+inline v_float32x4 v_absdiff(const v_float32x4& a, const v_float32x4& b)
+{
+    v128_t absmask_vec = wasm_i32x4_splat(0x7fffffff);
+    return v_float32x4(wasm_v128_and(wasm_f32x4_sub(a.val, b.val), absmask_vec));
+}
+inline v_float64x2 v_absdiff(const v_float64x2& a, const v_float64x2& b)
+{
+    v128_t absmask_vec = wasm_u64x2_shr(wasm_i32x4_splat(-1), 1);
+    return v_float64x2(wasm_v128_and(wasm_f64x2_sub(a.val, b.val), absmask_vec));
+}
+
+#define OPENCV_HAL_IMPL_WASM_MISC_FLT_OP(_Tpvec, suffix) \
+inline _Tpvec v_magnitude(const _Tpvec& a, const _Tpvec& b) \
+{ \
+    v128_t a_Square = wasm_##suffix##_mul(a.val, a.val); \
+    v128_t b_Square = wasm_##suffix##_mul(b.val, b.val); \
+    return _Tpvec(wasm_##suffix##_sqrt(wasm_##suffix##_add(a_Square, b_Square))); \
+} \
+inline _Tpvec v_sqr_magnitude(const _Tpvec& a, const _Tpvec& b) \
+{ \
+    v128_t a_Square = wasm_##suffix##_mul(a.val, a.val); \
+    v128_t b_Square = wasm_##suffix##_mul(b.val, b.val); \
+    return _Tpvec(wasm_##suffix##_add(a_Square, b_Square)); \
+} \
+inline _Tpvec v_muladd(const _Tpvec& a, const _Tpvec& b, const _Tpvec& c) \
+{ \
+    return _Tpvec(wasm_##suffix##_add(wasm_##suffix##_mul(a.val, b.val), c.val)); \
+}
+
+OPENCV_HAL_IMPL_WASM_MISC_FLT_OP(v_float32x4, f32x4)
+OPENCV_HAL_IMPL_WASM_MISC_FLT_OP(v_float64x2, f64x2)
+
+#define OPENCV_HAL_IMPL_WASM_SHIFT_OP(_Tpuvec, _Tpsvec, suffix, ssuffix) \
+inline _Tpuvec operator << (const _Tpuvec& a, int imm) \
+{ \
+    return _Tpuvec(wasm_##suffix##_shl(a.val, imm)); \
+} \
+inline _Tpsvec operator << (const _Tpsvec& a, int imm) \
+{ \
+    return _Tpsvec(wasm_##suffix##_shl(a.val, imm)); \
+} \
+inline _Tpuvec operator >> (const _Tpuvec& a, int imm) \
+{ \
+    return _Tpuvec(wasm_##ssuffix##_shr(a.val, imm)); \
+} \
+inline _Tpsvec operator >> (const _Tpsvec& a, int imm) \
+{ \
+    return _Tpsvec(wasm_##suffix##_shr(a.val, imm)); \
+} \
+template<int imm> \
+inline _Tpuvec v_shl(const _Tpuvec& a) \
+{ \
+    return _Tpuvec(wasm_##suffix##_shl(a.val, imm)); \
+} \
+template<int imm> \
+inline _Tpsvec v_shl(const _Tpsvec& a) \
+{ \
+    return _Tpsvec(wasm_##suffix##_shl(a.val, imm)); \
+} \
+template<int imm> \
+inline _Tpuvec v_shr(const _Tpuvec& a) \
+{ \
+    return _Tpuvec(wasm_##ssuffix##_shr(a.val, imm)); \
+} \
+template<int imm> \
+inline _Tpsvec v_shr(const _Tpsvec& a) \
+{ \
+    return _Tpsvec(wasm_##suffix##_shr(a.val, imm)); \
+}
+
+OPENCV_HAL_IMPL_WASM_SHIFT_OP(v_uint8x16, v_int8x16, i8x16, u8x16)
+OPENCV_HAL_IMPL_WASM_SHIFT_OP(v_uint16x8, v_int16x8, i16x8, u16x8)
+OPENCV_HAL_IMPL_WASM_SHIFT_OP(v_uint32x4, v_int32x4, i32x4, u32x4)
+OPENCV_HAL_IMPL_WASM_SHIFT_OP(v_uint64x2, v_int64x2, i64x2, u64x2)
+
+namespace hal_wasm_internal
+{
+    template <int imm,
+        bool is_invalid = ((imm < 0) || (imm > 16)),
+        bool is_first = (imm == 0),
+        bool is_second = (imm == 16),
+        bool is_other = (((imm > 0) && (imm < 16)))>
+    class v_wasm_palignr_u8_class;
+
+    template <int imm>
+    class v_wasm_palignr_u8_class<imm, true, false, false, false>;
+
+    template <int imm>
+    class v_wasm_palignr_u8_class<imm, false, true, false, false>
+    {
+    public:
+        inline v128_t operator()(const v128_t& a, const v128_t&) const
+        {
+            return a;
+        }
+    };
+
+    template <int imm>
+    class v_wasm_palignr_u8_class<imm, false, false, true, false>
+    {
+    public:
+        inline v128_t operator()(const v128_t&, const v128_t& b) const
+        {
+            return b;
+        }
+    };
+
+    template <int imm>
+    class v_wasm_palignr_u8_class<imm, false, false, false, true>
+    {
+    public:
+        inline v128_t operator()(const v128_t& a, const v128_t& b) const
+        {
+            enum { imm2 = (sizeof(v128_t) - imm) };
+            return wasm_v8x16_shuffle(a, b,
+                                      imm, imm+1, imm+2, imm+3,
+                                      imm+4, imm+5, imm+6, imm+7,
+                                      imm+8, imm+9, imm+10, imm+11,
+                                      imm+12, imm+13, imm+14, imm+15);
+        }
+    };
+
+    template <int imm>
+    inline v128_t v_wasm_palignr_u8(const v128_t& a, const v128_t& b)
+    {
+        CV_StaticAssert((imm >= 0) && (imm <= 16), "Invalid imm for v_wasm_palignr_u8.");
+        return v_wasm_palignr_u8_class<imm>()(a, b);
+    }
+}
+
+template<int imm, typename _Tpvec>
+inline _Tpvec v_rotate_right(const _Tpvec &a)
+{
+    using namespace hal_wasm_internal;
+    enum { imm2 = (imm * sizeof(typename _Tpvec::lane_type)) };
+    v128_t z = wasm_i8x16_splat(0);
+    return _Tpvec(v_wasm_palignr_u8<imm2>(a.val, z));
+}
+
+template<int imm, typename _Tpvec>
+inline _Tpvec v_rotate_left(const _Tpvec &a)
+{
+    using namespace hal_wasm_internal;
+    enum { imm2 = ((_Tpvec::nlanes - imm) * sizeof(typename _Tpvec::lane_type)) };
+    v128_t z = wasm_i8x16_splat(0);
+    return _Tpvec(v_wasm_palignr_u8<imm2>(z, a.val));
+}
+
+template<int imm, typename _Tpvec>
+inline _Tpvec v_rotate_right(const _Tpvec &a, const _Tpvec &b)
+{
+    using namespace hal_wasm_internal;
+    enum { imm2 = (imm * sizeof(typename _Tpvec::lane_type)) };
+    return _Tpvec(v_wasm_palignr_u8<imm2>(a.val, b.val));
+}
+
+template<int imm, typename _Tpvec>
+inline _Tpvec v_rotate_left(const _Tpvec &a, const _Tpvec &b)
+{
+    using namespace hal_wasm_internal;
+    enum { imm2 = ((_Tpvec::nlanes - imm) * sizeof(typename _Tpvec::lane_type)) };
+    return _Tpvec(v_wasm_palignr_u8<imm2>(b.val, a.val));
+}
+
+#define OPENCV_HAL_IMPL_WASM_LOADSTORE_INT_OP(_Tpvec, _Tp) \
+inline _Tpvec v_load(const _Tp* ptr) \
+{ return _Tpvec(wasm_v128_load(ptr)); } \
+inline _Tpvec v_load_aligned(const _Tp* ptr) \
+{ return _Tpvec(wasm_v128_load(ptr)); } \
+inline _Tpvec v_load_low(const _Tp* ptr) \
+{ \
+    _Tp tmp[_Tpvec::nlanes] = {0}; \
+    for (int i=0; i<_Tpvec::nlanes/2; ++i) { \
+        tmp[i] = ptr[i]; \
+    } \
+    return _Tpvec(wasm_v128_load(tmp)); \
+} \
+inline _Tpvec v_load_halves(const _Tp* ptr0, const _Tp* ptr1) \
+{ \
+    _Tp tmp[_Tpvec::nlanes]; \
+    for (int i=0; i<_Tpvec::nlanes/2; ++i) { \
+        tmp[i] = ptr0[i]; \
+        tmp[i+_Tpvec::nlanes/2] = ptr1[i]; \
+    } \
+    return _Tpvec(wasm_v128_load(tmp)); \
+} \
+inline void v_store(_Tp* ptr, const _Tpvec& a) \
+{ wasm_v128_store(ptr, a.val); } \
+inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \
+{ wasm_v128_store(ptr, a.val); } \
+inline void v_store_aligned_nocache(_Tp* ptr, const _Tpvec& a) \
+{ wasm_v128_store(ptr, a.val); } \
+inline void v_store(_Tp* ptr, const _Tpvec& a, hal::StoreMode /*mode*/) \
+{ \
+    wasm_v128_store(ptr, a.val); \
+} \
+inline void v_store_low(_Tp* ptr, const _Tpvec& a) \
+{ \
+    _Tpvec::lane_type a_[_Tpvec::nlanes]; \
+    wasm_v128_store(a_, a.val); \
+    for (int i = 0; i < (_Tpvec::nlanes / 2); i++) \
+        ptr[i] = a_[i]; \
+} \
+inline void v_store_high(_Tp* ptr, const _Tpvec& a) \
+{ \
+    _Tpvec::lane_type a_[_Tpvec::nlanes]; \
+    wasm_v128_store(a_, a.val); \
+    for (int i = 0; i < (_Tpvec::nlanes / 2); i++) \
+        ptr[i] = a_[i + (_Tpvec::nlanes / 2)]; \
+}
+
+OPENCV_HAL_IMPL_WASM_LOADSTORE_INT_OP(v_uint8x16, uchar)
+OPENCV_HAL_IMPL_WASM_LOADSTORE_INT_OP(v_int8x16, schar)
+OPENCV_HAL_IMPL_WASM_LOADSTORE_INT_OP(v_uint16x8, ushort)
+OPENCV_HAL_IMPL_WASM_LOADSTORE_INT_OP(v_int16x8, short)
+OPENCV_HAL_IMPL_WASM_LOADSTORE_INT_OP(v_uint32x4,
unsigned) +OPENCV_HAL_IMPL_WASM_LOADSTORE_INT_OP(v_int32x4, int) +OPENCV_HAL_IMPL_WASM_LOADSTORE_INT_OP(v_uint64x2, uint64) +OPENCV_HAL_IMPL_WASM_LOADSTORE_INT_OP(v_int64x2, int64) +OPENCV_HAL_IMPL_WASM_LOADSTORE_INT_OP(v_float32x4, float) +OPENCV_HAL_IMPL_WASM_LOADSTORE_INT_OP(v_float64x2, double) + + +/** Reverse **/ +inline v_uint8x16 v_reverse(const v_uint8x16 &a) +{ return v_uint8x16(wasm_v8x16_shuffle(a.val, a.val, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)); } + +inline v_int8x16 v_reverse(const v_int8x16 &a) +{ return v_reinterpret_as_s8(v_reverse(v_reinterpret_as_u8(a))); } + +inline v_uint16x8 v_reverse(const v_uint16x8 &a) +{ return v_uint16x8(wasm_v8x16_shuffle(a.val, a.val, 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1)); } + +inline v_int16x8 v_reverse(const v_int16x8 &a) +{ return v_reinterpret_as_s16(v_reverse(v_reinterpret_as_u16(a))); } + +inline v_uint32x4 v_reverse(const v_uint32x4 &a) +{ return v_uint32x4(wasm_v8x16_shuffle(a.val, a.val, 12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3)); } + +inline v_int32x4 v_reverse(const v_int32x4 &a) +{ return v_reinterpret_as_s32(v_reverse(v_reinterpret_as_u32(a))); } + +inline v_float32x4 v_reverse(const v_float32x4 &a) +{ return v_reinterpret_as_f32(v_reverse(v_reinterpret_as_u32(a))); } + +inline v_uint64x2 v_reverse(const v_uint64x2 &a) +{ return v_uint64x2(wasm_v8x16_shuffle(a.val, a.val, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7)); } + +inline v_int64x2 v_reverse(const v_int64x2 &a) +{ return v_reinterpret_as_s64(v_reverse(v_reinterpret_as_u64(a))); } + +inline v_float64x2 v_reverse(const v_float64x2 &a) +{ return v_reinterpret_as_f64(v_reverse(v_reinterpret_as_u64(a))); } + + +#define OPENCV_HAL_IMPL_WASM_REDUCE_OP_4_SUM(_Tpvec, scalartype, regtype, suffix, esuffix) \ +inline scalartype v_reduce_sum(const _Tpvec& a) \ +{ \ + regtype val = a.val; \ + val = wasm_##suffix##_add(val, wasm_v8x16_shuffle(val, val, 8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7)); \ + val = wasm_##suffix##_add(val, wasm_v8x16_shuffle(val, val, 4,5,6,7,8,9,10,11,12,13,14,15,0,1,2,3)); \ + return (scalartype)wasm_##esuffix##_extract_lane(val, 0); \ +} + +OPENCV_HAL_IMPL_WASM_REDUCE_OP_4_SUM(v_uint32x4, unsigned, v128_t, i32x4, i32x4) +OPENCV_HAL_IMPL_WASM_REDUCE_OP_4_SUM(v_int32x4, int, v128_t, i32x4, i32x4) +OPENCV_HAL_IMPL_WASM_REDUCE_OP_4_SUM(v_float32x4, float, v128_t, f32x4, f32x4) + +// To do: Optimize v_reduce_sum with wasm intrin. +// Now use fallback implementation as there is no widening op in wasm intrin. 
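+// Example (for illustration only; it uses just the universal intrinsics defined
+// in this header): the fallback reduction widens each lane into the scalar
+// accumulator while summing, so a full v_uint16x8 cannot overflow a 16-bit lane.
+//
+//   ushort buf[8] = { 1, 2, 3, 4, 5, 6, 7, 8 };
+//   v_uint16x8 v = v_load(buf);
+//   unsigned sum = v_reduce_sum(v);   // sum == 36, accumulated as 'unsigned'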
+ +#define OPENCV_HAL_IMPL_FALLBACK_REDUCE_OP_SUM(_Tpvec, scalartype) \ +inline scalartype v_reduce_sum(const _Tpvec& a) \ +{ \ + _Tpvec::lane_type a_[_Tpvec::nlanes]; \ + wasm_v128_store(a_, a.val); \ + scalartype c = a_[0]; \ + for (int i = 1; i < _Tpvec::nlanes; i++) \ + c += a_[i]; \ + return c; \ +} + +OPENCV_HAL_IMPL_FALLBACK_REDUCE_OP_SUM(v_uint8x16, unsigned) +OPENCV_HAL_IMPL_FALLBACK_REDUCE_OP_SUM(v_int8x16, int) +OPENCV_HAL_IMPL_FALLBACK_REDUCE_OP_SUM(v_uint16x8, unsigned) +OPENCV_HAL_IMPL_FALLBACK_REDUCE_OP_SUM(v_int16x8, int) + + +#define OPENCV_HAL_IMPL_WASM_REDUCE_OP_2_SUM(_Tpvec, scalartype, regtype, suffix, esuffix) \ +inline scalartype v_reduce_sum(const _Tpvec& a) \ +{ \ + regtype val = a.val; \ + val = wasm_##suffix##_add(val, wasm_v8x16_shuffle(val, val, 8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7)); \ + return (scalartype)wasm_##esuffix##_extract_lane(val, 0); \ +} +OPENCV_HAL_IMPL_WASM_REDUCE_OP_2_SUM(v_uint64x2, uint64, v128_t, i64x2, i64x2) +OPENCV_HAL_IMPL_WASM_REDUCE_OP_2_SUM(v_int64x2, int64, v128_t, i64x2, i64x2) +OPENCV_HAL_IMPL_WASM_REDUCE_OP_2_SUM(v_float64x2, double, v128_t, f64x2,f64x2) + +inline v_float32x4 v_reduce_sum4(const v_float32x4& a, const v_float32x4& b, + const v_float32x4& c, const v_float32x4& d) +{ + v128_t ac = wasm_f32x4_add(wasm_unpacklo_i32x4(a.val, c.val), wasm_unpackhi_i32x4(a.val, c.val)); + v128_t bd = wasm_f32x4_add(wasm_unpacklo_i32x4(b.val, d.val), wasm_unpackhi_i32x4(b.val, d.val)); + return v_float32x4(wasm_f32x4_add(wasm_unpacklo_i32x4(ac, bd), wasm_unpackhi_i32x4(ac, bd))); +} + +#define OPENCV_HAL_IMPL_WASM_REDUCE_OP(_Tpvec, scalartype, func, scalar_func) \ +inline scalartype v_reduce_##func(const _Tpvec& a) \ +{ \ + scalartype buf[_Tpvec::nlanes]; \ + v_store(buf, a); \ + scalartype tmp = buf[0]; \ + for (int i=1; i<_Tpvec::nlanes; ++i) { \ + tmp = scalar_func(tmp, buf[i]); \ + } \ + return tmp; \ +} + +OPENCV_HAL_IMPL_WASM_REDUCE_OP(v_uint8x16, uchar, max, std::max) +OPENCV_HAL_IMPL_WASM_REDUCE_OP(v_uint8x16, uchar, min, std::min) +OPENCV_HAL_IMPL_WASM_REDUCE_OP(v_int8x16, schar, max, std::max) +OPENCV_HAL_IMPL_WASM_REDUCE_OP(v_int8x16, schar, min, std::min) +OPENCV_HAL_IMPL_WASM_REDUCE_OP(v_uint16x8, ushort, max, std::max) +OPENCV_HAL_IMPL_WASM_REDUCE_OP(v_uint16x8, ushort, min, std::min) +OPENCV_HAL_IMPL_WASM_REDUCE_OP(v_int16x8, short, max, std::max) +OPENCV_HAL_IMPL_WASM_REDUCE_OP(v_int16x8, short, min, std::min) +OPENCV_HAL_IMPL_WASM_REDUCE_OP(v_uint32x4, unsigned, max, std::max) +OPENCV_HAL_IMPL_WASM_REDUCE_OP(v_uint32x4, unsigned, min, std::min) +OPENCV_HAL_IMPL_WASM_REDUCE_OP(v_int32x4, int, max, std::max) +OPENCV_HAL_IMPL_WASM_REDUCE_OP(v_int32x4, int, min, std::min) +OPENCV_HAL_IMPL_WASM_REDUCE_OP(v_float32x4, float, max, std::max) +OPENCV_HAL_IMPL_WASM_REDUCE_OP(v_float32x4, float, min, std::min) + +inline unsigned v_reduce_sad(const v_uint8x16& a, const v_uint8x16& b) +{ + v_uint16x8 l16, h16; + v_uint32x4 l16_l32, l16_h32, h16_l32, h16_h32; + v_expand(v_absdiff(a, b), l16, h16); + v_expand(l16, l16_l32, l16_h32); + v_expand(h16, h16_l32, h16_h32); + return v_reduce_sum(l16_l32+l16_h32+h16_l32+h16_h32); +} +inline unsigned v_reduce_sad(const v_int8x16& a, const v_int8x16& b) +{ + v_uint16x8 l16, h16; + v_uint32x4 l16_l32, l16_h32, h16_l32, h16_h32; + v_expand(v_absdiff(a, b), l16, h16); + v_expand(l16, l16_l32, l16_h32); + v_expand(h16, h16_l32, h16_h32); + return v_reduce_sum(l16_l32+l16_h32+h16_l32+h16_h32); +} +inline unsigned v_reduce_sad(const v_uint16x8& a, const v_uint16x8& b) +{ + v_uint32x4 l, h; + 
v_expand(v_absdiff(a, b), l, h); + return v_reduce_sum(l + h); +} +inline unsigned v_reduce_sad(const v_int16x8& a, const v_int16x8& b) +{ + v_uint32x4 l, h; + v_expand(v_absdiff(a, b), l, h); + return v_reduce_sum(l + h); +} +inline unsigned v_reduce_sad(const v_uint32x4& a, const v_uint32x4& b) +{ + return v_reduce_sum(v_absdiff(a, b)); +} +inline unsigned v_reduce_sad(const v_int32x4& a, const v_int32x4& b) +{ + return v_reduce_sum(v_absdiff(a, b)); +} +inline float v_reduce_sad(const v_float32x4& a, const v_float32x4& b) +{ + return v_reduce_sum(v_absdiff(a, b)); +} + +inline v_uint8x16 v_popcount(const v_uint8x16& a) +{ + v128_t m1 = wasm_i32x4_splat(0x55555555); + v128_t m2 = wasm_i32x4_splat(0x33333333); + v128_t m4 = wasm_i32x4_splat(0x0f0f0f0f); + v128_t p = a.val; + p = wasm_i32x4_add(wasm_v128_and(wasm_u32x4_shr(p, 1), m1), wasm_v128_and(p, m1)); + p = wasm_i32x4_add(wasm_v128_and(wasm_u32x4_shr(p, 2), m2), wasm_v128_and(p, m2)); + p = wasm_i32x4_add(wasm_v128_and(wasm_u32x4_shr(p, 4), m4), wasm_v128_and(p, m4)); + return v_uint8x16(p); +} +inline v_uint16x8 v_popcount(const v_uint16x8& a) +{ + v_uint8x16 p = v_popcount(v_reinterpret_as_u8(a)); + p += v_rotate_right<1>(p); + return v_reinterpret_as_u16(p) & v_setall_u16(0x00ff); +} +inline v_uint32x4 v_popcount(const v_uint32x4& a) +{ + v_uint8x16 p = v_popcount(v_reinterpret_as_u8(a)); + p += v_rotate_right<1>(p); + p += v_rotate_right<2>(p); + return v_reinterpret_as_u32(p) & v_setall_u32(0x000000ff); +} +inline v_uint64x2 v_popcount(const v_uint64x2& a) +{ + uint64 a_[2], b_[2] = { 0 }; + wasm_v128_store(a_, a.val); + for (int i = 0; i < 16; i++) + b_[i / 8] += popCountTable[((uint8_t*)a_)[i]]; + return v_uint64x2(wasm_v128_load(b_)); +} +inline v_uint8x16 v_popcount(const v_int8x16& a) +{ return v_popcount(v_reinterpret_as_u8(a)); } +inline v_uint16x8 v_popcount(const v_int16x8& a) +{ return v_popcount(v_reinterpret_as_u16(a)); } +inline v_uint32x4 v_popcount(const v_int32x4& a) +{ return v_popcount(v_reinterpret_as_u32(a)); } +inline v_uint64x2 v_popcount(const v_int64x2& a) +{ return v_popcount(v_reinterpret_as_u64(a)); } + +#define OPENCV_HAL_IMPL_WASM_CHECK_SIGNS(_Tpvec, suffix, scalarType) \ +inline int v_signmask(const _Tpvec& a) \ +{ \ + _Tpvec::lane_type a_[_Tpvec::nlanes]; \ + wasm_v128_store(a_, a.val); \ + int mask = 0; \ + for (int i = 0; i < _Tpvec::nlanes; i++) \ + mask |= (reinterpret_int(a_[i]) < 0) << i; \ + return mask; \ +} \ +inline bool v_check_all(const _Tpvec& a) \ +{ return wasm_i8x16_all_true(wasm_##suffix##_lt(a.val, wasm_##suffix##_splat(0))); } \ +inline bool v_check_any(const _Tpvec& a) \ +{ return wasm_i8x16_any_true(wasm_##suffix##_lt(a.val, wasm_##suffix##_splat(0)));; } + +OPENCV_HAL_IMPL_WASM_CHECK_SIGNS(v_uint8x16, i8x16, schar) +OPENCV_HAL_IMPL_WASM_CHECK_SIGNS(v_int8x16, i8x16, schar) +OPENCV_HAL_IMPL_WASM_CHECK_SIGNS(v_uint16x8, i16x8, short) +OPENCV_HAL_IMPL_WASM_CHECK_SIGNS(v_int16x8, i16x8, short) +OPENCV_HAL_IMPL_WASM_CHECK_SIGNS(v_uint32x4, i32x4, int) +OPENCV_HAL_IMPL_WASM_CHECK_SIGNS(v_int32x4, i32x4, int) +OPENCV_HAL_IMPL_WASM_CHECK_SIGNS(v_float32x4, i32x4, float) +OPENCV_HAL_IMPL_WASM_CHECK_SIGNS(v_float64x2, f64x2, double) + +#define OPENCV_HAL_IMPL_WASM_CHECK_ALL_ANY(_Tpvec, suffix, esuffix) \ +inline bool v_check_all(const _Tpvec& a) \ +{ \ + v128_t masked = v_reinterpret_as_##esuffix(a).val; \ + masked = wasm_i32x4_replace_lane(masked, 0, 0xffffffff); \ + masked = wasm_i32x4_replace_lane(masked, 2, 0xffffffff); \ + return wasm_i8x16_all_true(wasm_##suffix##_lt(masked, 
wasm_##suffix##_splat(0))); \ +} \ +inline bool v_check_any(const _Tpvec& a) \ +{ \ + v128_t masked = v_reinterpret_as_##esuffix(a).val; \ + masked = wasm_i32x4_replace_lane(masked, 0, 0x0); \ + masked = wasm_i32x4_replace_lane(masked, 2, 0x0); \ + return wasm_i8x16_any_true(wasm_##suffix##_lt(masked, wasm_##suffix##_splat(0))); \ +} \ + +OPENCV_HAL_IMPL_WASM_CHECK_ALL_ANY(v_int64x2, i32x4, s32) +OPENCV_HAL_IMPL_WASM_CHECK_ALL_ANY(v_uint64x2, i32x4, u32) + + +inline int v_scan_forward(const v_int8x16& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))); } +inline int v_scan_forward(const v_uint8x16& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))); } +inline int v_scan_forward(const v_int16x8& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 2; } +inline int v_scan_forward(const v_uint16x8& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 2; } +inline int v_scan_forward(const v_int32x4& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 4; } +inline int v_scan_forward(const v_uint32x4& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 4; } +inline int v_scan_forward(const v_float32x4& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 4; } +inline int v_scan_forward(const v_int64x2& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 8; } +inline int v_scan_forward(const v_uint64x2& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 8; } +inline int v_scan_forward(const v_float64x2& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 8; } + +#define OPENCV_HAL_IMPL_WASM_SELECT(_Tpvec) \ +inline _Tpvec v_select(const _Tpvec& mask, const _Tpvec& a, const _Tpvec& b) \ +{ \ + return _Tpvec(wasm_v128_bitselect(a.val, b.val, mask.val)); \ +} + +OPENCV_HAL_IMPL_WASM_SELECT(v_uint8x16) +OPENCV_HAL_IMPL_WASM_SELECT(v_int8x16) +OPENCV_HAL_IMPL_WASM_SELECT(v_uint16x8) +OPENCV_HAL_IMPL_WASM_SELECT(v_int16x8) +OPENCV_HAL_IMPL_WASM_SELECT(v_uint32x4) +OPENCV_HAL_IMPL_WASM_SELECT(v_int32x4) +OPENCV_HAL_IMPL_WASM_SELECT(v_uint64x2) +OPENCV_HAL_IMPL_WASM_SELECT(v_int64x2) +OPENCV_HAL_IMPL_WASM_SELECT(v_float32x4) +OPENCV_HAL_IMPL_WASM_SELECT(v_float64x2) + +#define OPENCV_HAL_IMPL_WASM_EXPAND(_Tpvec, _Tpwvec, _Tp, intrin) \ +inline void v_expand(const _Tpvec& a, _Tpwvec& b0, _Tpwvec& b1) \ +{ \ + b0.val = intrin(a.val); \ + b1.val = __CV_CAT(intrin, _high)(a.val); \ +} \ +inline _Tpwvec v_expand_low(const _Tpvec& a) \ +{ return _Tpwvec(intrin(a.val)); } \ +inline _Tpwvec v_expand_high(const _Tpvec& a) \ +{ return _Tpwvec(__CV_CAT(intrin, _high)(a.val)); } \ +inline _Tpwvec v_load_expand(const _Tp* ptr) \ +{ \ + v128_t a = wasm_v128_load(ptr); \ + return _Tpwvec(intrin(a)); \ +} + +OPENCV_HAL_IMPL_WASM_EXPAND(v_uint8x16, v_uint16x8, uchar, v128_cvtu8x16_i16x8) +OPENCV_HAL_IMPL_WASM_EXPAND(v_int8x16, v_int16x8, schar, v128_cvti8x16_i16x8) +OPENCV_HAL_IMPL_WASM_EXPAND(v_uint16x8, v_uint32x4, ushort, v128_cvtu16x8_i32x4) +OPENCV_HAL_IMPL_WASM_EXPAND(v_int16x8, v_int32x4, short, v128_cvti16x8_i32x4) +OPENCV_HAL_IMPL_WASM_EXPAND(v_uint32x4, v_uint64x2, unsigned, v128_cvtu32x4_i64x2) +OPENCV_HAL_IMPL_WASM_EXPAND(v_int32x4, v_int64x2, int, v128_cvti32x4_i64x2) + +#define OPENCV_HAL_IMPL_WASM_EXPAND_Q(_Tpvec, _Tp, intrin) \ +inline _Tpvec v_load_expand_q(const _Tp* ptr) \ +{ \ + v128_t a = wasm_v128_load(ptr); \ + return _Tpvec(intrin(a)); \ +} + +OPENCV_HAL_IMPL_WASM_EXPAND_Q(v_uint32x4, uchar, v128_cvtu8x16_i32x4) +OPENCV_HAL_IMPL_WASM_EXPAND_Q(v_int32x4, schar, 
v128_cvti8x16_i32x4)
+
+#define OPENCV_HAL_IMPL_WASM_UNPACKS(_Tpvec, suffix) \
+inline void v_zip(const _Tpvec& a0, const _Tpvec& a1, _Tpvec& b0, _Tpvec& b1) \
+{ \
+    b0.val = wasm_unpacklo_##suffix(a0.val, a1.val); \
+    b1.val = wasm_unpackhi_##suffix(a0.val, a1.val); \
+} \
+inline _Tpvec v_combine_low(const _Tpvec& a, const _Tpvec& b) \
+{ \
+    return _Tpvec(wasm_unpacklo_i64x2(a.val, b.val)); \
+} \
+inline _Tpvec v_combine_high(const _Tpvec& a, const _Tpvec& b) \
+{ \
+    return _Tpvec(wasm_unpackhi_i64x2(a.val, b.val)); \
+} \
+inline void v_recombine(const _Tpvec& a, const _Tpvec& b, _Tpvec& c, _Tpvec& d) \
+{ \
+    c.val = wasm_unpacklo_i64x2(a.val, b.val); \
+    d.val = wasm_unpackhi_i64x2(a.val, b.val); \
+}
+
+OPENCV_HAL_IMPL_WASM_UNPACKS(v_uint8x16, i8x16)
+OPENCV_HAL_IMPL_WASM_UNPACKS(v_int8x16, i8x16)
+OPENCV_HAL_IMPL_WASM_UNPACKS(v_uint16x8, i16x8)
+OPENCV_HAL_IMPL_WASM_UNPACKS(v_int16x8, i16x8)
+OPENCV_HAL_IMPL_WASM_UNPACKS(v_uint32x4, i32x4)
+OPENCV_HAL_IMPL_WASM_UNPACKS(v_int32x4, i32x4)
+OPENCV_HAL_IMPL_WASM_UNPACKS(v_float32x4, i32x4)
+OPENCV_HAL_IMPL_WASM_UNPACKS(v_float64x2, i64x2)
+
+template<int s, typename _Tpvec>
+inline _Tpvec v_extract(const _Tpvec& a, const _Tpvec& b)
+{
+    return v_rotate_right<s>(a, b);
+}
+
+inline v_int32x4 v_round(const v_float32x4& a)
+{
+    v128_t h = wasm_f32x4_splat(0.5);
+    return v_int32x4(wasm_i32x4_trunc_saturate_f32x4(wasm_f32x4_add(a.val, h)));
+}
+
+inline v_int32x4 v_floor(const v_float32x4& a)
+{
+    v128_t a1 = wasm_i32x4_trunc_saturate_f32x4(a.val);
+    v128_t mask = wasm_f32x4_lt(a.val, wasm_f32x4_convert_i32x4(a1));
+    return v_int32x4(wasm_i32x4_add(a1, mask));
+}
+
+inline v_int32x4 v_ceil(const v_float32x4& a)
+{
+    v128_t a1 = wasm_i32x4_trunc_saturate_f32x4(a.val);
+    v128_t mask = wasm_f32x4_gt(a.val, wasm_f32x4_convert_i32x4(a1));
+    return v_int32x4(wasm_i32x4_sub(a1, mask));
+}
+
+inline v_int32x4 v_trunc(const v_float32x4& a)
+{ return v_int32x4(wasm_i32x4_trunc_saturate_f32x4(a.val)); }
+
+#define OPENCV_HAL_IMPL_WASM_MATH_FUNC(func, cfunc) \
+inline v_int32x4 func(const v_float64x2& a) \
+{ \
+    double a_[2]; \
+    wasm_v128_store(a_, a.val); \
+    int c_[4]; \
+    c_[0] = cfunc(a_[0]); \
+    c_[1] = cfunc(a_[1]); \
+    c_[2] = 0; \
+    c_[3] = 0; \
+    return v_int32x4(wasm_v128_load(c_)); \
+}
+
+OPENCV_HAL_IMPL_WASM_MATH_FUNC(v_round, cvRound)
+OPENCV_HAL_IMPL_WASM_MATH_FUNC(v_floor, cvFloor)
+OPENCV_HAL_IMPL_WASM_MATH_FUNC(v_ceil, cvCeil)
+OPENCV_HAL_IMPL_WASM_MATH_FUNC(v_trunc, int)
+
+inline v_int32x4 v_round(const v_float64x2& a, const v_float64x2& b)
+{
+    double a_[2], b_[2];
+    wasm_v128_store(a_, a.val);
+    wasm_v128_store(b_, b.val);
+    int c_[4];
+    c_[0] = cvRound(a_[0]);
+    c_[1] = cvRound(a_[1]);
+    c_[2] = cvRound(b_[0]);
+    c_[3] = cvRound(b_[1]);
+    return v_int32x4(wasm_v128_load(c_));
+}
+
+#define OPENCV_HAL_IMPL_WASM_TRANSPOSE4x4(_Tpvec, suffix) \
+inline void v_transpose4x4(const _Tpvec& a0, const _Tpvec& a1, \
+                           const _Tpvec& a2, const _Tpvec& a3, \
+                           _Tpvec& b0, _Tpvec& b1, \
+                           _Tpvec& b2, _Tpvec& b3) \
+{ \
+    v128_t t0 = wasm_unpacklo_##suffix(a0.val, a1.val); \
+    v128_t t1 = wasm_unpacklo_##suffix(a2.val, a3.val); \
+    v128_t t2 = wasm_unpackhi_##suffix(a0.val, a1.val); \
+    v128_t t3 = wasm_unpackhi_##suffix(a2.val, a3.val); \
+\
+    b0.val = wasm_unpacklo_i64x2(t0, t1); \
+    b1.val = wasm_unpackhi_i64x2(t0, t1); \
+    b2.val = wasm_unpacklo_i64x2(t2, t3); \
+    b3.val = wasm_unpackhi_i64x2(t2, t3); \
+}
+
+OPENCV_HAL_IMPL_WASM_TRANSPOSE4x4(v_uint32x4, i32x4)
+OPENCV_HAL_IMPL_WASM_TRANSPOSE4x4(v_int32x4, i32x4)
+OPENCV_HAL_IMPL_WASM_TRANSPOSE4x4(v_float32x4,
i32x4) + +// load deinterleave +inline void v_load_deinterleave(const uchar* ptr, v_uint8x16& a, v_uint8x16& b) +{ + v128_t t00 = wasm_v128_load(ptr); + v128_t t01 = wasm_v128_load(ptr + 16); + + a.val = wasm_v8x16_shuffle(t00, t01, 0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30); + b.val = wasm_v8x16_shuffle(t00, t01, 1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31); +} + +inline void v_load_deinterleave(const uchar* ptr, v_uint8x16& a, v_uint8x16& b, v_uint8x16& c) +{ + v128_t t00 = wasm_v128_load(ptr); + v128_t t01 = wasm_v128_load(ptr + 16); + v128_t t02 = wasm_v128_load(ptr + 32); + + v128_t t10 = wasm_v8x16_shuffle(t00, t01, 0,3,6,9,12,15,18,21,24,27,30,1,2,4,5,7); + v128_t t11 = wasm_v8x16_shuffle(t00, t01, 1,4,7,10,13,16,19,22,25,28,31,0,2,3,5,6); + v128_t t12 = wasm_v8x16_shuffle(t00, t01, 2,5,8,11,14,17,20,23,26,29,0,1,3,4,6,7); + + a.val = wasm_v8x16_shuffle(t10, t02, 0,1,2,3,4,5,6,7,8,9,10,17,20,23,26,29); + b.val = wasm_v8x16_shuffle(t11, t02, 0,1,2,3,4,5,6,7,8,9,10,18,21,24,27,30); + c.val = wasm_v8x16_shuffle(t12, t02, 0,1,2,3,4,5,6,7,8,9,16,19,22,25,28,31); +} + +inline void v_load_deinterleave(const uchar* ptr, v_uint8x16& a, v_uint8x16& b, v_uint8x16& c, v_uint8x16& d) +{ + v128_t u0 = wasm_v128_load(ptr); // a0 b0 c0 d0 a1 b1 c1 d1 ... + v128_t u1 = wasm_v128_load(ptr + 16); // a4 b4 c4 d4 ... + v128_t u2 = wasm_v128_load(ptr + 32); // a8 b8 c8 d8 ... + v128_t u3 = wasm_v128_load(ptr + 48); // a12 b12 c12 d12 ... + + v128_t v0 = wasm_v8x16_shuffle(u0, u1, 0,4,8,12,16,20,24,28,1,5,9,13,17,21,25,29); + v128_t v1 = wasm_v8x16_shuffle(u2, u3, 0,4,8,12,16,20,24,28,1,5,9,13,17,21,25,29); + v128_t v2 = wasm_v8x16_shuffle(u0, u1, 2,6,10,14,18,22,26,30,3,7,11,15,19,23,27,31); + v128_t v3 = wasm_v8x16_shuffle(u2, u3, 2,6,10,14,18,22,26,30,3,7,11,15,19,23,27,31); + + a.val = wasm_v8x16_shuffle(v0, v1, 0,1,2,3,4,5,6,7,16,17,18,19,20,21,22,23); + b.val = wasm_v8x16_shuffle(v0, v1, 8,9,10,11,12,13,14,15,24,25,26,27,28,29,30,31); + c.val = wasm_v8x16_shuffle(v2, v3, 0,1,2,3,4,5,6,7,16,17,18,19,20,21,22,23); + d.val = wasm_v8x16_shuffle(v2, v3, 8,9,10,11,12,13,14,15,24,25,26,27,28,29,30,31); +} + +inline void v_load_deinterleave(const ushort* ptr, v_uint16x8& a, v_uint16x8& b) +{ + v128_t v0 = wasm_v128_load(ptr); // a0 b0 a1 b1 a2 b2 a3 b3 + v128_t v1 = wasm_v128_load(ptr + 8); // a4 b4 a5 b5 a6 b6 a7 b7 + + a.val = wasm_v8x16_shuffle(v0, v1, 0,1,4,5,8,9,12,13,16,17,20,21,24,25,28,29); // a0 a1 a2 a3 a4 a5 a6 a7 + b.val = wasm_v8x16_shuffle(v0, v1, 2,3,6,7,10,11,14,15,18,19,22,23,26,27,30,31); // b0 b1 ab b3 b4 b5 b6 b7 +} + +inline void v_load_deinterleave(const ushort* ptr, v_uint16x8& a, v_uint16x8& b, v_uint16x8& c) +{ + v128_t t00 = wasm_v128_load(ptr); // a0 b0 c0 a1 b1 c1 a2 b2 + v128_t t01 = wasm_v128_load(ptr + 8); // c2 a3 b3 c3 a4 b4 c4 a5 + v128_t t02 = wasm_v128_load(ptr + 16); // b5 c5 a6 b6 c6 a7 b7 c7 + + v128_t t10 = wasm_v8x16_shuffle(t00, t01, 0,1,6,7,12,13,18,19,24,25,30,31,2,3,4,5); + v128_t t11 = wasm_v8x16_shuffle(t00, t01, 2,3,8,9,14,15,20,21,26,27,0,1,4,5,6,7); + v128_t t12 = wasm_v8x16_shuffle(t00, t01, 4,5,10,11,16,17,22,23,28,29,0,1,2,3,6,7); + + a.val = wasm_v8x16_shuffle(t10, t02, 0,1,2,3,4,5,6,7,8,9,10,11,20,21,26,27); + b.val = wasm_v8x16_shuffle(t11, t02, 0,1,2,3,4,5,6,7,8,9,16,17,22,23,28,29); + c.val = wasm_v8x16_shuffle(t12, t02, 0,1,2,3,4,5,6,7,8,9,18,19,24,25,30,31); +} + +inline void v_load_deinterleave(const ushort* ptr, v_uint16x8& a, v_uint16x8& b, v_uint16x8& c, v_uint16x8& d) +{ + v128_t u0 = wasm_v128_load(ptr); // a0 b0 c0 d0 a1 b1 c1 d1 + v128_t u1 
= wasm_v128_load(ptr + 8); // a2 b2 c2 d2 ... + v128_t u2 = wasm_v128_load(ptr + 16); // a4 b4 c4 d4 ... + v128_t u3 = wasm_v128_load(ptr + 24); // a6 b6 c6 d6 ... + + v128_t v0 = wasm_v8x16_shuffle(u0, u1, 0,1,8,9,16,17,24,25,2,3,10,11,18,19,26,27); // a0 a1 a2 a3 b0 b1 b2 b3 + v128_t v1 = wasm_v8x16_shuffle(u2, u3, 0,1,8,9,16,17,24,25,2,3,10,11,18,19,26,27); // a4 a5 a6 a7 b4 b5 b6 b7 + v128_t v2 = wasm_v8x16_shuffle(u0, u1, 4,5,12,13,20,21,28,29,6,7,14,15,22,23,30,31); // c0 c1 c2 c3 d0 d1 d2 d3 + v128_t v3 = wasm_v8x16_shuffle(u2, u3, 4,5,12,13,20,21,28,29,6,7,14,15,22,23,30,31); // c4 c5 c6 c7 d4 d5 d6 d7 + + a.val = wasm_v8x16_shuffle(v0, v1, 0,1,2,3,4,5,6,7,16,17,18,19,20,21,22,23); + b.val = wasm_v8x16_shuffle(v0, v1, 8,9,10,11,12,13,14,15,24,25,26,27,28,29,30,31); + c.val = wasm_v8x16_shuffle(v2, v3, 0,1,2,3,4,5,6,7,16,17,18,19,20,21,22,23); + d.val = wasm_v8x16_shuffle(v2, v3, 8,9,10,11,12,13,14,15,24,25,26,27,28,29,30,31); +} + +inline void v_load_deinterleave(const unsigned* ptr, v_uint32x4& a, v_uint32x4& b) +{ + v128_t v0 = wasm_v128_load(ptr); // a0 b0 a1 b1 + v128_t v1 = wasm_v128_load(ptr + 4); // a2 b2 a3 b3 + + a.val = wasm_v8x16_shuffle(v0, v1, 0,1,2,3,8,9,10,11,16,17,18,19,24,25,26,27); // a0 a1 a2 a3 + b.val = wasm_v8x16_shuffle(v0, v1, 4,5,6,7,12,13,14,15,20,21,22,23,28,29,30,31); // b0 b1 b2 b3 +} + +inline void v_load_deinterleave(const unsigned* ptr, v_uint32x4& a, v_uint32x4& b, v_uint32x4& c) +{ + v128_t t00 = wasm_v128_load(ptr); // a0 b0 c0 a1 + v128_t t01 = wasm_v128_load(ptr + 4); // b2 c2 a3 b3 + v128_t t02 = wasm_v128_load(ptr + 8); // c3 a4 b4 c4 + + v128_t t10 = wasm_v8x16_shuffle(t00, t01, 0,1,2,3,12,13,14,15,24,25,26,27,4,5,6,7); + v128_t t11 = wasm_v8x16_shuffle(t00, t01, 4,5,6,7,16,17,18,19,28,29,30,31,0,1,2,3); + v128_t t12 = wasm_v8x16_shuffle(t00, t01, 8,9,10,11,20,21,22,23,0,1,2,3,4,5,6,7); + + a.val = wasm_v8x16_shuffle(t10, t02, 0,1,2,3,4,5,6,7,8,9,10,11,20,21,22,23); + b.val = wasm_v8x16_shuffle(t11, t02, 0,1,2,3,4,5,6,7,8,9,10,11,24,25,26,27); + c.val = wasm_v8x16_shuffle(t12, t02, 0,1,2,3,4,5,6,7,16,17,18,19,28,29,30,31); +} + +inline void v_load_deinterleave(const unsigned* ptr, v_uint32x4& a, v_uint32x4& b, v_uint32x4& c, v_uint32x4& d) +{ + v_uint32x4 s0(wasm_v128_load(ptr)); // a0 b0 c0 d0 + v_uint32x4 s1(wasm_v128_load(ptr + 4)); // a1 b1 c1 d1 + v_uint32x4 s2(wasm_v128_load(ptr + 8)); // a2 b2 c2 d2 + v_uint32x4 s3(wasm_v128_load(ptr + 12)); // a3 b3 c3 d3 + + v_transpose4x4(s0, s1, s2, s3, a, b, c, d); +} + +inline void v_load_deinterleave(const float* ptr, v_float32x4& a, v_float32x4& b) +{ + v128_t v0 = wasm_v128_load(ptr); // a0 b0 a1 b1 + v128_t v1 = wasm_v128_load((ptr + 4)); // a2 b2 a3 b3 + + a.val = wasm_v8x16_shuffle(v0, v1, 0,1,2,3,8,9,10,11,16,17,18,19,24,25,26,27); // a0 a1 a2 a3 + b.val = wasm_v8x16_shuffle(v0, v1, 4,5,6,7,12,13,14,15,20,21,22,23,28,29,30,31); // b0 b1 b2 b3 +} + +inline void v_load_deinterleave(const float* ptr, v_float32x4& a, v_float32x4& b, v_float32x4& c) +{ + v128_t t00 = wasm_v128_load(ptr); // a0 b0 c0 a1 + v128_t t01 = wasm_v128_load(ptr + 4); // b2 c2 a3 b3 + v128_t t02 = wasm_v128_load(ptr + 8); // c3 a4 b4 c4 + + v128_t t10 = wasm_v8x16_shuffle(t00, t01, 0,1,2,3,12,13,14,15,24,25,26,27,4,5,6,7); + v128_t t11 = wasm_v8x16_shuffle(t00, t01, 4,5,6,7,16,17,18,19,28,29,30,31,0,1,2,3); + v128_t t12 = wasm_v8x16_shuffle(t00, t01, 8,9,10,11,20,21,22,23,0,1,2,3,4,5,6,7); + + a.val = wasm_v8x16_shuffle(t10, t02, 0,1,2,3,4,5,6,7,8,9,10,11,20,21,22,23); + b.val = wasm_v8x16_shuffle(t11, t02, 
0,1,2,3,4,5,6,7,8,9,10,11,24,25,26,27); + c.val = wasm_v8x16_shuffle(t12, t02, 0,1,2,3,4,5,6,7,16,17,18,19,28,29,30,31); +} + +inline void v_load_deinterleave(const float* ptr, v_float32x4& a, v_float32x4& b, v_float32x4& c, v_float32x4& d) +{ + v_float32x4 s0(wasm_v128_load(ptr)); // a0 b0 c0 d0 + v_float32x4 s1(wasm_v128_load(ptr + 4)); // a1 b1 c1 d1 + v_float32x4 s2(wasm_v128_load(ptr + 8)); // a2 b2 c2 d2 + v_float32x4 s3(wasm_v128_load(ptr + 12)); // a3 b3 c3 d3 + + v_transpose4x4(s0, s1, s2, s3, a, b, c, d); +} + +inline void v_load_deinterleave(const uint64 *ptr, v_uint64x2& a, v_uint64x2& b) +{ + v128_t t0 = wasm_v128_load(ptr); // a0 b0 + v128_t t1 = wasm_v128_load(ptr + 2); // a1 b1 + + a.val = wasm_unpacklo_i64x2(t0, t1); + b.val = wasm_unpackhi_i64x2(t0, t1); +} + +inline void v_load_deinterleave(const uint64 *ptr, v_uint64x2& a, v_uint64x2& b, v_uint64x2& c) +{ + v128_t t0 = wasm_v128_load(ptr); // a0, b0 + v128_t t1 = wasm_v128_load(ptr + 2); // c0, a1 + v128_t t2 = wasm_v128_load(ptr + 4); // b1, c1 + + a.val = wasm_v8x16_shuffle(t0, t1, 0,1,2,3,4,5,6,7,24,25,26,27,28,29,30,31); + b.val = wasm_v8x16_shuffle(t0, t2, 8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23); + c.val = wasm_v8x16_shuffle(t1, t2, 0,1,2,3,4,5,6,7,24,25,26,27,28,29,30,31); +} + +inline void v_load_deinterleave(const uint64 *ptr, v_uint64x2& a, + v_uint64x2& b, v_uint64x2& c, v_uint64x2& d) +{ + v128_t t0 = wasm_v128_load(ptr); // a0 b0 + v128_t t1 = wasm_v128_load(ptr + 2); // c0 d0 + v128_t t2 = wasm_v128_load(ptr + 4); // a1 b1 + v128_t t3 = wasm_v128_load(ptr + 6); // c1 d1 + + a.val = wasm_unpacklo_i64x2(t0, t2); + b.val = wasm_unpackhi_i64x2(t0, t2); + c.val = wasm_unpacklo_i64x2(t1, t3); + d.val = wasm_unpackhi_i64x2(t1, t3); +} + +// store interleave + +inline void v_store_interleave( uchar* ptr, const v_uint8x16& a, const v_uint8x16& b, + hal::StoreMode /*mode*/ = hal::STORE_UNALIGNED) +{ + v128_t v0 = wasm_unpacklo_i8x16(a.val, b.val); + v128_t v1 = wasm_unpackhi_i8x16(a.val, b.val); + + wasm_v128_store(ptr, v0); + wasm_v128_store(ptr + 16, v1); +} + +inline void v_store_interleave( uchar* ptr, const v_uint8x16& a, const v_uint8x16& b, + const v_uint8x16& c, hal::StoreMode /*mode*/ = hal::STORE_UNALIGNED) +{ + v128_t t00 = wasm_v8x16_shuffle(a.val, b.val, 0,16,0,1,17,0,2,18,0,3,19,0,4,20,0,5); + v128_t t01 = wasm_v8x16_shuffle(a.val, b.val, 21,0,6,22,0,7,23,0,8,24,0,9,25,0,10,26); + v128_t t02 = wasm_v8x16_shuffle(a.val, b.val, 0,11,27,0,12,28,0,13,29,0,14,30,0,15,31,0); + + v128_t t10 = wasm_v8x16_shuffle(t00, c.val, 0,1,16,3,4,17,6,7,18,9,10,19,12,13,20,15); + v128_t t11 = wasm_v8x16_shuffle(t01, c.val, 0,21,2,3,22,5,6,23,8,9,24,11,12,25,14,15); + v128_t t12 = wasm_v8x16_shuffle(t02, c.val, 26,1,2,27,4,5,28,7,8,29,10,11,30,13,14,31); + + wasm_v128_store(ptr, t10); + wasm_v128_store(ptr + 16, t11); + wasm_v128_store(ptr + 32, t12); +} + +inline void v_store_interleave( uchar* ptr, const v_uint8x16& a, const v_uint8x16& b, + const v_uint8x16& c, const v_uint8x16& d, + hal::StoreMode /*mode*/ = hal::STORE_UNALIGNED) +{ + // a0 a1 a2 a3 .... + // b0 b1 b2 b3 .... + // c0 c1 c2 c3 .... + // d0 d1 d2 d3 .... + v128_t u0 = wasm_unpacklo_i8x16(a.val, c.val); // a0 c0 a1 c1 ... + v128_t u1 = wasm_unpackhi_i8x16(a.val, c.val); // a8 c8 a9 c9 ... + v128_t u2 = wasm_unpacklo_i8x16(b.val, d.val); // b0 d0 b1 d1 ... + v128_t u3 = wasm_unpackhi_i8x16(b.val, d.val); // b8 d8 b9 d9 ... + + v128_t v0 = wasm_unpacklo_i8x16(u0, u2); // a0 b0 c0 d0 ... + v128_t v1 = wasm_unpackhi_i8x16(u0, u2); // a4 b4 c4 d4 ... 
+ v128_t v2 = wasm_unpacklo_i8x16(u1, u3); // a8 b8 c8 d8 ... + v128_t v3 = wasm_unpackhi_i8x16(u1, u3); // a12 b12 c12 d12 ... + + wasm_v128_store(ptr, v0); + wasm_v128_store(ptr + 16, v1); + wasm_v128_store(ptr + 32, v2); + wasm_v128_store(ptr + 48, v3); +} + +inline void v_store_interleave( ushort* ptr, const v_uint16x8& a, const v_uint16x8& b, + hal::StoreMode /*mode*/ = hal::STORE_UNALIGNED) +{ + v128_t v0 = wasm_unpacklo_i16x8(a.val, b.val); + v128_t v1 = wasm_unpackhi_i16x8(a.val, b.val); + + wasm_v128_store(ptr, v0); + wasm_v128_store(ptr + 8, v1); +} + +inline void v_store_interleave( ushort* ptr, const v_uint16x8& a, + const v_uint16x8& b, const v_uint16x8& c, + hal::StoreMode /*mode*/ = hal::STORE_UNALIGNED) +{ + v128_t t00 = wasm_v8x16_shuffle(a.val, b.val, 0,1,16,17,0,0,2,3,18,19,0,0,4,5,20,21); + v128_t t01 = wasm_v8x16_shuffle(a.val, b.val, 0,0,6,7,22,23,0,0,8,9,24,25,0,0,10,11); + v128_t t02 = wasm_v8x16_shuffle(a.val, b.val, 26,27,0,0,12,13,28,29,0,0,14,15,30,31,0,0); + + v128_t t10 = wasm_v8x16_shuffle(t00, c.val, 0,1,2,3,16,17,6,7,8,9,18,19,12,13,14,15); + v128_t t11 = wasm_v8x16_shuffle(t01, c.val, 20,21,2,3,4,5,22,23,8,9,10,11,24,25,14,15); + v128_t t12 = wasm_v8x16_shuffle(t02, c.val, 0,1,26,27,4,5,6,7,28,29,10,11,12,13,30,31); + + wasm_v128_store(ptr, t10); + wasm_v128_store(ptr + 8, t11); + wasm_v128_store(ptr + 16, t12); +} + +inline void v_store_interleave( ushort* ptr, const v_uint16x8& a, const v_uint16x8& b, + const v_uint16x8& c, const v_uint16x8& d, + hal::StoreMode /*mode*/ = hal::STORE_UNALIGNED) +{ + // a0 a1 a2 a3 .... + // b0 b1 b2 b3 .... + // c0 c1 c2 c3 .... + // d0 d1 d2 d3 .... + v128_t u0 = wasm_unpacklo_i16x8(a.val, c.val); // a0 c0 a1 c1 ... + v128_t u1 = wasm_unpackhi_i16x8(a.val, c.val); // a4 c4 a5 c5 ... + v128_t u2 = wasm_unpacklo_i16x8(b.val, d.val); // b0 d0 b1 d1 ... + v128_t u3 = wasm_unpackhi_i16x8(b.val, d.val); // b4 d4 b5 d5 ... + + v128_t v0 = wasm_unpacklo_i16x8(u0, u2); // a0 b0 c0 d0 ... + v128_t v1 = wasm_unpackhi_i16x8(u0, u2); // a2 b2 c2 d2 ... + v128_t v2 = wasm_unpacklo_i16x8(u1, u3); // a4 b4 c4 d4 ... + v128_t v3 = wasm_unpackhi_i16x8(u1, u3); // a6 b6 c6 d6 ... 
+ + wasm_v128_store(ptr, v0); + wasm_v128_store(ptr + 8, v1); + wasm_v128_store(ptr + 16, v2); + wasm_v128_store(ptr + 24, v3); +} + +inline void v_store_interleave( unsigned* ptr, const v_uint32x4& a, const v_uint32x4& b, + hal::StoreMode /*mode*/ = hal::STORE_UNALIGNED) +{ + v128_t v0 = wasm_unpacklo_i32x4(a.val, b.val); + v128_t v1 = wasm_unpackhi_i32x4(a.val, b.val); + + wasm_v128_store(ptr, v0); + wasm_v128_store(ptr + 4, v1); +} + +inline void v_store_interleave( unsigned* ptr, const v_uint32x4& a, const v_uint32x4& b, + const v_uint32x4& c, hal::StoreMode /*mode*/ = hal::STORE_UNALIGNED) +{ + v128_t t00 = wasm_v8x16_shuffle(a.val, b.val, 0,1,2,3,16,17,18,19,0,0,0,0,4,5,6,7); + v128_t t01 = wasm_v8x16_shuffle(a.val, b.val, 20,21,22,23,0,0,0,0,8,9,10,11,24,25,26,27); + v128_t t02 = wasm_v8x16_shuffle(a.val, b.val, 0,0,0,0,12,13,14,15,28,29,30,31,0,0,0,0); + + v128_t t10 = wasm_v8x16_shuffle(t00, c.val, 0,1,2,3,4,5,6,7,16,17,18,19,12,13,14,15); + v128_t t11 = wasm_v8x16_shuffle(t01, c.val, 0,1,2,3,20,21,22,23,8,9,10,11,12,13,14,15); + v128_t t12 = wasm_v8x16_shuffle(t02, c.val, 24,25,26,27,4,5,6,7,8,9,10,11,28,29,30,31); + + wasm_v128_store(ptr, t10); + wasm_v128_store(ptr + 4, t11); + wasm_v128_store(ptr + 8, t12); +} + +inline void v_store_interleave(unsigned* ptr, const v_uint32x4& a, const v_uint32x4& b, + const v_uint32x4& c, const v_uint32x4& d, + hal::StoreMode /*mode*/ = hal::STORE_UNALIGNED) +{ + v_uint32x4 v0, v1, v2, v3; + v_transpose4x4(a, b, c, d, v0, v1, v2, v3); + + wasm_v128_store(ptr, v0.val); + wasm_v128_store(ptr + 4, v1.val); + wasm_v128_store(ptr + 8, v2.val); + wasm_v128_store(ptr + 12, v3.val); +} + +// 2-channel, float only +inline void v_store_interleave(float* ptr, const v_float32x4& a, const v_float32x4& b, + hal::StoreMode /*mode*/ = hal::STORE_UNALIGNED) +{ + v128_t v0 = wasm_unpacklo_i32x4(a.val, b.val); + v128_t v1 = wasm_unpackhi_i32x4(a.val, b.val); + + wasm_v128_store(ptr, v0); + wasm_v128_store(ptr + 4, v1); +} + +inline void v_store_interleave(float* ptr, const v_float32x4& a, const v_float32x4& b, + const v_float32x4& c, hal::StoreMode /*mode*/ = hal::STORE_UNALIGNED) +{ + v128_t t00 = wasm_v8x16_shuffle(a.val, b.val, 0,1,2,3,16,17,18,19,0,0,0,0,4,5,6,7); + v128_t t01 = wasm_v8x16_shuffle(a.val, b.val, 20,21,22,23,0,0,0,0,8,9,10,11,24,25,26,27); + v128_t t02 = wasm_v8x16_shuffle(a.val, b.val, 0,0,0,0,12,13,14,15,28,29,30,31,0,0,0,0); + + v128_t t10 = wasm_v8x16_shuffle(t00, c.val, 0,1,2,3,4,5,6,7,16,17,18,19,12,13,14,15); + v128_t t11 = wasm_v8x16_shuffle(t01, c.val, 0,1,2,3,20,21,22,23,8,9,10,11,12,13,14,15); + v128_t t12 = wasm_v8x16_shuffle(t02, c.val, 24,25,26,27,4,5,6,7,8,9,10,11,28,29,30,31); + + wasm_v128_store(ptr, t10); + wasm_v128_store(ptr + 4, t11); + wasm_v128_store(ptr + 8, t12); +} + +inline void v_store_interleave(float* ptr, const v_float32x4& a, const v_float32x4& b, + const v_float32x4& c, const v_float32x4& d, + hal::StoreMode /*mode*/ = hal::STORE_UNALIGNED) +{ + v_float32x4 v0, v1, v2, v3; + v_transpose4x4(a, b, c, d, v0, v1, v2, v3); + + wasm_v128_store(ptr, v0.val); + wasm_v128_store(ptr + 4, v1.val); + wasm_v128_store(ptr + 8, v2.val); + wasm_v128_store(ptr + 12, v3.val); +} + +inline void v_store_interleave(uint64 *ptr, const v_uint64x2& a, const v_uint64x2& b, + hal::StoreMode /*mode*/ = hal::STORE_UNALIGNED) +{ + v128_t v0 = wasm_unpacklo_i64x2(a.val, b.val); + v128_t v1 = wasm_unpackhi_i64x2(a.val, b.val); + + wasm_v128_store(ptr, v0); + wasm_v128_store(ptr + 2, v1); +} + +inline void v_store_interleave(uint64 *ptr, 
const v_uint64x2& a, const v_uint64x2& b, + const v_uint64x2& c, hal::StoreMode /*mode*/ = hal::STORE_UNALIGNED) +{ + v128_t v0 = wasm_v8x16_shuffle(a.val, b.val, 0,1,2,3,4,5,6,7,16,17,18,19,20,21,22,23); + v128_t v1 = wasm_v8x16_shuffle(a.val, c.val, 16,17,18,19,20,21,22,23,8,9,10,11,12,13,14,15); + v128_t v2 = wasm_v8x16_shuffle(b.val, c.val, 8,9,10,11,12,13,14,15,24,25,26,27,28,29,30,31); + + wasm_v128_store(ptr, v0); + wasm_v128_store(ptr + 2, v1); + wasm_v128_store(ptr + 4, v2); +} + +inline void v_store_interleave(uint64 *ptr, const v_uint64x2& a, const v_uint64x2& b, + const v_uint64x2& c, const v_uint64x2& d, + hal::StoreMode /*mode*/ = hal::STORE_UNALIGNED) +{ + v128_t v0 = wasm_unpacklo_i64x2(a.val, b.val); + v128_t v1 = wasm_unpacklo_i64x2(c.val, d.val); + v128_t v2 = wasm_unpackhi_i64x2(a.val, b.val); + v128_t v3 = wasm_unpackhi_i64x2(c.val, d.val); + + wasm_v128_store(ptr, v0); + wasm_v128_store(ptr + 2, v1); + wasm_v128_store(ptr + 4, v2); + wasm_v128_store(ptr + 6, v3); +} + +#define OPENCV_HAL_IMPL_WASM_LOADSTORE_INTERLEAVE(_Tpvec0, _Tp0, suffix0, _Tpvec1, _Tp1, suffix1) \ +inline void v_load_deinterleave( const _Tp0* ptr, _Tpvec0& a0, _Tpvec0& b0 ) \ +{ \ + _Tpvec1 a1, b1; \ + v_load_deinterleave((const _Tp1*)ptr, a1, b1); \ + a0 = v_reinterpret_as_##suffix0(a1); \ + b0 = v_reinterpret_as_##suffix0(b1); \ +} \ +inline void v_load_deinterleave( const _Tp0* ptr, _Tpvec0& a0, _Tpvec0& b0, _Tpvec0& c0 ) \ +{ \ + _Tpvec1 a1, b1, c1; \ + v_load_deinterleave((const _Tp1*)ptr, a1, b1, c1); \ + a0 = v_reinterpret_as_##suffix0(a1); \ + b0 = v_reinterpret_as_##suffix0(b1); \ + c0 = v_reinterpret_as_##suffix0(c1); \ +} \ +inline void v_load_deinterleave( const _Tp0* ptr, _Tpvec0& a0, _Tpvec0& b0, _Tpvec0& c0, _Tpvec0& d0 ) \ +{ \ + _Tpvec1 a1, b1, c1, d1; \ + v_load_deinterleave((const _Tp1*)ptr, a1, b1, c1, d1); \ + a0 = v_reinterpret_as_##suffix0(a1); \ + b0 = v_reinterpret_as_##suffix0(b1); \ + c0 = v_reinterpret_as_##suffix0(c1); \ + d0 = v_reinterpret_as_##suffix0(d1); \ +} \ +inline void v_store_interleave( _Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0, \ + hal::StoreMode mode = hal::STORE_UNALIGNED ) \ +{ \ + _Tpvec1 a1 = v_reinterpret_as_##suffix1(a0); \ + _Tpvec1 b1 = v_reinterpret_as_##suffix1(b0); \ + v_store_interleave((_Tp1*)ptr, a1, b1, mode); \ +} \ +inline void v_store_interleave( _Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0, \ + const _Tpvec0& c0, hal::StoreMode mode = hal::STORE_UNALIGNED ) \ +{ \ + _Tpvec1 a1 = v_reinterpret_as_##suffix1(a0); \ + _Tpvec1 b1 = v_reinterpret_as_##suffix1(b0); \ + _Tpvec1 c1 = v_reinterpret_as_##suffix1(c0); \ + v_store_interleave((_Tp1*)ptr, a1, b1, c1, mode); \ +} \ +inline void v_store_interleave( _Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0, \ + const _Tpvec0& c0, const _Tpvec0& d0, \ + hal::StoreMode mode = hal::STORE_UNALIGNED ) \ +{ \ + _Tpvec1 a1 = v_reinterpret_as_##suffix1(a0); \ + _Tpvec1 b1 = v_reinterpret_as_##suffix1(b0); \ + _Tpvec1 c1 = v_reinterpret_as_##suffix1(c0); \ + _Tpvec1 d1 = v_reinterpret_as_##suffix1(d0); \ + v_store_interleave((_Tp1*)ptr, a1, b1, c1, d1, mode); \ +} + +OPENCV_HAL_IMPL_WASM_LOADSTORE_INTERLEAVE(v_int8x16, schar, s8, v_uint8x16, uchar, u8) +OPENCV_HAL_IMPL_WASM_LOADSTORE_INTERLEAVE(v_int16x8, short, s16, v_uint16x8, ushort, u16) +OPENCV_HAL_IMPL_WASM_LOADSTORE_INTERLEAVE(v_int32x4, int, s32, v_uint32x4, unsigned, u32) +OPENCV_HAL_IMPL_WASM_LOADSTORE_INTERLEAVE(v_int64x2, int64, s64, v_uint64x2, uint64, u64) +OPENCV_HAL_IMPL_WASM_LOADSTORE_INTERLEAVE(v_float64x2, double, f64, 
v_uint64x2, uint64, u64) + +inline v_float32x4 v_cvt_f32(const v_int32x4& a) +{ + return v_float32x4(wasm_f32x4_convert_i32x4(a.val)); +} + +inline v_float32x4 v_cvt_f32(const v_float64x2& a) +{ + double a_[2]; + wasm_v128_store(a_, a.val); + float c_[4]; + c_[0] = (float)(a_[0]); + c_[1] = (float)(a_[1]); + c_[2] = 0; + c_[3] = 0; + return v_float32x4(wasm_v128_load(c_)); +} + +inline v_float32x4 v_cvt_f32(const v_float64x2& a, const v_float64x2& b) +{ + double a_[2], b_[2]; + wasm_v128_store(a_, a.val); + wasm_v128_store(b_, b.val); + float c_[4]; + c_[0] = (float)(a_[0]); + c_[1] = (float)(a_[1]); + c_[2] = (float)(b_[0]); + c_[3] = (float)(b_[1]); + return v_float32x4(wasm_v128_load(c_)); +} + +inline v_float64x2 v_cvt_f64(const v_int32x4& a) +{ +#ifdef __wasm_unimplemented_simd128__ + v128_t p = v128_cvti32x4_i64x2(a.val); + return v_float64x2(wasm_f64x2_convert_i64x2(p)); +#else + int a_[4]; + wasm_v128_store(a_, a.val); + double c_[2]; + c_[0] = (double)(a_[0]); + c_[1] = (double)(a_[1]); + return v_float64x2(wasm_v128_load(c_)); +#endif +} + +inline v_float64x2 v_cvt_f64_high(const v_int32x4& a) +{ +#ifdef __wasm_unimplemented_simd128__ + v128_t p = v128_cvti32x4_i64x2_high(a.val); + return v_float64x2(wasm_f64x2_convert_i64x2(p)); +#else + int a_[4]; + wasm_v128_store(a_, a.val); + double c_[2]; + c_[0] = (double)(a_[2]); + c_[1] = (double)(a_[3]); + return v_float64x2(wasm_v128_load(c_)); +#endif +} + +inline v_float64x2 v_cvt_f64(const v_float32x4& a) +{ + float a_[4]; + wasm_v128_store(a_, a.val); + double c_[2]; + c_[0] = (double)(a_[0]); + c_[1] = (double)(a_[1]); + return v_float64x2(wasm_v128_load(c_)); +} + +inline v_float64x2 v_cvt_f64_high(const v_float32x4& a) +{ + float a_[4]; + wasm_v128_store(a_, a.val); + double c_[2]; + c_[0] = (double)(a_[2]); + c_[1] = (double)(a_[3]); + return v_float64x2(wasm_v128_load(c_)); +} + +inline v_float64x2 v_cvt_f64(const v_int64x2& a) +{ +#ifdef __wasm_unimplemented_simd128__ + return v_float64x2(wasm_f64x2_convert_i64x2(a.val)); +#else + int64 a_[2]; + wasm_v128_store(a_, a.val); + double c_[2]; + c_[0] = (double)(a_[0]); + c_[1] = (double)(a_[1]); + return v_float64x2(wasm_v128_load(c_)); +#endif +} + +////////////// Lookup table access //////////////////// + +inline v_int8x16 v_lut(const schar* tab, const int* idx) +{ + return v_int8x16(tab[idx[0]], tab[idx[1]], tab[idx[ 2]], tab[idx[ 3]], tab[idx[ 4]], tab[idx[ 5]], tab[idx[ 6]], tab[idx[ 7]], + tab[idx[8]], tab[idx[9]], tab[idx[10]], tab[idx[11]], tab[idx[12]], tab[idx[13]], tab[idx[14]], tab[idx[15]]); +} +inline v_int8x16 v_lut_pairs(const schar* tab, const int* idx) +{ + return v_int8x16(tab[idx[0]], tab[idx[0]+1], tab[idx[1]], tab[idx[1]+1], tab[idx[2]], tab[idx[2]+1], tab[idx[3]], tab[idx[3]+1], + tab[idx[4]], tab[idx[4]+1], tab[idx[5]], tab[idx[5]+1], tab[idx[6]], tab[idx[6]+1], tab[idx[7]], tab[idx[7]+1]); +} +inline v_int8x16 v_lut_quads(const schar* tab, const int* idx) +{ + return v_int8x16(tab[idx[0]], tab[idx[0]+1], tab[idx[0]+2], tab[idx[0]+3], tab[idx[1]], tab[idx[1]+1], tab[idx[1]+2], tab[idx[1]+3], + tab[idx[2]], tab[idx[2]+1], tab[idx[2]+2], tab[idx[2]+3], tab[idx[3]], tab[idx[3]+1], tab[idx[3]+2], tab[idx[3]+3]); +} +inline v_uint8x16 v_lut(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut((const schar *)tab, idx)); } +inline v_uint8x16 v_lut_pairs(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut_pairs((const schar *)tab, idx)); } +inline v_uint8x16 v_lut_quads(const uchar* tab, const int* idx) { return 
v_reinterpret_as_u8(v_lut_quads((const schar *)tab, idx)); } + +inline v_int16x8 v_lut(const short* tab, const int* idx) +{ + return v_int16x8(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]], + tab[idx[4]], tab[idx[5]], tab[idx[6]], tab[idx[7]]); +} +inline v_int16x8 v_lut_pairs(const short* tab, const int* idx) +{ + return v_int16x8(tab[idx[0]], tab[idx[0]+1], tab[idx[1]], tab[idx[1]+1], + tab[idx[2]], tab[idx[2]+1], tab[idx[3]], tab[idx[3]+1]); +} +inline v_int16x8 v_lut_quads(const short* tab, const int* idx) +{ + return v_int16x8(tab[idx[0]], tab[idx[0]+1], tab[idx[0]+2], tab[idx[0]+3], + tab[idx[1]], tab[idx[1]+1], tab[idx[1]+2], tab[idx[1]+3]); +} +inline v_uint16x8 v_lut(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut((const short *)tab, idx)); } +inline v_uint16x8 v_lut_pairs(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut_pairs((const short *)tab, idx)); } +inline v_uint16x8 v_lut_quads(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut_quads((const short *)tab, idx)); } + +inline v_int32x4 v_lut(const int* tab, const int* idx) +{ + return v_int32x4(tab[idx[0]], tab[idx[1]], + tab[idx[2]], tab[idx[3]]); +} +inline v_int32x4 v_lut_pairs(const int* tab, const int* idx) +{ + return v_int32x4(tab[idx[0]], tab[idx[0]+1], + tab[idx[1]], tab[idx[1]+1]); +} +inline v_int32x4 v_lut_quads(const int* tab, const int* idx) +{ + return v_int32x4(wasm_v128_load(tab + idx[0])); +} +inline v_uint32x4 v_lut(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut((const int *)tab, idx)); } +inline v_uint32x4 v_lut_pairs(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut_pairs((const int *)tab, idx)); } +inline v_uint32x4 v_lut_quads(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut_quads((const int *)tab, idx)); } + +inline v_int64x2 v_lut(const int64_t* tab, const int* idx) +{ + return v_int64x2(tab[idx[0]], tab[idx[1]]); +} +inline v_int64x2 v_lut_pairs(const int64_t* tab, const int* idx) +{ + return v_int64x2(wasm_v128_load(tab + idx[0])); +} +inline v_uint64x2 v_lut(const uint64_t* tab, const int* idx) { return v_reinterpret_as_u64(v_lut((const int64_t *)tab, idx)); } +inline v_uint64x2 v_lut_pairs(const uint64_t* tab, const int* idx) { return v_reinterpret_as_u64(v_lut_pairs((const int64_t *)tab, idx)); } + +inline v_float32x4 v_lut(const float* tab, const int* idx) +{ + return v_float32x4(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]); +} +inline v_float32x4 v_lut_pairs(const float* tab, const int* idx) { return v_reinterpret_as_f32(v_lut_pairs((const int *)tab, idx)); } +inline v_float32x4 v_lut_quads(const float* tab, const int* idx) { return v_reinterpret_as_f32(v_lut_quads((const int *)tab, idx)); } + +inline v_float64x2 v_lut(const double* tab, const int* idx) +{ + return v_float64x2(tab[idx[0]], tab[idx[1]]); +} +inline v_float64x2 v_lut_pairs(const double* tab, const int* idx) +{ + return v_float64x2(wasm_v128_load(tab + idx[0])); +} + +inline v_int32x4 v_lut(const int* tab, const v_int32x4& idxvec) +{ + return v_int32x4(tab[wasm_i32x4_extract_lane(idxvec.val, 0)], + tab[wasm_i32x4_extract_lane(idxvec.val, 1)], + tab[wasm_i32x4_extract_lane(idxvec.val, 2)], + tab[wasm_i32x4_extract_lane(idxvec.val, 3)]); +} + +inline v_uint32x4 v_lut(const unsigned* tab, const v_int32x4& idxvec) +{ + return v_reinterpret_as_u32(v_lut((const int *)tab, idxvec)); +} + +inline v_float32x4 v_lut(const float* tab, const v_int32x4& idxvec) +{ + return 
v_float32x4(tab[wasm_i32x4_extract_lane(idxvec.val, 0)],
+                      tab[wasm_i32x4_extract_lane(idxvec.val, 1)],
+                      tab[wasm_i32x4_extract_lane(idxvec.val, 2)],
+                      tab[wasm_i32x4_extract_lane(idxvec.val, 3)]);
+}
+
+inline v_float64x2 v_lut(const double* tab, const v_int32x4& idxvec)
+{
+    return v_float64x2(tab[wasm_i32x4_extract_lane(idxvec.val, 0)],
+                       tab[wasm_i32x4_extract_lane(idxvec.val, 1)]);
+}
+
+// loads pairs from the table and deinterleaves them, e.g. returns:
+//   x = (tab[idxvec[0]], tab[idxvec[1]], tab[idxvec[2]], tab[idxvec[3]]),
+//   y = (tab[idxvec[0]+1], tab[idxvec[1]+1], tab[idxvec[2]+1], tab[idxvec[3]+1])
+// note that the indices are float's indices, not the float-pair indices.
+// in theory, this function can be used to implement bilinear interpolation,
+// when idxvec are the offsets within the image
+// (see the illustrative sketch after v_pack_triplets(v_int8x16) below).
+inline void v_lut_deinterleave(const float* tab, const v_int32x4& idxvec, v_float32x4& x, v_float32x4& y)
+{
+    x = v_float32x4(tab[wasm_i32x4_extract_lane(idxvec.val, 0)],
+                    tab[wasm_i32x4_extract_lane(idxvec.val, 1)],
+                    tab[wasm_i32x4_extract_lane(idxvec.val, 2)],
+                    tab[wasm_i32x4_extract_lane(idxvec.val, 3)]);
+    y = v_float32x4(tab[wasm_i32x4_extract_lane(idxvec.val, 0)+1],
+                    tab[wasm_i32x4_extract_lane(idxvec.val, 1)+1],
+                    tab[wasm_i32x4_extract_lane(idxvec.val, 2)+1],
+                    tab[wasm_i32x4_extract_lane(idxvec.val, 3)+1]);
+}
+
+inline void v_lut_deinterleave(const double* tab, const v_int32x4& idxvec, v_float64x2& x, v_float64x2& y)
+{
+    v128_t xy0 = wasm_v128_load(tab + wasm_i32x4_extract_lane(idxvec.val, 0));
+    v128_t xy1 = wasm_v128_load(tab + wasm_i32x4_extract_lane(idxvec.val, 1));
+    x.val = wasm_unpacklo_i64x2(xy0, xy1);
+    y.val = wasm_unpackhi_i64x2(xy0, xy1);
+}
+
+inline v_int8x16 v_interleave_pairs(const v_int8x16& vec)
+{
+    return v_int8x16(wasm_v8x16_shuffle(vec.val, vec.val, 0,2,1,3,4,6,5,7,8,10,9,11,12,14,13,15));
+}
+inline v_uint8x16 v_interleave_pairs(const v_uint8x16& vec) { return v_reinterpret_as_u8(v_interleave_pairs(v_reinterpret_as_s8(vec))); }
+inline v_int8x16 v_interleave_quads(const v_int8x16& vec)
+{
+    return v_int8x16(wasm_v8x16_shuffle(vec.val, vec.val, 0,4,1,5,2,6,3,7,8,12,9,13,10,14,11,15));
+}
+inline v_uint8x16 v_interleave_quads(const v_uint8x16& vec) { return v_reinterpret_as_u8(v_interleave_quads(v_reinterpret_as_s8(vec))); }
+
+inline v_int16x8 v_interleave_pairs(const v_int16x8& vec)
+{
+    return v_int16x8(wasm_v8x16_shuffle(vec.val, vec.val, 0,1,4,5,2,3,6,7,8,9,12,13,10,11,14,15));
+}
+inline v_uint16x8 v_interleave_pairs(const v_uint16x8& vec) { return v_reinterpret_as_u16(v_interleave_pairs(v_reinterpret_as_s16(vec))); }
+inline v_int16x8 v_interleave_quads(const v_int16x8& vec)
+{
+    return v_int16x8(wasm_v8x16_shuffle(vec.val, vec.val, 0,1,8,9,2,3,10,11,4,5,12,13,6,7,14,15));
+}
+inline v_uint16x8 v_interleave_quads(const v_uint16x8& vec) { return v_reinterpret_as_u16(v_interleave_quads(v_reinterpret_as_s16(vec))); }
+
+inline v_int32x4 v_interleave_pairs(const v_int32x4& vec)
+{
+    return v_int32x4(wasm_v8x16_shuffle(vec.val, vec.val, 0,1,2,3,8,9,10,11,4,5,6,7,12,13,14,15));
+}
+inline v_uint32x4 v_interleave_pairs(const v_uint32x4& vec) { return v_reinterpret_as_u32(v_interleave_pairs(v_reinterpret_as_s32(vec))); }
+inline v_float32x4 v_interleave_pairs(const v_float32x4& vec)
+{
+    return v_float32x4(wasm_v8x16_shuffle(vec.val, vec.val, 0,1,2,3,8,9,10,11,4,5,6,7,12,13,14,15));
+}
+
+inline v_int8x16 v_pack_triplets(const v_int8x16& vec)
+{
+    return v_int8x16(wasm_v8x16_shuffle(vec.val, vec.val, 0,1,2,4,5,6,8,9,10,12,13,14,16,16,16,16));
+}
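The bilinear-interpolation remark above v_lut_deinterleave is easier to see with a concrete caller. The sketch below is editorial and illustrative only, not part of the upstream header; it assumes the v_float32x4 arithmetic operators defined earlier in this file, and the names interp_row_sketch, src, ofs and wx are hypothetical.

    // Illustrative sketch (not upstream code): horizontal linear interpolation for
    // four pixels at once. 'ofs' holds the element offsets of the left neighbours in
    // 'src'; 'wx' holds the fractional x weights in [0, 1).
    static inline v_float32x4 interp_row_sketch(const float* src, const v_int32x4& ofs, const v_float32x4& wx)
    {
        v_float32x4 p0, p1;
        v_lut_deinterleave(src, ofs, p0, p1);   // p0[i] = src[ofs[i]], p1[i] = src[ofs[i] + 1]
        return p0 + (p1 - p0) * wx;             // lerp between the two horizontal neighbours
    }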
+inline v_uint8x16 v_pack_triplets(const v_uint8x16& vec) { return v_reinterpret_as_u8(v_pack_triplets(v_reinterpret_as_s8(vec))); }
+
+inline v_int16x8 v_pack_triplets(const v_int16x8& vec)
+{
+    return v_int16x8(wasm_v8x16_shuffle(vec.val, vec.val, 0,1,2,3,4,5,8,9,10,11,12,13,14,15,6,7));
+}
+inline v_uint16x8 v_pack_triplets(const v_uint16x8& vec) { return v_reinterpret_as_u16(v_pack_triplets(v_reinterpret_as_s16(vec))); }
+
+inline v_int32x4 v_pack_triplets(const v_int32x4& vec) { return vec; }
+inline v_uint32x4 v_pack_triplets(const v_uint32x4& vec) { return vec; }
+inline v_float32x4 v_pack_triplets(const v_float32x4& vec) { return vec; }
+
+template<int i, typename _Tp>
+inline typename _Tp::lane_type v_extract_n(const _Tp& a)
+{
+    return v_rotate_right<i>(a).get0();
+}
+
+template<int i>
+inline v_uint32x4 v_broadcast_element(const v_uint32x4& a)
+{
+    return v_setall_u32(v_extract_n<i>(a));
+}
+template<int i>
+inline v_int32x4 v_broadcast_element(const v_int32x4& a)
+{
+    return v_setall_s32(v_extract_n<i>(a));
+}
+template<int i>
+inline v_float32x4 v_broadcast_element(const v_float32x4& a)
+{
+    return v_setall_f32(v_extract_n<i>(a));
+}
+
+
+////////////// FP16 support ///////////////////////////
+
+inline v_float32x4 v_load_expand(const float16_t* ptr)
+{
+    float a[4];
+    for (int i = 0; i < 4; i++)
+        a[i] = ptr[i];
+    return v_float32x4(wasm_v128_load(a));
+}
+
+inline void v_pack_store(float16_t* ptr, const v_float32x4& v)
+{
+    float v_[4];
+    wasm_v128_store(v_, v.val);
+    ptr[0] = float16_t(v_[0]);
+    ptr[1] = float16_t(v_[1]);
+    ptr[2] = float16_t(v_[2]);
+    ptr[3] = float16_t(v_[3]);
+}
+
+inline void v_cleanup() {}
+
+CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END
+
+//! @endcond
+
+}
+
+#endif
diff --git a/3rdparty/opencv/include/opencv2/core/hal/msa_macros.h b/3rdparty/opencv/include/opencv2/core/hal/msa_macros.h
new file mode 100644
index 00000000..bd6ddb12
--- /dev/null
+++ b/3rdparty/opencv/include/opencv2/core/hal/msa_macros.h
@@ -0,0 +1,1558 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_CORE_HAL_MSA_MACROS_H
+#define OPENCV_CORE_HAL_MSA_MACROS_H
+
+#ifdef __mips_msa
+#include "msa.h"
+#include <stdint.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* Define 64 bits vector types */
+typedef signed char v8i8 __attribute__ ((vector_size(8), aligned(8)));
+typedef unsigned char v8u8 __attribute__ ((vector_size(8), aligned(8)));
+typedef short v4i16 __attribute__ ((vector_size(8), aligned(8)));
+typedef unsigned short v4u16 __attribute__ ((vector_size(8), aligned(8)));
+typedef int v2i32 __attribute__ ((vector_size(8), aligned(8)));
+typedef unsigned int v2u32 __attribute__ ((vector_size(8), aligned(8)));
+typedef long long v1i64 __attribute__ ((vector_size(8), aligned(8)));
+typedef unsigned long long v1u64 __attribute__ ((vector_size(8), aligned(8)));
+typedef float v2f32 __attribute__ ((vector_size(8), aligned(8)));
+typedef double v1f64 __attribute__ ((vector_size(8), aligned(8)));
+
+
+/* Load values from the given memory address to a 64-bit vector.
*/ +#define msa_ld1_s8(__a) (*((v8i8*)(__a))) +#define msa_ld1_s16(__a) (*((v4i16*)(__a))) +#define msa_ld1_s32(__a) (*((v2i32*)(__a))) +#define msa_ld1_s64(__a) (*((v1i64*)(__a))) +#define msa_ld1_u8(__a) (*((v8u8*)(__a))) +#define msa_ld1_u16(__a) (*((v4u16*)(__a))) +#define msa_ld1_u32(__a) (*((v2u32*)(__a))) +#define msa_ld1_u64(__a) (*((v1u64*)(__a))) +#define msa_ld1_f32(__a) (*((v2f32*)(__a))) +#define msa_ld1_f64(__a) (*((v1f64*)(__a))) + +/* Load values from the given memory address to a 128-bit vector */ +#define msa_ld1q_s8(__a) ((v16i8)__builtin_msa_ld_b(__a, 0)) +#define msa_ld1q_s16(__a) ((v8i16)__builtin_msa_ld_h(__a, 0)) +#define msa_ld1q_s32(__a) ((v4i32)__builtin_msa_ld_w(__a, 0)) +#define msa_ld1q_s64(__a) ((v2i64)__builtin_msa_ld_d(__a, 0)) +#define msa_ld1q_u8(__a) ((v16u8)__builtin_msa_ld_b(__a, 0)) +#define msa_ld1q_u16(__a) ((v8u16)__builtin_msa_ld_h(__a, 0)) +#define msa_ld1q_u32(__a) ((v4u32)__builtin_msa_ld_w(__a, 0)) +#define msa_ld1q_u64(__a) ((v2u64)__builtin_msa_ld_d(__a, 0)) +#define msa_ld1q_f32(__a) ((v4f32)__builtin_msa_ld_w(__a, 0)) +#define msa_ld1q_f64(__a) ((v2f64)__builtin_msa_ld_d(__a, 0)) + +/* Store 64bits vector elements values to the given memory address. */ +#define msa_st1_s8(__a, __b) (*((v8i8*)(__a)) = __b) +#define msa_st1_s16(__a, __b) (*((v4i16*)(__a)) = __b) +#define msa_st1_s32(__a, __b) (*((v2i32*)(__a)) = __b) +#define msa_st1_s64(__a, __b) (*((v1i64*)(__a)) = __b) +#define msa_st1_u8(__a, __b) (*((v8u8*)(__a)) = __b) +#define msa_st1_u16(__a, __b) (*((v4u16*)(__a)) = __b) +#define msa_st1_u32(__a, __b) (*((v2u32*)(__a)) = __b) +#define msa_st1_u64(__a, __b) (*((v1u64*)(__a)) = __b) +#define msa_st1_f32(__a, __b) (*((v2f32*)(__a)) = __b) +#define msa_st1_f64(__a, __b) (*((v1f64*)(__a)) = __b) + +/* Store the values of elements in the 128 bits vector __a to the given memory address __a. */ +#define msa_st1q_s8(__a, __b) (__builtin_msa_st_b((v16i8)(__b), __a, 0)) +#define msa_st1q_s16(__a, __b) (__builtin_msa_st_h((v8i16)(__b), __a, 0)) +#define msa_st1q_s32(__a, __b) (__builtin_msa_st_w((v4i32)(__b), __a, 0)) +#define msa_st1q_s64(__a, __b) (__builtin_msa_st_d((v2i64)(__b), __a, 0)) +#define msa_st1q_u8(__a, __b) (__builtin_msa_st_b((v16i8)(__b), __a, 0)) +#define msa_st1q_u16(__a, __b) (__builtin_msa_st_h((v8i16)(__b), __a, 0)) +#define msa_st1q_u32(__a, __b) (__builtin_msa_st_w((v4i32)(__b), __a, 0)) +#define msa_st1q_u64(__a, __b) (__builtin_msa_st_d((v2i64)(__b), __a, 0)) +#define msa_st1q_f32(__a, __b) (__builtin_msa_st_w((v4i32)(__b), __a, 0)) +#define msa_st1q_f64(__a, __b) (__builtin_msa_st_d((v2i64)(__b), __a, 0)) + +/* Store the value of the element with the index __c in vector __a to the given memory address __a. 
*/ +#define msa_st1_lane_s8(__a, __b, __c) (*((int8_t*)(__a)) = __b[__c]) +#define msa_st1_lane_s16(__a, __b, __c) (*((int16_t*)(__a)) = __b[__c]) +#define msa_st1_lane_s32(__a, __b, __c) (*((int32_t*)(__a)) = __b[__c]) +#define msa_st1_lane_s64(__a, __b, __c) (*((int64_t*)(__a)) = __b[__c]) +#define msa_st1_lane_u8(__a, __b, __c) (*((uint8_t*)(__a)) = __b[__c]) +#define msa_st1_lane_u16(__a, __b, __c) (*((uint16_t*)(__a)) = __b[__c]) +#define msa_st1_lane_u32(__a, __b, __c) (*((uint32_t*)(__a)) = __b[__c]) +#define msa_st1_lane_u64(__a, __b, __c) (*((uint64_t*)(__a)) = __b[__c]) +#define msa_st1_lane_f32(__a, __b, __c) (*((float*)(__a)) = __b[__c]) +#define msa_st1_lane_f64(__a, __b, __c) (*((double*)(__a)) = __b[__c]) +#define msa_st1q_lane_s8(__a, __b, __c) (*((int8_t*)(__a)) = (int8_t)__builtin_msa_copy_s_b(__b, __c)) +#define msa_st1q_lane_s16(__a, __b, __c) (*((int16_t*)(__a)) = (int16_t)__builtin_msa_copy_s_h(__b, __c)) +#define msa_st1q_lane_s32(__a, __b, __c) (*((int32_t*)(__a)) = __builtin_msa_copy_s_w(__b, __c)) +#define msa_st1q_lane_s64(__a, __b, __c) (*((int64_t*)(__a)) = __builtin_msa_copy_s_d(__b, __c)) +#define msa_st1q_lane_u8(__a, __b, __c) (*((uint8_t*)(__a)) = (uint8_t)__builtin_msa_copy_u_b((v16i8)(__b), __c)) +#define msa_st1q_lane_u16(__a, __b, __c) (*((uint16_t*)(__a)) = (uint16_t)__builtin_msa_copy_u_h((v8i16)(__b), __c)) +#define msa_st1q_lane_u32(__a, __b, __c) (*((uint32_t*)(__a)) = __builtin_msa_copy_u_w((v4i32)(__b), __c)) +#define msa_st1q_lane_u64(__a, __b, __c) (*((uint64_t*)(__a)) = __builtin_msa_copy_u_d((v2i64)(__b), __c)) +#define msa_st1q_lane_f32(__a, __b, __c) (*((float*)(__a)) = __b[__c]) +#define msa_st1q_lane_f64(__a, __b, __c) (*((double*)(__a)) = __b[__c]) + +/* Duplicate elements for 64-bit doubleword vectors */ +#define msa_dup_n_s8(__a) ((v8i8)__builtin_msa_copy_s_d((v2i64)__builtin_msa_fill_b((int32_t)(__a)), 0)) +#define msa_dup_n_s16(__a) ((v4i16)__builtin_msa_copy_s_d((v2i64)__builtin_msa_fill_h((int32_t)(__a)), 0)) +#define msa_dup_n_s32(__a) ((v2i32){__a, __a}) +#define msa_dup_n_s64(__a) ((v1i64){__a}) +#define msa_dup_n_u8(__a) ((v8u8)__builtin_msa_copy_u_d((v2i64)__builtin_msa_fill_b((int32_t)(__a)), 0)) +#define msa_dup_n_u16(__a) ((v4u16)__builtin_msa_copy_u_d((v2i64)__builtin_msa_fill_h((int32_t)(__a)), 0)) +#define msa_dup_n_u32(__a) ((v2u32){__a, __a}) +#define msa_dup_n_u64(__a) ((v1u64){__a}) +#define msa_dup_n_f32(__a) ((v2f32){__a, __a}) +#define msa_dup_n_f64(__a) ((v1f64){__a}) + +/* Duplicate elements for 128-bit quadword vectors */ +#define msa_dupq_n_s8(__a) (__builtin_msa_fill_b((int32_t)(__a))) +#define msa_dupq_n_s16(__a) (__builtin_msa_fill_h((int32_t)(__a))) +#define msa_dupq_n_s32(__a) (__builtin_msa_fill_w((int32_t)(__a))) +#define msa_dupq_n_s64(__a) (__builtin_msa_fill_d((int64_t)(__a))) +#define msa_dupq_n_u8(__a) ((v16u8)__builtin_msa_fill_b((int32_t)(__a))) +#define msa_dupq_n_u16(__a) ((v8u16)__builtin_msa_fill_h((int32_t)(__a))) +#define msa_dupq_n_u32(__a) ((v4u32)__builtin_msa_fill_w((int32_t)(__a))) +#define msa_dupq_n_u64(__a) ((v2u64)__builtin_msa_fill_d((int64_t)(__a))) +#define msa_dupq_n_f32(__a) ((v4f32){__a, __a, __a, __a}) +#define msa_dupq_n_f64(__a) ((v2f64){__a, __a}) +#define msa_dupq_lane_s8(__a, __b) (__builtin_msa_splat_b(__a, __b)) +#define msa_dupq_lane_s16(__a, __b) (__builtin_msa_splat_h(__a, __b)) +#define msa_dupq_lane_s32(__a, __b) (__builtin_msa_splat_w(__a, __b)) +#define msa_dupq_lane_s64(__a, __b) (__builtin_msa_splat_d(__a, __b)) +#define msa_dupq_lane_u8(__a, __b) 
((v16u8)__builtin_msa_splat_b((v16i8)(__a), __b))
+#define msa_dupq_lane_u16(__a, __b) ((v8u16)__builtin_msa_splat_h((v8i16)(__a), __b))
+#define msa_dupq_lane_u32(__a, __b) ((v4u32)__builtin_msa_splat_w((v4i32)(__a), __b))
+#define msa_dupq_lane_u64(__a, __b) ((v2u64)__builtin_msa_splat_d((v2i64)(__a), __b))
+
+/* Create a 64 bits vector */
+#define msa_create_s8(__a)  ((v8i8)((uint64_t)(__a)))
+#define msa_create_s16(__a) ((v4i16)((uint64_t)(__a)))
+#define msa_create_s32(__a) ((v2i32)((uint64_t)(__a)))
+#define msa_create_s64(__a) ((v1i64)((uint64_t)(__a)))
+#define msa_create_u8(__a)  ((v8u8)((uint64_t)(__a)))
+#define msa_create_u16(__a) ((v4u16)((uint64_t)(__a)))
+#define msa_create_u32(__a) ((v2u32)((uint64_t)(__a)))
+#define msa_create_u64(__a) ((v1u64)((uint64_t)(__a)))
+#define msa_create_f32(__a) ((v2f32)((uint64_t)(__a)))
+#define msa_create_f64(__a) ((v1f64)((uint64_t)(__a)))
+
+/* Sign extends or zero extends each element in a 64 bits vector to twice its original length, and places the results in a 128 bits vector. */
+/*Transform v8i8 to v8i16*/
+#define msa_movl_s8(__a) \
+((v8i16){(__a)[0], (__a)[1], (__a)[2], (__a)[3], \
+         (__a)[4], (__a)[5], (__a)[6], (__a)[7]})
+
+/*Transform v8u8 to v8u16*/
+#define msa_movl_u8(__a) \
+((v8u16){(__a)[0], (__a)[1], (__a)[2], (__a)[3], \
+         (__a)[4], (__a)[5], (__a)[6], (__a)[7]})
+
+/*Transform v4i16 to v4i32*/
+#define msa_movl_s16(__a) ((v4i32){(__a)[0], (__a)[1], (__a)[2], (__a)[3]})
+
+/*Transform v2i32 to v2i64*/
+#define msa_movl_s32(__a) ((v2i64){(__a)[0], (__a)[1]})
+
+/*Transform v4u16 to v4u32*/
+#define msa_movl_u16(__a) ((v4u32){(__a)[0], (__a)[1], (__a)[2], (__a)[3]})
+
+/*Transform v2u32 to v2u64*/
+#define msa_movl_u32(__a) ((v2u64){(__a)[0], (__a)[1]})
+
+/* Copies the least significant half of each element of a 128 bits vector into the corresponding elements of a 64 bits vector.
*/ +#define msa_movn_s16(__a) \ +({ \ + v16i8 __d = __builtin_msa_pckev_b(__builtin_msa_fill_b(0), (v16i8)(__a)); \ + (v8i8)__builtin_msa_copy_s_d((v2i64)__d, 0); \ +}) + +#define msa_movn_s32(__a) \ +({ \ + v8i16 __d = __builtin_msa_pckev_h(__builtin_msa_fill_h(0), (v8i16)(__a)); \ + (v4i16)__builtin_msa_copy_s_d((v2i64)__d, 0); \ +}) + +#define msa_movn_s64(__a) \ +({ \ + v4i32 __d = __builtin_msa_pckev_w(__builtin_msa_fill_w(0), (v4i32)(__a)); \ + (v2i32)__builtin_msa_copy_s_d((v2i64)__d, 0); \ +}) + +#define msa_movn_u16(__a) \ +({ \ + v16i8 __d = __builtin_msa_pckev_b(__builtin_msa_fill_b(0), (v16i8)(__a)); \ + (v8u8)__builtin_msa_copy_u_d((v2i64)__d, 0); \ +}) + +#define msa_movn_u32(__a) \ +({ \ + v8i16 __d = __builtin_msa_pckev_h(__builtin_msa_fill_h(0), (v8i16)(__a)); \ + (v4u16)__builtin_msa_copy_u_d((v2i64)__d, 0); \ +}) + +#define msa_movn_u64(__a) \ +({ \ + v4i32 __d = __builtin_msa_pckev_w(__builtin_msa_fill_w(0), (v4i32)(__a)); \ + (v2u32)__builtin_msa_copy_u_d((v2i64)__d, 0); \ +}) + +/* qmovn */ +#define msa_qmovn_s16(__a) \ +({ \ + v16i8 __d = __builtin_msa_pckev_b(__builtin_msa_fill_b(0), (v16i8)__builtin_msa_sat_s_h((v8i16)(__a), 7)); \ + (v8i8)__builtin_msa_copy_s_d((v2i64)__d, 0); \ +}) + +#define msa_qmovn_s32(__a) \ +({ \ + v8i16 __d = __builtin_msa_pckev_h(__builtin_msa_fill_h(0), (v8i16)__builtin_msa_sat_s_w((v4i32)(__a), 15)); \ + (v4i16)__builtin_msa_copy_s_d((v2i64)__d, 0); \ +}) + +#define msa_qmovn_s64(__a) \ +({ \ + v4i32 __d = __builtin_msa_pckev_w(__builtin_msa_fill_w(0), (v4i32)__builtin_msa_sat_s_d((v2i64)(__a), 31)); \ + (v2i32)__builtin_msa_copy_s_d((v2i64)__d, 0); \ +}) + +#define msa_qmovn_u16(__a) \ +({ \ + v16i8 __d = __builtin_msa_pckev_b(__builtin_msa_fill_b(0), (v16i8)__builtin_msa_sat_u_h((v8u16)(__a), 7)); \ + (v8u8)__builtin_msa_copy_u_d((v2i64)__d, 0); \ +}) + +#define msa_qmovn_u32(__a) \ +({ \ + v8i16 __d = __builtin_msa_pckev_h(__builtin_msa_fill_h(0), (v8i16)__builtin_msa_sat_u_w((v4u32)(__a), 15)); \ + (v4u16)__builtin_msa_copy_u_d((v2i64)__d, 0); \ +}) + +#define msa_qmovn_u64(__a) \ +({ \ + v4i32 __d = __builtin_msa_pckev_w(__builtin_msa_fill_w(0), (v4i32)__builtin_msa_sat_u_d((v2u64)(__a), 31)); \ + (v2u32)__builtin_msa_copy_u_d((v2i64)__d, 0); \ +}) + +/* qmovun */ +#define msa_qmovun_s16(__a) \ +({ \ + v8i16 __d = __builtin_msa_max_s_h(__builtin_msa_fill_h(0), (v8i16)(__a)); \ + v16i8 __e = __builtin_msa_pckev_b(__builtin_msa_fill_b(0), (v16i8)__builtin_msa_sat_u_h((v8u16)__d, 7)); \ + (v8u8)__builtin_msa_copy_u_d((v2i64)__e, 0); \ +}) + +#define msa_qmovun_s32(__a) \ +({ \ + v4i32 __d = __builtin_msa_max_s_w(__builtin_msa_fill_w(0), (v4i32)(__a)); \ + v8i16 __e = __builtin_msa_pckev_h(__builtin_msa_fill_h(0), (v8i16)__builtin_msa_sat_u_w((v4u32)__d, 15)); \ + (v4u16)__builtin_msa_copy_u_d((v2i64)__e, 0); \ +}) + +#define msa_qmovun_s64(__a) \ +({ \ + v2i64 __d = __builtin_msa_max_s_d(__builtin_msa_fill_d(0), (v2i64)(__a)); \ + v4i32 __e = __builtin_msa_pckev_w(__builtin_msa_fill_w(0), (v4i32)__builtin_msa_sat_u_d((v2u64)__d, 31)); \ + (v2u32)__builtin_msa_copy_u_d((v2i64)__e, 0); \ +}) + +/* Right shift elements in a 128 bits vector by an immediate value, and places the results in a 64 bits vector. 
*/ +#define msa_shrn_n_s16(__a, __b) \ +({ \ + v16i8 __d = __builtin_msa_pckev_b(__builtin_msa_fill_b(0), (v16i8)__builtin_msa_srai_h((v8i16)(__a), (int)(__b))); \ + (v8i8)__builtin_msa_copy_s_d((v2i64)__d, 0); \ +}) + +#define msa_shrn_n_s32(__a, __b) \ +({ \ + v8i16 __d = __builtin_msa_pckev_h(__builtin_msa_fill_h(0), (v8i16)__builtin_msa_srai_w((v4i32)(__a), (int)(__b))); \ + (v4i16)__builtin_msa_copy_s_d((v2i64)__d, 0); \ +}) + +#define msa_shrn_n_s64(__a, __b) \ +({ \ + v4i32 __d = __builtin_msa_pckev_w(__builtin_msa_fill_w(0), (v4i32)__builtin_msa_srai_d((v2i64)(__a), (int)(__b))); \ + (v2i32)__builtin_msa_copy_s_d((v2i64)__d, 0); \ +}) + +#define msa_shrn_n_u16(__a, __b) \ +({ \ + v16i8 __d = __builtin_msa_pckev_b(__builtin_msa_fill_b(0), (v16i8)__builtin_msa_srli_h((v8i16)(__a), (int)(__b))); \ + (v8u8)__builtin_msa_copy_u_d((v2i64)__d, 0); \ +}) + +#define msa_shrn_n_u32(__a, __b) \ +({ \ + v8i16 __d = __builtin_msa_pckev_h(__builtin_msa_fill_h(0), (v8i16)__builtin_msa_srli_w((v4i32)(__a), (int)(__b))); \ + (v4u16)__builtin_msa_copy_u_d((v2i64)__d, 0); \ +}) + +#define msa_shrn_n_u64(__a, __b) \ +({ \ + v4i32 __d = __builtin_msa_pckev_w(__builtin_msa_fill_w(0), (v4i32)__builtin_msa_srli_d((v2i64)(__a), (int)(__b))); \ + (v2u32)__builtin_msa_copy_u_d((v2i64)__d, 0); \ +}) + +/* Right shift elements in a 128 bits vector by an immediate value, and places the results in a 64 bits vector. */ +#define msa_rshrn_n_s16(__a, __b) \ +({ \ + v16i8 __d = __builtin_msa_pckev_b(__builtin_msa_fill_b(0), (v16i8)__builtin_msa_srari_h((v8i16)(__a), (int)__b)); \ + (v8i8)__builtin_msa_copy_s_d((v2i64)__d, 0); \ +}) + +#define msa_rshrn_n_s32(__a, __b) \ +({ \ + v8i16 __d = __builtin_msa_pckev_h(__builtin_msa_fill_h(0), (v8i16)__builtin_msa_srari_w((v4i32)(__a), (int)__b)); \ + (v4i16)__builtin_msa_copy_s_d((v2i64)__d, 0); \ +}) + +#define msa_rshrn_n_s64(__a, __b) \ +({ \ + v4i32 __d = __builtin_msa_pckev_w(__builtin_msa_fill_w(0), (v4i32)__builtin_msa_srari_d((v2i64)(__a), (int)__b)); \ + (v2i32)__builtin_msa_copy_s_d((v2i64)__d, 0); \ +}) + +#define msa_rshrn_n_u16(__a, __b) \ +({ \ + v16i8 __d = __builtin_msa_pckev_b(__builtin_msa_fill_b(0), (v16i8)__builtin_msa_srlri_h((v8i16)(__a), (int)__b)); \ + (v8u8)__builtin_msa_copy_u_d((v2i64)__d, 0); \ +}) + +#define msa_rshrn_n_u32(__a, __b) \ +({ \ + v8i16 __d = __builtin_msa_pckev_h(__builtin_msa_fill_h(0), (v8i16)__builtin_msa_srlri_w((v4i32)(__a), (int)__b)); \ + (v4u16)__builtin_msa_copy_u_d((v2i64)__d, 0); \ +}) + +#define msa_rshrn_n_u64(__a, __b) \ +({ \ + v4i32 __d = __builtin_msa_pckev_w(__builtin_msa_fill_w(0), (v4i32)__builtin_msa_srlri_d((v2i64)(__a), (int)__b)); \ + (v2u32)__builtin_msa_copy_u_d((v2i64)__d, 0); \ +}) + +/* Right shift elements in a 128 bits vector by an immediate value, saturate the results and them in a 64 bits vector. 
*/ +#define msa_qrshrn_n_s16(__a, __b) \ +({ \ + v8i16 __d = __builtin_msa_sat_s_h(__builtin_msa_srari_h((v8i16)(__a), (int)(__b)), 7); \ + v16i8 __e = __builtin_msa_pckev_b(__builtin_msa_fill_b(0), (v16i8)__d); \ + (v8i8)__builtin_msa_copy_s_d((v2i64)__e, 0); \ +}) + +#define msa_qrshrn_n_s32(__a, __b) \ +({ \ + v4i32 __d = __builtin_msa_sat_s_w(__builtin_msa_srari_w((v4i32)(__a), (int)(__b)), 15); \ + v8i16 __e = __builtin_msa_pckev_h(__builtin_msa_fill_h(0), (v8i16)__d); \ + (v4i16)__builtin_msa_copy_s_d((v2i64)__e, 0); \ +}) + +#define msa_qrshrn_n_s64(__a, __b) \ +({ \ + v2i64 __d = __builtin_msa_sat_s_d(__builtin_msa_srari_d((v2i64)(__a), (int)(__b)), 31); \ + v4i32 __e = __builtin_msa_pckev_w(__builtin_msa_fill_w(0), (v4i32)__d); \ + (v2i32)__builtin_msa_copy_s_d((v2i64)__e, 0); \ +}) + +#define msa_qrshrn_n_u16(__a, __b) \ +({ \ + v8u16 __d = __builtin_msa_sat_u_h((v8u16)__builtin_msa_srlri_h((v8i16)(__a), (int)(__b)), 7); \ + v16i8 __e = __builtin_msa_pckev_b(__builtin_msa_fill_b(0), (v16i8)__d); \ + (v8u8)__builtin_msa_copy_u_d((v2i64)__e, 0); \ +}) + +#define msa_qrshrn_n_u32(__a, __b) \ +({ \ + v4u32 __d = __builtin_msa_sat_u_w((v4u32)__builtin_msa_srlri_w((v4i32)(__a), (int)(__b)), 15); \ + v8i16 __e = __builtin_msa_pckev_h(__builtin_msa_fill_h(0), (v8i16)__d); \ + (v4u16)__builtin_msa_copy_u_d((v2i64)__e, 0); \ +}) + +#define msa_qrshrn_n_u64(__a, __b) \ +({ \ + v2u64 __d = __builtin_msa_sat_u_d((v2u64)__builtin_msa_srlri_d((v2i64)(__a), (int)(__b)), 31); \ + v4i32 __e = __builtin_msa_pckev_w(__builtin_msa_fill_w(0), (v4i32)__d); \ + (v2u32)__builtin_msa_copy_u_d((v2i64)__e, 0); \ +}) + +/* Right shift elements in a 128 bits vector by an immediate value, saturate the results and them in a 64 bits vector. + Input is signed and output is unsigned. 
*/ +#define msa_qrshrun_n_s16(__a, __b) \ +({ \ + v8i16 __d = __builtin_msa_srlri_h(__builtin_msa_max_s_h(__builtin_msa_fill_h(0), (v8i16)(__a)), (int)(__b)); \ + v16i8 __e = __builtin_msa_pckev_b(__builtin_msa_fill_b(0), (v16i8)__builtin_msa_sat_u_h((v8u16)__d, 7)); \ + (v8u8)__builtin_msa_copy_u_d((v2i64)__e, 0); \ +}) + +#define msa_qrshrun_n_s32(__a, __b) \ +({ \ + v4i32 __d = __builtin_msa_srlri_w(__builtin_msa_max_s_w(__builtin_msa_fill_w(0), (v4i32)(__a)), (int)(__b)); \ + v8i16 __e = __builtin_msa_pckev_h(__builtin_msa_fill_h(0), (v8i16)__builtin_msa_sat_u_w((v4u32)__d, 15)); \ + (v4u16)__builtin_msa_copy_u_d((v2i64)__e, 0); \ +}) + +#define msa_qrshrun_n_s64(__a, __b) \ +({ \ + v2i64 __d = __builtin_msa_srlri_d(__builtin_msa_max_s_d(__builtin_msa_fill_d(0), (v2i64)(__a)), (int)(__b)); \ + v4i32 __e = __builtin_msa_pckev_w(__builtin_msa_fill_w(0), (v4i32)__builtin_msa_sat_u_d((v2u64)__d, 31)); \ + (v2u32)__builtin_msa_copy_u_d((v2i64)__e, 0); \ +}) + +/* pack */ +#define msa_pack_s16(__a, __b) (__builtin_msa_pckev_b((v16i8)(__b), (v16i8)(__a))) +#define msa_pack_s32(__a, __b) (__builtin_msa_pckev_h((v8i16)(__b), (v8i16)(__a))) +#define msa_pack_s64(__a, __b) (__builtin_msa_pckev_w((v4i32)(__b), (v4i32)(__a))) +#define msa_pack_u16(__a, __b) ((v16u8)__builtin_msa_pckev_b((v16i8)(__b), (v16i8)(__a))) +#define msa_pack_u32(__a, __b) ((v8u16)__builtin_msa_pckev_h((v8i16)(__b), (v8i16)(__a))) +#define msa_pack_u64(__a, __b) ((v4u32)__builtin_msa_pckev_w((v4i32)(__b), (v4i32)(__a))) + +/* qpack */ +#define msa_qpack_s16(__a, __b) \ +(__builtin_msa_pckev_b((v16i8)__builtin_msa_sat_s_h((v8i16)(__b), 7), (v16i8)__builtin_msa_sat_s_h((v8i16)(__a), 7))) +#define msa_qpack_s32(__a, __b) \ +(__builtin_msa_pckev_h((v8i16)__builtin_msa_sat_s_w((v4i32)(__b), 15), (v8i16)__builtin_msa_sat_s_w((v4i32)(__a), 15))) +#define msa_qpack_s64(__a, __b) \ +(__builtin_msa_pckev_w((v4i32)__builtin_msa_sat_s_d((v2i64)(__b), 31), (v4i32)__builtin_msa_sat_s_d((v2i64)(__a), 31))) +#define msa_qpack_u16(__a, __b) \ +((v16u8)__builtin_msa_pckev_b((v16i8)__builtin_msa_sat_u_h((v8u16)(__b), 7), (v16i8)__builtin_msa_sat_u_h((v8u16)(__a), 7))) +#define msa_qpack_u32(__a, __b) \ +((v8u16)__builtin_msa_pckev_h((v8i16)__builtin_msa_sat_u_w((v4u32)(__b), 15), (v8i16)__builtin_msa_sat_u_w((v4u32)(__a), 15))) +#define msa_qpack_u64(__a, __b) \ +((v4u32)__builtin_msa_pckev_w((v4i32)__builtin_msa_sat_u_d((v2u64)(__b), 31), (v4i32)__builtin_msa_sat_u_d((v2u64)(__a), 31))) + +/* qpacku */ +#define msa_qpacku_s16(__a, __b) \ +((v16u8)__builtin_msa_pckev_b((v16i8)__builtin_msa_sat_u_h((v8u16)(__builtin_msa_max_s_h(__builtin_msa_fill_h(0), (v8i16)(__b))), 7), \ + (v16i8)__builtin_msa_sat_u_h((v8u16)(__builtin_msa_max_s_h(__builtin_msa_fill_h(0), (v8i16)(__a))), 7))) +#define msa_qpacku_s32(__a, __b) \ +((v8u16)__builtin_msa_pckev_h((v8i16)__builtin_msa_sat_u_w((v4u32)(__builtin_msa_max_s_w(__builtin_msa_fill_w(0), (v4i32)(__b))), 15), \ + (v8i16)__builtin_msa_sat_u_w((v4u32)(__builtin_msa_max_s_w(__builtin_msa_fill_w(0), (v4i32)(__a))), 15))) +#define msa_qpacku_s64(__a, __b) \ +((v4u32)__builtin_msa_pckev_w((v4i32)__builtin_msa_sat_u_d((v2u64)(__builtin_msa_max_s_d(__builtin_msa_fill_d(0), (v2i64)(__b))), 31), \ + (v4i32)__builtin_msa_sat_u_d((v2u64)(__builtin_msa_max_s_d(__builtin_msa_fill_d(0), (v2i64)(__a))), 31))) + +/* packr */ +#define msa_packr_s16(__a, __b, __c) \ +(__builtin_msa_pckev_b((v16i8)__builtin_msa_srai_h((v8i16)(__b), (int)(__c)), (v16i8)__builtin_msa_srai_h((v8i16)(__a), (int)(__c)))) +#define 
msa_packr_s32(__a, __b, __c) \ +(__builtin_msa_pckev_h((v8i16)__builtin_msa_srai_w((v4i32)(__b), (int)(__c)), (v8i16)__builtin_msa_srai_w((v4i32)(__a), (int)(__c)))) +#define msa_packr_s64(__a, __b, __c) \ +(__builtin_msa_pckev_w((v4i32)__builtin_msa_srai_d((v2i64)(__b), (int)(__c)), (v4i32)__builtin_msa_srai_d((v2i64)(__a), (int)(__c)))) +#define msa_packr_u16(__a, __b, __c) \ +((v16u8)__builtin_msa_pckev_b((v16i8)__builtin_msa_srli_h((v8i16)(__b), (int)(__c)), (v16i8)__builtin_msa_srli_h((v8i16)(__a), (int)(__c)))) +#define msa_packr_u32(__a, __b, __c) \ +((v8u16)__builtin_msa_pckev_h((v8i16)__builtin_msa_srli_w((v4i32)(__b), (int)(__c)), (v8i16)__builtin_msa_srli_w((v4i32)(__a), (int)(__c)))) +#define msa_packr_u64(__a, __b, __c) \ +((v4u32)__builtin_msa_pckev_w((v4i32)__builtin_msa_srli_d((v2i64)(__b), (int)(__c)), (v4i32)__builtin_msa_srli_d((v2i64)(__a), (int)(__c)))) + +/* rpackr */ +#define msa_rpackr_s16(__a, __b, __c) \ +(__builtin_msa_pckev_b((v16i8)__builtin_msa_srari_h((v8i16)(__b), (int)(__c)), (v16i8)__builtin_msa_srari_h((v8i16)(__a), (int)(__c)))) +#define msa_rpackr_s32(__a, __b, __c) \ +(__builtin_msa_pckev_h((v8i16)__builtin_msa_srari_w((v4i32)(__b), (int)(__c)), (v8i16)__builtin_msa_srari_w((v4i32)(__a), (int)(__c)))) +#define msa_rpackr_s64(__a, __b, __c) \ +(__builtin_msa_pckev_w((v4i32)__builtin_msa_srari_d((v2i64)(__b), (int)(__c)), (v4i32)__builtin_msa_srari_d((v2i64)(__a), (int)(__c)))) +#define msa_rpackr_u16(__a, __b, __c) \ +((v16u8)__builtin_msa_pckev_b((v16i8)__builtin_msa_srlri_h((v8i16)(__b), (int)(__c)), (v16i8)__builtin_msa_srlri_h((v8i16)(__a), (int)(__c)))) +#define msa_rpackr_u32(__a, __b, __c) \ +((v8u16)__builtin_msa_pckev_h((v8i16)__builtin_msa_srlri_w((v4i32)(__b), (int)(__c)), (v8i16)__builtin_msa_srlri_w((v4i32)(__a), (int)(__c)))) +#define msa_rpackr_u64(__a, __b, __c) \ +((v4u32)__builtin_msa_pckev_w((v4i32)__builtin_msa_srlri_d((v2i64)(__b), (int)(__c)), (v4i32)__builtin_msa_srlri_d((v2i64)(__a), (int)(__c)))) + +/* qrpackr */ +#define msa_qrpackr_s16(__a, __b, __c) \ +(__builtin_msa_pckev_b((v16i8)__builtin_msa_sat_s_h(__builtin_msa_srari_h((v8i16)(__b), (int)(__c)), 7), \ + (v16i8)__builtin_msa_sat_s_h(__builtin_msa_srari_h((v8i16)(__a), (int)(__c)), 7))) +#define msa_qrpackr_s32(__a, __b, __c) \ +(__builtin_msa_pckev_h((v8i16)__builtin_msa_sat_s_w(__builtin_msa_srari_w((v4i32)(__b), (int)(__c)), 15), \ + (v8i16)__builtin_msa_sat_s_w(__builtin_msa_srari_w((v4i32)(__a), (int)(__c)), 15))) +#define msa_qrpackr_s64(__a, __b, __c) \ +(__builtin_msa_pckev_w((v4i32)__builtin_msa_sat_s_d(__builtin_msa_srari_d((v2i64)(__b), (int)(__c)), 31), \ + (v4i32)__builtin_msa_sat_s_d(__builtin_msa_srari_d((v2i64)(__a), (int)(__c)), 31))) +#define msa_qrpackr_u16(__a, __b, __c) \ +((v16u8)__builtin_msa_pckev_b((v16i8)__builtin_msa_sat_u_h((v8u16)__builtin_msa_srlri_h((v8i16)(__b), (int)(__c)), 7), \ + (v16i8)__builtin_msa_sat_u_h((v8u16)__builtin_msa_srlri_h((v8i16)(__a), (int)(__c)), 7))) +#define msa_qrpackr_u32(__a, __b, __c) \ +((v8u16)__builtin_msa_pckev_h((v8i16)__builtin_msa_sat_u_w((v4u32)__builtin_msa_srlri_w((v4i32)(__b), (int)(__c)), 15), \ + (v8i16)__builtin_msa_sat_u_w((v4u32)__builtin_msa_srlri_w((v4i32)(__a), (int)(__c)), 15))) +#define msa_qrpackr_u64(__a, __b, __c) \ +((v4u32)__builtin_msa_pckev_w((v4i32)__builtin_msa_sat_u_d((v2u64)__builtin_msa_srlri_d((v2i64)(__b), (int)(__c)), 31), \ + (v4i32)__builtin_msa_sat_u_d((v2u64)__builtin_msa_srlri_d((v2i64)(__a), (int)(__c)), 31))) + +/* qrpackru */ +#define msa_qrpackru_s16(__a, __b, __c) \ +({ 
\ + v8i16 __d = __builtin_msa_srlri_h(__builtin_msa_max_s_h(__builtin_msa_fill_h(0), (v8i16)(__a)), (int)(__c)); \ + v8i16 __e = __builtin_msa_srlri_h(__builtin_msa_max_s_h(__builtin_msa_fill_h(0), (v8i16)(__b)), (int)(__c)); \ + (v16u8)__builtin_msa_pckev_b((v16i8)__builtin_msa_sat_u_h((v8u16)__e, 7), (v16i8)__builtin_msa_sat_u_h((v8u16)__d, 7)); \ +}) + +#define msa_qrpackru_s32(__a, __b, __c) \ +({ \ + v4i32 __d = __builtin_msa_srlri_w(__builtin_msa_max_s_w(__builtin_msa_fill_w(0), (v4i32)(__a)), (int)(__c)); \ + v4i32 __e = __builtin_msa_srlri_w(__builtin_msa_max_s_w(__builtin_msa_fill_w(0), (v4i32)(__b)), (int)(__c)); \ + (v8u16)__builtin_msa_pckev_h((v8i16)__builtin_msa_sat_u_w((v4u32)__e, 15), (v8i16)__builtin_msa_sat_u_w((v4u32)__d, 15)); \ +}) + +#define msa_qrpackru_s64(__a, __b, __c) \ +({ \ + v2i64 __d = __builtin_msa_srlri_d(__builtin_msa_max_s_d(__builtin_msa_fill_d(0), (v2i64)(__a)), (int)(__c)); \ + v2i64 __e = __builtin_msa_srlri_d(__builtin_msa_max_s_d(__builtin_msa_fill_d(0), (v2i64)(__b)), (int)(__c)); \ + (v4u32)__builtin_msa_pckev_w((v4i32)__builtin_msa_sat_u_d((v2u64)__e, 31), (v4i32)__builtin_msa_sat_u_d((v2u64)__d, 31)); \ +}) + +/* Minimum values between corresponding elements in the two vectors are written to the returned vector. */ +#define msa_minq_s8(__a, __b) (__builtin_msa_min_s_b(__a, __b)) +#define msa_minq_s16(__a, __b) (__builtin_msa_min_s_h(__a, __b)) +#define msa_minq_s32(__a, __b) (__builtin_msa_min_s_w(__a, __b)) +#define msa_minq_s64(__a, __b) (__builtin_msa_min_s_d(__a, __b)) +#define msa_minq_u8(__a, __b) ((v16u8)__builtin_msa_min_u_b(__a, __b)) +#define msa_minq_u16(__a, __b) ((v8u16)__builtin_msa_min_u_h(__a, __b)) +#define msa_minq_u32(__a, __b) ((v4u32)__builtin_msa_min_u_w(__a, __b)) +#define msa_minq_u64(__a, __b) ((v2u64)__builtin_msa_min_u_d(__a, __b)) +#define msa_minq_f32(__a, __b) (__builtin_msa_fmin_w(__a, __b)) +#define msa_minq_f64(__a, __b) (__builtin_msa_fmin_d(__a, __b)) + +/* Maximum values between corresponding elements in the two vectors are written to the returned vector. */ +#define msa_maxq_s8(__a, __b) (__builtin_msa_max_s_b(__a, __b)) +#define msa_maxq_s16(__a, __b) (__builtin_msa_max_s_h(__a, __b)) +#define msa_maxq_s32(__a, __b) (__builtin_msa_max_s_w(__a, __b)) +#define msa_maxq_s64(__a, __b) (__builtin_msa_max_s_d(__a, __b)) +#define msa_maxq_u8(__a, __b) ((v16u8)__builtin_msa_max_u_b(__a, __b)) +#define msa_maxq_u16(__a, __b) ((v8u16)__builtin_msa_max_u_h(__a, __b)) +#define msa_maxq_u32(__a, __b) ((v4u32)__builtin_msa_max_u_w(__a, __b)) +#define msa_maxq_u64(__a, __b) ((v2u64)__builtin_msa_max_u_d(__a, __b)) +#define msa_maxq_f32(__a, __b) (__builtin_msa_fmax_w(__a, __b)) +#define msa_maxq_f64(__a, __b) (__builtin_msa_fmax_d(__a, __b)) + +/* Vector type reinterpretion */ +#define MSA_TPV_REINTERPRET(_Tpv, Vec) ((_Tpv)(Vec)) + +/* Add the odd elements in vector __a with the even elements in vector __b to double width elements in the returned vector. */ +/* v8i16 msa_hadd_s16 ((v16i8)__a, (v16i8)__b) */ +#define msa_hadd_s16(__a, __b) (__builtin_msa_hadd_s_h((v16i8)(__a), (v16i8)(__b))) +/* v4i32 msa_hadd_s32 ((v8i16)__a, (v8i16)__b) */ +#define msa_hadd_s32(__a, __b) (__builtin_msa_hadd_s_w((v8i16)(__a), (v8i16)(__b))) +/* v2i64 msa_hadd_s64 ((v4i32)__a, (v4i32)__b) */ +#define msa_hadd_s64(__a, __b) (__builtin_msa_hadd_s_d((v4i32)(__a), (v4i32)(__b))) + +/* Copy even elements in __a to the left half and even elements in __b to the right half and return the result vector. 
*/ +#define msa_pckev_s8(__a, __b) (__builtin_msa_pckev_b((v16i8)(__a), (v16i8)(__b))) +#define msa_pckev_s16(__a, __b) (__builtin_msa_pckev_h((v8i16)(__a), (v8i16)(__b))) +#define msa_pckev_s32(__a, __b) (__builtin_msa_pckev_w((v4i32)(__a), (v4i32)(__b))) +#define msa_pckev_s64(__a, __b) (__builtin_msa_pckev_d((v2i64)(__a), (v2i64)(__b))) + +/* Copy even elements in __a to the left half and even elements in __b to the right half and return the result vector. */ +#define msa_pckod_s8(__a, __b) (__builtin_msa_pckod_b((v16i8)(__a), (v16i8)(__b))) +#define msa_pckod_s16(__a, __b) (__builtin_msa_pckod_h((v8i16)(__a), (v8i16)(__b))) +#define msa_pckod_s32(__a, __b) (__builtin_msa_pckod_w((v4i32)(__a), (v4i32)(__b))) +#define msa_pckod_s64(__a, __b) (__builtin_msa_pckod_d((v2i64)(__a), (v2i64)(__b))) + +#ifdef _MIPSEB +#define LANE_IMM0_1(x) (0b1 - ((x) & 0b1)) +#define LANE_IMM0_3(x) (0b11 - ((x) & 0b11)) +#define LANE_IMM0_7(x) (0b111 - ((x) & 0b111)) +#define LANE_IMM0_15(x) (0b1111 - ((x) & 0b1111)) +#else +#define LANE_IMM0_1(x) ((x) & 0b1) +#define LANE_IMM0_3(x) ((x) & 0b11) +#define LANE_IMM0_7(x) ((x) & 0b111) +#define LANE_IMM0_15(x) ((x) & 0b1111) +#endif + +#define msa_get_lane_u8(__a, __b) ((uint8_t)(__a)[LANE_IMM0_7(__b)]) +#define msa_get_lane_s8(__a, __b) ((int8_t)(__a)[LANE_IMM0_7(__b)]) +#define msa_get_lane_u16(__a, __b) ((uint16_t)(__a)[LANE_IMM0_3(__b)]) +#define msa_get_lane_s16(__a, __b) ((int16_t)(__a)[LANE_IMM0_3(__b)]) +#define msa_get_lane_u32(__a, __b) ((uint32_t)(__a)[LANE_IMM0_1(__b)]) +#define msa_get_lane_s32(__a, __b) ((int32_t)(__a)[LANE_IMM0_1(__b)]) +#define msa_get_lane_f32(__a, __b) ((float)(__a)[LANE_IMM0_3(__b)]) +#define msa_get_lane_s64(__a, __b) ((int64_t)(__a)[LANE_IMM0_1(__b)]) +#define msa_get_lane_u64(__a, __b) ((uint64_t)(__a)[LANE_IMM0_1(__b)]) +#define msa_get_lane_f64(__a, __b) ((double)(__a)[LANE_IMM0_1(__b)]) +#define msa_getq_lane_u8(__a, imm0_15) ((uint8_t)__builtin_msa_copy_u_b((v16i8)(__a), imm0_15)) +#define msa_getq_lane_s8(__a, imm0_15) ((int8_t)__builtin_msa_copy_s_b(__a, imm0_15)) +#define msa_getq_lane_u16(__a, imm0_7) ((uint16_t)__builtin_msa_copy_u_h((v8i16)(__a), imm0_7)) +#define msa_getq_lane_s16(__a, imm0_7) ((int16_t)__builtin_msa_copy_s_h(__a, imm0_7)) +#define msa_getq_lane_u32(__a, imm0_3) __builtin_msa_copy_u_w((v4i32)(__a), imm0_3) +#define msa_getq_lane_s32 __builtin_msa_copy_s_w +#define msa_getq_lane_f32(__a, __b) ((float)(__a)[LANE_IMM0_3(__b)]) +#define msa_getq_lane_f64(__a, __b) ((double)(__a)[LANE_IMM0_1(__b)]) +#if (__mips == 64) +#define msa_getq_lane_u64(__a, imm0_1) __builtin_msa_copy_u_d((v2i64)(__a), imm0_1) +#define msa_getq_lane_s64 __builtin_msa_copy_s_d +#else +#define msa_getq_lane_u64(__a, imm0_1) ((uint64_t)(__a)[LANE_IMM0_1(imm0_1)]) +#define msa_getq_lane_s64(__a, imm0_1) ((int64_t)(__a)[LANE_IMM0_1(imm0_1)]) +#endif + +/* combine */ +#if (__mips == 64) +#define __COMBINE_64_64(__TYPE, a, b) ((__TYPE)((v2u64){((v1u64)(a))[0], ((v1u64)(b))[0]})) +#else +#define __COMBINE_64_64(__TYPE, a, b) ((__TYPE)((v4u32){((v2u32)(a))[0], ((v2u32)(a))[1], \ + ((v2u32)(b))[0], ((v2u32)(b))[1]})) +#endif + +/* v16i8 msa_combine_s8 (v8i8 __a, v8i8 __b) */ +#define msa_combine_s8(__a, __b) __COMBINE_64_64(v16i8, __a, __b) + +/* v8i16 msa_combine_s16(v4i16 __a, v4i16 __b) */ +#define msa_combine_s16(__a, __b) __COMBINE_64_64(v8i16, __a, __b) + +/* v4i32 msa_combine_s32(v2i32 __a, v2i32 __b) */ +#define msa_combine_s32(__a, __b) __COMBINE_64_64(v4i32, __a, __b) + +/* v2i64 msa_combine_s64(v1i64 __a, v1i64 __b) */ 
+#define msa_combine_s64(__a, __b) __COMBINE_64_64(v2i64, __a, __b) + +/* v4f32 msa_combine_f32(v2f32 __a, v2f32 __b) */ +#define msa_combine_f32(__a, __b) __COMBINE_64_64(v4f32, __a, __b) + +/* v16u8 msa_combine_u8(v8u8 __a, v8u8 __b) */ +#define msa_combine_u8(__a, __b) __COMBINE_64_64(v16u8, __a, __b) + +/* v8u16 msa_combine_u16(v4u16 __a, v4u16 __b) */ +#define msa_combine_u16(__a, __b) __COMBINE_64_64(v8u16, __a, __b) + +/* v4u32 msa_combine_u32(v2u32 __a, v2u32 __b) */ +#define msa_combine_u32(__a, __b) __COMBINE_64_64(v4u32, __a, __b) + +/* v2u64 msa_combine_u64(v1u64 __a, v1u64 __b) */ +#define msa_combine_u64(__a, __b) __COMBINE_64_64(v2u64, __a, __b) + +/* v2f64 msa_combine_f64(v1f64 __a, v1f64 __b) */ +#define msa_combine_f64(__a, __b) __COMBINE_64_64(v2f64, __a, __b) + +/* get_low, get_high */ +#if (__mips == 64) +#define __GET_LOW(__TYPE, a) ((__TYPE)((v1u64)(__builtin_msa_copy_u_d((v2i64)(a), 0)))) +#define __GET_HIGH(__TYPE, a) ((__TYPE)((v1u64)(__builtin_msa_copy_u_d((v2i64)(a), 1)))) +#else +#define __GET_LOW(__TYPE, a) ((__TYPE)(((v2u64)(a))[0])) +#define __GET_HIGH(__TYPE, a) ((__TYPE)(((v2u64)(a))[1])) +#endif + +/* v8i8 msa_get_low_s8(v16i8 __a) */ +#define msa_get_low_s8(__a) __GET_LOW(v8i8, __a) + +/* v4i16 msa_get_low_s16(v8i16 __a) */ +#define msa_get_low_s16(__a) __GET_LOW(v4i16, __a) + +/* v2i32 msa_get_low_s32(v4i32 __a) */ +#define msa_get_low_s32(__a) __GET_LOW(v2i32, __a) + +/* v1i64 msa_get_low_s64(v2i64 __a) */ +#define msa_get_low_s64(__a) __GET_LOW(v1i64, __a) + +/* v8u8 msa_get_low_u8(v16u8 __a) */ +#define msa_get_low_u8(__a) __GET_LOW(v8u8, __a) + +/* v4u16 msa_get_low_u16(v8u16 __a) */ +#define msa_get_low_u16(__a) __GET_LOW(v4u16, __a) + +/* v2u32 msa_get_low_u32(v4u32 __a) */ +#define msa_get_low_u32(__a) __GET_LOW(v2u32, __a) + +/* v1u64 msa_get_low_u64(v2u64 __a) */ +#define msa_get_low_u64(__a) __GET_LOW(v1u64, __a) + +/* v2f32 msa_get_low_f32(v4f32 __a) */ +#define msa_get_low_f32(__a) __GET_LOW(v2f32, __a) + +/* v1f64 msa_get_low_f64(v2f64 __a) */ +#define msa_get_low_f64(__a) __GET_LOW(v1f64, __a) + +/* v8i8 msa_get_high_s8(v16i8 __a) */ +#define msa_get_high_s8(__a) __GET_HIGH(v8i8, __a) + +/* v4i16 msa_get_high_s16(v8i16 __a) */ +#define msa_get_high_s16(__a) __GET_HIGH(v4i16, __a) + +/* v2i32 msa_get_high_s32(v4i32 __a) */ +#define msa_get_high_s32(__a) __GET_HIGH(v2i32, __a) + +/* v1i64 msa_get_high_s64(v2i64 __a) */ +#define msa_get_high_s64(__a) __GET_HIGH(v1i64, __a) + +/* v8u8 msa_get_high_u8(v16u8 __a) */ +#define msa_get_high_u8(__a) __GET_HIGH(v8u8, __a) + +/* v4u16 msa_get_high_u16(v8u16 __a) */ +#define msa_get_high_u16(__a) __GET_HIGH(v4u16, __a) + +/* v2u32 msa_get_high_u32(v4u32 __a) */ +#define msa_get_high_u32(__a) __GET_HIGH(v2u32, __a) + +/* v1u64 msa_get_high_u64(v2u64 __a) */ +#define msa_get_high_u64(__a) __GET_HIGH(v1u64, __a) + +/* v2f32 msa_get_high_f32(v4f32 __a) */ +#define msa_get_high_f32(__a) __GET_HIGH(v2f32, __a) + +/* v1f64 msa_get_high_f64(v2f64 __a) */ +#define msa_get_high_f64(__a) __GET_HIGH(v1f64, __a) + +/* ri = ai * b[lane] */ +/* v4f32 msa_mulq_lane_f32(v4f32 __a, v4f32 __b, const int __lane) */ +#define msa_mulq_lane_f32(__a, __b, __lane) ((__a) * msa_getq_lane_f32(__b, __lane)) + +/* ri = ai + bi * c[lane] */ +/* v4f32 msa_mlaq_lane_f32(v4f32 __a, v4f32 __b, v4f32 __c, const int __lane) */ +#define msa_mlaq_lane_f32(__a, __b, __c, __lane) ((__a) + ((__b) * msa_getq_lane_f32(__c, __lane))) + +/* uint16_t msa_sum_u16(v8u16 __a)*/ +#define msa_sum_u16(__a) \ +({ \ + v4u32 _b; \ + v2u64 _c; \ + _b = 
__builtin_msa_hadd_u_w(__a, __a); \ + _c = __builtin_msa_hadd_u_d(_b, _b); \ + (uint16_t)(_c[0] + _c[1]); \ +}) + +/* int16_t msa_sum_s16(v8i16 __a) */ +#define msa_sum_s16(__a) \ +({ \ + v4i32 _b; \ + v2i64 _c; \ + _b = __builtin_msa_hadd_s_w(__a, __a); \ + _c = __builtin_msa_hadd_s_d(_b, _b); \ + (int16_t)(_c[0] + _c[1]); \ +}) + + +/* uint32_t msa_sum_u32(v4u32 __a)*/ +#define msa_sum_u32(__a) \ +({ \ + v2u64 _b; \ + _b = __builtin_msa_hadd_u_d(__a, __a); \ + (uint32_t)(_b[0] + _b[1]); \ +}) + +/* int32_t msa_sum_s32(v4i32 __a)*/ +#define msa_sum_s32(__a) \ +({ \ + v2i64 _b; \ + _b = __builtin_msa_hadd_s_d(__a, __a); \ + (int32_t)(_b[0] + _b[1]); \ +}) + +/* uint8_t msa_sum_u8(v16u8 __a)*/ +#define msa_sum_u8(__a) \ +({ \ + v8u16 _b16; \ + v4u32 _c32; \ + _b16 = __builtin_msa_hadd_u_h(__a, __a); \ + _c32 = __builtin_msa_hadd_u_w(_b16, _b16); \ + (uint8_t)msa_sum_u32(_c32); \ +}) + +/* int8_t msa_sum_s8(v16s8 __a)*/ +#define msa_sum_s8(__a) \ +({ \ + v8i16 _b16; \ + v4i32 _c32; \ + _b16 = __builtin_msa_hadd_s_h(__a, __a); \ + _c32 = __builtin_msa_hadd_s_w(_b16, _b16); \ + (int8_t)msa_sum_s32(_c32); \ +}) + +/* float msa_sum_f32(v4f32 __a)*/ +#define msa_sum_f32(__a) ((__a)[0] + (__a)[1] + (__a)[2] + (__a)[3]) + +/* v8u16 msa_paddlq_u8(v16u8 __a) */ +#define msa_paddlq_u8(__a) (__builtin_msa_hadd_u_h(__a, __a)) + +/* v8i16 msa_paddlq_s8(v16i8 __a) */ +#define msa_paddlq_s8(__a) (__builtin_msa_hadd_s_h(__a, __a)) + +/* v4u32 msa_paddlq_u16 (v8u16 __a)*/ +#define msa_paddlq_u16(__a) (__builtin_msa_hadd_u_w(__a, __a)) + +/* v4i32 msa_paddlq_s16 (v8i16 __a)*/ +#define msa_paddlq_s16(__a) (__builtin_msa_hadd_s_w(__a, __a)) + +/* v2u64 msa_paddlq_u32(v4u32 __a) */ +#define msa_paddlq_u32(__a) (__builtin_msa_hadd_u_d(__a, __a)) + +/* v2i64 msa_paddlq_s32(v4i32 __a) */ +#define msa_paddlq_s32(__a) (__builtin_msa_hadd_s_d(__a, __a)) + +#define V8U8_2_V8U16(x) {(uint16_t)x[0], (uint16_t)x[1], (uint16_t)x[2], (uint16_t)x[3], \ + (uint16_t)x[4], (uint16_t)x[5], (uint16_t)x[6], (uint16_t)x[7]} +#define V8U8_2_V8I16(x) {(int16_t)x[0], (int16_t)x[1], (int16_t)x[2], (int16_t)x[3], \ + (int16_t)x[4], (int16_t)x[5], (int16_t)x[6], (int16_t)x[7]} +#define V8I8_2_V8I16(x) {(int16_t)x[0], (int16_t)x[1], (int16_t)x[2], (int16_t)x[3], \ + (int16_t)x[4], (int16_t)x[5], (int16_t)x[6], (int16_t)x[7]} +#define V4U16_2_V4U32(x) {(uint32_t)x[0], (uint32_t)x[1], (uint32_t)x[2], (uint32_t)x[3]} +#define V4U16_2_V4I32(x) {(int32_t)x[0], (int32_t)x[1], (int32_t)x[2], (int32_t)x[3]} +#define V4I16_2_V4I32(x) {(int32_t)x[0], (int32_t)x[1], (int32_t)x[2], (int32_t)x[3]} +#define V2U32_2_V2U64(x) {(uint64_t)x[0], (uint64_t)x[1]} +#define V2U32_2_V2I64(x) {(int64_t)x[0], (int64_t)x[1]} + +/* v8u16 msa_mull_u8(v8u8 __a, v8u8 __b) */ +#define msa_mull_u8(__a, __b) ((v8u16)__builtin_msa_mulv_h((v8i16)V8U8_2_V8I16(__a), (v8i16)V8U8_2_V8I16(__b))) + +/* v8i16 msa_mull_s8(v8i8 __a, v8i8 __b)*/ +#define msa_mull_s8(__a, __b) (__builtin_msa_mulv_h((v8i16)V8I8_2_V8I16(__a), (v8i16)V8I8_2_V8I16(__b))) + +/* v4u32 msa_mull_u16(v4u16 __a, v4u16 __b) */ +#define msa_mull_u16(__a, __b) ((v4u32)__builtin_msa_mulv_w((v4i32)V4U16_2_V4I32(__a), (v4i32)V4U16_2_V4I32(__b))) + +/* v4i32 msa_mull_s16(v4i16 __a, v4i16 __b) */ +#define msa_mull_s16(__a, __b) (__builtin_msa_mulv_w((v4i32)V4I16_2_V4I32(__a), (v4i32)V4I16_2_V4I32(__b))) + +/* v2u64 msa_mull_u32(v2u32 __a, v2u32 __b) */ +#define msa_mull_u32(__a, __b) ((v2u64)__builtin_msa_mulv_d((v2i64)V2U32_2_V2I64(__a), (v2i64)V2U32_2_V2I64(__b))) + +/* bitwise and: __builtin_msa_and_v */ +#define 
msa_andq_u8(__a, __b) ((v16u8)__builtin_msa_and_v((v16u8)(__a), (v16u8)(__b))) +#define msa_andq_s8(__a, __b) ((v16i8)__builtin_msa_and_v((v16u8)(__a), (v16u8)(__b))) +#define msa_andq_u16(__a, __b) ((v8u16)__builtin_msa_and_v((v16u8)(__a), (v16u8)(__b))) +#define msa_andq_s16(__a, __b) ((v8i16)__builtin_msa_and_v((v16u8)(__a), (v16u8)(__b))) +#define msa_andq_u32(__a, __b) ((v4u32)__builtin_msa_and_v((v16u8)(__a), (v16u8)(__b))) +#define msa_andq_s32(__a, __b) ((v4i32)__builtin_msa_and_v((v16u8)(__a), (v16u8)(__b))) +#define msa_andq_u64(__a, __b) ((v2u64)__builtin_msa_and_v((v16u8)(__a), (v16u8)(__b))) +#define msa_andq_s64(__a, __b) ((v2i64)__builtin_msa_and_v((v16u8)(__a), (v16u8)(__b))) + +/* bitwise or: __builtin_msa_or_v */ +#define msa_orrq_u8(__a, __b) ((v16u8)__builtin_msa_or_v((v16u8)(__a), (v16u8)(__b))) +#define msa_orrq_s8(__a, __b) ((v16i8)__builtin_msa_or_v((v16u8)(__a), (v16u8)(__b))) +#define msa_orrq_u16(__a, __b) ((v8u16)__builtin_msa_or_v((v16u8)(__a), (v16u8)(__b))) +#define msa_orrq_s16(__a, __b) ((v8i16)__builtin_msa_or_v((v16u8)(__a), (v16u8)(__b))) +#define msa_orrq_u32(__a, __b) ((v4u32)__builtin_msa_or_v((v16u8)(__a), (v16u8)(__b))) +#define msa_orrq_s32(__a, __b) ((v4i32)__builtin_msa_or_v((v16u8)(__a), (v16u8)(__b))) +#define msa_orrq_u64(__a, __b) ((v2u64)__builtin_msa_or_v((v16u8)(__a), (v16u8)(__b))) +#define msa_orrq_s64(__a, __b) ((v2i64)__builtin_msa_or_v((v16u8)(__a), (v16u8)(__b))) + +/* bitwise xor: __builtin_msa_xor_v */ +#define msa_eorq_u8(__a, __b) ((v16u8)__builtin_msa_xor_v((v16u8)(__a), (v16u8)(__b))) +#define msa_eorq_s8(__a, __b) ((v16i8)__builtin_msa_xor_v((v16u8)(__a), (v16u8)(__b))) +#define msa_eorq_u16(__a, __b) ((v8u16)__builtin_msa_xor_v((v16u8)(__a), (v16u8)(__b))) +#define msa_eorq_s16(__a, __b) ((v8i16)__builtin_msa_xor_v((v16u8)(__a), (v16u8)(__b))) +#define msa_eorq_u32(__a, __b) ((v4u32)__builtin_msa_xor_v((v16u8)(__a), (v16u8)(__b))) +#define msa_eorq_s32(__a, __b) ((v4i32)__builtin_msa_xor_v((v16u8)(__a), (v16u8)(__b))) +#define msa_eorq_u64(__a, __b) ((v2u64)__builtin_msa_xor_v((v16u8)(__a), (v16u8)(__b))) +#define msa_eorq_s64(__a, __b) ((v2i64)__builtin_msa_xor_v((v16u8)(__a), (v16u8)(__b))) + +/* bitwise not: v16u8 __builtin_msa_xori_b (v16u8, 0xff) */ +#define msa_mvnq_u8(__a) ((v16u8)__builtin_msa_xori_b((v16u8)(__a), 0xFF)) +#define msa_mvnq_s8(__a) ((v16i8)__builtin_msa_xori_b((v16u8)(__a), 0xFF)) +#define msa_mvnq_u16(__a) ((v8u16)__builtin_msa_xori_b((v16u8)(__a), 0xFF)) +#define msa_mvnq_s16(__a) ((v8i16)__builtin_msa_xori_b((v16u8)(__a), 0xFF)) +#define msa_mvnq_u32(__a) ((v4u32)__builtin_msa_xori_b((v16u8)(__a), 0xFF)) +#define msa_mvnq_s32(__a) ((v4i32)__builtin_msa_xori_b((v16u8)(__a), 0xFF)) +#define msa_mvnq_u64(__a) ((v2u64)__builtin_msa_xori_b((v16u8)(__a), 0xFF)) +#define msa_mvnq_s64(__a) ((v2i64)__builtin_msa_xori_b((v16u8)(__a), 0xFF)) + +/* compare equal: ceq -> ri = ai == bi ? 
1...1:0...0 */ +#define msa_ceqq_u8(__a, __b) ((v16u8)__builtin_msa_ceq_b((v16i8)(__a), (v16i8)(__b))) +#define msa_ceqq_s8(__a, __b) ((v16u8)__builtin_msa_ceq_b((v16i8)(__a), (v16i8)(__b))) +#define msa_ceqq_u16(__a, __b) ((v8u16)__builtin_msa_ceq_h((v8i16)(__a), (v8i16)(__b))) +#define msa_ceqq_s16(__a, __b) ((v8u16)__builtin_msa_ceq_h((v8i16)(__a), (v8i16)(__b))) +#define msa_ceqq_u32(__a, __b) ((v4u32)__builtin_msa_ceq_w((v4i32)(__a), (v4i32)(__b))) +#define msa_ceqq_s32(__a, __b) ((v4u32)__builtin_msa_ceq_w((v4i32)(__a), (v4i32)(__b))) +#define msa_ceqq_f32(__a, __b) ((v4u32)__builtin_msa_fceq_w((v4f32)(__a), (v4f32)(__b))) +#define msa_ceqq_u64(__a, __b) ((v2u64)__builtin_msa_ceq_d((v2i64)(__a), (v2i64)(__b))) +#define msa_ceqq_s64(__a, __b) ((v2u64)__builtin_msa_ceq_d((v2i64)(__a), (v2i64)(__b))) +#define msa_ceqq_f64(__a, __b) ((v2u64)__builtin_msa_fceq_d((v2f64)(__a), (v2f64)(__b))) + +/* Compare less-than: clt -> ri = ai < bi ? 1...1:0...0 */ +#define msa_cltq_u8(__a, __b) ((v16u8)__builtin_msa_clt_u_b((v16u8)(__a), (v16u8)(__b))) +#define msa_cltq_s8(__a, __b) ((v16u8)__builtin_msa_clt_s_b((v16i8)(__a), (v16i8)(__b))) +#define msa_cltq_u16(__a, __b) ((v8u16)__builtin_msa_clt_u_h((v8u16)(__a), (v8u16)(__b))) +#define msa_cltq_s16(__a, __b) ((v8u16)__builtin_msa_clt_s_h((v8i16)(__a), (v8i16)(__b))) +#define msa_cltq_u32(__a, __b) ((v4u32)__builtin_msa_clt_u_w((v4u32)(__a), (v4u32)(__b))) +#define msa_cltq_s32(__a, __b) ((v4u32)__builtin_msa_clt_s_w((v4i32)(__a), (v4i32)(__b))) +#define msa_cltq_f32(__a, __b) ((v4u32)__builtin_msa_fclt_w((v4f32)(__a), (v4f32)(__b))) +#define msa_cltq_u64(__a, __b) ((v2u64)__builtin_msa_clt_u_d((v2u64)(__a), (v2u64)(__b))) +#define msa_cltq_s64(__a, __b) ((v2u64)__builtin_msa_clt_s_d((v2i64)(__a), (v2i64)(__b))) +#define msa_cltq_f64(__a, __b) ((v2u64)__builtin_msa_fclt_d((v2f64)(__a), (v2f64)(__b))) + +/* compare greater-than: cgt -> ri = ai > bi ? 1...1:0...0 */ +#define msa_cgtq_u8(__a, __b) ((v16u8)__builtin_msa_clt_u_b((v16u8)(__b), (v16u8)(__a))) +#define msa_cgtq_s8(__a, __b) ((v16u8)__builtin_msa_clt_s_b((v16i8)(__b), (v16i8)(__a))) +#define msa_cgtq_u16(__a, __b) ((v8u16)__builtin_msa_clt_u_h((v8u16)(__b), (v8u16)(__a))) +#define msa_cgtq_s16(__a, __b) ((v8u16)__builtin_msa_clt_s_h((v8i16)(__b), (v8i16)(__a))) +#define msa_cgtq_u32(__a, __b) ((v4u32)__builtin_msa_clt_u_w((v4u32)(__b), (v4u32)(__a))) +#define msa_cgtq_s32(__a, __b) ((v4u32)__builtin_msa_clt_s_w((v4i32)(__b), (v4i32)(__a))) +#define msa_cgtq_f32(__a, __b) ((v4u32)__builtin_msa_fclt_w((v4f32)(__b), (v4f32)(__a))) +#define msa_cgtq_u64(__a, __b) ((v2u64)__builtin_msa_clt_u_d((v2u64)(__b), (v2u64)(__a))) +#define msa_cgtq_s64(__a, __b) ((v2u64)__builtin_msa_clt_s_d((v2i64)(__b), (v2i64)(__a))) +#define msa_cgtq_f64(__a, __b) ((v2u64)__builtin_msa_fclt_d((v2f64)(__b), (v2f64)(__a))) + +/* compare less-equal: cle -> ri = ai <= bi ? 
1...1:0...0 */ +#define msa_cleq_u8(__a, __b) ((v16u8)__builtin_msa_cle_u_b((v16u8)(__a), (v16u8)(__b))) +#define msa_cleq_s8(__a, __b) ((v16u8)__builtin_msa_cle_s_b((v16i8)(__a), (v16i8)(__b))) +#define msa_cleq_u16(__a, __b) ((v8u16)__builtin_msa_cle_u_h((v8u16)(__a), (v8u16)(__b))) +#define msa_cleq_s16(__a, __b) ((v8u16)__builtin_msa_cle_s_h((v8i16)(__a), (v8i16)(__b))) +#define msa_cleq_u32(__a, __b) ((v4u32)__builtin_msa_cle_u_w((v4u32)(__a), (v4u32)(__b))) +#define msa_cleq_s32(__a, __b) ((v4u32)__builtin_msa_cle_s_w((v4i32)(__a), (v4i32)(__b))) +#define msa_cleq_f32(__a, __b) ((v4u32)__builtin_msa_fcle_w((v4f32)(__a), (v4f32)(__b))) +#define msa_cleq_u64(__a, __b) ((v2u64)__builtin_msa_cle_u_d((v2u64)(__a), (v2u64)(__b))) +#define msa_cleq_s64(__a, __b) ((v2u64)__builtin_msa_cle_s_d((v2i64)(__a), (v2i64)(__b))) +#define msa_cleq_f64(__a, __b) ((v2u64)__builtin_msa_fcle_d((v2f64)(__a), (v2f64)(__b))) + +/* compare greater-equal: cge -> ri = ai >= bi ? 1...1:0...0 */ +#define msa_cgeq_u8(__a, __b) ((v16u8)__builtin_msa_cle_u_b((v16u8)(__b), (v16u8)(__a))) +#define msa_cgeq_s8(__a, __b) ((v16u8)__builtin_msa_cle_s_b((v16i8)(__b), (v16i8)(__a))) +#define msa_cgeq_u16(__a, __b) ((v8u16)__builtin_msa_cle_u_h((v8u16)(__b), (v8u16)(__a))) +#define msa_cgeq_s16(__a, __b) ((v8u16)__builtin_msa_cle_s_h((v8i16)(__b), (v8i16)(__a))) +#define msa_cgeq_u32(__a, __b) ((v4u32)__builtin_msa_cle_u_w((v4u32)(__b), (v4u32)(__a))) +#define msa_cgeq_s32(__a, __b) ((v4u32)__builtin_msa_cle_s_w((v4i32)(__b), (v4i32)(__a))) +#define msa_cgeq_f32(__a, __b) ((v4u32)__builtin_msa_fcle_w((v4f32)(__b), (v4f32)(__a))) +#define msa_cgeq_u64(__a, __b) ((v2u64)__builtin_msa_cle_u_d((v2u64)(__b), (v2u64)(__a))) +#define msa_cgeq_s64(__a, __b) ((v2u64)__builtin_msa_cle_s_d((v2i64)(__b), (v2i64)(__a))) +#define msa_cgeq_f64(__a, __b) ((v2u64)__builtin_msa_fcle_d((v2f64)(__b), (v2f64)(__a))) + +/* Shift Left Logical: shl -> ri = ai << bi; */ +#define msa_shlq_u8(__a, __b) ((v16u8)__builtin_msa_sll_b((v16i8)(__a), (v16i8)(__b))) +#define msa_shlq_s8(__a, __b) ((v16i8)__builtin_msa_sll_b((v16i8)(__a), (v16i8)(__b))) +#define msa_shlq_u16(__a, __b) ((v8u16)__builtin_msa_sll_h((v8i16)(__a), (v8i16)(__b))) +#define msa_shlq_s16(__a, __b) ((v8i16)__builtin_msa_sll_h((v8i16)(__a), (v8i16)(__b))) +#define msa_shlq_u32(__a, __b) ((v4u32)__builtin_msa_sll_w((v4i32)(__a), (v4i32)(__b))) +#define msa_shlq_s32(__a, __b) ((v4i32)__builtin_msa_sll_w((v4i32)(__a), (v4i32)(__b))) +#define msa_shlq_u64(__a, __b) ((v2u64)__builtin_msa_sll_d((v2i64)(__a), (v2i64)(__b))) +#define msa_shlq_s64(__a, __b) ((v2i64)__builtin_msa_sll_d((v2i64)(__a), (v2i64)(__b))) + +/* Immediate Shift Left Logical: shl -> ri = ai << imm; */ +#define msa_shlq_n_u8(__a, __imm) ((v16u8)__builtin_msa_slli_b((v16i8)(__a), __imm)) +#define msa_shlq_n_s8(__a, __imm) ((v16i8)__builtin_msa_slli_b((v16i8)(__a), __imm)) +#define msa_shlq_n_u16(__a, __imm) ((v8u16)__builtin_msa_slli_h((v8i16)(__a), __imm)) +#define msa_shlq_n_s16(__a, __imm) ((v8i16)__builtin_msa_slli_h((v8i16)(__a), __imm)) +#define msa_shlq_n_u32(__a, __imm) ((v4u32)__builtin_msa_slli_w((v4i32)(__a), __imm)) +#define msa_shlq_n_s32(__a, __imm) ((v4i32)__builtin_msa_slli_w((v4i32)(__a), __imm)) +#define msa_shlq_n_u64(__a, __imm) ((v2u64)__builtin_msa_slli_d((v2i64)(__a), __imm)) +#define msa_shlq_n_s64(__a, __imm) ((v2i64)__builtin_msa_slli_d((v2i64)(__a), __imm)) + +/* shift right: shrq -> ri = ai >> bi; */ +#define msa_shrq_u8(__a, __b) ((v16u8)__builtin_msa_srl_b((v16i8)(__a), (v16i8)(__b))) +#define 
msa_shrq_s8(__a, __b) ((v16i8)__builtin_msa_sra_b((v16i8)(__a), (v16i8)(__b))) +#define msa_shrq_u16(__a, __b) ((v8u16)__builtin_msa_srl_h((v8i16)(__a), (v8i16)(__b))) +#define msa_shrq_s16(__a, __b) ((v8i16)__builtin_msa_sra_h((v8i16)(__a), (v8i16)(__b))) +#define msa_shrq_u32(__a, __b) ((v4u32)__builtin_msa_srl_w((v4i32)(__a), (v4i32)(__b))) +#define msa_shrq_s32(__a, __b) ((v4i32)__builtin_msa_sra_w((v4i32)(__a), (v4i32)(__b))) +#define msa_shrq_u64(__a, __b) ((v2u64)__builtin_msa_srl_d((v2i64)(__a), (v2i64)(__b))) +#define msa_shrq_s64(__a, __b) ((v2i64)__builtin_msa_sra_d((v2i64)(__a), (v2i64)(__b))) + +/* Immediate Shift Right: shr -> ri = ai >> imm; */ +#define msa_shrq_n_u8(__a, __imm) ((v16u8)__builtin_msa_srli_b((v16i8)(__a), __imm)) +#define msa_shrq_n_s8(__a, __imm) ((v16i8)__builtin_msa_srai_b((v16i8)(__a), __imm)) +#define msa_shrq_n_u16(__a, __imm) ((v8u16)__builtin_msa_srli_h((v8i16)(__a), __imm)) +#define msa_shrq_n_s16(__a, __imm) ((v8i16)__builtin_msa_srai_h((v8i16)(__a), __imm)) +#define msa_shrq_n_u32(__a, __imm) ((v4u32)__builtin_msa_srli_w((v4i32)(__a), __imm)) +#define msa_shrq_n_s32(__a, __imm) ((v4i32)__builtin_msa_srai_w((v4i32)(__a), __imm)) +#define msa_shrq_n_u64(__a, __imm) ((v2u64)__builtin_msa_srli_d((v2i64)(__a), __imm)) +#define msa_shrq_n_s64(__a, __imm) ((v2i64)__builtin_msa_srai_d((v2i64)(__a), __imm)) + +/* Immediate Shift Right Rounded: shr -> ri = ai >> (rounded)imm; */ +#define msa_rshrq_n_u8(__a, __imm) ((v16u8)__builtin_msa_srlri_b((v16i8)(__a), __imm)) +#define msa_rshrq_n_s8(__a, __imm) ((v16i8)__builtin_msa_srari_b((v16i8)(__a), __imm)) +#define msa_rshrq_n_u16(__a, __imm) ((v8u16)__builtin_msa_srlri_h((v8i16)(__a), __imm)) +#define msa_rshrq_n_s16(__a, __imm) ((v8i16)__builtin_msa_srari_h((v8i16)(__a), __imm)) +#define msa_rshrq_n_u32(__a, __imm) ((v4u32)__builtin_msa_srlri_w((v4i32)(__a), __imm)) +#define msa_rshrq_n_s32(__a, __imm) ((v4i32)__builtin_msa_srari_w((v4i32)(__a), __imm)) +#define msa_rshrq_n_u64(__a, __imm) ((v2u64)__builtin_msa_srlri_d((v2i64)(__a), __imm)) +#define msa_rshrq_n_s64(__a, __imm) ((v2i64)__builtin_msa_srari_d((v2i64)(__a), __imm)) + +/* Vector saturating rounding shift left, qrshl -> ri = ai << bi; */ +#define msa_qrshrq_s32(a, b) ((v4i32)__msa_srar_w((v4i32)(a), (v4i32)(b))) + +/* Rename the msa builtin func to unify the name style for intrin_msa.hpp */ +#define msa_qaddq_u8 __builtin_msa_adds_u_b +#define msa_qaddq_s8 __builtin_msa_adds_s_b +#define msa_qaddq_u16 __builtin_msa_adds_u_h +#define msa_qaddq_s16 __builtin_msa_adds_s_h +#define msa_qaddq_u32 __builtin_msa_adds_u_w +#define msa_qaddq_s32 __builtin_msa_adds_s_w +#define msa_qaddq_u64 __builtin_msa_adds_u_d +#define msa_qaddq_s64 __builtin_msa_adds_s_d +#define msa_addq_u8(a, b) ((v16u8)__builtin_msa_addv_b((v16i8)(a), (v16i8)(b))) +#define msa_addq_s8 __builtin_msa_addv_b +#define msa_addq_u16(a, b) ((v8u16)__builtin_msa_addv_h((v8i16)(a), (v8i16)(b))) +#define msa_addq_s16 __builtin_msa_addv_h +#define msa_addq_u32(a, b) ((v4u32)__builtin_msa_addv_w((v4i32)(a), (v4i32)(b))) +#define msa_addq_s32 __builtin_msa_addv_w +#define msa_addq_f32 __builtin_msa_fadd_w +#define msa_addq_u64(a, b) ((v2u64)__builtin_msa_addv_d((v2i64)(a), (v2i64)(b))) +#define msa_addq_s64 __builtin_msa_addv_d +#define msa_addq_f64 __builtin_msa_fadd_d +#define msa_qsubq_u8 __builtin_msa_subs_u_b +#define msa_qsubq_s8 __builtin_msa_subs_s_b +#define msa_qsubq_u16 __builtin_msa_subs_u_h +#define msa_qsubq_s16 __builtin_msa_subs_s_h +#define msa_subq_u8(a, b) 
((v16u8)__builtin_msa_subv_b((v16i8)(a), (v16i8)(b))) +#define msa_subq_s8 __builtin_msa_subv_b +#define msa_subq_u16(a, b) ((v8u16)__builtin_msa_subv_h((v8i16)(a), (v8i16)(b))) +#define msa_subq_s16 __builtin_msa_subv_h +#define msa_subq_u32(a, b) ((v4u32)__builtin_msa_subv_w((v4i32)(a), (v4i32)(b))) +#define msa_subq_s32 __builtin_msa_subv_w +#define msa_subq_f32 __builtin_msa_fsub_w +#define msa_subq_u64(a, b) ((v2u64)__builtin_msa_subv_d((v2i64)(a), (v2i64)(b))) +#define msa_subq_s64 __builtin_msa_subv_d +#define msa_subq_f64 __builtin_msa_fsub_d +#define msa_mulq_u8(a, b) ((v16u8)__builtin_msa_mulv_b((v16i8)(a), (v16i8)(b))) +#define msa_mulq_s8(a, b) ((v16i8)__builtin_msa_mulv_b((v16i8)(a), (v16i8)(b))) +#define msa_mulq_u16(a, b) ((v8u16)__builtin_msa_mulv_h((v8i16)(a), (v8i16)(b))) +#define msa_mulq_s16(a, b) ((v8i16)__builtin_msa_mulv_h((v8i16)(a), (v8i16)(b))) +#define msa_mulq_u32(a, b) ((v4u32)__builtin_msa_mulv_w((v4i32)(a), (v4i32)(b))) +#define msa_mulq_s32(a, b) ((v4i32)__builtin_msa_mulv_w((v4i32)(a), (v4i32)(b))) +#define msa_mulq_u64(a, b) ((v2u64)__builtin_msa_mulv_d((v2i64)(a), (v2i64)(b))) +#define msa_mulq_s64(a, b) ((v2i64)__builtin_msa_mulv_d((v2i64)(a), (v2i64)(b))) +#define msa_mulq_f32 __builtin_msa_fmul_w +#define msa_mulq_f64 __builtin_msa_fmul_d +#define msa_divq_f32 __builtin_msa_fdiv_w +#define msa_divq_f64 __builtin_msa_fdiv_d +#define msa_dotp_s_h __builtin_msa_dotp_s_h +#define msa_dotp_s_w __builtin_msa_dotp_s_w +#define msa_dotp_s_d __builtin_msa_dotp_s_d +#define msa_dotp_u_h __builtin_msa_dotp_u_h +#define msa_dotp_u_w __builtin_msa_dotp_u_w +#define msa_dotp_u_d __builtin_msa_dotp_u_d +#define msa_dpadd_s_h __builtin_msa_dpadd_s_h +#define msa_dpadd_s_w __builtin_msa_dpadd_s_w +#define msa_dpadd_s_d __builtin_msa_dpadd_s_d +#define msa_dpadd_u_h __builtin_msa_dpadd_u_h +#define msa_dpadd_u_w __builtin_msa_dpadd_u_w +#define msa_dpadd_u_d __builtin_msa_dpadd_u_d + +#define ILVRL_B2(RTYPE, in0, in1, low, hi) do { \ + low = (RTYPE)__builtin_msa_ilvr_b((v16i8)(in0), (v16i8)(in1)); \ + hi = (RTYPE)__builtin_msa_ilvl_b((v16i8)(in0), (v16i8)(in1)); \ + } while (0) +#define ILVRL_B2_UB(...) ILVRL_B2(v16u8, __VA_ARGS__) +#define ILVRL_B2_SB(...) ILVRL_B2(v16i8, __VA_ARGS__) +#define ILVRL_B2_UH(...) ILVRL_B2(v8u16, __VA_ARGS__) +#define ILVRL_B2_SH(...) ILVRL_B2(v8i16, __VA_ARGS__) +#define ILVRL_B2_SW(...) ILVRL_B2(v4i32, __VA_ARGS__) + +#define ILVRL_H2(RTYPE, in0, in1, low, hi) do { \ + low = (RTYPE)__builtin_msa_ilvr_h((v8i16)(in0), (v8i16)(in1)); \ + hi = (RTYPE)__builtin_msa_ilvl_h((v8i16)(in0), (v8i16)(in1)); \ + } while (0) +#define ILVRL_H2_UB(...) ILVRL_H2(v16u8, __VA_ARGS__) +#define ILVRL_H2_SB(...) ILVRL_H2(v16i8, __VA_ARGS__) +#define ILVRL_H2_UH(...) ILVRL_H2(v8u16, __VA_ARGS__) +#define ILVRL_H2_SH(...) ILVRL_H2(v8i16, __VA_ARGS__) +#define ILVRL_H2_SW(...) ILVRL_H2(v4i32, __VA_ARGS__) +#define ILVRL_H2_UW(...) ILVRL_H2(v4u32, __VA_ARGS__) + +#define ILVRL_W2(RTYPE, in0, in1, low, hi) do { \ + low = (RTYPE)__builtin_msa_ilvr_w((v4i32)(in0), (v4i32)(in1)); \ + hi = (RTYPE)__builtin_msa_ilvl_w((v4i32)(in0), (v4i32)(in1)); \ + } while (0) +#define ILVRL_W2_UB(...) ILVRL_W2(v16u8, __VA_ARGS__) +#define ILVRL_W2_SH(...) ILVRL_W2(v8i16, __VA_ARGS__) +#define ILVRL_W2_SW(...) ILVRL_W2(v4i32, __VA_ARGS__) +#define ILVRL_W2_UW(...) 
ILVRL_W2(v4u32, __VA_ARGS__) + +/* absq, qabsq (r = |a|;) */ +#define msa_absq_s8(a) __builtin_msa_add_a_b(a, __builtin_msa_fill_b(0)) +#define msa_absq_s16(a) __builtin_msa_add_a_h(a, __builtin_msa_fill_h(0)) +#define msa_absq_s32(a) __builtin_msa_add_a_w(a, __builtin_msa_fill_w(0)) +#define msa_absq_s64(a) __builtin_msa_add_a_d(a, __builtin_msa_fill_d(0)) +#define msa_absq_f32(a) ((v4f32)__builtin_msa_bclri_w((v4u32)(a), 31)) +#define msa_absq_f64(a) ((v2f64)__builtin_msa_bclri_d((v2u64)(a), 63)) +#define msa_qabsq_s8(a) __builtin_msa_adds_a_b(a, __builtin_msa_fill_b(0)) +#define msa_qabsq_s16(a) __builtin_msa_adds_a_h(a, __builtin_msa_fill_h(0)) +#define msa_qabsq_s32(a) __builtin_msa_adds_a_w(a, __builtin_msa_fill_w(0)) +#define msa_qabsq_s64(a) __builtin_msa_adds_a_d(a, __builtin_msa_fill_d(0)) + +/* abdq, qabdq (r = |a - b|;) */ +#define msa_abdq_u8 __builtin_msa_asub_u_b +#define msa_abdq_s8 __builtin_msa_asub_s_b +#define msa_abdq_u16 __builtin_msa_asub_u_h +#define msa_abdq_s16 __builtin_msa_asub_s_h +#define msa_abdq_u32 __builtin_msa_asub_u_w +#define msa_abdq_s32 __builtin_msa_asub_s_w +#define msa_abdq_u64 __builtin_msa_asub_u_d +#define msa_abdq_s64 __builtin_msa_asub_s_d +#define msa_abdq_f32(a, b) msa_absq_f32(__builtin_msa_fsub_w(a, b)) +#define msa_abdq_f64(a, b) msa_absq_f64(__builtin_msa_fsub_d(a, b)) +#define msa_qabdq_s8(a, b) msa_qabsq_s8(__builtin_msa_subs_s_b(a, b)) +#define msa_qabdq_s16(a, b) msa_qabsq_s16(__builtin_msa_subs_s_h(a, b)) +#define msa_qabdq_s32(a, b) msa_qabsq_s32(__builtin_msa_subs_s_w(a, b)) +#define msa_qabdq_s64(a, b) msa_qabsq_s64(__builtin_msa_subs_s_d(a, b)) + +/* sqrtq, rsqrtq */ +#define msa_sqrtq_f32 __builtin_msa_fsqrt_w +#define msa_sqrtq_f64 __builtin_msa_fsqrt_d +#define msa_rsqrtq_f32 __builtin_msa_frsqrt_w +#define msa_rsqrtq_f64 __builtin_msa_frsqrt_d + + +/* mlaq: r = a + b * c; */ +__extension__ extern __inline v4i32 +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +msa_mlaq_s32(v4i32 __a, v4i32 __b, v4i32 __c) +{ + __asm__ volatile("maddv.w %w[__a], %w[__b], %w[__c]\n" + // Outputs + : [__a] "+f"(__a) + // Inputs + : [__b] "f"(__b), [__c] "f"(__c)); + return __a; +} + +__extension__ extern __inline v2i64 +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +msa_mlaq_s64(v2i64 __a, v2i64 __b, v2i64 __c) +{ + __asm__ volatile("maddv.d %w[__a], %w[__b], %w[__c]\n" + // Outputs + : [__a] "+f"(__a) + // Inputs + : [__b] "f"(__b), [__c] "f"(__c)); + return __a; +} + +__extension__ extern __inline v4f32 +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +msa_mlaq_f32(v4f32 __a, v4f32 __b, v4f32 __c) +{ + __asm__ volatile("fmadd.w %w[__a], %w[__b], %w[__c]\n" + // Outputs + : [__a] "+f"(__a) + // Inputs + : [__b] "f"(__b), [__c] "f"(__c)); + return __a; +} + +__extension__ extern __inline v2f64 +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +msa_mlaq_f64(v2f64 __a, v2f64 __b, v2f64 __c) +{ + __asm__ volatile("fmadd.d %w[__a], %w[__b], %w[__c]\n" + // Outputs + : [__a] "+f"(__a) + // Inputs + : [__b] "f"(__b), [__c] "f"(__c)); + return __a; +} + +/* cntq */ +#define msa_cntq_s8 __builtin_msa_pcnt_b +#define msa_cntq_s16 __builtin_msa_pcnt_h +#define msa_cntq_s32 __builtin_msa_pcnt_w +#define msa_cntq_s64 __builtin_msa_pcnt_d + +/* bslq (a: mask; r = b(if a == 0); r = c(if a == 1);) */ +#define msa_bslq_u8 __builtin_msa_bsel_v + +/* ilvrq, ilvlq (For EL only, ilvrq: b0, a0, b1, a1; ilvlq: b2, a2, b3, a3;) */ +#define msa_ilvrq_s8 __builtin_msa_ilvr_b +#define 
msa_ilvrq_s16 __builtin_msa_ilvr_h +#define msa_ilvrq_s32 __builtin_msa_ilvr_w +#define msa_ilvrq_s64 __builtin_msa_ilvr_d +#define msa_ilvlq_s8 __builtin_msa_ilvl_b +#define msa_ilvlq_s16 __builtin_msa_ilvl_h +#define msa_ilvlq_s32 __builtin_msa_ilvl_w +#define msa_ilvlq_s64 __builtin_msa_ilvl_d + +/* ilvevq, ilvodq (ilvevq: b0, a0, b2, a2; ilvodq: b1, a1, b3, a3; ) */ +#define msa_ilvevq_s8 __builtin_msa_ilvev_b +#define msa_ilvevq_s16 __builtin_msa_ilvev_h +#define msa_ilvevq_s32 __builtin_msa_ilvev_w +#define msa_ilvevq_s64 __builtin_msa_ilvev_d +#define msa_ilvodq_s8 __builtin_msa_ilvod_b +#define msa_ilvodq_s16 __builtin_msa_ilvod_h +#define msa_ilvodq_s32 __builtin_msa_ilvod_w +#define msa_ilvodq_s64 __builtin_msa_ilvod_d + +/* extq (r = (a || b); a concatenation b and get elements from index c) */ +#ifdef _MIPSEB +#define msa_extq_s8(a, b, c) \ +(__builtin_msa_vshf_b(__builtin_msa_subv_b((v16i8)((v2i64){0x1716151413121110, 0x1F1E1D1C1B1A1918}), __builtin_msa_fill_b(c)), a, b)) +#define msa_extq_s16(a, b, c) \ +(__builtin_msa_vshf_h(__builtin_msa_subv_h((v8i16)((v2i64){0x000B000A00090008, 0x000F000E000D000C}), __builtin_msa_fill_h(c)), a, b)) +#define msa_extq_s32(a, b, c) \ +(__builtin_msa_vshf_w(__builtin_msa_subv_w((v4i32)((v2i64){0x0000000500000004, 0x0000000700000006}), __builtin_msa_fill_w(c)), a, b)) +#define msa_extq_s64(a, b, c) \ +(__builtin_msa_vshf_d(__builtin_msa_subv_d((v2i64){0x0000000000000002, 0x0000000000000003}, __builtin_msa_fill_d(c)), a, b)) +#else +#define msa_extq_s8(a, b, c) \ +(__builtin_msa_vshf_b(__builtin_msa_addv_b((v16i8)((v2i64){0x0706050403020100, 0x0F0E0D0C0B0A0908}), __builtin_msa_fill_b(c)), b, a)) +#define msa_extq_s16(a, b, c) \ +(__builtin_msa_vshf_h(__builtin_msa_addv_h((v8i16)((v2i64){0x0003000200010000, 0x0007000600050004}), __builtin_msa_fill_h(c)), b, a)) +#define msa_extq_s32(a, b, c) \ +(__builtin_msa_vshf_w(__builtin_msa_addv_w((v4i32)((v2i64){0x0000000100000000, 0x0000000300000002}), __builtin_msa_fill_w(c)), b, a)) +#define msa_extq_s64(a, b, c) \ +(__builtin_msa_vshf_d(__builtin_msa_addv_d((v2i64){0x0000000000000000, 0x0000000000000001}, __builtin_msa_fill_d(c)), b, a)) +#endif /* _MIPSEB */ + +/* cvttruncq, cvttintq, cvtrintq */ +#define msa_cvttruncq_u32_f32 __builtin_msa_ftrunc_u_w +#define msa_cvttruncq_s32_f32 __builtin_msa_ftrunc_s_w +#define msa_cvttruncq_u64_f64 __builtin_msa_ftrunc_u_d +#define msa_cvttruncq_s64_f64 __builtin_msa_ftrunc_s_d +#define msa_cvttintq_u32_f32 __builtin_msa_ftint_u_w +#define msa_cvttintq_s32_f32 __builtin_msa_ftint_s_w +#define msa_cvttintq_u64_f64 __builtin_msa_ftint_u_d +#define msa_cvttintq_s64_f64 __builtin_msa_ftint_s_d +#define msa_cvtrintq_f32 __builtin_msa_frint_w +#define msa_cvtrintq_f64 __builtin_msa_frint_d + +/* cvtfintq, cvtfq */ +#define msa_cvtfintq_f32_u32 __builtin_msa_ffint_u_w +#define msa_cvtfintq_f32_s32 __builtin_msa_ffint_s_w +#define msa_cvtfintq_f64_u64 __builtin_msa_ffint_u_d +#define msa_cvtfintq_f64_s64 __builtin_msa_ffint_s_d +#define msa_cvtfq_f32_f64 __builtin_msa_fexdo_w +#define msa_cvtflq_f64_f32 __builtin_msa_fexupr_d +#define msa_cvtfhq_f64_f32 __builtin_msa_fexupl_d + +#define msa_addl_u8(a, b) ((v8u16)__builtin_msa_addv_h((v8i16)V8U8_2_V8I16(a), (v8i16)V8U8_2_V8I16(b))) +#define msa_addl_s8(a, b) (__builtin_msa_addv_h((v8i16)V8I8_2_V8I16(a), (v8i16)V8I8_2_V8I16(b))) +#define msa_addl_u16(a, b) ((v4u32)__builtin_msa_addv_w((v4i32)V4U16_2_V4I32(a), (v4i32)V4U16_2_V4I32(b))) +#define msa_addl_s16(a, b) (__builtin_msa_addv_w((v4i32)V4I16_2_V4I32(a), 
(v4i32)V4I16_2_V4I32(b))) +#define msa_subl_s16(a, b) (__builtin_msa_subv_w((v4i32)V4I16_2_V4I32(a), (v4i32)V4I16_2_V4I32(b))) +#define msa_recpeq_f32 __builtin_msa_frcp_w +#define msa_recpsq_f32(a, b) (__builtin_msa_fsub_w(msa_dupq_n_f32(2.0f), __builtin_msa_fmul_w(a, b))) + +#define MSA_INTERLEAVED_IMPL_LOAD2_STORE2(_Tp, _Tpv, _Tpvs, suffix, df, nlanes) \ +__extension__ extern __inline void \ +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) \ +msa_ld2q_##suffix(const _Tp* ptr, _Tpv* a, _Tpv* b) \ +{ \ + _Tpv v0 = msa_ld1q_##suffix(ptr); \ + _Tpv v1 = msa_ld1q_##suffix(ptr + nlanes); \ + *a = (_Tpv)__builtin_msa_pckev_##df((_Tpvs)v1, (_Tpvs)v0); \ + *b = (_Tpv)__builtin_msa_pckod_##df((_Tpvs)v1, (_Tpvs)v0); \ +} \ +__extension__ extern __inline void \ +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) \ +msa_st2q_##suffix(_Tp* ptr, const _Tpv a, const _Tpv b) \ +{ \ + msa_st1q_##suffix(ptr, (_Tpv)__builtin_msa_ilvr_##df((_Tpvs)b, (_Tpvs)a)); \ + msa_st1q_##suffix(ptr + nlanes, (_Tpv)__builtin_msa_ilvl_##df((_Tpvs)b, (_Tpvs)a)); \ +} + +MSA_INTERLEAVED_IMPL_LOAD2_STORE2(uint8_t, v16u8, v16i8, u8, b, 16) +MSA_INTERLEAVED_IMPL_LOAD2_STORE2(int8_t, v16i8, v16i8, s8, b, 16) +MSA_INTERLEAVED_IMPL_LOAD2_STORE2(uint16_t, v8u16, v8i16, u16, h, 8) +MSA_INTERLEAVED_IMPL_LOAD2_STORE2(int16_t, v8i16, v8i16, s16, h, 8) +MSA_INTERLEAVED_IMPL_LOAD2_STORE2(uint32_t, v4u32, v4i32, u32, w, 4) +MSA_INTERLEAVED_IMPL_LOAD2_STORE2(int32_t, v4i32, v4i32, s32, w, 4) +MSA_INTERLEAVED_IMPL_LOAD2_STORE2(float, v4f32, v4i32, f32, w, 4) +MSA_INTERLEAVED_IMPL_LOAD2_STORE2(uint64_t, v2u64, v2i64, u64, d, 2) +MSA_INTERLEAVED_IMPL_LOAD2_STORE2(int64_t, v2i64, v2i64, s64, d, 2) +MSA_INTERLEAVED_IMPL_LOAD2_STORE2(double, v2f64, v2i64, f64, d, 2) + +#ifdef _MIPSEB +#define MSA_INTERLEAVED_IMPL_LOAD3_8(_Tp, _Tpv, _Tpvs, suffix) \ +__extension__ extern __inline void \ +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) \ +msa_ld3q_##suffix(const _Tp* ptr, _Tpv* a, _Tpv* b, _Tpv* c) \ +{ \ + _Tpv v0 = msa_ld1q_##suffix(ptr); \ + _Tpv v1 = msa_ld1q_##suffix(ptr + 16); \ + _Tpv v2 = msa_ld1q_##suffix(ptr + 32); \ + _Tpvs v3 = __builtin_msa_vshf_b((_Tpvs)((v2i64){0x0704011F1F1F1F1F, 0x1F1C191613100D0A}), (_Tpvs)v0, (_Tpvs)v1); \ + *a = (_Tpv)__builtin_msa_vshf_b((_Tpvs)((v2i64){0x1716150E0B080502, 0x1F1E1D1C1B1A1918}), v3, (_Tpvs)v2); \ + v3 = __builtin_msa_vshf_b((_Tpvs)((v2i64){0x0603001F1F1F1F1F, 0x1E1B1815120F0C09}), (_Tpvs)v0, (_Tpvs)v1); \ + *b = (_Tpv)__builtin_msa_vshf_b((_Tpvs)((v2i64){0x1716150D0A070401, 0x1F1E1D1C1B1A1918}), v3, (_Tpvs)v2); \ + v3 = __builtin_msa_vshf_b((_Tpvs)((v2i64){0x05021F1F1F1F1F1F, 0x1D1A1714110E0B08}), (_Tpvs)v0, (_Tpvs)v1); \ + *c = (_Tpv)__builtin_msa_vshf_b((_Tpvs)((v2i64){0x17160F0C09060300, 0x1F1E1D1C1B1A1918}), v3, (_Tpvs)v2); \ +} +#else +#define MSA_INTERLEAVED_IMPL_LOAD3_8(_Tp, _Tpv, _Tpvs, suffix) \ +__extension__ extern __inline void \ +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) \ +msa_ld3q_##suffix(const _Tp* ptr, _Tpv* a, _Tpv* b, _Tpv* c) \ +{ \ + _Tpv v0 = msa_ld1q_##suffix(ptr); \ + _Tpv v1 = msa_ld1q_##suffix(ptr + 16); \ + _Tpv v2 = msa_ld1q_##suffix(ptr + 32); \ + _Tpvs v3 = __builtin_msa_vshf_b((_Tpvs)((v2i64){0x15120F0C09060300, 0x00000000001E1B18}), (_Tpvs)v1, (_Tpvs)v0); \ + *a = (_Tpv)__builtin_msa_vshf_b((_Tpvs)((v2i64){0x0706050403020100, 0x1D1A1714110A0908}), (_Tpvs)v2, v3); \ + v3 = __builtin_msa_vshf_b((_Tpvs)((v2i64){0x1613100D0A070401, 0x00000000001F1C19}), (_Tpvs)v1, (_Tpvs)v0); \ + *b 
= (_Tpv)__builtin_msa_vshf_b((_Tpvs)((v2i64){0x0706050403020100, 0x1E1B1815120A0908}), (_Tpvs)v2, v3); \ + v3 = __builtin_msa_vshf_b((_Tpvs)((v2i64){0x1714110E0B080502, 0x0000000000001D1A}), (_Tpvs)v1, (_Tpvs)v0); \ + *c = (_Tpv)__builtin_msa_vshf_b((_Tpvs)((v2i64){0x0706050403020100, 0x1F1C191613100908}), (_Tpvs)v2, v3); \ +} +#endif + +MSA_INTERLEAVED_IMPL_LOAD3_8(uint8_t, v16u8, v16i8, u8) +MSA_INTERLEAVED_IMPL_LOAD3_8(int8_t, v16i8, v16i8, s8) + +#ifdef _MIPSEB +#define MSA_INTERLEAVED_IMPL_LOAD3_16(_Tp, _Tpv, _Tpvs, suffix) \ +__extension__ extern __inline void \ +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) \ +msa_ld3q_##suffix(const _Tp* ptr, _Tpv* a, _Tpv* b, _Tpv* c) \ +{ \ + _Tpv v0 = msa_ld1q_##suffix(ptr); \ + _Tpv v1 = msa_ld1q_##suffix(ptr + 8); \ + _Tpv v2 = msa_ld1q_##suffix(ptr + 16); \ + _Tpvs v3 = __builtin_msa_vshf_h((_Tpvs)((v2i64){0x00030000000F000F, 0x000F000C00090006}), (_Tpvs)v1, (_Tpvs)v0); \ + *a = (_Tpv)__builtin_msa_vshf_h((_Tpvs)((v2i64){0x000B000A00050002, 0x000F000E000D000C}), (_Tpvs)v2, v3); \ + v3 = __builtin_msa_vshf_h((_Tpvs)((v2i64){0x0002000F000F000F, 0x000E000B00080005}), (_Tpvs)v1, (_Tpvs)v0); \ + *b = (_Tpv)__builtin_msa_vshf_h((_Tpvs)((v2i64){0x000B000700040001, 0x000F000E000D000C}), (_Tpvs)v2, v3); \ + v3 = __builtin_msa_vshf_h((_Tpvs)((v2i64){0x0001000F000F000F, 0x000D000A00070004}), (_Tpvs)v1, (_Tpvs)v0); \ + *c = (_Tpv)__builtin_msa_vshf_h((_Tpvs)((v2i64){0x000B000600030000, 0x000F000E000D000C}), (_Tpvs)v2, v3); \ +} +#else +#define MSA_INTERLEAVED_IMPL_LOAD3_16(_Tp, _Tpv, _Tpvs, suffix) \ +__extension__ extern __inline void \ +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) \ +msa_ld3q_##suffix(const _Tp* ptr, _Tpv* a, _Tpv* b, _Tpv* c) \ +{ \ + _Tpv v0 = msa_ld1q_##suffix(ptr); \ + _Tpv v1 = msa_ld1q_##suffix(ptr + 8); \ + _Tpv v2 = msa_ld1q_##suffix(ptr + 16); \ + _Tpvs v3 = __builtin_msa_vshf_h((_Tpvs)((v2i64){0x0009000600030000, 0x00000000000F000C}), (_Tpvs)v1, (_Tpvs)v0); \ + *a = (_Tpv)__builtin_msa_vshf_h((_Tpvs)((v2i64){0x0003000200010000, 0x000D000A00050004}), (_Tpvs)v2, v3); \ + v3 = __builtin_msa_vshf_h((_Tpvs)((v2i64){0x000A000700040001, 0x000000000000000D}), (_Tpvs)v1, (_Tpvs)v0); \ + *b = (_Tpv)__builtin_msa_vshf_h((_Tpvs)((v2i64){0x0003000200010000, 0x000E000B00080004}), (_Tpvs)v2, v3); \ + v3 = __builtin_msa_vshf_h((_Tpvs)((v2i64){0x000B000800050002, 0x000000000000000E}), (_Tpvs)v1, (_Tpvs)v0); \ + *c = (_Tpv)__builtin_msa_vshf_h((_Tpvs)((v2i64){0x0003000200010000, 0x000F000C00090004}), (_Tpvs)v2, v3); \ +} +#endif + +MSA_INTERLEAVED_IMPL_LOAD3_16(uint16_t, v8u16, v8i16, u16) +MSA_INTERLEAVED_IMPL_LOAD3_16(int16_t, v8i16, v8i16, s16) + +#define MSA_INTERLEAVED_IMPL_LOAD3_32(_Tp, _Tpv, _Tpvs, suffix) \ +__extension__ extern __inline void \ +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) \ +msa_ld3q_##suffix(const _Tp* ptr, _Tpv* a, _Tpv* b, _Tpv* c) \ +{ \ + _Tpv v00 = msa_ld1q_##suffix(ptr); \ + _Tpv v01 = msa_ld1q_##suffix(ptr + 4); \ + _Tpv v02 = msa_ld1q_##suffix(ptr + 8); \ + _Tpvs v10 = __builtin_msa_ilvr_w((_Tpvs)__builtin_msa_ilvl_d((v2i64)v01, (v2i64)v01), (_Tpvs)v00); \ + _Tpvs v11 = __builtin_msa_ilvr_w((_Tpvs)v02, (_Tpvs)__builtin_msa_ilvl_d((v2i64)v00, (v2i64)v00)); \ + _Tpvs v12 = __builtin_msa_ilvr_w((_Tpvs)__builtin_msa_ilvl_d((v2i64)v02, (v2i64)v02), (_Tpvs)v01); \ + *a = (_Tpv)__builtin_msa_ilvr_w((_Tpvs)__builtin_msa_ilvl_d((v2i64)v11, (v2i64)v11), v10); \ + *b = (_Tpv)__builtin_msa_ilvr_w(v12, (_Tpvs)__builtin_msa_ilvl_d((v2i64)v10, (v2i64)v10)); 
\ + *c = (_Tpv)__builtin_msa_ilvr_w((_Tpvs)__builtin_msa_ilvl_d((v2i64)v12, (v2i64)v12), v11); \ +} + +MSA_INTERLEAVED_IMPL_LOAD3_32(uint32_t, v4u32, v4i32, u32) +MSA_INTERLEAVED_IMPL_LOAD3_32(int32_t, v4i32, v4i32, s32) +MSA_INTERLEAVED_IMPL_LOAD3_32(float, v4f32, v4i32, f32) + +#define MSA_INTERLEAVED_IMPL_LOAD3_64(_Tp, _Tpv, suffix) \ +__extension__ extern __inline void \ +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) \ +msa_ld3q_##suffix(const _Tp* ptr, _Tpv* a, _Tpv* b, _Tpv* c) \ +{ \ + *((_Tp*)a) = *ptr; *((_Tp*)b) = *(ptr + 1); *((_Tp*)c) = *(ptr + 2); \ + *((_Tp*)a + 1) = *(ptr + 3); *((_Tp*)b + 1) = *(ptr + 4); *((_Tp*)c + 1) = *(ptr + 5); \ +} + +MSA_INTERLEAVED_IMPL_LOAD3_64(uint64_t, v2u64, u64) +MSA_INTERLEAVED_IMPL_LOAD3_64(int64_t, v2i64, s64) +MSA_INTERLEAVED_IMPL_LOAD3_64(double, v2f64, f64) + +#ifdef _MIPSEB +#define MSA_INTERLEAVED_IMPL_STORE3_8(_Tp, _Tpv, _Tpvs, suffix) \ +__extension__ extern __inline void \ +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) \ +msa_st3q_##suffix(_Tp* ptr, const _Tpv a, const _Tpv b, const _Tpv c) \ +{ \ + _Tpvs v0 = __builtin_msa_vshf_b((_Tpvs)((v2i64){0x0F0E0D0C0B1F1F1F, 0x1F1E1D1C1B1A1F1F}), (_Tpvs)b, (_Tpvs)a); \ + _Tpvs v1 = __builtin_msa_vshf_b((_Tpvs)((v2i64){0x0D1C140C1B130B1A, 0x1F170F1E160E1D15}), (_Tpvs)c, (_Tpvs)v0); \ + msa_st1q_##suffix(ptr, (_Tpv)v1); \ + v0 = __builtin_msa_vshf_b((_Tpvs)((v2i64){0x0A09080706051F1F, 0x19181716151F1F1F}), (_Tpvs)b, (_Tpvs)a); \ + v1 = __builtin_msa_vshf_b((_Tpvs)((v2i64){0x1D14071C13061B12, 0x170A1F16091E1508}), (_Tpvs)c, (_Tpvs)v0); \ + msa_st1q_##suffix(ptr + 16, (_Tpv)v1); \ + v0 = __builtin_msa_vshf_b((_Tpvs)((v2i64){0x04030201001F1F1F, 0x14131211101F1F1F}), (_Tpvs)b, (_Tpvs)a); \ + v1 = __builtin_msa_vshf_b((_Tpvs)((v2i64){0x15021C14011B1300, 0x051F17041E16031D}), (_Tpvs)c, (_Tpvs)v0); \ + msa_st1q_##suffix(ptr + 32, (_Tpv)v1); \ +} +#else +#define MSA_INTERLEAVED_IMPL_STORE3_8(_Tp, _Tpv, _Tpvs, suffix) \ +__extension__ extern __inline void \ +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) \ +msa_st3q_##suffix(_Tp* ptr, const _Tpv a, const _Tpv b, const _Tpv c) \ +{ \ + _Tpvs v0 = __builtin_msa_vshf_b((_Tpvs)((v2i64){0x0000050403020100, 0x0000001413121110}), (_Tpvs)b, (_Tpvs)a); \ + _Tpvs v1 = __builtin_msa_vshf_b((_Tpvs)((v2i64){0x0A02110901100800, 0x05140C04130B0312}), (_Tpvs)c, (_Tpvs)v0); \ + msa_st1q_##suffix(ptr, (_Tpv)v1); \ + v0 = __builtin_msa_vshf_b((_Tpvs)((v2i64){0x0000000A09080706, 0x00001A1918171615}), (_Tpvs)b, (_Tpvs)a); \ + v1 = __builtin_msa_vshf_b((_Tpvs)((v2i64){0x170A011609001508, 0x0D04190C03180B02}), (_Tpvs)c, (_Tpvs)v0); \ + msa_st1q_##suffix(ptr + 16, (_Tpv)v1); \ + v0 = __builtin_msa_vshf_b((_Tpvs)((v2i64){0x0000000F0E0D0C0B, 0x0000001F1E1D1C1B}), (_Tpvs)b, (_Tpvs)a); \ + v1 = __builtin_msa_vshf_b((_Tpvs)((v2i64){0x021C09011B08001A, 0x1F0C041E0B031D0A}), (_Tpvs)c, (_Tpvs)v0); \ + msa_st1q_##suffix(ptr + 32, (_Tpv)v1); \ +} +#endif + +MSA_INTERLEAVED_IMPL_STORE3_8(uint8_t, v16u8, v16i8, u8) +MSA_INTERLEAVED_IMPL_STORE3_8(int8_t, v16i8, v16i8, s8) + +#ifdef _MIPSEB +#define MSA_INTERLEAVED_IMPL_STORE3_16(_Tp, _Tpv, _Tpvs, suffix) \ +__extension__ extern __inline void \ +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) \ +msa_st3q_##suffix(_Tp* ptr, const _Tpv a, const _Tpv b, const _Tpv c) \ +{ \ + _Tpvs v0 = __builtin_msa_vshf_h((_Tpvs)((v2i64){0x000700060005000F, 0x000F000E000D000F}), (_Tpvs)b, (_Tpvs)a); \ + _Tpvs v1 = __builtin_msa_vshf_h((_Tpvs)((v2i64){0x000A0006000D0009, 
0x000F000B0007000E}), (_Tpvs)c, (_Tpvs)v0); \ + msa_st1q_##suffix(ptr, (_Tpv)v1); \ + v0 = __builtin_msa_vshf_h((_Tpvs)((v2i64){0x00040003000F000F, 0x000C000B000A000F}), (_Tpvs)b, (_Tpvs)a); \ + v1 = __builtin_msa_vshf_h((_Tpvs)((v2i64){0x000E000A0003000D, 0x0005000F000B0004}), (_Tpvs)c, (_Tpvs)v0); \ + msa_st1q_##suffix(ptr + 8, (_Tpv)v1); \ + v0 = __builtin_msa_vshf_h((_Tpvs)((v2i64){0x000200010000000F, 0x00090008000F000F}), (_Tpvs)b, (_Tpvs)a); \ + v1 = __builtin_msa_vshf_h((_Tpvs)((v2i64){0x0001000E00090000, 0x000B0002000F000A}), (_Tpvs)c, (_Tpvs)v0); \ + msa_st1q_##suffix(ptr + 16, (_Tpv)v1); \ +} +#else +#define MSA_INTERLEAVED_IMPL_STORE3_16(_Tp, _Tpv, _Tpvs, suffix) \ +__extension__ extern __inline void \ +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) \ +msa_st3q_##suffix(_Tp* ptr, const _Tpv a, const _Tpv b, const _Tpv c) \ +{ \ + _Tpvs v0 = __builtin_msa_vshf_h((_Tpvs)((v2i64){0x0000000200010000, 0x0000000A00090008}), (_Tpvs)b, (_Tpvs)a); \ + _Tpvs v1 = __builtin_msa_vshf_h((_Tpvs)((v2i64){0x0001000800040000, 0x0006000200090005}), (_Tpvs)c, (_Tpvs)v0); \ + msa_st1q_##suffix(ptr, (_Tpv)v1); \ + v0 = __builtin_msa_vshf_h((_Tpvs)((v2i64){0x0000000500040003, 0x00000000000C000B}), (_Tpvs)b, (_Tpvs)a); \ + v1 = __builtin_msa_vshf_h((_Tpvs)((v2i64){0x000B00040000000A, 0x0002000C00050001}), (_Tpvs)c, (_Tpvs)v0); \ + msa_st1q_##suffix(ptr + 8, (_Tpv)v1); \ + v0 = __builtin_msa_vshf_h((_Tpvs)((v2i64){0x0000000000070006, 0x0000000F000E000D}), (_Tpvs)b, (_Tpvs)a); \ + v1 = __builtin_msa_vshf_h((_Tpvs)((v2i64){0x00050000000D0004, 0x000F00060001000E}), (_Tpvs)c, (_Tpvs)v0); \ + msa_st1q_##suffix(ptr + 16, (_Tpv)v1); \ +} +#endif + +MSA_INTERLEAVED_IMPL_STORE3_16(uint16_t, v8u16, v8i16, u16) +MSA_INTERLEAVED_IMPL_STORE3_16(int16_t, v8i16, v8i16, s16) + +#ifdef _MIPSEB +#define MSA_INTERLEAVED_IMPL_STORE3_32(_Tp, _Tpv, _Tpvs, suffix) \ +__extension__ extern __inline void \ +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) \ +msa_st3q_##suffix(_Tp* ptr, const _Tpv a, const _Tpv b, const _Tpv c) \ +{ \ + _Tpvs v0 = __builtin_msa_vshf_w((_Tpvs)((v2i64){0x0000000300000007, 0x0000000700000006}), (_Tpvs)b, (_Tpvs)a); \ + _Tpvs v1 = __builtin_msa_vshf_w((_Tpvs)((v2i64){0x0000000300000006, 0x0000000700000005}), (_Tpvs)c, (_Tpvs)v0); \ + msa_st1q_##suffix(ptr, (_Tpv)v1); \ + v0 = __builtin_msa_vshf_w((_Tpvs)((v2i64){0x0000000200000001, 0x0000000500000007}), (_Tpvs)b, (_Tpvs)a); \ + v1 = __builtin_msa_vshf_w((_Tpvs)((v2i64){0x0000000700000004, 0x0000000500000002}), (_Tpvs)c, (_Tpvs)v0); \ + msa_st1q_##suffix(ptr + 4, (_Tpv)v1); \ + v0 = __builtin_msa_vshf_w((_Tpvs)((v2i64){0x0000000000000007, 0x0000000400000007}), (_Tpvs)b, (_Tpvs)a); \ + v1 = __builtin_msa_vshf_w((_Tpvs)((v2i64){0x0000000500000000, 0x0000000100000007}), (_Tpvs)c, (_Tpvs)v0); \ + msa_st1q_##suffix(ptr + 8, (_Tpv)v1); \ +} +#else +#define MSA_INTERLEAVED_IMPL_STORE3_32(_Tp, _Tpv, _Tpvs, suffix) \ +__extension__ extern __inline void \ +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) \ +msa_st3q_##suffix(_Tp* ptr, const _Tpv a, const _Tpv b, const _Tpv c) \ +{ \ + _Tpvs v0 = __builtin_msa_vshf_w((_Tpvs)((v2i64){0x0000000100000000, 0x0000000000000004}), (_Tpvs)b, (_Tpvs)a); \ + _Tpvs v1 = __builtin_msa_vshf_w((_Tpvs)((v2i64){0x0000000200000000, 0x0000000100000004}), (_Tpvs)c, (_Tpvs)v0); \ + msa_st1q_##suffix(ptr, (_Tpv)v1); \ + v0 = __builtin_msa_vshf_w((_Tpvs)((v2i64){0x0000000000000002, 0x0000000600000005}), (_Tpvs)b, (_Tpvs)a); \ + v1 = 
__builtin_msa_vshf_w((_Tpvs)((v2i64){0x0000000500000002, 0x0000000300000000}), (_Tpvs)c, (_Tpvs)v0); \ + msa_st1q_##suffix(ptr + 4, (_Tpv)v1); \ + v0 = __builtin_msa_vshf_w((_Tpvs)((v2i64){0x0000000000000003, 0x0000000000000007}), (_Tpvs)b, (_Tpvs)a); \ + v1 = __builtin_msa_vshf_w((_Tpvs)((v2i64){0x0000000000000006, 0x0000000700000002}), (_Tpvs)c, (_Tpvs)v0); \ + msa_st1q_##suffix(ptr + 8, (_Tpv)v1); \ +} +#endif + +MSA_INTERLEAVED_IMPL_STORE3_32(uint32_t, v4u32, v4i32, u32) +MSA_INTERLEAVED_IMPL_STORE3_32(int32_t, v4i32, v4i32, s32) +MSA_INTERLEAVED_IMPL_STORE3_32(float, v4f32, v4i32, f32) + +#define MSA_INTERLEAVED_IMPL_STORE3_64(_Tp, _Tpv, suffix) \ +__extension__ extern __inline void \ +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) \ +msa_st3q_##suffix(_Tp* ptr, const _Tpv a, const _Tpv b, const _Tpv c) \ +{ \ + *ptr = a[0]; *(ptr + 1) = b[0]; *(ptr + 2) = c[0]; \ + *(ptr + 3) = a[1]; *(ptr + 4) = b[1]; *(ptr + 5) = c[1]; \ +} + +MSA_INTERLEAVED_IMPL_STORE3_64(uint64_t, v2u64, u64) +MSA_INTERLEAVED_IMPL_STORE3_64(int64_t, v2i64, s64) +MSA_INTERLEAVED_IMPL_STORE3_64(double, v2f64, f64) + +#define MSA_INTERLEAVED_IMPL_LOAD4_STORE4(_Tp, _Tpv, _Tpvs, suffix, df, nlanes) \ +__extension__ extern __inline void \ +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) \ +msa_ld4q_##suffix(const _Tp* ptr, _Tpv* a, _Tpv* b, _Tpv* c, _Tpv* d) \ +{ \ + _Tpv v0 = msa_ld1q_##suffix(ptr); \ + _Tpv v1 = msa_ld1q_##suffix(ptr + nlanes); \ + _Tpv v2 = msa_ld1q_##suffix(ptr + nlanes * 2); \ + _Tpv v3 = msa_ld1q_##suffix(ptr + nlanes * 3); \ + _Tpvs t0 = __builtin_msa_pckev_##df((_Tpvs)v1, (_Tpvs)v0); \ + _Tpvs t1 = __builtin_msa_pckev_##df((_Tpvs)v3, (_Tpvs)v2); \ + _Tpvs t2 = __builtin_msa_pckod_##df((_Tpvs)v1, (_Tpvs)v0); \ + _Tpvs t3 = __builtin_msa_pckod_##df((_Tpvs)v3, (_Tpvs)v2); \ + *a = (_Tpv)__builtin_msa_pckev_##df(t1, t0); \ + *b = (_Tpv)__builtin_msa_pckev_##df(t3, t2); \ + *c = (_Tpv)__builtin_msa_pckod_##df(t1, t0); \ + *d = (_Tpv)__builtin_msa_pckod_##df(t3, t2); \ +} \ +__extension__ extern __inline void \ +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) \ +msa_st4q_##suffix(_Tp* ptr, const _Tpv a, const _Tpv b, const _Tpv c, const _Tpv d) \ +{ \ + _Tpvs v0 = __builtin_msa_ilvr_##df((_Tpvs)c, (_Tpvs)a); \ + _Tpvs v1 = __builtin_msa_ilvr_##df((_Tpvs)d, (_Tpvs)b); \ + _Tpvs v2 = __builtin_msa_ilvl_##df((_Tpvs)c, (_Tpvs)a); \ + _Tpvs v3 = __builtin_msa_ilvl_##df((_Tpvs)d, (_Tpvs)b); \ + msa_st1q_##suffix(ptr, (_Tpv)__builtin_msa_ilvr_##df(v1, v0)); \ + msa_st1q_##suffix(ptr + nlanes, (_Tpv)__builtin_msa_ilvl_##df(v1, v0)); \ + msa_st1q_##suffix(ptr + 2 * nlanes, (_Tpv)__builtin_msa_ilvr_##df(v3, v2)); \ + msa_st1q_##suffix(ptr + 3 * nlanes, (_Tpv)__builtin_msa_ilvl_##df(v3, v2)); \ +} + +MSA_INTERLEAVED_IMPL_LOAD4_STORE4(uint8_t, v16u8, v16i8, u8, b, 16) +MSA_INTERLEAVED_IMPL_LOAD4_STORE4(int8_t, v16i8, v16i8, s8, b, 16) +MSA_INTERLEAVED_IMPL_LOAD4_STORE4(uint16_t, v8u16, v8i16, u16, h, 8) +MSA_INTERLEAVED_IMPL_LOAD4_STORE4(int16_t, v8i16, v8i16, s16, h, 8) +MSA_INTERLEAVED_IMPL_LOAD4_STORE4(uint32_t, v4u32, v4i32, u32, w, 4) +MSA_INTERLEAVED_IMPL_LOAD4_STORE4(int32_t, v4i32, v4i32, s32, w, 4) +MSA_INTERLEAVED_IMPL_LOAD4_STORE4(float, v4f32, v4i32, f32, w, 4) + +#define MSA_INTERLEAVED_IMPL_LOAD4_STORE4_64(_Tp, _Tpv, _Tpvs, suffix) \ +__extension__ extern __inline void \ +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) \ +msa_ld4q_##suffix(const _Tp* ptr, _Tpv* a, _Tpv* b, _Tpv* c, _Tpv* d) \ +{ \ + _Tpv v0 = 
msa_ld1q_##suffix(ptr); \ + _Tpv v1 = msa_ld1q_##suffix(ptr + 2); \ + _Tpv v2 = msa_ld1q_##suffix(ptr + 4); \ + _Tpv v3 = msa_ld1q_##suffix(ptr + 6); \ + *a = (_Tpv)__builtin_msa_ilvr_d((_Tpvs)v2, (_Tpvs)v0); \ + *b = (_Tpv)__builtin_msa_ilvl_d((_Tpvs)v2, (_Tpvs)v0); \ + *c = (_Tpv)__builtin_msa_ilvr_d((_Tpvs)v3, (_Tpvs)v1); \ + *d = (_Tpv)__builtin_msa_ilvl_d((_Tpvs)v3, (_Tpvs)v1); \ +} \ +__extension__ extern __inline void \ +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) \ +msa_st4q_##suffix(_Tp* ptr, const _Tpv a, const _Tpv b, const _Tpv c, const _Tpv d) \ +{ \ + msa_st1q_##suffix(ptr, (_Tpv)__builtin_msa_ilvr_d((_Tpvs)b, (_Tpvs)a)); \ + msa_st1q_##suffix(ptr + 2, (_Tpv)__builtin_msa_ilvr_d((_Tpvs)d, (_Tpvs)c)); \ + msa_st1q_##suffix(ptr + 4, (_Tpv)__builtin_msa_ilvl_d((_Tpvs)b, (_Tpvs)a)); \ + msa_st1q_##suffix(ptr + 6, (_Tpv)__builtin_msa_ilvl_d((_Tpvs)d, (_Tpvs)c)); \ +} + +MSA_INTERLEAVED_IMPL_LOAD4_STORE4_64(uint64_t, v2u64, v2i64, u64) +MSA_INTERLEAVED_IMPL_LOAD4_STORE4_64(int64_t, v2i64, v2i64, s64) +MSA_INTERLEAVED_IMPL_LOAD4_STORE4_64(double, v2f64, v2i64, f64) + +__extension__ extern __inline v8i16 +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +msa_qdmulhq_n_s16(v8i16 a, int16_t b) +{ + v8i16 a_lo, a_hi; + ILVRL_H2_SH(a, msa_dupq_n_s16(0), a_lo, a_hi); + return msa_packr_s32(msa_shlq_n_s32(msa_mulq_s32(msa_paddlq_s16(a_lo), msa_dupq_n_s32(b)), 1), + msa_shlq_n_s32(msa_mulq_s32(msa_paddlq_s16(a_hi), msa_dupq_n_s32(b)), 1), 16); +} + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif /*__mips_msa*/ +#endif /* OPENCV_CORE_MSA_MACROS_H */ diff --git a/3rdparty/opencv/include/opencv2/core/hal/simd_utils.impl.hpp b/3rdparty/opencv/include/opencv2/core/hal/simd_utils.impl.hpp new file mode 100644 index 00000000..fff8f942 --- /dev/null +++ b/3rdparty/opencv/include/opencv2/core/hal/simd_utils.impl.hpp @@ -0,0 +1,146 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html + +// This header is not standalone. Don't include directly, use "intrin.hpp" instead. 
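A minimal usage sketch, not taken from the upstream headers: once user code includes "opencv2/core/hal/intrin.hpp" as the comment above instructs, the width-agnostic helpers this implementation file provides (notably the templated vx_setall dispatch defined further below) combine with the regular universal intrinsics roughly as follows. The function name scale_buffer and the loop structure are assumptions for illustration, and the exact operator set available depends on which SIMD backend the build enables.

    #include <opencv2/core/hal/intrin.hpp>

    // Hypothetical helper: scale a float buffer using whichever register width
    // (128/256/512-bit) the build selects, with a scalar tail for the remainder.
    static void scale_buffer(float* data, int n, float factor)
    {
    #if CV_SIMD
        const int step = cv::v_float32::nlanes;          // lanes of the widest enabled vector type
        cv::v_float32 vfactor = cv::vx_setall(factor);   // templated vx_setall<> from this header
        int i = 0;
        for (; i + step <= n; i += step)
            cv::v_store(data + i, cv::vx_load(data + i) * vfactor);
        for (; i < n; ++i)                               // scalar tail
            data[i] *= factor;
    #else
        for (int i = 0; i < n; ++i)
            data[i] *= factor;
    #endif
    }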
+#ifdef OPENCV_HAL_INTRIN_HPP // defined in intrin.hpp + + +#if CV_SIMD128 || CV_SIMD128_CPP + +template struct Type2Vec128_Traits; +#define CV_INTRIN_DEF_TYPE2VEC128_TRAITS(type_, vec_type_) \ + template<> struct Type2Vec128_Traits \ + { \ + typedef vec_type_ vec_type; \ + } + +CV_INTRIN_DEF_TYPE2VEC128_TRAITS(uchar, v_uint8x16); +CV_INTRIN_DEF_TYPE2VEC128_TRAITS(schar, v_int8x16); +CV_INTRIN_DEF_TYPE2VEC128_TRAITS(ushort, v_uint16x8); +CV_INTRIN_DEF_TYPE2VEC128_TRAITS(short, v_int16x8); +CV_INTRIN_DEF_TYPE2VEC128_TRAITS(unsigned, v_uint32x4); +CV_INTRIN_DEF_TYPE2VEC128_TRAITS(int, v_int32x4); +CV_INTRIN_DEF_TYPE2VEC128_TRAITS(float, v_float32x4); +CV_INTRIN_DEF_TYPE2VEC128_TRAITS(uint64, v_uint64x2); +CV_INTRIN_DEF_TYPE2VEC128_TRAITS(int64, v_int64x2); +#if CV_SIMD128_64F +CV_INTRIN_DEF_TYPE2VEC128_TRAITS(double, v_float64x2); +#endif + +template static inline +typename Type2Vec128_Traits<_T>::vec_type v_setall(const _T& a); + +template<> inline Type2Vec128_Traits< uchar>::vec_type v_setall< uchar>(const uchar& a) { return v_setall_u8(a); } +template<> inline Type2Vec128_Traits< schar>::vec_type v_setall< schar>(const schar& a) { return v_setall_s8(a); } +template<> inline Type2Vec128_Traits::vec_type v_setall(const ushort& a) { return v_setall_u16(a); } +template<> inline Type2Vec128_Traits< short>::vec_type v_setall< short>(const short& a) { return v_setall_s16(a); } +template<> inline Type2Vec128_Traits< uint>::vec_type v_setall< uint>(const uint& a) { return v_setall_u32(a); } +template<> inline Type2Vec128_Traits< int>::vec_type v_setall< int>(const int& a) { return v_setall_s32(a); } +template<> inline Type2Vec128_Traits::vec_type v_setall(const uint64& a) { return v_setall_u64(a); } +template<> inline Type2Vec128_Traits< int64>::vec_type v_setall< int64>(const int64& a) { return v_setall_s64(a); } +template<> inline Type2Vec128_Traits< float>::vec_type v_setall< float>(const float& a) { return v_setall_f32(a); } +#if CV_SIMD128_64F +template<> inline Type2Vec128_Traits::vec_type v_setall(const double& a) { return v_setall_f64(a); } +#endif + +#endif // SIMD128 + + +#if CV_SIMD256 + +template struct Type2Vec256_Traits; +#define CV_INTRIN_DEF_TYPE2VEC256_TRAITS(type_, vec_type_) \ + template<> struct Type2Vec256_Traits \ + { \ + typedef vec_type_ vec_type; \ + } + +CV_INTRIN_DEF_TYPE2VEC256_TRAITS(uchar, v_uint8x32); +CV_INTRIN_DEF_TYPE2VEC256_TRAITS(schar, v_int8x32); +CV_INTRIN_DEF_TYPE2VEC256_TRAITS(ushort, v_uint16x16); +CV_INTRIN_DEF_TYPE2VEC256_TRAITS(short, v_int16x16); +CV_INTRIN_DEF_TYPE2VEC256_TRAITS(unsigned, v_uint32x8); +CV_INTRIN_DEF_TYPE2VEC256_TRAITS(int, v_int32x8); +CV_INTRIN_DEF_TYPE2VEC256_TRAITS(float, v_float32x8); +CV_INTRIN_DEF_TYPE2VEC256_TRAITS(uint64, v_uint64x4); +CV_INTRIN_DEF_TYPE2VEC256_TRAITS(int64, v_int64x4); +#if CV_SIMD256_64F +CV_INTRIN_DEF_TYPE2VEC256_TRAITS(double, v_float64x4); +#endif + +template static inline +typename Type2Vec256_Traits<_T>::vec_type v256_setall(const _T& a); + +template<> inline Type2Vec256_Traits< uchar>::vec_type v256_setall< uchar>(const uchar& a) { return v256_setall_u8(a); } +template<> inline Type2Vec256_Traits< schar>::vec_type v256_setall< schar>(const schar& a) { return v256_setall_s8(a); } +template<> inline Type2Vec256_Traits::vec_type v256_setall(const ushort& a) { return v256_setall_u16(a); } +template<> inline Type2Vec256_Traits< short>::vec_type v256_setall< short>(const short& a) { return v256_setall_s16(a); } +template<> inline Type2Vec256_Traits< uint>::vec_type v256_setall< uint>(const uint& a) { return 
v256_setall_u32(a); } +template<> inline Type2Vec256_Traits< int>::vec_type v256_setall< int>(const int& a) { return v256_setall_s32(a); } +template<> inline Type2Vec256_Traits::vec_type v256_setall(const uint64& a) { return v256_setall_u64(a); } +template<> inline Type2Vec256_Traits< int64>::vec_type v256_setall< int64>(const int64& a) { return v256_setall_s64(a); } +template<> inline Type2Vec256_Traits< float>::vec_type v256_setall< float>(const float& a) { return v256_setall_f32(a); } +#if CV_SIMD256_64F +template<> inline Type2Vec256_Traits::vec_type v256_setall(const double& a) { return v256_setall_f64(a); } +#endif + +#endif // SIMD256 + + +#if CV_SIMD512 + +template struct Type2Vec512_Traits; +#define CV_INTRIN_DEF_TYPE2VEC512_TRAITS(type_, vec_type_) \ + template<> struct Type2Vec512_Traits \ + { \ + typedef vec_type_ vec_type; \ + } + +CV_INTRIN_DEF_TYPE2VEC512_TRAITS(uchar, v_uint8x64); +CV_INTRIN_DEF_TYPE2VEC512_TRAITS(schar, v_int8x64); +CV_INTRIN_DEF_TYPE2VEC512_TRAITS(ushort, v_uint16x32); +CV_INTRIN_DEF_TYPE2VEC512_TRAITS(short, v_int16x32); +CV_INTRIN_DEF_TYPE2VEC512_TRAITS(unsigned, v_uint32x16); +CV_INTRIN_DEF_TYPE2VEC512_TRAITS(int, v_int32x16); +CV_INTRIN_DEF_TYPE2VEC512_TRAITS(float, v_float32x16); +CV_INTRIN_DEF_TYPE2VEC512_TRAITS(uint64, v_uint64x8); +CV_INTRIN_DEF_TYPE2VEC512_TRAITS(int64, v_int64x8); +#if CV_SIMD512_64F +CV_INTRIN_DEF_TYPE2VEC512_TRAITS(double, v_float64x8); +#endif + +template static inline +typename Type2Vec512_Traits<_T>::vec_type v512_setall(const _T& a); + +template<> inline Type2Vec512_Traits< uchar>::vec_type v512_setall< uchar>(const uchar& a) { return v512_setall_u8(a); } +template<> inline Type2Vec512_Traits< schar>::vec_type v512_setall< schar>(const schar& a) { return v512_setall_s8(a); } +template<> inline Type2Vec512_Traits::vec_type v512_setall(const ushort& a) { return v512_setall_u16(a); } +template<> inline Type2Vec512_Traits< short>::vec_type v512_setall< short>(const short& a) { return v512_setall_s16(a); } +template<> inline Type2Vec512_Traits< uint>::vec_type v512_setall< uint>(const uint& a) { return v512_setall_u32(a); } +template<> inline Type2Vec512_Traits< int>::vec_type v512_setall< int>(const int& a) { return v512_setall_s32(a); } +template<> inline Type2Vec512_Traits::vec_type v512_setall(const uint64& a) { return v512_setall_u64(a); } +template<> inline Type2Vec512_Traits< int64>::vec_type v512_setall< int64>(const int64& a) { return v512_setall_s64(a); } +template<> inline Type2Vec512_Traits< float>::vec_type v512_setall< float>(const float& a) { return v512_setall_f32(a); } +#if CV_SIMD512_64F +template<> inline Type2Vec512_Traits::vec_type v512_setall(const double& a) { return v512_setall_f64(a); } +#endif + +#endif // SIMD512 + + +#if CV_SIMD_WIDTH == 16 +template static inline +typename Type2Vec128_Traits<_T>::vec_type vx_setall(const _T& a) { return v_setall(a); } +#elif CV_SIMD_WIDTH == 32 +template static inline +typename Type2Vec256_Traits<_T>::vec_type vx_setall(const _T& a) { return v256_setall(a); } +#elif CV_SIMD_WIDTH == 64 +template static inline +typename Type2Vec512_Traits<_T>::vec_type vx_setall(const _T& a) { return v512_setall(a); } +#else +#error "Build configuration error, unsupported CV_SIMD_WIDTH" +#endif + + +#endif // OPENCV_HAL_INTRIN_HPP diff --git a/3rdparty/opencv/include/opencv2/core/simd_intrinsics.hpp b/3rdparty/opencv/include/opencv2/core/simd_intrinsics.hpp new file mode 100644 index 00000000..309202d1 --- /dev/null +++ b/3rdparty/opencv/include/opencv2/core/simd_intrinsics.hpp @@ 
-0,0 +1,87 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. + +#ifndef OPENCV_CORE_SIMD_INTRINSICS_HPP +#define OPENCV_CORE_SIMD_INTRINSICS_HPP + +/** +Helper header to support SIMD intrinsics (universal intrinsics) in user code. +Intrinsics documentation: https://docs.opencv.org/3.4/df/d91/group__core__hal__intrin.html + + +Checks of target CPU instruction set based on compiler definitions don't work well enough. +More reliable solutions require utilization of configuration systems (like CMake). + +So, probably you need to specify your own configuration. + +You can do that via CMake in this way: + add_definitions(/DOPENCV_SIMD_CONFIG_HEADER=opencv_simd_config_custom.hpp) +or + add_definitions(/DOPENCV_SIMD_CONFIG_INCLUDE_DIR=1) + +Additionally you may need to add include directory to your files: + include_directories("${CMAKE_CURRENT_LIST_DIR}/opencv_config_${MYTARGET}") + +These files can be pre-generated for target configurations of your application +or generated by CMake on the fly (use CMAKE_BINARY_DIR for that). + +Notes: +- H/W capability checks are still responsibility of your application +- runtime dispatching is not covered by this helper header +*/ + +#ifdef __OPENCV_BUILD +#error "Use core/hal/intrin.hpp during OpenCV build" +#endif + +#ifdef OPENCV_HAL_INTRIN_HPP +#error "core/simd_intrinsics.hpp must be included before core/hal/intrin.hpp" +#endif + +#include "opencv2/core/cvdef.h" + +#ifdef OPENCV_SIMD_CONFIG_HEADER +#include CVAUX_STR(OPENCV_SIMD_CONFIG_HEADER) +#elif defined(OPENCV_SIMD_CONFIG_INCLUDE_DIR) +#include "opencv_simd_config.hpp" // corresponding directory should be added via -I compiler parameter +#else // custom config headers + +#if (!defined(CV_AVX_512F) || !CV_AVX_512F) && (defined(__AVX512__) || defined(__AVX512F__)) +# include +# undef CV_AVX_512F +# define CV_AVX_512F 1 +# ifndef OPENCV_SIMD_DONT_ASSUME_SKX // Skylake-X with AVX-512F/CD/BW/DQ/VL +# undef CV_AVX512_SKX +# define CV_AVX512_SKX 1 +# undef CV_AVX_512CD +# define CV_AVX_512CD 1 +# undef CV_AVX_512BW +# define CV_AVX_512BW 1 +# undef CV_AVX_512DQ +# define CV_AVX_512DQ 1 +# undef CV_AVX_512VL +# define CV_AVX_512VL 1 +# endif +#endif // AVX512 + +// GCC/Clang: -mavx2 +// MSVC: /arch:AVX2 +#if defined __AVX2__ +# include +# undef CV_AVX2 +# define CV_AVX2 1 +# if defined __F16C__ +# undef CV_FP16 +# define CV_FP16 1 +# endif +#endif + +#endif + +// SSE / NEON / VSX is handled by cv_cpu_dispatch.h compatibility block +#include "cv_cpu_dispatch.h" + +#include "hal/intrin.hpp" + +#endif // OPENCV_CORE_SIMD_INTRINSICS_HPP diff --git a/3rdparty/opencv/include/opencv2/core/utils/allocator_stats.hpp b/3rdparty/opencv/include/opencv2/core/utils/allocator_stats.hpp new file mode 100644 index 00000000..79e93382 --- /dev/null +++ b/3rdparty/opencv/include/opencv2/core/utils/allocator_stats.hpp @@ -0,0 +1,29 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. 
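A short consumer-side sketch, not taken from the upstream headers: the statistics interface declared just below exposes only read accessors plus resetPeakUsage(), so a typical client looks roughly like this. dumpAllocatorStats and the way the interface reference is obtained are assumptions; concrete AllocatorStatistics instances live in the implementation header that follows, and no accessor is declared here.

    #include <iostream>
    #include "opencv2/core/utils/allocator_stats.hpp"

    // Hypothetical reporting helper built only on the virtual methods declared below.
    static void dumpAllocatorStats(cv::utils::AllocatorStatisticsInterface& stats)
    {
        std::cout << "current bytes: "  << stats.getCurrentUsage()
                  << ", peak bytes: "   << stats.getPeakUsage()
                  << ", total bytes: "  << stats.getTotalUsage()
                  << ", allocations: "  << stats.getNumberOfAllocations() << std::endl;
        stats.resetPeakUsage();  // peak := current usage, as documented on the interface
    }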
diff --git a/3rdparty/opencv/include/opencv2/core/utils/allocator_stats.hpp b/3rdparty/opencv/include/opencv2/core/utils/allocator_stats.hpp
new file mode 100644
index 00000000..79e93382
--- /dev/null
+++ b/3rdparty/opencv/include/opencv2/core/utils/allocator_stats.hpp
@@ -0,0 +1,29 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_CORE_ALLOCATOR_STATS_HPP
+#define OPENCV_CORE_ALLOCATOR_STATS_HPP
+
+#include "../cvdef.h"
+
+namespace cv { namespace utils {
+
+class AllocatorStatisticsInterface
+{
+protected:
+    AllocatorStatisticsInterface() {}
+    virtual ~AllocatorStatisticsInterface() {}
+public:
+    virtual uint64_t getCurrentUsage() const = 0;
+    virtual uint64_t getTotalUsage() const = 0;
+    virtual uint64_t getNumberOfAllocations() const = 0;
+    virtual uint64_t getPeakUsage() const = 0;
+
+    /** set peak usage = current usage */
+    virtual void resetPeakUsage() = 0;
+};
+
+}} // namespace
+
+#endif // OPENCV_CORE_ALLOCATOR_STATS_HPP
diff --git a/3rdparty/opencv/include/opencv2/core/utils/allocator_stats.impl.hpp b/3rdparty/opencv/include/opencv2/core/utils/allocator_stats.impl.hpp
new file mode 100644
index 00000000..eb5ecde1
--- /dev/null
+++ b/3rdparty/opencv/include/opencv2/core/utils/allocator_stats.impl.hpp
@@ -0,0 +1,158 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_CORE_ALLOCATOR_STATS_IMPL_HPP
+#define OPENCV_CORE_ALLOCATOR_STATS_IMPL_HPP
+
+#include "./allocator_stats.hpp"
+
+//#define OPENCV_DISABLE_ALLOCATOR_STATS
+
+#ifdef CV_CXX11
+
+#include <atomic>
+
+#ifndef OPENCV_ALLOCATOR_STATS_COUNTER_TYPE
+#if defined(__GNUC__) && (\
+        (defined(__SIZEOF_POINTER__) && __SIZEOF_POINTER__ == 4) || \
+        (defined(__GCC_HAVE_SYNC_COMPARE_AND_SWAP_4) && !defined(__GCC_HAVE_SYNC_COMPARE_AND_SWAP_8)) \
+    )
+#define OPENCV_ALLOCATOR_STATS_COUNTER_TYPE int
+#endif
+#endif
+
+#ifndef OPENCV_ALLOCATOR_STATS_COUNTER_TYPE
+#define OPENCV_ALLOCATOR_STATS_COUNTER_TYPE long long
+#endif
+
+#else // CV_CXX11
+
+#ifndef OPENCV_ALLOCATOR_STATS_COUNTER_TYPE
+#define OPENCV_ALLOCATOR_STATS_COUNTER_TYPE int  // CV_XADD supports int only
+#endif
+
+#endif // CV_CXX11
+
+namespace cv { namespace utils {
+
+#ifdef CV__ALLOCATOR_STATS_LOG
+namespace {
+#endif
+
+class AllocatorStatistics : public AllocatorStatisticsInterface
+{
+#ifdef OPENCV_DISABLE_ALLOCATOR_STATS
+
+public:
+    AllocatorStatistics() {}
+    ~AllocatorStatistics() CV_OVERRIDE {}
+
+    uint64_t getCurrentUsage() const CV_OVERRIDE { return 0; }
+    uint64_t getTotalUsage() const CV_OVERRIDE { return 0; }
+    uint64_t getNumberOfAllocations() const CV_OVERRIDE { return 0; }
+    uint64_t getPeakUsage() const CV_OVERRIDE { return 0; }
+
+    /** set peak usage = current usage */
+    void resetPeakUsage() CV_OVERRIDE {};
+
+    void onAllocate(size_t /*sz*/) {}
+    void onFree(size_t /*sz*/) {}
+
+#elif defined(CV_CXX11)
+
+protected:
+    typedef OPENCV_ALLOCATOR_STATS_COUNTER_TYPE counter_t;
+    std::atomic<counter_t> curr, total, total_allocs, peak;
+public:
+    AllocatorStatistics() {}
+    ~AllocatorStatistics() CV_OVERRIDE {}
+
+    uint64_t getCurrentUsage() const CV_OVERRIDE { return (uint64_t)curr.load(); }
+    uint64_t getTotalUsage() const CV_OVERRIDE { return (uint64_t)total.load(); }
+    uint64_t getNumberOfAllocations() const CV_OVERRIDE { return (uint64_t)total_allocs.load(); }
+    uint64_t getPeakUsage() const CV_OVERRIDE { return (uint64_t)peak.load(); }
+
+    /** set peak usage = current usage */
+    void resetPeakUsage() CV_OVERRIDE { peak.store(curr.load()); }
+
+    // Controller interface
+    void onAllocate(size_t sz)
+    {
+#ifdef CV__ALLOCATOR_STATS_LOG
+        CV__ALLOCATOR_STATS_LOG(cv::format("allocate: %lld (curr=%lld)", (long long int)sz, (long long int)curr.load()));
+#endif
+
+        counter_t new_curr = curr.fetch_add((counter_t)sz) + (counter_t)sz;
+
+        // peak = std::max((uint64_t)peak, new_curr);
+        auto prev_peak = peak.load();
+        while (prev_peak < new_curr)
+        {
+            if (peak.compare_exchange_weak(prev_peak, new_curr))
+                break;
+        }
+        // end of peak = max(...)
+
+        total += (counter_t)sz;
+        total_allocs++;
+    }
+    void onFree(size_t sz)
+    {
+#ifdef CV__ALLOCATOR_STATS_LOG
+        CV__ALLOCATOR_STATS_LOG(cv::format("free: %lld (curr=%lld)", (long long int)sz, (long long int)curr.load()));
+#endif
+        curr -= (counter_t)sz;
+    }
+
+#else // non C++11
+
+protected:
+    typedef OPENCV_ALLOCATOR_STATS_COUNTER_TYPE counter_t;
+    volatile counter_t curr, total, total_allocs, peak;  // overflow is possible, CV_XADD operates with 'int' only
+public:
+    AllocatorStatistics()
+        : curr(0), total(0), total_allocs(0), peak(0)
+    {}
+    ~AllocatorStatistics() CV_OVERRIDE {}
+
+    uint64_t getCurrentUsage() const CV_OVERRIDE { return (uint64_t)curr; }
+    uint64_t getTotalUsage() const CV_OVERRIDE { return (uint64_t)total; }
+    uint64_t getNumberOfAllocations() const CV_OVERRIDE { return (uint64_t)total_allocs; }
+    uint64_t getPeakUsage() const CV_OVERRIDE { return (uint64_t)peak; }
+
+    void resetPeakUsage() CV_OVERRIDE { peak = curr; }
+
+    // Controller interface
+    void onAllocate(size_t sz)
+    {
+#ifdef CV__ALLOCATOR_STATS_LOG
+        CV__ALLOCATOR_STATS_LOG(cv::format("allocate: %lld (curr=%lld)", (long long int)sz, (long long int)curr));
+#endif
+
+        counter_t new_curr = (counter_t)CV_XADD(&curr, (counter_t)sz) + (counter_t)sz;
+
+        peak = std::max((counter_t)peak, new_curr);  // non-thread safe
+
+        //CV_XADD(&total, (uint64_t)sz);  // overflow with int, non-reliable...
+        total += sz;
+
+        CV_XADD(&total_allocs, (counter_t)1);
+    }
+    void onFree(size_t sz)
+    {
+#ifdef CV__ALLOCATOR_STATS_LOG
+        CV__ALLOCATOR_STATS_LOG(cv::format("free: %lld (curr=%lld)", (long long int)sz, (long long int)curr));
+#endif
+        CV_XADD(&curr, (counter_t)-sz);
+    }
+#endif
+};
+
+#ifdef CV__ALLOCATOR_STATS_LOG
+} // namespace
+#endif
+
+}} // namespace
+
+#endif // OPENCV_CORE_ALLOCATOR_STATS_IMPL_HPP
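A brief usage sketch (illustration only, not part of the patch): a hypothetical malloc/free wrapper that reports to the controller interface above. It assumes a C++11 build (the std::atomic branch) and that CV__ALLOCATOR_STATS_LOG is not defined; tracked_malloc/tracked_free and the global instance are made-up names.

    #include <cstdlib>
    #include "opencv2/core/utils/allocator_stats.impl.hpp"

    static cv::utils::AllocatorStatistics g_stats;   // hypothetical statistics instance

    void* tracked_malloc(size_t sz)
    {
        void* p = std::malloc(sz);
        if (p)
            g_stats.onAllocate(sz);   // bumps curr/total/total_allocs and the lock-free peak
        return p;
    }

    void tracked_free(void* p, size_t sz)
    {
        if (p)
        {
            g_stats.onFree(sz);       // decrements the current-usage counter
            std::free(p);
        }
    }

    // g_stats.getPeakUsage() / g_stats.resetPeakUsage() can then be polled by monitoring code.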
diff --git a/3rdparty/opencv/include/opencv2/core/utils/instrumentation.hpp b/3rdparty/opencv/include/opencv2/core/utils/instrumentation.hpp
new file mode 100644
index 00000000..36398670
--- /dev/null
+++ b/3rdparty/opencv/include/opencv2/core/utils/instrumentation.hpp
@@ -0,0 +1,125 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_UTILS_INSTR_HPP
+#define OPENCV_UTILS_INSTR_HPP
+
+#include <opencv2/core/utility.hpp>
+#include <opencv2/core/utils/tls.hpp>
+
+namespace cv {
+
+//! @addtogroup core_utils
+//! @{
+
+#ifdef CV_COLLECT_IMPL_DATA
+CV_EXPORTS void setImpl(int flags); // set implementation flags and reset storage arrays
+CV_EXPORTS void addImpl(int flag, const char* func = 0); // add implementation and function name to storage arrays
+// Get stored implementation flags and functions names arrays
+// Each implementation entry correspond to function name entry, so you can find which implementation was executed in which function
+CV_EXPORTS int getImpl(std::vector<int> &impl, std::vector<String> &funName);
+
+CV_EXPORTS bool useCollection(); // return implementation collection state
+CV_EXPORTS void setUseCollection(bool flag); // set implementation collection state
+
+#define CV_IMPL_PLAIN 0x01 // native CPU OpenCV implementation
+#define CV_IMPL_OCL   0x02 // OpenCL implementation
+#define CV_IMPL_IPP   0x04 // IPP implementation
+#define CV_IMPL_MT    0x10 // multithreaded implementation
+
+#undef CV_IMPL_ADD
+#define CV_IMPL_ADD(impl) \
+    if(cv::useCollection()) \
+    { \
+        cv::addImpl(impl, CV_Func); \
+    }
+#endif
+
+// Instrumentation external interface
+namespace instr
+{
+
+#if !defined OPENCV_ABI_CHECK
+
+enum TYPE
+{
+    TYPE_GENERAL = 0,   // OpenCV API function, e.g. exported function
+    TYPE_MARKER,        // Information marker
+    TYPE_WRAPPER,       // Wrapper function for implementation
+    TYPE_FUN,           // Simple function call
+};
+
+enum IMPL
+{
+    IMPL_PLAIN = 0,
+    IMPL_IPP,
+    IMPL_OPENCL,
+};
+
+struct NodeDataTls
+{
+    NodeDataTls()
+    {
+        m_ticksTotal = 0;
+    }
+    uint64 m_ticksTotal;
+};
+
+class CV_EXPORTS NodeData
+{
+public:
+    NodeData(const char* funName = 0, const char* fileName = NULL, int lineNum = 0, void* retAddress = NULL, bool alwaysExpand = false, cv::instr::TYPE instrType = TYPE_GENERAL, cv::instr::IMPL implType = IMPL_PLAIN);
+    NodeData(NodeData &ref);
+    ~NodeData();
+    NodeData& operator=(const NodeData&);
+
+    cv::String          m_funName;
+    cv::instr::TYPE     m_instrType;
+    cv::instr::IMPL     m_implType;
+    const char*         m_fileName;
+    int                 m_lineNum;
+    void*               m_retAddress;
+    bool                m_alwaysExpand;
+    bool                m_funError;
+
+    volatile int        m_counter;
+    volatile uint64     m_ticksTotal;
+    TLSDataAccumulator<NodeDataTls*> m_tls;
+    int                 m_threads;
+
+    // No synchronization
+    double getTotalMs() const { return ((double)m_ticksTotal / cv::getTickFrequency()) * 1000; }
+    double getMeanMs()  const { return (((double)m_ticksTotal/m_counter) / cv::getTickFrequency()) * 1000; }
+};
+bool operator==(const NodeData& lhs, const NodeData& rhs);
+
+typedef Node<NodeData> InstrNode;
+
+CV_EXPORTS InstrNode* getTrace();
+
+#endif // !defined OPENCV_ABI_CHECK
+
+
+CV_EXPORTS bool useInstrumentation();
+CV_EXPORTS void setUseInstrumentation(bool flag);
+CV_EXPORTS void resetTrace();
+
+enum FLAGS
+{
+    FLAGS_NONE              = 0,
+    FLAGS_MAPPING           = 0x01,
+    FLAGS_EXPAND_SAME_NAMES = 0x02,
+};
+
+CV_EXPORTS void setFlags(FLAGS modeFlags);
+static inline void setFlags(int modeFlags) { setFlags((FLAGS)modeFlags); }
+CV_EXPORTS FLAGS getFlags();
+
+} // namespace instr
+
+//! @}
+
+} // namespace
+
+#endif // OPENCV_UTILS_INSTR_HPP
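A short sketch of the external interface above (illustration only, not part of the patch): switching trace collection on for a profiling session. Collected data only appears in OpenCV builds that have instrumentation compiled in; the helper name is made up.

    #include <opencv2/core/utils/instrumentation.hpp>

    // Hypothetical helper: arm instrumentation and drop previously collected nodes.
    static void begin_profiling_region()
    {
        cv::instr::setUseInstrumentation(true);
        cv::instr::setFlags(cv::instr::FLAGS_MAPPING | cv::instr::FLAGS_EXPAND_SAME_NAMES);  // uses the int overload above
        cv::instr::resetTrace();
    }

    // Later, cv::instr::getTrace() returns the root of the collected call tree.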
diff --git a/3rdparty/opencv/include/opencv2/core/utils/tls.hpp b/3rdparty/opencv/include/opencv2/core/utils/tls.hpp
new file mode 100644
index 00000000..c0d2962c
--- /dev/null
+++ b/3rdparty/opencv/include/opencv2/core/utils/tls.hpp
@@ -0,0 +1,239 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_UTILS_TLS_HPP
+#define OPENCV_UTILS_TLS_HPP
+
+#ifndef OPENCV_CORE_UTILITY_H
+#error "tls.hpp must be included after opencv2/core/utility.hpp or opencv2/core.hpp"
+#endif
+
+namespace cv {
+
+//! @addtogroup core_utils
+//! @{
+
+namespace details { class TlsStorage; }
+
+/** TLS container base implementation
+ *
+ * Don't use directly.
+ *
+ * @sa TLSData, TLSDataAccumulator templates
+ */
+class CV_EXPORTS TLSDataContainer
+{
+protected:
+    TLSDataContainer();
+    virtual ~TLSDataContainer();
+
+    /// @deprecated use detachData() instead
+    void gatherData(std::vector<void*> &data) const;
+    /// get TLS data and detach all data from threads (similar to cleanup() call)
+    void detachData(std::vector<void*>& data);
+
+    void* getData() const;
+    void release();
+
+protected:
+    virtual void* createDataInstance() const = 0;
+    virtual void deleteDataInstance(void* pData) const = 0;
+
+#if OPENCV_ABI_COMPATIBILITY > 300
+private:
+#else
+public:
+#endif
+    int key_;
+
+    friend class cv::details::TlsStorage;  // core/src/system.cpp
+
+public:
+    void cleanup();  //!< Release created TLS data container objects. It is similar to release() call, but it keeps TLS container valid.
+
+private:
+    // Disable copy/assign (noncopyable pattern)
+    TLSDataContainer(TLSDataContainer &);
+    TLSDataContainer& operator =(const TLSDataContainer &);
+};
+
+
+/** @brief Simple TLS data class
+ *
+ * @sa TLSDataAccumulator
+ */
+template <typename T>
+class TLSData : protected TLSDataContainer
+{
+public:
+    inline TLSData() {}
+    inline ~TLSData() { release(); }
+
+    inline T* get() const    { return (T*)getData(); }  //!< Get data associated with key
+    inline T& getRef() const { T* ptr = (T*)getData(); CV_DbgAssert(ptr); return *ptr; }  //!< Get data associated with key
+
+    /// Release associated thread data
+    inline void cleanup()
+    {
+        TLSDataContainer::cleanup();
+    }
+
+protected:
+    /// Wrapper to allocate data by template
+    virtual void* createDataInstance() const CV_OVERRIDE { return new T; }
+    /// Wrapper to release data by template
+    virtual void deleteDataInstance(void* pData) const CV_OVERRIDE { delete (T*)pData; }
+};
+
+
+/// TLS data accumulator with gathering methods
+template <typename T>
+class TLSDataAccumulator : public TLSData<T>
+{
+    mutable cv::Mutex mutex;
+    mutable std::vector<void*> dataFromTerminatedThreads;
+    std::vector<T*> detachedData;
+    bool cleanupMode;
+public:
+    TLSDataAccumulator() : cleanupMode(false) {}
+    ~TLSDataAccumulator()
+    {
+        release();
+    }
+
+    /** @brief Get data from all threads
+     * @deprecated replaced by detachData()
+     *
+     * Lifetime of vector data is valid until next detachData()/cleanup()/release() calls
+     *
+     * @param[out] data result buffer (should be empty)
+     */
+    void gather(std::vector<T*> &data) const
+    {
+        CV_Assert(cleanupMode == false);  // state is not valid
+        CV_Assert(data.empty());
+        {
+            std::vector<void*> &dataVoid = reinterpret_cast<std::vector<void*>&>(data);
+            TLSDataContainer::gatherData(dataVoid);
+        }
+        {
+            AutoLock lock(mutex);
+            data.reserve(data.size() + dataFromTerminatedThreads.size());
+            for (typename std::vector<void*>::const_iterator i = dataFromTerminatedThreads.begin(); i != dataFromTerminatedThreads.end(); ++i)
+            {
+                data.push_back((T*)*i);
+            }
+        }
+    }
+
+    /** @brief Get and detach data from all threads
+     *
+     * Call cleanupDetachedData() when returned vector is not needed anymore.
+     *
+     * @return Vector with associated data. Content is preserved (including lifetime of attached data pointers) until next detachData()/cleanupDetachedData()/cleanup()/release() calls
+     */
+    std::vector<T*>& detachData()
+    {
+        CV_Assert(cleanupMode == false);  // state is not valid
+        std::vector<void*> dataVoid;
+        {
+            TLSDataContainer::detachData(dataVoid);
+        }
+        {
+            AutoLock lock(mutex);
+            detachedData.reserve(dataVoid.size() + dataFromTerminatedThreads.size());
+            for (typename std::vector<void*>::const_iterator i = dataFromTerminatedThreads.begin(); i != dataFromTerminatedThreads.end(); ++i)
+            {
+                detachedData.push_back((T*)*i);
+            }
+            dataFromTerminatedThreads.clear();
+            for (typename std::vector<void*>::const_iterator i = dataVoid.begin(); i != dataVoid.end(); ++i)
+            {
+                detachedData.push_back((T*)(void*)*i);
+            }
+        }
+        dataVoid.clear();
+        return detachedData;
+    }
+
+    /// Release associated thread data returned by detachData() call
+    void cleanupDetachedData()
+    {
+        AutoLock lock(mutex);
+        cleanupMode = true;
+        _cleanupDetachedData();
+        cleanupMode = false;
+    }
+
+    /// Release associated thread data
+    void cleanup()
+    {
+        cleanupMode = true;
+        TLSDataContainer::cleanup();
+
+        AutoLock lock(mutex);
+        _cleanupDetachedData();
+        _cleanupTerminatedData();
+        cleanupMode = false;
+    }
+
+    /// Release associated thread data and free TLS key
+    void release()
+    {
+        cleanupMode = true;
+        TLSDataContainer::release();
+        {
+            AutoLock lock(mutex);
+            _cleanupDetachedData();
+            _cleanupTerminatedData();
+        }
+    }
+
+protected:
+    // synchronized
+    void _cleanupDetachedData()
+    {
+        for (typename std::vector<T*>::iterator i = detachedData.begin(); i != detachedData.end(); ++i)
+        {
+            deleteDataInstance((T*)*i);
+        }
+        detachedData.clear();
+    }
+
+    // synchronized
+    void _cleanupTerminatedData()
+    {
+        for (typename std::vector<void*>::iterator i = dataFromTerminatedThreads.begin(); i != dataFromTerminatedThreads.end(); ++i)
+        {
+            deleteDataInstance((T*)*i);
+        }
+        dataFromTerminatedThreads.clear();
+    }
+
+protected:
+    virtual void* createDataInstance() const CV_OVERRIDE
+    {
+        // Note: we can collect all allocated data here, but this would require raced mutex locks
+        return new T;
+    }
+    virtual void deleteDataInstance(void* pData) const CV_OVERRIDE
+    {
+        if (cleanupMode)
+        {
+            delete (T*)pData;
+        }
+        else
+        {
+            AutoLock lock(mutex);
+            dataFromTerminatedThreads.push_back((T*)pData);
+        }
+    }
+};
+
+
+//! @}
+
+} // namespace
+
+#endif // OPENCV_UTILS_TLS_HPP
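A compact usage sketch (illustration only, not part of the patch): per-thread counters accumulated with TLSDataAccumulator across a parallel_for_ run. Counter, RowCounter and the global accumulator are made-up names.

    #include <vector>
    #include <opencv2/core.hpp>
    #include <opencv2/core/utils/tls.hpp>

    struct Counter { int hits; Counter() : hits(0) {} };

    static cv::TLSDataAccumulator<Counter> g_counters;

    class RowCounter : public cv::ParallelLoopBody
    {
    public:
        void operator()(const cv::Range& range) const CV_OVERRIDE
        {
            Counter* c = g_counters.get();        // instance created lazily per calling thread
            c->hits += range.end - range.start;
        }
    };

    int total_hits()
    {
        cv::parallel_for_(cv::Range(0, 1000), RowCounter());
        int total = 0;
        std::vector<Counter*>& parts = g_counters.detachData();   // detach per-thread instances
        for (size_t i = 0; i < parts.size(); i++)
            total += parts[i]->hits;
        g_counters.cleanupDetachedData();                         // release detached instances
        return total;
    }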
diff --git a/3rdparty/opencv/include/opencv2/world.hpp b/3rdparty/opencv/include/opencv2/world.hpp
new file mode 100644
index 00000000..4902c2f2
--- /dev/null
+++ b/3rdparty/opencv/include/opencv2/world.hpp
@@ -0,0 +1,58 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                          License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009-2010, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef OPENCV_WORLD_HPP
+#define OPENCV_WORLD_HPP
+
+#include "opencv2/core.hpp"
+
+#ifdef __cplusplus
+namespace cv
+{
+
+CV_EXPORTS_W bool initAll();
+
+}
+
+#endif
+
+#endif
diff --git a/3rdparty/opencv/x86/lib/opencv_world3414.lib b/3rdparty/opencv/x86/lib/opencv_world3414.lib
new file mode 100644
index 00000000..34984076
Binary files /dev/null and b/3rdparty/opencv/x86/lib/opencv_world3414.lib differ
diff --git a/3rdparty/opencv/x86/lib/opencv_world3414d.lib b/3rdparty/opencv/x86/lib/opencv_world3414d.lib
new file mode 100644
index 00000000..30403cae
Binary files /dev/null and b/3rdparty/opencv/x86/lib/opencv_world3414d.lib differ
diff --git a/3rdparty/opencv/x86/staticlib/debug/opencv_world3414d.lib b/3rdparty/opencv/x86/staticlib/debug/opencv_world3414d.lib
new file mode 100644
index 00000000..a8679629
Binary files /dev/null and b/3rdparty/opencv/x86/staticlib/debug/opencv_world3414d.lib differ
diff --git a/3rdparty/opencv/x86/staticlib/debug/quircd.lib b/3rdparty/opencv/x86/staticlib/debug/quircd.lib
new file mode 100644
index 00000000..0472624b
Binary files /dev/null and b/3rdparty/opencv/x86/staticlib/debug/quircd.lib differ
diff --git a/3rdparty/opencv/x86/staticlib/release/opencv_world3414.lib b/3rdparty/opencv/x86/staticlib/release/opencv_world3414.lib
new file mode 100644
index 00000000..884436be
Binary files /dev/null and b/3rdparty/opencv/x86/staticlib/release/opencv_world3414.lib differ
diff --git a/3rdparty/opencv/x86/staticlib/release/quirc.lib b/3rdparty/opencv/x86/staticlib/release/quirc.lib
new file mode 100644
index 00000000..a84419c8
Binary files /dev/null and b/3rdparty/opencv/x86/staticlib/release/quirc.lib differ