diff --git a/3rdparty/opencv/include/opencv2/core/detail/async_promise.hpp b/3rdparty/opencv/include/opencv2/core/detail/async_promise.hpp
new file mode 100644
index 00000000..6eb3fb52
--- /dev/null
+++ b/3rdparty/opencv/include/opencv2/core/detail/async_promise.hpp
@@ -0,0 +1,71 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_CORE_ASYNC_PROMISE_HPP
+#define OPENCV_CORE_ASYNC_PROMISE_HPP
+
+#include "../async.hpp"
+
+#include "exception_ptr.hpp"
+
+namespace cv {
+
+/** @addtogroup core_async
+@{
+*/
+
+
+/** @brief Provides result of asynchronous operations
+
+*/
+class CV_EXPORTS AsyncPromise
+{
+public:
+    ~AsyncPromise() CV_NOEXCEPT;
+    AsyncPromise() CV_NOEXCEPT;
+    explicit AsyncPromise(const AsyncPromise& o) CV_NOEXCEPT;
+    AsyncPromise& operator=(const AsyncPromise& o) CV_NOEXCEPT;
+    void release() CV_NOEXCEPT;
+
+    /** Returns associated AsyncArray
+    @note Can be called once
+    */
+    AsyncArray getArrayResult();
+
+    /** Stores asynchronous result.
+    @param[in] value result
+    */
+    void setValue(InputArray value);
+
+    // TODO "move" setters
+
+#if CV__EXCEPTION_PTR
+    /** Stores exception.
+    @param[in] exception exception to be raised in AsyncArray
+    */
+    void setException(std::exception_ptr exception);
+#endif
+
+    /** Stores exception.
+    @param[in] exception exception to be raised in AsyncArray
+    */
+    void setException(const cv::Exception& exception);
+
+#ifdef CV_CXX11
+    explicit AsyncPromise(AsyncPromise&& o) { p = o.p; o.p = NULL; }
+    AsyncPromise& operator=(AsyncPromise&& o) CV_NOEXCEPT { std::swap(p, o.p); return *this; }
+#endif
+
+
+    // PImpl
+    typedef struct AsyncArray::Impl Impl; friend struct AsyncArray::Impl;
+    inline void* _getImpl() const CV_NOEXCEPT { return p; }
+protected:
+    Impl* p;
+};
+
+
+//! @}
+} // namespace
+#endif // OPENCV_CORE_ASYNC_PROMISE_HPP
diff --git a/3rdparty/opencv/include/opencv2/core/detail/exception_ptr.hpp b/3rdparty/opencv/include/opencv2/core/detail/exception_ptr.hpp
new file mode 100644
index 00000000..d98ffc40
--- /dev/null
+++ b/3rdparty/opencv/include/opencv2/core/detail/exception_ptr.hpp
@@ -0,0 +1,27 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
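For context, the AsyncPromise added above is the producer side of cv::AsyncArray: the promise is handed to a worker, the associated array to the caller. A minimal usage sketch follows, assuming standard OpenCV core headers; launchClone and the detached-thread wrapper are illustrative only and not part of this patch.

#include <opencv2/core.hpp>
#include <opencv2/core/detail/async_promise.hpp>  // internal API added in the hunk above
#include <thread>

cv::AsyncArray launchClone(const cv::Mat& input)         // hypothetical helper
{
    cv::AsyncPromise promise;
    cv::AsyncArray result = promise.getArrayResult();    // may be called only once
    std::thread([promise, input]() mutable {
        try {
            promise.setValue(input.clone());             // becomes visible to result.get()
        } catch (const cv::Exception& e) {
            promise.setException(e);                     // re-raised on the consumer side
        }
    }).detach();
    return result;
}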
+
+#ifndef OPENCV_CORE_DETAILS_EXCEPTION_PTR_H
+#define OPENCV_CORE_DETAILS_EXCEPTION_PTR_H
+
+#ifndef CV__EXCEPTION_PTR
+# if defined(__ANDROID__) && defined(ATOMIC_INT_LOCK_FREE) && ATOMIC_INT_LOCK_FREE < 2
+#   define CV__EXCEPTION_PTR 0 // Not supported, details: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=58938
+# elif defined(CV_CXX11)
+#   define CV__EXCEPTION_PTR 1
+# elif defined(_MSC_VER)
+#   define CV__EXCEPTION_PTR (_MSC_VER >= 1600)
+# elif defined(__clang__)
+#   define CV__EXCEPTION_PTR 0 // C++11 only (see above)
+# elif defined(__GNUC__) && defined(__GXX_EXPERIMENTAL_CXX0X__)
+#   define CV__EXCEPTION_PTR (__GXX_EXPERIMENTAL_CXX0X__ > 0)
+# endif
+#endif
+#ifndef CV__EXCEPTION_PTR
+# define CV__EXCEPTION_PTR 0
+#elif CV__EXCEPTION_PTR
+# include <exception>  // std::exception_ptr
+#endif
+
+#endif // OPENCV_CORE_DETAILS_EXCEPTION_PTR_H
diff --git a/3rdparty/opencv/include/opencv2/core/hal/intrin_avx512.hpp b/3rdparty/opencv/include/opencv2/core/hal/intrin_avx512.hpp
new file mode 100644
index 00000000..75a3bd4b
--- /dev/null
+++ b/3rdparty/opencv/include/opencv2/core/hal/intrin_avx512.hpp
@@ -0,0 +1,3078 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html
+
+#ifndef OPENCV_HAL_INTRIN_AVX512_HPP
+#define OPENCV_HAL_INTRIN_AVX512_HPP
+
+#if defined(_MSC_VER) && (_MSC_VER < 1920/*MSVS2019*/)
+# pragma warning(disable:4146) // unary minus operator applied to unsigned type, result still unsigned
+# pragma warning(disable:4309) // 'argument': truncation of constant value
+# pragma warning(disable:4310) // cast truncates constant value
+#endif
+
+#define CVT_ROUND_MODES_IMPLEMENTED 0
+
+#define CV_SIMD512 1
+#define CV_SIMD512_64F 1
+#define CV_SIMD512_FP16 0 // no native operations with FP16 type.
Only load/store from float32x8 are available (if CV_FP16 == 1) + +#define _v512_set_epu64(a7, a6, a5, a4, a3, a2, a1, a0) _mm512_set_epi64((int64)(a7),(int64)(a6),(int64)(a5),(int64)(a4),(int64)(a3),(int64)(a2),(int64)(a1),(int64)(a0)) +#define _v512_set_epu32(a15, a14, a13, a12, a11, a10, a9, a8, a7, a6, a5, a4, a3, a2, a1, a0) \ + _mm512_set_epi64(((int64)(a15)<<32)|(int64)(a14), ((int64)(a13)<<32)|(int64)(a12), ((int64)(a11)<<32)|(int64)(a10), ((int64)( a9)<<32)|(int64)( a8), \ + ((int64)( a7)<<32)|(int64)( a6), ((int64)( a5)<<32)|(int64)( a4), ((int64)( a3)<<32)|(int64)( a2), ((int64)( a1)<<32)|(int64)( a0)) +#define _v512_set_epu16(a31, a30, a29, a28, a27, a26, a25, a24, a23, a22, a21, a20, a19, a18, a17, a16, \ + a15, a14, a13, a12, a11, a10, a9, a8, a7, a6, a5, a4, a3, a2, a1, a0) \ + _v512_set_epu32(((unsigned)(a31)<<16)|(unsigned)(a30), ((unsigned)(a29)<<16)|(unsigned)(a28), ((unsigned)(a27)<<16)|(unsigned)(a26), ((unsigned)(a25)<<16)|(unsigned)(a24), \ + ((unsigned)(a23)<<16)|(unsigned)(a22), ((unsigned)(a21)<<16)|(unsigned)(a20), ((unsigned)(a19)<<16)|(unsigned)(a18), ((unsigned)(a17)<<16)|(unsigned)(a16), \ + ((unsigned)(a15)<<16)|(unsigned)(a14), ((unsigned)(a13)<<16)|(unsigned)(a12), ((unsigned)(a11)<<16)|(unsigned)(a10), ((unsigned)( a9)<<16)|(unsigned)( a8), \ + ((unsigned)( a7)<<16)|(unsigned)( a6), ((unsigned)( a5)<<16)|(unsigned)( a4), ((unsigned)( a3)<<16)|(unsigned)( a2), ((unsigned)( a1)<<16)|(unsigned)( a0)) +#define _v512_set_epu8(a63, a62, a61, a60, a59, a58, a57, a56, a55, a54, a53, a52, a51, a50, a49, a48, \ + a47, a46, a45, a44, a43, a42, a41, a40, a39, a38, a37, a36, a35, a34, a33, a32, \ + a31, a30, a29, a28, a27, a26, a25, a24, a23, a22, a21, a20, a19, a18, a17, a16, \ + a15, a14, a13, a12, a11, a10, a9, a8, a7, a6, a5, a4, a3, a2, a1, a0) \ + _v512_set_epu32(((unsigned)(a63)<<24)|((unsigned)(a62)<<16)|((unsigned)(a61)<<8)|(unsigned)(a60),((unsigned)(a59)<<24)|((unsigned)(a58)<<16)|((unsigned)(a57)<<8)|(unsigned)(a56), \ + ((unsigned)(a55)<<24)|((unsigned)(a54)<<16)|((unsigned)(a53)<<8)|(unsigned)(a52),((unsigned)(a51)<<24)|((unsigned)(a50)<<16)|((unsigned)(a49)<<8)|(unsigned)(a48), \ + ((unsigned)(a47)<<24)|((unsigned)(a46)<<16)|((unsigned)(a45)<<8)|(unsigned)(a44),((unsigned)(a43)<<24)|((unsigned)(a42)<<16)|((unsigned)(a41)<<8)|(unsigned)(a40), \ + ((unsigned)(a39)<<24)|((unsigned)(a38)<<16)|((unsigned)(a37)<<8)|(unsigned)(a36),((unsigned)(a35)<<24)|((unsigned)(a34)<<16)|((unsigned)(a33)<<8)|(unsigned)(a32), \ + ((unsigned)(a31)<<24)|((unsigned)(a30)<<16)|((unsigned)(a29)<<8)|(unsigned)(a28),((unsigned)(a27)<<24)|((unsigned)(a26)<<16)|((unsigned)(a25)<<8)|(unsigned)(a24), \ + ((unsigned)(a23)<<24)|((unsigned)(a22)<<16)|((unsigned)(a21)<<8)|(unsigned)(a20),((unsigned)(a19)<<24)|((unsigned)(a18)<<16)|((unsigned)(a17)<<8)|(unsigned)(a16), \ + ((unsigned)(a15)<<24)|((unsigned)(a14)<<16)|((unsigned)(a13)<<8)|(unsigned)(a12),((unsigned)(a11)<<24)|((unsigned)(a10)<<16)|((unsigned)( a9)<<8)|(unsigned)( a8), \ + ((unsigned)( a7)<<24)|((unsigned)( a6)<<16)|((unsigned)( a5)<<8)|(unsigned)( a4),((unsigned)( a3)<<24)|((unsigned)( a2)<<16)|((unsigned)( a1)<<8)|(unsigned)( a0)) +#define _v512_set_epi8(a63, a62, a61, a60, a59, a58, a57, a56, a55, a54, a53, a52, a51, a50, a49, a48, \ + a47, a46, a45, a44, a43, a42, a41, a40, a39, a38, a37, a36, a35, a34, a33, a32, \ + a31, a30, a29, a28, a27, a26, a25, a24, a23, a22, a21, a20, a19, a18, a17, a16, \ + a15, a14, a13, a12, a11, a10, a9, a8, a7, a6, a5, a4, a3, a2, a1, a0) \ + _v512_set_epu8((uchar)(a63), (uchar)(a62), 
(uchar)(a61), (uchar)(a60), (uchar)(a59), (uchar)(a58), (uchar)(a57), (uchar)(a56), \ + (uchar)(a55), (uchar)(a54), (uchar)(a53), (uchar)(a52), (uchar)(a51), (uchar)(a50), (uchar)(a49), (uchar)(a48), \ + (uchar)(a47), (uchar)(a46), (uchar)(a45), (uchar)(a44), (uchar)(a43), (uchar)(a42), (uchar)(a41), (uchar)(a40), \ + (uchar)(a39), (uchar)(a38), (uchar)(a37), (uchar)(a36), (uchar)(a35), (uchar)(a34), (uchar)(a33), (uchar)(a32), \ + (uchar)(a31), (uchar)(a30), (uchar)(a29), (uchar)(a28), (uchar)(a27), (uchar)(a26), (uchar)(a25), (uchar)(a24), \ + (uchar)(a23), (uchar)(a22), (uchar)(a21), (uchar)(a20), (uchar)(a19), (uchar)(a18), (uchar)(a17), (uchar)(a16), \ + (uchar)(a15), (uchar)(a14), (uchar)(a13), (uchar)(a12), (uchar)(a11), (uchar)(a10), (uchar)( a9), (uchar)( a8), \ + (uchar)( a7), (uchar)( a6), (uchar)( a5), (uchar)( a4), (uchar)( a3), (uchar)( a2), (uchar)( a1), (uchar)( a0)) + +#ifndef _mm512_cvtpd_pslo +#ifdef _mm512_zextsi256_si512 +#define _mm512_cvtpd_pslo(a) _mm512_zextps256_ps512(_mm512_cvtpd_ps(a)) +#else +//if preferred way to extend with zeros is unavailable +#define _mm512_cvtpd_pslo(a) _mm512_castps256_ps512(_mm512_cvtpd_ps(a)) +#endif +#endif +///////// Utils //////////// + +namespace +{ + +inline __m512i _v512_combine(const __m256i& lo, const __m256i& hi) +{ return _mm512_inserti32x8(_mm512_castsi256_si512(lo), hi, 1); } + +inline __m512 _v512_combine(const __m256& lo, const __m256& hi) +{ return _mm512_insertf32x8(_mm512_castps256_ps512(lo), hi, 1); } + +inline __m512d _v512_combine(const __m256d& lo, const __m256d& hi) +{ return _mm512_insertf64x4(_mm512_castpd256_pd512(lo), hi, 1); } + +inline int _v_cvtsi512_si32(const __m512i& a) +{ return _mm_cvtsi128_si32(_mm512_castsi512_si128(a)); } + +inline __m256i _v512_extract_high(const __m512i& v) +{ return _mm512_extracti32x8_epi32(v, 1); } + +inline __m256 _v512_extract_high(const __m512& v) +{ return _mm512_extractf32x8_ps(v, 1); } + +inline __m256d _v512_extract_high(const __m512d& v) +{ return _mm512_extractf64x4_pd(v, 1); } + +inline __m256i _v512_extract_low(const __m512i& v) +{ return _mm512_castsi512_si256(v); } + +inline __m256 _v512_extract_low(const __m512& v) +{ return _mm512_castps512_ps256(v); } + +inline __m256d _v512_extract_low(const __m512d& v) +{ return _mm512_castpd512_pd256(v); } + +inline __m512i _v512_insert(const __m512i& a, const __m256i& b) +{ return _mm512_inserti32x8(a, b, 0); } + +inline __m512 _v512_insert(const __m512& a, const __m256& b) +{ return _mm512_insertf32x8(a, b, 0); } + +inline __m512d _v512_insert(const __m512d& a, const __m256d& b) +{ return _mm512_insertf64x4(a, b, 0); } + +} + +namespace cv +{ + +//! 
@cond IGNORED + +CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN + +///////// Types //////////// + +struct v_uint8x64 +{ + typedef uchar lane_type; + enum { nlanes = 64 }; + __m512i val; + + explicit v_uint8x64(__m512i v) : val(v) {} + v_uint8x64(uchar v0, uchar v1, uchar v2, uchar v3, + uchar v4, uchar v5, uchar v6, uchar v7, + uchar v8, uchar v9, uchar v10, uchar v11, + uchar v12, uchar v13, uchar v14, uchar v15, + uchar v16, uchar v17, uchar v18, uchar v19, + uchar v20, uchar v21, uchar v22, uchar v23, + uchar v24, uchar v25, uchar v26, uchar v27, + uchar v28, uchar v29, uchar v30, uchar v31, + uchar v32, uchar v33, uchar v34, uchar v35, + uchar v36, uchar v37, uchar v38, uchar v39, + uchar v40, uchar v41, uchar v42, uchar v43, + uchar v44, uchar v45, uchar v46, uchar v47, + uchar v48, uchar v49, uchar v50, uchar v51, + uchar v52, uchar v53, uchar v54, uchar v55, + uchar v56, uchar v57, uchar v58, uchar v59, + uchar v60, uchar v61, uchar v62, uchar v63) + { + val = _v512_set_epu8(v63, v62, v61, v60, v59, v58, v57, v56, v55, v54, v53, v52, v51, v50, v49, v48, + v47, v46, v45, v44, v43, v42, v41, v40, v39, v38, v37, v36, v35, v34, v33, v32, + v31, v30, v29, v28, v27, v26, v25, v24, v23, v22, v21, v20, v19, v18, v17, v16, + v15, v14, v13, v12, v11, v10, v9, v8, v7, v6, v5, v4, v3, v2, v1, v0); + } + v_uint8x64() {} + + static inline v_uint8x64 zero() { return v_uint8x64(_mm512_setzero_si512()); } + + uchar get0() const { return (uchar)_v_cvtsi512_si32(val); } +}; + +struct v_int8x64 +{ + typedef schar lane_type; + enum { nlanes = 64 }; + __m512i val; + + explicit v_int8x64(__m512i v) : val(v) {} + v_int8x64(schar v0, schar v1, schar v2, schar v3, + schar v4, schar v5, schar v6, schar v7, + schar v8, schar v9, schar v10, schar v11, + schar v12, schar v13, schar v14, schar v15, + schar v16, schar v17, schar v18, schar v19, + schar v20, schar v21, schar v22, schar v23, + schar v24, schar v25, schar v26, schar v27, + schar v28, schar v29, schar v30, schar v31, + schar v32, schar v33, schar v34, schar v35, + schar v36, schar v37, schar v38, schar v39, + schar v40, schar v41, schar v42, schar v43, + schar v44, schar v45, schar v46, schar v47, + schar v48, schar v49, schar v50, schar v51, + schar v52, schar v53, schar v54, schar v55, + schar v56, schar v57, schar v58, schar v59, + schar v60, schar v61, schar v62, schar v63) + { + val = _v512_set_epi8(v63, v62, v61, v60, v59, v58, v57, v56, v55, v54, v53, v52, v51, v50, v49, v48, + v47, v46, v45, v44, v43, v42, v41, v40, v39, v38, v37, v36, v35, v34, v33, v32, + v31, v30, v29, v28, v27, v26, v25, v24, v23, v22, v21, v20, v19, v18, v17, v16, + v15, v14, v13, v12, v11, v10, v9, v8, v7, v6, v5, v4, v3, v2, v1, v0); + } + v_int8x64() {} + + static inline v_int8x64 zero() { return v_int8x64(_mm512_setzero_si512()); } + + schar get0() const { return (schar)_v_cvtsi512_si32(val); } +}; + +struct v_uint16x32 +{ + typedef ushort lane_type; + enum { nlanes = 32 }; + __m512i val; + + explicit v_uint16x32(__m512i v) : val(v) {} + v_uint16x32(ushort v0, ushort v1, ushort v2, ushort v3, + ushort v4, ushort v5, ushort v6, ushort v7, + ushort v8, ushort v9, ushort v10, ushort v11, + ushort v12, ushort v13, ushort v14, ushort v15, + ushort v16, ushort v17, ushort v18, ushort v19, + ushort v20, ushort v21, ushort v22, ushort v23, + ushort v24, ushort v25, ushort v26, ushort v27, + ushort v28, ushort v29, ushort v30, ushort v31) + { + val = _v512_set_epu16(v31, v30, v29, v28, v27, v26, v25, v24, v23, v22, v21, v20, v19, v18, v17, v16, + v15, v14, v13, v12, v11, v10, v9, v8, 
v7, v6, v5, v4, v3, v2, v1, v0); + } + v_uint16x32() {} + + static inline v_uint16x32 zero() { return v_uint16x32(_mm512_setzero_si512()); } + + ushort get0() const { return (ushort)_v_cvtsi512_si32(val); } +}; + +struct v_int16x32 +{ + typedef short lane_type; + enum { nlanes = 32 }; + __m512i val; + + explicit v_int16x32(__m512i v) : val(v) {} + v_int16x32(short v0, short v1, short v2, short v3, short v4, short v5, short v6, short v7, + short v8, short v9, short v10, short v11, short v12, short v13, short v14, short v15, + short v16, short v17, short v18, short v19, short v20, short v21, short v22, short v23, + short v24, short v25, short v26, short v27, short v28, short v29, short v30, short v31) + { + val = _v512_set_epu16((ushort)v31, (ushort)v30, (ushort)v29, (ushort)v28, (ushort)v27, (ushort)v26, (ushort)v25, (ushort)v24, + (ushort)v23, (ushort)v22, (ushort)v21, (ushort)v20, (ushort)v19, (ushort)v18, (ushort)v17, (ushort)v16, + (ushort)v15, (ushort)v14, (ushort)v13, (ushort)v12, (ushort)v11, (ushort)v10, (ushort)v9 , (ushort)v8, + (ushort)v7 , (ushort)v6 , (ushort)v5 , (ushort)v4 , (ushort)v3 , (ushort)v2 , (ushort)v1 , (ushort)v0); + } + v_int16x32() {} + + static inline v_int16x32 zero() { return v_int16x32(_mm512_setzero_si512()); } + + short get0() const { return (short)_v_cvtsi512_si32(val); } +}; + +struct v_uint32x16 +{ + typedef unsigned lane_type; + enum { nlanes = 16 }; + __m512i val; + + explicit v_uint32x16(__m512i v) : val(v) {} + v_uint32x16(unsigned v0, unsigned v1, unsigned v2, unsigned v3, + unsigned v4, unsigned v5, unsigned v6, unsigned v7, + unsigned v8, unsigned v9, unsigned v10, unsigned v11, + unsigned v12, unsigned v13, unsigned v14, unsigned v15) + { + val = _mm512_setr_epi32((int)v0, (int)v1, (int)v2, (int)v3, (int)v4, (int)v5, (int)v6, (int)v7, + (int)v8, (int)v9, (int)v10, (int)v11, (int)v12, (int)v13, (int)v14, (int)v15); + } + v_uint32x16() {} + + static inline v_uint32x16 zero() { return v_uint32x16(_mm512_setzero_si512()); } + + unsigned get0() const { return (unsigned)_v_cvtsi512_si32(val); } +}; + +struct v_int32x16 +{ + typedef int lane_type; + enum { nlanes = 16 }; + __m512i val; + + explicit v_int32x16(__m512i v) : val(v) {} + v_int32x16(int v0, int v1, int v2, int v3, int v4, int v5, int v6, int v7, + int v8, int v9, int v10, int v11, int v12, int v13, int v14, int v15) + { + val = _mm512_setr_epi32(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15); + } + v_int32x16() {} + + static inline v_int32x16 zero() { return v_int32x16(_mm512_setzero_si512()); } + + int get0() const { return _v_cvtsi512_si32(val); } +}; + +struct v_float32x16 +{ + typedef float lane_type; + enum { nlanes = 16 }; + __m512 val; + + explicit v_float32x16(__m512 v) : val(v) {} + v_float32x16(float v0, float v1, float v2, float v3, float v4, float v5, float v6, float v7, + float v8, float v9, float v10, float v11, float v12, float v13, float v14, float v15) + { + val = _mm512_setr_ps(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15); + } + v_float32x16() {} + + static inline v_float32x16 zero() { return v_float32x16(_mm512_setzero_ps()); } + + float get0() const { return _mm_cvtss_f32(_mm512_castps512_ps128(val)); } +}; + +struct v_uint64x8 +{ + typedef uint64 lane_type; + enum { nlanes = 8 }; + __m512i val; + + explicit v_uint64x8(__m512i v) : val(v) {} + v_uint64x8(uint64 v0, uint64 v1, uint64 v2, uint64 v3, uint64 v4, uint64 v5, uint64 v6, uint64 v7) + { val = _mm512_setr_epi64((int64)v0, (int64)v1, (int64)v2, (int64)v3, (int64)v4, 
(int64)v5, (int64)v6, (int64)v7); } + v_uint64x8() {} + + static inline v_uint64x8 zero() { return v_uint64x8(_mm512_setzero_si512()); } + + uint64 get0() const + { + #if defined __x86_64__ || defined _M_X64 + return (uint64)_mm_cvtsi128_si64(_mm512_castsi512_si128(val)); + #else + int a = _mm_cvtsi128_si32(_mm512_castsi512_si128(val)); + int b = _mm_cvtsi128_si32(_mm512_castsi512_si128(_mm512_srli_epi64(val, 32))); + return (unsigned)a | ((uint64)(unsigned)b << 32); + #endif + } +}; + +struct v_int64x8 +{ + typedef int64 lane_type; + enum { nlanes = 8 }; + __m512i val; + + explicit v_int64x8(__m512i v) : val(v) {} + v_int64x8(int64 v0, int64 v1, int64 v2, int64 v3, int64 v4, int64 v5, int64 v6, int64 v7) + { val = _mm512_setr_epi64(v0, v1, v2, v3, v4, v5, v6, v7); } + v_int64x8() {} + + static inline v_int64x8 zero() { return v_int64x8(_mm512_setzero_si512()); } + + int64 get0() const + { + #if defined __x86_64__ || defined _M_X64 + return (int64)_mm_cvtsi128_si64(_mm512_castsi512_si128(val)); + #else + int a = _mm_cvtsi128_si32(_mm512_castsi512_si128(val)); + int b = _mm_cvtsi128_si32(_mm512_castsi512_si128(_mm512_srli_epi64(val, 32))); + return (int64)((unsigned)a | ((uint64)(unsigned)b << 32)); + #endif + } +}; + +struct v_float64x8 +{ + typedef double lane_type; + enum { nlanes = 8 }; + __m512d val; + + explicit v_float64x8(__m512d v) : val(v) {} + v_float64x8(double v0, double v1, double v2, double v3, double v4, double v5, double v6, double v7) + { val = _mm512_setr_pd(v0, v1, v2, v3, v4, v5, v6, v7); } + v_float64x8() {} + + static inline v_float64x8 zero() { return v_float64x8(_mm512_setzero_pd()); } + + double get0() const { return _mm_cvtsd_f64(_mm512_castpd512_pd128(val)); } +}; + +//////////////// Load and store operations /////////////// + +#define OPENCV_HAL_IMPL_AVX512_LOADSTORE(_Tpvec, _Tp) \ + inline _Tpvec v512_load(const _Tp* ptr) \ + { return _Tpvec(_mm512_loadu_si512((const __m512i*)ptr)); } \ + inline _Tpvec v512_load_aligned(const _Tp* ptr) \ + { return _Tpvec(_mm512_load_si512((const __m512i*)ptr)); } \ + inline _Tpvec v512_load_low(const _Tp* ptr) \ + { \ + __m256i v256 = _mm256_loadu_si256((const __m256i*)ptr); \ + return _Tpvec(_mm512_castsi256_si512(v256)); \ + } \ + inline _Tpvec v512_load_halves(const _Tp* ptr0, const _Tp* ptr1) \ + { \ + __m256i vlo = _mm256_loadu_si256((const __m256i*)ptr0); \ + __m256i vhi = _mm256_loadu_si256((const __m256i*)ptr1); \ + return _Tpvec(_v512_combine(vlo, vhi)); \ + } \ + inline void v_store(_Tp* ptr, const _Tpvec& a) \ + { _mm512_storeu_si512((__m512i*)ptr, a.val); } \ + inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \ + { _mm512_store_si512((__m512i*)ptr, a.val); } \ + inline void v_store_aligned_nocache(_Tp* ptr, const _Tpvec& a) \ + { _mm512_stream_si512((__m512i*)ptr, a.val); } \ + inline void v_store(_Tp* ptr, const _Tpvec& a, hal::StoreMode mode) \ + { \ + if( mode == hal::STORE_UNALIGNED ) \ + _mm512_storeu_si512((__m512i*)ptr, a.val); \ + else if( mode == hal::STORE_ALIGNED_NOCACHE ) \ + _mm512_stream_si512((__m512i*)ptr, a.val); \ + else \ + _mm512_store_si512((__m512i*)ptr, a.val); \ + } \ + inline void v_store_low(_Tp* ptr, const _Tpvec& a) \ + { _mm256_storeu_si256((__m256i*)ptr, _v512_extract_low(a.val)); } \ + inline void v_store_high(_Tp* ptr, const _Tpvec& a) \ + { _mm256_storeu_si256((__m256i*)ptr, _v512_extract_high(a.val)); } + +OPENCV_HAL_IMPL_AVX512_LOADSTORE(v_uint8x64, uchar) +OPENCV_HAL_IMPL_AVX512_LOADSTORE(v_int8x64, schar) +OPENCV_HAL_IMPL_AVX512_LOADSTORE(v_uint16x32, ushort) 
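The loads and stores generated by OPENCV_HAL_IMPL_AVX512_LOADSTORE (its remaining instantiations continue directly below) follow the usual universal-intrinsics pattern. A minimal sketch, assuming the header's types are in scope and that len is a multiple of v_uint8x64::nlanes; add_u8_demo is a hypothetical helper, and the saturating operator+ it uses is defined later in this file.

inline void add_u8_demo(const uchar* a, const uchar* b, uchar* dst, int len)
{
    for (int i = 0; i < len; i += v_uint8x64::nlanes)
    {
        v_uint8x64 va = v512_load(a + i);   // unaligned 64-byte load
        v_uint8x64 vb = v512_load(b + i);
        v_store(dst + i, va + vb);          // saturating uchar add, unaligned store
    }
}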
+OPENCV_HAL_IMPL_AVX512_LOADSTORE(v_int16x32, short) +OPENCV_HAL_IMPL_AVX512_LOADSTORE(v_uint32x16, unsigned) +OPENCV_HAL_IMPL_AVX512_LOADSTORE(v_int32x16, int) +OPENCV_HAL_IMPL_AVX512_LOADSTORE(v_uint64x8, uint64) +OPENCV_HAL_IMPL_AVX512_LOADSTORE(v_int64x8, int64) + +#define OPENCV_HAL_IMPL_AVX512_LOADSTORE_FLT(_Tpvec, _Tp, suffix, halfreg) \ + inline _Tpvec v512_load(const _Tp* ptr) \ + { return _Tpvec(_mm512_loadu_##suffix(ptr)); } \ + inline _Tpvec v512_load_aligned(const _Tp* ptr) \ + { return _Tpvec(_mm512_load_##suffix(ptr)); } \ + inline _Tpvec v512_load_low(const _Tp* ptr) \ + { \ + return _Tpvec(_mm512_cast##suffix##256_##suffix##512 \ + (_mm256_loadu_##suffix(ptr))); \ + } \ + inline _Tpvec v512_load_halves(const _Tp* ptr0, const _Tp* ptr1) \ + { \ + halfreg vlo = _mm256_loadu_##suffix(ptr0); \ + halfreg vhi = _mm256_loadu_##suffix(ptr1); \ + return _Tpvec(_v512_combine(vlo, vhi)); \ + } \ + inline void v_store(_Tp* ptr, const _Tpvec& a) \ + { _mm512_storeu_##suffix(ptr, a.val); } \ + inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \ + { _mm512_store_##suffix(ptr, a.val); } \ + inline void v_store_aligned_nocache(_Tp* ptr, const _Tpvec& a) \ + { _mm512_stream_##suffix(ptr, a.val); } \ + inline void v_store(_Tp* ptr, const _Tpvec& a, hal::StoreMode mode) \ + { \ + if( mode == hal::STORE_UNALIGNED ) \ + _mm512_storeu_##suffix(ptr, a.val); \ + else if( mode == hal::STORE_ALIGNED_NOCACHE ) \ + _mm512_stream_##suffix(ptr, a.val); \ + else \ + _mm512_store_##suffix(ptr, a.val); \ + } \ + inline void v_store_low(_Tp* ptr, const _Tpvec& a) \ + { _mm256_storeu_##suffix(ptr, _v512_extract_low(a.val)); } \ + inline void v_store_high(_Tp* ptr, const _Tpvec& a) \ + { _mm256_storeu_##suffix(ptr, _v512_extract_high(a.val)); } + +OPENCV_HAL_IMPL_AVX512_LOADSTORE_FLT(v_float32x16, float, ps, __m256) +OPENCV_HAL_IMPL_AVX512_LOADSTORE_FLT(v_float64x8, double, pd, __m256d) + +#define OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, _Tpvecf, suffix, cast) \ + inline _Tpvec v_reinterpret_as_##suffix(const _Tpvecf& a) \ + { return _Tpvec(cast(a.val)); } + +#define OPENCV_HAL_IMPL_AVX512_INIT(_Tpvec, _Tp, suffix, ssuffix, ctype_s) \ + inline _Tpvec v512_setzero_##suffix() \ + { return _Tpvec(_mm512_setzero_si512()); } \ + inline _Tpvec v512_setall_##suffix(_Tp v) \ + { return _Tpvec(_mm512_set1_##ssuffix((ctype_s)v)); } \ + OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_uint8x64, suffix, OPENCV_HAL_NOP) \ + OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_int8x64, suffix, OPENCV_HAL_NOP) \ + OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_uint16x32, suffix, OPENCV_HAL_NOP) \ + OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_int16x32, suffix, OPENCV_HAL_NOP) \ + OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_uint32x16, suffix, OPENCV_HAL_NOP) \ + OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_int32x16, suffix, OPENCV_HAL_NOP) \ + OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_uint64x8, suffix, OPENCV_HAL_NOP) \ + OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_int64x8, suffix, OPENCV_HAL_NOP) \ + OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_float32x16, suffix, _mm512_castps_si512) \ + OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_float64x8, suffix, _mm512_castpd_si512) + +OPENCV_HAL_IMPL_AVX512_INIT(v_uint8x64, uchar, u8, epi8, char) +OPENCV_HAL_IMPL_AVX512_INIT(v_int8x64, schar, s8, epi8, char) +OPENCV_HAL_IMPL_AVX512_INIT(v_uint16x32, ushort, u16, epi16, short) +OPENCV_HAL_IMPL_AVX512_INIT(v_int16x32, short, s16, epi16, short) +OPENCV_HAL_IMPL_AVX512_INIT(v_uint32x16, unsigned, u32, epi32, int) +OPENCV_HAL_IMPL_AVX512_INIT(v_int32x16, int, s32, epi32, int) +OPENCV_HAL_IMPL_AVX512_INIT(v_uint64x8, 
uint64, u64, epi64, int64) +OPENCV_HAL_IMPL_AVX512_INIT(v_int64x8, int64, s64, epi64, int64) + +#define OPENCV_HAL_IMPL_AVX512_INIT_FLT(_Tpvec, _Tp, suffix, zsuffix, cast) \ + inline _Tpvec v512_setzero_##suffix() \ + { return _Tpvec(_mm512_setzero_##zsuffix()); } \ + inline _Tpvec v512_setall_##suffix(_Tp v) \ + { return _Tpvec(_mm512_set1_##zsuffix(v)); } \ + OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_uint8x64, suffix, cast) \ + OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_int8x64, suffix, cast) \ + OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_uint16x32, suffix, cast) \ + OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_int16x32, suffix, cast) \ + OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_uint32x16, suffix, cast) \ + OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_int32x16, suffix, cast) \ + OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_uint64x8, suffix, cast) \ + OPENCV_HAL_IMPL_AVX512_CAST(_Tpvec, v_int64x8, suffix, cast) + +OPENCV_HAL_IMPL_AVX512_INIT_FLT(v_float32x16, float, f32, ps, _mm512_castsi512_ps) +OPENCV_HAL_IMPL_AVX512_INIT_FLT(v_float64x8, double, f64, pd, _mm512_castsi512_pd) + +inline v_float32x16 v_reinterpret_as_f32(const v_float32x16& a) +{ return a; } +inline v_float32x16 v_reinterpret_as_f32(const v_float64x8& a) +{ return v_float32x16(_mm512_castpd_ps(a.val)); } + +inline v_float64x8 v_reinterpret_as_f64(const v_float64x8& a) +{ return a; } +inline v_float64x8 v_reinterpret_as_f64(const v_float32x16& a) +{ return v_float64x8(_mm512_castps_pd(a.val)); } + +// FP16 +inline v_float32x16 v512_load_expand(const float16_t* ptr) +{ + return v_float32x16(_mm512_cvtph_ps(_mm256_loadu_si256((const __m256i*)ptr))); +} + +inline void v_pack_store(float16_t* ptr, const v_float32x16& a) +{ + __m256i ah = _mm512_cvtps_ph(a.val, 0); + _mm256_storeu_si256((__m256i*)ptr, ah); +} + +/* Recombine & ZIP */ +inline void v_zip(const v_int8x64& a, const v_int8x64& b, v_int8x64& ab0, v_int8x64& ab1) +{ +#if CV_AVX_512VBMI + __m512i mask0 = _v512_set_epu8( 95, 31, 94, 30, 93, 29, 92, 28, 91, 27, 90, 26, 89, 25, 88, 24, + 87, 23, 86, 22, 85, 21, 84, 20, 83, 19, 82, 18, 81, 17, 80, 16, + 79, 15, 78, 14, 77, 13, 76, 12, 75, 11, 74, 10, 73, 9, 72, 8, + 71, 7, 70, 6, 69, 5, 68, 4, 67, 3, 66, 2, 65, 1, 64, 0); + ab0 = v_int8x64(_mm512_permutex2var_epi8(a.val, mask0, b.val)); + __m512i mask1 = _v512_set_epu8(127, 63, 126, 62, 125, 61, 124, 60, 123, 59, 122, 58, 121, 57, 120, 56, + 119, 55, 118, 54, 117, 53, 116, 52, 115, 51, 114, 50, 113, 49, 112, 48, + 111, 47, 110, 46, 109, 45, 108, 44, 107, 43, 106, 42, 105, 41, 104, 40, + 103, 39, 102, 38, 101, 37, 100, 36, 99, 35, 98, 34, 97, 33, 96, 32); + ab1 = v_int8x64(_mm512_permutex2var_epi8(a.val, mask1, b.val)); +#else + __m512i low = _mm512_unpacklo_epi8(a.val, b.val); + __m512i high = _mm512_unpackhi_epi8(a.val, b.val); + ab0 = v_int8x64(_mm512_permutex2var_epi64(low, _v512_set_epu64(11, 10, 3, 2, 9, 8, 1, 0), high)); + ab1 = v_int8x64(_mm512_permutex2var_epi64(low, _v512_set_epu64(15, 14, 7, 6, 13, 12, 5, 4), high)); +#endif +} +inline void v_zip(const v_int16x32& a, const v_int16x32& b, v_int16x32& ab0, v_int16x32& ab1) +{ + __m512i mask0 = _v512_set_epu16(47, 15, 46, 14, 45, 13, 44, 12, 43, 11, 42, 10, 41, 9, 40, 8, + 39, 7, 38, 6, 37, 5, 36, 4, 35, 3, 34, 2, 33, 1, 32, 0); + ab0 = v_int16x32(_mm512_permutex2var_epi16(a.val, mask0, b.val)); + __m512i mask1 = _v512_set_epu16(63, 31, 62, 30, 61, 29, 60, 28, 59, 27, 58, 26, 57, 25, 56, 24, + 55, 23, 54, 22, 53, 21, 52, 20, 51, 19, 50, 18, 49, 17, 48, 16); + ab1 = v_int16x32(_mm512_permutex2var_epi16(a.val, mask1, b.val)); +} +inline void v_zip(const 
v_int32x16& a, const v_int32x16& b, v_int32x16& ab0, v_int32x16& ab1) +{ + __m512i mask0 = _v512_set_epu32(23, 7, 22, 6, 21, 5, 20, 4, 19, 3, 18, 2, 17, 1, 16, 0); + ab0 = v_int32x16(_mm512_permutex2var_epi32(a.val, mask0, b.val)); + __m512i mask1 = _v512_set_epu32(31, 15, 30, 14, 29, 13, 28, 12, 27, 11, 26, 10, 25, 9, 24, 8); + ab1 = v_int32x16(_mm512_permutex2var_epi32(a.val, mask1, b.val)); +} +inline void v_zip(const v_int64x8& a, const v_int64x8& b, v_int64x8& ab0, v_int64x8& ab1) +{ + __m512i mask0 = _v512_set_epu64(11, 3, 10, 2, 9, 1, 8, 0); + ab0 = v_int64x8(_mm512_permutex2var_epi64(a.val, mask0, b.val)); + __m512i mask1 = _v512_set_epu64(15, 7, 14, 6, 13, 5, 12, 4); + ab1 = v_int64x8(_mm512_permutex2var_epi64(a.val, mask1, b.val)); +} + +inline void v_zip(const v_uint8x64& a, const v_uint8x64& b, v_uint8x64& ab0, v_uint8x64& ab1) +{ + v_int8x64 i0, i1; + v_zip(v_reinterpret_as_s8(a), v_reinterpret_as_s8(b), i0, i1); + ab0 = v_reinterpret_as_u8(i0); + ab1 = v_reinterpret_as_u8(i1); +} +inline void v_zip(const v_uint16x32& a, const v_uint16x32& b, v_uint16x32& ab0, v_uint16x32& ab1) +{ + v_int16x32 i0, i1; + v_zip(v_reinterpret_as_s16(a), v_reinterpret_as_s16(b), i0, i1); + ab0 = v_reinterpret_as_u16(i0); + ab1 = v_reinterpret_as_u16(i1); +} +inline void v_zip(const v_uint32x16& a, const v_uint32x16& b, v_uint32x16& ab0, v_uint32x16& ab1) +{ + v_int32x16 i0, i1; + v_zip(v_reinterpret_as_s32(a), v_reinterpret_as_s32(b), i0, i1); + ab0 = v_reinterpret_as_u32(i0); + ab1 = v_reinterpret_as_u32(i1); +} +inline void v_zip(const v_uint64x8& a, const v_uint64x8& b, v_uint64x8& ab0, v_uint64x8& ab1) +{ + v_int64x8 i0, i1; + v_zip(v_reinterpret_as_s64(a), v_reinterpret_as_s64(b), i0, i1); + ab0 = v_reinterpret_as_u64(i0); + ab1 = v_reinterpret_as_u64(i1); +} +inline void v_zip(const v_float32x16& a, const v_float32x16& b, v_float32x16& ab0, v_float32x16& ab1) +{ + v_int32x16 i0, i1; + v_zip(v_reinterpret_as_s32(a), v_reinterpret_as_s32(b), i0, i1); + ab0 = v_reinterpret_as_f32(i0); + ab1 = v_reinterpret_as_f32(i1); +} +inline void v_zip(const v_float64x8& a, const v_float64x8& b, v_float64x8& ab0, v_float64x8& ab1) +{ + v_int64x8 i0, i1; + v_zip(v_reinterpret_as_s64(a), v_reinterpret_as_s64(b), i0, i1); + ab0 = v_reinterpret_as_f64(i0); + ab1 = v_reinterpret_as_f64(i1); +} + +#define OPENCV_HAL_IMPL_AVX512_COMBINE(_Tpvec, suffix) \ + inline _Tpvec v_combine_low(const _Tpvec& a, const _Tpvec& b) \ + { return _Tpvec(_v512_combine(_v512_extract_low(a.val), _v512_extract_low(b.val))); } \ + inline _Tpvec v_combine_high(const _Tpvec& a, const _Tpvec& b) \ + { return _Tpvec(_v512_insert(b.val, _v512_extract_high(a.val))); } \ + inline void v_recombine(const _Tpvec& a, const _Tpvec& b, \ + _Tpvec& c, _Tpvec& d) \ + { \ + c.val = _v512_combine(_v512_extract_low(a.val),_v512_extract_low(b.val)); \ + d.val = _v512_insert(b.val,_v512_extract_high(a.val)); \ + } + + +OPENCV_HAL_IMPL_AVX512_COMBINE(v_uint8x64, epi8) +OPENCV_HAL_IMPL_AVX512_COMBINE(v_int8x64, epi8) +OPENCV_HAL_IMPL_AVX512_COMBINE(v_uint16x32, epi16) +OPENCV_HAL_IMPL_AVX512_COMBINE(v_int16x32, epi16) +OPENCV_HAL_IMPL_AVX512_COMBINE(v_uint32x16, epi32) +OPENCV_HAL_IMPL_AVX512_COMBINE(v_int32x16, epi32) +OPENCV_HAL_IMPL_AVX512_COMBINE(v_uint64x8, epi64) +OPENCV_HAL_IMPL_AVX512_COMBINE(v_int64x8, epi64) +OPENCV_HAL_IMPL_AVX512_COMBINE(v_float32x16, ps) +OPENCV_HAL_IMPL_AVX512_COMBINE(v_float64x8, pd) + +////////// Arithmetic, bitwise and comparison operations ///////// + +/* Element-wise binary and unary operations */ + +/** Non-saturating 
arithmetics **/ +#define OPENCV_HAL_IMPL_AVX512_BIN_FUNC(func, _Tpvec, intrin) \ + inline _Tpvec func(const _Tpvec& a, const _Tpvec& b) \ + { return _Tpvec(intrin(a.val, b.val)); } + +OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_add_wrap, v_uint8x64, _mm512_add_epi8) +OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_add_wrap, v_int8x64, _mm512_add_epi8) +OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_add_wrap, v_uint16x32, _mm512_add_epi16) +OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_add_wrap, v_int16x32, _mm512_add_epi16) +OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_sub_wrap, v_uint8x64, _mm512_sub_epi8) +OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_sub_wrap, v_int8x64, _mm512_sub_epi8) +OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_sub_wrap, v_uint16x32, _mm512_sub_epi16) +OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_sub_wrap, v_int16x32, _mm512_sub_epi16) +OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_mul_wrap, v_uint16x32, _mm512_mullo_epi16) +OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_mul_wrap, v_int16x32, _mm512_mullo_epi16) + +inline v_uint8x64 v_mul_wrap(const v_uint8x64& a, const v_uint8x64& b) +{ + __m512i ad = _mm512_srai_epi16(a.val, 8); + __m512i bd = _mm512_srai_epi16(b.val, 8); + __m512i p0 = _mm512_mullo_epi16(a.val, b.val); // even + __m512i p1 = _mm512_slli_epi16(_mm512_mullo_epi16(ad, bd), 8); // odd + return v_uint8x64(_mm512_mask_blend_epi8(0xAAAAAAAAAAAAAAAA, p0, p1)); +} +inline v_int8x64 v_mul_wrap(const v_int8x64& a, const v_int8x64& b) +{ + return v_reinterpret_as_s8(v_mul_wrap(v_reinterpret_as_u8(a), v_reinterpret_as_u8(b))); +} + +#define OPENCV_HAL_IMPL_AVX512_BIN_OP(bin_op, _Tpvec, intrin) \ + inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \ + { return _Tpvec(intrin(a.val, b.val)); } \ + inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b) \ + { a.val = intrin(a.val, b.val); return a; } + +OPENCV_HAL_IMPL_AVX512_BIN_OP(+, v_uint32x16, _mm512_add_epi32) +OPENCV_HAL_IMPL_AVX512_BIN_OP(-, v_uint32x16, _mm512_sub_epi32) +OPENCV_HAL_IMPL_AVX512_BIN_OP(+, v_int32x16, _mm512_add_epi32) +OPENCV_HAL_IMPL_AVX512_BIN_OP(-, v_int32x16, _mm512_sub_epi32) +OPENCV_HAL_IMPL_AVX512_BIN_OP(+, v_uint64x8, _mm512_add_epi64) +OPENCV_HAL_IMPL_AVX512_BIN_OP(-, v_uint64x8, _mm512_sub_epi64) +OPENCV_HAL_IMPL_AVX512_BIN_OP(+, v_int64x8, _mm512_add_epi64) +OPENCV_HAL_IMPL_AVX512_BIN_OP(-, v_int64x8, _mm512_sub_epi64) + +OPENCV_HAL_IMPL_AVX512_BIN_OP(*, v_uint32x16, _mm512_mullo_epi32) +OPENCV_HAL_IMPL_AVX512_BIN_OP(*, v_int32x16, _mm512_mullo_epi32) +OPENCV_HAL_IMPL_AVX512_BIN_OP(*, v_uint64x8, _mm512_mullo_epi64) +OPENCV_HAL_IMPL_AVX512_BIN_OP(*, v_int64x8, _mm512_mullo_epi64) + +/** Saturating arithmetics **/ +OPENCV_HAL_IMPL_AVX512_BIN_OP(+, v_uint8x64, _mm512_adds_epu8) +OPENCV_HAL_IMPL_AVX512_BIN_OP(-, v_uint8x64, _mm512_subs_epu8) +OPENCV_HAL_IMPL_AVX512_BIN_OP(+, v_int8x64, _mm512_adds_epi8) +OPENCV_HAL_IMPL_AVX512_BIN_OP(-, v_int8x64, _mm512_subs_epi8) +OPENCV_HAL_IMPL_AVX512_BIN_OP(+, v_uint16x32, _mm512_adds_epu16) +OPENCV_HAL_IMPL_AVX512_BIN_OP(-, v_uint16x32, _mm512_subs_epu16) +OPENCV_HAL_IMPL_AVX512_BIN_OP(+, v_int16x32, _mm512_adds_epi16) +OPENCV_HAL_IMPL_AVX512_BIN_OP(-, v_int16x32, _mm512_subs_epi16) + +OPENCV_HAL_IMPL_AVX512_BIN_OP(+, v_float32x16, _mm512_add_ps) +OPENCV_HAL_IMPL_AVX512_BIN_OP(-, v_float32x16, _mm512_sub_ps) +OPENCV_HAL_IMPL_AVX512_BIN_OP(*, v_float32x16, _mm512_mul_ps) +OPENCV_HAL_IMPL_AVX512_BIN_OP(/, v_float32x16, _mm512_div_ps) +OPENCV_HAL_IMPL_AVX512_BIN_OP(+, v_float64x8, _mm512_add_pd) +OPENCV_HAL_IMPL_AVX512_BIN_OP(-, v_float64x8, _mm512_sub_pd) +OPENCV_HAL_IMPL_AVX512_BIN_OP(*, v_float64x8, _mm512_mul_pd) 
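The wrap (v_*_wrap) and saturating (operator) families above differ only in how overflow is handled; a minimal sketch with values chosen to overflow, assuming the header's types are in scope (the float64 division overload of the operator macro continues right after this sketch). wrap_vs_saturate_demo is illustrative only.

inline void wrap_vs_saturate_demo()
{
    v_uint8x64 x = v512_setall_u8(250);
    v_uint8x64 y = v512_setall_u8(10);
    v_uint8x64 wrapped   = v_add_wrap(x, y);  // modulo 256: every lane becomes 4
    v_uint8x64 saturated = x + y;             // _mm512_adds_epu8: every lane clamps to 255
    (void)wrapped; (void)saturated;
}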
+OPENCV_HAL_IMPL_AVX512_BIN_OP(/, v_float64x8, _mm512_div_pd) + +// saturating multiply +inline v_uint8x64 operator * (const v_uint8x64& a, const v_uint8x64& b) +{ + v_uint16x32 c, d; + v_mul_expand(a, b, c, d); + return v_pack(c, d); +} +inline v_int8x64 operator * (const v_int8x64& a, const v_int8x64& b) +{ + v_int16x32 c, d; + v_mul_expand(a, b, c, d); + return v_pack(c, d); +} +inline v_uint16x32 operator * (const v_uint16x32& a, const v_uint16x32& b) +{ + __m512i pl = _mm512_mullo_epi16(a.val, b.val); + __m512i ph = _mm512_mulhi_epu16(a.val, b.val); + __m512i p0 = _mm512_unpacklo_epi16(pl, ph); + __m512i p1 = _mm512_unpackhi_epi16(pl, ph); + + const __m512i m = _mm512_set1_epi32(65535); + return v_uint16x32(_mm512_packus_epi32(_mm512_min_epu32(p0, m), _mm512_min_epu32(p1, m))); +} +inline v_int16x32 operator * (const v_int16x32& a, const v_int16x32& b) +{ + __m512i pl = _mm512_mullo_epi16(a.val, b.val); + __m512i ph = _mm512_mulhi_epi16(a.val, b.val); + __m512i p0 = _mm512_unpacklo_epi16(pl, ph); + __m512i p1 = _mm512_unpackhi_epi16(pl, ph); + return v_int16x32(_mm512_packs_epi32(p0, p1)); +} + +inline v_uint8x64& operator *= (v_uint8x64& a, const v_uint8x64& b) +{ a = a * b; return a; } +inline v_int8x64& operator *= (v_int8x64& a, const v_int8x64& b) +{ a = a * b; return a; } +inline v_uint16x32& operator *= (v_uint16x32& a, const v_uint16x32& b) +{ a = a * b; return a; } +inline v_int16x32& operator *= (v_int16x32& a, const v_int16x32& b) +{ a = a * b; return a; } + +inline v_int16x32 v_mul_hi(const v_int16x32& a, const v_int16x32& b) { return v_int16x32(_mm512_mulhi_epi16(a.val, b.val)); } +inline v_uint16x32 v_mul_hi(const v_uint16x32& a, const v_uint16x32& b) { return v_uint16x32(_mm512_mulhi_epu16(a.val, b.val)); } + +// Multiply and expand +inline void v_mul_expand(const v_uint8x64& a, const v_uint8x64& b, + v_uint16x32& c, v_uint16x32& d) +{ + v_uint16x32 a0, a1, b0, b1; + v_expand(a, a0, a1); + v_expand(b, b0, b1); + c = v_mul_wrap(a0, b0); + d = v_mul_wrap(a1, b1); +} + +inline void v_mul_expand(const v_int8x64& a, const v_int8x64& b, + v_int16x32& c, v_int16x32& d) +{ + v_int16x32 a0, a1, b0, b1; + v_expand(a, a0, a1); + v_expand(b, b0, b1); + c = v_mul_wrap(a0, b0); + d = v_mul_wrap(a1, b1); +} + +inline void v_mul_expand(const v_int16x32& a, const v_int16x32& b, + v_int32x16& c, v_int32x16& d) +{ + v_int16x32 v0, v1; + v_zip(v_mul_wrap(a, b), v_mul_hi(a, b), v0, v1); + + c = v_reinterpret_as_s32(v0); + d = v_reinterpret_as_s32(v1); +} + +inline void v_mul_expand(const v_uint16x32& a, const v_uint16x32& b, + v_uint32x16& c, v_uint32x16& d) +{ + v_uint16x32 v0, v1; + v_zip(v_mul_wrap(a, b), v_mul_hi(a, b), v0, v1); + + c = v_reinterpret_as_u32(v0); + d = v_reinterpret_as_u32(v1); +} + +inline void v_mul_expand(const v_uint32x16& a, const v_uint32x16& b, + v_uint64x8& c, v_uint64x8& d) +{ + v_zip(v_uint64x8(_mm512_mul_epu32(a.val, b.val)), + v_uint64x8(_mm512_mul_epu32(_mm512_srli_epi64(a.val, 32), _mm512_srli_epi64(b.val, 32))), c, d); +} + +inline void v_mul_expand(const v_int32x16& a, const v_int32x16& b, + v_int64x8& c, v_int64x8& d) +{ + v_zip(v_int64x8(_mm512_mul_epi32(a.val, b.val)), + v_int64x8(_mm512_mul_epi32(_mm512_srli_epi64(a.val, 32), _mm512_srli_epi64(b.val, 32))), c, d); +} + +/** Bitwise shifts **/ +#define OPENCV_HAL_IMPL_AVX512_SHIFT_OP(_Tpuvec, _Tpsvec, suffix) \ + inline _Tpuvec operator << (const _Tpuvec& a, int imm) \ + { return _Tpuvec(_mm512_slli_##suffix(a.val, imm)); } \ + inline _Tpsvec operator << (const _Tpsvec& a, int imm) \ + { return 
_Tpsvec(_mm512_slli_##suffix(a.val, imm)); } \ + inline _Tpuvec operator >> (const _Tpuvec& a, int imm) \ + { return _Tpuvec(_mm512_srli_##suffix(a.val, imm)); } \ + inline _Tpsvec operator >> (const _Tpsvec& a, int imm) \ + { return _Tpsvec(_mm512_srai_##suffix(a.val, imm)); } \ + template \ + inline _Tpuvec v_shl(const _Tpuvec& a) \ + { return _Tpuvec(_mm512_slli_##suffix(a.val, imm)); } \ + template \ + inline _Tpsvec v_shl(const _Tpsvec& a) \ + { return _Tpsvec(_mm512_slli_##suffix(a.val, imm)); } \ + template \ + inline _Tpuvec v_shr(const _Tpuvec& a) \ + { return _Tpuvec(_mm512_srli_##suffix(a.val, imm)); } \ + template \ + inline _Tpsvec v_shr(const _Tpsvec& a) \ + { return _Tpsvec(_mm512_srai_##suffix(a.val, imm)); } + +OPENCV_HAL_IMPL_AVX512_SHIFT_OP(v_uint16x32, v_int16x32, epi16) +OPENCV_HAL_IMPL_AVX512_SHIFT_OP(v_uint32x16, v_int32x16, epi32) +OPENCV_HAL_IMPL_AVX512_SHIFT_OP(v_uint64x8, v_int64x8, epi64) + + +/** Bitwise logic **/ +#define OPENCV_HAL_IMPL_AVX512_LOGIC_OP(_Tpvec, suffix, not_const) \ + OPENCV_HAL_IMPL_AVX512_BIN_OP(&, _Tpvec, _mm512_and_##suffix) \ + OPENCV_HAL_IMPL_AVX512_BIN_OP(|, _Tpvec, _mm512_or_##suffix) \ + OPENCV_HAL_IMPL_AVX512_BIN_OP(^, _Tpvec, _mm512_xor_##suffix) \ + inline _Tpvec operator ~ (const _Tpvec& a) \ + { return _Tpvec(_mm512_xor_##suffix(a.val, not_const)); } + +OPENCV_HAL_IMPL_AVX512_LOGIC_OP(v_uint8x64, si512, _mm512_set1_epi32(-1)) +OPENCV_HAL_IMPL_AVX512_LOGIC_OP(v_int8x64, si512, _mm512_set1_epi32(-1)) +OPENCV_HAL_IMPL_AVX512_LOGIC_OP(v_uint16x32, si512, _mm512_set1_epi32(-1)) +OPENCV_HAL_IMPL_AVX512_LOGIC_OP(v_int16x32, si512, _mm512_set1_epi32(-1)) +OPENCV_HAL_IMPL_AVX512_LOGIC_OP(v_uint32x16, si512, _mm512_set1_epi32(-1)) +OPENCV_HAL_IMPL_AVX512_LOGIC_OP(v_int32x16, si512, _mm512_set1_epi32(-1)) +OPENCV_HAL_IMPL_AVX512_LOGIC_OP(v_uint64x8, si512, _mm512_set1_epi64(-1)) +OPENCV_HAL_IMPL_AVX512_LOGIC_OP(v_int64x8, si512, _mm512_set1_epi64(-1)) +OPENCV_HAL_IMPL_AVX512_LOGIC_OP(v_float32x16, ps, _mm512_castsi512_ps(_mm512_set1_epi32(-1))) +OPENCV_HAL_IMPL_AVX512_LOGIC_OP(v_float64x8, pd, _mm512_castsi512_pd(_mm512_set1_epi32(-1))) + +/** Select **/ +#define OPENCV_HAL_IMPL_AVX512_SELECT(_Tpvec, suffix, zsuf) \ + inline _Tpvec v_select(const _Tpvec& mask, const _Tpvec& a, const _Tpvec& b) \ + { return _Tpvec(_mm512_mask_blend_##suffix(_mm512_cmp_##suffix##_mask(mask.val, _mm512_setzero_##zsuf(), _MM_CMPINT_EQ), a.val, b.val)); } + +OPENCV_HAL_IMPL_AVX512_SELECT(v_uint8x64, epi8, si512) +OPENCV_HAL_IMPL_AVX512_SELECT(v_int8x64, epi8, si512) +OPENCV_HAL_IMPL_AVX512_SELECT(v_uint16x32, epi16, si512) +OPENCV_HAL_IMPL_AVX512_SELECT(v_int16x32, epi16, si512) +OPENCV_HAL_IMPL_AVX512_SELECT(v_uint32x16, epi32, si512) +OPENCV_HAL_IMPL_AVX512_SELECT(v_int32x16, epi32, si512) +OPENCV_HAL_IMPL_AVX512_SELECT(v_uint64x8, epi64, si512) +OPENCV_HAL_IMPL_AVX512_SELECT(v_int64x8, epi64, si512) +OPENCV_HAL_IMPL_AVX512_SELECT(v_float32x16, ps, ps) +OPENCV_HAL_IMPL_AVX512_SELECT(v_float64x8, pd, pd) + +/** Comparison **/ +#define OPENCV_HAL_IMPL_AVX512_CMP_INT(bin_op, imm8, _Tpvec, sufcmp, sufset, tval) \ + inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \ + { return _Tpvec(_mm512_maskz_set1_##sufset(_mm512_cmp_##sufcmp##_mask(a.val, b.val, imm8), tval)); } + +#define OPENCV_HAL_IMPL_AVX512_CMP_OP_INT(_Tpvec, sufcmp, sufset, tval) \ + OPENCV_HAL_IMPL_AVX512_CMP_INT(==, _MM_CMPINT_EQ, _Tpvec, sufcmp, sufset, tval) \ + OPENCV_HAL_IMPL_AVX512_CMP_INT(!=, _MM_CMPINT_NE, _Tpvec, sufcmp, sufset, tval) \ + OPENCV_HAL_IMPL_AVX512_CMP_INT(<, 
_MM_CMPINT_LT, _Tpvec, sufcmp, sufset, tval) \ + OPENCV_HAL_IMPL_AVX512_CMP_INT(>, _MM_CMPINT_NLE, _Tpvec, sufcmp, sufset, tval) \ + OPENCV_HAL_IMPL_AVX512_CMP_INT(<=, _MM_CMPINT_LE, _Tpvec, sufcmp, sufset, tval) \ + OPENCV_HAL_IMPL_AVX512_CMP_INT(>=, _MM_CMPINT_NLT, _Tpvec, sufcmp, sufset, tval) + +OPENCV_HAL_IMPL_AVX512_CMP_OP_INT(v_uint8x64, epu8, epi8, (char)-1) +OPENCV_HAL_IMPL_AVX512_CMP_OP_INT(v_int8x64, epi8, epi8, (char)-1) +OPENCV_HAL_IMPL_AVX512_CMP_OP_INT(v_uint16x32, epu16, epi16, (short)-1) +OPENCV_HAL_IMPL_AVX512_CMP_OP_INT(v_int16x32, epi16, epi16, (short)-1) +OPENCV_HAL_IMPL_AVX512_CMP_OP_INT(v_uint32x16, epu32, epi32, (int)-1) +OPENCV_HAL_IMPL_AVX512_CMP_OP_INT(v_int32x16, epi32, epi32, (int)-1) +OPENCV_HAL_IMPL_AVX512_CMP_OP_INT(v_uint64x8, epu64, epi64, (int64)-1) +OPENCV_HAL_IMPL_AVX512_CMP_OP_INT(v_int64x8, epi64, epi64, (int64)-1) + +#define OPENCV_HAL_IMPL_AVX512_CMP_FLT(bin_op, imm8, _Tpvec, sufcmp, sufset, tval) \ + inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \ + { return _Tpvec(_mm512_castsi512_##sufcmp(_mm512_maskz_set1_##sufset(_mm512_cmp_##sufcmp##_mask(a.val, b.val, imm8), tval))); } + +#define OPENCV_HAL_IMPL_AVX512_CMP_OP_FLT(_Tpvec, sufcmp, sufset, tval) \ + OPENCV_HAL_IMPL_AVX512_CMP_FLT(==, _CMP_EQ_OQ, _Tpvec, sufcmp, sufset, tval) \ + OPENCV_HAL_IMPL_AVX512_CMP_FLT(!=, _CMP_NEQ_OQ, _Tpvec, sufcmp, sufset, tval) \ + OPENCV_HAL_IMPL_AVX512_CMP_FLT(<, _CMP_LT_OQ, _Tpvec, sufcmp, sufset, tval) \ + OPENCV_HAL_IMPL_AVX512_CMP_FLT(>, _CMP_GT_OQ, _Tpvec, sufcmp, sufset, tval) \ + OPENCV_HAL_IMPL_AVX512_CMP_FLT(<=, _CMP_LE_OQ, _Tpvec, sufcmp, sufset, tval) \ + OPENCV_HAL_IMPL_AVX512_CMP_FLT(>=, _CMP_GE_OQ, _Tpvec, sufcmp, sufset, tval) + +OPENCV_HAL_IMPL_AVX512_CMP_OP_FLT(v_float32x16, ps, epi32, (int)-1) +OPENCV_HAL_IMPL_AVX512_CMP_OP_FLT(v_float64x8, pd, epi64, (int64)-1) + +inline v_float32x16 v_not_nan(const v_float32x16& a) +{ return v_float32x16(_mm512_castsi512_ps(_mm512_maskz_set1_epi32(_mm512_cmp_ps_mask(a.val, a.val, _CMP_ORD_Q), (int)-1))); } +inline v_float64x8 v_not_nan(const v_float64x8& a) +{ return v_float64x8(_mm512_castsi512_pd(_mm512_maskz_set1_epi64(_mm512_cmp_pd_mask(a.val, a.val, _CMP_ORD_Q), (int64)-1))); } + +/** min/max **/ +OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_min, v_uint8x64, _mm512_min_epu8) +OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_max, v_uint8x64, _mm512_max_epu8) +OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_min, v_int8x64, _mm512_min_epi8) +OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_max, v_int8x64, _mm512_max_epi8) +OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_min, v_uint16x32, _mm512_min_epu16) +OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_max, v_uint16x32, _mm512_max_epu16) +OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_min, v_int16x32, _mm512_min_epi16) +OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_max, v_int16x32, _mm512_max_epi16) +OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_min, v_uint32x16, _mm512_min_epu32) +OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_max, v_uint32x16, _mm512_max_epu32) +OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_min, v_int32x16, _mm512_min_epi32) +OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_max, v_int32x16, _mm512_max_epi32) +OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_min, v_uint64x8, _mm512_min_epu64) +OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_max, v_uint64x8, _mm512_max_epu64) +OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_min, v_int64x8, _mm512_min_epi64) +OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_max, v_int64x8, _mm512_max_epi64) +OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_min, v_float32x16, _mm512_min_ps) +OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_max, v_float32x16, _mm512_max_ps) +OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_min, v_float64x8, 
_mm512_min_pd) +OPENCV_HAL_IMPL_AVX512_BIN_FUNC(v_max, v_float64x8, _mm512_max_pd) + +/** Rotate **/ +namespace { + template + struct _v_rotate_right { static inline v_int8x64 eval(const v_int8x64&, const v_int8x64&) { return v_int8x64(); }}; + template + struct _v_rotate_right { static inline v_int8x64 eval(const v_int8x64& a, const v_int8x64& b) + { + return v_int8x64(_mm512_or_si512(_mm512_srli_epi32(_mm512_alignr_epi32(b.val, a.val, imm32 ), imm4 *8), + _mm512_slli_epi32(_mm512_alignr_epi32(b.val, a.val, imm32 + 1), (4-imm4)*8))); + }}; + template + struct _v_rotate_right { static inline v_int8x64 eval(const v_int8x64& a, const v_int8x64& b) + { + return v_int8x64(_mm512_or_si512(_mm512_srli_epi32(_mm512_alignr_epi32(b.val, a.val, 15), imm4 *8), + _mm512_slli_epi32( b.val, (4-imm4)*8))); + }}; + template + struct _v_rotate_right { static inline v_int8x64 eval(const v_int8x64&, const v_int8x64& b) + { + return v_int8x64(_mm512_or_si512(_mm512_srli_epi32(_mm512_alignr_epi32(_mm512_setzero_si512(), b.val, imm32 - 16), imm4 *8), + _mm512_slli_epi32(_mm512_alignr_epi32(_mm512_setzero_si512(), b.val, imm32 - 15), (4-imm4)*8))); + }}; + template + struct _v_rotate_right { static inline v_int8x64 eval(const v_int8x64&, const v_int8x64& b) + { return v_int8x64(_mm512_srli_epi32(_mm512_alignr_epi32(_mm512_setzero_si512(), b.val, 15), imm4*8)); }}; + template + struct _v_rotate_right { static inline v_int8x64 eval(const v_int8x64& a, const v_int8x64& b) + { return v_int8x64(_mm512_alignr_epi32(b.val, a.val, imm32)); }}; + template<> + struct _v_rotate_right { static inline v_int8x64 eval(const v_int8x64& a, const v_int8x64&) { return a; }}; + template + struct _v_rotate_right { static inline v_int8x64 eval(const v_int8x64&, const v_int8x64& b) + { return v_int8x64(_mm512_alignr_epi32(_mm512_setzero_si512(), b.val, imm32 - 16)); }}; + template<> + struct _v_rotate_right { static inline v_int8x64 eval(const v_int8x64&, const v_int8x64& b) { return b; }}; + template<> + struct _v_rotate_right { static inline v_int8x64 eval(const v_int8x64&, const v_int8x64&) { return v_int8x64(); }}; +} +template inline v_int8x64 v_rotate_right(const v_int8x64& a, const v_int8x64& b) +{ + return imm >= 128 ? 
v_int8x64() : +#if CV_AVX_512VBMI + v_int8x64(_mm512_permutex2var_epi8(a.val, + _v512_set_epu8(0x3f + imm, 0x3e + imm, 0x3d + imm, 0x3c + imm, 0x3b + imm, 0x3a + imm, 0x39 + imm, 0x38 + imm, + 0x37 + imm, 0x36 + imm, 0x35 + imm, 0x34 + imm, 0x33 + imm, 0x32 + imm, 0x31 + imm, 0x30 + imm, + 0x2f + imm, 0x2e + imm, 0x2d + imm, 0x2c + imm, 0x2b + imm, 0x2a + imm, 0x29 + imm, 0x28 + imm, + 0x27 + imm, 0x26 + imm, 0x25 + imm, 0x24 + imm, 0x23 + imm, 0x22 + imm, 0x21 + imm, 0x20 + imm, + 0x1f + imm, 0x1e + imm, 0x1d + imm, 0x1c + imm, 0x1b + imm, 0x1a + imm, 0x19 + imm, 0x18 + imm, + 0x17 + imm, 0x16 + imm, 0x15 + imm, 0x14 + imm, 0x13 + imm, 0x12 + imm, 0x11 + imm, 0x10 + imm, + 0x0f + imm, 0x0e + imm, 0x0d + imm, 0x0c + imm, 0x0b + imm, 0x0a + imm, 0x09 + imm, 0x08 + imm, + 0x07 + imm, 0x06 + imm, 0x05 + imm, 0x04 + imm, 0x03 + imm, 0x02 + imm, 0x01 + imm, 0x00 + imm), b.val)); +#else + _v_rotate_right 15), imm/4>::eval(a, b); +#endif +} +template +inline v_int8x64 v_rotate_left(const v_int8x64& a, const v_int8x64& b) +{ + if (imm == 0) return a; + if (imm == 64) return b; + if (imm >= 128) return v_int8x64(); +#if CV_AVX_512VBMI + return v_int8x64(_mm512_permutex2var_epi8(b.val, + _v512_set_epi8(0x7f - imm,0x7e - imm,0x7d - imm,0x7c - imm,0x7b - imm,0x7a - imm,0x79 - imm,0x78 - imm, + 0x77 - imm,0x76 - imm,0x75 - imm,0x74 - imm,0x73 - imm,0x72 - imm,0x71 - imm,0x70 - imm, + 0x6f - imm,0x6e - imm,0x6d - imm,0x6c - imm,0x6b - imm,0x6a - imm,0x69 - imm,0x68 - imm, + 0x67 - imm,0x66 - imm,0x65 - imm,0x64 - imm,0x63 - imm,0x62 - imm,0x61 - imm,0x60 - imm, + 0x5f - imm,0x5e - imm,0x5d - imm,0x5c - imm,0x5b - imm,0x5a - imm,0x59 - imm,0x58 - imm, + 0x57 - imm,0x56 - imm,0x55 - imm,0x54 - imm,0x53 - imm,0x52 - imm,0x51 - imm,0x50 - imm, + 0x4f - imm,0x4e - imm,0x4d - imm,0x4c - imm,0x4b - imm,0x4a - imm,0x49 - imm,0x48 - imm, + 0x47 - imm,0x46 - imm,0x45 - imm,0x44 - imm,0x43 - imm,0x42 - imm,0x41 - imm,0x40 - imm), a.val)); +#else + return imm < 64 ? 
v_rotate_right<64 - imm>(b, a) : v_rotate_right<128 - imm>(v512_setzero_s8(), b); +#endif +} +template +inline v_int8x64 v_rotate_right(const v_int8x64& a) +{ + if (imm == 0) return a; + if (imm >= 64) return v_int8x64(); +#if CV_AVX_512VBMI + return v_int8x64(_mm512_maskz_permutexvar_epi8(0xFFFFFFFFFFFFFFFF >> imm, + _v512_set_epu8(0x3f + imm,0x3e + imm,0x3d + imm,0x3c + imm,0x3b + imm,0x3a + imm,0x39 + imm,0x38 + imm, + 0x37 + imm,0x36 + imm,0x35 + imm,0x34 + imm,0x33 + imm,0x32 + imm,0x31 + imm,0x30 + imm, + 0x2f + imm,0x2e + imm,0x2d + imm,0x2c + imm,0x2b + imm,0x2a + imm,0x29 + imm,0x28 + imm, + 0x27 + imm,0x26 + imm,0x25 + imm,0x24 + imm,0x23 + imm,0x22 + imm,0x21 + imm,0x20 + imm, + 0x1f + imm,0x1e + imm,0x1d + imm,0x1c + imm,0x1b + imm,0x1a + imm,0x19 + imm,0x18 + imm, + 0x17 + imm,0x16 + imm,0x15 + imm,0x14 + imm,0x13 + imm,0x12 + imm,0x11 + imm,0x10 + imm, + 0x0f + imm,0x0e + imm,0x0d + imm,0x0c + imm,0x0b + imm,0x0a + imm,0x09 + imm,0x08 + imm, + 0x07 + imm,0x06 + imm,0x05 + imm,0x04 + imm,0x03 + imm,0x02 + imm,0x01 + imm,0x00 + imm), a.val)); +#else + return v_rotate_right(a, v512_setzero_s8()); +#endif +} +template +inline v_int8x64 v_rotate_left(const v_int8x64& a) +{ + if (imm == 0) return a; + if (imm >= 64) return v_int8x64(); +#if CV_AVX_512VBMI + return v_int8x64(_mm512_maskz_permutexvar_epi8(0xFFFFFFFFFFFFFFFF << imm, + _v512_set_epi8(0x3f - imm,0x3e - imm,0x3d - imm,0x3c - imm,0x3b - imm,0x3a - imm,0x39 - imm,0x38 - imm, + 0x37 - imm,0x36 - imm,0x35 - imm,0x34 - imm,0x33 - imm,0x32 - imm,0x31 - imm,0x30 - imm, + 0x2f - imm,0x2e - imm,0x2d - imm,0x2c - imm,0x2b - imm,0x2a - imm,0x29 - imm,0x28 - imm, + 0x27 - imm,0x26 - imm,0x25 - imm,0x24 - imm,0x23 - imm,0x22 - imm,0x21 - imm,0x20 - imm, + 0x1f - imm,0x1e - imm,0x1d - imm,0x1c - imm,0x1b - imm,0x1a - imm,0x19 - imm,0x18 - imm, + 0x17 - imm,0x16 - imm,0x15 - imm,0x14 - imm,0x13 - imm,0x12 - imm,0x11 - imm,0x10 - imm, + 0x0f - imm,0x0e - imm,0x0d - imm,0x0c - imm,0x0b - imm,0x0a - imm,0x09 - imm,0x08 - imm, + 0x07 - imm,0x06 - imm,0x05 - imm,0x04 - imm,0x03 - imm,0x02 - imm,0x01 - imm,0x00 - imm), a.val)); +#else + return v_rotate_right<64 - imm>(v512_setzero_s8(), a); +#endif +} + +#define OPENCV_HAL_IMPL_AVX512_ROTATE_PM(_Tpvec, suffix) \ +template inline _Tpvec v_rotate_left(const _Tpvec& a, const _Tpvec& b) \ +{ return v_reinterpret_as_##suffix(v_rotate_left(v_reinterpret_as_s8(a), v_reinterpret_as_s8(b))); } \ +template inline _Tpvec v_rotate_right(const _Tpvec& a, const _Tpvec& b) \ +{ return v_reinterpret_as_##suffix(v_rotate_right(v_reinterpret_as_s8(a), v_reinterpret_as_s8(b))); } \ +template inline _Tpvec v_rotate_left(const _Tpvec& a) \ +{ return v_reinterpret_as_##suffix(v_rotate_left(v_reinterpret_as_s8(a))); } \ +template inline _Tpvec v_rotate_right(const _Tpvec& a) \ +{ return v_reinterpret_as_##suffix(v_rotate_right(v_reinterpret_as_s8(a))); } + +#define OPENCV_HAL_IMPL_AVX512_ROTATE_EC(_Tpvec, suffix) \ +template \ +inline _Tpvec v_rotate_left(const _Tpvec& a, const _Tpvec& b) \ +{ \ + enum { SHIFT2 = (_Tpvec::nlanes - imm) }; \ + enum { MASK = ((1 << _Tpvec::nlanes) - 1) }; \ + if (imm == 0) return a; \ + if (imm == _Tpvec::nlanes) return b; \ + if (imm >= 2*_Tpvec::nlanes) return _Tpvec::zero(); \ + return _Tpvec(_mm512_mask_expand_##suffix(_mm512_maskz_compress_##suffix((MASK << SHIFT2)&MASK, b.val), (MASK << (imm))&MASK, a.val)); \ +} \ +template \ +inline _Tpvec v_rotate_right(const _Tpvec& a, const _Tpvec& b) \ +{ \ + enum { SHIFT2 = (_Tpvec::nlanes - imm) }; \ + enum { MASK = ((1 << 
_Tpvec::nlanes) - 1) }; \ + if (imm == 0) return a; \ + if (imm == _Tpvec::nlanes) return b; \ + if (imm >= 2*_Tpvec::nlanes) return _Tpvec::zero(); \ + return _Tpvec(_mm512_mask_expand_##suffix(_mm512_maskz_compress_##suffix((MASK << (imm))&MASK, a.val), (MASK << SHIFT2)&MASK, b.val)); \ +} \ +template \ +inline _Tpvec v_rotate_left(const _Tpvec& a) \ +{ \ + if (imm == 0) return a; \ + if (imm >= _Tpvec::nlanes) return _Tpvec::zero(); \ + return _Tpvec(_mm512_maskz_expand_##suffix((1 << _Tpvec::nlanes) - (1 << (imm)), a.val)); \ +} \ +template \ +inline _Tpvec v_rotate_right(const _Tpvec& a) \ +{ \ + if (imm == 0) return a; \ + if (imm >= _Tpvec::nlanes) return _Tpvec::zero(); \ + return _Tpvec(_mm512_maskz_compress_##suffix((1 << _Tpvec::nlanes) - (1 << (imm)), a.val)); \ +} + +OPENCV_HAL_IMPL_AVX512_ROTATE_PM(v_uint8x64, u8) +OPENCV_HAL_IMPL_AVX512_ROTATE_PM(v_uint16x32, u16) +OPENCV_HAL_IMPL_AVX512_ROTATE_PM(v_int16x32, s16) +OPENCV_HAL_IMPL_AVX512_ROTATE_EC(v_uint32x16, epi32) +OPENCV_HAL_IMPL_AVX512_ROTATE_EC(v_int32x16, epi32) +OPENCV_HAL_IMPL_AVX512_ROTATE_EC(v_uint64x8, epi64) +OPENCV_HAL_IMPL_AVX512_ROTATE_EC(v_int64x8, epi64) +OPENCV_HAL_IMPL_AVX512_ROTATE_EC(v_float32x16, ps) +OPENCV_HAL_IMPL_AVX512_ROTATE_EC(v_float64x8, pd) + +/** Reverse **/ +inline v_uint8x64 v_reverse(const v_uint8x64 &a) +{ +#if CV_AVX_512VBMI + static const __m512i perm = _mm512_set_epi32( + 0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f, + 0x10111213, 0x14151617, 0x18191a1b, 0x1c1d1e1f, + 0x20212223, 0x24252627, 0x28292a2b, 0x2c2d2e2f, + 0x30313233, 0x34353637, 0x38393a3b, 0x3c3d3e3f); + return v_uint8x64(_mm512_permutexvar_epi8(perm, a.val)); +#else + static const __m512i shuf = _mm512_set_epi32( + 0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f, + 0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f, + 0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f, + 0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f); + static const __m512i perm = _mm512_set_epi64(1, 0, 3, 2, 5, 4, 7, 6); + __m512i vec = _mm512_shuffle_epi8(a.val, shuf); + return v_uint8x64(_mm512_permutexvar_epi64(perm, vec)); +#endif +} + +inline v_int8x64 v_reverse(const v_int8x64 &a) +{ return v_reinterpret_as_s8(v_reverse(v_reinterpret_as_u8(a))); } + +inline v_uint16x32 v_reverse(const v_uint16x32 &a) +{ +#if CV_AVX_512VBMI + static const __m512i perm = _mm512_set_epi32( + 0x00000001, 0x00020003, 0x00040005, 0x00060007, + 0x00080009, 0x000a000b, 0x000c000d, 0x000e000f, + 0x00100011, 0x00120013, 0x00140015, 0x00160017, + 0x00180019, 0x001a001b, 0x001c001d, 0x001e001f); + return v_uint16x32(_mm512_permutexvar_epi16(perm, a.val)); +#else + static const __m512i shuf = _mm512_set_epi32( + 0x01000302, 0x05040706, 0x09080b0a, 0x0d0c0f0e, + 0x01000302, 0x05040706, 0x09080b0a, 0x0d0c0f0e, + 0x01000302, 0x05040706, 0x09080b0a, 0x0d0c0f0e, + 0x01000302, 0x05040706, 0x09080b0a, 0x0d0c0f0e); + static const __m512i perm = _mm512_set_epi64(1, 0, 3, 2, 5, 4, 7, 6); + __m512i vec = _mm512_shuffle_epi8(a.val, shuf); + return v_uint16x32(_mm512_permutexvar_epi64(perm, vec)); +#endif +} + +inline v_int16x32 v_reverse(const v_int16x32 &a) +{ return v_reinterpret_as_s16(v_reverse(v_reinterpret_as_u16(a))); } + +inline v_uint32x16 v_reverse(const v_uint32x16 &a) +{ + static const __m512i perm = _mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,14, 15); + return v_uint32x16(_mm512_permutexvar_epi32(perm, a.val)); +} + +inline v_int32x16 v_reverse(const v_int32x16 &a) +{ return v_reinterpret_as_s32(v_reverse(v_reinterpret_as_u32(a))); } + +inline v_float32x16 
v_reverse(const v_float32x16 &a) +{ return v_reinterpret_as_f32(v_reverse(v_reinterpret_as_u32(a))); } + +inline v_uint64x8 v_reverse(const v_uint64x8 &a) +{ + static const __m512i perm = _mm512_set_epi64(0, 1, 2, 3, 4, 5, 6, 7); + return v_uint64x8(_mm512_permutexvar_epi64(perm, a.val)); +} + +inline v_int64x8 v_reverse(const v_int64x8 &a) +{ return v_reinterpret_as_s64(v_reverse(v_reinterpret_as_u64(a))); } + +inline v_float64x8 v_reverse(const v_float64x8 &a) +{ return v_reinterpret_as_f64(v_reverse(v_reinterpret_as_u64(a))); } + +////////// Reduce ///////// + +/** Reduce **/ +#define OPENCV_HAL_IMPL_AVX512_REDUCE_ADD64(a, b) a + b +#define OPENCV_HAL_IMPL_AVX512_REDUCE_8(sctype, func, _Tpvec, ifunc, scop) \ + inline sctype v_reduce_##func(const _Tpvec& a) \ + { __m256i half = _mm256_##ifunc(_v512_extract_low(a.val), _v512_extract_high(a.val)); \ + sctype CV_DECL_ALIGNED(64) idx[2]; \ + _mm_store_si128((__m128i*)idx, _mm_##ifunc(_mm256_castsi256_si128(half), _mm256_extracti128_si256(half, 1))); \ + return scop(idx[0], idx[1]); } +OPENCV_HAL_IMPL_AVX512_REDUCE_8(uint64, min, v_uint64x8, min_epu64, min) +OPENCV_HAL_IMPL_AVX512_REDUCE_8(uint64, max, v_uint64x8, max_epu64, max) +OPENCV_HAL_IMPL_AVX512_REDUCE_8(uint64, sum, v_uint64x8, add_epi64, OPENCV_HAL_IMPL_AVX512_REDUCE_ADD64) +OPENCV_HAL_IMPL_AVX512_REDUCE_8(int64, min, v_int64x8, min_epi64, min) +OPENCV_HAL_IMPL_AVX512_REDUCE_8(int64, max, v_int64x8, max_epi64, max) +OPENCV_HAL_IMPL_AVX512_REDUCE_8(int64, sum, v_int64x8, add_epi64, OPENCV_HAL_IMPL_AVX512_REDUCE_ADD64) + +#define OPENCV_HAL_IMPL_AVX512_REDUCE_8F(func, ifunc, scop) \ + inline double v_reduce_##func(const v_float64x8& a) \ + { __m256d half = _mm256_##ifunc(_v512_extract_low(a.val), _v512_extract_high(a.val)); \ + double CV_DECL_ALIGNED(64) idx[2]; \ + _mm_store_pd(idx, _mm_##ifunc(_mm256_castpd256_pd128(half), _mm256_extractf128_pd(half, 1))); \ + return scop(idx[0], idx[1]); } +OPENCV_HAL_IMPL_AVX512_REDUCE_8F(min, min_pd, min) +OPENCV_HAL_IMPL_AVX512_REDUCE_8F(max, max_pd, max) +OPENCV_HAL_IMPL_AVX512_REDUCE_8F(sum, add_pd, OPENCV_HAL_IMPL_AVX512_REDUCE_ADD64) + +#define OPENCV_HAL_IMPL_AVX512_REDUCE_16(sctype, func, _Tpvec, ifunc) \ + inline sctype v_reduce_##func(const _Tpvec& a) \ + { __m256i half = _mm256_##ifunc(_v512_extract_low(a.val), _v512_extract_high(a.val)); \ + __m128i quarter = _mm_##ifunc(_mm256_castsi256_si128(half), _mm256_extracti128_si256(half, 1)); \ + quarter = _mm_##ifunc(quarter, _mm_srli_si128(quarter, 8)); \ + quarter = _mm_##ifunc(quarter, _mm_srli_si128(quarter, 4)); \ + return (sctype)_mm_cvtsi128_si32(quarter); } +OPENCV_HAL_IMPL_AVX512_REDUCE_16(uint, min, v_uint32x16, min_epu32) +OPENCV_HAL_IMPL_AVX512_REDUCE_16(uint, max, v_uint32x16, max_epu32) +OPENCV_HAL_IMPL_AVX512_REDUCE_16(int, min, v_int32x16, min_epi32) +OPENCV_HAL_IMPL_AVX512_REDUCE_16(int, max, v_int32x16, max_epi32) + +#define OPENCV_HAL_IMPL_AVX512_REDUCE_16F(func, ifunc) \ + inline float v_reduce_##func(const v_float32x16& a) \ + { __m256 half = _mm256_##ifunc(_v512_extract_low(a.val), _v512_extract_high(a.val)); \ + __m128 quarter = _mm_##ifunc(_mm256_castps256_ps128(half), _mm256_extractf128_ps(half, 1)); \ + quarter = _mm_##ifunc(quarter, _mm_permute_ps(quarter, _MM_SHUFFLE(0, 0, 3, 2))); \ + quarter = _mm_##ifunc(quarter, _mm_permute_ps(quarter, _MM_SHUFFLE(0, 0, 0, 1))); \ + return _mm_cvtss_f32(quarter); } +OPENCV_HAL_IMPL_AVX512_REDUCE_16F(min, min_ps) +OPENCV_HAL_IMPL_AVX512_REDUCE_16F(max, max_ps) + +inline float v_reduce_sum(const v_float32x16& a) +{ + __m256 half 
= _mm256_add_ps(_v512_extract_low(a.val), _v512_extract_high(a.val)); + __m128 quarter = _mm_add_ps(_mm256_castps256_ps128(half), _mm256_extractf128_ps(half, 1)); + quarter = _mm_hadd_ps(quarter, quarter); + return _mm_cvtss_f32(_mm_hadd_ps(quarter, quarter)); +} +inline int v_reduce_sum(const v_int32x16& a) +{ + __m256i half = _mm256_add_epi32(_v512_extract_low(a.val), _v512_extract_high(a.val)); + __m128i quarter = _mm_add_epi32(_mm256_castsi256_si128(half), _mm256_extracti128_si256(half, 1)); + quarter = _mm_hadd_epi32(quarter, quarter); + return _mm_cvtsi128_si32(_mm_hadd_epi32(quarter, quarter)); +} +inline uint v_reduce_sum(const v_uint32x16& a) +{ return (uint)v_reduce_sum(v_reinterpret_as_s32(a)); } + +#define OPENCV_HAL_IMPL_AVX512_REDUCE_32(sctype, func, _Tpvec, ifunc) \ + inline sctype v_reduce_##func(const _Tpvec& a) \ + { __m256i half = _mm256_##ifunc(_v512_extract_low(a.val), _v512_extract_high(a.val)); \ + __m128i quarter = _mm_##ifunc(_mm256_castsi256_si128(half), _mm256_extracti128_si256(half, 1)); \ + quarter = _mm_##ifunc(quarter, _mm_srli_si128(quarter, 8)); \ + quarter = _mm_##ifunc(quarter, _mm_srli_si128(quarter, 4)); \ + quarter = _mm_##ifunc(quarter, _mm_srli_si128(quarter, 2)); \ + return (sctype)_mm_cvtsi128_si32(quarter); } +OPENCV_HAL_IMPL_AVX512_REDUCE_32(ushort, min, v_uint16x32, min_epu16) +OPENCV_HAL_IMPL_AVX512_REDUCE_32(ushort, max, v_uint16x32, max_epu16) +OPENCV_HAL_IMPL_AVX512_REDUCE_32(short, min, v_int16x32, min_epi16) +OPENCV_HAL_IMPL_AVX512_REDUCE_32(short, max, v_int16x32, max_epi16) + +inline int v_reduce_sum(const v_int16x32& a) +{ return v_reduce_sum(v_expand_low(a) + v_expand_high(a)); } +inline uint v_reduce_sum(const v_uint16x32& a) +{ return v_reduce_sum(v_expand_low(a) + v_expand_high(a)); } + +#define OPENCV_HAL_IMPL_AVX512_REDUCE_64(sctype, func, _Tpvec, ifunc) \ + inline sctype v_reduce_##func(const _Tpvec& a) \ + { __m256i half = _mm256_##ifunc(_v512_extract_low(a.val), _v512_extract_high(a.val)); \ + __m128i quarter = _mm_##ifunc(_mm256_castsi256_si128(half), _mm256_extracti128_si256(half, 1)); \ + quarter = _mm_##ifunc(quarter, _mm_srli_si128(quarter, 8)); \ + quarter = _mm_##ifunc(quarter, _mm_srli_si128(quarter, 4)); \ + quarter = _mm_##ifunc(quarter, _mm_srli_si128(quarter, 2)); \ + quarter = _mm_##ifunc(quarter, _mm_srli_si128(quarter, 1)); \ + return (sctype)_mm_cvtsi128_si32(quarter); } +OPENCV_HAL_IMPL_AVX512_REDUCE_64(uchar, min, v_uint8x64, min_epu8) +OPENCV_HAL_IMPL_AVX512_REDUCE_64(uchar, max, v_uint8x64, max_epu8) +OPENCV_HAL_IMPL_AVX512_REDUCE_64(schar, min, v_int8x64, min_epi8) +OPENCV_HAL_IMPL_AVX512_REDUCE_64(schar, max, v_int8x64, max_epi8) + +#define OPENCV_HAL_IMPL_AVX512_REDUCE_64_SUM(sctype, _Tpvec, suffix) \ + inline sctype v_reduce_sum(const _Tpvec& a) \ + { __m512i a16 = _mm512_add_epi16(_mm512_cvt##suffix##_epi16(_v512_extract_low(a.val)), \ + _mm512_cvt##suffix##_epi16(_v512_extract_high(a.val))); \ + a16 = _mm512_cvtepi16_epi32(_mm256_add_epi16(_v512_extract_low(a16), _v512_extract_high(a16))); \ + __m256i a8 = _mm256_add_epi32(_v512_extract_low(a16), _v512_extract_high(a16)); \ + __m128i a4 = _mm_add_epi32(_mm256_castsi256_si128(a8), _mm256_extracti128_si256(a8, 1)); \ + a4 = _mm_hadd_epi32(a4, a4); \ + return (sctype)_mm_cvtsi128_si32(_mm_hadd_epi32(a4, a4)); } +OPENCV_HAL_IMPL_AVX512_REDUCE_64_SUM(uint, v_uint8x64, epu8) +OPENCV_HAL_IMPL_AVX512_REDUCE_64_SUM(int, v_int8x64, epi8) + +inline v_float32x16 v_reduce_sum4(const v_float32x16& a, const v_float32x16& b, + const v_float32x16& c, const 
v_float32x16& d) +{ + __m256 abl = _mm256_hadd_ps(_v512_extract_low(a.val), _v512_extract_low(b.val)); + __m256 abh = _mm256_hadd_ps(_v512_extract_high(a.val), _v512_extract_high(b.val)); + __m256 cdl = _mm256_hadd_ps(_v512_extract_low(c.val), _v512_extract_low(d.val)); + __m256 cdh = _mm256_hadd_ps(_v512_extract_high(c.val), _v512_extract_high(d.val)); + return v_float32x16(_v512_combine(_mm256_hadd_ps(abl, cdl), _mm256_hadd_ps(abh, cdh))); +} + +inline unsigned v_reduce_sad(const v_uint8x64& a, const v_uint8x64& b) +{ + __m512i val = _mm512_sad_epu8(a.val, b.val); + __m256i half = _mm256_add_epi32(_v512_extract_low(val), _v512_extract_high(val)); + __m128i quarter = _mm_add_epi32(_mm256_castsi256_si128(half), _mm256_extracti128_si256(half, 1)); + return (unsigned)_mm_cvtsi128_si32(_mm_add_epi32(quarter, _mm_unpackhi_epi64(quarter, quarter))); +} +inline unsigned v_reduce_sad(const v_int8x64& a, const v_int8x64& b) +{ + __m512i val = _mm512_set1_epi8(-128); + val = _mm512_sad_epu8(_mm512_add_epi8(a.val, val), _mm512_add_epi8(b.val, val)); + __m256i half = _mm256_add_epi32(_v512_extract_low(val), _v512_extract_high(val)); + __m128i quarter = _mm_add_epi32(_mm256_castsi256_si128(half), _mm256_extracti128_si256(half, 1)); + return (unsigned)_mm_cvtsi128_si32(_mm_add_epi32(quarter, _mm_unpackhi_epi64(quarter, quarter))); +} +inline unsigned v_reduce_sad(const v_uint16x32& a, const v_uint16x32& b) +{ return v_reduce_sum(v_add_wrap(a - b, b - a)); } +inline unsigned v_reduce_sad(const v_int16x32& a, const v_int16x32& b) +{ return v_reduce_sum(v_reinterpret_as_u16(v_sub_wrap(v_max(a, b), v_min(a, b)))); } +inline unsigned v_reduce_sad(const v_uint32x16& a, const v_uint32x16& b) +{ return v_reduce_sum(v_max(a, b) - v_min(a, b)); } +inline unsigned v_reduce_sad(const v_int32x16& a, const v_int32x16& b) +{ return v_reduce_sum(v_reinterpret_as_u32(v_max(a, b) - v_min(a, b))); } +inline float v_reduce_sad(const v_float32x16& a, const v_float32x16& b) +{ return v_reduce_sum((a - b) & v_float32x16(_mm512_castsi512_ps(_mm512_set1_epi32(0x7fffffff)))); } +inline double v_reduce_sad(const v_float64x8& a, const v_float64x8& b) +{ return v_reduce_sum((a - b) & v_float64x8(_mm512_castsi512_pd(_mm512_set1_epi64(0x7fffffffffffffff)))); } + +/** Popcount **/ +inline v_uint8x64 v_popcount(const v_int8x64& a) +{ +#if CV_AVX_512BITALG + return v_uint8x64(_mm512_popcnt_epi8(a.val)); +#elif CV_AVX_512VBMI + __m512i _popcnt_table0 = _v512_set_epu8(7, 6, 6, 5, 6, 5, 5, 4, 6, 5, 5, 4, 5, 4, 4, 3, + 5, 4, 4, 3, 4, 3, 3, 2, 4, 3, 3, 2, 3, 2, 2, 1, + 5, 4, 4, 3, 4, 3, 3, 2, 4, 3, 3, 2, 3, 2, 2, 1, + 4, 3, 3, 2, 3, 2, 2, 1, 3, 2, 2, 1, 2, 1, 1, 0); + __m512i _popcnt_table1 = _v512_set_epu8(7, 6, 6, 5, 6, 5, 5, 4, 6, 5, 5, 4, 5, 4, 4, 3, + 6, 5, 5, 4, 5, 4, 4, 3, 5, 4, 4, 3, 4, 3, 3, 2, + 6, 5, 5, 4, 5, 4, 4, 3, 5, 4, 4, 3, 4, 3, 3, 2, + 5, 4, 4, 3, 4, 3, 3, 2, 4, 3, 3, 2, 3, 2, 2, 1); + return v_uint8x64(_mm512_sub_epi8(_mm512_permutex2var_epi8(_popcnt_table0, a.val, _popcnt_table1), _mm512_movm_epi8(_mm512_movepi8_mask(a.val)))); +#else + __m512i _popcnt_table = _mm512_set4_epi32(0x04030302, 0x03020201, 0x03020201, 0x02010100); + __m512i _popcnt_mask = _mm512_set1_epi8(0x0F); + + return v_uint8x64(_mm512_add_epi8(_mm512_shuffle_epi8(_popcnt_table, _mm512_and_si512( a.val, _popcnt_mask)), + _mm512_shuffle_epi8(_popcnt_table, _mm512_and_si512(_mm512_srli_epi16(a.val, 4), _popcnt_mask)))); +#endif +} +inline v_uint16x32 v_popcount(const v_int16x32& a) +{ +#if CV_AVX_512BITALG + return v_uint16x32(_mm512_popcnt_epi16(a.val)); 
+#elif CV_AVX_512VPOPCNTDQ + __m512i zero = _mm512_setzero_si512(); + return v_uint16x32(_mm512_packs_epi32(_mm512_popcnt_epi32(_mm512_unpacklo_epi16(a.val, zero)), + _mm512_popcnt_epi32(_mm512_unpackhi_epi16(a.val, zero)))); +#else + v_uint8x64 p = v_popcount(v_reinterpret_as_s8(a)); + p += v_rotate_right<1>(p); + return v_reinterpret_as_u16(p) & v512_setall_u16(0x00ff); +#endif +} +inline v_uint32x16 v_popcount(const v_int32x16& a) +{ +#if CV_AVX_512VPOPCNTDQ + return v_uint32x16(_mm512_popcnt_epi32(a.val)); +#else + v_uint8x64 p = v_popcount(v_reinterpret_as_s8(a)); + p += v_rotate_right<1>(p); + p += v_rotate_right<2>(p); + return v_reinterpret_as_u32(p) & v512_setall_u32(0x000000ff); +#endif +} +inline v_uint64x8 v_popcount(const v_int64x8& a) +{ +#if CV_AVX_512VPOPCNTDQ + return v_uint64x8(_mm512_popcnt_epi64(a.val)); +#else + return v_uint64x8(_mm512_sad_epu8(v_popcount(v_reinterpret_as_s8(a)).val, _mm512_setzero_si512())); +#endif +} + + +inline v_uint8x64 v_popcount(const v_uint8x64& a) { return v_popcount(v_reinterpret_as_s8 (a)); } +inline v_uint16x32 v_popcount(const v_uint16x32& a) { return v_popcount(v_reinterpret_as_s16(a)); } +inline v_uint32x16 v_popcount(const v_uint32x16& a) { return v_popcount(v_reinterpret_as_s32(a)); } +inline v_uint64x8 v_popcount(const v_uint64x8& a) { return v_popcount(v_reinterpret_as_s64(a)); } + + +////////// Other math ///////// + +/** Some frequent operations **/ +#define OPENCV_HAL_IMPL_AVX512_MULADD(_Tpvec, suffix) \ + inline _Tpvec v_fma(const _Tpvec& a, const _Tpvec& b, const _Tpvec& c) \ + { return _Tpvec(_mm512_fmadd_##suffix(a.val, b.val, c.val)); } \ + inline _Tpvec v_muladd(const _Tpvec& a, const _Tpvec& b, const _Tpvec& c) \ + { return _Tpvec(_mm512_fmadd_##suffix(a.val, b.val, c.val)); } \ + inline _Tpvec v_sqrt(const _Tpvec& x) \ + { return _Tpvec(_mm512_sqrt_##suffix(x.val)); } \ + inline _Tpvec v_sqr_magnitude(const _Tpvec& a, const _Tpvec& b) \ + { return v_fma(a, a, b * b); } \ + inline _Tpvec v_magnitude(const _Tpvec& a, const _Tpvec& b) \ + { return v_sqrt(v_fma(a, a, b * b)); } + +OPENCV_HAL_IMPL_AVX512_MULADD(v_float32x16, ps) +OPENCV_HAL_IMPL_AVX512_MULADD(v_float64x8, pd) + +inline v_int32x16 v_fma(const v_int32x16& a, const v_int32x16& b, const v_int32x16& c) +{ return a * b + c; } +inline v_int32x16 v_muladd(const v_int32x16& a, const v_int32x16& b, const v_int32x16& c) +{ return v_fma(a, b, c); } + +inline v_float32x16 v_invsqrt(const v_float32x16& x) +{ +#if CV_AVX_512ER + return v_float32x16(_mm512_rsqrt28_ps(x.val)); +#else + v_float32x16 half = x * v512_setall_f32(0.5); + v_float32x16 t = v_float32x16(_mm512_rsqrt14_ps(x.val)); + t *= v512_setall_f32(1.5) - ((t * t) * half); + return t; +#endif +} + +inline v_float64x8 v_invsqrt(const v_float64x8& x) +{ +#if CV_AVX_512ER + return v_float64x8(_mm512_rsqrt28_pd(x.val)); +#else + return v512_setall_f64(1.) 
/ v_sqrt(x); +// v_float64x8 half = x * v512_setall_f64(0.5); +// v_float64x8 t = v_float64x8(_mm512_rsqrt14_pd(x.val)); +// t *= v512_setall_f64(1.5) - ((t * t) * half); +// t *= v512_setall_f64(1.5) - ((t * t) * half); +// return t; +#endif +} + +/** Absolute values **/ +#define OPENCV_HAL_IMPL_AVX512_ABS(_Tpvec, _Tpuvec, suffix) \ + inline _Tpuvec v_abs(const _Tpvec& x) \ + { return _Tpuvec(_mm512_abs_##suffix(x.val)); } + +OPENCV_HAL_IMPL_AVX512_ABS(v_int8x64, v_uint8x64, epi8) +OPENCV_HAL_IMPL_AVX512_ABS(v_int16x32, v_uint16x32, epi16) +OPENCV_HAL_IMPL_AVX512_ABS(v_int32x16, v_uint32x16, epi32) +OPENCV_HAL_IMPL_AVX512_ABS(v_int64x8, v_uint64x8, epi64) + +inline v_float32x16 v_abs(const v_float32x16& x) +{ +#ifdef _mm512_abs_pd + return v_float32x16(_mm512_abs_ps(x.val)); +#else + return v_float32x16(_mm512_castsi512_ps(_mm512_and_si512(_mm512_castps_si512(x.val), + _v512_set_epu64(0x7FFFFFFF7FFFFFFF, 0x7FFFFFFF7FFFFFFF, 0x7FFFFFFF7FFFFFFF, 0x7FFFFFFF7FFFFFFF, + 0x7FFFFFFF7FFFFFFF, 0x7FFFFFFF7FFFFFFF, 0x7FFFFFFF7FFFFFFF, 0x7FFFFFFF7FFFFFFF)))); +#endif +} + +inline v_float64x8 v_abs(const v_float64x8& x) +{ +#ifdef _mm512_abs_pd + #if defined __GNUC__ && (__GNUC__ < 7 || (__GNUC__ == 7 && __GNUC_MINOR__ <= 3) || (__GNUC__ == 8 && __GNUC_MINOR__ <= 2)) + // Workaround for https://gcc.gnu.org/bugzilla/show_bug.cgi?id=87476 + return v_float64x8(_mm512_abs_pd(_mm512_castpd_ps(x.val))); + #else + return v_float64x8(_mm512_abs_pd(x.val)); + #endif +#else + return v_float64x8(_mm512_castsi512_pd(_mm512_and_si512(_mm512_castpd_si512(x.val), + _v512_set_epu64(0x7FFFFFFFFFFFFFFF, 0x7FFFFFFFFFFFFFFF, 0x7FFFFFFFFFFFFFFF, 0x7FFFFFFFFFFFFFFF, + 0x7FFFFFFFFFFFFFFF, 0x7FFFFFFFFFFFFFFF, 0x7FFFFFFFFFFFFFFF, 0x7FFFFFFFFFFFFFFF)))); +#endif +} + +/** Absolute difference **/ +inline v_uint8x64 v_absdiff(const v_uint8x64& a, const v_uint8x64& b) +{ return v_add_wrap(a - b, b - a); } +inline v_uint16x32 v_absdiff(const v_uint16x32& a, const v_uint16x32& b) +{ return v_add_wrap(a - b, b - a); } +inline v_uint32x16 v_absdiff(const v_uint32x16& a, const v_uint32x16& b) +{ return v_max(a, b) - v_min(a, b); } + +inline v_uint8x64 v_absdiff(const v_int8x64& a, const v_int8x64& b) +{ + v_int8x64 d = v_sub_wrap(a, b); + v_int8x64 m = a < b; + return v_reinterpret_as_u8(v_sub_wrap(d ^ m, m)); +} + +inline v_uint16x32 v_absdiff(const v_int16x32& a, const v_int16x32& b) +{ return v_reinterpret_as_u16(v_sub_wrap(v_max(a, b), v_min(a, b))); } + +inline v_uint32x16 v_absdiff(const v_int32x16& a, const v_int32x16& b) +{ + v_int32x16 d = a - b; + v_int32x16 m = a < b; + return v_reinterpret_as_u32((d ^ m) - m); +} + +inline v_float32x16 v_absdiff(const v_float32x16& a, const v_float32x16& b) +{ return v_abs(a - b); } + +inline v_float64x8 v_absdiff(const v_float64x8& a, const v_float64x8& b) +{ return v_abs(a - b); } + +/** Saturating absolute difference **/ +inline v_int8x64 v_absdiffs(const v_int8x64& a, const v_int8x64& b) +{ + v_int8x64 d = a - b; + v_int8x64 m = a < b; + return (d ^ m) - m; +} +inline v_int16x32 v_absdiffs(const v_int16x32& a, const v_int16x32& b) +{ return v_max(a, b) - v_min(a, b); } + +////////// Conversions ///////// + +/** Rounding **/ +inline v_int32x16 v_round(const v_float32x16& a) +{ return v_int32x16(_mm512_cvtps_epi32(a.val)); } + +inline v_int32x16 v_round(const v_float64x8& a) +{ return v_int32x16(_mm512_castsi256_si512(_mm512_cvtpd_epi32(a.val))); } + +inline v_int32x16 v_round(const v_float64x8& a, const v_float64x8& b) +{ return v_int32x16(_v512_combine(_mm512_cvtpd_epi32(a.val), 
_mm512_cvtpd_epi32(b.val))); } + +inline v_int32x16 v_trunc(const v_float32x16& a) +{ return v_int32x16(_mm512_cvttps_epi32(a.val)); } + +inline v_int32x16 v_trunc(const v_float64x8& a) +{ return v_int32x16(_mm512_castsi256_si512(_mm512_cvttpd_epi32(a.val))); } + +#if CVT_ROUND_MODES_IMPLEMENTED +inline v_int32x16 v_floor(const v_float32x16& a) +{ return v_int32x16(_mm512_cvt_roundps_epi32(a.val, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC)); } + +inline v_int32x16 v_floor(const v_float64x8& a) +{ return v_int32x16(_mm512_castsi256_si512(_mm512_cvt_roundpd_epi32(a.val, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC))); } + +inline v_int32x16 v_ceil(const v_float32x16& a) +{ return v_int32x16(_mm512_cvt_roundps_epi32(a.val, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC)); } + +inline v_int32x16 v_ceil(const v_float64x8& a) +{ return v_int32x16(_mm512_castsi256_si512(_mm512_cvt_roundpd_epi32(a.val, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC))); } +#else +inline v_int32x16 v_floor(const v_float32x16& a) +{ return v_int32x16(_mm512_cvtps_epi32(_mm512_roundscale_ps(a.val, 1))); } + +inline v_int32x16 v_floor(const v_float64x8& a) +{ return v_int32x16(_mm512_castsi256_si512(_mm512_cvtpd_epi32(_mm512_roundscale_pd(a.val, 1)))); } + +inline v_int32x16 v_ceil(const v_float32x16& a) +{ return v_int32x16(_mm512_cvtps_epi32(_mm512_roundscale_ps(a.val, 2))); } + +inline v_int32x16 v_ceil(const v_float64x8& a) +{ return v_int32x16(_mm512_castsi256_si512(_mm512_cvtpd_epi32(_mm512_roundscale_pd(a.val, 2)))); } +#endif + +/** To float **/ +inline v_float32x16 v_cvt_f32(const v_int32x16& a) +{ return v_float32x16(_mm512_cvtepi32_ps(a.val)); } + +inline v_float32x16 v_cvt_f32(const v_float64x8& a) +{ return v_float32x16(_mm512_cvtpd_pslo(a.val)); } + +inline v_float32x16 v_cvt_f32(const v_float64x8& a, const v_float64x8& b) +{ return v_float32x16(_v512_combine(_mm512_cvtpd_ps(a.val), _mm512_cvtpd_ps(b.val))); } + +inline v_float64x8 v_cvt_f64(const v_int32x16& a) +{ return v_float64x8(_mm512_cvtepi32_pd(_v512_extract_low(a.val))); } + +inline v_float64x8 v_cvt_f64_high(const v_int32x16& a) +{ return v_float64x8(_mm512_cvtepi32_pd(_v512_extract_high(a.val))); } + +inline v_float64x8 v_cvt_f64(const v_float32x16& a) +{ return v_float64x8(_mm512_cvtps_pd(_v512_extract_low(a.val))); } + +inline v_float64x8 v_cvt_f64_high(const v_float32x16& a) +{ return v_float64x8(_mm512_cvtps_pd(_v512_extract_high(a.val))); } + +// from (Mysticial and wim) https://stackoverflow.com/q/41144668 +inline v_float64x8 v_cvt_f64(const v_int64x8& v) +{ +#if CV_AVX_512DQ + return v_float64x8(_mm512_cvtepi64_pd(v.val)); +#else + // constants encoded as floating-point + __m512i magic_i_lo = _mm512_set1_epi64(0x4330000000000000); // 2^52 + __m512i magic_i_hi32 = _mm512_set1_epi64(0x4530000080000000); // 2^84 + 2^63 + __m512i magic_i_all = _mm512_set1_epi64(0x4530000080100000); // 2^84 + 2^63 + 2^52 + __m512d magic_d_all = _mm512_castsi512_pd(magic_i_all); + + // Blend the 32 lowest significant bits of v with magic_int_lo + __m512i v_lo = _mm512_mask_blend_epi32(0x5555, magic_i_lo, v.val); + // Extract the 32 most significant bits of v + __m512i v_hi = _mm512_srli_epi64(v.val, 32); + // Flip the msb of v_hi and blend with 0x45300000 + v_hi = _mm512_xor_si512(v_hi, magic_i_hi32); + // Compute in double precision + __m512d v_hi_dbl = _mm512_sub_pd(_mm512_castsi512_pd(v_hi), magic_d_all); + // (v_hi - magic_d_all) + v_lo Do not assume associativity of floating point addition + __m512d result = _mm512_add_pd(v_hi_dbl, _mm512_castsi512_pd(v_lo)); + return 
v_float64x8(result); +#endif +} + +////////////// Lookup table access //////////////////// + +inline v_int8x64 v512_lut(const schar* tab, const int* idx) +{ + __m128i p0 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_mm512_loadu_si512((const __m512i*)idx ), (const int *)tab, 1)); + __m128i p1 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_mm512_loadu_si512((const __m512i*)idx + 1), (const int *)tab, 1)); + __m128i p2 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_mm512_loadu_si512((const __m512i*)idx + 2), (const int *)tab, 1)); + __m128i p3 = _mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_mm512_loadu_si512((const __m512i*)idx + 3), (const int *)tab, 1)); + return v_int8x64(_mm512_inserti32x4(_mm512_inserti32x4(_mm512_inserti32x4(_mm512_castsi128_si512(p0), p1, 1), p2, 2), p3, 3)); +} +inline v_int8x64 v512_lut_pairs(const schar* tab, const int* idx) +{ + __m256i p0 = _mm512_cvtepi32_epi16(_mm512_i32gather_epi32(_mm512_loadu_si512((const __m512i*)idx ), (const int *)tab, 1)); + __m256i p1 = _mm512_cvtepi32_epi16(_mm512_i32gather_epi32(_mm512_loadu_si512((const __m512i*)idx + 1), (const int *)tab, 1)); + return v_int8x64(_v512_combine(p0, p1)); +} +inline v_int8x64 v512_lut_quads(const schar* tab, const int* idx) +{ + return v_int8x64(_mm512_i32gather_epi32(_mm512_loadu_si512((const __m512i*)idx), (const int *)tab, 1)); +} +inline v_uint8x64 v512_lut(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v512_lut((const schar *)tab, idx)); } +inline v_uint8x64 v512_lut_pairs(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v512_lut_pairs((const schar *)tab, idx)); } +inline v_uint8x64 v512_lut_quads(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v512_lut_quads((const schar *)tab, idx)); } + +inline v_int16x32 v512_lut(const short* tab, const int* idx) +{ + __m256i p0 = _mm512_cvtepi32_epi16(_mm512_i32gather_epi32(_mm512_loadu_si512((const __m512i*)idx ), (const int *)tab, 2)); + __m256i p1 = _mm512_cvtepi32_epi16(_mm512_i32gather_epi32(_mm512_loadu_si512((const __m512i*)idx + 1), (const int *)tab, 2)); + return v_int16x32(_v512_combine(p0, p1)); +} +inline v_int16x32 v512_lut_pairs(const short* tab, const int* idx) +{ + return v_int16x32(_mm512_i32gather_epi32(_mm512_loadu_si512((const __m512i*)idx), (const int *)tab, 2)); +} +inline v_int16x32 v512_lut_quads(const short* tab, const int* idx) +{ +#if defined(__GNUC__) + return v_int16x32(_mm512_i32gather_epi64(_mm256_loadu_si256((const __m256i*)idx), (const long long int*)tab, 2)); +#else + return v_int16x32(_mm512_i32gather_epi64(_mm256_loadu_si256((const __m256i*)idx), (const int64*)tab, 2)); +#endif +} +inline v_uint16x32 v512_lut(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v512_lut((const short *)tab, idx)); } +inline v_uint16x32 v512_lut_pairs(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v512_lut_pairs((const short *)tab, idx)); } +inline v_uint16x32 v512_lut_quads(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v512_lut_quads((const short *)tab, idx)); } + +inline v_int32x16 v512_lut(const int* tab, const int* idx) +{ + return v_int32x16(_mm512_i32gather_epi32(_mm512_loadu_si512((const __m512i*)idx), tab, 4)); +} +inline v_int32x16 v512_lut_pairs(const int* tab, const int* idx) +{ +#if defined(__GNUC__) + return v_int32x16(_mm512_i32gather_epi64(_mm256_loadu_si256((const __m256i*)idx), (const long long int*)tab, 4)); +#else + return v_int32x16(_mm512_i32gather_epi64(_mm256_loadu_si256((const __m256i*)idx), (const int64*)tab, 4)); +#endif +} 
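// --- Editorial illustration (not part of the OpenCV patch itself) ---------------------------
// The v512_lut* wrappers above hide AVX-512 gather instructions behind OpenCV's universal
// intrinsics LUT API, and the v_reduce_* helpers earlier in this header provide the horizontal
// folds. A minimal usage sketch, assuming a build in which CV_SIMD512 is enabled and this
// header is reached through <opencv2/core/hal/intrin.hpp>; sum_table_entries() is a
// hypothetical helper written for this note only, not an OpenCV function:
//
//     #include <opencv2/core/hal/intrin.hpp>
//
//     #if CV_SIMD512
//     static int sum_table_entries(const int* tab, const int* idx /* at least 16 indices */)
//     {
//         using namespace cv;
//         v_int32x16 gathered = v512_lut(tab, idx);   // one 16-lane gather of tab[idx[i]]
//         return v_reduce_sum(gathered);              // horizontal add defined earlier in this header
//     }
//     #endif
// ---------------------------------------------------------------------------------------------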
+inline v_int32x16 v512_lut_quads(const int* tab, const int* idx) +{ + return v_int32x16(_mm512_inserti32x4(_mm512_inserti32x4(_mm512_inserti32x4(_mm512_castsi128_si512( + _mm_loadu_si128((const __m128i*)(tab + idx[0]))), + _mm_loadu_si128((const __m128i*)(tab + idx[1])), 1), + _mm_loadu_si128((const __m128i*)(tab + idx[2])), 2), + _mm_loadu_si128((const __m128i*)(tab + idx[3])), 3)); +} +inline v_uint32x16 v512_lut(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v512_lut((const int *)tab, idx)); } +inline v_uint32x16 v512_lut_pairs(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v512_lut_pairs((const int *)tab, idx)); } +inline v_uint32x16 v512_lut_quads(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v512_lut_quads((const int *)tab, idx)); } + +inline v_int64x8 v512_lut(const int64* tab, const int* idx) +{ +#if defined(__GNUC__) + return v_int64x8(_mm512_i32gather_epi64(_mm256_loadu_si256((const __m256i*)idx), (const long long int*)tab, 8)); +#else + return v_int64x8(_mm512_i32gather_epi64(_mm256_loadu_si256((const __m256i*)idx), tab , 8)); +#endif +} +inline v_int64x8 v512_lut_pairs(const int64* tab, const int* idx) +{ + return v_int64x8(_mm512_inserti32x4(_mm512_inserti32x4(_mm512_inserti32x4(_mm512_castsi128_si512( + _mm_loadu_si128((const __m128i*)(tab + idx[0]))), + _mm_loadu_si128((const __m128i*)(tab + idx[1])), 1), + _mm_loadu_si128((const __m128i*)(tab + idx[2])), 2), + _mm_loadu_si128((const __m128i*)(tab + idx[3])), 3)); +} +inline v_uint64x8 v512_lut(const uint64* tab, const int* idx) { return v_reinterpret_as_u64(v512_lut((const int64 *)tab, idx)); } +inline v_uint64x8 v512_lut_pairs(const uint64* tab, const int* idx) { return v_reinterpret_as_u64(v512_lut_pairs((const int64 *)tab, idx)); } + +inline v_float32x16 v512_lut(const float* tab, const int* idx) +{ + return v_float32x16(_mm512_i32gather_ps(_mm512_loadu_si512((const __m512i*)idx), tab, 4)); +} +inline v_float32x16 v512_lut_pairs(const float* tab, const int* idx) { return v_reinterpret_as_f32(v512_lut_pairs((const int *)tab, idx)); } +inline v_float32x16 v512_lut_quads(const float* tab, const int* idx) { return v_reinterpret_as_f32(v512_lut_quads((const int *)tab, idx)); } + +inline v_float64x8 v512_lut(const double* tab, const int* idx) +{ + return v_float64x8(_mm512_i32gather_pd(_mm256_loadu_si256((const __m256i*)idx), tab, 8)); +} +inline v_float64x8 v512_lut_pairs(const double* tab, const int* idx) +{ + return v_float64x8(_mm512_insertf64x2(_mm512_insertf64x2(_mm512_insertf64x2(_mm512_castpd128_pd512( + _mm_loadu_pd(tab + idx[0])), + _mm_loadu_pd(tab + idx[1]), 1), + _mm_loadu_pd(tab + idx[2]), 2), + _mm_loadu_pd(tab + idx[3]), 3)); +} + +inline v_int32x16 v_lut(const int* tab, const v_int32x16& idxvec) +{ + return v_int32x16(_mm512_i32gather_epi32(idxvec.val, tab, 4)); +} + +inline v_uint32x16 v_lut(const unsigned* tab, const v_int32x16& idxvec) +{ + return v_reinterpret_as_u32(v_lut((const int *)tab, idxvec)); +} + +inline v_float32x16 v_lut(const float* tab, const v_int32x16& idxvec) +{ + return v_float32x16(_mm512_i32gather_ps(idxvec.val, tab, 4)); +} + +inline v_float64x8 v_lut(const double* tab, const v_int32x16& idxvec) +{ + return v_float64x8(_mm512_i32gather_pd(_v512_extract_low(idxvec.val), tab, 8)); +} + +inline void v_lut_deinterleave(const float* tab, const v_int32x16& idxvec, v_float32x16& x, v_float32x16& y) +{ + x.val = _mm512_i32gather_ps(idxvec.val, tab, 4); + y.val = _mm512_i32gather_ps(idxvec.val, &tab[1], 4); +} + +inline void 
v_lut_deinterleave(const double* tab, const v_int32x16& idxvec, v_float64x8& x, v_float64x8& y) +{ + x.val = _mm512_i32gather_pd(_v512_extract_low(idxvec.val), tab, 8); + y.val = _mm512_i32gather_pd(_v512_extract_low(idxvec.val), &tab[1], 8); +} + +inline v_int8x64 v_interleave_pairs(const v_int8x64& vec) +{ + return v_int8x64(_mm512_shuffle_epi8(vec.val, _mm512_set4_epi32(0x0f0d0e0c, 0x0b090a08, 0x07050604, 0x03010200))); +} +inline v_uint8x64 v_interleave_pairs(const v_uint8x64& vec) { return v_reinterpret_as_u8(v_interleave_pairs(v_reinterpret_as_s8(vec))); } +inline v_int8x64 v_interleave_quads(const v_int8x64& vec) +{ + return v_int8x64(_mm512_shuffle_epi8(vec.val, _mm512_set4_epi32(0x0f0b0e0a, 0x0d090c08, 0x07030602, 0x05010400))); +} +inline v_uint8x64 v_interleave_quads(const v_uint8x64& vec) { return v_reinterpret_as_u8(v_interleave_quads(v_reinterpret_as_s8(vec))); } + +inline v_int16x32 v_interleave_pairs(const v_int16x32& vec) +{ + return v_int16x32(_mm512_shuffle_epi8(vec.val, _mm512_set4_epi32(0x0f0e0b0a, 0x0d0c0908, 0x07060302, 0x05040100))); +} +inline v_uint16x32 v_interleave_pairs(const v_uint16x32& vec) { return v_reinterpret_as_u16(v_interleave_pairs(v_reinterpret_as_s16(vec))); } +inline v_int16x32 v_interleave_quads(const v_int16x32& vec) +{ + return v_int16x32(_mm512_shuffle_epi8(vec.val, _mm512_set4_epi32(0x0f0e0706, 0x0d0c0504, 0x0b0a0302, 0x09080100))); +} +inline v_uint16x32 v_interleave_quads(const v_uint16x32& vec) { return v_reinterpret_as_u16(v_interleave_quads(v_reinterpret_as_s16(vec))); } + +inline v_int32x16 v_interleave_pairs(const v_int32x16& vec) +{ + return v_int32x16(_mm512_shuffle_epi32(vec.val, _MM_PERM_ACBD)); +} +inline v_uint32x16 v_interleave_pairs(const v_uint32x16& vec) { return v_reinterpret_as_u32(v_interleave_pairs(v_reinterpret_as_s32(vec))); } +inline v_float32x16 v_interleave_pairs(const v_float32x16& vec) { return v_reinterpret_as_f32(v_interleave_pairs(v_reinterpret_as_s32(vec))); } + +inline v_int8x64 v_pack_triplets(const v_int8x64& vec) +{ + return v_int8x64(_mm512_permutexvar_epi32(_v512_set_epu64(0x0000000f0000000f, 0x0000000f0000000f, 0x0000000e0000000d, 0x0000000c0000000a, + 0x0000000900000008, 0x0000000600000005, 0x0000000400000002, 0x0000000100000000), + _mm512_shuffle_epi8(vec.val, _mm512_set4_epi32(0xffffff0f, 0x0e0d0c0a, 0x09080605, 0x04020100)))); +} +inline v_uint8x64 v_pack_triplets(const v_uint8x64& vec) { return v_reinterpret_as_u8(v_pack_triplets(v_reinterpret_as_s8(vec))); } + +inline v_int16x32 v_pack_triplets(const v_int16x32& vec) +{ + return v_int16x32(_mm512_permutexvar_epi16(_v512_set_epu64(0x001f001f001f001f, 0x001f001f001f001f, 0x001e001d001c001a, 0x0019001800160015, + 0x0014001200110010, 0x000e000d000c000a, 0x0009000800060005, 0x0004000200010000), vec.val)); +} +inline v_uint16x32 v_pack_triplets(const v_uint16x32& vec) { return v_reinterpret_as_u16(v_pack_triplets(v_reinterpret_as_s16(vec))); } + +inline v_int32x16 v_pack_triplets(const v_int32x16& vec) +{ + return v_int32x16(_mm512_permutexvar_epi32(_v512_set_epu64(0x0000000f0000000f, 0x0000000f0000000f, 0x0000000e0000000d, 0x0000000c0000000a, + 0x0000000900000008, 0x0000000600000005, 0x0000000400000002, 0x0000000100000000), vec.val)); +} +inline v_uint32x16 v_pack_triplets(const v_uint32x16& vec) { return v_reinterpret_as_u32(v_pack_triplets(v_reinterpret_as_s32(vec))); } +inline v_float32x16 v_pack_triplets(const v_float32x16& vec) +{ + return v_float32x16(_mm512_permutexvar_ps(_v512_set_epu64(0x0000000f0000000f, 0x0000000f0000000f, 0x0000000e0000000d, 
0x0000000c0000000a, + 0x0000000900000008, 0x0000000600000005, 0x0000000400000002, 0x0000000100000000), vec.val)); +} + +////////// Matrix operations ///////// + +//////// Dot Product //////// + +// 16 >> 32 +inline v_int32x16 v_dotprod(const v_int16x32& a, const v_int16x32& b) +{ return v_int32x16(_mm512_madd_epi16(a.val, b.val)); } +inline v_int32x16 v_dotprod(const v_int16x32& a, const v_int16x32& b, const v_int32x16& c) +{ return v_dotprod(a, b) + c; } + +// 32 >> 64 +inline v_int64x8 v_dotprod(const v_int32x16& a, const v_int32x16& b) +{ + __m512i even = _mm512_mul_epi32(a.val, b.val); + __m512i odd = _mm512_mul_epi32(_mm512_srli_epi64(a.val, 32), _mm512_srli_epi64(b.val, 32)); + return v_int64x8(_mm512_add_epi64(even, odd)); +} +inline v_int64x8 v_dotprod(const v_int32x16& a, const v_int32x16& b, const v_int64x8& c) +{ return v_dotprod(a, b) + c; } + +// 8 >> 32 +inline v_uint32x16 v_dotprod_expand(const v_uint8x64& a, const v_uint8x64& b) +{ + __m512i even_a = _mm512_mask_blend_epi8(0xAAAAAAAAAAAAAAAA, a.val, _mm512_setzero_si512()); + __m512i odd_a = _mm512_srli_epi16(a.val, 8); + + __m512i even_b = _mm512_mask_blend_epi8(0xAAAAAAAAAAAAAAAA, b.val, _mm512_setzero_si512()); + __m512i odd_b = _mm512_srli_epi16(b.val, 8); + + __m512i prod0 = _mm512_madd_epi16(even_a, even_b); + __m512i prod1 = _mm512_madd_epi16(odd_a, odd_b); + return v_uint32x16(_mm512_add_epi32(prod0, prod1)); +} +inline v_uint32x16 v_dotprod_expand(const v_uint8x64& a, const v_uint8x64& b, const v_uint32x16& c) +{ return v_dotprod_expand(a, b) + c; } + +inline v_int32x16 v_dotprod_expand(const v_int8x64& a, const v_int8x64& b) +{ + __m512i even_a = _mm512_srai_epi16(_mm512_bslli_epi128(a.val, 1), 8); + __m512i odd_a = _mm512_srai_epi16(a.val, 8); + + __m512i even_b = _mm512_srai_epi16(_mm512_bslli_epi128(b.val, 1), 8); + __m512i odd_b = _mm512_srai_epi16(b.val, 8); + + __m512i prod0 = _mm512_madd_epi16(even_a, even_b); + __m512i prod1 = _mm512_madd_epi16(odd_a, odd_b); + return v_int32x16(_mm512_add_epi32(prod0, prod1)); +} +inline v_int32x16 v_dotprod_expand(const v_int8x64& a, const v_int8x64& b, const v_int32x16& c) +{ return v_dotprod_expand(a, b) + c; } + +// 16 >> 64 +inline v_uint64x8 v_dotprod_expand(const v_uint16x32& a, const v_uint16x32& b) +{ + __m512i mullo = _mm512_mullo_epi16(a.val, b.val); + __m512i mulhi = _mm512_mulhi_epu16(a.val, b.val); + __m512i mul0 = _mm512_unpacklo_epi16(mullo, mulhi); + __m512i mul1 = _mm512_unpackhi_epi16(mullo, mulhi); + + __m512i p02 = _mm512_mask_blend_epi32(0xAAAA, mul0, _mm512_setzero_si512()); + __m512i p13 = _mm512_srli_epi64(mul0, 32); + __m512i p46 = _mm512_mask_blend_epi32(0xAAAA, mul1, _mm512_setzero_si512()); + __m512i p57 = _mm512_srli_epi64(mul1, 32); + + __m512i p15_ = _mm512_add_epi64(p02, p13); + __m512i p9d_ = _mm512_add_epi64(p46, p57); + + return v_uint64x8(_mm512_add_epi64( + _mm512_unpacklo_epi64(p15_, p9d_), + _mm512_unpackhi_epi64(p15_, p9d_) + )); +} +inline v_uint64x8 v_dotprod_expand(const v_uint16x32& a, const v_uint16x32& b, const v_uint64x8& c) +{ return v_dotprod_expand(a, b) + c; } + +inline v_int64x8 v_dotprod_expand(const v_int16x32& a, const v_int16x32& b) +{ + __m512i prod = _mm512_madd_epi16(a.val, b.val); + __m512i even = _mm512_srai_epi64(_mm512_bslli_epi128(prod, 4), 32); + __m512i odd = _mm512_srai_epi64(prod, 32); + return v_int64x8(_mm512_add_epi64(even, odd)); +} +inline v_int64x8 v_dotprod_expand(const v_int16x32& a, const v_int16x32& b, const v_int64x8& c) +{ return v_dotprod_expand(a, b) + c; } + +// 32 >> 64f +inline 
v_float64x8 v_dotprod_expand(const v_int32x16& a, const v_int32x16& b) +{ return v_cvt_f64(v_dotprod(a, b)); } +inline v_float64x8 v_dotprod_expand(const v_int32x16& a, const v_int32x16& b, const v_float64x8& c) +{ return v_dotprod_expand(a, b) + c; } + +//////// Fast Dot Product //////// + +// 16 >> 32 +inline v_int32x16 v_dotprod_fast(const v_int16x32& a, const v_int16x32& b) +{ return v_dotprod(a, b); } +inline v_int32x16 v_dotprod_fast(const v_int16x32& a, const v_int16x32& b, const v_int32x16& c) +{ return v_dotprod(a, b, c); } + +// 32 >> 64 +inline v_int64x8 v_dotprod_fast(const v_int32x16& a, const v_int32x16& b) +{ return v_dotprod(a, b); } +inline v_int64x8 v_dotprod_fast(const v_int32x16& a, const v_int32x16& b, const v_int64x8& c) +{ return v_dotprod(a, b, c); } + +// 8 >> 32 +inline v_uint32x16 v_dotprod_expand_fast(const v_uint8x64& a, const v_uint8x64& b) +{ return v_dotprod_expand(a, b); } +inline v_uint32x16 v_dotprod_expand_fast(const v_uint8x64& a, const v_uint8x64& b, const v_uint32x16& c) +{ return v_dotprod_expand(a, b, c); } + +inline v_int32x16 v_dotprod_expand_fast(const v_int8x64& a, const v_int8x64& b) +{ return v_dotprod_expand(a, b); } +inline v_int32x16 v_dotprod_expand_fast(const v_int8x64& a, const v_int8x64& b, const v_int32x16& c) +{ return v_dotprod_expand(a, b, c); } + +// 16 >> 64 +inline v_uint64x8 v_dotprod_expand_fast(const v_uint16x32& a, const v_uint16x32& b) +{ + __m512i mullo = _mm512_mullo_epi16(a.val, b.val); + __m512i mulhi = _mm512_mulhi_epu16(a.val, b.val); + __m512i mul0 = _mm512_unpacklo_epi16(mullo, mulhi); + __m512i mul1 = _mm512_unpackhi_epi16(mullo, mulhi); + + __m512i p02 = _mm512_mask_blend_epi32(0xAAAA, mul0, _mm512_setzero_si512()); + __m512i p13 = _mm512_srli_epi64(mul0, 32); + __m512i p46 = _mm512_mask_blend_epi32(0xAAAA, mul1, _mm512_setzero_si512()); + __m512i p57 = _mm512_srli_epi64(mul1, 32); + + __m512i p15_ = _mm512_add_epi64(p02, p13); + __m512i p9d_ = _mm512_add_epi64(p46, p57); + return v_uint64x8(_mm512_add_epi64(p15_, p9d_)); +} +inline v_uint64x8 v_dotprod_expand_fast(const v_uint16x32& a, const v_uint16x32& b, const v_uint64x8& c) +{ return v_dotprod_expand_fast(a, b) + c; } + +inline v_int64x8 v_dotprod_expand_fast(const v_int16x32& a, const v_int16x32& b) +{ return v_dotprod_expand(a, b); } +inline v_int64x8 v_dotprod_expand_fast(const v_int16x32& a, const v_int16x32& b, const v_int64x8& c) +{ return v_dotprod_expand(a, b, c); } + +// 32 >> 64f +inline v_float64x8 v_dotprod_expand_fast(const v_int32x16& a, const v_int32x16& b) +{ return v_dotprod_expand(a, b); } +inline v_float64x8 v_dotprod_expand_fast(const v_int32x16& a, const v_int32x16& b, const v_float64x8& c) +{ return v_dotprod_expand(a, b) + c; } + + +#define OPENCV_HAL_AVX512_SPLAT2_PS(a, im) \ + v_float32x16(_mm512_permute_ps(a.val, _MM_SHUFFLE(im, im, im, im))) + +inline v_float32x16 v_matmul(const v_float32x16& v, + const v_float32x16& m0, const v_float32x16& m1, + const v_float32x16& m2, const v_float32x16& m3) +{ + v_float32x16 v04 = OPENCV_HAL_AVX512_SPLAT2_PS(v, 0); + v_float32x16 v15 = OPENCV_HAL_AVX512_SPLAT2_PS(v, 1); + v_float32x16 v26 = OPENCV_HAL_AVX512_SPLAT2_PS(v, 2); + v_float32x16 v37 = OPENCV_HAL_AVX512_SPLAT2_PS(v, 3); + return v_fma(v04, m0, v_fma(v15, m1, v_fma(v26, m2, v37 * m3))); +} + +inline v_float32x16 v_matmuladd(const v_float32x16& v, + const v_float32x16& m0, const v_float32x16& m1, + const v_float32x16& m2, const v_float32x16& a) +{ + v_float32x16 v04 = OPENCV_HAL_AVX512_SPLAT2_PS(v, 0); + v_float32x16 v15 = 
OPENCV_HAL_AVX512_SPLAT2_PS(v, 1); + v_float32x16 v26 = OPENCV_HAL_AVX512_SPLAT2_PS(v, 2); + return v_fma(v04, m0, v_fma(v15, m1, v_fma(v26, m2, a))); +} + +#define OPENCV_HAL_IMPL_AVX512_TRANSPOSE4x4(_Tpvec, suffix, cast_from, cast_to) \ + inline void v_transpose4x4(const _Tpvec& a0, const _Tpvec& a1, \ + const _Tpvec& a2, const _Tpvec& a3, \ + _Tpvec& b0, _Tpvec& b1, _Tpvec& b2, _Tpvec& b3) \ + { \ + __m512i t0 = cast_from(_mm512_unpacklo_##suffix(a0.val, a1.val)); \ + __m512i t1 = cast_from(_mm512_unpacklo_##suffix(a2.val, a3.val)); \ + __m512i t2 = cast_from(_mm512_unpackhi_##suffix(a0.val, a1.val)); \ + __m512i t3 = cast_from(_mm512_unpackhi_##suffix(a2.val, a3.val)); \ + b0.val = cast_to(_mm512_unpacklo_epi64(t0, t1)); \ + b1.val = cast_to(_mm512_unpackhi_epi64(t0, t1)); \ + b2.val = cast_to(_mm512_unpacklo_epi64(t2, t3)); \ + b3.val = cast_to(_mm512_unpackhi_epi64(t2, t3)); \ + } + +OPENCV_HAL_IMPL_AVX512_TRANSPOSE4x4(v_uint32x16, epi32, OPENCV_HAL_NOP, OPENCV_HAL_NOP) +OPENCV_HAL_IMPL_AVX512_TRANSPOSE4x4(v_int32x16, epi32, OPENCV_HAL_NOP, OPENCV_HAL_NOP) +OPENCV_HAL_IMPL_AVX512_TRANSPOSE4x4(v_float32x16, ps, _mm512_castps_si512, _mm512_castsi512_ps) + +//////////////// Value reordering /////////////// + +/* Expand */ +#define OPENCV_HAL_IMPL_AVX512_EXPAND(_Tpvec, _Tpwvec, _Tp, intrin) \ + inline void v_expand(const _Tpvec& a, _Tpwvec& b0, _Tpwvec& b1) \ + { \ + b0.val = intrin(_v512_extract_low(a.val)); \ + b1.val = intrin(_v512_extract_high(a.val)); \ + } \ + inline _Tpwvec v_expand_low(const _Tpvec& a) \ + { return _Tpwvec(intrin(_v512_extract_low(a.val))); } \ + inline _Tpwvec v_expand_high(const _Tpvec& a) \ + { return _Tpwvec(intrin(_v512_extract_high(a.val))); } \ + inline _Tpwvec v512_load_expand(const _Tp* ptr) \ + { \ + __m256i a = _mm256_loadu_si256((const __m256i*)ptr); \ + return _Tpwvec(intrin(a)); \ + } + +OPENCV_HAL_IMPL_AVX512_EXPAND(v_uint8x64, v_uint16x32, uchar, _mm512_cvtepu8_epi16) +OPENCV_HAL_IMPL_AVX512_EXPAND(v_int8x64, v_int16x32, schar, _mm512_cvtepi8_epi16) +OPENCV_HAL_IMPL_AVX512_EXPAND(v_uint16x32, v_uint32x16, ushort, _mm512_cvtepu16_epi32) +OPENCV_HAL_IMPL_AVX512_EXPAND(v_int16x32, v_int32x16, short, _mm512_cvtepi16_epi32) +OPENCV_HAL_IMPL_AVX512_EXPAND(v_uint32x16, v_uint64x8, unsigned, _mm512_cvtepu32_epi64) +OPENCV_HAL_IMPL_AVX512_EXPAND(v_int32x16, v_int64x8, int, _mm512_cvtepi32_epi64) + +#define OPENCV_HAL_IMPL_AVX512_EXPAND_Q(_Tpvec, _Tp, intrin) \ + inline _Tpvec v512_load_expand_q(const _Tp* ptr) \ + { \ + __m128i a = _mm_loadu_si128((const __m128i*)ptr); \ + return _Tpvec(intrin(a)); \ + } + +OPENCV_HAL_IMPL_AVX512_EXPAND_Q(v_uint32x16, uchar, _mm512_cvtepu8_epi32) +OPENCV_HAL_IMPL_AVX512_EXPAND_Q(v_int32x16, schar, _mm512_cvtepi8_epi32) + +/* pack */ +// 16 +inline v_int8x64 v_pack(const v_int16x32& a, const v_int16x32& b) +{ return v_int8x64(_mm512_permutexvar_epi64(_v512_set_epu64(7, 5, 3, 1, 6, 4, 2, 0), _mm512_packs_epi16(a.val, b.val))); } + +inline v_uint8x64 v_pack(const v_uint16x32& a, const v_uint16x32& b) +{ + const __m512i t = _mm512_set1_epi16(255); + return v_uint8x64(_v512_combine(_mm512_cvtepi16_epi8(_mm512_min_epu16(a.val, t)), _mm512_cvtepi16_epi8(_mm512_min_epu16(b.val, t)))); +} + +inline v_uint8x64 v_pack_u(const v_int16x32& a, const v_int16x32& b) +{ + return v_uint8x64(_mm512_permutexvar_epi64(_v512_set_epu64(7, 5, 3, 1, 6, 4, 2, 0), _mm512_packus_epi16(a.val, b.val))); +} + +inline void v_pack_store(schar* ptr, const v_int16x32& a) +{ v_store_low(ptr, v_pack(a, a)); } + +inline void v_pack_store(uchar* ptr, const 
v_uint16x32& a)
+{
+    const __m512i m = _mm512_set1_epi16(255);
+    _mm256_storeu_si256((__m256i*)ptr, _mm512_cvtepi16_epi8(_mm512_min_epu16(a.val, m)));
+}
+
+inline void v_pack_u_store(uchar* ptr, const v_int16x32& a)
+{ v_store_low(ptr, v_pack_u(a, a)); }
+
+template<int n> inline
+v_uint8x64 v_rshr_pack(const v_uint16x32& a, const v_uint16x32& b)
+{
+    // we assume that n > 0, and so the shifted 16-bit values can be treated as signed numbers.
+    v_uint16x32 delta = v512_setall_u16((short)(1 << (n-1)));
+    return v_pack_u(v_reinterpret_as_s16((a + delta) >> n),
+                    v_reinterpret_as_s16((b + delta) >> n));
+}
+
+template<int n> inline
+void v_rshr_pack_store(uchar* ptr, const v_uint16x32& a)
+{
+    v_uint16x32 delta = v512_setall_u16((short)(1 << (n-1)));
+    v_pack_u_store(ptr, v_reinterpret_as_s16((a + delta) >> n));
+}
+
+template<int n> inline
+v_uint8x64 v_rshr_pack_u(const v_int16x32& a, const v_int16x32& b)
+{
+    v_int16x32 delta = v512_setall_s16((short)(1 << (n-1)));
+    return v_pack_u((a + delta) >> n, (b + delta) >> n);
+}
+
+template<int n> inline
+void v_rshr_pack_u_store(uchar* ptr, const v_int16x32& a)
+{
+    v_int16x32 delta = v512_setall_s16((short)(1 << (n-1)));
+    v_pack_u_store(ptr, (a + delta) >> n);
+}
+
+template<int n> inline
+v_int8x64 v_rshr_pack(const v_int16x32& a, const v_int16x32& b)
+{
+    v_int16x32 delta = v512_setall_s16((short)(1 << (n-1)));
+    return v_pack((a + delta) >> n, (b + delta) >> n);
+}
+
+template<int n> inline
+void v_rshr_pack_store(schar* ptr, const v_int16x32& a)
+{
+    v_int16x32 delta = v512_setall_s16((short)(1 << (n-1)));
+    v_pack_store(ptr, (a + delta) >> n);
+}
+
+// 32
+inline v_int16x32 v_pack(const v_int32x16& a, const v_int32x16& b)
+{ return v_int16x32(_mm512_permutexvar_epi64(_v512_set_epu64(7, 5, 3, 1, 6, 4, 2, 0), _mm512_packs_epi32(a.val, b.val))); }
+
+inline v_uint16x32 v_pack(const v_uint32x16& a, const v_uint32x16& b)
+{
+    const __m512i m = _mm512_set1_epi32(65535);
+    return v_uint16x32(_v512_combine(_mm512_cvtepi32_epi16(_mm512_min_epu32(a.val, m)), _mm512_cvtepi32_epi16(_mm512_min_epu32(b.val, m))));
+}
+
+inline v_uint16x32 v_pack_u(const v_int32x16& a, const v_int32x16& b)
+{ return v_uint16x32(_mm512_permutexvar_epi64(_v512_set_epu64(7, 5, 3, 1, 6, 4, 2, 0), _mm512_packus_epi32(a.val, b.val))); }
+
+inline void v_pack_store(short* ptr, const v_int32x16& a)
+{ v_store_low(ptr, v_pack(a, a)); }
+
+inline void v_pack_store(ushort* ptr, const v_uint32x16& a)
+{
+    const __m512i m = _mm512_set1_epi32(65535);
+    _mm256_storeu_si256((__m256i*)ptr, _mm512_cvtepi32_epi16(_mm512_min_epu32(a.val, m)));
+}
+
+inline void v_pack_u_store(ushort* ptr, const v_int32x16& a)
+{ v_store_low(ptr, v_pack_u(a, a)); }
+
+
+template<int n> inline
+v_uint16x32 v_rshr_pack(const v_uint32x16& a, const v_uint32x16& b)
+{
+    v_uint32x16 delta = v512_setall_u32(1 << (n-1));
+    return v_pack_u(v_reinterpret_as_s32((a + delta) >> n),
+                    v_reinterpret_as_s32((b + delta) >> n));
+}
+
+template<int n> inline
+void v_rshr_pack_store(ushort* ptr, const v_uint32x16& a)
+{
+    v_uint32x16 delta = v512_setall_u32(1 << (n-1));
+    v_pack_u_store(ptr, v_reinterpret_as_s32((a + delta) >> n));
+}
+
+template<int n> inline
+v_uint16x32 v_rshr_pack_u(const v_int32x16& a, const v_int32x16& b)
+{
+    v_int32x16 delta = v512_setall_s32(1 << (n-1));
+    return v_pack_u((a + delta) >> n, (b + delta) >> n);
+}
+
+template<int n> inline
+void v_rshr_pack_u_store(ushort* ptr, const v_int32x16& a)
+{
+    v_int32x16 delta = v512_setall_s32(1 << (n-1));
+    v_pack_u_store(ptr, (a + delta) >> n);
+}
+
+template<int n> inline
+v_int16x32 v_rshr_pack(const v_int32x16& a, const v_int32x16& b)
+{
+    v_int32x16 delta = v512_setall_s32(1 << (n-1));
+    return v_pack((a + delta) >> n, (b + delta) >> n);
+}
+
+template<int n> inline
+void v_rshr_pack_store(short* ptr, const v_int32x16& a)
+{
+    v_int32x16 delta = v512_setall_s32(1 << (n-1));
+    v_pack_store(ptr, (a + delta) >> n);
+}
+
+// 64
+// Non-saturating pack
+inline v_uint32x16 v_pack(const v_uint64x8& a, const v_uint64x8& b)
+{ return v_uint32x16(_v512_combine(_mm512_cvtepi64_epi32(a.val), _mm512_cvtepi64_epi32(b.val))); }
+
+inline v_int32x16 v_pack(const v_int64x8& a, const v_int64x8& b)
+{ return v_reinterpret_as_s32(v_pack(v_reinterpret_as_u64(a), v_reinterpret_as_u64(b))); }
+
+inline void v_pack_store(unsigned* ptr, const v_uint64x8& a)
+{ _mm256_storeu_si256((__m256i*)ptr, _mm512_cvtepi64_epi32(a.val)); }
+
+inline void v_pack_store(int* ptr, const v_int64x8& b)
+{ v_pack_store((unsigned*)ptr, v_reinterpret_as_u64(b)); }
+
+template<int n> inline
+v_uint32x16 v_rshr_pack(const v_uint64x8& a, const v_uint64x8& b)
+{
+    v_uint64x8 delta = v512_setall_u64((uint64)1 << (n-1));
+    return v_pack((a + delta) >> n, (b + delta) >> n);
+}
+
+template<int n> inline
+void v_rshr_pack_store(unsigned* ptr, const v_uint64x8& a)
+{
+    v_uint64x8 delta = v512_setall_u64((uint64)1 << (n-1));
+    v_pack_store(ptr, (a + delta) >> n);
+}
+
+template<int n> inline
+v_int32x16 v_rshr_pack(const v_int64x8& a, const v_int64x8& b)
+{
+    v_int64x8 delta = v512_setall_s64((int64)1 << (n-1));
+    return v_pack((a + delta) >> n, (b + delta) >> n);
+}
+
+template<int n> inline
+void v_rshr_pack_store(int* ptr, const v_int64x8& a)
+{
+    v_int64x8 delta = v512_setall_s64((int64)1 << (n-1));
+    v_pack_store(ptr, (a + delta) >> n);
+}
+
+// pack boolean
+inline v_uint8x64 v_pack_b(const v_uint16x32& a, const v_uint16x32& b)
+{ return v_uint8x64(_mm512_permutexvar_epi64(_v512_set_epu64(7, 5, 3, 1, 6, 4, 2, 0), _mm512_packs_epi16(a.val, b.val))); }
+
+inline v_uint8x64 v_pack_b(const v_uint32x16& a, const v_uint32x16& b,
+                           const v_uint32x16& c, const v_uint32x16& d)
+{
+    __m512i ab = _mm512_packs_epi32(a.val, b.val);
+    __m512i cd = _mm512_packs_epi32(c.val, d.val);
+
+    return v_uint8x64(_mm512_permutexvar_epi32(_v512_set_epu32(15, 11, 7, 3, 14, 10, 6, 2, 13, 9, 5, 1, 12, 8, 4, 0), _mm512_packs_epi16(ab, cd)));
+}
+
+inline v_uint8x64 v_pack_b(const v_uint64x8& a, const v_uint64x8& b, const v_uint64x8& c,
+                           const v_uint64x8& d, const v_uint64x8& e, const v_uint64x8& f,
+                           const v_uint64x8& g, const v_uint64x8& h)
+{
+    __m512i ab = _mm512_packs_epi32(a.val, b.val);
+    __m512i cd = _mm512_packs_epi32(c.val, d.val);
+    __m512i ef = _mm512_packs_epi32(e.val, f.val);
+    __m512i gh = _mm512_packs_epi32(g.val, h.val);
+
+    __m512i abcd = _mm512_packs_epi32(ab, cd);
+    __m512i efgh = _mm512_packs_epi32(ef, gh);
+
+    return v_uint8x64(_mm512_permutexvar_epi16(_v512_set_epu16(31, 23, 15, 7, 30, 22, 14, 6, 29, 21, 13, 5, 28, 20, 12, 4,
+                                                               27, 19, 11, 3, 26, 18, 10, 2, 25, 17, 9, 1, 24, 16, 8, 0), _mm512_packs_epi16(abcd, efgh)));
+}
+
+/* Recombine */
+// it's up there with load and store operations
+
+/* Extract */
+#define OPENCV_HAL_IMPL_AVX512_EXTRACT(_Tpvec) \
+    template<int s> \
+    inline _Tpvec v_extract(const _Tpvec& a, const _Tpvec& b) \
+    { return v_rotate_right<s>(a, b); }
+
+OPENCV_HAL_IMPL_AVX512_EXTRACT(v_uint8x64)
+OPENCV_HAL_IMPL_AVX512_EXTRACT(v_int8x64)
+OPENCV_HAL_IMPL_AVX512_EXTRACT(v_uint16x32)
+OPENCV_HAL_IMPL_AVX512_EXTRACT(v_int16x32)
+OPENCV_HAL_IMPL_AVX512_EXTRACT(v_uint32x16)
+OPENCV_HAL_IMPL_AVX512_EXTRACT(v_int32x16)
+OPENCV_HAL_IMPL_AVX512_EXTRACT(v_uint64x8)
+OPENCV_HAL_IMPL_AVX512_EXTRACT(v_int64x8)
+OPENCV_HAL_IMPL_AVX512_EXTRACT(v_float32x16)
+OPENCV_HAL_IMPL_AVX512_EXTRACT(v_float64x8)
+
+#define OPENCV_HAL_IMPL_AVX512_EXTRACT_N(_Tpvec, _Tp) \
+template<int i> inline _Tp v_extract_n(_Tpvec v) { return v_rotate_right<i>(v).get0(); }
+
+OPENCV_HAL_IMPL_AVX512_EXTRACT_N(v_uint8x64, uchar)
+OPENCV_HAL_IMPL_AVX512_EXTRACT_N(v_int8x64, schar)
+OPENCV_HAL_IMPL_AVX512_EXTRACT_N(v_uint16x32, ushort)
+OPENCV_HAL_IMPL_AVX512_EXTRACT_N(v_int16x32, short)
+OPENCV_HAL_IMPL_AVX512_EXTRACT_N(v_uint32x16, uint)
+OPENCV_HAL_IMPL_AVX512_EXTRACT_N(v_int32x16, int)
+OPENCV_HAL_IMPL_AVX512_EXTRACT_N(v_uint64x8, uint64)
+OPENCV_HAL_IMPL_AVX512_EXTRACT_N(v_int64x8, int64)
+OPENCV_HAL_IMPL_AVX512_EXTRACT_N(v_float32x16, float)
+OPENCV_HAL_IMPL_AVX512_EXTRACT_N(v_float64x8, double)
+
+template<int i>
+inline v_uint32x16 v_broadcast_element(v_uint32x16 a)
+{
+    static const __m512i perm = _mm512_set1_epi32((char)i);
+    return v_uint32x16(_mm512_permutexvar_epi32(perm, a.val));
+}
+
+template<int i>
+inline v_int32x16 v_broadcast_element(const v_int32x16 &a)
+{ return v_reinterpret_as_s32(v_broadcast_element<i>(v_reinterpret_as_u32(a))); }
+
+template<int i>
+inline v_float32x16 v_broadcast_element(const v_float32x16 &a)
+{ return v_reinterpret_as_f32(v_broadcast_element<i>(v_reinterpret_as_u32(a))); }
+
+
+///////////////////// load deinterleave /////////////////////////////
+
+inline void v_load_deinterleave( const uchar* ptr, v_uint8x64& a, v_uint8x64& b )
+{
+    __m512i ab0 = _mm512_loadu_si512((const __m512i*)ptr);
+    __m512i ab1 = _mm512_loadu_si512((const __m512i*)(ptr + 64));
+#if CV_AVX_512VBMI
+    __m512i mask0 = _v512_set_epu8(126, 124, 122, 120, 118, 116, 114, 112, 110, 108, 106, 104, 102, 100, 98, 96,
+                                   94, 92, 90, 88, 86, 84, 82, 80, 78, 76, 74, 72, 70, 68, 66, 64,
+                                   62, 60, 58, 56, 54, 52, 50, 48, 46, 44, 42, 40, 38, 36, 34, 32,
+                                   30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
+    __m512i mask1 = _v512_set_epu8(127, 125, 123, 121, 119, 117, 115, 113, 111, 109, 107, 105, 103, 101, 99, 97,
+                                   95, 93, 91, 89, 87, 85, 83, 81, 79, 77, 75, 73, 71, 69, 67, 65,
+                                   63, 61, 59, 57, 55, 53, 51, 49, 47, 45, 43, 41, 39, 37, 35, 33,
+                                   31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
+    a = v_uint8x64(_mm512_permutex2var_epi8(ab0, mask0, ab1));
+    b = v_uint8x64(_mm512_permutex2var_epi8(ab0, mask1, ab1));
+#else
+    __m512i mask0 = _mm512_set4_epi32(0x0f0d0b09, 0x07050301, 0x0e0c0a08, 0x06040200);
+    __m512i a0b0 = _mm512_shuffle_epi8(ab0, mask0);
+    __m512i a1b1 = _mm512_shuffle_epi8(ab1, mask0);
+    __m512i mask1 = _v512_set_epu64(14, 12, 10, 8, 6, 4, 2, 0);
+    __m512i mask2 = _v512_set_epu64(15, 13, 11, 9, 7, 5, 3, 1);
+    a = v_uint8x64(_mm512_permutex2var_epi64(a0b0, mask1, a1b1));
+    b = v_uint8x64(_mm512_permutex2var_epi64(a0b0, mask2, a1b1));
+#endif
+}
+
+inline void v_load_deinterleave( const ushort* ptr, v_uint16x32& a, v_uint16x32& b )
+{
+    __m512i ab0 = _mm512_loadu_si512((const __m512i*)ptr);
+    __m512i ab1 = _mm512_loadu_si512((const __m512i*)(ptr + 32));
+    __m512i mask0 = _v512_set_epu16(62, 60, 58, 56, 54, 52, 50, 48, 46, 44, 42, 40, 38, 36, 34, 32,
+                                    30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
+    __m512i mask1 = _v512_set_epu16(63, 61, 59, 57, 55, 53, 51, 49, 47, 45, 43, 41, 39, 37, 35, 33,
+                                    31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1);
+    a = v_uint16x32(_mm512_permutex2var_epi16(ab0, mask0, ab1));
+    b = v_uint16x32(_mm512_permutex2var_epi16(ab0, mask1, ab1));
+}
+
+inline void v_load_deinterleave( const unsigned* ptr, v_uint32x16& a, v_uint32x16& b )
+{
+
__m512i ab0 = _mm512_loadu_si512((const __m512i*)ptr); + __m512i ab1 = _mm512_loadu_si512((const __m512i*)(ptr + 16)); + __m512i mask0 = _v512_set_epu32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0); + __m512i mask1 = _v512_set_epu32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1); + a = v_uint32x16(_mm512_permutex2var_epi32(ab0, mask0, ab1)); + b = v_uint32x16(_mm512_permutex2var_epi32(ab0, mask1, ab1)); +} + +inline void v_load_deinterleave( const uint64* ptr, v_uint64x8& a, v_uint64x8& b ) +{ + __m512i ab0 = _mm512_loadu_si512((const __m512i*)ptr); + __m512i ab1 = _mm512_loadu_si512((const __m512i*)(ptr + 8)); + __m512i mask0 = _v512_set_epu64(14, 12, 10, 8, 6, 4, 2, 0); + __m512i mask1 = _v512_set_epu64(15, 13, 11, 9, 7, 5, 3, 1); + a = v_uint64x8(_mm512_permutex2var_epi64(ab0, mask0, ab1)); + b = v_uint64x8(_mm512_permutex2var_epi64(ab0, mask1, ab1)); +} + +inline void v_load_deinterleave( const uchar* ptr, v_uint8x64& a, v_uint8x64& b, v_uint8x64& c ) +{ + __m512i bgr0 = _mm512_loadu_si512((const __m512i*)ptr); + __m512i bgr1 = _mm512_loadu_si512((const __m512i*)(ptr + 64)); + __m512i bgr2 = _mm512_loadu_si512((const __m512i*)(ptr + 128)); + +#if CV_AVX_512VBMI2 + __m512i mask0 = _v512_set_epu8(126, 123, 120, 117, 114, 111, 108, 105, 102, 99, 96, 93, 90, 87, 84, 81, + 78, 75, 72, 69, 66, 63, 60, 57, 54, 51, 48, 45, 42, 39, 36, 33, + 30, 27, 24, 21, 18, 15, 12, 9, 6, 3, 0, 62, 59, 56, 53, 50, + 47, 44, 41, 38, 35, 32, 29, 26, 23, 20, 17, 14, 11, 8, 5, 2); + __m512i r0b01 = _mm512_permutex2var_epi8(bgr0, mask0, bgr1); + __m512i b1g12 = _mm512_permutex2var_epi8(bgr1, mask0, bgr2); + __m512i r12b2 = _mm512_permutex2var_epi8(bgr1, + _v512_set_epu8(125, 122, 119, 116, 113, 110, 107, 104, 101, 98, 95, 92, 89, 86, 83, 80, + 77, 74, 71, 68, 65, 127, 124, 121, 118, 115, 112, 109, 106, 103, 100, 97, + 94, 91, 88, 85, 82, 79, 76, 73, 70, 67, 64, 61, 58, 55, 52, 49, + 46, 43, 40, 37, 34, 31, 28, 25, 22, 19, 16, 13, 10, 7, 4, 1), bgr2); + a = v_uint8x64(_mm512_mask_compress_epi8(r12b2, 0xffffffffffe00000, r0b01)); + b = v_uint8x64(_mm512_mask_compress_epi8(b1g12, 0x2492492492492492, bgr0)); + c = v_uint8x64(_mm512_mask_expand_epi8(r0b01, 0xffffffffffe00000, r12b2)); +#elif CV_AVX_512VBMI + __m512i b0g0b1 = _mm512_mask_blend_epi8(0xb6db6db6db6db6db, bgr1, bgr0); + __m512i g1r1g2 = _mm512_mask_blend_epi8(0xb6db6db6db6db6db, bgr2, bgr1); + __m512i r2b2r0 = _mm512_mask_blend_epi8(0xb6db6db6db6db6db, bgr0, bgr2); + a = v_uint8x64(_mm512_permutex2var_epi8(b0g0b1, _v512_set_epu8(125, 122, 119, 116, 113, 110, 107, 104, 101, 98, 95, 92, 89, 86, 83, 80, + 77, 74, 71, 68, 65, 63, 61, 60, 58, 57, 55, 54, 52, 51, 49, 48, + 46, 45, 43, 42, 40, 39, 37, 36, 34, 33, 31, 30, 28, 27, 25, 24, + 23, 21, 20, 18, 17, 15, 14, 12, 11, 9, 8, 6, 5, 3, 2, 0), bgr2)); + b = v_uint8x64(_mm512_permutex2var_epi8(g1r1g2, _v512_set_epu8( 63, 61, 60, 58, 57, 55, 54, 52, 51, 49, 48, 46, 45, 43, 42, 40, + 39, 37, 36, 34, 33, 31, 30, 28, 27, 25, 24, 23, 21, 20, 18, 17, + 15, 14, 12, 11, 9, 8, 6, 5, 3, 2, 0, 126, 123, 120, 117, 114, + 111, 108, 105, 102, 99, 96, 93, 90, 87, 84, 81, 78, 75, 72, 69, 66), bgr0)); + c = v_uint8x64(_mm512_permutex2var_epi8(r2b2r0, _v512_set_epu8( 63, 60, 57, 54, 51, 48, 45, 42, 39, 36, 33, 30, 27, 24, 21, 18, + 15, 12, 9, 6, 3, 0, 125, 122, 119, 116, 113, 110, 107, 104, 101, 98, + 95, 92, 89, 86, 83, 80, 77, 74, 71, 68, 65, 62, 59, 56, 53, 50, + 47, 44, 41, 38, 35, 32, 29, 26, 23, 20, 17, 14, 11, 8, 5, 2), bgr1)); +#else + __m512i mask0 = _v512_set_epu16(61, 58, 55, 52, 49, 46, 43, 
40, 37, 34, 63, 60, 57, 54, 51, 48, + 45, 42, 39, 36, 33, 30, 27, 24, 21, 18, 15, 12, 9, 6, 3, 0); + __m512i b01g1 = _mm512_permutex2var_epi16(bgr0, mask0, bgr1); + __m512i r12b2 = _mm512_permutex2var_epi16(bgr1, mask0, bgr2); + __m512i g20r0 = _mm512_permutex2var_epi16(bgr2, mask0, bgr0); + + __m512i b0g0 = _mm512_mask_blend_epi32(0xf800, b01g1, r12b2); + __m512i r0b1 = _mm512_permutex2var_epi16(bgr1, _v512_set_epu16(42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 29, 26, 23, 20, 17, + 14, 11, 8, 5, 2, 53, 52, 51, 50, 49, 48, 47, 46, 45, 44, 43), g20r0); + __m512i g1r1 = _mm512_alignr_epi32(r12b2, g20r0, 11); + a = v_uint8x64(_mm512_mask_blend_epi8(0xAAAAAAAAAAAAAAAA, b0g0, r0b1)); + c = v_uint8x64(_mm512_mask_blend_epi8(0xAAAAAAAAAAAAAAAA, r0b1, g1r1)); + b = v_uint8x64(_mm512_shuffle_epi8(_mm512_mask_blend_epi8(0xAAAAAAAAAAAAAAAA, g1r1, b0g0), _mm512_set4_epi32(0x0e0f0c0d, 0x0a0b0809, 0x06070405, 0x02030001))); +#endif +} + +inline void v_load_deinterleave( const ushort* ptr, v_uint16x32& a, v_uint16x32& b, v_uint16x32& c ) +{ + __m512i bgr0 = _mm512_loadu_si512((const __m512i*)ptr); + __m512i bgr1 = _mm512_loadu_si512((const __m512i*)(ptr + 32)); + __m512i bgr2 = _mm512_loadu_si512((const __m512i*)(ptr + 64)); + + __m512i mask0 = _v512_set_epu16(61, 58, 55, 52, 49, 46, 43, 40, 37, 34, 63, 60, 57, 54, 51, 48, + 45, 42, 39, 36, 33, 30, 27, 24, 21, 18, 15, 12, 9, 6, 3, 0); + __m512i b01g1 = _mm512_permutex2var_epi16(bgr0, mask0, bgr1); + __m512i r12b2 = _mm512_permutex2var_epi16(bgr1, mask0, bgr2); + __m512i g20r0 = _mm512_permutex2var_epi16(bgr2, mask0, bgr0); + + a = v_uint16x32(_mm512_mask_blend_epi32(0xf800, b01g1, r12b2)); + b = v_uint16x32(_mm512_permutex2var_epi16(bgr1, _v512_set_epu16(42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 29, 26, 23, 20, 17, + 14, 11, 8, 5, 2, 53, 52, 51, 50, 49, 48, 47, 46, 45, 44, 43), g20r0)); + c = v_uint16x32(_mm512_alignr_epi32(r12b2, g20r0, 11)); +} + +inline void v_load_deinterleave( const unsigned* ptr, v_uint32x16& a, v_uint32x16& b, v_uint32x16& c ) +{ + __m512i bgr0 = _mm512_loadu_si512((const __m512i*)ptr); + __m512i bgr1 = _mm512_loadu_si512((const __m512i*)(ptr + 16)); + __m512i bgr2 = _mm512_loadu_si512((const __m512i*)(ptr + 32)); + + __m512i mask0 = _v512_set_epu32(29, 26, 23, 20, 17, 30, 27, 24, 21, 18, 15, 12, 9, 6, 3, 0); + __m512i b01r1 = _mm512_permutex2var_epi32(bgr0, mask0, bgr1); + __m512i g12b2 = _mm512_permutex2var_epi32(bgr1, mask0, bgr2); + __m512i r20g0 = _mm512_permutex2var_epi32(bgr2, mask0, bgr0); + + a = v_uint32x16(_mm512_mask_blend_epi32(0xf800, b01r1, g12b2)); + b = v_uint32x16(_mm512_alignr_epi32(g12b2, r20g0, 11)); + c = v_uint32x16(_mm512_permutex2var_epi32(bgr1, _v512_set_epu32(21, 20, 19, 18, 17, 16, 13, 10, 7, 4, 1, 26, 25, 24, 23, 22), r20g0)); +} + +inline void v_load_deinterleave( const uint64* ptr, v_uint64x8& a, v_uint64x8& b, v_uint64x8& c ) +{ + __m512i bgr0 = _mm512_loadu_si512((const __m512i*)ptr); + __m512i bgr1 = _mm512_loadu_si512((const __m512i*)(ptr + 8)); + __m512i bgr2 = _mm512_loadu_si512((const __m512i*)(ptr + 16)); + + __m512i mask0 = _v512_set_epu64(13, 10, 15, 12, 9, 6, 3, 0); + __m512i b01g1 = _mm512_permutex2var_epi64(bgr0, mask0, bgr1); + __m512i r12b2 = _mm512_permutex2var_epi64(bgr1, mask0, bgr2); + __m512i g20r0 = _mm512_permutex2var_epi64(bgr2, mask0, bgr0); + + a = v_uint64x8(_mm512_mask_blend_epi64(0xc0, b01g1, r12b2)); + c = v_uint64x8(_mm512_alignr_epi64(r12b2, g20r0, 6)); + b = v_uint64x8(_mm512_permutex2var_epi64(bgr1, _v512_set_epu64(10, 9, 8, 5, 2, 13, 12, 11), g20r0)); +} + +inline 
void v_load_deinterleave( const uchar* ptr, v_uint8x64& a, v_uint8x64& b, v_uint8x64& c, v_uint8x64& d ) +{ + __m512i bgra0 = _mm512_loadu_si512((const __m512i*)ptr); + __m512i bgra1 = _mm512_loadu_si512((const __m512i*)(ptr + 64)); + __m512i bgra2 = _mm512_loadu_si512((const __m512i*)(ptr + 128)); + __m512i bgra3 = _mm512_loadu_si512((const __m512i*)(ptr + 192)); + +#if CV_AVX_512VBMI + __m512i mask0 = _v512_set_epu8(126, 124, 122, 120, 118, 116, 114, 112, 110, 108, 106, 104, 102, 100, 98, 96, + 94, 92, 90, 88, 86, 84, 82, 80, 78, 76, 74, 72, 70, 68, 66, 64, + 62, 60, 58, 56, 54, 52, 50, 48, 46, 44, 42, 40, 38, 36, 34, 32, + 30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0); + __m512i mask1 = _v512_set_epu8(127, 125, 123, 121, 119, 117, 115, 113, 111, 109, 107, 105, 103, 101, 99, 97, + 95, 93, 91, 89, 87, 85, 83, 81, 79, 77, 75, 73, 71, 69, 67, 65, + 63, 61, 59, 57, 55, 53, 51, 49, 47, 45, 43, 41, 39, 37, 35, 33, + 31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1); + + __m512i br01 = _mm512_permutex2var_epi8(bgra0, mask0, bgra1); + __m512i ga01 = _mm512_permutex2var_epi8(bgra0, mask1, bgra1); + __m512i br23 = _mm512_permutex2var_epi8(bgra2, mask0, bgra3); + __m512i ga23 = _mm512_permutex2var_epi8(bgra2, mask1, bgra3); + + a = v_uint8x64(_mm512_permutex2var_epi8(br01, mask0, br23)); + c = v_uint8x64(_mm512_permutex2var_epi8(br01, mask1, br23)); + b = v_uint8x64(_mm512_permutex2var_epi8(ga01, mask0, ga23)); + d = v_uint8x64(_mm512_permutex2var_epi8(ga01, mask1, ga23)); +#else + __m512i mask = _mm512_set4_epi32(0x0f0b0703, 0x0e0a0602, 0x0d090501, 0x0c080400); + __m512i b0g0r0a0 = _mm512_shuffle_epi8(bgra0, mask); + __m512i b1g1r1a1 = _mm512_shuffle_epi8(bgra1, mask); + __m512i b2g2r2a2 = _mm512_shuffle_epi8(bgra2, mask); + __m512i b3g3r3a3 = _mm512_shuffle_epi8(bgra3, mask); + + __m512i mask0 = _v512_set_epu32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0); + __m512i mask1 = _v512_set_epu32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1); + + __m512i br01 = _mm512_permutex2var_epi32(b0g0r0a0, mask0, b1g1r1a1); + __m512i ga01 = _mm512_permutex2var_epi32(b0g0r0a0, mask1, b1g1r1a1); + __m512i br23 = _mm512_permutex2var_epi32(b2g2r2a2, mask0, b3g3r3a3); + __m512i ga23 = _mm512_permutex2var_epi32(b2g2r2a2, mask1, b3g3r3a3); + + a = v_uint8x64(_mm512_permutex2var_epi32(br01, mask0, br23)); + c = v_uint8x64(_mm512_permutex2var_epi32(br01, mask1, br23)); + b = v_uint8x64(_mm512_permutex2var_epi32(ga01, mask0, ga23)); + d = v_uint8x64(_mm512_permutex2var_epi32(ga01, mask1, ga23)); +#endif +} + +inline void v_load_deinterleave( const ushort* ptr, v_uint16x32& a, v_uint16x32& b, v_uint16x32& c, v_uint16x32& d ) +{ + __m512i bgra0 = _mm512_loadu_si512((const __m512i*)ptr); + __m512i bgra1 = _mm512_loadu_si512((const __m512i*)(ptr + 32)); + __m512i bgra2 = _mm512_loadu_si512((const __m512i*)(ptr + 64)); + __m512i bgra3 = _mm512_loadu_si512((const __m512i*)(ptr + 96)); + + __m512i mask0 = _v512_set_epu16(62, 60, 58, 56, 54, 52, 50, 48, 46, 44, 42, 40, 38, 36, 34, 32, + 30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0); + __m512i mask1 = _v512_set_epu16(63, 61, 59, 57, 55, 53, 51, 49, 47, 45, 43, 41, 39, 37, 35, 33, + 31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1); + + __m512i br01 = _mm512_permutex2var_epi16(bgra0, mask0, bgra1); + __m512i ga01 = _mm512_permutex2var_epi16(bgra0, mask1, bgra1); + __m512i br23 = _mm512_permutex2var_epi16(bgra2, mask0, bgra3); + __m512i ga23 = _mm512_permutex2var_epi16(bgra2, mask1, bgra3); + + a = 
v_uint16x32(_mm512_permutex2var_epi16(br01, mask0, br23)); + c = v_uint16x32(_mm512_permutex2var_epi16(br01, mask1, br23)); + b = v_uint16x32(_mm512_permutex2var_epi16(ga01, mask0, ga23)); + d = v_uint16x32(_mm512_permutex2var_epi16(ga01, mask1, ga23)); +} + +inline void v_load_deinterleave( const unsigned* ptr, v_uint32x16& a, v_uint32x16& b, v_uint32x16& c, v_uint32x16& d ) +{ + __m512i bgra0 = _mm512_loadu_si512((const __m512i*)ptr); + __m512i bgra1 = _mm512_loadu_si512((const __m512i*)(ptr + 16)); + __m512i bgra2 = _mm512_loadu_si512((const __m512i*)(ptr + 32)); + __m512i bgra3 = _mm512_loadu_si512((const __m512i*)(ptr + 48)); + + __m512i mask0 = _v512_set_epu32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0); + __m512i mask1 = _v512_set_epu32(31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1); + + __m512i br01 = _mm512_permutex2var_epi32(bgra0, mask0, bgra1); + __m512i ga01 = _mm512_permutex2var_epi32(bgra0, mask1, bgra1); + __m512i br23 = _mm512_permutex2var_epi32(bgra2, mask0, bgra3); + __m512i ga23 = _mm512_permutex2var_epi32(bgra2, mask1, bgra3); + + a = v_uint32x16(_mm512_permutex2var_epi32(br01, mask0, br23)); + c = v_uint32x16(_mm512_permutex2var_epi32(br01, mask1, br23)); + b = v_uint32x16(_mm512_permutex2var_epi32(ga01, mask0, ga23)); + d = v_uint32x16(_mm512_permutex2var_epi32(ga01, mask1, ga23)); +} + +inline void v_load_deinterleave( const uint64* ptr, v_uint64x8& a, v_uint64x8& b, v_uint64x8& c, v_uint64x8& d ) +{ + __m512i bgra0 = _mm512_loadu_si512((const __m512i*)ptr); + __m512i bgra1 = _mm512_loadu_si512((const __m512i*)(ptr + 8)); + __m512i bgra2 = _mm512_loadu_si512((const __m512i*)(ptr + 16)); + __m512i bgra3 = _mm512_loadu_si512((const __m512i*)(ptr + 24)); + + __m512i mask0 = _v512_set_epu64(14, 12, 10, 8, 6, 4, 2, 0); + __m512i mask1 = _v512_set_epu64(15, 13, 11, 9, 7, 5, 3, 1); + + __m512i br01 = _mm512_permutex2var_epi64(bgra0, mask0, bgra1); + __m512i ga01 = _mm512_permutex2var_epi64(bgra0, mask1, bgra1); + __m512i br23 = _mm512_permutex2var_epi64(bgra2, mask0, bgra3); + __m512i ga23 = _mm512_permutex2var_epi64(bgra2, mask1, bgra3); + + a = v_uint64x8(_mm512_permutex2var_epi64(br01, mask0, br23)); + c = v_uint64x8(_mm512_permutex2var_epi64(br01, mask1, br23)); + b = v_uint64x8(_mm512_permutex2var_epi64(ga01, mask0, ga23)); + d = v_uint64x8(_mm512_permutex2var_epi64(ga01, mask1, ga23)); +} + +///////////////////////////// store interleave ///////////////////////////////////// + +inline void v_store_interleave( uchar* ptr, const v_uint8x64& x, const v_uint8x64& y, + hal::StoreMode mode=hal::STORE_UNALIGNED ) +{ + v_uint8x64 low, high; + v_zip(x, y, low, high); + if( mode == hal::STORE_ALIGNED_NOCACHE ) + { + _mm512_stream_si512((__m512i*)ptr, low.val); + _mm512_stream_si512((__m512i*)(ptr + 64), high.val); + } + else if( mode == hal::STORE_ALIGNED ) + { + _mm512_store_si512((__m512i*)ptr, low.val); + _mm512_store_si512((__m512i*)(ptr + 64), high.val); + } + else + { + _mm512_storeu_si512((__m512i*)ptr, low.val); + _mm512_storeu_si512((__m512i*)(ptr + 64), high.val); + } +} + +inline void v_store_interleave( ushort* ptr, const v_uint16x32& x, const v_uint16x32& y, + hal::StoreMode mode=hal::STORE_UNALIGNED ) +{ + v_uint16x32 low, high; + v_zip(x, y, low, high); + if( mode == hal::STORE_ALIGNED_NOCACHE ) + { + _mm512_stream_si512((__m512i*)ptr, low.val); + _mm512_stream_si512((__m512i*)(ptr + 32), high.val); + } + else if( mode == hal::STORE_ALIGNED ) + { + _mm512_store_si512((__m512i*)ptr, low.val); + _mm512_store_si512((__m512i*)(ptr + 32), 
high.val); + } + else + { + _mm512_storeu_si512((__m512i*)ptr, low.val); + _mm512_storeu_si512((__m512i*)(ptr + 32), high.val); + } +} + +inline void v_store_interleave( unsigned* ptr, const v_uint32x16& x, const v_uint32x16& y, + hal::StoreMode mode=hal::STORE_UNALIGNED ) +{ + v_uint32x16 low, high; + v_zip(x, y, low, high); + if( mode == hal::STORE_ALIGNED_NOCACHE ) + { + _mm512_stream_si512((__m512i*)ptr, low.val); + _mm512_stream_si512((__m512i*)(ptr + 16), high.val); + } + else if( mode == hal::STORE_ALIGNED ) + { + _mm512_store_si512((__m512i*)ptr, low.val); + _mm512_store_si512((__m512i*)(ptr + 16), high.val); + } + else + { + _mm512_storeu_si512((__m512i*)ptr, low.val); + _mm512_storeu_si512((__m512i*)(ptr + 16), high.val); + } +} + +inline void v_store_interleave( uint64* ptr, const v_uint64x8& x, const v_uint64x8& y, + hal::StoreMode mode=hal::STORE_UNALIGNED ) +{ + v_uint64x8 low, high; + v_zip(x, y, low, high); + if( mode == hal::STORE_ALIGNED_NOCACHE ) + { + _mm512_stream_si512((__m512i*)ptr, low.val); + _mm512_stream_si512((__m512i*)(ptr + 8), high.val); + } + else if( mode == hal::STORE_ALIGNED ) + { + _mm512_store_si512((__m512i*)ptr, low.val); + _mm512_store_si512((__m512i*)(ptr + 8), high.val); + } + else + { + _mm512_storeu_si512((__m512i*)ptr, low.val); + _mm512_storeu_si512((__m512i*)(ptr + 8), high.val); + } +} + +inline void v_store_interleave( uchar* ptr, const v_uint8x64& a, const v_uint8x64& b, const v_uint8x64& c, + hal::StoreMode mode=hal::STORE_UNALIGNED ) +{ +#if CV_AVX_512VBMI + __m512i mask0 = _v512_set_epu8(127, 84, 20, 126, 83, 19, 125, 82, 18, 124, 81, 17, 123, 80, 16, 122, + 79, 15, 121, 78, 14, 120, 77, 13, 119, 76, 12, 118, 75, 11, 117, 74, + 10, 116, 73, 9, 115, 72, 8, 114, 71, 7, 113, 70, 6, 112, 69, 5, + 111, 68, 4, 110, 67, 3, 109, 66, 2, 108, 65, 1, 107, 64, 0, 106); + __m512i mask1 = _v512_set_epu8( 21, 42, 105, 20, 41, 104, 19, 40, 103, 18, 39, 102, 17, 38, 101, 16, + 37, 100, 15, 36, 99, 14, 35, 98, 13, 34, 97, 12, 33, 96, 11, 32, + 95, 10, 31, 94, 9, 30, 93, 8, 29, 92, 7, 28, 91, 6, 27, 90, + 5, 26, 89, 4, 25, 88, 3, 24, 87, 2, 23, 86, 1, 22, 85, 0); + __m512i mask2 = _v512_set_epu8(106, 127, 63, 105, 126, 62, 104, 125, 61, 103, 124, 60, 102, 123, 59, 101, + 122, 58, 100, 121, 57, 99, 120, 56, 98, 119, 55, 97, 118, 54, 96, 117, + 53, 95, 116, 52, 94, 115, 51, 93, 114, 50, 92, 113, 49, 91, 112, 48, + 90, 111, 47, 89, 110, 46, 88, 109, 45, 87, 108, 44, 86, 107, 43, 85); + __m512i r2g0r0 = _mm512_permutex2var_epi8(b.val, mask0, c.val); + __m512i b0r1b1 = _mm512_permutex2var_epi8(a.val, mask1, c.val); + __m512i g1b2g2 = _mm512_permutex2var_epi8(a.val, mask2, b.val); + + __m512i bgr0 = _mm512_mask_blend_epi8(0x9249249249249249, r2g0r0, b0r1b1); + __m512i bgr1 = _mm512_mask_blend_epi8(0x9249249249249249, b0r1b1, g1b2g2); + __m512i bgr2 = _mm512_mask_blend_epi8(0x9249249249249249, g1b2g2, r2g0r0); +#else + __m512i g1g0 = _mm512_shuffle_epi8(b.val, _mm512_set4_epi32(0x0e0f0c0d, 0x0a0b0809, 0x06070405, 0x02030001)); + __m512i b0g0 = _mm512_mask_blend_epi8(0xAAAAAAAAAAAAAAAA, a.val, g1g0); + __m512i r0b1 = _mm512_mask_blend_epi8(0xAAAAAAAAAAAAAAAA, c.val, a.val); + __m512i g1r1 = _mm512_mask_blend_epi8(0xAAAAAAAAAAAAAAAA, g1g0, c.val); + + __m512i mask0 = _v512_set_epu16(42, 10, 31, 41, 9, 30, 40, 8, 29, 39, 7, 28, 38, 6, 27, 37, + 5, 26, 36, 4, 25, 35, 3, 24, 34, 2, 23, 33, 1, 22, 32, 0); + __m512i mask1 = _v512_set_epu16(21, 52, 41, 20, 51, 40, 19, 50, 39, 18, 49, 38, 17, 48, 37, 16, + 47, 36, 15, 46, 35, 14, 45, 34, 13, 44, 33, 12, 43, 32, 11, 42); + 
__m512i mask2 = _v512_set_epu16(63, 31, 20, 62, 30, 19, 61, 29, 18, 60, 28, 17, 59, 27, 16, 58, + 26, 15, 57, 25, 14, 56, 24, 13, 55, 23, 12, 54, 22, 11, 53, 21); + __m512i b0g0b2 = _mm512_permutex2var_epi16(b0g0, mask0, r0b1); + __m512i r1b1r0 = _mm512_permutex2var_epi16(b0g0, mask1, g1r1); + __m512i g2r2g1 = _mm512_permutex2var_epi16(r0b1, mask2, g1r1); + + __m512i bgr0 = _mm512_mask_blend_epi16(0x24924924, b0g0b2, r1b1r0); + __m512i bgr1 = _mm512_mask_blend_epi16(0x24924924, r1b1r0, g2r2g1); + __m512i bgr2 = _mm512_mask_blend_epi16(0x24924924, g2r2g1, b0g0b2); +#endif + + if( mode == hal::STORE_ALIGNED_NOCACHE ) + { + _mm512_stream_si512((__m512i*)ptr, bgr0); + _mm512_stream_si512((__m512i*)(ptr + 64), bgr1); + _mm512_stream_si512((__m512i*)(ptr + 128), bgr2); + } + else if( mode == hal::STORE_ALIGNED ) + { + _mm512_store_si512((__m512i*)ptr, bgr0); + _mm512_store_si512((__m512i*)(ptr + 64), bgr1); + _mm512_store_si512((__m512i*)(ptr + 128), bgr2); + } + else + { + _mm512_storeu_si512((__m512i*)ptr, bgr0); + _mm512_storeu_si512((__m512i*)(ptr + 64), bgr1); + _mm512_storeu_si512((__m512i*)(ptr + 128), bgr2); + } +} + +inline void v_store_interleave( ushort* ptr, const v_uint16x32& a, const v_uint16x32& b, const v_uint16x32& c, + hal::StoreMode mode=hal::STORE_UNALIGNED ) +{ + __m512i mask0 = _v512_set_epu16(42, 10, 31, 41, 9, 30, 40, 8, 29, 39, 7, 28, 38, 6, 27, 37, + 5, 26, 36, 4, 25, 35, 3, 24, 34, 2, 23, 33, 1, 22, 32, 0); + __m512i mask1 = _v512_set_epu16(21, 52, 41, 20, 51, 40, 19, 50, 39, 18, 49, 38, 17, 48, 37, 16, + 47, 36, 15, 46, 35, 14, 45, 34, 13, 44, 33, 12, 43, 32, 11, 42); + __m512i mask2 = _v512_set_epu16(63, 31, 20, 62, 30, 19, 61, 29, 18, 60, 28, 17, 59, 27, 16, 58, + 26, 15, 57, 25, 14, 56, 24, 13, 55, 23, 12, 54, 22, 11, 53, 21); + __m512i b0g0b2 = _mm512_permutex2var_epi16(a.val, mask0, b.val); + __m512i r1b1r0 = _mm512_permutex2var_epi16(a.val, mask1, c.val); + __m512i g2r2g1 = _mm512_permutex2var_epi16(b.val, mask2, c.val); + + __m512i bgr0 = _mm512_mask_blend_epi16(0x24924924, b0g0b2, r1b1r0); + __m512i bgr1 = _mm512_mask_blend_epi16(0x24924924, r1b1r0, g2r2g1); + __m512i bgr2 = _mm512_mask_blend_epi16(0x24924924, g2r2g1, b0g0b2); + + if( mode == hal::STORE_ALIGNED_NOCACHE ) + { + _mm512_stream_si512((__m512i*)ptr, bgr0); + _mm512_stream_si512((__m512i*)(ptr + 32), bgr1); + _mm512_stream_si512((__m512i*)(ptr + 64), bgr2); + } + else if( mode == hal::STORE_ALIGNED ) + { + _mm512_store_si512((__m512i*)ptr, bgr0); + _mm512_store_si512((__m512i*)(ptr + 32), bgr1); + _mm512_store_si512((__m512i*)(ptr + 64), bgr2); + } + else + { + _mm512_storeu_si512((__m512i*)ptr, bgr0); + _mm512_storeu_si512((__m512i*)(ptr + 32), bgr1); + _mm512_storeu_si512((__m512i*)(ptr + 64), bgr2); + } +} + +inline void v_store_interleave( unsigned* ptr, const v_uint32x16& a, const v_uint32x16& b, const v_uint32x16& c, + hal::StoreMode mode=hal::STORE_UNALIGNED ) +{ + __m512i mask0 = _v512_set_epu32(26, 31, 15, 25, 30, 14, 24, 29, 13, 23, 28, 12, 22, 27, 11, 21); + __m512i mask1 = _v512_set_epu32(31, 10, 25, 30, 9, 24, 29, 8, 23, 28, 7, 22, 27, 6, 21, 26); + __m512i g1b2g2 = _mm512_permutex2var_epi32(a.val, mask0, b.val); + __m512i r2r1b1 = _mm512_permutex2var_epi32(a.val, mask1, c.val); + + __m512i bgr0 = _mm512_mask_expand_epi32(_mm512_mask_expand_epi32(_mm512_maskz_expand_epi32(0x9249, a.val), 0x2492, b.val), 0x4924, c.val); + __m512i bgr1 = _mm512_mask_blend_epi32(0x9249, r2r1b1, g1b2g2); + __m512i bgr2 = _mm512_mask_blend_epi32(0x9249, g1b2g2, r2r1b1); + + if( mode == 
hal::STORE_ALIGNED_NOCACHE ) + { + _mm512_stream_si512((__m512i*)ptr, bgr0); + _mm512_stream_si512((__m512i*)(ptr + 16), bgr1); + _mm512_stream_si512((__m512i*)(ptr + 32), bgr2); + } + else if( mode == hal::STORE_ALIGNED ) + { + _mm512_store_si512((__m512i*)ptr, bgr0); + _mm512_store_si512((__m512i*)(ptr + 16), bgr1); + _mm512_store_si512((__m512i*)(ptr + 32), bgr2); + } + else + { + _mm512_storeu_si512((__m512i*)ptr, bgr0); + _mm512_storeu_si512((__m512i*)(ptr + 16), bgr1); + _mm512_storeu_si512((__m512i*)(ptr + 32), bgr2); + } +} + +inline void v_store_interleave( uint64* ptr, const v_uint64x8& a, const v_uint64x8& b, const v_uint64x8& c, + hal::StoreMode mode=hal::STORE_UNALIGNED ) +{ + __m512i mask0 = _v512_set_epu64( 5, 12, 7, 4, 11, 6, 3, 10); + __m512i mask1 = _v512_set_epu64(15, 7, 4, 14, 6, 3, 13, 5); + __m512i r1b1b2 = _mm512_permutex2var_epi64(a.val, mask0, c.val); + __m512i g2r2g1 = _mm512_permutex2var_epi64(b.val, mask1, c.val); + + __m512i bgr0 = _mm512_mask_expand_epi64(_mm512_mask_expand_epi64(_mm512_maskz_expand_epi64(0x49, a.val), 0x92, b.val), 0x24, c.val); + __m512i bgr1 = _mm512_mask_blend_epi64(0xdb, g2r2g1, r1b1b2); + __m512i bgr2 = _mm512_mask_blend_epi64(0xdb, r1b1b2, g2r2g1); + + if( mode == hal::STORE_ALIGNED_NOCACHE ) + { + _mm512_stream_si512((__m512i*)ptr, bgr0); + _mm512_stream_si512((__m512i*)(ptr + 8), bgr1); + _mm512_stream_si512((__m512i*)(ptr + 16), bgr2); + } + else if( mode == hal::STORE_ALIGNED ) + { + _mm512_store_si512((__m512i*)ptr, bgr0); + _mm512_store_si512((__m512i*)(ptr + 8), bgr1); + _mm512_store_si512((__m512i*)(ptr + 16), bgr2); + } + else + { + _mm512_storeu_si512((__m512i*)ptr, bgr0); + _mm512_storeu_si512((__m512i*)(ptr + 8), bgr1); + _mm512_storeu_si512((__m512i*)(ptr + 16), bgr2); + } +} + +inline void v_store_interleave( uchar* ptr, const v_uint8x64& a, const v_uint8x64& b, + const v_uint8x64& c, const v_uint8x64& d, + hal::StoreMode mode=hal::STORE_UNALIGNED ) +{ + v_uint8x64 br01, br23, ga01, ga23; + v_zip(a, c, br01, br23); + v_zip(b, d, ga01, ga23); + v_uint8x64 bgra0, bgra1, bgra2, bgra3; + v_zip(br01, ga01, bgra0, bgra1); + v_zip(br23, ga23, bgra2, bgra3); + + if( mode == hal::STORE_ALIGNED_NOCACHE ) + { + _mm512_stream_si512((__m512i*)ptr, bgra0.val); + _mm512_stream_si512((__m512i*)(ptr + 64), bgra1.val); + _mm512_stream_si512((__m512i*)(ptr + 128), bgra2.val); + _mm512_stream_si512((__m512i*)(ptr + 192), bgra3.val); + } + else if( mode == hal::STORE_ALIGNED ) + { + _mm512_store_si512((__m512i*)ptr, bgra0.val); + _mm512_store_si512((__m512i*)(ptr + 64), bgra1.val); + _mm512_store_si512((__m512i*)(ptr + 128), bgra2.val); + _mm512_store_si512((__m512i*)(ptr + 192), bgra3.val); + } + else + { + _mm512_storeu_si512((__m512i*)ptr, bgra0.val); + _mm512_storeu_si512((__m512i*)(ptr + 64), bgra1.val); + _mm512_storeu_si512((__m512i*)(ptr + 128), bgra2.val); + _mm512_storeu_si512((__m512i*)(ptr + 192), bgra3.val); + } +} + +inline void v_store_interleave( ushort* ptr, const v_uint16x32& a, const v_uint16x32& b, + const v_uint16x32& c, const v_uint16x32& d, + hal::StoreMode mode=hal::STORE_UNALIGNED ) +{ + v_uint16x32 br01, br23, ga01, ga23; + v_zip(a, c, br01, br23); + v_zip(b, d, ga01, ga23); + v_uint16x32 bgra0, bgra1, bgra2, bgra3; + v_zip(br01, ga01, bgra0, bgra1); + v_zip(br23, ga23, bgra2, bgra3); + + if( mode == hal::STORE_ALIGNED_NOCACHE ) + { + _mm512_stream_si512((__m512i*)ptr, bgra0.val); + _mm512_stream_si512((__m512i*)(ptr + 32), bgra1.val); + _mm512_stream_si512((__m512i*)(ptr + 64), bgra2.val); + 
_mm512_stream_si512((__m512i*)(ptr + 96), bgra3.val); + } + else if( mode == hal::STORE_ALIGNED ) + { + _mm512_store_si512((__m512i*)ptr, bgra0.val); + _mm512_store_si512((__m512i*)(ptr + 32), bgra1.val); + _mm512_store_si512((__m512i*)(ptr + 64), bgra2.val); + _mm512_store_si512((__m512i*)(ptr + 96), bgra3.val); + } + else + { + _mm512_storeu_si512((__m512i*)ptr, bgra0.val); + _mm512_storeu_si512((__m512i*)(ptr + 32), bgra1.val); + _mm512_storeu_si512((__m512i*)(ptr + 64), bgra2.val); + _mm512_storeu_si512((__m512i*)(ptr + 96), bgra3.val); + } +} + +inline void v_store_interleave( unsigned* ptr, const v_uint32x16& a, const v_uint32x16& b, + const v_uint32x16& c, const v_uint32x16& d, + hal::StoreMode mode=hal::STORE_UNALIGNED ) +{ + v_uint32x16 br01, br23, ga01, ga23; + v_zip(a, c, br01, br23); + v_zip(b, d, ga01, ga23); + v_uint32x16 bgra0, bgra1, bgra2, bgra3; + v_zip(br01, ga01, bgra0, bgra1); + v_zip(br23, ga23, bgra2, bgra3); + + if( mode == hal::STORE_ALIGNED_NOCACHE ) + { + _mm512_stream_si512((__m512i*)ptr, bgra0.val); + _mm512_stream_si512((__m512i*)(ptr + 16), bgra1.val); + _mm512_stream_si512((__m512i*)(ptr + 32), bgra2.val); + _mm512_stream_si512((__m512i*)(ptr + 48), bgra3.val); + } + else if( mode == hal::STORE_ALIGNED ) + { + _mm512_store_si512((__m512i*)ptr, bgra0.val); + _mm512_store_si512((__m512i*)(ptr + 16), bgra1.val); + _mm512_store_si512((__m512i*)(ptr + 32), bgra2.val); + _mm512_store_si512((__m512i*)(ptr + 48), bgra3.val); + } + else + { + _mm512_storeu_si512((__m512i*)ptr, bgra0.val); + _mm512_storeu_si512((__m512i*)(ptr + 16), bgra1.val); + _mm512_storeu_si512((__m512i*)(ptr + 32), bgra2.val); + _mm512_storeu_si512((__m512i*)(ptr + 48), bgra3.val); + } +} + +inline void v_store_interleave( uint64* ptr, const v_uint64x8& a, const v_uint64x8& b, + const v_uint64x8& c, const v_uint64x8& d, + hal::StoreMode mode=hal::STORE_UNALIGNED ) +{ + v_uint64x8 br01, br23, ga01, ga23; + v_zip(a, c, br01, br23); + v_zip(b, d, ga01, ga23); + v_uint64x8 bgra0, bgra1, bgra2, bgra3; + v_zip(br01, ga01, bgra0, bgra1); + v_zip(br23, ga23, bgra2, bgra3); + + if( mode == hal::STORE_ALIGNED_NOCACHE ) + { + _mm512_stream_si512((__m512i*)ptr, bgra0.val); + _mm512_stream_si512((__m512i*)(ptr + 8), bgra1.val); + _mm512_stream_si512((__m512i*)(ptr + 16), bgra2.val); + _mm512_stream_si512((__m512i*)(ptr + 24), bgra3.val); + } + else if( mode == hal::STORE_ALIGNED ) + { + _mm512_store_si512((__m512i*)ptr, bgra0.val); + _mm512_store_si512((__m512i*)(ptr + 8), bgra1.val); + _mm512_store_si512((__m512i*)(ptr + 16), bgra2.val); + _mm512_store_si512((__m512i*)(ptr + 24), bgra3.val); + } + else + { + _mm512_storeu_si512((__m512i*)ptr, bgra0.val); + _mm512_storeu_si512((__m512i*)(ptr + 8), bgra1.val); + _mm512_storeu_si512((__m512i*)(ptr + 16), bgra2.val); + _mm512_storeu_si512((__m512i*)(ptr + 24), bgra3.val); + } +} + +#define OPENCV_HAL_IMPL_AVX512_LOADSTORE_INTERLEAVE(_Tpvec0, _Tp0, suffix0, _Tpvec1, _Tp1, suffix1) \ +inline void v_load_deinterleave( const _Tp0* ptr, _Tpvec0& a0, _Tpvec0& b0 ) \ +{ \ + _Tpvec1 a1, b1; \ + v_load_deinterleave((const _Tp1*)ptr, a1, b1); \ + a0 = v_reinterpret_as_##suffix0(a1); \ + b0 = v_reinterpret_as_##suffix0(b1); \ +} \ +inline void v_load_deinterleave( const _Tp0* ptr, _Tpvec0& a0, _Tpvec0& b0, _Tpvec0& c0 ) \ +{ \ + _Tpvec1 a1, b1, c1; \ + v_load_deinterleave((const _Tp1*)ptr, a1, b1, c1); \ + a0 = v_reinterpret_as_##suffix0(a1); \ + b0 = v_reinterpret_as_##suffix0(b1); \ + c0 = v_reinterpret_as_##suffix0(c1); \ +} \ +inline void v_load_deinterleave( const 
_Tp0* ptr, _Tpvec0& a0, _Tpvec0& b0, _Tpvec0& c0, _Tpvec0& d0 ) \ +{ \ + _Tpvec1 a1, b1, c1, d1; \ + v_load_deinterleave((const _Tp1*)ptr, a1, b1, c1, d1); \ + a0 = v_reinterpret_as_##suffix0(a1); \ + b0 = v_reinterpret_as_##suffix0(b1); \ + c0 = v_reinterpret_as_##suffix0(c1); \ + d0 = v_reinterpret_as_##suffix0(d1); \ +} \ +inline void v_store_interleave( _Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0, \ + hal::StoreMode mode=hal::STORE_UNALIGNED ) \ +{ \ + _Tpvec1 a1 = v_reinterpret_as_##suffix1(a0); \ + _Tpvec1 b1 = v_reinterpret_as_##suffix1(b0); \ + v_store_interleave((_Tp1*)ptr, a1, b1, mode); \ +} \ +inline void v_store_interleave( _Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0, const _Tpvec0& c0, \ + hal::StoreMode mode=hal::STORE_UNALIGNED ) \ +{ \ + _Tpvec1 a1 = v_reinterpret_as_##suffix1(a0); \ + _Tpvec1 b1 = v_reinterpret_as_##suffix1(b0); \ + _Tpvec1 c1 = v_reinterpret_as_##suffix1(c0); \ + v_store_interleave((_Tp1*)ptr, a1, b1, c1, mode); \ +} \ +inline void v_store_interleave( _Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0, \ + const _Tpvec0& c0, const _Tpvec0& d0, \ + hal::StoreMode mode=hal::STORE_UNALIGNED ) \ +{ \ + _Tpvec1 a1 = v_reinterpret_as_##suffix1(a0); \ + _Tpvec1 b1 = v_reinterpret_as_##suffix1(b0); \ + _Tpvec1 c1 = v_reinterpret_as_##suffix1(c0); \ + _Tpvec1 d1 = v_reinterpret_as_##suffix1(d0); \ + v_store_interleave((_Tp1*)ptr, a1, b1, c1, d1, mode); \ +} + +OPENCV_HAL_IMPL_AVX512_LOADSTORE_INTERLEAVE(v_int8x64, schar, s8, v_uint8x64, uchar, u8) +OPENCV_HAL_IMPL_AVX512_LOADSTORE_INTERLEAVE(v_int16x32, short, s16, v_uint16x32, ushort, u16) +OPENCV_HAL_IMPL_AVX512_LOADSTORE_INTERLEAVE(v_int32x16, int, s32, v_uint32x16, unsigned, u32) +OPENCV_HAL_IMPL_AVX512_LOADSTORE_INTERLEAVE(v_float32x16, float, f32, v_uint32x16, unsigned, u32) +OPENCV_HAL_IMPL_AVX512_LOADSTORE_INTERLEAVE(v_int64x8, int64, s64, v_uint64x8, uint64, u64) +OPENCV_HAL_IMPL_AVX512_LOADSTORE_INTERLEAVE(v_float64x8, double, f64, v_uint64x8, uint64, u64) + +////////// Mask and checks ///////// + +/** Mask **/ +inline int64 v_signmask(const v_int8x64& a) { return (int64)_mm512_movepi8_mask(a.val); } +inline int v_signmask(const v_int16x32& a) { return (int)_mm512_cmp_epi16_mask(a.val, _mm512_setzero_si512(), _MM_CMPINT_LT); } +inline int v_signmask(const v_int32x16& a) { return (int)_mm512_cmp_epi32_mask(a.val, _mm512_setzero_si512(), _MM_CMPINT_LT); } +inline int v_signmask(const v_int64x8& a) { return (int)_mm512_cmp_epi64_mask(a.val, _mm512_setzero_si512(), _MM_CMPINT_LT); } + +inline int64 v_signmask(const v_uint8x64& a) { return v_signmask(v_reinterpret_as_s8(a)); } +inline int v_signmask(const v_uint16x32& a) { return v_signmask(v_reinterpret_as_s16(a)); } +inline int v_signmask(const v_uint32x16& a) { return v_signmask(v_reinterpret_as_s32(a)); } +inline int v_signmask(const v_uint64x8& a) { return v_signmask(v_reinterpret_as_s64(a)); } +inline int v_signmask(const v_float32x16& a) { return v_signmask(v_reinterpret_as_s32(a)); } +inline int v_signmask(const v_float64x8& a) { return v_signmask(v_reinterpret_as_s64(a)); } + +/** Checks **/ +inline bool v_check_all(const v_int8x64& a) { return !(bool)_mm512_cmp_epi8_mask(a.val, _mm512_setzero_si512(), _MM_CMPINT_NLT); } +inline bool v_check_any(const v_int8x64& a) { return (bool)_mm512_movepi8_mask(a.val); } +inline bool v_check_all(const v_int16x32& a) { return !(bool)_mm512_cmp_epi16_mask(a.val, _mm512_setzero_si512(), _MM_CMPINT_NLT); } +inline bool v_check_any(const v_int16x32& a) { return (bool)_mm512_cmp_epi16_mask(a.val, 
_mm512_setzero_si512(), _MM_CMPINT_LT); } +inline bool v_check_all(const v_int32x16& a) { return !(bool)_mm512_cmp_epi32_mask(a.val, _mm512_setzero_si512(), _MM_CMPINT_NLT); } +inline bool v_check_any(const v_int32x16& a) { return (bool)_mm512_cmp_epi32_mask(a.val, _mm512_setzero_si512(), _MM_CMPINT_LT); } +inline bool v_check_all(const v_int64x8& a) { return !(bool)_mm512_cmp_epi64_mask(a.val, _mm512_setzero_si512(), _MM_CMPINT_NLT); } +inline bool v_check_any(const v_int64x8& a) { return (bool)_mm512_cmp_epi64_mask(a.val, _mm512_setzero_si512(), _MM_CMPINT_LT); } + +inline bool v_check_all(const v_float32x16& a) { return v_check_all(v_reinterpret_as_s32(a)); } +inline bool v_check_any(const v_float32x16& a) { return v_check_any(v_reinterpret_as_s32(a)); } +inline bool v_check_all(const v_float64x8& a) { return v_check_all(v_reinterpret_as_s64(a)); } +inline bool v_check_any(const v_float64x8& a) { return v_check_any(v_reinterpret_as_s64(a)); } +inline bool v_check_all(const v_uint8x64& a) { return v_check_all(v_reinterpret_as_s8(a)); } +inline bool v_check_all(const v_uint16x32& a) { return v_check_all(v_reinterpret_as_s16(a)); } +inline bool v_check_all(const v_uint32x16& a) { return v_check_all(v_reinterpret_as_s32(a)); } +inline bool v_check_all(const v_uint64x8& a) { return v_check_all(v_reinterpret_as_s64(a)); } +inline bool v_check_any(const v_uint8x64& a) { return v_check_any(v_reinterpret_as_s8(a)); } +inline bool v_check_any(const v_uint16x32& a) { return v_check_any(v_reinterpret_as_s16(a)); } +inline bool v_check_any(const v_uint32x16& a) { return v_check_any(v_reinterpret_as_s32(a)); } +inline bool v_check_any(const v_uint64x8& a) { return v_check_any(v_reinterpret_as_s64(a)); } + +inline int v_scan_forward(const v_int8x64& a) +{ + int64 mask = _mm512_movepi8_mask(a.val); + int mask32 = (int)mask; + return mask != 0 ? mask32 != 0 ? trailingZeros32(mask32) : 32 + trailingZeros32((int)(mask >> 32)) : 0; +} +inline int v_scan_forward(const v_uint8x64& a) { return v_scan_forward(v_reinterpret_as_s8(a)); } +inline int v_scan_forward(const v_int16x32& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s16(a))); } +inline int v_scan_forward(const v_uint16x32& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s16(a))); } +inline int v_scan_forward(const v_int32x16& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s16(a))) / 2; } +inline int v_scan_forward(const v_uint32x16& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s16(a))) / 2; } +inline int v_scan_forward(const v_float32x16& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s16(a))) / 2; } +inline int v_scan_forward(const v_int64x8& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s16(a))) / 4; } +inline int v_scan_forward(const v_uint64x8& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s16(a))) / 4; } +inline int v_scan_forward(const v_float64x8& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s16(a))) / 4; } + +inline void v512_cleanup() { _mm256_zeroall(); } + +CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END + +//! @endcond + +} // cv:: + +#endif // OPENCV_HAL_INTRIN_AVX_HPP diff --git a/3rdparty/opencv/include/opencv2/core/hal/intrin_msa.hpp b/3rdparty/opencv/include/opencv2/core/hal/intrin_msa.hpp new file mode 100644 index 00000000..a1fbb093 --- /dev/null +++ b/3rdparty/opencv/include/opencv2/core/hal/intrin_msa.hpp @@ -0,0 +1,1887 @@ +// This file is part of OpenCV project. 
+// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. + +#ifndef OPENCV_HAL_INTRIN_MSA_HPP +#define OPENCV_HAL_INTRIN_MSA_HPP + +#include +#include "opencv2/core/utility.hpp" + +namespace cv +{ + +//! @cond IGNORED +CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN + +#define CV_SIMD128 1 + +//MSA implements 128-bit wide vector registers shared with the 64-bit wide floating-point unit registers. +//MSA and FPU can not be both present, unless the FPU has 64-bit floating-point registers. +#define CV_SIMD128_64F 1 + +struct v_uint8x16 +{ + typedef uchar lane_type; + enum { nlanes = 16 }; + + v_uint8x16() {} + explicit v_uint8x16(v16u8 v) : val(v) {} + v_uint8x16(uchar v0, uchar v1, uchar v2, uchar v3, uchar v4, uchar v5, uchar v6, uchar v7, + uchar v8, uchar v9, uchar v10, uchar v11, uchar v12, uchar v13, uchar v14, uchar v15) + { + uchar v[] = {v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15}; + val = msa_ld1q_u8(v); + } + + uchar get0() const + { + return msa_getq_lane_u8(val, 0); + } + + v16u8 val; +}; + +struct v_int8x16 +{ + typedef schar lane_type; + enum { nlanes = 16 }; + + v_int8x16() {} + explicit v_int8x16(v16i8 v) : val(v) {} + v_int8x16(schar v0, schar v1, schar v2, schar v3, schar v4, schar v5, schar v6, schar v7, + schar v8, schar v9, schar v10, schar v11, schar v12, schar v13, schar v14, schar v15) + { + schar v[] = {v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15}; + val = msa_ld1q_s8(v); + } + + schar get0() const + { + return msa_getq_lane_s8(val, 0); + } + + v16i8 val; +}; + +struct v_uint16x8 +{ + typedef ushort lane_type; + enum { nlanes = 8 }; + + v_uint16x8() {} + explicit v_uint16x8(v8u16 v) : val(v) {} + v_uint16x8(ushort v0, ushort v1, ushort v2, ushort v3, ushort v4, ushort v5, ushort v6, ushort v7) + { + ushort v[] = {v0, v1, v2, v3, v4, v5, v6, v7}; + val = msa_ld1q_u16(v); + } + + ushort get0() const + { + return msa_getq_lane_u16(val, 0); + } + + v8u16 val; +}; + +struct v_int16x8 +{ + typedef short lane_type; + enum { nlanes = 8 }; + + v_int16x8() {} + explicit v_int16x8(v8i16 v) : val(v) {} + v_int16x8(short v0, short v1, short v2, short v3, short v4, short v5, short v6, short v7) + { + short v[] = {v0, v1, v2, v3, v4, v5, v6, v7}; + val = msa_ld1q_s16(v); + } + + short get0() const + { + return msa_getq_lane_s16(val, 0); + } + + v8i16 val; +}; + +struct v_uint32x4 +{ + typedef unsigned int lane_type; + enum { nlanes = 4 }; + + v_uint32x4() {} + explicit v_uint32x4(v4u32 v) : val(v) {} + v_uint32x4(unsigned int v0, unsigned int v1, unsigned int v2, unsigned int v3) + { + unsigned int v[] = {v0, v1, v2, v3}; + val = msa_ld1q_u32(v); + } + + unsigned int get0() const + { + return msa_getq_lane_u32(val, 0); + } + + v4u32 val; +}; + +struct v_int32x4 +{ + typedef int lane_type; + enum { nlanes = 4 }; + + v_int32x4() {} + explicit v_int32x4(v4i32 v) : val(v) {} + v_int32x4(int v0, int v1, int v2, int v3) + { + int v[] = {v0, v1, v2, v3}; + val = msa_ld1q_s32(v); + } + + int get0() const + { + return msa_getq_lane_s32(val, 0); + } + + v4i32 val; +}; + +struct v_float32x4 +{ + typedef float lane_type; + enum { nlanes = 4 }; + + v_float32x4() {} + explicit v_float32x4(v4f32 v) : val(v) {} + v_float32x4(float v0, float v1, float v2, float v3) + { + float v[] = {v0, v1, v2, v3}; + val = msa_ld1q_f32(v); + } + + float get0() const + { + return msa_getq_lane_f32(val, 0); + } + + v4f32 val; +}; + +struct v_uint64x2 +{ + typedef uint64 
lane_type; + enum { nlanes = 2 }; + + v_uint64x2() {} + explicit v_uint64x2(v2u64 v) : val(v) {} + v_uint64x2(uint64 v0, uint64 v1) + { + uint64 v[] = {v0, v1}; + val = msa_ld1q_u64(v); + } + + uint64 get0() const + { + return msa_getq_lane_u64(val, 0); + } + + v2u64 val; +}; + +struct v_int64x2 +{ + typedef int64 lane_type; + enum { nlanes = 2 }; + + v_int64x2() {} + explicit v_int64x2(v2i64 v) : val(v) {} + v_int64x2(int64 v0, int64 v1) + { + int64 v[] = {v0, v1}; + val = msa_ld1q_s64(v); + } + + int64 get0() const + { + return msa_getq_lane_s64(val, 0); + } + + v2i64 val; +}; + +struct v_float64x2 +{ + typedef double lane_type; + enum { nlanes = 2 }; + + v_float64x2() {} + explicit v_float64x2(v2f64 v) : val(v) {} + v_float64x2(double v0, double v1) + { + double v[] = {v0, v1}; + val = msa_ld1q_f64(v); + } + + double get0() const + { + return msa_getq_lane_f64(val, 0); + } + + v2f64 val; +}; + +#define OPENCV_HAL_IMPL_MSA_INIT(_Tpv, _Tp, suffix) \ +inline v_##_Tpv v_setzero_##suffix() { return v_##_Tpv(msa_dupq_n_##suffix((_Tp)0)); } \ +inline v_##_Tpv v_setall_##suffix(_Tp v) { return v_##_Tpv(msa_dupq_n_##suffix(v)); } \ +inline v_uint8x16 v_reinterpret_as_u8(const v_##_Tpv& v) { return v_uint8x16(MSA_TPV_REINTERPRET(v16u8, v.val)); } \ +inline v_int8x16 v_reinterpret_as_s8(const v_##_Tpv& v) { return v_int8x16(MSA_TPV_REINTERPRET(v16i8, v.val)); } \ +inline v_uint16x8 v_reinterpret_as_u16(const v_##_Tpv& v) { return v_uint16x8(MSA_TPV_REINTERPRET(v8u16, v.val)); } \ +inline v_int16x8 v_reinterpret_as_s16(const v_##_Tpv& v) { return v_int16x8(MSA_TPV_REINTERPRET(v8i16, v.val)); } \ +inline v_uint32x4 v_reinterpret_as_u32(const v_##_Tpv& v) { return v_uint32x4(MSA_TPV_REINTERPRET(v4u32, v.val)); } \ +inline v_int32x4 v_reinterpret_as_s32(const v_##_Tpv& v) { return v_int32x4(MSA_TPV_REINTERPRET(v4i32, v.val)); } \ +inline v_uint64x2 v_reinterpret_as_u64(const v_##_Tpv& v) { return v_uint64x2(MSA_TPV_REINTERPRET(v2u64, v.val)); } \ +inline v_int64x2 v_reinterpret_as_s64(const v_##_Tpv& v) { return v_int64x2(MSA_TPV_REINTERPRET(v2i64, v.val)); } \ +inline v_float32x4 v_reinterpret_as_f32(const v_##_Tpv& v) { return v_float32x4(MSA_TPV_REINTERPRET(v4f32, v.val)); } \ +inline v_float64x2 v_reinterpret_as_f64(const v_##_Tpv& v) { return v_float64x2(MSA_TPV_REINTERPRET(v2f64, v.val)); } + +OPENCV_HAL_IMPL_MSA_INIT(uint8x16, uchar, u8) +OPENCV_HAL_IMPL_MSA_INIT(int8x16, schar, s8) +OPENCV_HAL_IMPL_MSA_INIT(uint16x8, ushort, u16) +OPENCV_HAL_IMPL_MSA_INIT(int16x8, short, s16) +OPENCV_HAL_IMPL_MSA_INIT(uint32x4, unsigned int, u32) +OPENCV_HAL_IMPL_MSA_INIT(int32x4, int, s32) +OPENCV_HAL_IMPL_MSA_INIT(uint64x2, uint64, u64) +OPENCV_HAL_IMPL_MSA_INIT(int64x2, int64, s64) +OPENCV_HAL_IMPL_MSA_INIT(float32x4, float, f32) +OPENCV_HAL_IMPL_MSA_INIT(float64x2, double, f64) + +#define OPENCV_HAL_IMPL_MSA_PACK(_Tpvec, _Tpwvec, pack, mov, rshr) \ +inline _Tpvec v_##pack(const _Tpwvec& a, const _Tpwvec& b) \ +{ \ + return _Tpvec(mov(a.val, b.val)); \ +} \ +template<int n> inline \ +_Tpvec v_rshr_##pack(const _Tpwvec& a, const _Tpwvec& b) \ +{ \ + return _Tpvec(rshr(a.val, b.val, n)); \ +} + +OPENCV_HAL_IMPL_MSA_PACK(v_uint8x16, v_uint16x8, pack, msa_qpack_u16, msa_qrpackr_u16) +OPENCV_HAL_IMPL_MSA_PACK(v_int8x16, v_int16x8, pack, msa_qpack_s16, msa_qrpackr_s16) +OPENCV_HAL_IMPL_MSA_PACK(v_uint16x8, v_uint32x4, pack, msa_qpack_u32, msa_qrpackr_u32) +OPENCV_HAL_IMPL_MSA_PACK(v_int16x8, v_int32x4, pack, msa_qpack_s32, msa_qrpackr_s32) +OPENCV_HAL_IMPL_MSA_PACK(v_uint32x4, v_uint64x2, pack, msa_pack_u64,
msa_rpackr_u64) +OPENCV_HAL_IMPL_MSA_PACK(v_int32x4, v_int64x2, pack, msa_pack_s64, msa_rpackr_s64) +OPENCV_HAL_IMPL_MSA_PACK(v_uint8x16, v_int16x8, pack_u, msa_qpacku_s16, msa_qrpackru_s16) +OPENCV_HAL_IMPL_MSA_PACK(v_uint16x8, v_int32x4, pack_u, msa_qpacku_s32, msa_qrpackru_s32) + +#define OPENCV_HAL_IMPL_MSA_PACK_STORE(_Tpvec, _Tp, hreg, suffix, _Tpwvec, pack, mov, rshr) \ +inline void v_##pack##_store(_Tp* ptr, const _Tpwvec& a) \ +{ \ + hreg a1 = mov(a.val); \ + msa_st1_##suffix(ptr, a1); \ +} \ +template<int n> inline \ +void v_rshr_##pack##_store(_Tp* ptr, const _Tpwvec& a) \ +{ \ + hreg a1 = rshr(a.val, n); \ + msa_st1_##suffix(ptr, a1); \ +} + +OPENCV_HAL_IMPL_MSA_PACK_STORE(v_uint8x16, uchar, v8u8, u8, v_uint16x8, pack, msa_qmovn_u16, msa_qrshrn_n_u16) +OPENCV_HAL_IMPL_MSA_PACK_STORE(v_int8x16, schar, v8i8, s8, v_int16x8, pack, msa_qmovn_s16, msa_qrshrn_n_s16) +OPENCV_HAL_IMPL_MSA_PACK_STORE(v_uint16x8, ushort, v4u16, u16, v_uint32x4, pack, msa_qmovn_u32, msa_qrshrn_n_u32) +OPENCV_HAL_IMPL_MSA_PACK_STORE(v_int16x8, short, v4i16, s16, v_int32x4, pack, msa_qmovn_s32, msa_qrshrn_n_s32) +OPENCV_HAL_IMPL_MSA_PACK_STORE(v_uint32x4, unsigned, v2u32, u32, v_uint64x2, pack, msa_movn_u64, msa_rshrn_n_u64) +OPENCV_HAL_IMPL_MSA_PACK_STORE(v_int32x4, int, v2i32, s32, v_int64x2, pack, msa_movn_s64, msa_rshrn_n_s64) +OPENCV_HAL_IMPL_MSA_PACK_STORE(v_uint8x16, uchar, v8u8, u8, v_int16x8, pack_u, msa_qmovun_s16, msa_qrshrun_n_s16) +OPENCV_HAL_IMPL_MSA_PACK_STORE(v_uint16x8, ushort, v4u16, u16, v_int32x4, pack_u, msa_qmovun_s32, msa_qrshrun_n_s32) + +// pack boolean +inline v_uint8x16 v_pack_b(const v_uint16x8& a, const v_uint16x8& b) +{ + return v_uint8x16(msa_pack_u16(a.val, b.val)); +} + +inline v_uint8x16 v_pack_b(const v_uint32x4& a, const v_uint32x4& b, + const v_uint32x4& c, const v_uint32x4& d) +{ + return v_uint8x16(msa_pack_u16(msa_pack_u32(a.val, b.val), msa_pack_u32(c.val, d.val))); +} + +inline v_uint8x16 v_pack_b(const v_uint64x2& a, const v_uint64x2& b, const v_uint64x2& c, + const v_uint64x2& d, const v_uint64x2& e, const v_uint64x2& f, + const v_uint64x2& g, const v_uint64x2& h) +{ + v8u16 abcd = msa_pack_u32(msa_pack_u64(a.val, b.val), msa_pack_u64(c.val, d.val)); + v8u16 efgh = msa_pack_u32(msa_pack_u64(e.val, f.val), msa_pack_u64(g.val, h.val)); + return v_uint8x16(msa_pack_u16(abcd, efgh)); +} + +inline v_float32x4 v_matmul(const v_float32x4& v, const v_float32x4& m0, + const v_float32x4& m1, const v_float32x4& m2, + const v_float32x4& m3) +{ + v4f32 v0 = v.val; + v4f32 res = msa_mulq_lane_f32(m0.val, v0, 0); + res = msa_mlaq_lane_f32(res, m1.val, v0, 1); + res = msa_mlaq_lane_f32(res, m2.val, v0, 2); + res = msa_mlaq_lane_f32(res, m3.val, v0, 3); + return v_float32x4(res); +} + +inline v_float32x4 v_matmuladd(const v_float32x4& v, const v_float32x4& m0, + const v_float32x4& m1, const v_float32x4& m2, + const v_float32x4& a) +{ + v4f32 v0 = v.val; + v4f32 res = msa_mulq_lane_f32(m0.val, v0, 0); + res = msa_mlaq_lane_f32(res, m1.val, v0, 1); + res = msa_mlaq_lane_f32(res, m2.val, v0, 2); + res = msa_addq_f32(res, a.val); + return v_float32x4(res); +} + +#define OPENCV_HAL_IMPL_MSA_BIN_OP(bin_op, _Tpvec, intrin) \ +inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \ +{ \ + return _Tpvec(intrin(a.val, b.val)); \ +} \ +inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b) \ +{ \ + a.val = intrin(a.val, b.val); \ + return a; \ +} + +OPENCV_HAL_IMPL_MSA_BIN_OP(+, v_uint8x16, msa_qaddq_u8) +OPENCV_HAL_IMPL_MSA_BIN_OP(-, v_uint8x16, msa_qsubq_u8)
+OPENCV_HAL_IMPL_MSA_BIN_OP(+, v_int8x16, msa_qaddq_s8) +OPENCV_HAL_IMPL_MSA_BIN_OP(-, v_int8x16, msa_qsubq_s8) +OPENCV_HAL_IMPL_MSA_BIN_OP(+, v_uint16x8, msa_qaddq_u16) +OPENCV_HAL_IMPL_MSA_BIN_OP(-, v_uint16x8, msa_qsubq_u16) +OPENCV_HAL_IMPL_MSA_BIN_OP(+, v_int16x8, msa_qaddq_s16) +OPENCV_HAL_IMPL_MSA_BIN_OP(-, v_int16x8, msa_qsubq_s16) +OPENCV_HAL_IMPL_MSA_BIN_OP(+, v_int32x4, msa_addq_s32) +OPENCV_HAL_IMPL_MSA_BIN_OP(-, v_int32x4, msa_subq_s32) +OPENCV_HAL_IMPL_MSA_BIN_OP(*, v_int32x4, msa_mulq_s32) +OPENCV_HAL_IMPL_MSA_BIN_OP(+, v_uint32x4, msa_addq_u32) +OPENCV_HAL_IMPL_MSA_BIN_OP(-, v_uint32x4, msa_subq_u32) +OPENCV_HAL_IMPL_MSA_BIN_OP(*, v_uint32x4, msa_mulq_u32) +OPENCV_HAL_IMPL_MSA_BIN_OP(+, v_float32x4, msa_addq_f32) +OPENCV_HAL_IMPL_MSA_BIN_OP(-, v_float32x4, msa_subq_f32) +OPENCV_HAL_IMPL_MSA_BIN_OP(*, v_float32x4, msa_mulq_f32) +OPENCV_HAL_IMPL_MSA_BIN_OP(+, v_int64x2, msa_addq_s64) +OPENCV_HAL_IMPL_MSA_BIN_OP(-, v_int64x2, msa_subq_s64) +OPENCV_HAL_IMPL_MSA_BIN_OP(+, v_uint64x2, msa_addq_u64) +OPENCV_HAL_IMPL_MSA_BIN_OP(-, v_uint64x2, msa_subq_u64) +OPENCV_HAL_IMPL_MSA_BIN_OP(/, v_float32x4, msa_divq_f32) +OPENCV_HAL_IMPL_MSA_BIN_OP(+, v_float64x2, msa_addq_f64) +OPENCV_HAL_IMPL_MSA_BIN_OP(-, v_float64x2, msa_subq_f64) +OPENCV_HAL_IMPL_MSA_BIN_OP(*, v_float64x2, msa_mulq_f64) +OPENCV_HAL_IMPL_MSA_BIN_OP(/, v_float64x2, msa_divq_f64) + +// saturating multiply 8-bit, 16-bit +#define OPENCV_HAL_IMPL_MSA_MUL_SAT(_Tpvec, _Tpwvec) \ +inline _Tpvec operator * (const _Tpvec& a, const _Tpvec& b) \ +{ \ + _Tpwvec c, d; \ + v_mul_expand(a, b, c, d); \ + return v_pack(c, d); \ +} \ +inline _Tpvec& operator *= (_Tpvec& a, const _Tpvec& b) \ +{a = a * b; return a; } + +OPENCV_HAL_IMPL_MSA_MUL_SAT(v_int8x16, v_int16x8) +OPENCV_HAL_IMPL_MSA_MUL_SAT(v_uint8x16, v_uint16x8) +OPENCV_HAL_IMPL_MSA_MUL_SAT(v_int16x8, v_int32x4) +OPENCV_HAL_IMPL_MSA_MUL_SAT(v_uint16x8, v_uint32x4) + +// Multiply and expand +inline void v_mul_expand(const v_int8x16& a, const v_int8x16& b, + v_int16x8& c, v_int16x8& d) +{ + v16i8 a_lo, a_hi, b_lo, b_hi; + + ILVRL_B2_SB(a.val, msa_dupq_n_s8(0), a_lo, a_hi); + ILVRL_B2_SB(b.val, msa_dupq_n_s8(0), b_lo, b_hi); + c.val = msa_mulq_s16(msa_paddlq_s8(a_lo), msa_paddlq_s8(b_lo)); + d.val = msa_mulq_s16(msa_paddlq_s8(a_hi), msa_paddlq_s8(b_hi)); +} + +inline void v_mul_expand(const v_uint8x16& a, const v_uint8x16& b, + v_uint16x8& c, v_uint16x8& d) +{ + v16u8 a_lo, a_hi, b_lo, b_hi; + + ILVRL_B2_UB(a.val, msa_dupq_n_u8(0), a_lo, a_hi); + ILVRL_B2_UB(b.val, msa_dupq_n_u8(0), b_lo, b_hi); + c.val = msa_mulq_u16(msa_paddlq_u8(a_lo), msa_paddlq_u8(b_lo)); + d.val = msa_mulq_u16(msa_paddlq_u8(a_hi), msa_paddlq_u8(b_hi)); +} + +inline void v_mul_expand(const v_int16x8& a, const v_int16x8& b, + v_int32x4& c, v_int32x4& d) +{ + v8i16 a_lo, a_hi, b_lo, b_hi; + + ILVRL_H2_SH(a.val, msa_dupq_n_s16(0), a_lo, a_hi); + ILVRL_H2_SH(b.val, msa_dupq_n_s16(0), b_lo, b_hi); + c.val = msa_mulq_s32(msa_paddlq_s16(a_lo), msa_paddlq_s16(b_lo)); + d.val = msa_mulq_s32(msa_paddlq_s16(a_hi), msa_paddlq_s16(b_hi)); +} + +inline void v_mul_expand(const v_uint16x8& a, const v_uint16x8& b, + v_uint32x4& c, v_uint32x4& d) +{ + v8u16 a_lo, a_hi, b_lo, b_hi; + + ILVRL_H2_UH(a.val, msa_dupq_n_u16(0), a_lo, a_hi); + ILVRL_H2_UH(b.val, msa_dupq_n_u16(0), b_lo, b_hi); + c.val = msa_mulq_u32(msa_paddlq_u16(a_lo), msa_paddlq_u16(b_lo)); + d.val = msa_mulq_u32(msa_paddlq_u16(a_hi), msa_paddlq_u16(b_hi)); +} + +inline void v_mul_expand(const v_uint32x4& a, const v_uint32x4& b, + v_uint64x2& c, v_uint64x2& d) +{ + 
v4u32 a_lo, a_hi, b_lo, b_hi; + + ILVRL_W2_UW(a.val, msa_dupq_n_u32(0), a_lo, a_hi); + ILVRL_W2_UW(b.val, msa_dupq_n_u32(0), b_lo, b_hi); + c.val = msa_mulq_u64(msa_paddlq_u32(a_lo), msa_paddlq_u32(b_lo)); + d.val = msa_mulq_u64(msa_paddlq_u32(a_hi), msa_paddlq_u32(b_hi)); +} + +inline v_int16x8 v_mul_hi(const v_int16x8& a, const v_int16x8& b) +{ + v8i16 a_lo, a_hi, b_lo, b_hi; + + ILVRL_H2_SH(a.val, msa_dupq_n_s16(0), a_lo, a_hi); + ILVRL_H2_SH(b.val, msa_dupq_n_s16(0), b_lo, b_hi); + + return v_int16x8(msa_packr_s32(msa_mulq_s32(msa_paddlq_s16(a_lo), msa_paddlq_s16(b_lo)), + msa_mulq_s32(msa_paddlq_s16(a_hi), msa_paddlq_s16(b_hi)), 16)); +} + +inline v_uint16x8 v_mul_hi(const v_uint16x8& a, const v_uint16x8& b) +{ + v8u16 a_lo, a_hi, b_lo, b_hi; + + ILVRL_H2_UH(a.val, msa_dupq_n_u16(0), a_lo, a_hi); + ILVRL_H2_UH(b.val, msa_dupq_n_u16(0), b_lo, b_hi); + + return v_uint16x8(msa_packr_u32(msa_mulq_u32(msa_paddlq_u16(a_lo), msa_paddlq_u16(b_lo)), + msa_mulq_u32(msa_paddlq_u16(a_hi), msa_paddlq_u16(b_hi)), 16)); +} + +//////// Dot Product //////// + +// 16 >> 32 +inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b) +{ return v_int32x4(msa_dotp_s_w(a.val, b.val)); } +inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c) +{ return v_int32x4(msa_dpadd_s_w(c.val , a.val, b.val)); } + +// 32 >> 64 +inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b) +{ return v_int64x2(msa_dotp_s_d(a.val, b.val)); } +inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b, const v_int64x2& c) +{ return v_int64x2(msa_dpadd_s_d(c.val , a.val, b.val)); } + +// 8 >> 32 +inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b) +{ + v8u16 even_a = msa_shrq_n_u16(msa_shlq_n_u16(MSA_TPV_REINTERPRET(v8u16, a.val), 8), 8); + v8u16 odd_a = msa_shrq_n_u16(MSA_TPV_REINTERPRET(v8u16, a.val), 8); + v8u16 even_b = msa_shrq_n_u16(msa_shlq_n_u16(MSA_TPV_REINTERPRET(v8u16, b.val), 8), 8); + v8u16 odd_b = msa_shrq_n_u16(MSA_TPV_REINTERPRET(v8u16, b.val), 8); + v4u32 prod = msa_dotp_u_w(even_a, even_b); + return v_uint32x4(msa_dpadd_u_w(prod, odd_a, odd_b)); +} +inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b, const v_uint32x4& c) +{ + v8u16 even_a = msa_shrq_n_u16(msa_shlq_n_u16(MSA_TPV_REINTERPRET(v8u16, a.val), 8), 8); + v8u16 odd_a = msa_shrq_n_u16(MSA_TPV_REINTERPRET(v8u16, a.val), 8); + v8u16 even_b = msa_shrq_n_u16(msa_shlq_n_u16(MSA_TPV_REINTERPRET(v8u16, b.val), 8), 8); + v8u16 odd_b = msa_shrq_n_u16(MSA_TPV_REINTERPRET(v8u16, b.val), 8); + v4u32 prod = msa_dpadd_u_w(c.val, even_a, even_b); + return v_uint32x4(msa_dpadd_u_w(prod, odd_a, odd_b)); +} + +inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b) +{ + v8i16 prod = msa_dotp_s_h(a.val, b.val); + return v_int32x4(msa_hadd_s32(prod, prod)); +} +inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b, + const v_int32x4& c) +{ return v_dotprod_expand(a, b) + c; } + +// 16 >> 64 +inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b) +{ + v4u32 even_a = msa_shrq_n_u32(msa_shlq_n_u32(MSA_TPV_REINTERPRET(v4u32, a.val), 16), 16); + v4u32 odd_a = msa_shrq_n_u32(MSA_TPV_REINTERPRET(v4u32, a.val), 16); + v4u32 even_b = msa_shrq_n_u32(msa_shlq_n_u32(MSA_TPV_REINTERPRET(v4u32, b.val), 16), 16); + v4u32 odd_b = msa_shrq_n_u32(MSA_TPV_REINTERPRET(v4u32, b.val), 16); + v2u64 prod = msa_dotp_u_d(even_a, even_b); + return v_uint64x2(msa_dpadd_u_d(prod, odd_a, odd_b)); +} +inline v_uint64x2 
v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b, + const v_uint64x2& c) +{ + v4u32 even_a = msa_shrq_n_u32(msa_shlq_n_u32(MSA_TPV_REINTERPRET(v4u32, a.val), 16), 16); + v4u32 odd_a = msa_shrq_n_u32(MSA_TPV_REINTERPRET(v4u32, a.val), 16); + v4u32 even_b = msa_shrq_n_u32(msa_shlq_n_u32(MSA_TPV_REINTERPRET(v4u32, b.val), 16), 16); + v4u32 odd_b = msa_shrq_n_u32(MSA_TPV_REINTERPRET(v4u32, b.val), 16); + v2u64 prod = msa_dpadd_u_d(c.val, even_a, even_b); + return v_uint64x2(msa_dpadd_u_d(prod, odd_a, odd_b)); +} + +inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b) +{ + v4i32 prod = msa_dotp_s_w(a.val, b.val); + return v_int64x2(msa_hadd_s64(prod, prod)); +} +inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b, const v_int64x2& c) +{ return v_dotprod_expand(a, b) + c; } + +// 32 >> 64f +inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b) +{ return v_cvt_f64(v_dotprod(a, b)); } +inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b, const v_float64x2& c) +{ return v_dotprod_expand(a, b) + c; } + + +//////// Fast Dot Product //////// + +// 16 >> 32 +inline v_int32x4 v_dotprod_fast(const v_int16x8& a, const v_int16x8& b) +{ return v_dotprod(a, b); } +inline v_int32x4 v_dotprod_fast(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c) +{ return v_dotprod(a, b, c); } + +// 32 >> 64 +inline v_int64x2 v_dotprod_fast(const v_int32x4& a, const v_int32x4& b) +{ return v_dotprod(a, b); } +inline v_int64x2 v_dotprod_fast(const v_int32x4& a, const v_int32x4& b, const v_int64x2& c) +{ return v_dotprod(a, b, c); } + +// 8 >> 32 +inline v_uint32x4 v_dotprod_expand_fast(const v_uint8x16& a, const v_uint8x16& b) +{ return v_dotprod_expand(a, b); } +inline v_uint32x4 v_dotprod_expand_fast(const v_uint8x16& a, const v_uint8x16& b, const v_uint32x4& c) +{ return v_dotprod_expand(a, b, c); } +inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b) +{ return v_dotprod_expand(a, b); } +inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b, const v_int32x4& c) +{ return v_dotprod_expand(a, b, c); } + +// 16 >> 64 +inline v_uint64x2 v_dotprod_expand_fast(const v_uint16x8& a, const v_uint16x8& b) +{ return v_dotprod_expand(a, b); } +inline v_uint64x2 v_dotprod_expand_fast(const v_uint16x8& a, const v_uint16x8& b, const v_uint64x2& c) +{ return v_dotprod_expand(a, b, c); } +inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b) +{ return v_dotprod_expand(a, b); } +inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b, const v_int64x2& c) +{ return v_dotprod_expand(a, b, c); } + +// 32 >> 64f +inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b) +{ return v_dotprod_expand(a, b); } +inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b, const v_float64x2& c) +{ return v_dotprod_expand(a, b, c); } + +#define OPENCV_HAL_IMPL_MSA_LOGIC_OP(_Tpvec, _Tpv, suffix) \ +OPENCV_HAL_IMPL_MSA_BIN_OP(&, _Tpvec, msa_andq_##suffix) \ +OPENCV_HAL_IMPL_MSA_BIN_OP(|, _Tpvec, msa_orrq_##suffix) \ +OPENCV_HAL_IMPL_MSA_BIN_OP(^, _Tpvec, msa_eorq_##suffix) \ +inline _Tpvec operator ~ (const _Tpvec& a) \ +{ \ + return _Tpvec(MSA_TPV_REINTERPRET(_Tpv, msa_mvnq_u8(MSA_TPV_REINTERPRET(v16u8, a.val)))); \ +} + +OPENCV_HAL_IMPL_MSA_LOGIC_OP(v_uint8x16, v16u8, u8) +OPENCV_HAL_IMPL_MSA_LOGIC_OP(v_int8x16, v16i8, s8) +OPENCV_HAL_IMPL_MSA_LOGIC_OP(v_uint16x8, v8u16, u16) 
+OPENCV_HAL_IMPL_MSA_LOGIC_OP(v_int16x8, v8i16, s16) +OPENCV_HAL_IMPL_MSA_LOGIC_OP(v_uint32x4, v4u32, u32) +OPENCV_HAL_IMPL_MSA_LOGIC_OP(v_int32x4, v4i32, s32) +OPENCV_HAL_IMPL_MSA_LOGIC_OP(v_uint64x2, v2u64, u64) +OPENCV_HAL_IMPL_MSA_LOGIC_OP(v_int64x2, v2i64, s64) + +#define OPENCV_HAL_IMPL_MSA_FLT_BIT_OP(bin_op, intrin) \ +inline v_float32x4 operator bin_op (const v_float32x4& a, const v_float32x4& b) \ +{ \ + return v_float32x4(MSA_TPV_REINTERPRET(v4f32, intrin(MSA_TPV_REINTERPRET(v4i32, a.val), MSA_TPV_REINTERPRET(v4i32, b.val)))); \ +} \ +inline v_float32x4& operator bin_op##= (v_float32x4& a, const v_float32x4& b) \ +{ \ + a.val = MSA_TPV_REINTERPRET(v4f32, intrin(MSA_TPV_REINTERPRET(v4i32, a.val), MSA_TPV_REINTERPRET(v4i32, b.val))); \ + return a; \ +} + +OPENCV_HAL_IMPL_MSA_FLT_BIT_OP(&, msa_andq_s32) +OPENCV_HAL_IMPL_MSA_FLT_BIT_OP(|, msa_orrq_s32) +OPENCV_HAL_IMPL_MSA_FLT_BIT_OP(^, msa_eorq_s32) + +inline v_float32x4 operator ~ (const v_float32x4& a) +{ + return v_float32x4(MSA_TPV_REINTERPRET(v4f32, msa_mvnq_s32(MSA_TPV_REINTERPRET(v4i32, a.val)))); +} + +/* v_abs */ +#define OPENCV_HAL_IMPL_MSA_ABS(_Tpuvec, _Tpsvec, usuffix, ssuffix) \ +inline _Tpuvec v_abs(const _Tpsvec& a) \ +{ \ + return v_reinterpret_as_##usuffix(_Tpsvec(msa_absq_##ssuffix(a.val))); \ +} + +OPENCV_HAL_IMPL_MSA_ABS(v_uint8x16, v_int8x16, u8, s8) +OPENCV_HAL_IMPL_MSA_ABS(v_uint16x8, v_int16x8, u16, s16) +OPENCV_HAL_IMPL_MSA_ABS(v_uint32x4, v_int32x4, u32, s32) + +/* v_abs(float), v_sqrt, v_invsqrt */ +#define OPENCV_HAL_IMPL_MSA_BASIC_FUNC(_Tpvec, func, intrin) \ +inline _Tpvec func(const _Tpvec& a) \ +{ \ + return _Tpvec(intrin(a.val)); \ +} + +OPENCV_HAL_IMPL_MSA_BASIC_FUNC(v_float32x4, v_abs, msa_absq_f32) +OPENCV_HAL_IMPL_MSA_BASIC_FUNC(v_float64x2, v_abs, msa_absq_f64) +OPENCV_HAL_IMPL_MSA_BASIC_FUNC(v_float32x4, v_sqrt, msa_sqrtq_f32) +OPENCV_HAL_IMPL_MSA_BASIC_FUNC(v_float32x4, v_invsqrt, msa_rsqrtq_f32) +OPENCV_HAL_IMPL_MSA_BASIC_FUNC(v_float64x2, v_sqrt, msa_sqrtq_f64) +OPENCV_HAL_IMPL_MSA_BASIC_FUNC(v_float64x2, v_invsqrt, msa_rsqrtq_f64) + +#define OPENCV_HAL_IMPL_MSA_DBL_BIT_OP(bin_op, intrin) \ +inline v_float64x2 operator bin_op (const v_float64x2& a, const v_float64x2& b) \ +{ \ + return v_float64x2(MSA_TPV_REINTERPRET(v2f64, intrin(MSA_TPV_REINTERPRET(v2i64, a.val), MSA_TPV_REINTERPRET(v2i64, b.val)))); \ +} \ +inline v_float64x2& operator bin_op##= (v_float64x2& a, const v_float64x2& b) \ +{ \ + a.val = MSA_TPV_REINTERPRET(v2f64, intrin(MSA_TPV_REINTERPRET(v2i64, a.val), MSA_TPV_REINTERPRET(v2i64, b.val))); \ + return a; \ +} + +OPENCV_HAL_IMPL_MSA_DBL_BIT_OP(&, msa_andq_s64) +OPENCV_HAL_IMPL_MSA_DBL_BIT_OP(|, msa_orrq_s64) +OPENCV_HAL_IMPL_MSA_DBL_BIT_OP(^, msa_eorq_s64) + +inline v_float64x2 operator ~ (const v_float64x2& a) +{ + return v_float64x2(MSA_TPV_REINTERPRET(v2f64, msa_mvnq_s32(MSA_TPV_REINTERPRET(v4i32, a.val)))); +} + +// TODO: exp, log, sin, cos + +#define OPENCV_HAL_IMPL_MSA_BIN_FUNC(_Tpvec, func, intrin) \ +inline _Tpvec func(const _Tpvec& a, const _Tpvec& b) \ +{ \ + return _Tpvec(intrin(a.val, b.val)); \ +} + +OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_uint8x16, v_min, msa_minq_u8) +OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_uint8x16, v_max, msa_maxq_u8) +OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_int8x16, v_min, msa_minq_s8) +OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_int8x16, v_max, msa_maxq_s8) +OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_uint16x8, v_min, msa_minq_u16) +OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_uint16x8, v_max, msa_maxq_u16) +OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_int16x8, v_min, msa_minq_s16) 
+OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_int16x8, v_max, msa_maxq_s16) +OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_uint32x4, v_min, msa_minq_u32) +OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_uint32x4, v_max, msa_maxq_u32) +OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_int32x4, v_min, msa_minq_s32) +OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_int32x4, v_max, msa_maxq_s32) +OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_float32x4, v_min, msa_minq_f32) +OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_float32x4, v_max, msa_maxq_f32) +OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_float64x2, v_min, msa_minq_f64) +OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_float64x2, v_max, msa_maxq_f64) + +#define OPENCV_HAL_IMPL_MSA_INT_CMP_OP(_Tpvec, _Tpv, suffix, not_suffix) \ +inline _Tpvec operator == (const _Tpvec& a, const _Tpvec& b) \ +{ return _Tpvec(MSA_TPV_REINTERPRET(_Tpv, msa_ceqq_##suffix(a.val, b.val))); } \ +inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \ +{ return _Tpvec(MSA_TPV_REINTERPRET(_Tpv, msa_mvnq_##not_suffix(msa_ceqq_##suffix(a.val, b.val)))); } \ +inline _Tpvec operator < (const _Tpvec& a, const _Tpvec& b) \ +{ return _Tpvec(MSA_TPV_REINTERPRET(_Tpv, msa_cltq_##suffix(a.val, b.val))); } \ +inline _Tpvec operator > (const _Tpvec& a, const _Tpvec& b) \ +{ return _Tpvec(MSA_TPV_REINTERPRET(_Tpv, msa_cgtq_##suffix(a.val, b.val))); } \ +inline _Tpvec operator <= (const _Tpvec& a, const _Tpvec& b) \ +{ return _Tpvec(MSA_TPV_REINTERPRET(_Tpv, msa_cleq_##suffix(a.val, b.val))); } \ +inline _Tpvec operator >= (const _Tpvec& a, const _Tpvec& b) \ +{ return _Tpvec(MSA_TPV_REINTERPRET(_Tpv, msa_cgeq_##suffix(a.val, b.val))); } + +OPENCV_HAL_IMPL_MSA_INT_CMP_OP(v_uint8x16, v16u8, u8, u8) +OPENCV_HAL_IMPL_MSA_INT_CMP_OP(v_int8x16, v16i8, s8, u8) +OPENCV_HAL_IMPL_MSA_INT_CMP_OP(v_uint16x8, v8u16, u16, u16) +OPENCV_HAL_IMPL_MSA_INT_CMP_OP(v_int16x8, v8i16, s16, u16) +OPENCV_HAL_IMPL_MSA_INT_CMP_OP(v_uint32x4, v4u32, u32, u32) +OPENCV_HAL_IMPL_MSA_INT_CMP_OP(v_int32x4, v4i32, s32, u32) +OPENCV_HAL_IMPL_MSA_INT_CMP_OP(v_float32x4, v4f32, f32, u32) +OPENCV_HAL_IMPL_MSA_INT_CMP_OP(v_uint64x2, v2u64, u64, u64) +OPENCV_HAL_IMPL_MSA_INT_CMP_OP(v_int64x2, v2i64, s64, u64) +OPENCV_HAL_IMPL_MSA_INT_CMP_OP(v_float64x2, v2f64, f64, u64) + +inline v_float32x4 v_not_nan(const v_float32x4& a) +{ return v_float32x4(MSA_TPV_REINTERPRET(v4f32, msa_ceqq_f32(a.val, a.val))); } +inline v_float64x2 v_not_nan(const v_float64x2& a) +{ return v_float64x2(MSA_TPV_REINTERPRET(v2f64, msa_ceqq_f64(a.val, a.val))); } + +OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_uint8x16, v_add_wrap, msa_addq_u8) +OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_int8x16, v_add_wrap, msa_addq_s8) +OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_uint16x8, v_add_wrap, msa_addq_u16) +OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_int16x8, v_add_wrap, msa_addq_s16) +OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_uint8x16, v_sub_wrap, msa_subq_u8) +OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_int8x16, v_sub_wrap, msa_subq_s8) +OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_uint16x8, v_sub_wrap, msa_subq_u16) +OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_int16x8, v_sub_wrap, msa_subq_s16) +OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_uint8x16, v_mul_wrap, msa_mulq_u8) +OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_int8x16, v_mul_wrap, msa_mulq_s8) +OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_uint16x8, v_mul_wrap, msa_mulq_u16) +OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_int16x8, v_mul_wrap, msa_mulq_s16) + +OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_uint8x16, v_absdiff, msa_abdq_u8) +OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_uint16x8, v_absdiff, msa_abdq_u16) +OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_uint32x4, v_absdiff, msa_abdq_u32) +OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_float32x4, v_absdiff, msa_abdq_f32) +OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_float64x2, v_absdiff, 
msa_abdq_f64) + +/** Saturating absolute difference **/ +OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_int8x16, v_absdiffs, msa_qabdq_s8) +OPENCV_HAL_IMPL_MSA_BIN_FUNC(v_int16x8, v_absdiffs, msa_qabdq_s16) + +#define OPENCV_HAL_IMPL_MSA_BIN_FUNC2(_Tpvec, _Tpvec2, _Tpv, func, intrin) \ +inline _Tpvec2 func(const _Tpvec& a, const _Tpvec& b) \ +{ \ + return _Tpvec2(MSA_TPV_REINTERPRET(_Tpv, intrin(a.val, b.val))); \ +} + +OPENCV_HAL_IMPL_MSA_BIN_FUNC2(v_int8x16, v_uint8x16, v16u8, v_absdiff, msa_abdq_s8) +OPENCV_HAL_IMPL_MSA_BIN_FUNC2(v_int16x8, v_uint16x8, v8u16, v_absdiff, msa_abdq_s16) +OPENCV_HAL_IMPL_MSA_BIN_FUNC2(v_int32x4, v_uint32x4, v4u32, v_absdiff, msa_abdq_s32) + +/* v_magnitude, v_sqr_magnitude, v_fma, v_muladd */ +inline v_float32x4 v_magnitude(const v_float32x4& a, const v_float32x4& b) +{ + v_float32x4 x(msa_mlaq_f32(msa_mulq_f32(a.val, a.val), b.val, b.val)); + return v_sqrt(x); +} + +inline v_float32x4 v_sqr_magnitude(const v_float32x4& a, const v_float32x4& b) +{ + return v_float32x4(msa_mlaq_f32(msa_mulq_f32(a.val, a.val), b.val, b.val)); +} + +inline v_float32x4 v_fma(const v_float32x4& a, const v_float32x4& b, const v_float32x4& c) +{ + return v_float32x4(msa_mlaq_f32(c.val, a.val, b.val)); +} + +inline v_int32x4 v_fma(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c) +{ + return v_int32x4(msa_mlaq_s32(c.val, a.val, b.val)); +} + +inline v_float32x4 v_muladd(const v_float32x4& a, const v_float32x4& b, const v_float32x4& c) +{ + return v_fma(a, b, c); +} + +inline v_int32x4 v_muladd(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c) +{ + return v_fma(a, b, c); +} + +inline v_float64x2 v_magnitude(const v_float64x2& a, const v_float64x2& b) +{ + v_float64x2 x(msa_mlaq_f64(msa_mulq_f64(a.val, a.val), b.val, b.val)); + return v_sqrt(x); +} + +inline v_float64x2 v_sqr_magnitude(const v_float64x2& a, const v_float64x2& b) +{ + return v_float64x2(msa_mlaq_f64(msa_mulq_f64(a.val, a.val), b.val, b.val)); +} + +inline v_float64x2 v_fma(const v_float64x2& a, const v_float64x2& b, const v_float64x2& c) +{ + return v_float64x2(msa_mlaq_f64(c.val, a.val, b.val)); +} + +inline v_float64x2 v_muladd(const v_float64x2& a, const v_float64x2& b, const v_float64x2& c) +{ + return v_fma(a, b, c); +} + +// trade efficiency for convenience +#define OPENCV_HAL_IMPL_MSA_SHIFT_OP(_Tpvec, suffix, _Tps, ssuffix) \ +inline _Tpvec operator << (const _Tpvec& a, int n) \ +{ return _Tpvec(msa_shlq_##suffix(a.val, msa_dupq_n_##ssuffix((_Tps)n))); } \ +inline _Tpvec operator >> (const _Tpvec& a, int n) \ +{ return _Tpvec(msa_shrq_##suffix(a.val, msa_dupq_n_##ssuffix((_Tps)n))); } \ +template inline _Tpvec v_shl(const _Tpvec& a) \ +{ return _Tpvec(msa_shlq_n_##suffix(a.val, n)); } \ +template inline _Tpvec v_shr(const _Tpvec& a) \ +{ return _Tpvec(msa_shrq_n_##suffix(a.val, n)); } \ +template inline _Tpvec v_rshr(const _Tpvec& a) \ +{ return _Tpvec(msa_rshrq_n_##suffix(a.val, n)); } + +OPENCV_HAL_IMPL_MSA_SHIFT_OP(v_uint8x16, u8, schar, s8) +OPENCV_HAL_IMPL_MSA_SHIFT_OP(v_int8x16, s8, schar, s8) +OPENCV_HAL_IMPL_MSA_SHIFT_OP(v_uint16x8, u16, short, s16) +OPENCV_HAL_IMPL_MSA_SHIFT_OP(v_int16x8, s16, short, s16) +OPENCV_HAL_IMPL_MSA_SHIFT_OP(v_uint32x4, u32, int, s32) +OPENCV_HAL_IMPL_MSA_SHIFT_OP(v_int32x4, s32, int, s32) +OPENCV_HAL_IMPL_MSA_SHIFT_OP(v_uint64x2, u64, int64, s64) +OPENCV_HAL_IMPL_MSA_SHIFT_OP(v_int64x2, s64, int64, s64) + +/* v_rotate_right, v_rotate_left */ +#define OPENCV_HAL_IMPL_MSA_ROTATE_OP(_Tpvec, _Tpv, _Tpvs, suffix) \ +template inline _Tpvec v_rotate_right(const _Tpvec& a) \ +{ 
\ + return _Tpvec(MSA_TPV_REINTERPRET(_Tpv, msa_extq_##suffix(MSA_TPV_REINTERPRET(_Tpvs, a.val), msa_dupq_n_##suffix(0), n))); \ +} \ +template inline _Tpvec v_rotate_left(const _Tpvec& a) \ +{ \ + return _Tpvec(MSA_TPV_REINTERPRET(_Tpv, msa_extq_##suffix(msa_dupq_n_##suffix(0), MSA_TPV_REINTERPRET(_Tpvs, a.val), _Tpvec::nlanes - n))); \ +} \ +template<> inline _Tpvec v_rotate_left<0>(const _Tpvec& a) \ +{ \ + return a; \ +} \ +template inline _Tpvec v_rotate_right(const _Tpvec& a, const _Tpvec& b) \ +{ \ + return _Tpvec(MSA_TPV_REINTERPRET(_Tpv, msa_extq_##suffix(MSA_TPV_REINTERPRET(_Tpvs, a.val), MSA_TPV_REINTERPRET(_Tpvs, b.val), n))); \ +} \ +template inline _Tpvec v_rotate_left(const _Tpvec& a, const _Tpvec& b) \ +{ \ + return _Tpvec(MSA_TPV_REINTERPRET(_Tpv, msa_extq_##suffix(MSA_TPV_REINTERPRET(_Tpvs, b.val), MSA_TPV_REINTERPRET(_Tpvs, a.val), _Tpvec::nlanes - n))); \ +} \ +template<> inline _Tpvec v_rotate_left<0>(const _Tpvec& a, const _Tpvec& b) \ +{ \ + CV_UNUSED(b); \ + return a; \ +} + +OPENCV_HAL_IMPL_MSA_ROTATE_OP(v_uint8x16, v16u8, v16i8, s8) +OPENCV_HAL_IMPL_MSA_ROTATE_OP(v_int8x16, v16i8, v16i8, s8) +OPENCV_HAL_IMPL_MSA_ROTATE_OP(v_uint16x8, v8u16, v8i16, s16) +OPENCV_HAL_IMPL_MSA_ROTATE_OP(v_int16x8, v8i16, v8i16, s16) +OPENCV_HAL_IMPL_MSA_ROTATE_OP(v_uint32x4, v4u32, v4i32, s32) +OPENCV_HAL_IMPL_MSA_ROTATE_OP(v_int32x4, v4i32, v4i32, s32) +OPENCV_HAL_IMPL_MSA_ROTATE_OP(v_float32x4, v4f32, v4i32, s32) +OPENCV_HAL_IMPL_MSA_ROTATE_OP(v_uint64x2, v2u64, v2i64, s64) +OPENCV_HAL_IMPL_MSA_ROTATE_OP(v_int64x2, v2i64, v2i64, s64) +OPENCV_HAL_IMPL_MSA_ROTATE_OP(v_float64x2, v2f64, v2i64, s64) + +#define OPENCV_HAL_IMPL_MSA_LOADSTORE_OP(_Tpvec, _Tp, suffix) \ +inline _Tpvec v_load(const _Tp* ptr) \ +{ return _Tpvec(msa_ld1q_##suffix(ptr)); } \ +inline _Tpvec v_load_aligned(const _Tp* ptr) \ +{ return _Tpvec(msa_ld1q_##suffix(ptr)); } \ +inline _Tpvec v_load_low(const _Tp* ptr) \ +{ return _Tpvec(msa_combine_##suffix(msa_ld1_##suffix(ptr), msa_dup_n_##suffix((_Tp)0))); } \ +inline _Tpvec v_load_halves(const _Tp* ptr0, const _Tp* ptr1) \ +{ return _Tpvec(msa_combine_##suffix(msa_ld1_##suffix(ptr0), msa_ld1_##suffix(ptr1))); } \ +inline void v_store(_Tp* ptr, const _Tpvec& a) \ +{ msa_st1q_##suffix(ptr, a.val); } \ +inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \ +{ msa_st1q_##suffix(ptr, a.val); } \ +inline void v_store_aligned_nocache(_Tp* ptr, const _Tpvec& a) \ +{ msa_st1q_##suffix(ptr, a.val); } \ +inline void v_store(_Tp* ptr, const _Tpvec& a, hal::StoreMode /*mode*/) \ +{ msa_st1q_##suffix(ptr, a.val); } \ +inline void v_store_low(_Tp* ptr, const _Tpvec& a) \ +{ \ + int n = _Tpvec::nlanes; \ + for( int i = 0; i < (n/2); i++ ) \ + ptr[i] = a.val[i]; \ +} \ +inline void v_store_high(_Tp* ptr, const _Tpvec& a) \ +{ \ + int n = _Tpvec::nlanes; \ + for( int i = 0; i < (n/2); i++ ) \ + ptr[i] = a.val[i+(n/2)]; \ +} + +OPENCV_HAL_IMPL_MSA_LOADSTORE_OP(v_uint8x16, uchar, u8) +OPENCV_HAL_IMPL_MSA_LOADSTORE_OP(v_int8x16, schar, s8) +OPENCV_HAL_IMPL_MSA_LOADSTORE_OP(v_uint16x8, ushort, u16) +OPENCV_HAL_IMPL_MSA_LOADSTORE_OP(v_int16x8, short, s16) +OPENCV_HAL_IMPL_MSA_LOADSTORE_OP(v_uint32x4, unsigned, u32) +OPENCV_HAL_IMPL_MSA_LOADSTORE_OP(v_int32x4, int, s32) +OPENCV_HAL_IMPL_MSA_LOADSTORE_OP(v_uint64x2, uint64, u64) +OPENCV_HAL_IMPL_MSA_LOADSTORE_OP(v_int64x2, int64, s64) +OPENCV_HAL_IMPL_MSA_LOADSTORE_OP(v_float32x4, float, f32) +OPENCV_HAL_IMPL_MSA_LOADSTORE_OP(v_float64x2, double, f64) + + +/** Reverse **/ +inline v_uint8x16 v_reverse(const v_uint8x16 &a) +{ + v_uint8x16 c 
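+    // (editor's note, not in the original header) The v2i64 constant below is a
+    // byte-index table {15,14,...,1,0}: MSA's VSHF.B writes input byte 15-i into
+    // lane i, so the lane order of 'a' is reversed with a single shuffle.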
= v_uint8x16((v16u8)__builtin_msa_vshf_b((v16i8)((v2i64){0x08090A0B0C0D0E0F, 0x0001020304050607}), msa_dupq_n_s8(0), (v16i8)a.val)); + return c; +} + +inline v_int8x16 v_reverse(const v_int8x16 &a) +{ return v_reinterpret_as_s8(v_reverse(v_reinterpret_as_u8(a))); } + +inline v_uint16x8 v_reverse(const v_uint16x8 &a) +{ + v_uint16x8 c = v_uint16x8((v8u16)__builtin_msa_vshf_h((v8i16)((v2i64){0x0004000500060007, 0x0000000100020003}), msa_dupq_n_s16(0), (v8i16)a.val)); + return c; +} + +inline v_int16x8 v_reverse(const v_int16x8 &a) +{ return v_reinterpret_as_s16(v_reverse(v_reinterpret_as_u16(a))); } + +inline v_uint32x4 v_reverse(const v_uint32x4 &a) +{ + v_uint32x4 c; + c.val[0] = a.val[3]; + c.val[1] = a.val[2]; + c.val[2] = a.val[1]; + c.val[3] = a.val[0]; + return c; +} + +inline v_int32x4 v_reverse(const v_int32x4 &a) +{ return v_reinterpret_as_s32(v_reverse(v_reinterpret_as_u32(a))); } + +inline v_float32x4 v_reverse(const v_float32x4 &a) +{ return v_reinterpret_as_f32(v_reverse(v_reinterpret_as_u32(a))); } + +inline v_uint64x2 v_reverse(const v_uint64x2 &a) +{ + v_uint64x2 c; + c.val[0] = a.val[1]; + c.val[1] = a.val[0]; + return c; +} + +inline v_int64x2 v_reverse(const v_int64x2 &a) +{ return v_reinterpret_as_s64(v_reverse(v_reinterpret_as_u64(a))); } + +inline v_float64x2 v_reverse(const v_float64x2 &a) +{ return v_reinterpret_as_f64(v_reverse(v_reinterpret_as_u64(a))); } + + +#define OPENCV_HAL_IMPL_MSA_REDUCE_OP_8U(func, cfunc) \ +inline unsigned short v_reduce_##func(const v_uint16x8& a) \ +{ \ + v8u16 a_lo, a_hi; \ + ILVRL_H2_UH(a.val, msa_dupq_n_u16(0), a_lo, a_hi); \ + v4u32 b = msa_##func##q_u32(msa_paddlq_u16(a_lo), msa_paddlq_u16(a_hi)); \ + v4u32 b_lo, b_hi; \ + ILVRL_W2_UW(b, msa_dupq_n_u32(0), b_lo, b_hi); \ + v2u64 c = msa_##func##q_u64(msa_paddlq_u32(b_lo), msa_paddlq_u32(b_hi)); \ + return (unsigned short)cfunc(c[0], c[1]); \ +} + +OPENCV_HAL_IMPL_MSA_REDUCE_OP_8U(max, std::max) +OPENCV_HAL_IMPL_MSA_REDUCE_OP_8U(min, std::min) + +#define OPENCV_HAL_IMPL_MSA_REDUCE_OP_8S(func, cfunc) \ +inline short v_reduce_##func(const v_int16x8& a) \ +{ \ + v8i16 a_lo, a_hi; \ + ILVRL_H2_SH(a.val, msa_dupq_n_s16(0), a_lo, a_hi); \ + v4i32 b = msa_##func##q_s32(msa_paddlq_s16(a_lo), msa_paddlq_s16(a_hi)); \ + v4i32 b_lo, b_hi; \ + ILVRL_W2_SW(b, msa_dupq_n_s32(0), b_lo, b_hi); \ + v2i64 c = msa_##func##q_s64(msa_paddlq_s32(b_lo), msa_paddlq_s32(b_hi)); \ + return (short)cfunc(c[0], c[1]); \ +} + +OPENCV_HAL_IMPL_MSA_REDUCE_OP_8S(max, std::max) +OPENCV_HAL_IMPL_MSA_REDUCE_OP_8S(min, std::min) + +#define OPENCV_HAL_IMPL_MSA_REDUCE_OP_4(_Tpvec, scalartype, func, cfunc) \ +inline scalartype v_reduce_##func(const _Tpvec& a) \ +{ \ + return (scalartype)cfunc(cfunc(a.val[0], a.val[1]), cfunc(a.val[2], a.val[3])); \ +} + +OPENCV_HAL_IMPL_MSA_REDUCE_OP_4(v_uint32x4, unsigned, max, std::max) +OPENCV_HAL_IMPL_MSA_REDUCE_OP_4(v_uint32x4, unsigned, min, std::min) +OPENCV_HAL_IMPL_MSA_REDUCE_OP_4(v_int32x4, int, max, std::max) +OPENCV_HAL_IMPL_MSA_REDUCE_OP_4(v_int32x4, int, min, std::min) +OPENCV_HAL_IMPL_MSA_REDUCE_OP_4(v_float32x4, float, max, std::max) +OPENCV_HAL_IMPL_MSA_REDUCE_OP_4(v_float32x4, float, min, std::min) + + +#define OPENCV_HAL_IMPL_MSA_REDUCE_OP_16(_Tpvec, scalartype, _Tpvec2, func) \ +inline scalartype v_reduce_##func(const _Tpvec& a) \ +{ \ + _Tpvec2 a1, a2; \ + v_expand(a, a1, a2); \ + return (scalartype)v_reduce_##func(v_##func(a1, a2)); \ +} + +OPENCV_HAL_IMPL_MSA_REDUCE_OP_16(v_uint8x16, uchar, v_uint16x8, min) +OPENCV_HAL_IMPL_MSA_REDUCE_OP_16(v_uint8x16, uchar, 
v_uint16x8, max) +OPENCV_HAL_IMPL_MSA_REDUCE_OP_16(v_int8x16, char, v_int16x8, min) +OPENCV_HAL_IMPL_MSA_REDUCE_OP_16(v_int8x16, char, v_int16x8, max) + + + +#define OPENCV_HAL_IMPL_MSA_REDUCE_SUM(_Tpvec, scalartype, suffix) \ +inline scalartype v_reduce_sum(const _Tpvec& a) \ +{ \ + return (scalartype)msa_sum_##suffix(a.val); \ +} + +OPENCV_HAL_IMPL_MSA_REDUCE_SUM(v_uint8x16, unsigned char, u8) +OPENCV_HAL_IMPL_MSA_REDUCE_SUM(v_int8x16, char, s8) +OPENCV_HAL_IMPL_MSA_REDUCE_SUM(v_uint16x8, unsigned short, u16) +OPENCV_HAL_IMPL_MSA_REDUCE_SUM(v_int16x8, short, s16) +OPENCV_HAL_IMPL_MSA_REDUCE_SUM(v_uint32x4, unsigned, u32) +OPENCV_HAL_IMPL_MSA_REDUCE_SUM(v_int32x4, int, s32) +OPENCV_HAL_IMPL_MSA_REDUCE_SUM(v_float32x4, float, f32) + +inline uint64 v_reduce_sum(const v_uint64x2& a) +{ return (uint64)(msa_getq_lane_u64(a.val, 0) + msa_getq_lane_u64(a.val, 1)); } +inline int64 v_reduce_sum(const v_int64x2& a) +{ return (int64)(msa_getq_lane_s64(a.val, 0) + msa_getq_lane_s64(a.val, 1)); } +inline double v_reduce_sum(const v_float64x2& a) +{ + return msa_getq_lane_f64(a.val, 0) + msa_getq_lane_f64(a.val, 1); +} + +/* v_reduce_sum4, v_reduce_sad */ +inline v_float32x4 v_reduce_sum4(const v_float32x4& a, const v_float32x4& b, + const v_float32x4& c, const v_float32x4& d) +{ + v4f32 u0 = msa_addq_f32(MSA_TPV_REINTERPRET(v4f32, msa_ilvevq_s32(MSA_TPV_REINTERPRET(v4i32, b.val), MSA_TPV_REINTERPRET(v4i32, a.val))), + MSA_TPV_REINTERPRET(v4f32, msa_ilvodq_s32(MSA_TPV_REINTERPRET(v4i32, b.val), MSA_TPV_REINTERPRET(v4i32, a.val)))); // a0+a1 b0+b1 a2+a3 b2+b3 + v4f32 u1 = msa_addq_f32(MSA_TPV_REINTERPRET(v4f32, msa_ilvevq_s32(MSA_TPV_REINTERPRET(v4i32, d.val), MSA_TPV_REINTERPRET(v4i32, c.val))), + MSA_TPV_REINTERPRET(v4f32, msa_ilvodq_s32(MSA_TPV_REINTERPRET(v4i32, d.val), MSA_TPV_REINTERPRET(v4i32, c.val)))); // c0+c1 d0+d1 c2+c3 d2+d3 + + return v_float32x4(msa_addq_f32(MSA_TPV_REINTERPRET(v4f32, msa_ilvrq_s64(MSA_TPV_REINTERPRET(v2i64, u1), MSA_TPV_REINTERPRET(v2i64, u0))), + MSA_TPV_REINTERPRET(v4f32, msa_ilvlq_s64(MSA_TPV_REINTERPRET(v2i64, u1), MSA_TPV_REINTERPRET(v2i64, u0))))); +} + +inline unsigned v_reduce_sad(const v_uint8x16& a, const v_uint8x16& b) +{ + v16u8 t0 = msa_abdq_u8(a.val, b.val); + v8u16 t1 = msa_paddlq_u8(t0); + v4u32 t2 = msa_paddlq_u16(t1); + return msa_sum_u32(t2); +} +inline unsigned v_reduce_sad(const v_int8x16& a, const v_int8x16& b) +{ + v16u8 t0 = MSA_TPV_REINTERPRET(v16u8, msa_abdq_s8(a.val, b.val)); + v8u16 t1 = msa_paddlq_u8(t0); + v4u32 t2 = msa_paddlq_u16(t1); + return msa_sum_u32(t2); +} +inline unsigned v_reduce_sad(const v_uint16x8& a, const v_uint16x8& b) +{ + v8u16 t0 = msa_abdq_u16(a.val, b.val); + v4u32 t1 = msa_paddlq_u16(t0); + return msa_sum_u32(t1); +} +inline unsigned v_reduce_sad(const v_int16x8& a, const v_int16x8& b) +{ + v8u16 t0 = MSA_TPV_REINTERPRET(v8u16, msa_abdq_s16(a.val, b.val)); + v4u32 t1 = msa_paddlq_u16(t0); + return msa_sum_u32(t1); +} +inline unsigned v_reduce_sad(const v_uint32x4& a, const v_uint32x4& b) +{ + v4u32 t0 = msa_abdq_u32(a.val, b.val); + return msa_sum_u32(t0); +} +inline unsigned v_reduce_sad(const v_int32x4& a, const v_int32x4& b) +{ + v4u32 t0 = MSA_TPV_REINTERPRET(v4u32, msa_abdq_s32(a.val, b.val)); + return msa_sum_u32(t0); +} +inline float v_reduce_sad(const v_float32x4& a, const v_float32x4& b) +{ + v4f32 t0 = msa_abdq_f32(a.val, b.val); + return msa_sum_f32(t0); +} + +/* v_popcount */ +#define OPENCV_HAL_IMPL_MSA_POPCOUNT_SIZE8(_Tpvec) \ +inline v_uint8x16 v_popcount(const _Tpvec& a) \ +{ \ + v16u8 t = 
MSA_TPV_REINTERPRET(v16u8, msa_cntq_s8(MSA_TPV_REINTERPRET(v16i8, a.val))); \ + return v_uint8x16(t); \ +} +OPENCV_HAL_IMPL_MSA_POPCOUNT_SIZE8(v_uint8x16) +OPENCV_HAL_IMPL_MSA_POPCOUNT_SIZE8(v_int8x16) + +#define OPENCV_HAL_IMPL_MSA_POPCOUNT_SIZE16(_Tpvec) \ +inline v_uint16x8 v_popcount(const _Tpvec& a) \ +{ \ + v8u16 t = MSA_TPV_REINTERPRET(v8u16, msa_cntq_s16(MSA_TPV_REINTERPRET(v8i16, a.val))); \ + return v_uint16x8(t); \ +} +OPENCV_HAL_IMPL_MSA_POPCOUNT_SIZE16(v_uint16x8) +OPENCV_HAL_IMPL_MSA_POPCOUNT_SIZE16(v_int16x8) + +#define OPENCV_HAL_IMPL_MSA_POPCOUNT_SIZE32(_Tpvec) \ +inline v_uint32x4 v_popcount(const _Tpvec& a) \ +{ \ + v4u32 t = MSA_TPV_REINTERPRET(v4u32, msa_cntq_s32(MSA_TPV_REINTERPRET(v4i32, a.val))); \ + return v_uint32x4(t); \ +} +OPENCV_HAL_IMPL_MSA_POPCOUNT_SIZE32(v_uint32x4) +OPENCV_HAL_IMPL_MSA_POPCOUNT_SIZE32(v_int32x4) + +#define OPENCV_HAL_IMPL_MSA_POPCOUNT_SIZE64(_Tpvec) \ +inline v_uint64x2 v_popcount(const _Tpvec& a) \ +{ \ + v2u64 t = MSA_TPV_REINTERPRET(v2u64, msa_cntq_s64(MSA_TPV_REINTERPRET(v2i64, a.val))); \ + return v_uint64x2(t); \ +} +OPENCV_HAL_IMPL_MSA_POPCOUNT_SIZE64(v_uint64x2) +OPENCV_HAL_IMPL_MSA_POPCOUNT_SIZE64(v_int64x2) + +inline int v_signmask(const v_uint8x16& a) +{ + v8i8 m0 = msa_create_s8(CV_BIG_UINT(0x0706050403020100)); + v16u8 v0 = msa_shlq_u8(msa_shrq_n_u8(a.val, 7), msa_combine_s8(m0, m0)); + v8u16 v1 = msa_paddlq_u8(v0); + v4u32 v2 = msa_paddlq_u16(v1); + v2u64 v3 = msa_paddlq_u32(v2); + return (int)msa_getq_lane_u64(v3, 0) + ((int)msa_getq_lane_u64(v3, 1) << 8); +} +inline int v_signmask(const v_int8x16& a) +{ return v_signmask(v_reinterpret_as_u8(a)); } + +inline int v_signmask(const v_uint16x8& a) +{ + v4i16 m0 = msa_create_s16(CV_BIG_UINT(0x0003000200010000)); + v8u16 v0 = msa_shlq_u16(msa_shrq_n_u16(a.val, 15), msa_combine_s16(m0, m0)); + v4u32 v1 = msa_paddlq_u16(v0); + v2u64 v2 = msa_paddlq_u32(v1); + return (int)msa_getq_lane_u64(v2, 0) + ((int)msa_getq_lane_u64(v2, 1) << 4); +} +inline int v_signmask(const v_int16x8& a) +{ return v_signmask(v_reinterpret_as_u16(a)); } + +inline int v_signmask(const v_uint32x4& a) +{ + v2i32 m0 = msa_create_s32(CV_BIG_UINT(0x0000000100000000)); + v4u32 v0 = msa_shlq_u32(msa_shrq_n_u32(a.val, 31), msa_combine_s32(m0, m0)); + v2u64 v1 = msa_paddlq_u32(v0); + return (int)msa_getq_lane_u64(v1, 0) + ((int)msa_getq_lane_u64(v1, 1) << 2); +} +inline int v_signmask(const v_int32x4& a) +{ return v_signmask(v_reinterpret_as_u32(a)); } +inline int v_signmask(const v_float32x4& a) +{ return v_signmask(v_reinterpret_as_u32(a)); } + +inline int v_signmask(const v_uint64x2& a) +{ + v2u64 v0 = msa_shrq_n_u64(a.val, 63); + return (int)msa_getq_lane_u64(v0, 0) + ((int)msa_getq_lane_u64(v0, 1) << 1); +} +inline int v_signmask(const v_int64x2& a) +{ return v_signmask(v_reinterpret_as_u64(a)); } +inline int v_signmask(const v_float64x2& a) +{ return v_signmask(v_reinterpret_as_u64(a)); } + +inline int v_scan_forward(const v_int8x16& a) { return trailingZeros32(v_signmask(a)); } +inline int v_scan_forward(const v_uint8x16& a) { return trailingZeros32(v_signmask(a)); } +inline int v_scan_forward(const v_int16x8& a) { return trailingZeros32(v_signmask(a)); } +inline int v_scan_forward(const v_uint16x8& a) { return trailingZeros32(v_signmask(a)); } +inline int v_scan_forward(const v_int32x4& a) { return trailingZeros32(v_signmask(a)); } +inline int v_scan_forward(const v_uint32x4& a) { return trailingZeros32(v_signmask(a)); } +inline int v_scan_forward(const v_float32x4& a) { return trailingZeros32(v_signmask(a)); 
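+// (editor's note, illustrative only; not part of the original header)
+// v_signmask packs the most significant bit of every lane into an integer with
+// lane 0 in bit 0, and v_scan_forward returns the index of the first lane whose
+// MSB is set (typically guarded by v_check_any, since an all-zero mask yields an
+// undefined trailing-zero count). A hedged usage sketch, assuming 'a' and 'b'
+// are v_uint8x16 values:
+//     v_uint8x16 m = (a == b);        // per-lane 0xFF / 0x00 mask
+//     int bits  = v_signmask(m);      // one bit per lane, lane 0 = LSB
+//     int first = v_scan_forward(m);  // index of the first matching lane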
} +inline int v_scan_forward(const v_int64x2& a) { return trailingZeros32(v_signmask(a)); } +inline int v_scan_forward(const v_uint64x2& a) { return trailingZeros32(v_signmask(a)); } +inline int v_scan_forward(const v_float64x2& a) { return trailingZeros32(v_signmask(a)); } + +#define OPENCV_HAL_IMPL_MSA_CHECK_ALLANY(_Tpvec, _Tpvec2, suffix, shift) \ +inline bool v_check_all(const v_##_Tpvec& a) \ +{ \ + _Tpvec2 v0 = msa_shrq_n_##suffix(msa_mvnq_##suffix(a.val), shift); \ + v2u64 v1 = MSA_TPV_REINTERPRET(v2u64, v0); \ + return (msa_getq_lane_u64(v1, 0) | msa_getq_lane_u64(v1, 1)) == 0; \ +} \ +inline bool v_check_any(const v_##_Tpvec& a) \ +{ \ + _Tpvec2 v0 = msa_shrq_n_##suffix(a.val, shift); \ + v2u64 v1 = MSA_TPV_REINTERPRET(v2u64, v0); \ + return (msa_getq_lane_u64(v1, 0) | msa_getq_lane_u64(v1, 1)) != 0; \ +} + +OPENCV_HAL_IMPL_MSA_CHECK_ALLANY(uint8x16, v16u8, u8, 7) +OPENCV_HAL_IMPL_MSA_CHECK_ALLANY(uint16x8, v8u16, u16, 15) +OPENCV_HAL_IMPL_MSA_CHECK_ALLANY(uint32x4, v4u32, u32, 31) +OPENCV_HAL_IMPL_MSA_CHECK_ALLANY(uint64x2, v2u64, u64, 63) + +inline bool v_check_all(const v_int8x16& a) +{ return v_check_all(v_reinterpret_as_u8(a)); } +inline bool v_check_all(const v_int16x8& a) +{ return v_check_all(v_reinterpret_as_u16(a)); } +inline bool v_check_all(const v_int32x4& a) +{ return v_check_all(v_reinterpret_as_u32(a)); } +inline bool v_check_all(const v_float32x4& a) +{ return v_check_all(v_reinterpret_as_u32(a)); } + +inline bool v_check_any(const v_int8x16& a) +{ return v_check_any(v_reinterpret_as_u8(a)); } +inline bool v_check_any(const v_int16x8& a) +{ return v_check_any(v_reinterpret_as_u16(a)); } +inline bool v_check_any(const v_int32x4& a) +{ return v_check_any(v_reinterpret_as_u32(a)); } +inline bool v_check_any(const v_float32x4& a) +{ return v_check_any(v_reinterpret_as_u32(a)); } + +inline bool v_check_all(const v_int64x2& a) +{ return v_check_all(v_reinterpret_as_u64(a)); } +inline bool v_check_all(const v_float64x2& a) +{ return v_check_all(v_reinterpret_as_u64(a)); } +inline bool v_check_any(const v_int64x2& a) +{ return v_check_any(v_reinterpret_as_u64(a)); } +inline bool v_check_any(const v_float64x2& a) +{ return v_check_any(v_reinterpret_as_u64(a)); } + +/* v_select */ +#define OPENCV_HAL_IMPL_MSA_SELECT(_Tpvec, _Tpv, _Tpvu) \ +inline _Tpvec v_select(const _Tpvec& mask, const _Tpvec& a, const _Tpvec& b) \ +{ \ + return _Tpvec(MSA_TPV_REINTERPRET(_Tpv, msa_bslq_u8(MSA_TPV_REINTERPRET(_Tpvu, mask.val), \ + MSA_TPV_REINTERPRET(_Tpvu, b.val), MSA_TPV_REINTERPRET(_Tpvu, a.val)))); \ +} + +OPENCV_HAL_IMPL_MSA_SELECT(v_uint8x16, v16u8, v16u8) +OPENCV_HAL_IMPL_MSA_SELECT(v_int8x16, v16i8, v16u8) +OPENCV_HAL_IMPL_MSA_SELECT(v_uint16x8, v8u16, v16u8) +OPENCV_HAL_IMPL_MSA_SELECT(v_int16x8, v8i16, v16u8) +OPENCV_HAL_IMPL_MSA_SELECT(v_uint32x4, v4u32, v16u8) +OPENCV_HAL_IMPL_MSA_SELECT(v_int32x4, v4i32, v16u8) +OPENCV_HAL_IMPL_MSA_SELECT(v_float32x4, v4f32, v16u8) +OPENCV_HAL_IMPL_MSA_SELECT(v_float64x2, v2f64, v16u8) + +#define OPENCV_HAL_IMPL_MSA_EXPAND(_Tpvec, _Tpwvec, _Tp, suffix, ssuffix, _Tpv, _Tpvs) \ +inline void v_expand(const _Tpvec& a, _Tpwvec& b0, _Tpwvec& b1) \ +{ \ + _Tpv a_lo = MSA_TPV_REINTERPRET(_Tpv, msa_ilvrq_##ssuffix(MSA_TPV_REINTERPRET(_Tpvs, a.val), msa_dupq_n_##ssuffix(0))); \ + _Tpv a_hi = MSA_TPV_REINTERPRET(_Tpv, msa_ilvlq_##ssuffix(MSA_TPV_REINTERPRET(_Tpvs, a.val), msa_dupq_n_##ssuffix(0))); \ + b0.val = msa_paddlq_##suffix(a_lo); \ + b1.val = msa_paddlq_##suffix(a_hi); \ +} \ +inline _Tpwvec v_expand_low(const _Tpvec& a) \ +{ \ + _Tpv a_lo = 
MSA_TPV_REINTERPRET(_Tpv, msa_ilvrq_##ssuffix(MSA_TPV_REINTERPRET(_Tpvs, a.val), msa_dupq_n_##ssuffix(0))); \ + return _Tpwvec(msa_paddlq_##suffix(a_lo)); \ +} \ +inline _Tpwvec v_expand_high(const _Tpvec& a) \ +{ \ + _Tpv a_hi = MSA_TPV_REINTERPRET(_Tpv, msa_ilvlq_##ssuffix(MSA_TPV_REINTERPRET(_Tpvs, a.val), msa_dupq_n_##ssuffix(0))); \ + return _Tpwvec(msa_paddlq_##suffix(a_hi)); \ +} \ +inline _Tpwvec v_load_expand(const _Tp* ptr) \ +{ \ + return _Tpwvec(msa_movl_##suffix(msa_ld1_##suffix(ptr))); \ +} + +OPENCV_HAL_IMPL_MSA_EXPAND(v_uint8x16, v_uint16x8, uchar, u8, s8, v16u8, v16i8) +OPENCV_HAL_IMPL_MSA_EXPAND(v_int8x16, v_int16x8, schar, s8, s8, v16i8, v16i8) +OPENCV_HAL_IMPL_MSA_EXPAND(v_uint16x8, v_uint32x4, ushort, u16, s16, v8u16, v8i16) +OPENCV_HAL_IMPL_MSA_EXPAND(v_int16x8, v_int32x4, short, s16, s16, v8i16, v8i16) +OPENCV_HAL_IMPL_MSA_EXPAND(v_uint32x4, v_uint64x2, uint, u32, s32, v4u32, v4i32) +OPENCV_HAL_IMPL_MSA_EXPAND(v_int32x4, v_int64x2, int, s32, s32, v4i32, v4i32) + +inline v_uint32x4 v_load_expand_q(const uchar* ptr) +{ + return v_uint32x4((v4u32){ptr[0], ptr[1], ptr[2], ptr[3]}); +} + +inline v_int32x4 v_load_expand_q(const schar* ptr) +{ + return v_int32x4((v4i32){ptr[0], ptr[1], ptr[2], ptr[3]}); +} + +/* v_zip, v_combine_low, v_combine_high, v_recombine */ +#define OPENCV_HAL_IMPL_MSA_UNPACKS(_Tpvec, _Tpv, _Tpvs, ssuffix) \ +inline void v_zip(const _Tpvec& a0, const _Tpvec& a1, _Tpvec& b0, _Tpvec& b1) \ +{ \ + b0.val = MSA_TPV_REINTERPRET(_Tpv, msa_ilvrq_##ssuffix(MSA_TPV_REINTERPRET(_Tpvs, a1.val), MSA_TPV_REINTERPRET(_Tpvs, a0.val))); \ + b1.val = MSA_TPV_REINTERPRET(_Tpv, msa_ilvlq_##ssuffix(MSA_TPV_REINTERPRET(_Tpvs, a1.val), MSA_TPV_REINTERPRET(_Tpvs, a0.val))); \ +} \ +inline _Tpvec v_combine_low(const _Tpvec& a, const _Tpvec& b) \ +{ \ + return _Tpvec(MSA_TPV_REINTERPRET(_Tpv, msa_ilvrq_s64(MSA_TPV_REINTERPRET(v2i64, b.val), MSA_TPV_REINTERPRET(v2i64, a.val)))); \ +} \ +inline _Tpvec v_combine_high(const _Tpvec& a, const _Tpvec& b) \ +{ \ + return _Tpvec(MSA_TPV_REINTERPRET(_Tpv, msa_ilvlq_s64(MSA_TPV_REINTERPRET(v2i64, b.val), MSA_TPV_REINTERPRET(v2i64, a.val)))); \ +} \ +inline void v_recombine(const _Tpvec& a, const _Tpvec& b, _Tpvec& c, _Tpvec& d) \ +{ \ + c.val = MSA_TPV_REINTERPRET(_Tpv, msa_ilvrq_s64(MSA_TPV_REINTERPRET(v2i64, b.val), MSA_TPV_REINTERPRET(v2i64, a.val))); \ + d.val = MSA_TPV_REINTERPRET(_Tpv, msa_ilvlq_s64(MSA_TPV_REINTERPRET(v2i64, b.val), MSA_TPV_REINTERPRET(v2i64, a.val))); \ +} + +OPENCV_HAL_IMPL_MSA_UNPACKS(v_uint8x16, v16u8, v16i8, s8) +OPENCV_HAL_IMPL_MSA_UNPACKS(v_int8x16, v16i8, v16i8, s8) +OPENCV_HAL_IMPL_MSA_UNPACKS(v_uint16x8, v8u16, v8i16, s16) +OPENCV_HAL_IMPL_MSA_UNPACKS(v_int16x8, v8i16, v8i16, s16) +OPENCV_HAL_IMPL_MSA_UNPACKS(v_uint32x4, v4u32, v4i32, s32) +OPENCV_HAL_IMPL_MSA_UNPACKS(v_int32x4, v4i32, v4i32, s32) +OPENCV_HAL_IMPL_MSA_UNPACKS(v_float32x4, v4f32, v4i32, s32) +OPENCV_HAL_IMPL_MSA_UNPACKS(v_float64x2, v2f64, v2i64, s64) + +/* v_extract */ +#define OPENCV_HAL_IMPL_MSA_EXTRACT(_Tpvec, _Tpv, _Tpvs, suffix) \ +template \ +inline _Tpvec v_extract(const _Tpvec& a, const _Tpvec& b) \ +{ \ + return _Tpvec(MSA_TPV_REINTERPRET(_Tpv, msa_extq_##suffix(MSA_TPV_REINTERPRET(_Tpvs, a.val), MSA_TPV_REINTERPRET(_Tpvs, b.val), s))); \ +} + +OPENCV_HAL_IMPL_MSA_EXTRACT(v_uint8x16, v16u8, v16i8, s8) +OPENCV_HAL_IMPL_MSA_EXTRACT(v_int8x16, v16i8, v16i8, s8) +OPENCV_HAL_IMPL_MSA_EXTRACT(v_uint16x8, v8u16, v8i16, s16) +OPENCV_HAL_IMPL_MSA_EXTRACT(v_int16x8, v8i16, v8i16, s16) +OPENCV_HAL_IMPL_MSA_EXTRACT(v_uint32x4, v4u32, 
v4i32, s32) +OPENCV_HAL_IMPL_MSA_EXTRACT(v_int32x4, v4i32, v4i32, s32) +OPENCV_HAL_IMPL_MSA_EXTRACT(v_uint64x2, v2u64, v2i64, s64) +OPENCV_HAL_IMPL_MSA_EXTRACT(v_int64x2, v2i64, v2i64, s64) +OPENCV_HAL_IMPL_MSA_EXTRACT(v_float32x4, v4f32, v4i32, s32) +OPENCV_HAL_IMPL_MSA_EXTRACT(v_float64x2, v2f64, v2i64, s64) + +/* v_round, v_floor, v_ceil, v_trunc */ +inline v_int32x4 v_round(const v_float32x4& a) +{ + return v_int32x4(msa_cvttintq_s32_f32(a.val)); +} + +inline v_int32x4 v_floor(const v_float32x4& a) +{ + v4i32 a1 = msa_cvttintq_s32_f32(a.val); + return v_int32x4(msa_addq_s32(a1, MSA_TPV_REINTERPRET(v4i32, msa_cgtq_f32(msa_cvtfintq_f32_s32(a1), a.val)))); +} + +inline v_int32x4 v_ceil(const v_float32x4& a) +{ + v4i32 a1 = msa_cvttintq_s32_f32(a.val); + return v_int32x4(msa_subq_s32(a1, MSA_TPV_REINTERPRET(v4i32, msa_cgtq_f32(a.val, msa_cvtfintq_f32_s32(a1))))); +} + +inline v_int32x4 v_trunc(const v_float32x4& a) +{ + return v_int32x4(msa_cvttruncq_s32_f32(a.val)); +} + +inline v_int32x4 v_round(const v_float64x2& a) +{ + return v_int32x4(msa_pack_s64(msa_cvttintq_s64_f64(a.val), msa_dupq_n_s64(0))); +} + +inline v_int32x4 v_round(const v_float64x2& a, const v_float64x2& b) +{ + return v_int32x4(msa_pack_s64(msa_cvttintq_s64_f64(a.val), msa_cvttintq_s64_f64(b.val))); +} + +inline v_int32x4 v_floor(const v_float64x2& a) +{ + v2f64 a1 = msa_cvtrintq_f64(a.val); + return v_int32x4(msa_pack_s64(msa_addq_s64(msa_cvttruncq_s64_f64(a1), MSA_TPV_REINTERPRET(v2i64, msa_cgtq_f64(a1, a.val))), msa_dupq_n_s64(0))); +} + +inline v_int32x4 v_ceil(const v_float64x2& a) +{ + v2f64 a1 = msa_cvtrintq_f64(a.val); + return v_int32x4(msa_pack_s64(msa_subq_s64(msa_cvttruncq_s64_f64(a1), MSA_TPV_REINTERPRET(v2i64, msa_cgtq_f64(a.val, a1))), msa_dupq_n_s64(0))); +} + +inline v_int32x4 v_trunc(const v_float64x2& a) +{ + return v_int32x4(msa_pack_s64(msa_cvttruncq_s64_f64(a.val), msa_dupq_n_s64(0))); +} + +#define OPENCV_HAL_IMPL_MSA_TRANSPOSE4x4(_Tpvec, _Tpv, _Tpvs, ssuffix) \ +inline void v_transpose4x4(const _Tpvec& a0, const _Tpvec& a1, \ + const _Tpvec& a2, const _Tpvec& a3, \ + _Tpvec& b0, _Tpvec& b1, \ + _Tpvec& b2, _Tpvec& b3) \ +{ \ + _Tpv t00 = MSA_TPV_REINTERPRET(_Tpv, msa_ilvrq_##ssuffix(MSA_TPV_REINTERPRET(_Tpvs, a1.val), MSA_TPV_REINTERPRET(_Tpvs, a0.val))); \ + _Tpv t01 = MSA_TPV_REINTERPRET(_Tpv, msa_ilvlq_##ssuffix(MSA_TPV_REINTERPRET(_Tpvs, a1.val), MSA_TPV_REINTERPRET(_Tpvs, a0.val))); \ + _Tpv t10 = MSA_TPV_REINTERPRET(_Tpv, msa_ilvrq_##ssuffix(MSA_TPV_REINTERPRET(_Tpvs, a3.val), MSA_TPV_REINTERPRET(_Tpvs, a2.val))); \ + _Tpv t11 = MSA_TPV_REINTERPRET(_Tpv, msa_ilvlq_##ssuffix(MSA_TPV_REINTERPRET(_Tpvs, a3.val), MSA_TPV_REINTERPRET(_Tpvs, a2.val))); \ + b0.val = MSA_TPV_REINTERPRET(_Tpv, msa_ilvrq_s64(MSA_TPV_REINTERPRET(v2i64, t10), MSA_TPV_REINTERPRET(v2i64, t00))); \ + b1.val = MSA_TPV_REINTERPRET(_Tpv, msa_ilvlq_s64(MSA_TPV_REINTERPRET(v2i64, t10), MSA_TPV_REINTERPRET(v2i64, t00))); \ + b2.val = MSA_TPV_REINTERPRET(_Tpv, msa_ilvrq_s64(MSA_TPV_REINTERPRET(v2i64, t11), MSA_TPV_REINTERPRET(v2i64, t01))); \ + b3.val = MSA_TPV_REINTERPRET(_Tpv, msa_ilvlq_s64(MSA_TPV_REINTERPRET(v2i64, t11), MSA_TPV_REINTERPRET(v2i64, t01))); \ +} + +OPENCV_HAL_IMPL_MSA_TRANSPOSE4x4(v_uint32x4, v4u32, v4i32, s32) +OPENCV_HAL_IMPL_MSA_TRANSPOSE4x4(v_int32x4, v4i32, v4i32, s32) +OPENCV_HAL_IMPL_MSA_TRANSPOSE4x4(v_float32x4, v4f32, v4i32, s32) + +#define OPENCV_HAL_IMPL_MSA_INTERLEAVED(_Tpvec, _Tp, suffix) \ +inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec& a, v_##_Tpvec& b) \ +{ \ + 
msa_ld2q_##suffix(ptr, &a.val, &b.val); \ +} \ +inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec& a, v_##_Tpvec& b, v_##_Tpvec& c) \ +{ \ + msa_ld3q_##suffix(ptr, &a.val, &b.val, &c.val); \ +} \ +inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec& a, v_##_Tpvec& b, \ + v_##_Tpvec& c, v_##_Tpvec& d) \ +{ \ + msa_ld4q_##suffix(ptr, &a.val, &b.val, &c.val, &d.val); \ +} \ +inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec& b, \ + hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) \ +{ \ + msa_st2q_##suffix(ptr, a.val, b.val); \ +} \ +inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec& b, \ + const v_##_Tpvec& c, hal::StoreMode /*mode*/=hal::STORE_UNALIGNED) \ +{ \ + msa_st3q_##suffix(ptr, a.val, b.val, c.val); \ +} \ +inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec& b, \ + const v_##_Tpvec& c, const v_##_Tpvec& d, \ + hal::StoreMode /*mode*/=hal::STORE_UNALIGNED ) \ +{ \ + msa_st4q_##suffix(ptr, a.val, b.val, c.val, d.val); \ +} + +OPENCV_HAL_IMPL_MSA_INTERLEAVED(uint8x16, uchar, u8) +OPENCV_HAL_IMPL_MSA_INTERLEAVED(int8x16, schar, s8) +OPENCV_HAL_IMPL_MSA_INTERLEAVED(uint16x8, ushort, u16) +OPENCV_HAL_IMPL_MSA_INTERLEAVED(int16x8, short, s16) +OPENCV_HAL_IMPL_MSA_INTERLEAVED(uint32x4, unsigned, u32) +OPENCV_HAL_IMPL_MSA_INTERLEAVED(int32x4, int, s32) +OPENCV_HAL_IMPL_MSA_INTERLEAVED(float32x4, float, f32) +OPENCV_HAL_IMPL_MSA_INTERLEAVED(uint64x2, uint64, u64) +OPENCV_HAL_IMPL_MSA_INTERLEAVED(int64x2, int64, s64) +OPENCV_HAL_IMPL_MSA_INTERLEAVED(float64x2, double, f64) + +/* v_cvt_f32, v_cvt_f64, v_cvt_f64_high */ +inline v_float32x4 v_cvt_f32(const v_int32x4& a) +{ + return v_float32x4(msa_cvtfintq_f32_s32(a.val)); +} + +inline v_float32x4 v_cvt_f32(const v_float64x2& a) +{ + return v_float32x4(msa_cvtfq_f32_f64(a.val, msa_dupq_n_f64(0.0f))); +} + +inline v_float32x4 v_cvt_f32(const v_float64x2& a, const v_float64x2& b) +{ + return v_float32x4(msa_cvtfq_f32_f64(a.val, b.val)); +} + +inline v_float64x2 v_cvt_f64(const v_int32x4& a) +{ + return v_float64x2(msa_cvtflq_f64_f32(msa_cvtfintq_f32_s32(a.val))); +} + +inline v_float64x2 v_cvt_f64_high(const v_int32x4& a) +{ + return v_float64x2(msa_cvtfhq_f64_f32(msa_cvtfintq_f32_s32(a.val))); +} + +inline v_float64x2 v_cvt_f64(const v_float32x4& a) +{ + return v_float64x2(msa_cvtflq_f64_f32(a.val)); +} + +inline v_float64x2 v_cvt_f64_high(const v_float32x4& a) +{ + return v_float64x2(msa_cvtfhq_f64_f32(a.val)); +} + +inline v_float64x2 v_cvt_f64(const v_int64x2& a) +{ + return v_float64x2(msa_cvtfintq_f64_s64(a.val)); +} + +////////////// Lookup table access //////////////////// +inline v_int8x16 v_lut(const schar* tab, const int* idx) +{ + schar CV_DECL_ALIGNED(32) elems[16] = + { + tab[idx[ 0]], + tab[idx[ 1]], + tab[idx[ 2]], + tab[idx[ 3]], + tab[idx[ 4]], + tab[idx[ 5]], + tab[idx[ 6]], + tab[idx[ 7]], + tab[idx[ 8]], + tab[idx[ 9]], + tab[idx[10]], + tab[idx[11]], + tab[idx[12]], + tab[idx[13]], + tab[idx[14]], + tab[idx[15]] + }; + return v_int8x16(msa_ld1q_s8(elems)); +} +inline v_int8x16 v_lut_pairs(const schar* tab, const int* idx) +{ + schar CV_DECL_ALIGNED(32) elems[16] = + { + tab[idx[0]], + tab[idx[0] + 1], + tab[idx[1]], + tab[idx[1] + 1], + tab[idx[2]], + tab[idx[2] + 1], + tab[idx[3]], + tab[idx[3] + 1], + tab[idx[4]], + tab[idx[4] + 1], + tab[idx[5]], + tab[idx[5] + 1], + tab[idx[6]], + tab[idx[6] + 1], + tab[idx[7]], + tab[idx[7] + 1] + }; + return v_int8x16(msa_ld1q_s8(elems)); +} +inline v_int8x16 v_lut_quads(const schar* 
tab, const int* idx) +{ + schar CV_DECL_ALIGNED(32) elems[16] = + { + tab[idx[0]], + tab[idx[0] + 1], + tab[idx[0] + 2], + tab[idx[0] + 3], + tab[idx[1]], + tab[idx[1] + 1], + tab[idx[1] + 2], + tab[idx[1] + 3], + tab[idx[2]], + tab[idx[2] + 1], + tab[idx[2] + 2], + tab[idx[2] + 3], + tab[idx[3]], + tab[idx[3] + 1], + tab[idx[3] + 2], + tab[idx[3] + 3] + }; + return v_int8x16(msa_ld1q_s8(elems)); +} +inline v_uint8x16 v_lut(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut((schar*)tab, idx)); } +inline v_uint8x16 v_lut_pairs(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut_pairs((schar*)tab, idx)); } +inline v_uint8x16 v_lut_quads(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut_quads((schar*)tab, idx)); } + + +inline v_int16x8 v_lut(const short* tab, const int* idx) +{ + short CV_DECL_ALIGNED(32) elems[8] = + { + tab[idx[0]], + tab[idx[1]], + tab[idx[2]], + tab[idx[3]], + tab[idx[4]], + tab[idx[5]], + tab[idx[6]], + tab[idx[7]] + }; + return v_int16x8(msa_ld1q_s16(elems)); +} +inline v_int16x8 v_lut_pairs(const short* tab, const int* idx) +{ + short CV_DECL_ALIGNED(32) elems[8] = + { + tab[idx[0]], + tab[idx[0] + 1], + tab[idx[1]], + tab[idx[1] + 1], + tab[idx[2]], + tab[idx[2] + 1], + tab[idx[3]], + tab[idx[3] + 1] + }; + return v_int16x8(msa_ld1q_s16(elems)); +} +inline v_int16x8 v_lut_quads(const short* tab, const int* idx) +{ + return v_int16x8(msa_combine_s16(msa_ld1_s16(tab + idx[0]), msa_ld1_s16(tab + idx[1]))); +} +inline v_uint16x8 v_lut(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut((short*)tab, idx)); } +inline v_uint16x8 v_lut_pairs(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut_pairs((short*)tab, idx)); } +inline v_uint16x8 v_lut_quads(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut_quads((short*)tab, idx)); } + +inline v_int32x4 v_lut(const int* tab, const int* idx) +{ + int CV_DECL_ALIGNED(32) elems[4] = + { + tab[idx[0]], + tab[idx[1]], + tab[idx[2]], + tab[idx[3]] + }; + return v_int32x4(msa_ld1q_s32(elems)); +} +inline v_int32x4 v_lut_pairs(const int* tab, const int* idx) +{ + return v_int32x4(msa_combine_s32(msa_ld1_s32(tab + idx[0]), msa_ld1_s32(tab + idx[1]))); +} +inline v_int32x4 v_lut_quads(const int* tab, const int* idx) +{ + return v_int32x4(msa_ld1q_s32(tab + idx[0])); +} +inline v_uint32x4 v_lut(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut((int*)tab, idx)); } +inline v_uint32x4 v_lut_pairs(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut_pairs((int*)tab, idx)); } +inline v_uint32x4 v_lut_quads(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut_quads((int*)tab, idx)); } + +inline v_int64x2 v_lut(const int64_t* tab, const int* idx) +{ + return v_int64x2(msa_combine_s64(msa_create_s64(tab[idx[0]]), msa_create_s64(tab[idx[1]]))); +} +inline v_int64x2 v_lut_pairs(const int64_t* tab, const int* idx) +{ + return v_int64x2(msa_ld1q_s64(tab + idx[0])); +} +inline v_uint64x2 v_lut(const uint64_t* tab, const int* idx) { return v_reinterpret_as_u64(v_lut((const int64_t *)tab, idx)); } +inline v_uint64x2 v_lut_pairs(const uint64_t* tab, const int* idx) { return v_reinterpret_as_u64(v_lut_pairs((const int64_t *)tab, idx)); } + +inline v_float32x4 v_lut(const float* tab, const int* idx) +{ + float CV_DECL_ALIGNED(32) elems[4] = + { + tab[idx[0]], + tab[idx[1]], + tab[idx[2]], + tab[idx[3]] + }; + return v_float32x4(msa_ld1q_f32(elems)); +} +inline v_float32x4 
v_lut_pairs(const float* tab, const int* idx) +{ + uint64 CV_DECL_ALIGNED(32) elems[2] = + { + *(uint64*)(tab + idx[0]), + *(uint64*)(tab + idx[1]) + }; + return v_float32x4(MSA_TPV_REINTERPRET(v4f32, msa_ld1q_u64(elems))); +} +inline v_float32x4 v_lut_quads(const float* tab, const int* idx) +{ + return v_float32x4(msa_ld1q_f32(tab + idx[0])); +} + +inline v_int32x4 v_lut(const int* tab, const v_int32x4& idxvec) +{ + int CV_DECL_ALIGNED(32) idx[4]; + v_store_aligned(idx, idxvec); + + return v_int32x4(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]); +} + +inline v_uint32x4 v_lut(const unsigned* tab, const v_int32x4& idxvec) +{ + unsigned CV_DECL_ALIGNED(32) elems[4] = + { + tab[msa_getq_lane_s32(idxvec.val, 0)], + tab[msa_getq_lane_s32(idxvec.val, 1)], + tab[msa_getq_lane_s32(idxvec.val, 2)], + tab[msa_getq_lane_s32(idxvec.val, 3)] + }; + return v_uint32x4(msa_ld1q_u32(elems)); +} + +inline v_float32x4 v_lut(const float* tab, const v_int32x4& idxvec) +{ + int CV_DECL_ALIGNED(32) idx[4]; + v_store_aligned(idx, idxvec); + + return v_float32x4(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]); +} + +inline void v_lut_deinterleave(const float* tab, const v_int32x4& idxvec, v_float32x4& x, v_float32x4& y) +{ + int CV_DECL_ALIGNED(32) idx[4]; + v_store_aligned(idx, idxvec); + + v4f32 xy02 = msa_combine_f32(msa_ld1_f32(tab + idx[0]), msa_ld1_f32(tab + idx[2])); + v4f32 xy13 = msa_combine_f32(msa_ld1_f32(tab + idx[1]), msa_ld1_f32(tab + idx[3])); + x = v_float32x4(MSA_TPV_REINTERPRET(v4f32, msa_ilvevq_s32(MSA_TPV_REINTERPRET(v4i32, xy13), MSA_TPV_REINTERPRET(v4i32, xy02)))); + y = v_float32x4(MSA_TPV_REINTERPRET(v4f32, msa_ilvodq_s32(MSA_TPV_REINTERPRET(v4i32, xy13), MSA_TPV_REINTERPRET(v4i32, xy02)))); +} + +inline v_int8x16 v_interleave_pairs(const v_int8x16& vec) +{ + v_int8x16 c = v_int8x16(__builtin_msa_vshf_b((v16i8)((v2i64){0x0705060403010200, 0x0F0D0E0C0B090A08}), msa_dupq_n_s8(0), vec.val)); + return c; +} +inline v_uint8x16 v_interleave_pairs(const v_uint8x16& vec) +{ return v_reinterpret_as_u8(v_interleave_pairs(v_reinterpret_as_s8(vec))); } +inline v_int8x16 v_interleave_quads(const v_int8x16& vec) +{ + v_int8x16 c = v_int8x16(__builtin_msa_vshf_b((v16i8)((v2i64){0x0703060205010400, 0x0F0B0E0A0D090C08}), msa_dupq_n_s8(0), vec.val)); + return c; +} +inline v_uint8x16 v_interleave_quads(const v_uint8x16& vec) { return v_reinterpret_as_u8(v_interleave_quads(v_reinterpret_as_s8(vec))); } + +inline v_int16x8 v_interleave_pairs(const v_int16x8& vec) +{ + v_int16x8 c = v_int16x8(__builtin_msa_vshf_h((v8i16)((v2i64){0x0003000100020000, 0x0007000500060004}), msa_dupq_n_s16(0), vec.val)); + return c; +} + +inline v_uint16x8 v_interleave_pairs(const v_uint16x8& vec) { return v_reinterpret_as_u16(v_interleave_pairs(v_reinterpret_as_s16(vec))); } + +inline v_int16x8 v_interleave_quads(const v_int16x8& vec) +{ + v_int16x8 c = v_int16x8(__builtin_msa_vshf_h((v8i16)((v2i64){0x0005000100040000, 0x0007000300060002}), msa_dupq_n_s16(0), vec.val)); + return c; +} + +inline v_uint16x8 v_interleave_quads(const v_uint16x8& vec) { return v_reinterpret_as_u16(v_interleave_quads(v_reinterpret_as_s16(vec))); } + +inline v_int32x4 v_interleave_pairs(const v_int32x4& vec) +{ + v_int32x4 c; + c.val[0] = vec.val[0]; + c.val[1] = vec.val[2]; + c.val[2] = vec.val[1]; + c.val[3] = vec.val[3]; + return c; +} + +inline v_uint32x4 v_interleave_pairs(const v_uint32x4& vec) { return v_reinterpret_as_u32(v_interleave_pairs(v_reinterpret_as_s32(vec))); } +inline v_float32x4 v_interleave_pairs(const v_float32x4& vec) { 
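+    // (editor's note, not in the original header) This float overload simply
+    // reuses the s32 lane permutation (v0,v1,v2,v3) -> (v0,v2,v1,v3) through
+    // bit-exact reinterpret casts, so no floating-point arithmetic is involved.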
return v_reinterpret_as_f32(v_interleave_pairs(v_reinterpret_as_s32(vec))); } + +inline v_int8x16 v_pack_triplets(const v_int8x16& vec) +{ + v_int8x16 c = v_int8x16(__builtin_msa_vshf_b((v16i8)((v2i64){0x0908060504020100, 0x131211100E0D0C0A}), msa_dupq_n_s8(0), vec.val)); + return c; +} + +inline v_uint8x16 v_pack_triplets(const v_uint8x16& vec) { return v_reinterpret_as_u8(v_pack_triplets(v_reinterpret_as_s8(vec))); } + +inline v_int16x8 v_pack_triplets(const v_int16x8& vec) +{ + v_int16x8 c = v_int16x8(__builtin_msa_vshf_h((v8i16)((v2i64){0x0004000200010000, 0x0009000800060005}), msa_dupq_n_s16(0), vec.val)); + return c; +} + +inline v_uint16x8 v_pack_triplets(const v_uint16x8& vec) { return v_reinterpret_as_u16(v_pack_triplets(v_reinterpret_as_s16(vec))); } +inline v_int32x4 v_pack_triplets(const v_int32x4& vec) { return vec; } +inline v_uint32x4 v_pack_triplets(const v_uint32x4& vec) { return vec; } +inline v_float32x4 v_pack_triplets(const v_float32x4& vec) { return vec; } + +inline v_float64x2 v_lut(const double* tab, const int* idx) +{ + double CV_DECL_ALIGNED(32) elems[2] = + { + tab[idx[0]], + tab[idx[1]] + }; + return v_float64x2(msa_ld1q_f64(elems)); +} + +inline v_float64x2 v_lut_pairs(const double* tab, const int* idx) +{ + return v_float64x2(msa_ld1q_f64(tab + idx[0])); +} + +inline v_float64x2 v_lut(const double* tab, const v_int32x4& idxvec) +{ + int CV_DECL_ALIGNED(32) idx[4]; + v_store_aligned(idx, idxvec); + + return v_float64x2(tab[idx[0]], tab[idx[1]]); +} + +inline void v_lut_deinterleave(const double* tab, const v_int32x4& idxvec, v_float64x2& x, v_float64x2& y) +{ + int CV_DECL_ALIGNED(32) idx[4]; + v_store_aligned(idx, idxvec); + + v2f64 xy0 = msa_ld1q_f64(tab + idx[0]); + v2f64 xy1 = msa_ld1q_f64(tab + idx[1]); + x = v_float64x2(MSA_TPV_REINTERPRET(v2f64, msa_ilvevq_s64(MSA_TPV_REINTERPRET(v2i64, xy1), MSA_TPV_REINTERPRET(v2i64, xy0)))); + y = v_float64x2(MSA_TPV_REINTERPRET(v2f64, msa_ilvodq_s64(MSA_TPV_REINTERPRET(v2i64, xy1), MSA_TPV_REINTERPRET(v2i64, xy0)))); +} + +template +inline typename _Tp::lane_type v_extract_n(const _Tp& a) +{ + return v_rotate_right(a).get0(); +} + +template +inline v_uint32x4 v_broadcast_element(const v_uint32x4& a) +{ + return v_setall_u32(v_extract_n(a)); +} +template +inline v_int32x4 v_broadcast_element(const v_int32x4& a) +{ + return v_setall_s32(v_extract_n(a)); +} +template +inline v_float32x4 v_broadcast_element(const v_float32x4& a) +{ + return v_setall_f32(v_extract_n(a)); +} + +////// FP16 support /////// +#if CV_FP16 +inline v_float32x4 v_load_expand(const float16_t* ptr) +{ +#ifndef msa_ld1_f16 + v4f16 v = (v4f16)msa_ld1_s16((const short*)ptr); +#else + v4f16 v = msa_ld1_f16((const __fp16*)ptr); +#endif + return v_float32x4(msa_cvt_f32_f16(v)); +} + +inline void v_pack_store(float16_t* ptr, const v_float32x4& v) +{ + v4f16 hv = msa_cvt_f16_f32(v.val); + +#ifndef msa_st1_f16 + msa_st1_s16((short*)ptr, (int16x4_t)hv); +#else + msa_st1_f16((__fp16*)ptr, hv); +#endif +} +#else +inline v_float32x4 v_load_expand(const float16_t* ptr) +{ + float buf[4]; + for( int i = 0; i < 4; i++ ) + buf[i] = (float)ptr[i]; + return v_load(buf); +} + +inline void v_pack_store(float16_t* ptr, const v_float32x4& v) +{ + float buf[4]; + v_store(buf, v); + for( int i = 0; i < 4; i++ ) + ptr[i] = (float16_t)buf[i]; +} +#endif + +inline void v_cleanup() {} + +CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END + +//! 
@endcond + +} + +#endif diff --git a/3rdparty/opencv/include/opencv2/core/hal/intrin_wasm.hpp b/3rdparty/opencv/include/opencv2/core/hal/intrin_wasm.hpp new file mode 100644 index 00000000..b4178af8 --- /dev/null +++ b/3rdparty/opencv/include/opencv2/core/hal/intrin_wasm.hpp @@ -0,0 +1,2782 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. + +#ifndef OPENCV_HAL_INTRIN_WASM_HPP +#define OPENCV_HAL_INTRIN_WASM_HPP + +#include +#include +#include +#include "opencv2/core/saturate.hpp" + +#define CV_SIMD128 1 +#define CV_SIMD128_64F 0 // Now all implementation of f64 use fallback, so disable it. +#define CV_SIMD128_FP16 0 + +namespace cv +{ + +//! @cond IGNORED + +CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN + +#if (__EMSCRIPTEN_major__ * 1000000 + __EMSCRIPTEN_minor__ * 1000 + __EMSCRIPTEN_tiny__) < (1038046) +// handle renames: https://github.com/emscripten-core/emscripten/pull/9440 (https://github.com/emscripten-core/emscripten/commit/755d5b46cb84d0aa120c10981b11d05646c29673) +#define wasm_i32x4_trunc_saturate_f32x4 wasm_trunc_saturate_i32x4_f32x4 +#define wasm_u32x4_trunc_saturate_f32x4 wasm_trunc_saturate_u32x4_f32x4 +#define wasm_i64x2_trunc_saturate_f64x2 wasm_trunc_saturate_i64x2_f64x2 +#define wasm_u64x2_trunc_saturate_f64x2 wasm_trunc_saturate_u64x2_f64x2 +#define wasm_f32x4_convert_i32x4 wasm_convert_f32x4_i32x4 +#define wasm_f32x4_convert_u32x4 wasm_convert_f32x4_u32x4 +#define wasm_f64x2_convert_i64x2 wasm_convert_f64x2_i64x2 +#define wasm_f64x2_convert_u64x2 wasm_convert_f64x2_u64x2 +#endif // COMPATIBILITY: <1.38.46 + +///////// Types /////////// + +struct v_uint8x16 +{ + typedef uchar lane_type; + typedef v128_t vector_type; + enum { nlanes = 16 }; + + v_uint8x16() {} + explicit v_uint8x16(v128_t v) : val(v) {} + v_uint8x16(uchar v0, uchar v1, uchar v2, uchar v3, uchar v4, uchar v5, uchar v6, uchar v7, + uchar v8, uchar v9, uchar v10, uchar v11, uchar v12, uchar v13, uchar v14, uchar v15) + { + uchar v[] = {v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15}; + val = wasm_v128_load(v); + } + + uchar get0() const + { + return (uchar)wasm_i8x16_extract_lane(val, 0); + } + + v128_t val; +}; + +struct v_int8x16 +{ + typedef schar lane_type; + typedef v128_t vector_type; + enum { nlanes = 16 }; + + v_int8x16() {} + explicit v_int8x16(v128_t v) : val(v) {} + v_int8x16(schar v0, schar v1, schar v2, schar v3, schar v4, schar v5, schar v6, schar v7, + schar v8, schar v9, schar v10, schar v11, schar v12, schar v13, schar v14, schar v15) + { + schar v[] = {v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15}; + val = wasm_v128_load(v); + } + + schar get0() const + { + return wasm_i8x16_extract_lane(val, 0); + } + + v128_t val; +}; + +struct v_uint16x8 +{ + typedef ushort lane_type; + typedef v128_t vector_type; + enum { nlanes = 8 }; + + v_uint16x8() {} + explicit v_uint16x8(v128_t v) : val(v) {} + v_uint16x8(ushort v0, ushort v1, ushort v2, ushort v3, ushort v4, ushort v5, ushort v6, ushort v7) + { + ushort v[] = {v0, v1, v2, v3, v4, v5, v6, v7}; + val = wasm_v128_load(v); + } + + ushort get0() const + { + return (ushort)wasm_i16x8_extract_lane(val, 0); // wasm_u16x8_extract_lane() unimplemented yet + } + + v128_t val; +}; + +struct v_int16x8 +{ + typedef short lane_type; + typedef v128_t vector_type; + enum { nlanes = 8 }; + + v_int16x8() {} + explicit v_int16x8(v128_t v) : val(v) {} + v_int16x8(short v0, short 
v1, short v2, short v3, short v4, short v5, short v6, short v7) + { + short v[] = {v0, v1, v2, v3, v4, v5, v6, v7}; + val = wasm_v128_load(v); + } + + short get0() const + { + return wasm_i16x8_extract_lane(val, 0); + } + + v128_t val; +}; + +struct v_uint32x4 +{ + typedef unsigned lane_type; + typedef v128_t vector_type; + enum { nlanes = 4 }; + + v_uint32x4() {} + explicit v_uint32x4(v128_t v) : val(v) {} + v_uint32x4(unsigned v0, unsigned v1, unsigned v2, unsigned v3) + { + unsigned v[] = {v0, v1, v2, v3}; + val = wasm_v128_load(v); + } + + unsigned get0() const + { + return (unsigned)wasm_i32x4_extract_lane(val, 0); + } + + v128_t val; +}; + +struct v_int32x4 +{ + typedef int lane_type; + typedef v128_t vector_type; + enum { nlanes = 4 }; + + v_int32x4() {} + explicit v_int32x4(v128_t v) : val(v) {} + v_int32x4(int v0, int v1, int v2, int v3) + { + int v[] = {v0, v1, v2, v3}; + val = wasm_v128_load(v); + } + + int get0() const + { + return wasm_i32x4_extract_lane(val, 0); + } + + v128_t val; +}; + +struct v_float32x4 +{ + typedef float lane_type; + typedef v128_t vector_type; + enum { nlanes = 4 }; + + v_float32x4() {} + explicit v_float32x4(v128_t v) : val(v) {} + v_float32x4(float v0, float v1, float v2, float v3) + { + float v[] = {v0, v1, v2, v3}; + val = wasm_v128_load(v); + } + + float get0() const + { + return wasm_f32x4_extract_lane(val, 0); + } + + v128_t val; +}; + +struct v_uint64x2 +{ + typedef uint64 lane_type; + typedef v128_t vector_type; + enum { nlanes = 2 }; + + v_uint64x2() {} + explicit v_uint64x2(v128_t v) : val(v) {} + v_uint64x2(uint64 v0, uint64 v1) + { + uint64 v[] = {v0, v1}; + val = wasm_v128_load(v); + } + + uint64 get0() const + { + return (uint64)wasm_i64x2_extract_lane(val, 0); + } + + v128_t val; +}; + +struct v_int64x2 +{ + typedef int64 lane_type; + typedef v128_t vector_type; + enum { nlanes = 2 }; + + v_int64x2() {} + explicit v_int64x2(v128_t v) : val(v) {} + v_int64x2(int64 v0, int64 v1) + { + int64 v[] = {v0, v1}; + val = wasm_v128_load(v); + } + + int64 get0() const + { + return wasm_i64x2_extract_lane(val, 0); + } + + v128_t val; +}; + +struct v_float64x2 +{ + typedef double lane_type; + typedef v128_t vector_type; + enum { nlanes = 2 }; + + v_float64x2() {} + explicit v_float64x2(v128_t v) : val(v) {} + v_float64x2(double v0, double v1) + { + double v[] = {v0, v1}; + val = wasm_v128_load(v); + } + + double get0() const + { + return wasm_f64x2_extract_lane(val, 0); + } + + v128_t val; +}; + +namespace +{ +#define OPENCV_HAL_IMPL_REINTERPRET_INT(ft, tt) \ +inline tt reinterpret_int(ft x) { union { ft l; tt i; } v; v.l = x; return v.i; } +OPENCV_HAL_IMPL_REINTERPRET_INT(uchar, schar) +OPENCV_HAL_IMPL_REINTERPRET_INT(schar, schar) +OPENCV_HAL_IMPL_REINTERPRET_INT(ushort, short) +OPENCV_HAL_IMPL_REINTERPRET_INT(short, short) +OPENCV_HAL_IMPL_REINTERPRET_INT(unsigned, int) +OPENCV_HAL_IMPL_REINTERPRET_INT(int, int) +OPENCV_HAL_IMPL_REINTERPRET_INT(float, int) +OPENCV_HAL_IMPL_REINTERPRET_INT(uint64, int64) +OPENCV_HAL_IMPL_REINTERPRET_INT(int64, int64) +OPENCV_HAL_IMPL_REINTERPRET_INT(double, int64) + +static const unsigned char popCountTable[] = +{ + 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4, + 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, + 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, + 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, + 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, + 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, + 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, + 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, + 1, 2, 
2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, + 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, + 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, + 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, + 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, + 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, + 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, + 4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8, +}; +} // namespace + +static v128_t wasm_unpacklo_i8x16(v128_t a, v128_t b) { + return wasm_v8x16_shuffle(a, b, 0,16,1,17,2,18,3,19,4,20,5,21,6,22,7,23); +} + +static v128_t wasm_unpacklo_i16x8(v128_t a, v128_t b) { + return wasm_v8x16_shuffle(a, b, 0,1,16,17,2,3,18,19,4,5,20,21,6,7,22,23); +} + +static v128_t wasm_unpacklo_i32x4(v128_t a, v128_t b) { + return wasm_v8x16_shuffle(a, b, 0,1,2,3,16,17,18,19,4,5,6,7,20,21,22,23); +} + +static v128_t wasm_unpacklo_i64x2(v128_t a, v128_t b) { + return wasm_v8x16_shuffle(a, b, 0,1,2,3,4,5,6,7,16,17,18,19,20,21,22,23); +} + +static v128_t wasm_unpackhi_i8x16(v128_t a, v128_t b) { + return wasm_v8x16_shuffle(a, b, 8,24,9,25,10,26,11,27,12,28,13,29,14,30,15,31); +} + +static v128_t wasm_unpackhi_i16x8(v128_t a, v128_t b) { + return wasm_v8x16_shuffle(a, b, 8,9,24,25,10,11,26,27,12,13,28,29,14,15,30,31); +} + +static v128_t wasm_unpackhi_i32x4(v128_t a, v128_t b) { + return wasm_v8x16_shuffle(a, b, 8,9,10,11,24,25,26,27,12,13,14,15,28,29,30,31); +} + +static v128_t wasm_unpackhi_i64x2(v128_t a, v128_t b) { + return wasm_v8x16_shuffle(a, b, 8,9,10,11,12,13,14,15,24,25,26,27,28,29,30,31); +} + +/** Convert **/ +// 8 >> 16 +inline v128_t v128_cvtu8x16_i16x8(const v128_t& a) +{ + const v128_t z = wasm_i8x16_splat(0); + return wasm_unpacklo_i8x16(a, z); +} +inline v128_t v128_cvti8x16_i16x8(const v128_t& a) +{ return wasm_i16x8_shr(wasm_unpacklo_i8x16(a, a), 8); } +// 8 >> 32 +inline v128_t v128_cvtu8x16_i32x4(const v128_t& a) +{ + const v128_t z = wasm_i8x16_splat(0); + return wasm_unpacklo_i16x8(wasm_unpacklo_i8x16(a, z), z); +} +inline v128_t v128_cvti8x16_i32x4(const v128_t& a) +{ + v128_t r = wasm_unpacklo_i8x16(a, a); + r = wasm_unpacklo_i8x16(r, r); + return wasm_i32x4_shr(r, 24); +} +// 16 >> 32 +inline v128_t v128_cvtu16x8_i32x4(const v128_t& a) +{ + const v128_t z = wasm_i8x16_splat(0); + return wasm_unpacklo_i16x8(a, z); +} +inline v128_t v128_cvti16x8_i32x4(const v128_t& a) +{ return wasm_i32x4_shr(wasm_unpacklo_i16x8(a, a), 16); } +// 32 >> 64 +inline v128_t v128_cvtu32x4_i64x2(const v128_t& a) +{ + const v128_t z = wasm_i8x16_splat(0); + return wasm_unpacklo_i32x4(a, z); +} +inline v128_t v128_cvti32x4_i64x2(const v128_t& a) +{ return wasm_unpacklo_i32x4(a, wasm_i32x4_shr(a, 31)); } + +// 16 << 8 +inline v128_t v128_cvtu8x16_i16x8_high(const v128_t& a) +{ + const v128_t z = wasm_i8x16_splat(0); + return wasm_unpackhi_i8x16(a, z); +} +inline v128_t v128_cvti8x16_i16x8_high(const v128_t& a) +{ return wasm_i16x8_shr(wasm_unpackhi_i8x16(a, a), 8); } +// 32 << 16 +inline v128_t v128_cvtu16x8_i32x4_high(const v128_t& a) +{ + const v128_t z = wasm_i8x16_splat(0); + return wasm_unpackhi_i16x8(a, z); +} +inline v128_t v128_cvti16x8_i32x4_high(const v128_t& a) +{ return wasm_i32x4_shr(wasm_unpackhi_i16x8(a, a), 16); } +// 64 << 32 +inline v128_t v128_cvtu32x4_i64x2_high(const v128_t& a) +{ + const v128_t z = wasm_i8x16_splat(0); + return wasm_unpackhi_i32x4(a, z); +} +inline v128_t v128_cvti32x4_i64x2_high(const v128_t& a) +{ return wasm_unpackhi_i32x4(a, wasm_i32x4_shr(a, 31)); } + +#define OPENCV_HAL_IMPL_WASM_INITVEC(_Tpvec, _Tp, suffix, zsuffix, _Tps) \ +inline 
_Tpvec v_setzero_##suffix() { return _Tpvec(wasm_##zsuffix##_splat((_Tps)0)); } \ +inline _Tpvec v_setall_##suffix(_Tp v) { return _Tpvec(wasm_##zsuffix##_splat((_Tps)v)); } \ +template inline _Tpvec v_reinterpret_as_##suffix(const _Tpvec0& a) \ +{ return _Tpvec(a.val); } + +OPENCV_HAL_IMPL_WASM_INITVEC(v_uint8x16, uchar, u8, i8x16, schar) +OPENCV_HAL_IMPL_WASM_INITVEC(v_int8x16, schar, s8, i8x16, schar) +OPENCV_HAL_IMPL_WASM_INITVEC(v_uint16x8, ushort, u16, i16x8, short) +OPENCV_HAL_IMPL_WASM_INITVEC(v_int16x8, short, s16, i16x8, short) +OPENCV_HAL_IMPL_WASM_INITVEC(v_uint32x4, unsigned, u32, i32x4, int) +OPENCV_HAL_IMPL_WASM_INITVEC(v_int32x4, int, s32, i32x4, int) +OPENCV_HAL_IMPL_WASM_INITVEC(v_float32x4, float, f32, f32x4, float) +OPENCV_HAL_IMPL_WASM_INITVEC(v_uint64x2, uint64, u64, i64x2, int64) +OPENCV_HAL_IMPL_WASM_INITVEC(v_int64x2, int64, s64, i64x2, int64) +OPENCV_HAL_IMPL_WASM_INITVEC(v_float64x2, double, f64, f64x2, double) + +//////////////// PACK /////////////// +inline v_uint8x16 v_pack(const v_uint16x8& a, const v_uint16x8& b) +{ + v128_t maxval = wasm_i16x8_splat(255); + v128_t a1 = wasm_v128_bitselect(maxval, a.val, wasm_u16x8_gt(a.val, maxval)); + v128_t b1 = wasm_v128_bitselect(maxval, b.val, wasm_u16x8_gt(b.val, maxval)); + return v_uint8x16(wasm_v8x16_shuffle(a1, b1, 0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30)); +} +inline v_int8x16 v_pack(const v_int16x8& a, const v_int16x8& b) +{ + v128_t maxval = wasm_i16x8_splat(127); + v128_t minval = wasm_i16x8_splat(-128); + v128_t a1 = wasm_v128_bitselect(maxval, a.val, wasm_i16x8_gt(a.val, maxval)); + v128_t b1 = wasm_v128_bitselect(maxval, b.val, wasm_i16x8_gt(b.val, maxval)); + v128_t a2 = wasm_v128_bitselect(minval, a1, wasm_i16x8_lt(a1, minval)); + v128_t b2 = wasm_v128_bitselect(minval, b1, wasm_i16x8_lt(b1, minval)); + return v_int8x16(wasm_v8x16_shuffle(a2, b2, 0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30)); +} +inline v_uint16x8 v_pack(const v_uint32x4& a, const v_uint32x4& b) +{ + v128_t maxval = wasm_i32x4_splat(65535); + v128_t a1 = wasm_v128_bitselect(maxval, a.val, wasm_u32x4_gt(a.val, maxval)); + v128_t b1 = wasm_v128_bitselect(maxval, b.val, wasm_u32x4_gt(b.val, maxval)); + return v_uint16x8(wasm_v8x16_shuffle(a1, b1, 0,1,4,5,8,9,12,13,16,17,20,21,24,25,28,29)); +} +inline v_int16x8 v_pack(const v_int32x4& a, const v_int32x4& b) +{ + v128_t maxval = wasm_i32x4_splat(32767); + v128_t minval = wasm_i32x4_splat(-32768); + v128_t a1 = wasm_v128_bitselect(maxval, a.val, wasm_i32x4_gt(a.val, maxval)); + v128_t b1 = wasm_v128_bitselect(maxval, b.val, wasm_i32x4_gt(b.val, maxval)); + v128_t a2 = wasm_v128_bitselect(minval, a1, wasm_i32x4_lt(a1, minval)); + v128_t b2 = wasm_v128_bitselect(minval, b1, wasm_i32x4_lt(b1, minval)); + return v_int16x8(wasm_v8x16_shuffle(a2, b2, 0,1,4,5,8,9,12,13,16,17,20,21,24,25,28,29)); +} +inline v_uint32x4 v_pack(const v_uint64x2& a, const v_uint64x2& b) +{ + return v_uint32x4(wasm_v8x16_shuffle(a.val, b.val, 0,1,2,3,8,9,10,11,16,17,18,19,24,25,26,27)); +} +inline v_int32x4 v_pack(const v_int64x2& a, const v_int64x2& b) +{ + return v_int32x4(wasm_v8x16_shuffle(a.val, b.val, 0,1,2,3,8,9,10,11,16,17,18,19,24,25,26,27)); +} +inline v_uint8x16 v_pack_u(const v_int16x8& a, const v_int16x8& b) +{ + v128_t maxval = wasm_i16x8_splat(255); + v128_t minval = wasm_i16x8_splat(0); + v128_t a1 = wasm_v128_bitselect(maxval, a.val, wasm_i16x8_gt(a.val, maxval)); + v128_t b1 = wasm_v128_bitselect(maxval, b.val, wasm_i16x8_gt(b.val, maxval)); + v128_t a2 = wasm_v128_bitselect(minval, a1, 
wasm_i16x8_lt(a1, minval));
+    v128_t b2 = wasm_v128_bitselect(minval, b1, wasm_i16x8_lt(b1, minval));
+    return v_uint8x16(wasm_v8x16_shuffle(a2, b2, 0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30));
+}
+inline v_uint16x8 v_pack_u(const v_int32x4& a, const v_int32x4& b)
+{
+    v128_t maxval = wasm_i32x4_splat(65535);
+    v128_t minval = wasm_i32x4_splat(0);
+    v128_t a1 = wasm_v128_bitselect(maxval, a.val, wasm_i32x4_gt(a.val, maxval));
+    v128_t b1 = wasm_v128_bitselect(maxval, b.val, wasm_i32x4_gt(b.val, maxval));
+    v128_t a2 = wasm_v128_bitselect(minval, a1, wasm_i32x4_lt(a1, minval));
+    v128_t b2 = wasm_v128_bitselect(minval, b1, wasm_i32x4_lt(b1, minval));
+    return v_uint16x8(wasm_v8x16_shuffle(a2, b2, 0,1,4,5,8,9,12,13,16,17,20,21,24,25,28,29));
+}
+
+template<int n>
+inline v_uint8x16 v_rshr_pack(const v_uint16x8& a, const v_uint16x8& b)
+{
+    v128_t delta = wasm_i16x8_splat(((short)1 << (n-1)));
+    v128_t a1 = wasm_u16x8_shr(wasm_i16x8_add(a.val, delta), n);
+    v128_t b1 = wasm_u16x8_shr(wasm_i16x8_add(b.val, delta), n);
+    v128_t maxval = wasm_i16x8_splat(255);
+    v128_t a2 = wasm_v128_bitselect(maxval, a1, wasm_u16x8_gt(a1, maxval));
+    v128_t b2 = wasm_v128_bitselect(maxval, b1, wasm_u16x8_gt(b1, maxval));
+    return v_uint8x16(wasm_v8x16_shuffle(a2, b2, 0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30));
+}
+template<int n>
+inline v_int8x16 v_rshr_pack(const v_int16x8& a, const v_int16x8& b)
+{
+    v128_t delta = wasm_i16x8_splat(((short)1 << (n-1)));
+    v128_t a1 = wasm_i16x8_shr(wasm_i16x8_add(a.val, delta), n);
+    v128_t b1 = wasm_i16x8_shr(wasm_i16x8_add(b.val, delta), n);
+    v128_t maxval = wasm_i16x8_splat(127);
+    v128_t minval = wasm_i16x8_splat(-128);
+    v128_t a2 = wasm_v128_bitselect(maxval, a1, wasm_i16x8_gt(a1, maxval));
+    v128_t b2 = wasm_v128_bitselect(maxval, b1, wasm_i16x8_gt(b1, maxval));
+    v128_t a3 = wasm_v128_bitselect(minval, a2, wasm_i16x8_lt(a1, minval));
+    v128_t b3 = wasm_v128_bitselect(minval, b2, wasm_i16x8_lt(b1, minval));
+    return v_int8x16(wasm_v8x16_shuffle(a3, b3, 0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30));
+}
+template<int n>
+inline v_uint16x8 v_rshr_pack(const v_uint32x4& a, const v_uint32x4& b)
+{
+    v128_t delta = wasm_i32x4_splat(((int)1 << (n-1)));
+    v128_t a1 = wasm_u32x4_shr(wasm_i32x4_add(a.val, delta), n);
+    v128_t b1 = wasm_u32x4_shr(wasm_i32x4_add(b.val, delta), n);
+    v128_t maxval = wasm_i32x4_splat(65535);
+    v128_t a2 = wasm_v128_bitselect(maxval, a1, wasm_u32x4_gt(a1, maxval));
+    v128_t b2 = wasm_v128_bitselect(maxval, b1, wasm_u32x4_gt(b1, maxval));
+    return v_uint16x8(wasm_v8x16_shuffle(a2, b2, 0,1,4,5,8,9,12,13,16,17,20,21,24,25,28,29));
+}
+template<int n>
+inline v_int16x8 v_rshr_pack(const v_int32x4& a, const v_int32x4& b)
+{
+    v128_t delta = wasm_i32x4_splat(((int)1 << (n-1)));
+    v128_t a1 = wasm_i32x4_shr(wasm_i32x4_add(a.val, delta), n);
+    v128_t b1 = wasm_i32x4_shr(wasm_i32x4_add(b.val, delta), n);
+    v128_t maxval = wasm_i32x4_splat(32767);
+    v128_t minval = wasm_i32x4_splat(-32768);
+    v128_t a2 = wasm_v128_bitselect(maxval, a1, wasm_i32x4_gt(a1, maxval));
+    v128_t b2 = wasm_v128_bitselect(maxval, b1, wasm_i32x4_gt(b1, maxval));
+    v128_t a3 = wasm_v128_bitselect(minval, a2, wasm_i32x4_lt(a1, minval));
+    v128_t b3 = wasm_v128_bitselect(minval, b2, wasm_i32x4_lt(b1, minval));
+    return v_int16x8(wasm_v8x16_shuffle(a3, b3, 0,1,4,5,8,9,12,13,16,17,20,21,24,25,28,29));
+}
+template<int n>
+inline v_uint32x4 v_rshr_pack(const v_uint64x2& a, const v_uint64x2& b)
+{
+    v128_t delta = wasm_i64x2_splat(((int64)1 << (n-1)));
+    v128_t a1 = wasm_u64x2_shr(wasm_i64x2_add(a.val, delta), n);
+    v128_t b1 = wasm_u64x2_shr(wasm_i64x2_add(b.val, delta), n);
+    return v_uint32x4(wasm_v8x16_shuffle(a1, b1, 0,1,2,3,8,9,10,11,16,17,18,19,24,25,26,27));
+}
+template<int n>
+inline v_int32x4 v_rshr_pack(const v_int64x2& a, const v_int64x2& b)
+{
+    v128_t delta = wasm_i64x2_splat(((int64)1 << (n-1)));
+    v128_t a1 = wasm_i64x2_shr(wasm_i64x2_add(a.val, delta), n);
+    v128_t b1 = wasm_i64x2_shr(wasm_i64x2_add(b.val, delta), n);
+    return v_int32x4(wasm_v8x16_shuffle(a1, b1, 0,1,2,3,8,9,10,11,16,17,18,19,24,25,26,27));
+}
+template<int n>
+inline v_uint8x16 v_rshr_pack_u(const v_int16x8& a, const v_int16x8& b)
+{
+    v128_t delta = wasm_i16x8_splat(((short)1 << (n-1)));
+    v128_t a1 = wasm_i16x8_shr(wasm_i16x8_add(a.val, delta), n);
+    v128_t b1 = wasm_i16x8_shr(wasm_i16x8_add(b.val, delta), n);
+    v128_t maxval = wasm_i16x8_splat(255);
+    v128_t minval = wasm_i16x8_splat(0);
+    v128_t a2 = wasm_v128_bitselect(maxval, a1, wasm_i16x8_gt(a1, maxval));
+    v128_t b2 = wasm_v128_bitselect(maxval, b1, wasm_i16x8_gt(b1, maxval));
+    v128_t a3 = wasm_v128_bitselect(minval, a2, wasm_i16x8_lt(a1, minval));
+    v128_t b3 = wasm_v128_bitselect(minval, b2, wasm_i16x8_lt(b1, minval));
+    return v_uint8x16(wasm_v8x16_shuffle(a3, b3, 0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30));
+}
+template<int n>
+inline v_uint16x8 v_rshr_pack_u(const v_int32x4& a, const v_int32x4& b)
+{
+    v128_t delta = wasm_i32x4_splat(((int)1 << (n-1)));
+    v128_t a1 = wasm_i32x4_shr(wasm_i32x4_add(a.val, delta), n);
+    v128_t b1 = wasm_i32x4_shr(wasm_i32x4_add(b.val, delta), n);
+    v128_t maxval = wasm_i32x4_splat(65535);
+    v128_t minval = wasm_i32x4_splat(0);
+    v128_t a2 = wasm_v128_bitselect(maxval, a1, wasm_i32x4_gt(a1, maxval));
+    v128_t b2 = wasm_v128_bitselect(maxval, b1, wasm_i32x4_gt(b1, maxval));
+    v128_t a3 = wasm_v128_bitselect(minval, a2, wasm_i32x4_lt(a1, minval));
+    v128_t b3 = wasm_v128_bitselect(minval, b2, wasm_i32x4_lt(b1, minval));
+    return v_uint16x8(wasm_v8x16_shuffle(a3, b3, 0,1,4,5,8,9,12,13,16,17,20,21,24,25,28,29));
+}
+
+inline void v_pack_store(uchar* ptr, const v_uint16x8& a)
+{
+    v128_t maxval = wasm_i16x8_splat(255);
+    v128_t a1 = wasm_v128_bitselect(maxval, a.val, wasm_u16x8_gt(a.val, maxval));
+    v128_t r = wasm_v8x16_shuffle(a1, a1, 0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14);
+    uchar t_ptr[16];
+    wasm_v128_store(t_ptr, r);
+    for (int i=0; i<8; ++i) {
+        ptr[i] = t_ptr[i];
+    }
+}
+inline void v_pack_store(schar* ptr, const v_int16x8& a)
+{
+    v128_t maxval = wasm_i16x8_splat(127);
+    v128_t minval = wasm_i16x8_splat(-128);
+    v128_t a1 = wasm_v128_bitselect(maxval, a.val, wasm_i16x8_gt(a.val, maxval));
+    v128_t a2 = wasm_v128_bitselect(minval, a1, wasm_i16x8_lt(a1, minval));
+    v128_t r = wasm_v8x16_shuffle(a2, a2, 0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14);
+    schar t_ptr[16];
+    wasm_v128_store(t_ptr, r);
+    for (int i=0; i<8; ++i) {
+        ptr[i] = t_ptr[i];
+    }
+}
+inline void v_pack_store(ushort* ptr, const v_uint32x4& a)
+{
+    v128_t maxval = wasm_i32x4_splat(65535);
+    v128_t a1 = wasm_v128_bitselect(maxval, a.val, wasm_u32x4_gt(a.val, maxval));
+    v128_t r = wasm_v8x16_shuffle(a1, a1, 0,1,4,5,8,9,12,13,0,1,4,5,8,9,12,13);
+    ushort t_ptr[8];
+    wasm_v128_store(t_ptr, r);
+    for (int i=0; i<4; ++i) {
+        ptr[i] = t_ptr[i];
+    }
+}
+inline void v_pack_store(short* ptr, const v_int32x4& a)
+{
+    v128_t maxval = wasm_i32x4_splat(32767);
+    v128_t minval = wasm_i32x4_splat(-32768);
+    v128_t a1 = wasm_v128_bitselect(maxval, a.val, wasm_i32x4_gt(a.val, maxval));
+    v128_t a2 = wasm_v128_bitselect(minval, a1, wasm_i32x4_lt(a1, minval));
+    v128_t r = wasm_v8x16_shuffle(a2, a2, 0,1,4,5,8,9,12,13,0,1,4,5,8,9,12,13);
+    short t_ptr[8];
+    wasm_v128_store(t_ptr, r);
+    for (int i=0; i<4; ++i) {
+        ptr[i] = t_ptr[i];
+    }
+}
+inline void v_pack_store(unsigned* ptr, const v_uint64x2& a)
+{
+    v128_t r = wasm_v8x16_shuffle(a.val, a.val, 0,1,2,3,8,9,10,11,0,1,2,3,8,9,10,11);
+    unsigned t_ptr[4];
+    wasm_v128_store(t_ptr, r);
+    for (int i=0; i<2; ++i) {
+        ptr[i] = t_ptr[i];
+    }
+}
+inline void v_pack_store(int* ptr, const v_int64x2& a)
+{
+    v128_t r = wasm_v8x16_shuffle(a.val, a.val, 0,1,2,3,8,9,10,11,0,1,2,3,8,9,10,11);
+    int t_ptr[4];
+    wasm_v128_store(t_ptr, r);
+    for (int i=0; i<2; ++i) {
+        ptr[i] = t_ptr[i];
+    }
+}
+inline void v_pack_u_store(uchar* ptr, const v_int16x8& a)
+{
+    v128_t maxval = wasm_i16x8_splat(255);
+    v128_t minval = wasm_i16x8_splat(0);
+    v128_t a1 = wasm_v128_bitselect(maxval, a.val, wasm_i16x8_gt(a.val, maxval));
+    v128_t a2 = wasm_v128_bitselect(minval, a1, wasm_i16x8_lt(a1, minval));
+    v128_t r = wasm_v8x16_shuffle(a2, a2, 0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14);
+    uchar t_ptr[16];
+    wasm_v128_store(t_ptr, r);
+    for (int i=0; i<8; ++i) {
+        ptr[i] = t_ptr[i];
+    }
+}
+inline void v_pack_u_store(ushort* ptr, const v_int32x4& a)
+{
+    v128_t maxval = wasm_i32x4_splat(65535);
+    v128_t minval = wasm_i32x4_splat(0);
+    v128_t a1 = wasm_v128_bitselect(maxval, a.val, wasm_i32x4_gt(a.val, maxval));
+    v128_t a2 = wasm_v128_bitselect(minval, a1, wasm_i32x4_lt(a1, minval));
+    v128_t r = wasm_v8x16_shuffle(a2, a2, 0,1,4,5,8,9,12,13,0,1,4,5,8,9,12,13);
+    ushort t_ptr[8];
+    wasm_v128_store(t_ptr, r);
+    for (int i=0; i<4; ++i) {
+        ptr[i] = t_ptr[i];
+    }
+}
+
+template<int n>
+inline void v_rshr_pack_store(uchar* ptr, const v_uint16x8& a)
+{
+    v128_t delta = wasm_i16x8_splat((short)(1 << (n-1)));
+    v128_t a1 = wasm_u16x8_shr(wasm_i16x8_add(a.val, delta), n);
+    v128_t maxval = wasm_i16x8_splat(255);
+    v128_t a2 = wasm_v128_bitselect(maxval, a1, wasm_u16x8_gt(a1, maxval));
+    v128_t r = wasm_v8x16_shuffle(a2, a2, 0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14);
+    uchar t_ptr[16];
+    wasm_v128_store(t_ptr, r);
+    for (int i=0; i<8; ++i) {
+        ptr[i] = t_ptr[i];
+    }
+}
+template<int n>
+inline void v_rshr_pack_store(schar* ptr, const v_int16x8& a)
+{
+    v128_t delta = wasm_i16x8_splat(((short)1 << (n-1)));
+    v128_t a1 = wasm_i16x8_shr(wasm_i16x8_add(a.val, delta), n);
+    v128_t maxval = wasm_i16x8_splat(127);
+    v128_t minval = wasm_i16x8_splat(-128);
+    v128_t a2 = wasm_v128_bitselect(maxval, a1, wasm_i16x8_gt(a1, maxval));
+    v128_t a3 = wasm_v128_bitselect(minval, a2, wasm_i16x8_lt(a1, minval));
+    v128_t r = wasm_v8x16_shuffle(a3, a3, 0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14);
+    schar t_ptr[16];
+    wasm_v128_store(t_ptr, r);
+    for (int i=0; i<8; ++i) {
+        ptr[i] = t_ptr[i];
+    }
+}
+template<int n>
+inline void v_rshr_pack_store(ushort* ptr, const v_uint32x4& a)
+{
+    v128_t delta = wasm_i32x4_splat(((int)1 << (n-1)));
+    v128_t a1 = wasm_u32x4_shr(wasm_i32x4_add(a.val, delta), n);
+    v128_t maxval = wasm_i32x4_splat(65535);
+    v128_t a2 = wasm_v128_bitselect(maxval, a1, wasm_u32x4_gt(a1, maxval));
+    v128_t r = wasm_v8x16_shuffle(a2, a2, 0,1,4,5,8,9,12,13,0,1,4,5,8,9,12,13);
+    ushort t_ptr[8];
+    wasm_v128_store(t_ptr, r);
+    for (int i=0; i<4; ++i) {
+        ptr[i] = t_ptr[i];
+    }
+}
+template<int n>
+inline void v_rshr_pack_store(short* ptr, const v_int32x4& a)
+{
+    v128_t delta = wasm_i32x4_splat(((int)1 << (n-1)));
+    v128_t a1 = wasm_i32x4_shr(wasm_i32x4_add(a.val, delta), n);
+    v128_t maxval = wasm_i32x4_splat(32767);
+    v128_t minval = wasm_i32x4_splat(-32768);
+    v128_t a2 = wasm_v128_bitselect(maxval, a1, wasm_i32x4_gt(a1, maxval));
+    v128_t a3 = wasm_v128_bitselect(minval, a2, wasm_i32x4_lt(a1, minval));
+    v128_t r = wasm_v8x16_shuffle(a3, a3, 0,1,4,5,8,9,12,13,0,1,4,5,8,9,12,13);
+    short t_ptr[8];
+    wasm_v128_store(t_ptr, r);
+    for (int i=0; i<4; ++i) {
+        ptr[i] = t_ptr[i];
+    }
+}
+template<int n>
+inline void v_rshr_pack_store(unsigned* ptr, const v_uint64x2& a)
+{
+    v128_t delta = wasm_i64x2_splat(((int64)1 << (n-1)));
+    v128_t a1 = wasm_u64x2_shr(wasm_i64x2_add(a.val, delta), n);
+    v128_t r = wasm_v8x16_shuffle(a1, a1, 0,1,2,3,8,9,10,11,0,1,2,3,8,9,10,11);
+    unsigned t_ptr[4];
+    wasm_v128_store(t_ptr, r);
+    for (int i=0; i<2; ++i) {
+        ptr[i] = t_ptr[i];
+    }
+}
+template<int n>
+inline void v_rshr_pack_store(int* ptr, const v_int64x2& a)
+{
+    v128_t delta = wasm_i64x2_splat(((int64)1 << (n-1)));
+    v128_t a1 = wasm_i64x2_shr(wasm_i64x2_add(a.val, delta), n);
+    v128_t r = wasm_v8x16_shuffle(a1, a1, 0,1,2,3,8,9,10,11,0,1,2,3,8,9,10,11);
+    int t_ptr[4];
+    wasm_v128_store(t_ptr, r);
+    for (int i=0; i<2; ++i) {
+        ptr[i] = t_ptr[i];
+    }
+}
+template<int n>
+inline void v_rshr_pack_u_store(uchar* ptr, const v_int16x8& a)
+{
+    v128_t delta = wasm_i16x8_splat(((short)1 << (n-1)));
+    v128_t a1 = wasm_i16x8_shr(wasm_i16x8_add(a.val, delta), n);
+    v128_t maxval = wasm_i16x8_splat(255);
+    v128_t minval = wasm_i16x8_splat(0);
+    v128_t a2 = wasm_v128_bitselect(maxval, a1, wasm_i16x8_gt(a1, maxval));
+    v128_t a3 = wasm_v128_bitselect(minval, a2, wasm_i16x8_lt(a1, minval));
+    v128_t r = wasm_v8x16_shuffle(a3, a3, 0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14);
+    uchar t_ptr[16];
+    wasm_v128_store(t_ptr, r);
+    for (int i=0; i<8; ++i) {
+        ptr[i] = t_ptr[i];
+    }
+}
+template<int n>
+inline void v_rshr_pack_u_store(ushort* ptr, const v_int32x4& a)
+{
+    v128_t delta = wasm_i32x4_splat(((int)1 << (n-1)));
+    v128_t a1 = wasm_i32x4_shr(wasm_i32x4_add(a.val, delta), n);
+    v128_t maxval = wasm_i32x4_splat(65535);
+    v128_t minval = wasm_i32x4_splat(0);
+    v128_t a2 = wasm_v128_bitselect(maxval, a1, wasm_i32x4_gt(a1, maxval));
+    v128_t a3 = wasm_v128_bitselect(minval, a2, wasm_i32x4_lt(a1, minval));
+    v128_t r = wasm_v8x16_shuffle(a3, a3, 0,1,4,5,8,9,12,13,0,1,4,5,8,9,12,13);
+    ushort t_ptr[8];
+    wasm_v128_store(t_ptr, r);
+    for (int i=0; i<4; ++i) {
+        ptr[i] = t_ptr[i];
+    }
+}
+
+inline v_uint8x16 v_pack_b(const v_uint16x8& a, const v_uint16x8& b)
+{
+    v128_t maxval = wasm_i16x8_splat(255);
+    v128_t a1 = wasm_v128_bitselect(maxval, a.val, wasm_u16x8_gt(a.val, maxval));
+    v128_t b1 = wasm_v128_bitselect(maxval, b.val, wasm_u16x8_gt(b.val, maxval));
+    return v_uint8x16(wasm_v8x16_shuffle(a1, b1, 0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30));
+}
+
+inline v_uint8x16 v_pack_b(const v_uint32x4& a, const v_uint32x4& b,
+                           const v_uint32x4& c, const v_uint32x4& d)
+{
+    v128_t maxval = wasm_i32x4_splat(255);
+    v128_t a1 = wasm_v128_bitselect(maxval, a.val, wasm_u32x4_gt(a.val, maxval));
+    v128_t b1 = wasm_v128_bitselect(maxval, b.val, wasm_u32x4_gt(b.val, maxval));
+    v128_t c1 = wasm_v128_bitselect(maxval, c.val, wasm_u32x4_gt(c.val, maxval));
+    v128_t d1 = wasm_v128_bitselect(maxval, d.val, wasm_u32x4_gt(d.val, maxval));
+    v128_t ab = wasm_v8x16_shuffle(a1, b1, 0,4,8,12,16,20,24,28,0,4,8,12,16,20,24,28);
+    v128_t cd = wasm_v8x16_shuffle(c1, d1, 0,4,8,12,16,20,24,28,0,4,8,12,16,20,24,28);
+    return v_uint8x16(wasm_v8x16_shuffle(ab, cd, 0,1,2,3,4,5,6,7,16,17,18,19,20,21,22,23));
+}
+
+inline v_uint8x16 v_pack_b(const v_uint64x2& a, const v_uint64x2& b, const v_uint64x2& c,
+                           const v_uint64x2& d, const v_uint64x2& e, const v_uint64x2& f,
+                           const v_uint64x2& g, const v_uint64x2& h)
+{ + v128_t maxval = wasm_i32x4_splat(255); + v128_t a1 = wasm_v128_bitselect(maxval, a.val, ((__u64x2)(a.val) > (__u64x2)maxval)); + v128_t b1 = wasm_v128_bitselect(maxval, b.val, ((__u64x2)(b.val) > (__u64x2)maxval)); + v128_t c1 = wasm_v128_bitselect(maxval, c.val, ((__u64x2)(c.val) > (__u64x2)maxval)); + v128_t d1 = wasm_v128_bitselect(maxval, d.val, ((__u64x2)(d.val) > (__u64x2)maxval)); + v128_t e1 = wasm_v128_bitselect(maxval, e.val, ((__u64x2)(e.val) > (__u64x2)maxval)); + v128_t f1 = wasm_v128_bitselect(maxval, f.val, ((__u64x2)(f.val) > (__u64x2)maxval)); + v128_t g1 = wasm_v128_bitselect(maxval, g.val, ((__u64x2)(g.val) > (__u64x2)maxval)); + v128_t h1 = wasm_v128_bitselect(maxval, h.val, ((__u64x2)(h.val) > (__u64x2)maxval)); + v128_t ab = wasm_v8x16_shuffle(a1, b1, 0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24); + v128_t cd = wasm_v8x16_shuffle(c1, d1, 0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24); + v128_t ef = wasm_v8x16_shuffle(e1, f1, 0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24); + v128_t gh = wasm_v8x16_shuffle(g1, h1, 0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24); + v128_t abcd = wasm_v8x16_shuffle(ab, cd, 0,1,2,3,16,17,18,19,0,1,2,3,16,17,18,19); + v128_t efgh = wasm_v8x16_shuffle(ef, gh, 0,1,2,3,16,17,18,19,0,1,2,3,16,17,18,19); + return v_uint8x16(wasm_v8x16_shuffle(abcd, efgh, 0,1,2,3,4,5,6,7,16,17,18,19,20,21,22,23)); +} + +inline v_float32x4 v_matmul(const v_float32x4& v, const v_float32x4& m0, + const v_float32x4& m1, const v_float32x4& m2, + const v_float32x4& m3) +{ + v128_t v0 = wasm_f32x4_splat(wasm_f32x4_extract_lane(v.val, 0)); + v128_t v1 = wasm_f32x4_splat(wasm_f32x4_extract_lane(v.val, 1)); + v128_t v2 = wasm_f32x4_splat(wasm_f32x4_extract_lane(v.val, 2)); + v128_t v3 = wasm_f32x4_splat(wasm_f32x4_extract_lane(v.val, 3)); + v0 = wasm_f32x4_mul(v0, m0.val); + v1 = wasm_f32x4_mul(v1, m1.val); + v2 = wasm_f32x4_mul(v2, m2.val); + v3 = wasm_f32x4_mul(v3, m3.val); + + return v_float32x4(wasm_f32x4_add(wasm_f32x4_add(v0, v1), wasm_f32x4_add(v2, v3))); +} + +inline v_float32x4 v_matmuladd(const v_float32x4& v, const v_float32x4& m0, + const v_float32x4& m1, const v_float32x4& m2, + const v_float32x4& a) +{ + v128_t v0 = wasm_f32x4_splat(wasm_f32x4_extract_lane(v.val, 0)); + v128_t v1 = wasm_f32x4_splat(wasm_f32x4_extract_lane(v.val, 1)); + v128_t v2 = wasm_f32x4_splat(wasm_f32x4_extract_lane(v.val, 2)); + v0 = wasm_f32x4_mul(v0, m0.val); + v1 = wasm_f32x4_mul(v1, m1.val); + v2 = wasm_f32x4_mul(v2, m2.val); + + return v_float32x4(wasm_f32x4_add(wasm_f32x4_add(v0, v1), wasm_f32x4_add(v2, a.val))); +} + +#define OPENCV_HAL_IMPL_WASM_BIN_OP(bin_op, _Tpvec, intrin) \ +inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \ +{ \ + return _Tpvec(intrin(a.val, b.val)); \ +} \ +inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b) \ +{ \ + a.val = intrin(a.val, b.val); \ + return a; \ +} + +OPENCV_HAL_IMPL_WASM_BIN_OP(+, v_uint8x16, wasm_u8x16_add_saturate) +OPENCV_HAL_IMPL_WASM_BIN_OP(-, v_uint8x16, wasm_u8x16_sub_saturate) +OPENCV_HAL_IMPL_WASM_BIN_OP(+, v_int8x16, wasm_i8x16_add_saturate) +OPENCV_HAL_IMPL_WASM_BIN_OP(-, v_int8x16, wasm_i8x16_sub_saturate) +OPENCV_HAL_IMPL_WASM_BIN_OP(+, v_uint16x8, wasm_u16x8_add_saturate) +OPENCV_HAL_IMPL_WASM_BIN_OP(-, v_uint16x8, wasm_u16x8_sub_saturate) +OPENCV_HAL_IMPL_WASM_BIN_OP(+, v_int16x8, wasm_i16x8_add_saturate) +OPENCV_HAL_IMPL_WASM_BIN_OP(-, v_int16x8, wasm_i16x8_sub_saturate) +OPENCV_HAL_IMPL_WASM_BIN_OP(+, v_uint32x4, wasm_i32x4_add) +OPENCV_HAL_IMPL_WASM_BIN_OP(-, v_uint32x4, wasm_i32x4_sub) 
+OPENCV_HAL_IMPL_WASM_BIN_OP(*, v_uint32x4, wasm_i32x4_mul) +OPENCV_HAL_IMPL_WASM_BIN_OP(+, v_int32x4, wasm_i32x4_add) +OPENCV_HAL_IMPL_WASM_BIN_OP(-, v_int32x4, wasm_i32x4_sub) +OPENCV_HAL_IMPL_WASM_BIN_OP(*, v_int32x4, wasm_i32x4_mul) +OPENCV_HAL_IMPL_WASM_BIN_OP(+, v_float32x4, wasm_f32x4_add) +OPENCV_HAL_IMPL_WASM_BIN_OP(-, v_float32x4, wasm_f32x4_sub) +OPENCV_HAL_IMPL_WASM_BIN_OP(*, v_float32x4, wasm_f32x4_mul) +OPENCV_HAL_IMPL_WASM_BIN_OP(/, v_float32x4, wasm_f32x4_div) +OPENCV_HAL_IMPL_WASM_BIN_OP(+, v_uint64x2, wasm_i64x2_add) +OPENCV_HAL_IMPL_WASM_BIN_OP(-, v_uint64x2, wasm_i64x2_sub) +OPENCV_HAL_IMPL_WASM_BIN_OP(+, v_int64x2, wasm_i64x2_add) +OPENCV_HAL_IMPL_WASM_BIN_OP(-, v_int64x2, wasm_i64x2_sub) +OPENCV_HAL_IMPL_WASM_BIN_OP(+, v_float64x2, wasm_f64x2_add) +OPENCV_HAL_IMPL_WASM_BIN_OP(-, v_float64x2, wasm_f64x2_sub) +OPENCV_HAL_IMPL_WASM_BIN_OP(*, v_float64x2, wasm_f64x2_mul) +OPENCV_HAL_IMPL_WASM_BIN_OP(/, v_float64x2, wasm_f64x2_div) + +// saturating multiply 8-bit, 16-bit +#define OPENCV_HAL_IMPL_WASM_MUL_SAT(_Tpvec, _Tpwvec) \ +inline _Tpvec operator * (const _Tpvec& a, const _Tpvec& b) \ +{ \ + _Tpwvec c, d; \ + v_mul_expand(a, b, c, d); \ + return v_pack(c, d); \ +} \ +inline _Tpvec& operator *= (_Tpvec& a, const _Tpvec& b) \ +{ a = a * b; return a; } + +OPENCV_HAL_IMPL_WASM_MUL_SAT(v_uint8x16, v_uint16x8) +OPENCV_HAL_IMPL_WASM_MUL_SAT(v_int8x16, v_int16x8) +OPENCV_HAL_IMPL_WASM_MUL_SAT(v_uint16x8, v_uint32x4) +OPENCV_HAL_IMPL_WASM_MUL_SAT(v_int16x8, v_int32x4) + +// Multiply and expand +inline void v_mul_expand(const v_uint8x16& a, const v_uint8x16& b, + v_uint16x8& c, v_uint16x8& d) +{ + v_uint16x8 a0, a1, b0, b1; + v_expand(a, a0, a1); + v_expand(b, b0, b1); + c = v_mul_wrap(a0, b0); + d = v_mul_wrap(a1, b1); +} + +inline void v_mul_expand(const v_int8x16& a, const v_int8x16& b, + v_int16x8& c, v_int16x8& d) +{ + v_int16x8 a0, a1, b0, b1; + v_expand(a, a0, a1); + v_expand(b, b0, b1); + c = v_mul_wrap(a0, b0); + d = v_mul_wrap(a1, b1); +} + +inline void v_mul_expand(const v_int16x8& a, const v_int16x8& b, + v_int32x4& c, v_int32x4& d) +{ + v_int32x4 a0, a1, b0, b1; + v_expand(a, a0, a1); + v_expand(b, b0, b1); + c.val = wasm_i32x4_mul(a0.val, b0.val); + d.val = wasm_i32x4_mul(a1.val, b1.val); +} + +inline void v_mul_expand(const v_uint16x8& a, const v_uint16x8& b, + v_uint32x4& c, v_uint32x4& d) +{ + v_uint32x4 a0, a1, b0, b1; + v_expand(a, a0, a1); + v_expand(b, b0, b1); + c.val = wasm_i32x4_mul(a0.val, b0.val); + d.val = wasm_i32x4_mul(a1.val, b1.val); +} + +inline void v_mul_expand(const v_uint32x4& a, const v_uint32x4& b, + v_uint64x2& c, v_uint64x2& d) +{ + v_uint64x2 a0, a1, b0, b1; + v_expand(a, a0, a1); + v_expand(b, b0, b1); + c.val = ((__u64x2)(a0.val) * (__u64x2)(b0.val)); + d.val = ((__u64x2)(a1.val) * (__u64x2)(b1.val)); +} + +inline v_int16x8 v_mul_hi(const v_int16x8& a, const v_int16x8& b) +{ + v_int32x4 a0, a1, b0, b1; + v_expand(a, a0, a1); + v_expand(b, b0, b1); + v128_t c = wasm_i32x4_mul(a0.val, b0.val); + v128_t d = wasm_i32x4_mul(a1.val, b1.val); + return v_int16x8(wasm_v8x16_shuffle(c, d, 2,3,6,7,10,11,14,15,18,19,22,23,26,27,30,31)); +} +inline v_uint16x8 v_mul_hi(const v_uint16x8& a, const v_uint16x8& b) +{ + v_uint32x4 a0, a1, b0, b1; + v_expand(a, a0, a1); + v_expand(b, b0, b1); + v128_t c = wasm_i32x4_mul(a0.val, b0.val); + v128_t d = wasm_i32x4_mul(a1.val, b1.val); + return v_uint16x8(wasm_v8x16_shuffle(c, d, 2,3,6,7,10,11,14,15,18,19,22,23,26,27,30,31)); +} + +//////// Dot Product //////// + +inline v_int32x4 v_dotprod(const v_int16x8& a, 
const v_int16x8& b) +{ + v128_t a0 = wasm_i32x4_shr(wasm_i32x4_shl(a.val, 16), 16); + v128_t a1 = wasm_i32x4_shr(a.val, 16); + v128_t b0 = wasm_i32x4_shr(wasm_i32x4_shl(b.val, 16), 16); + v128_t b1 = wasm_i32x4_shr(b.val, 16); + v128_t c = wasm_i32x4_mul(a0, b0); + v128_t d = wasm_i32x4_mul(a1, b1); + return v_int32x4(wasm_i32x4_add(c, d)); +} + +inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c) +{ return v_dotprod(a, b) + c; } + +inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b) +{ + v128_t a0 = wasm_i64x2_shr(wasm_i64x2_shl(a.val, 32), 32); + v128_t a1 = wasm_i64x2_shr(a.val, 32); + v128_t b0 = wasm_i64x2_shr(wasm_i64x2_shl(b.val, 32), 32); + v128_t b1 = wasm_i64x2_shr(b.val, 32); + v128_t c = (v128_t)((__i64x2)a0 * (__i64x2)b0); + v128_t d = (v128_t)((__i64x2)a1 * (__i64x2)b1); + return v_int64x2(wasm_i64x2_add(c, d)); +} +inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b, const v_int64x2& c) +{ + return v_dotprod(a, b) + c; +} + +// 8 >> 32 +inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b) +{ + v128_t a0 = wasm_u16x8_shr(wasm_i16x8_shl(a.val, 8), 8); + v128_t a1 = wasm_u16x8_shr(a.val, 8); + v128_t b0 = wasm_u16x8_shr(wasm_i16x8_shl(b.val, 8), 8); + v128_t b1 = wasm_u16x8_shr(b.val, 8); + return v_uint32x4(( + v_dotprod(v_int16x8(a0), v_int16x8(b0)) + + v_dotprod(v_int16x8(a1), v_int16x8(b1))).val + ); +} +inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b, const v_uint32x4& c) +{ return v_dotprod_expand(a, b) + c; } + +inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b) +{ + v128_t a0 = wasm_i16x8_shr(wasm_i16x8_shl(a.val, 8), 8); + v128_t a1 = wasm_i16x8_shr(a.val, 8); + v128_t b0 = wasm_i16x8_shr(wasm_i16x8_shl(b.val, 8), 8); + v128_t b1 = wasm_i16x8_shr(b.val, 8); + return v_int32x4( + v_dotprod(v_int16x8(a0), v_int16x8(b0)) + + v_dotprod(v_int16x8(a1), v_int16x8(b1)) + ); +} +inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b, const v_int32x4& c) +{ return v_dotprod_expand(a, b) + c; } + +// 16 >> 64 +inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b) +{ + v128_t a0 = wasm_u32x4_shr(wasm_i32x4_shl(a.val, 16), 16); + v128_t a1 = wasm_u32x4_shr(a.val, 16); + v128_t b0 = wasm_u32x4_shr(wasm_i32x4_shl(b.val, 16), 16); + v128_t b1 = wasm_u32x4_shr(b.val, 16); + return v_uint64x2(( + v_dotprod(v_int32x4(a0), v_int32x4(b0)) + + v_dotprod(v_int32x4(a1), v_int32x4(b1))).val + ); +} +inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b, const v_uint64x2& c) +{ return v_dotprod_expand(a, b) + c; } + +inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b) +{ + v128_t a0 = wasm_i32x4_shr(wasm_i32x4_shl(a.val, 16), 16); + v128_t a1 = wasm_i32x4_shr(a.val, 16); + v128_t b0 = wasm_i32x4_shr(wasm_i32x4_shl(b.val, 16), 16); + v128_t b1 = wasm_i32x4_shr(b.val, 16); + return v_int64x2(( + v_dotprod(v_int32x4(a0), v_int32x4(b0)) + + v_dotprod(v_int32x4(a1), v_int32x4(b1))) + ); +} + +inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b, const v_int64x2& c) +{ return v_dotprod_expand(a, b) + c; } + +// 32 >> 64f +inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b) +{ return v_cvt_f64(v_dotprod(a, b)); } +inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b, const v_float64x2& c) +{ return v_dotprod_expand(a, b) + c; } + +//////// Fast Dot Product //////// + +// 16 >> 32 +inline v_int32x4 
v_dotprod_fast(const v_int16x8& a, const v_int16x8& b) +{ return v_dotprod(a, b); } +inline v_int32x4 v_dotprod_fast(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c) +{ return v_dotprod(a, b, c); } + +// 32 >> 64 +inline v_int64x2 v_dotprod_fast(const v_int32x4& a, const v_int32x4& b) +{ return v_dotprod(a, b); } +inline v_int64x2 v_dotprod_fast(const v_int32x4& a, const v_int32x4& b, const v_int64x2& c) +{ return v_dotprod(a, b, c); } + +// 8 >> 32 +inline v_uint32x4 v_dotprod_expand_fast(const v_uint8x16& a, const v_uint8x16& b) +{ return v_dotprod_expand(a, b); } +inline v_uint32x4 v_dotprod_expand_fast(const v_uint8x16& a, const v_uint8x16& b, const v_uint32x4& c) +{ return v_dotprod_expand(a, b, c); } +inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b) +{ return v_dotprod_expand(a, b); } +inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b, const v_int32x4& c) +{ return v_dotprod_expand(a, b, c); } + +// 16 >> 64 +inline v_uint64x2 v_dotprod_expand_fast(const v_uint16x8& a, const v_uint16x8& b) +{ return v_dotprod_expand(a, b); } +inline v_uint64x2 v_dotprod_expand_fast(const v_uint16x8& a, const v_uint16x8& b, const v_uint64x2& c) +{ return v_dotprod_expand(a, b, c); } +inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b) +{ return v_dotprod_expand(a, b); } +inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b, const v_int64x2& c) +{ return v_dotprod_expand(a, b, c); } + +// 32 >> 64f +inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b) +{ return v_dotprod_expand(a, b); } +inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b, const v_float64x2& c) +{ return v_dotprod_expand(a, b, c); } + +#define OPENCV_HAL_IMPL_WASM_LOGIC_OP(_Tpvec) \ +OPENCV_HAL_IMPL_WASM_BIN_OP(&, _Tpvec, wasm_v128_and) \ +OPENCV_HAL_IMPL_WASM_BIN_OP(|, _Tpvec, wasm_v128_or) \ +OPENCV_HAL_IMPL_WASM_BIN_OP(^, _Tpvec, wasm_v128_xor) \ +inline _Tpvec operator ~ (const _Tpvec& a) \ +{ \ + return _Tpvec(wasm_v128_not(a.val)); \ +} + +OPENCV_HAL_IMPL_WASM_LOGIC_OP(v_uint8x16) +OPENCV_HAL_IMPL_WASM_LOGIC_OP(v_int8x16) +OPENCV_HAL_IMPL_WASM_LOGIC_OP(v_uint16x8) +OPENCV_HAL_IMPL_WASM_LOGIC_OP(v_int16x8) +OPENCV_HAL_IMPL_WASM_LOGIC_OP(v_uint32x4) +OPENCV_HAL_IMPL_WASM_LOGIC_OP(v_int32x4) +OPENCV_HAL_IMPL_WASM_LOGIC_OP(v_uint64x2) +OPENCV_HAL_IMPL_WASM_LOGIC_OP(v_int64x2) +OPENCV_HAL_IMPL_WASM_LOGIC_OP(v_float32x4) +OPENCV_HAL_IMPL_WASM_LOGIC_OP(v_float64x2) + +inline v_float32x4 v_sqrt(const v_float32x4& x) +{ + return v_float32x4(wasm_f32x4_sqrt(x.val)); +} + +inline v_float32x4 v_invsqrt(const v_float32x4& x) +{ + const v128_t _1_0 = wasm_f32x4_splat(1.0); + return v_float32x4(wasm_f32x4_div(_1_0, wasm_f32x4_sqrt(x.val))); +} + +inline v_float64x2 v_sqrt(const v_float64x2& x) +{ + return v_float64x2(wasm_f64x2_sqrt(x.val)); +} + +inline v_float64x2 v_invsqrt(const v_float64x2& x) +{ + const v128_t _1_0 = wasm_f64x2_splat(1.0); + return v_float64x2(wasm_f64x2_div(_1_0, wasm_f64x2_sqrt(x.val))); +} + +#define OPENCV_HAL_IMPL_WASM_ABS_INT_FUNC(_Tpuvec, _Tpsvec, suffix, zsuffix, shiftWidth) \ +inline _Tpuvec v_abs(const _Tpsvec& x) \ +{ \ + v128_t s = wasm_##suffix##_shr(x.val, shiftWidth); \ + v128_t f = wasm_##zsuffix##_shr(x.val, shiftWidth); \ + return _Tpuvec(wasm_##zsuffix##_add(wasm_v128_xor(x.val, f), s)); \ +} + +OPENCV_HAL_IMPL_WASM_ABS_INT_FUNC(v_uint8x16, v_int8x16, u8x16, i8x16, 7) +OPENCV_HAL_IMPL_WASM_ABS_INT_FUNC(v_uint16x8, v_int16x8, 
u16x8, i16x8, 15) +OPENCV_HAL_IMPL_WASM_ABS_INT_FUNC(v_uint32x4, v_int32x4, u32x4, i32x4, 31) + +inline v_float32x4 v_abs(const v_float32x4& x) +{ return v_float32x4(wasm_f32x4_abs(x.val)); } +inline v_float64x2 v_abs(const v_float64x2& x) +{ + return v_float64x2(wasm_f64x2_abs(x.val)); +} + +// TODO: exp, log, sin, cos + +#define OPENCV_HAL_IMPL_WASM_BIN_FUNC(_Tpvec, func, intrin) \ +inline _Tpvec func(const _Tpvec& a, const _Tpvec& b) \ +{ \ + return _Tpvec(intrin(a.val, b.val)); \ +} + +OPENCV_HAL_IMPL_WASM_BIN_FUNC(v_float32x4, v_min, wasm_f32x4_min) +OPENCV_HAL_IMPL_WASM_BIN_FUNC(v_float32x4, v_max, wasm_f32x4_max) +OPENCV_HAL_IMPL_WASM_BIN_FUNC(v_float64x2, v_min, wasm_f64x2_min) +OPENCV_HAL_IMPL_WASM_BIN_FUNC(v_float64x2, v_max, wasm_f64x2_max) + +#define OPENCV_HAL_IMPL_WASM_MINMAX_S_INIT_FUNC(_Tpvec, suffix) \ +inline _Tpvec v_min(const _Tpvec& a, const _Tpvec& b) \ +{ \ + return _Tpvec(wasm_v128_bitselect(b.val, a.val, wasm_##suffix##_gt(a.val, b.val))); \ +} \ +inline _Tpvec v_max(const _Tpvec& a, const _Tpvec& b) \ +{ \ + return _Tpvec(wasm_v128_bitselect(a.val, b.val, wasm_##suffix##_gt(a.val, b.val))); \ +} + +OPENCV_HAL_IMPL_WASM_MINMAX_S_INIT_FUNC(v_int8x16, i8x16) +OPENCV_HAL_IMPL_WASM_MINMAX_S_INIT_FUNC(v_int16x8, i16x8) +OPENCV_HAL_IMPL_WASM_MINMAX_S_INIT_FUNC(v_int32x4, i32x4) + +#define OPENCV_HAL_IMPL_WASM_MINMAX_U_INIT_FUNC(_Tpvec, suffix, deltaNum) \ +inline _Tpvec v_min(const _Tpvec& a, const _Tpvec& b) \ +{ \ + v128_t delta = wasm_##suffix##_splat(deltaNum); \ + v128_t mask = wasm_##suffix##_gt(wasm_v128_xor(a.val, delta), wasm_v128_xor(b.val, delta)); \ + return _Tpvec(wasm_v128_bitselect(b.val, a.val, mask)); \ +} \ +inline _Tpvec v_max(const _Tpvec& a, const _Tpvec& b) \ +{ \ + v128_t delta = wasm_##suffix##_splat(deltaNum); \ + v128_t mask = wasm_##suffix##_gt(wasm_v128_xor(a.val, delta), wasm_v128_xor(b.val, delta)); \ + return _Tpvec(wasm_v128_bitselect(a.val, b.val, mask)); \ +} + +OPENCV_HAL_IMPL_WASM_MINMAX_U_INIT_FUNC(v_uint8x16, i8x16, (schar)0x80) +OPENCV_HAL_IMPL_WASM_MINMAX_U_INIT_FUNC(v_uint16x8, i16x8, (short)0x8000) +OPENCV_HAL_IMPL_WASM_MINMAX_U_INIT_FUNC(v_uint32x4, i32x4, (int)0x80000000) + +#define OPENCV_HAL_IMPL_WASM_INIT_CMP_OP(_Tpvec, suffix, esuffix) \ +inline _Tpvec operator == (const _Tpvec& a, const _Tpvec& b) \ +{ return _Tpvec(wasm_##esuffix##_eq(a.val, b.val)); } \ +inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \ +{ return _Tpvec(wasm_##esuffix##_ne(a.val, b.val)); } \ +inline _Tpvec operator < (const _Tpvec& a, const _Tpvec& b) \ +{ return _Tpvec(wasm_##suffix##_lt(a.val, b.val)); } \ +inline _Tpvec operator > (const _Tpvec& a, const _Tpvec& b) \ +{ return _Tpvec(wasm_##suffix##_gt(a.val, b.val)); } \ +inline _Tpvec operator <= (const _Tpvec& a, const _Tpvec& b) \ +{ return _Tpvec(wasm_##suffix##_le(a.val, b.val)); } \ +inline _Tpvec operator >= (const _Tpvec& a, const _Tpvec& b) \ +{ return _Tpvec(wasm_##suffix##_ge(a.val, b.val)); } + +OPENCV_HAL_IMPL_WASM_INIT_CMP_OP(v_uint8x16, u8x16, i8x16) +OPENCV_HAL_IMPL_WASM_INIT_CMP_OP(v_int8x16, i8x16, i8x16) +OPENCV_HAL_IMPL_WASM_INIT_CMP_OP(v_uint16x8, u16x8, i16x8) +OPENCV_HAL_IMPL_WASM_INIT_CMP_OP(v_int16x8, i16x8, i16x8) +OPENCV_HAL_IMPL_WASM_INIT_CMP_OP(v_uint32x4, u32x4, i32x4) +OPENCV_HAL_IMPL_WASM_INIT_CMP_OP(v_int32x4, i32x4, i32x4) +OPENCV_HAL_IMPL_WASM_INIT_CMP_OP(v_float32x4, f32x4, f32x4) +OPENCV_HAL_IMPL_WASM_INIT_CMP_OP(v_float64x2, f64x2, f64x2) + +#define OPENCV_HAL_IMPL_WASM_64BIT_CMP_OP(_Tpvec, cast) \ +inline _Tpvec operator == (const _Tpvec& a, const 
_Tpvec& b) \ +{ return cast(v_reinterpret_as_f64(a) == v_reinterpret_as_f64(b)); } \ +inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \ +{ return cast(v_reinterpret_as_f64(a) != v_reinterpret_as_f64(b)); } + +OPENCV_HAL_IMPL_WASM_64BIT_CMP_OP(v_uint64x2, v_reinterpret_as_u64) +OPENCV_HAL_IMPL_WASM_64BIT_CMP_OP(v_int64x2, v_reinterpret_as_s64) + +inline v_float32x4 v_not_nan(const v_float32x4& a) +{ + v128_t z = wasm_i32x4_splat(0x7fffffff); + v128_t t = wasm_i32x4_splat(0x7f800000); + return v_float32x4(wasm_u32x4_lt(wasm_v128_and(a.val, z), t)); +} +inline v_float64x2 v_not_nan(const v_float64x2& a) +{ + v128_t z = wasm_i64x2_splat(0x7fffffffffffffff); + v128_t t = wasm_i64x2_splat(0x7ff0000000000000); + return v_float64x2((__u64x2)(wasm_v128_and(a.val, z)) < (__u64x2)t); +} + +OPENCV_HAL_IMPL_WASM_BIN_FUNC(v_uint8x16, v_add_wrap, wasm_i8x16_add) +OPENCV_HAL_IMPL_WASM_BIN_FUNC(v_int8x16, v_add_wrap, wasm_i8x16_add) +OPENCV_HAL_IMPL_WASM_BIN_FUNC(v_uint16x8, v_add_wrap, wasm_i16x8_add) +OPENCV_HAL_IMPL_WASM_BIN_FUNC(v_int16x8, v_add_wrap, wasm_i16x8_add) +OPENCV_HAL_IMPL_WASM_BIN_FUNC(v_uint8x16, v_sub_wrap, wasm_i8x16_sub) +OPENCV_HAL_IMPL_WASM_BIN_FUNC(v_int8x16, v_sub_wrap, wasm_i8x16_sub) +OPENCV_HAL_IMPL_WASM_BIN_FUNC(v_uint16x8, v_sub_wrap, wasm_i16x8_sub) +OPENCV_HAL_IMPL_WASM_BIN_FUNC(v_int16x8, v_sub_wrap, wasm_i16x8_sub) +#if (__EMSCRIPTEN_major__ * 1000000 + __EMSCRIPTEN_minor__ * 1000 + __EMSCRIPTEN_tiny__) >= (1039012) +// details: https://github.com/opencv/opencv/issues/18097 ( https://github.com/emscripten-core/emscripten/issues/12018 ) +// 1.39.12: https://github.com/emscripten-core/emscripten/commit/cd801d0f110facfd694212a3c8b2ed2ffcd630e2 +inline v_uint8x16 v_mul_wrap(const v_uint8x16& a, const v_uint8x16& b) +{ + uchar a_[16], b_[16]; + wasm_v128_store(a_, a.val); + wasm_v128_store(b_, b.val); + for (int i = 0; i < 16; i++) + a_[i] = (uchar)(a_[i] * b_[i]); + return v_uint8x16(wasm_v128_load(a_)); +} +inline v_int8x16 v_mul_wrap(const v_int8x16& a, const v_int8x16& b) +{ + schar a_[16], b_[16]; + wasm_v128_store(a_, a.val); + wasm_v128_store(b_, b.val); + for (int i = 0; i < 16; i++) + a_[i] = (schar)(a_[i] * b_[i]); + return v_int8x16(wasm_v128_load(a_)); +} +#else +OPENCV_HAL_IMPL_WASM_BIN_FUNC(v_uint8x16, v_mul_wrap, wasm_i8x16_mul) +OPENCV_HAL_IMPL_WASM_BIN_FUNC(v_int8x16, v_mul_wrap, wasm_i8x16_mul) +#endif +OPENCV_HAL_IMPL_WASM_BIN_FUNC(v_uint16x8, v_mul_wrap, wasm_i16x8_mul) +OPENCV_HAL_IMPL_WASM_BIN_FUNC(v_int16x8, v_mul_wrap, wasm_i16x8_mul) + + +/** Absolute difference **/ + +inline v_uint8x16 v_absdiff(const v_uint8x16& a, const v_uint8x16& b) +{ return v_add_wrap(a - b, b - a); } +inline v_uint16x8 v_absdiff(const v_uint16x8& a, const v_uint16x8& b) +{ return v_add_wrap(a - b, b - a); } +inline v_uint32x4 v_absdiff(const v_uint32x4& a, const v_uint32x4& b) +{ return v_max(a, b) - v_min(a, b); } + +inline v_uint8x16 v_absdiff(const v_int8x16& a, const v_int8x16& b) +{ + v_int8x16 d = v_sub_wrap(a, b); + v_int8x16 m = a < b; + return v_reinterpret_as_u8(v_sub_wrap(d ^ m, m)); +} +inline v_uint16x8 v_absdiff(const v_int16x8& a, const v_int16x8& b) +{ + return v_reinterpret_as_u16(v_sub_wrap(v_max(a, b), v_min(a, b))); +} +inline v_uint32x4 v_absdiff(const v_int32x4& a, const v_int32x4& b) +{ + v_int32x4 d = a - b; + v_int32x4 m = a < b; + return v_reinterpret_as_u32((d ^ m) - m); +} + +/** Saturating absolute difference **/ +inline v_int8x16 v_absdiffs(const v_int8x16& a, const v_int8x16& b) +{ + v_int8x16 d = a - b; + v_int8x16 m = a < b; + return 
(d ^ m) - m;
+}
+inline v_int16x8 v_absdiffs(const v_int16x8& a, const v_int16x8& b)
+{ return v_max(a, b) - v_min(a, b); }
+
+
+inline v_int32x4 v_fma(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c)
+{
+    return a * b + c;
+}
+
+inline v_int32x4 v_muladd(const v_int32x4& a, const v_int32x4& b, const v_int32x4& c)
+{
+    return v_fma(a, b, c);
+}
+
+inline v_float32x4 v_fma(const v_float32x4& a, const v_float32x4& b, const v_float32x4& c)
+{
+    return a * b + c;
+}
+
+inline v_float64x2 v_fma(const v_float64x2& a, const v_float64x2& b, const v_float64x2& c)
+{
+    return a * b + c;
+}
+
+inline v_float32x4 v_absdiff(const v_float32x4& a, const v_float32x4& b)
+{
+    v128_t absmask_vec = wasm_i32x4_splat(0x7fffffff);
+    return v_float32x4(wasm_v128_and(wasm_f32x4_sub(a.val, b.val), absmask_vec));
+}
+inline v_float64x2 v_absdiff(const v_float64x2& a, const v_float64x2& b)
+{
+    v128_t absmask_vec = wasm_u64x2_shr(wasm_i32x4_splat(-1), 1);
+    return v_float64x2(wasm_v128_and(wasm_f64x2_sub(a.val, b.val), absmask_vec));
+}
+
+#define OPENCV_HAL_IMPL_WASM_MISC_FLT_OP(_Tpvec, suffix) \
+inline _Tpvec v_magnitude(const _Tpvec& a, const _Tpvec& b) \
+{ \
+    v128_t a_Square = wasm_##suffix##_mul(a.val, a.val); \
+    v128_t b_Square = wasm_##suffix##_mul(b.val, b.val); \
+    return _Tpvec(wasm_##suffix##_sqrt(wasm_##suffix##_add(a_Square, b_Square))); \
+} \
+inline _Tpvec v_sqr_magnitude(const _Tpvec& a, const _Tpvec& b) \
+{ \
+    v128_t a_Square = wasm_##suffix##_mul(a.val, a.val); \
+    v128_t b_Square = wasm_##suffix##_mul(b.val, b.val); \
+    return _Tpvec(wasm_##suffix##_add(a_Square, b_Square)); \
+} \
+inline _Tpvec v_muladd(const _Tpvec& a, const _Tpvec& b, const _Tpvec& c) \
+{ \
+    return _Tpvec(wasm_##suffix##_add(wasm_##suffix##_mul(a.val, b.val), c.val)); \
+}
+
+OPENCV_HAL_IMPL_WASM_MISC_FLT_OP(v_float32x4, f32x4)
+OPENCV_HAL_IMPL_WASM_MISC_FLT_OP(v_float64x2, f64x2)
+
+#define OPENCV_HAL_IMPL_WASM_SHIFT_OP(_Tpuvec, _Tpsvec, suffix, ssuffix) \
+inline _Tpuvec operator << (const _Tpuvec& a, int imm) \
+{ \
+    return _Tpuvec(wasm_##suffix##_shl(a.val, imm)); \
+} \
+inline _Tpsvec operator << (const _Tpsvec& a, int imm) \
+{ \
+    return _Tpsvec(wasm_##suffix##_shl(a.val, imm)); \
+} \
+inline _Tpuvec operator >> (const _Tpuvec& a, int imm) \
+{ \
+    return _Tpuvec(wasm_##ssuffix##_shr(a.val, imm)); \
+} \
+inline _Tpsvec operator >> (const _Tpsvec& a, int imm) \
+{ \
+    return _Tpsvec(wasm_##suffix##_shr(a.val, imm)); \
+} \
+template<int imm> \
+inline _Tpuvec v_shl(const _Tpuvec& a) \
+{ \
+    return _Tpuvec(wasm_##suffix##_shl(a.val, imm)); \
+} \
+template<int imm> \
+inline _Tpsvec v_shl(const _Tpsvec& a) \
+{ \
+    return _Tpsvec(wasm_##suffix##_shl(a.val, imm)); \
+} \
+template<int imm> \
+inline _Tpuvec v_shr(const _Tpuvec& a) \
+{ \
+    return _Tpuvec(wasm_##ssuffix##_shr(a.val, imm)); \
+} \
+template<int imm> \
+inline _Tpsvec v_shr(const _Tpsvec& a) \
+{ \
+    return _Tpsvec(wasm_##suffix##_shr(a.val, imm)); \
+}
+
+OPENCV_HAL_IMPL_WASM_SHIFT_OP(v_uint8x16, v_int8x16, i8x16, u8x16)
+OPENCV_HAL_IMPL_WASM_SHIFT_OP(v_uint16x8, v_int16x8, i16x8, u16x8)
+OPENCV_HAL_IMPL_WASM_SHIFT_OP(v_uint32x4, v_int32x4, i32x4, u32x4)
+OPENCV_HAL_IMPL_WASM_SHIFT_OP(v_uint64x2, v_int64x2, i64x2, u64x2)
+
+namespace hal_wasm_internal
+{
+    template <int imm,
+        bool is_invalid = ((imm < 0) || (imm > 16)),
+        bool is_first = (imm == 0),
+        bool is_second = (imm == 16),
+        bool is_other = (((imm > 0) && (imm < 16)))>
+    class v_wasm_palignr_u8_class;
+
+    template <int imm>
+    class v_wasm_palignr_u8_class<imm, true, false, false, false>;
+
+    template <int imm>
+    class v_wasm_palignr_u8_class<imm, false, true, false, false>
+    {
+    public:
+        inline v128_t operator()(const v128_t& a, const v128_t&) const
+        {
+            return a;
+        }
+    };
+
+    template <int imm>
+    class v_wasm_palignr_u8_class<imm, false, false, true, false>
+    {
+    public:
+        inline v128_t operator()(const v128_t&, const v128_t& b) const
+        {
+            return b;
+        }
+    };
+
+    template <int imm>
+    class v_wasm_palignr_u8_class<imm, false, false, false, true>
+    {
+    public:
+        inline v128_t operator()(const v128_t& a, const v128_t& b) const
+        {
+            enum { imm2 = (sizeof(v128_t) - imm) };
+            return wasm_v8x16_shuffle(a, b,
+                                      imm, imm+1, imm+2, imm+3,
+                                      imm+4, imm+5, imm+6, imm+7,
+                                      imm+8, imm+9, imm+10, imm+11,
+                                      imm+12, imm+13, imm+14, imm+15);
+        }
+    };
+
+    template <int imm>
+    inline v128_t v_wasm_palignr_u8(const v128_t& a, const v128_t& b)
+    {
+        CV_StaticAssert((imm >= 0) && (imm <= 16), "Invalid imm for v_wasm_palignr_u8.");
+        return v_wasm_palignr_u8_class<imm>()(a, b);
+    }
+}
+
+template<int imm, typename _Tpvec>
+inline _Tpvec v_rotate_right(const _Tpvec &a)
+{
+    using namespace hal_wasm_internal;
+    enum { imm2 = (imm * sizeof(typename _Tpvec::lane_type)) };
+    v128_t z = wasm_i8x16_splat(0);
+    return _Tpvec(v_wasm_palignr_u8<imm2>(a.val, z));
+}
+
+template<int imm, typename _Tpvec>
+inline _Tpvec v_rotate_left(const _Tpvec &a)
+{
+    using namespace hal_wasm_internal;
+    enum { imm2 = ((_Tpvec::nlanes - imm) * sizeof(typename _Tpvec::lane_type)) };
+    v128_t z = wasm_i8x16_splat(0);
+    return _Tpvec(v_wasm_palignr_u8<imm2>(z, a.val));
+}
+
+template<int imm, typename _Tpvec>
+inline _Tpvec v_rotate_right(const _Tpvec &a, const _Tpvec &b)
+{
+    using namespace hal_wasm_internal;
+    enum { imm2 = (imm * sizeof(typename _Tpvec::lane_type)) };
+    return _Tpvec(v_wasm_palignr_u8<imm2>(a.val, b.val));
+}
+
+template<int imm, typename _Tpvec>
+inline _Tpvec v_rotate_left(const _Tpvec &a, const _Tpvec &b)
+{
+    using namespace hal_wasm_internal;
+    enum { imm2 = ((_Tpvec::nlanes - imm) * sizeof(typename _Tpvec::lane_type)) };
+    return _Tpvec(v_wasm_palignr_u8<imm2>(b.val, a.val));
+}
+
+#define OPENCV_HAL_IMPL_WASM_LOADSTORE_INT_OP(_Tpvec, _Tp) \
+inline _Tpvec v_load(const _Tp* ptr) \
+{ return _Tpvec(wasm_v128_load(ptr)); } \
+inline _Tpvec v_load_aligned(const _Tp* ptr) \
+{ return _Tpvec(wasm_v128_load(ptr)); } \
+inline _Tpvec v_load_low(const _Tp* ptr) \
+{ \
+    _Tp tmp[_Tpvec::nlanes] = {0}; \
+    for (int i=0; i<_Tpvec::nlanes/2; ++i) { \
+        tmp[i] = ptr[i]; \
+    } \
+    return _Tpvec(wasm_v128_load(tmp)); \
+} \
+inline _Tpvec v_load_halves(const _Tp* ptr0, const _Tp* ptr1) \
+{ \
+    _Tp tmp[_Tpvec::nlanes]; \
+    for (int i=0; i<_Tpvec::nlanes/2; ++i) { \
+        tmp[i] = ptr0[i]; \
+        tmp[i+_Tpvec::nlanes/2] = ptr1[i]; \
+    } \
+    return _Tpvec(wasm_v128_load(tmp)); \
+} \
+inline void v_store(_Tp* ptr, const _Tpvec& a) \
+{ wasm_v128_store(ptr, a.val); } \
+inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \
+{ wasm_v128_store(ptr, a.val); } \
+inline void v_store_aligned_nocache(_Tp* ptr, const _Tpvec& a) \
+{ wasm_v128_store(ptr, a.val); } \
+inline void v_store(_Tp* ptr, const _Tpvec& a, hal::StoreMode /*mode*/) \
+{ \
+    wasm_v128_store(ptr, a.val); \
+} \
+inline void v_store_low(_Tp* ptr, const _Tpvec& a) \
+{ \
+    _Tpvec::lane_type a_[_Tpvec::nlanes]; \
+    wasm_v128_store(a_, a.val); \
+    for (int i = 0; i < (_Tpvec::nlanes / 2); i++) \
+        ptr[i] = a_[i]; \
+} \
+inline void v_store_high(_Tp* ptr, const _Tpvec& a) \
+{ \
+    _Tpvec::lane_type a_[_Tpvec::nlanes]; \
+    wasm_v128_store(a_, a.val); \
+    for (int i = 0; i < (_Tpvec::nlanes / 2); i++) \
+        ptr[i] = a_[i + (_Tpvec::nlanes / 2)]; \
+}
+
+OPENCV_HAL_IMPL_WASM_LOADSTORE_INT_OP(v_uint8x16, uchar)
+OPENCV_HAL_IMPL_WASM_LOADSTORE_INT_OP(v_int8x16, schar)
+OPENCV_HAL_IMPL_WASM_LOADSTORE_INT_OP(v_uint16x8, ushort)
+OPENCV_HAL_IMPL_WASM_LOADSTORE_INT_OP(v_int16x8, short)
+OPENCV_HAL_IMPL_WASM_LOADSTORE_INT_OP(v_uint32x4,
unsigned) +OPENCV_HAL_IMPL_WASM_LOADSTORE_INT_OP(v_int32x4, int) +OPENCV_HAL_IMPL_WASM_LOADSTORE_INT_OP(v_uint64x2, uint64) +OPENCV_HAL_IMPL_WASM_LOADSTORE_INT_OP(v_int64x2, int64) +OPENCV_HAL_IMPL_WASM_LOADSTORE_INT_OP(v_float32x4, float) +OPENCV_HAL_IMPL_WASM_LOADSTORE_INT_OP(v_float64x2, double) + + +/** Reverse **/ +inline v_uint8x16 v_reverse(const v_uint8x16 &a) +{ return v_uint8x16(wasm_v8x16_shuffle(a.val, a.val, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)); } + +inline v_int8x16 v_reverse(const v_int8x16 &a) +{ return v_reinterpret_as_s8(v_reverse(v_reinterpret_as_u8(a))); } + +inline v_uint16x8 v_reverse(const v_uint16x8 &a) +{ return v_uint16x8(wasm_v8x16_shuffle(a.val, a.val, 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1)); } + +inline v_int16x8 v_reverse(const v_int16x8 &a) +{ return v_reinterpret_as_s16(v_reverse(v_reinterpret_as_u16(a))); } + +inline v_uint32x4 v_reverse(const v_uint32x4 &a) +{ return v_uint32x4(wasm_v8x16_shuffle(a.val, a.val, 12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3)); } + +inline v_int32x4 v_reverse(const v_int32x4 &a) +{ return v_reinterpret_as_s32(v_reverse(v_reinterpret_as_u32(a))); } + +inline v_float32x4 v_reverse(const v_float32x4 &a) +{ return v_reinterpret_as_f32(v_reverse(v_reinterpret_as_u32(a))); } + +inline v_uint64x2 v_reverse(const v_uint64x2 &a) +{ return v_uint64x2(wasm_v8x16_shuffle(a.val, a.val, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7)); } + +inline v_int64x2 v_reverse(const v_int64x2 &a) +{ return v_reinterpret_as_s64(v_reverse(v_reinterpret_as_u64(a))); } + +inline v_float64x2 v_reverse(const v_float64x2 &a) +{ return v_reinterpret_as_f64(v_reverse(v_reinterpret_as_u64(a))); } + + +#define OPENCV_HAL_IMPL_WASM_REDUCE_OP_4_SUM(_Tpvec, scalartype, regtype, suffix, esuffix) \ +inline scalartype v_reduce_sum(const _Tpvec& a) \ +{ \ + regtype val = a.val; \ + val = wasm_##suffix##_add(val, wasm_v8x16_shuffle(val, val, 8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7)); \ + val = wasm_##suffix##_add(val, wasm_v8x16_shuffle(val, val, 4,5,6,7,8,9,10,11,12,13,14,15,0,1,2,3)); \ + return (scalartype)wasm_##esuffix##_extract_lane(val, 0); \ +} + +OPENCV_HAL_IMPL_WASM_REDUCE_OP_4_SUM(v_uint32x4, unsigned, v128_t, i32x4, i32x4) +OPENCV_HAL_IMPL_WASM_REDUCE_OP_4_SUM(v_int32x4, int, v128_t, i32x4, i32x4) +OPENCV_HAL_IMPL_WASM_REDUCE_OP_4_SUM(v_float32x4, float, v128_t, f32x4, f32x4) + +// To do: Optimize v_reduce_sum with wasm intrin. +// Now use fallback implementation as there is no widening op in wasm intrin. 
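+// Example (for illustration only; it uses just the universal intrinsics defined
+// in this header): the fallback reduction widens each lane into the scalar
+// accumulator while summing, so a full v_uint16x8 cannot overflow a 16-bit lane.
+//
+//   ushort buf[8] = { 1, 2, 3, 4, 5, 6, 7, 8 };
+//   v_uint16x8 v = v_load(buf);
+//   unsigned sum = v_reduce_sum(v);   // sum == 36, accumulated as 'unsigned'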
+ +#define OPENCV_HAL_IMPL_FALLBACK_REDUCE_OP_SUM(_Tpvec, scalartype) \ +inline scalartype v_reduce_sum(const _Tpvec& a) \ +{ \ + _Tpvec::lane_type a_[_Tpvec::nlanes]; \ + wasm_v128_store(a_, a.val); \ + scalartype c = a_[0]; \ + for (int i = 1; i < _Tpvec::nlanes; i++) \ + c += a_[i]; \ + return c; \ +} + +OPENCV_HAL_IMPL_FALLBACK_REDUCE_OP_SUM(v_uint8x16, unsigned) +OPENCV_HAL_IMPL_FALLBACK_REDUCE_OP_SUM(v_int8x16, int) +OPENCV_HAL_IMPL_FALLBACK_REDUCE_OP_SUM(v_uint16x8, unsigned) +OPENCV_HAL_IMPL_FALLBACK_REDUCE_OP_SUM(v_int16x8, int) + + +#define OPENCV_HAL_IMPL_WASM_REDUCE_OP_2_SUM(_Tpvec, scalartype, regtype, suffix, esuffix) \ +inline scalartype v_reduce_sum(const _Tpvec& a) \ +{ \ + regtype val = a.val; \ + val = wasm_##suffix##_add(val, wasm_v8x16_shuffle(val, val, 8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7)); \ + return (scalartype)wasm_##esuffix##_extract_lane(val, 0); \ +} +OPENCV_HAL_IMPL_WASM_REDUCE_OP_2_SUM(v_uint64x2, uint64, v128_t, i64x2, i64x2) +OPENCV_HAL_IMPL_WASM_REDUCE_OP_2_SUM(v_int64x2, int64, v128_t, i64x2, i64x2) +OPENCV_HAL_IMPL_WASM_REDUCE_OP_2_SUM(v_float64x2, double, v128_t, f64x2,f64x2) + +inline v_float32x4 v_reduce_sum4(const v_float32x4& a, const v_float32x4& b, + const v_float32x4& c, const v_float32x4& d) +{ + v128_t ac = wasm_f32x4_add(wasm_unpacklo_i32x4(a.val, c.val), wasm_unpackhi_i32x4(a.val, c.val)); + v128_t bd = wasm_f32x4_add(wasm_unpacklo_i32x4(b.val, d.val), wasm_unpackhi_i32x4(b.val, d.val)); + return v_float32x4(wasm_f32x4_add(wasm_unpacklo_i32x4(ac, bd), wasm_unpackhi_i32x4(ac, bd))); +} + +#define OPENCV_HAL_IMPL_WASM_REDUCE_OP(_Tpvec, scalartype, func, scalar_func) \ +inline scalartype v_reduce_##func(const _Tpvec& a) \ +{ \ + scalartype buf[_Tpvec::nlanes]; \ + v_store(buf, a); \ + scalartype tmp = buf[0]; \ + for (int i=1; i<_Tpvec::nlanes; ++i) { \ + tmp = scalar_func(tmp, buf[i]); \ + } \ + return tmp; \ +} + +OPENCV_HAL_IMPL_WASM_REDUCE_OP(v_uint8x16, uchar, max, std::max) +OPENCV_HAL_IMPL_WASM_REDUCE_OP(v_uint8x16, uchar, min, std::min) +OPENCV_HAL_IMPL_WASM_REDUCE_OP(v_int8x16, schar, max, std::max) +OPENCV_HAL_IMPL_WASM_REDUCE_OP(v_int8x16, schar, min, std::min) +OPENCV_HAL_IMPL_WASM_REDUCE_OP(v_uint16x8, ushort, max, std::max) +OPENCV_HAL_IMPL_WASM_REDUCE_OP(v_uint16x8, ushort, min, std::min) +OPENCV_HAL_IMPL_WASM_REDUCE_OP(v_int16x8, short, max, std::max) +OPENCV_HAL_IMPL_WASM_REDUCE_OP(v_int16x8, short, min, std::min) +OPENCV_HAL_IMPL_WASM_REDUCE_OP(v_uint32x4, unsigned, max, std::max) +OPENCV_HAL_IMPL_WASM_REDUCE_OP(v_uint32x4, unsigned, min, std::min) +OPENCV_HAL_IMPL_WASM_REDUCE_OP(v_int32x4, int, max, std::max) +OPENCV_HAL_IMPL_WASM_REDUCE_OP(v_int32x4, int, min, std::min) +OPENCV_HAL_IMPL_WASM_REDUCE_OP(v_float32x4, float, max, std::max) +OPENCV_HAL_IMPL_WASM_REDUCE_OP(v_float32x4, float, min, std::min) + +inline unsigned v_reduce_sad(const v_uint8x16& a, const v_uint8x16& b) +{ + v_uint16x8 l16, h16; + v_uint32x4 l16_l32, l16_h32, h16_l32, h16_h32; + v_expand(v_absdiff(a, b), l16, h16); + v_expand(l16, l16_l32, l16_h32); + v_expand(h16, h16_l32, h16_h32); + return v_reduce_sum(l16_l32+l16_h32+h16_l32+h16_h32); +} +inline unsigned v_reduce_sad(const v_int8x16& a, const v_int8x16& b) +{ + v_uint16x8 l16, h16; + v_uint32x4 l16_l32, l16_h32, h16_l32, h16_h32; + v_expand(v_absdiff(a, b), l16, h16); + v_expand(l16, l16_l32, l16_h32); + v_expand(h16, h16_l32, h16_h32); + return v_reduce_sum(l16_l32+l16_h32+h16_l32+h16_h32); +} +inline unsigned v_reduce_sad(const v_uint16x8& a, const v_uint16x8& b) +{ + v_uint32x4 l, h; + 
v_expand(v_absdiff(a, b), l, h); + return v_reduce_sum(l + h); +} +inline unsigned v_reduce_sad(const v_int16x8& a, const v_int16x8& b) +{ + v_uint32x4 l, h; + v_expand(v_absdiff(a, b), l, h); + return v_reduce_sum(l + h); +} +inline unsigned v_reduce_sad(const v_uint32x4& a, const v_uint32x4& b) +{ + return v_reduce_sum(v_absdiff(a, b)); +} +inline unsigned v_reduce_sad(const v_int32x4& a, const v_int32x4& b) +{ + return v_reduce_sum(v_absdiff(a, b)); +} +inline float v_reduce_sad(const v_float32x4& a, const v_float32x4& b) +{ + return v_reduce_sum(v_absdiff(a, b)); +} + +inline v_uint8x16 v_popcount(const v_uint8x16& a) +{ + v128_t m1 = wasm_i32x4_splat(0x55555555); + v128_t m2 = wasm_i32x4_splat(0x33333333); + v128_t m4 = wasm_i32x4_splat(0x0f0f0f0f); + v128_t p = a.val; + p = wasm_i32x4_add(wasm_v128_and(wasm_u32x4_shr(p, 1), m1), wasm_v128_and(p, m1)); + p = wasm_i32x4_add(wasm_v128_and(wasm_u32x4_shr(p, 2), m2), wasm_v128_and(p, m2)); + p = wasm_i32x4_add(wasm_v128_and(wasm_u32x4_shr(p, 4), m4), wasm_v128_and(p, m4)); + return v_uint8x16(p); +} +inline v_uint16x8 v_popcount(const v_uint16x8& a) +{ + v_uint8x16 p = v_popcount(v_reinterpret_as_u8(a)); + p += v_rotate_right<1>(p); + return v_reinterpret_as_u16(p) & v_setall_u16(0x00ff); +} +inline v_uint32x4 v_popcount(const v_uint32x4& a) +{ + v_uint8x16 p = v_popcount(v_reinterpret_as_u8(a)); + p += v_rotate_right<1>(p); + p += v_rotate_right<2>(p); + return v_reinterpret_as_u32(p) & v_setall_u32(0x000000ff); +} +inline v_uint64x2 v_popcount(const v_uint64x2& a) +{ + uint64 a_[2], b_[2] = { 0 }; + wasm_v128_store(a_, a.val); + for (int i = 0; i < 16; i++) + b_[i / 8] += popCountTable[((uint8_t*)a_)[i]]; + return v_uint64x2(wasm_v128_load(b_)); +} +inline v_uint8x16 v_popcount(const v_int8x16& a) +{ return v_popcount(v_reinterpret_as_u8(a)); } +inline v_uint16x8 v_popcount(const v_int16x8& a) +{ return v_popcount(v_reinterpret_as_u16(a)); } +inline v_uint32x4 v_popcount(const v_int32x4& a) +{ return v_popcount(v_reinterpret_as_u32(a)); } +inline v_uint64x2 v_popcount(const v_int64x2& a) +{ return v_popcount(v_reinterpret_as_u64(a)); } + +#define OPENCV_HAL_IMPL_WASM_CHECK_SIGNS(_Tpvec, suffix, scalarType) \ +inline int v_signmask(const _Tpvec& a) \ +{ \ + _Tpvec::lane_type a_[_Tpvec::nlanes]; \ + wasm_v128_store(a_, a.val); \ + int mask = 0; \ + for (int i = 0; i < _Tpvec::nlanes; i++) \ + mask |= (reinterpret_int(a_[i]) < 0) << i; \ + return mask; \ +} \ +inline bool v_check_all(const _Tpvec& a) \ +{ return wasm_i8x16_all_true(wasm_##suffix##_lt(a.val, wasm_##suffix##_splat(0))); } \ +inline bool v_check_any(const _Tpvec& a) \ +{ return wasm_i8x16_any_true(wasm_##suffix##_lt(a.val, wasm_##suffix##_splat(0)));; } + +OPENCV_HAL_IMPL_WASM_CHECK_SIGNS(v_uint8x16, i8x16, schar) +OPENCV_HAL_IMPL_WASM_CHECK_SIGNS(v_int8x16, i8x16, schar) +OPENCV_HAL_IMPL_WASM_CHECK_SIGNS(v_uint16x8, i16x8, short) +OPENCV_HAL_IMPL_WASM_CHECK_SIGNS(v_int16x8, i16x8, short) +OPENCV_HAL_IMPL_WASM_CHECK_SIGNS(v_uint32x4, i32x4, int) +OPENCV_HAL_IMPL_WASM_CHECK_SIGNS(v_int32x4, i32x4, int) +OPENCV_HAL_IMPL_WASM_CHECK_SIGNS(v_float32x4, i32x4, float) +OPENCV_HAL_IMPL_WASM_CHECK_SIGNS(v_float64x2, f64x2, double) + +#define OPENCV_HAL_IMPL_WASM_CHECK_ALL_ANY(_Tpvec, suffix, esuffix) \ +inline bool v_check_all(const _Tpvec& a) \ +{ \ + v128_t masked = v_reinterpret_as_##esuffix(a).val; \ + masked = wasm_i32x4_replace_lane(masked, 0, 0xffffffff); \ + masked = wasm_i32x4_replace_lane(masked, 2, 0xffffffff); \ + return wasm_i8x16_all_true(wasm_##suffix##_lt(masked, 
wasm_##suffix##_splat(0))); \ +} \ +inline bool v_check_any(const _Tpvec& a) \ +{ \ + v128_t masked = v_reinterpret_as_##esuffix(a).val; \ + masked = wasm_i32x4_replace_lane(masked, 0, 0x0); \ + masked = wasm_i32x4_replace_lane(masked, 2, 0x0); \ + return wasm_i8x16_any_true(wasm_##suffix##_lt(masked, wasm_##suffix##_splat(0))); \ +} \ + +OPENCV_HAL_IMPL_WASM_CHECK_ALL_ANY(v_int64x2, i32x4, s32) +OPENCV_HAL_IMPL_WASM_CHECK_ALL_ANY(v_uint64x2, i32x4, u32) + + +inline int v_scan_forward(const v_int8x16& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))); } +inline int v_scan_forward(const v_uint8x16& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))); } +inline int v_scan_forward(const v_int16x8& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 2; } +inline int v_scan_forward(const v_uint16x8& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 2; } +inline int v_scan_forward(const v_int32x4& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 4; } +inline int v_scan_forward(const v_uint32x4& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 4; } +inline int v_scan_forward(const v_float32x4& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 4; } +inline int v_scan_forward(const v_int64x2& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 8; } +inline int v_scan_forward(const v_uint64x2& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 8; } +inline int v_scan_forward(const v_float64x2& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 8; } + +#define OPENCV_HAL_IMPL_WASM_SELECT(_Tpvec) \ +inline _Tpvec v_select(const _Tpvec& mask, const _Tpvec& a, const _Tpvec& b) \ +{ \ + return _Tpvec(wasm_v128_bitselect(a.val, b.val, mask.val)); \ +} + +OPENCV_HAL_IMPL_WASM_SELECT(v_uint8x16) +OPENCV_HAL_IMPL_WASM_SELECT(v_int8x16) +OPENCV_HAL_IMPL_WASM_SELECT(v_uint16x8) +OPENCV_HAL_IMPL_WASM_SELECT(v_int16x8) +OPENCV_HAL_IMPL_WASM_SELECT(v_uint32x4) +OPENCV_HAL_IMPL_WASM_SELECT(v_int32x4) +OPENCV_HAL_IMPL_WASM_SELECT(v_uint64x2) +OPENCV_HAL_IMPL_WASM_SELECT(v_int64x2) +OPENCV_HAL_IMPL_WASM_SELECT(v_float32x4) +OPENCV_HAL_IMPL_WASM_SELECT(v_float64x2) + +#define OPENCV_HAL_IMPL_WASM_EXPAND(_Tpvec, _Tpwvec, _Tp, intrin) \ +inline void v_expand(const _Tpvec& a, _Tpwvec& b0, _Tpwvec& b1) \ +{ \ + b0.val = intrin(a.val); \ + b1.val = __CV_CAT(intrin, _high)(a.val); \ +} \ +inline _Tpwvec v_expand_low(const _Tpvec& a) \ +{ return _Tpwvec(intrin(a.val)); } \ +inline _Tpwvec v_expand_high(const _Tpvec& a) \ +{ return _Tpwvec(__CV_CAT(intrin, _high)(a.val)); } \ +inline _Tpwvec v_load_expand(const _Tp* ptr) \ +{ \ + v128_t a = wasm_v128_load(ptr); \ + return _Tpwvec(intrin(a)); \ +} + +OPENCV_HAL_IMPL_WASM_EXPAND(v_uint8x16, v_uint16x8, uchar, v128_cvtu8x16_i16x8) +OPENCV_HAL_IMPL_WASM_EXPAND(v_int8x16, v_int16x8, schar, v128_cvti8x16_i16x8) +OPENCV_HAL_IMPL_WASM_EXPAND(v_uint16x8, v_uint32x4, ushort, v128_cvtu16x8_i32x4) +OPENCV_HAL_IMPL_WASM_EXPAND(v_int16x8, v_int32x4, short, v128_cvti16x8_i32x4) +OPENCV_HAL_IMPL_WASM_EXPAND(v_uint32x4, v_uint64x2, unsigned, v128_cvtu32x4_i64x2) +OPENCV_HAL_IMPL_WASM_EXPAND(v_int32x4, v_int64x2, int, v128_cvti32x4_i64x2) + +#define OPENCV_HAL_IMPL_WASM_EXPAND_Q(_Tpvec, _Tp, intrin) \ +inline _Tpvec v_load_expand_q(const _Tp* ptr) \ +{ \ + v128_t a = wasm_v128_load(ptr); \ + return _Tpvec(intrin(a)); \ +} + +OPENCV_HAL_IMPL_WASM_EXPAND_Q(v_uint32x4, uchar, v128_cvtu8x16_i32x4) +OPENCV_HAL_IMPL_WASM_EXPAND_Q(v_int32x4, schar, 
v128_cvti8x16_i32x4)
+
+#define OPENCV_HAL_IMPL_WASM_UNPACKS(_Tpvec, suffix) \
+inline void v_zip(const _Tpvec& a0, const _Tpvec& a1, _Tpvec& b0, _Tpvec& b1) \
+{ \
+    b0.val = wasm_unpacklo_##suffix(a0.val, a1.val); \
+    b1.val = wasm_unpackhi_##suffix(a0.val, a1.val); \
+} \
+inline _Tpvec v_combine_low(const _Tpvec& a, const _Tpvec& b) \
+{ \
+    return _Tpvec(wasm_unpacklo_i64x2(a.val, b.val)); \
+} \
+inline _Tpvec v_combine_high(const _Tpvec& a, const _Tpvec& b) \
+{ \
+    return _Tpvec(wasm_unpackhi_i64x2(a.val, b.val)); \
+} \
+inline void v_recombine(const _Tpvec& a, const _Tpvec& b, _Tpvec& c, _Tpvec& d) \
+{ \
+    c.val = wasm_unpacklo_i64x2(a.val, b.val); \
+    d.val = wasm_unpackhi_i64x2(a.val, b.val); \
+}
+
+OPENCV_HAL_IMPL_WASM_UNPACKS(v_uint8x16, i8x16)
+OPENCV_HAL_IMPL_WASM_UNPACKS(v_int8x16, i8x16)
+OPENCV_HAL_IMPL_WASM_UNPACKS(v_uint16x8, i16x8)
+OPENCV_HAL_IMPL_WASM_UNPACKS(v_int16x8, i16x8)
+OPENCV_HAL_IMPL_WASM_UNPACKS(v_uint32x4, i32x4)
+OPENCV_HAL_IMPL_WASM_UNPACKS(v_int32x4, i32x4)
+OPENCV_HAL_IMPL_WASM_UNPACKS(v_float32x4, i32x4)
+OPENCV_HAL_IMPL_WASM_UNPACKS(v_float64x2, i64x2)
+
+template<int s, typename _Tpvec>
+inline _Tpvec v_extract(const _Tpvec& a, const _Tpvec& b)
+{
+    return v_rotate_right<s>(a, b);
+}
+
+inline v_int32x4 v_round(const v_float32x4& a)
+{
+    v128_t h = wasm_f32x4_splat(0.5);
+    return v_int32x4(wasm_i32x4_trunc_saturate_f32x4(wasm_f32x4_add(a.val, h)));
+}
+
+inline v_int32x4 v_floor(const v_float32x4& a)
+{
+    v128_t a1 = wasm_i32x4_trunc_saturate_f32x4(a.val);
+    v128_t mask = wasm_f32x4_lt(a.val, wasm_f32x4_convert_i32x4(a1));
+    return v_int32x4(wasm_i32x4_add(a1, mask));
+}
+
+inline v_int32x4 v_ceil(const v_float32x4& a)
+{
+    v128_t a1 = wasm_i32x4_trunc_saturate_f32x4(a.val);
+    v128_t mask = wasm_f32x4_gt(a.val, wasm_f32x4_convert_i32x4(a1));
+    return v_int32x4(wasm_i32x4_sub(a1, mask));
+}
+
+inline v_int32x4 v_trunc(const v_float32x4& a)
+{ return v_int32x4(wasm_i32x4_trunc_saturate_f32x4(a.val)); }
+
+#define OPENCV_HAL_IMPL_WASM_MATH_FUNC(func, cfunc) \
+inline v_int32x4 func(const v_float64x2& a) \
+{ \
+    double a_[2]; \
+    wasm_v128_store(a_, a.val); \
+    int c_[4]; \
+    c_[0] = cfunc(a_[0]); \
+    c_[1] = cfunc(a_[1]); \
+    c_[2] = 0; \
+    c_[3] = 0; \
+    return v_int32x4(wasm_v128_load(c_)); \
+}
+
+OPENCV_HAL_IMPL_WASM_MATH_FUNC(v_round, cvRound)
+OPENCV_HAL_IMPL_WASM_MATH_FUNC(v_floor, cvFloor)
+OPENCV_HAL_IMPL_WASM_MATH_FUNC(v_ceil, cvCeil)
+OPENCV_HAL_IMPL_WASM_MATH_FUNC(v_trunc, int)
+
+inline v_int32x4 v_round(const v_float64x2& a, const v_float64x2& b)
+{
+    double a_[2], b_[2];
+    wasm_v128_store(a_, a.val);
+    wasm_v128_store(b_, b.val);
+    int c_[4];
+    c_[0] = cvRound(a_[0]);
+    c_[1] = cvRound(a_[1]);
+    c_[2] = cvRound(b_[0]);
+    c_[3] = cvRound(b_[1]);
+    return v_int32x4(wasm_v128_load(c_));
+}
+
+#define OPENCV_HAL_IMPL_WASM_TRANSPOSE4x4(_Tpvec, suffix) \
+inline void v_transpose4x4(const _Tpvec& a0, const _Tpvec& a1, \
+                           const _Tpvec& a2, const _Tpvec& a3, \
+                           _Tpvec& b0, _Tpvec& b1, \
+                           _Tpvec& b2, _Tpvec& b3) \
+{ \
+    v128_t t0 = wasm_unpacklo_##suffix(a0.val, a1.val); \
+    v128_t t1 = wasm_unpacklo_##suffix(a2.val, a3.val); \
+    v128_t t2 = wasm_unpackhi_##suffix(a0.val, a1.val); \
+    v128_t t3 = wasm_unpackhi_##suffix(a2.val, a3.val); \
+\
+    b0.val = wasm_unpacklo_i64x2(t0, t1); \
+    b1.val = wasm_unpackhi_i64x2(t0, t1); \
+    b2.val = wasm_unpacklo_i64x2(t2, t3); \
+    b3.val = wasm_unpackhi_i64x2(t2, t3); \
+}
+
+OPENCV_HAL_IMPL_WASM_TRANSPOSE4x4(v_uint32x4, i32x4)
+OPENCV_HAL_IMPL_WASM_TRANSPOSE4x4(v_int32x4, i32x4)
+OPENCV_HAL_IMPL_WASM_TRANSPOSE4x4(v_float32x4,
i32x4) + +// load deinterleave +inline void v_load_deinterleave(const uchar* ptr, v_uint8x16& a, v_uint8x16& b) +{ + v128_t t00 = wasm_v128_load(ptr); + v128_t t01 = wasm_v128_load(ptr + 16); + + a.val = wasm_v8x16_shuffle(t00, t01, 0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30); + b.val = wasm_v8x16_shuffle(t00, t01, 1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31); +} + +inline void v_load_deinterleave(const uchar* ptr, v_uint8x16& a, v_uint8x16& b, v_uint8x16& c) +{ + v128_t t00 = wasm_v128_load(ptr); + v128_t t01 = wasm_v128_load(ptr + 16); + v128_t t02 = wasm_v128_load(ptr + 32); + + v128_t t10 = wasm_v8x16_shuffle(t00, t01, 0,3,6,9,12,15,18,21,24,27,30,1,2,4,5,7); + v128_t t11 = wasm_v8x16_shuffle(t00, t01, 1,4,7,10,13,16,19,22,25,28,31,0,2,3,5,6); + v128_t t12 = wasm_v8x16_shuffle(t00, t01, 2,5,8,11,14,17,20,23,26,29,0,1,3,4,6,7); + + a.val = wasm_v8x16_shuffle(t10, t02, 0,1,2,3,4,5,6,7,8,9,10,17,20,23,26,29); + b.val = wasm_v8x16_shuffle(t11, t02, 0,1,2,3,4,5,6,7,8,9,10,18,21,24,27,30); + c.val = wasm_v8x16_shuffle(t12, t02, 0,1,2,3,4,5,6,7,8,9,16,19,22,25,28,31); +} + +inline void v_load_deinterleave(const uchar* ptr, v_uint8x16& a, v_uint8x16& b, v_uint8x16& c, v_uint8x16& d) +{ + v128_t u0 = wasm_v128_load(ptr); // a0 b0 c0 d0 a1 b1 c1 d1 ... + v128_t u1 = wasm_v128_load(ptr + 16); // a4 b4 c4 d4 ... + v128_t u2 = wasm_v128_load(ptr + 32); // a8 b8 c8 d8 ... + v128_t u3 = wasm_v128_load(ptr + 48); // a12 b12 c12 d12 ... + + v128_t v0 = wasm_v8x16_shuffle(u0, u1, 0,4,8,12,16,20,24,28,1,5,9,13,17,21,25,29); + v128_t v1 = wasm_v8x16_shuffle(u2, u3, 0,4,8,12,16,20,24,28,1,5,9,13,17,21,25,29); + v128_t v2 = wasm_v8x16_shuffle(u0, u1, 2,6,10,14,18,22,26,30,3,7,11,15,19,23,27,31); + v128_t v3 = wasm_v8x16_shuffle(u2, u3, 2,6,10,14,18,22,26,30,3,7,11,15,19,23,27,31); + + a.val = wasm_v8x16_shuffle(v0, v1, 0,1,2,3,4,5,6,7,16,17,18,19,20,21,22,23); + b.val = wasm_v8x16_shuffle(v0, v1, 8,9,10,11,12,13,14,15,24,25,26,27,28,29,30,31); + c.val = wasm_v8x16_shuffle(v2, v3, 0,1,2,3,4,5,6,7,16,17,18,19,20,21,22,23); + d.val = wasm_v8x16_shuffle(v2, v3, 8,9,10,11,12,13,14,15,24,25,26,27,28,29,30,31); +} + +inline void v_load_deinterleave(const ushort* ptr, v_uint16x8& a, v_uint16x8& b) +{ + v128_t v0 = wasm_v128_load(ptr); // a0 b0 a1 b1 a2 b2 a3 b3 + v128_t v1 = wasm_v128_load(ptr + 8); // a4 b4 a5 b5 a6 b6 a7 b7 + + a.val = wasm_v8x16_shuffle(v0, v1, 0,1,4,5,8,9,12,13,16,17,20,21,24,25,28,29); // a0 a1 a2 a3 a4 a5 a6 a7 + b.val = wasm_v8x16_shuffle(v0, v1, 2,3,6,7,10,11,14,15,18,19,22,23,26,27,30,31); // b0 b1 ab b3 b4 b5 b6 b7 +} + +inline void v_load_deinterleave(const ushort* ptr, v_uint16x8& a, v_uint16x8& b, v_uint16x8& c) +{ + v128_t t00 = wasm_v128_load(ptr); // a0 b0 c0 a1 b1 c1 a2 b2 + v128_t t01 = wasm_v128_load(ptr + 8); // c2 a3 b3 c3 a4 b4 c4 a5 + v128_t t02 = wasm_v128_load(ptr + 16); // b5 c5 a6 b6 c6 a7 b7 c7 + + v128_t t10 = wasm_v8x16_shuffle(t00, t01, 0,1,6,7,12,13,18,19,24,25,30,31,2,3,4,5); + v128_t t11 = wasm_v8x16_shuffle(t00, t01, 2,3,8,9,14,15,20,21,26,27,0,1,4,5,6,7); + v128_t t12 = wasm_v8x16_shuffle(t00, t01, 4,5,10,11,16,17,22,23,28,29,0,1,2,3,6,7); + + a.val = wasm_v8x16_shuffle(t10, t02, 0,1,2,3,4,5,6,7,8,9,10,11,20,21,26,27); + b.val = wasm_v8x16_shuffle(t11, t02, 0,1,2,3,4,5,6,7,8,9,16,17,22,23,28,29); + c.val = wasm_v8x16_shuffle(t12, t02, 0,1,2,3,4,5,6,7,8,9,18,19,24,25,30,31); +} + +inline void v_load_deinterleave(const ushort* ptr, v_uint16x8& a, v_uint16x8& b, v_uint16x8& c, v_uint16x8& d) +{ + v128_t u0 = wasm_v128_load(ptr); // a0 b0 c0 d0 a1 b1 c1 d1 + v128_t u1 
= wasm_v128_load(ptr + 8); // a2 b2 c2 d2 ... + v128_t u2 = wasm_v128_load(ptr + 16); // a4 b4 c4 d4 ... + v128_t u3 = wasm_v128_load(ptr + 24); // a6 b6 c6 d6 ... + + v128_t v0 = wasm_v8x16_shuffle(u0, u1, 0,1,8,9,16,17,24,25,2,3,10,11,18,19,26,27); // a0 a1 a2 a3 b0 b1 b2 b3 + v128_t v1 = wasm_v8x16_shuffle(u2, u3, 0,1,8,9,16,17,24,25,2,3,10,11,18,19,26,27); // a4 a5 a6 a7 b4 b5 b6 b7 + v128_t v2 = wasm_v8x16_shuffle(u0, u1, 4,5,12,13,20,21,28,29,6,7,14,15,22,23,30,31); // c0 c1 c2 c3 d0 d1 d2 d3 + v128_t v3 = wasm_v8x16_shuffle(u2, u3, 4,5,12,13,20,21,28,29,6,7,14,15,22,23,30,31); // c4 c5 c6 c7 d4 d5 d6 d7 + + a.val = wasm_v8x16_shuffle(v0, v1, 0,1,2,3,4,5,6,7,16,17,18,19,20,21,22,23); + b.val = wasm_v8x16_shuffle(v0, v1, 8,9,10,11,12,13,14,15,24,25,26,27,28,29,30,31); + c.val = wasm_v8x16_shuffle(v2, v3, 0,1,2,3,4,5,6,7,16,17,18,19,20,21,22,23); + d.val = wasm_v8x16_shuffle(v2, v3, 8,9,10,11,12,13,14,15,24,25,26,27,28,29,30,31); +} + +inline void v_load_deinterleave(const unsigned* ptr, v_uint32x4& a, v_uint32x4& b) +{ + v128_t v0 = wasm_v128_load(ptr); // a0 b0 a1 b1 + v128_t v1 = wasm_v128_load(ptr + 4); // a2 b2 a3 b3 + + a.val = wasm_v8x16_shuffle(v0, v1, 0,1,2,3,8,9,10,11,16,17,18,19,24,25,26,27); // a0 a1 a2 a3 + b.val = wasm_v8x16_shuffle(v0, v1, 4,5,6,7,12,13,14,15,20,21,22,23,28,29,30,31); // b0 b1 b2 b3 +} + +inline void v_load_deinterleave(const unsigned* ptr, v_uint32x4& a, v_uint32x4& b, v_uint32x4& c) +{ + v128_t t00 = wasm_v128_load(ptr); // a0 b0 c0 a1 + v128_t t01 = wasm_v128_load(ptr + 4); // b2 c2 a3 b3 + v128_t t02 = wasm_v128_load(ptr + 8); // c3 a4 b4 c4 + + v128_t t10 = wasm_v8x16_shuffle(t00, t01, 0,1,2,3,12,13,14,15,24,25,26,27,4,5,6,7); + v128_t t11 = wasm_v8x16_shuffle(t00, t01, 4,5,6,7,16,17,18,19,28,29,30,31,0,1,2,3); + v128_t t12 = wasm_v8x16_shuffle(t00, t01, 8,9,10,11,20,21,22,23,0,1,2,3,4,5,6,7); + + a.val = wasm_v8x16_shuffle(t10, t02, 0,1,2,3,4,5,6,7,8,9,10,11,20,21,22,23); + b.val = wasm_v8x16_shuffle(t11, t02, 0,1,2,3,4,5,6,7,8,9,10,11,24,25,26,27); + c.val = wasm_v8x16_shuffle(t12, t02, 0,1,2,3,4,5,6,7,16,17,18,19,28,29,30,31); +} + +inline void v_load_deinterleave(const unsigned* ptr, v_uint32x4& a, v_uint32x4& b, v_uint32x4& c, v_uint32x4& d) +{ + v_uint32x4 s0(wasm_v128_load(ptr)); // a0 b0 c0 d0 + v_uint32x4 s1(wasm_v128_load(ptr + 4)); // a1 b1 c1 d1 + v_uint32x4 s2(wasm_v128_load(ptr + 8)); // a2 b2 c2 d2 + v_uint32x4 s3(wasm_v128_load(ptr + 12)); // a3 b3 c3 d3 + + v_transpose4x4(s0, s1, s2, s3, a, b, c, d); +} + +inline void v_load_deinterleave(const float* ptr, v_float32x4& a, v_float32x4& b) +{ + v128_t v0 = wasm_v128_load(ptr); // a0 b0 a1 b1 + v128_t v1 = wasm_v128_load((ptr + 4)); // a2 b2 a3 b3 + + a.val = wasm_v8x16_shuffle(v0, v1, 0,1,2,3,8,9,10,11,16,17,18,19,24,25,26,27); // a0 a1 a2 a3 + b.val = wasm_v8x16_shuffle(v0, v1, 4,5,6,7,12,13,14,15,20,21,22,23,28,29,30,31); // b0 b1 b2 b3 +} + +inline void v_load_deinterleave(const float* ptr, v_float32x4& a, v_float32x4& b, v_float32x4& c) +{ + v128_t t00 = wasm_v128_load(ptr); // a0 b0 c0 a1 + v128_t t01 = wasm_v128_load(ptr + 4); // b2 c2 a3 b3 + v128_t t02 = wasm_v128_load(ptr + 8); // c3 a4 b4 c4 + + v128_t t10 = wasm_v8x16_shuffle(t00, t01, 0,1,2,3,12,13,14,15,24,25,26,27,4,5,6,7); + v128_t t11 = wasm_v8x16_shuffle(t00, t01, 4,5,6,7,16,17,18,19,28,29,30,31,0,1,2,3); + v128_t t12 = wasm_v8x16_shuffle(t00, t01, 8,9,10,11,20,21,22,23,0,1,2,3,4,5,6,7); + + a.val = wasm_v8x16_shuffle(t10, t02, 0,1,2,3,4,5,6,7,8,9,10,11,20,21,22,23); + b.val = wasm_v8x16_shuffle(t11, t02, 
0,1,2,3,4,5,6,7,8,9,10,11,24,25,26,27); + c.val = wasm_v8x16_shuffle(t12, t02, 0,1,2,3,4,5,6,7,16,17,18,19,28,29,30,31); +} + +inline void v_load_deinterleave(const float* ptr, v_float32x4& a, v_float32x4& b, v_float32x4& c, v_float32x4& d) +{ + v_float32x4 s0(wasm_v128_load(ptr)); // a0 b0 c0 d0 + v_float32x4 s1(wasm_v128_load(ptr + 4)); // a1 b1 c1 d1 + v_float32x4 s2(wasm_v128_load(ptr + 8)); // a2 b2 c2 d2 + v_float32x4 s3(wasm_v128_load(ptr + 12)); // a3 b3 c3 d3 + + v_transpose4x4(s0, s1, s2, s3, a, b, c, d); +} + +inline void v_load_deinterleave(const uint64 *ptr, v_uint64x2& a, v_uint64x2& b) +{ + v128_t t0 = wasm_v128_load(ptr); // a0 b0 + v128_t t1 = wasm_v128_load(ptr + 2); // a1 b1 + + a.val = wasm_unpacklo_i64x2(t0, t1); + b.val = wasm_unpackhi_i64x2(t0, t1); +} + +inline void v_load_deinterleave(const uint64 *ptr, v_uint64x2& a, v_uint64x2& b, v_uint64x2& c) +{ + v128_t t0 = wasm_v128_load(ptr); // a0, b0 + v128_t t1 = wasm_v128_load(ptr + 2); // c0, a1 + v128_t t2 = wasm_v128_load(ptr + 4); // b1, c1 + + a.val = wasm_v8x16_shuffle(t0, t1, 0,1,2,3,4,5,6,7,24,25,26,27,28,29,30,31); + b.val = wasm_v8x16_shuffle(t0, t2, 8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23); + c.val = wasm_v8x16_shuffle(t1, t2, 0,1,2,3,4,5,6,7,24,25,26,27,28,29,30,31); +} + +inline void v_load_deinterleave(const uint64 *ptr, v_uint64x2& a, + v_uint64x2& b, v_uint64x2& c, v_uint64x2& d) +{ + v128_t t0 = wasm_v128_load(ptr); // a0 b0 + v128_t t1 = wasm_v128_load(ptr + 2); // c0 d0 + v128_t t2 = wasm_v128_load(ptr + 4); // a1 b1 + v128_t t3 = wasm_v128_load(ptr + 6); // c1 d1 + + a.val = wasm_unpacklo_i64x2(t0, t2); + b.val = wasm_unpackhi_i64x2(t0, t2); + c.val = wasm_unpacklo_i64x2(t1, t3); + d.val = wasm_unpackhi_i64x2(t1, t3); +} + +// store interleave + +inline void v_store_interleave( uchar* ptr, const v_uint8x16& a, const v_uint8x16& b, + hal::StoreMode /*mode*/ = hal::STORE_UNALIGNED) +{ + v128_t v0 = wasm_unpacklo_i8x16(a.val, b.val); + v128_t v1 = wasm_unpackhi_i8x16(a.val, b.val); + + wasm_v128_store(ptr, v0); + wasm_v128_store(ptr + 16, v1); +} + +inline void v_store_interleave( uchar* ptr, const v_uint8x16& a, const v_uint8x16& b, + const v_uint8x16& c, hal::StoreMode /*mode*/ = hal::STORE_UNALIGNED) +{ + v128_t t00 = wasm_v8x16_shuffle(a.val, b.val, 0,16,0,1,17,0,2,18,0,3,19,0,4,20,0,5); + v128_t t01 = wasm_v8x16_shuffle(a.val, b.val, 21,0,6,22,0,7,23,0,8,24,0,9,25,0,10,26); + v128_t t02 = wasm_v8x16_shuffle(a.val, b.val, 0,11,27,0,12,28,0,13,29,0,14,30,0,15,31,0); + + v128_t t10 = wasm_v8x16_shuffle(t00, c.val, 0,1,16,3,4,17,6,7,18,9,10,19,12,13,20,15); + v128_t t11 = wasm_v8x16_shuffle(t01, c.val, 0,21,2,3,22,5,6,23,8,9,24,11,12,25,14,15); + v128_t t12 = wasm_v8x16_shuffle(t02, c.val, 26,1,2,27,4,5,28,7,8,29,10,11,30,13,14,31); + + wasm_v128_store(ptr, t10); + wasm_v128_store(ptr + 16, t11); + wasm_v128_store(ptr + 32, t12); +} + +inline void v_store_interleave( uchar* ptr, const v_uint8x16& a, const v_uint8x16& b, + const v_uint8x16& c, const v_uint8x16& d, + hal::StoreMode /*mode*/ = hal::STORE_UNALIGNED) +{ + // a0 a1 a2 a3 .... + // b0 b1 b2 b3 .... + // c0 c1 c2 c3 .... + // d0 d1 d2 d3 .... + v128_t u0 = wasm_unpacklo_i8x16(a.val, c.val); // a0 c0 a1 c1 ... + v128_t u1 = wasm_unpackhi_i8x16(a.val, c.val); // a8 c8 a9 c9 ... + v128_t u2 = wasm_unpacklo_i8x16(b.val, d.val); // b0 d0 b1 d1 ... + v128_t u3 = wasm_unpackhi_i8x16(b.val, d.val); // b8 d8 b9 d9 ... + + v128_t v0 = wasm_unpacklo_i8x16(u0, u2); // a0 b0 c0 d0 ... + v128_t v1 = wasm_unpackhi_i8x16(u0, u2); // a4 b4 c4 d4 ... 
+ v128_t v2 = wasm_unpacklo_i8x16(u1, u3); // a8 b8 c8 d8 ... + v128_t v3 = wasm_unpackhi_i8x16(u1, u3); // a12 b12 c12 d12 ... + + wasm_v128_store(ptr, v0); + wasm_v128_store(ptr + 16, v1); + wasm_v128_store(ptr + 32, v2); + wasm_v128_store(ptr + 48, v3); +} + +inline void v_store_interleave( ushort* ptr, const v_uint16x8& a, const v_uint16x8& b, + hal::StoreMode /*mode*/ = hal::STORE_UNALIGNED) +{ + v128_t v0 = wasm_unpacklo_i16x8(a.val, b.val); + v128_t v1 = wasm_unpackhi_i16x8(a.val, b.val); + + wasm_v128_store(ptr, v0); + wasm_v128_store(ptr + 8, v1); +} + +inline void v_store_interleave( ushort* ptr, const v_uint16x8& a, + const v_uint16x8& b, const v_uint16x8& c, + hal::StoreMode /*mode*/ = hal::STORE_UNALIGNED) +{ + v128_t t00 = wasm_v8x16_shuffle(a.val, b.val, 0,1,16,17,0,0,2,3,18,19,0,0,4,5,20,21); + v128_t t01 = wasm_v8x16_shuffle(a.val, b.val, 0,0,6,7,22,23,0,0,8,9,24,25,0,0,10,11); + v128_t t02 = wasm_v8x16_shuffle(a.val, b.val, 26,27,0,0,12,13,28,29,0,0,14,15,30,31,0,0); + + v128_t t10 = wasm_v8x16_shuffle(t00, c.val, 0,1,2,3,16,17,6,7,8,9,18,19,12,13,14,15); + v128_t t11 = wasm_v8x16_shuffle(t01, c.val, 20,21,2,3,4,5,22,23,8,9,10,11,24,25,14,15); + v128_t t12 = wasm_v8x16_shuffle(t02, c.val, 0,1,26,27,4,5,6,7,28,29,10,11,12,13,30,31); + + wasm_v128_store(ptr, t10); + wasm_v128_store(ptr + 8, t11); + wasm_v128_store(ptr + 16, t12); +} + +inline void v_store_interleave( ushort* ptr, const v_uint16x8& a, const v_uint16x8& b, + const v_uint16x8& c, const v_uint16x8& d, + hal::StoreMode /*mode*/ = hal::STORE_UNALIGNED) +{ + // a0 a1 a2 a3 .... + // b0 b1 b2 b3 .... + // c0 c1 c2 c3 .... + // d0 d1 d2 d3 .... + v128_t u0 = wasm_unpacklo_i16x8(a.val, c.val); // a0 c0 a1 c1 ... + v128_t u1 = wasm_unpackhi_i16x8(a.val, c.val); // a4 c4 a5 c5 ... + v128_t u2 = wasm_unpacklo_i16x8(b.val, d.val); // b0 d0 b1 d1 ... + v128_t u3 = wasm_unpackhi_i16x8(b.val, d.val); // b4 d4 b5 d5 ... + + v128_t v0 = wasm_unpacklo_i16x8(u0, u2); // a0 b0 c0 d0 ... + v128_t v1 = wasm_unpackhi_i16x8(u0, u2); // a2 b2 c2 d2 ... + v128_t v2 = wasm_unpacklo_i16x8(u1, u3); // a4 b4 c4 d4 ... + v128_t v3 = wasm_unpackhi_i16x8(u1, u3); // a6 b6 c6 d6 ... 
+ + wasm_v128_store(ptr, v0); + wasm_v128_store(ptr + 8, v1); + wasm_v128_store(ptr + 16, v2); + wasm_v128_store(ptr + 24, v3); +} + +inline void v_store_interleave( unsigned* ptr, const v_uint32x4& a, const v_uint32x4& b, + hal::StoreMode /*mode*/ = hal::STORE_UNALIGNED) +{ + v128_t v0 = wasm_unpacklo_i32x4(a.val, b.val); + v128_t v1 = wasm_unpackhi_i32x4(a.val, b.val); + + wasm_v128_store(ptr, v0); + wasm_v128_store(ptr + 4, v1); +} + +inline void v_store_interleave( unsigned* ptr, const v_uint32x4& a, const v_uint32x4& b, + const v_uint32x4& c, hal::StoreMode /*mode*/ = hal::STORE_UNALIGNED) +{ + v128_t t00 = wasm_v8x16_shuffle(a.val, b.val, 0,1,2,3,16,17,18,19,0,0,0,0,4,5,6,7); + v128_t t01 = wasm_v8x16_shuffle(a.val, b.val, 20,21,22,23,0,0,0,0,8,9,10,11,24,25,26,27); + v128_t t02 = wasm_v8x16_shuffle(a.val, b.val, 0,0,0,0,12,13,14,15,28,29,30,31,0,0,0,0); + + v128_t t10 = wasm_v8x16_shuffle(t00, c.val, 0,1,2,3,4,5,6,7,16,17,18,19,12,13,14,15); + v128_t t11 = wasm_v8x16_shuffle(t01, c.val, 0,1,2,3,20,21,22,23,8,9,10,11,12,13,14,15); + v128_t t12 = wasm_v8x16_shuffle(t02, c.val, 24,25,26,27,4,5,6,7,8,9,10,11,28,29,30,31); + + wasm_v128_store(ptr, t10); + wasm_v128_store(ptr + 4, t11); + wasm_v128_store(ptr + 8, t12); +} + +inline void v_store_interleave(unsigned* ptr, const v_uint32x4& a, const v_uint32x4& b, + const v_uint32x4& c, const v_uint32x4& d, + hal::StoreMode /*mode*/ = hal::STORE_UNALIGNED) +{ + v_uint32x4 v0, v1, v2, v3; + v_transpose4x4(a, b, c, d, v0, v1, v2, v3); + + wasm_v128_store(ptr, v0.val); + wasm_v128_store(ptr + 4, v1.val); + wasm_v128_store(ptr + 8, v2.val); + wasm_v128_store(ptr + 12, v3.val); +} + +// 2-channel, float only +inline void v_store_interleave(float* ptr, const v_float32x4& a, const v_float32x4& b, + hal::StoreMode /*mode*/ = hal::STORE_UNALIGNED) +{ + v128_t v0 = wasm_unpacklo_i32x4(a.val, b.val); + v128_t v1 = wasm_unpackhi_i32x4(a.val, b.val); + + wasm_v128_store(ptr, v0); + wasm_v128_store(ptr + 4, v1); +} + +inline void v_store_interleave(float* ptr, const v_float32x4& a, const v_float32x4& b, + const v_float32x4& c, hal::StoreMode /*mode*/ = hal::STORE_UNALIGNED) +{ + v128_t t00 = wasm_v8x16_shuffle(a.val, b.val, 0,1,2,3,16,17,18,19,0,0,0,0,4,5,6,7); + v128_t t01 = wasm_v8x16_shuffle(a.val, b.val, 20,21,22,23,0,0,0,0,8,9,10,11,24,25,26,27); + v128_t t02 = wasm_v8x16_shuffle(a.val, b.val, 0,0,0,0,12,13,14,15,28,29,30,31,0,0,0,0); + + v128_t t10 = wasm_v8x16_shuffle(t00, c.val, 0,1,2,3,4,5,6,7,16,17,18,19,12,13,14,15); + v128_t t11 = wasm_v8x16_shuffle(t01, c.val, 0,1,2,3,20,21,22,23,8,9,10,11,12,13,14,15); + v128_t t12 = wasm_v8x16_shuffle(t02, c.val, 24,25,26,27,4,5,6,7,8,9,10,11,28,29,30,31); + + wasm_v128_store(ptr, t10); + wasm_v128_store(ptr + 4, t11); + wasm_v128_store(ptr + 8, t12); +} + +inline void v_store_interleave(float* ptr, const v_float32x4& a, const v_float32x4& b, + const v_float32x4& c, const v_float32x4& d, + hal::StoreMode /*mode*/ = hal::STORE_UNALIGNED) +{ + v_float32x4 v0, v1, v2, v3; + v_transpose4x4(a, b, c, d, v0, v1, v2, v3); + + wasm_v128_store(ptr, v0.val); + wasm_v128_store(ptr + 4, v1.val); + wasm_v128_store(ptr + 8, v2.val); + wasm_v128_store(ptr + 12, v3.val); +} + +inline void v_store_interleave(uint64 *ptr, const v_uint64x2& a, const v_uint64x2& b, + hal::StoreMode /*mode*/ = hal::STORE_UNALIGNED) +{ + v128_t v0 = wasm_unpacklo_i64x2(a.val, b.val); + v128_t v1 = wasm_unpackhi_i64x2(a.val, b.val); + + wasm_v128_store(ptr, v0); + wasm_v128_store(ptr + 2, v1); +} + +inline void v_store_interleave(uint64 *ptr, 
const v_uint64x2& a, const v_uint64x2& b, + const v_uint64x2& c, hal::StoreMode /*mode*/ = hal::STORE_UNALIGNED) +{ + v128_t v0 = wasm_v8x16_shuffle(a.val, b.val, 0,1,2,3,4,5,6,7,16,17,18,19,20,21,22,23); + v128_t v1 = wasm_v8x16_shuffle(a.val, c.val, 16,17,18,19,20,21,22,23,8,9,10,11,12,13,14,15); + v128_t v2 = wasm_v8x16_shuffle(b.val, c.val, 8,9,10,11,12,13,14,15,24,25,26,27,28,29,30,31); + + wasm_v128_store(ptr, v0); + wasm_v128_store(ptr + 2, v1); + wasm_v128_store(ptr + 4, v2); +} + +inline void v_store_interleave(uint64 *ptr, const v_uint64x2& a, const v_uint64x2& b, + const v_uint64x2& c, const v_uint64x2& d, + hal::StoreMode /*mode*/ = hal::STORE_UNALIGNED) +{ + v128_t v0 = wasm_unpacklo_i64x2(a.val, b.val); + v128_t v1 = wasm_unpacklo_i64x2(c.val, d.val); + v128_t v2 = wasm_unpackhi_i64x2(a.val, b.val); + v128_t v3 = wasm_unpackhi_i64x2(c.val, d.val); + + wasm_v128_store(ptr, v0); + wasm_v128_store(ptr + 2, v1); + wasm_v128_store(ptr + 4, v2); + wasm_v128_store(ptr + 6, v3); +} + +#define OPENCV_HAL_IMPL_WASM_LOADSTORE_INTERLEAVE(_Tpvec0, _Tp0, suffix0, _Tpvec1, _Tp1, suffix1) \ +inline void v_load_deinterleave( const _Tp0* ptr, _Tpvec0& a0, _Tpvec0& b0 ) \ +{ \ + _Tpvec1 a1, b1; \ + v_load_deinterleave((const _Tp1*)ptr, a1, b1); \ + a0 = v_reinterpret_as_##suffix0(a1); \ + b0 = v_reinterpret_as_##suffix0(b1); \ +} \ +inline void v_load_deinterleave( const _Tp0* ptr, _Tpvec0& a0, _Tpvec0& b0, _Tpvec0& c0 ) \ +{ \ + _Tpvec1 a1, b1, c1; \ + v_load_deinterleave((const _Tp1*)ptr, a1, b1, c1); \ + a0 = v_reinterpret_as_##suffix0(a1); \ + b0 = v_reinterpret_as_##suffix0(b1); \ + c0 = v_reinterpret_as_##suffix0(c1); \ +} \ +inline void v_load_deinterleave( const _Tp0* ptr, _Tpvec0& a0, _Tpvec0& b0, _Tpvec0& c0, _Tpvec0& d0 ) \ +{ \ + _Tpvec1 a1, b1, c1, d1; \ + v_load_deinterleave((const _Tp1*)ptr, a1, b1, c1, d1); \ + a0 = v_reinterpret_as_##suffix0(a1); \ + b0 = v_reinterpret_as_##suffix0(b1); \ + c0 = v_reinterpret_as_##suffix0(c1); \ + d0 = v_reinterpret_as_##suffix0(d1); \ +} \ +inline void v_store_interleave( _Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0, \ + hal::StoreMode mode = hal::STORE_UNALIGNED ) \ +{ \ + _Tpvec1 a1 = v_reinterpret_as_##suffix1(a0); \ + _Tpvec1 b1 = v_reinterpret_as_##suffix1(b0); \ + v_store_interleave((_Tp1*)ptr, a1, b1, mode); \ +} \ +inline void v_store_interleave( _Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0, \ + const _Tpvec0& c0, hal::StoreMode mode = hal::STORE_UNALIGNED ) \ +{ \ + _Tpvec1 a1 = v_reinterpret_as_##suffix1(a0); \ + _Tpvec1 b1 = v_reinterpret_as_##suffix1(b0); \ + _Tpvec1 c1 = v_reinterpret_as_##suffix1(c0); \ + v_store_interleave((_Tp1*)ptr, a1, b1, c1, mode); \ +} \ +inline void v_store_interleave( _Tp0* ptr, const _Tpvec0& a0, const _Tpvec0& b0, \ + const _Tpvec0& c0, const _Tpvec0& d0, \ + hal::StoreMode mode = hal::STORE_UNALIGNED ) \ +{ \ + _Tpvec1 a1 = v_reinterpret_as_##suffix1(a0); \ + _Tpvec1 b1 = v_reinterpret_as_##suffix1(b0); \ + _Tpvec1 c1 = v_reinterpret_as_##suffix1(c0); \ + _Tpvec1 d1 = v_reinterpret_as_##suffix1(d0); \ + v_store_interleave((_Tp1*)ptr, a1, b1, c1, d1, mode); \ +} + +OPENCV_HAL_IMPL_WASM_LOADSTORE_INTERLEAVE(v_int8x16, schar, s8, v_uint8x16, uchar, u8) +OPENCV_HAL_IMPL_WASM_LOADSTORE_INTERLEAVE(v_int16x8, short, s16, v_uint16x8, ushort, u16) +OPENCV_HAL_IMPL_WASM_LOADSTORE_INTERLEAVE(v_int32x4, int, s32, v_uint32x4, unsigned, u32) +OPENCV_HAL_IMPL_WASM_LOADSTORE_INTERLEAVE(v_int64x2, int64, s64, v_uint64x2, uint64, u64) +OPENCV_HAL_IMPL_WASM_LOADSTORE_INTERLEAVE(v_float64x2, double, f64, 
v_uint64x2, uint64, u64) + +inline v_float32x4 v_cvt_f32(const v_int32x4& a) +{ + return v_float32x4(wasm_f32x4_convert_i32x4(a.val)); +} + +inline v_float32x4 v_cvt_f32(const v_float64x2& a) +{ + double a_[2]; + wasm_v128_store(a_, a.val); + float c_[4]; + c_[0] = (float)(a_[0]); + c_[1] = (float)(a_[1]); + c_[2] = 0; + c_[3] = 0; + return v_float32x4(wasm_v128_load(c_)); +} + +inline v_float32x4 v_cvt_f32(const v_float64x2& a, const v_float64x2& b) +{ + double a_[2], b_[2]; + wasm_v128_store(a_, a.val); + wasm_v128_store(b_, b.val); + float c_[4]; + c_[0] = (float)(a_[0]); + c_[1] = (float)(a_[1]); + c_[2] = (float)(b_[0]); + c_[3] = (float)(b_[1]); + return v_float32x4(wasm_v128_load(c_)); +} + +inline v_float64x2 v_cvt_f64(const v_int32x4& a) +{ +#ifdef __wasm_unimplemented_simd128__ + v128_t p = v128_cvti32x4_i64x2(a.val); + return v_float64x2(wasm_f64x2_convert_i64x2(p)); +#else + int a_[4]; + wasm_v128_store(a_, a.val); + double c_[2]; + c_[0] = (double)(a_[0]); + c_[1] = (double)(a_[1]); + return v_float64x2(wasm_v128_load(c_)); +#endif +} + +inline v_float64x2 v_cvt_f64_high(const v_int32x4& a) +{ +#ifdef __wasm_unimplemented_simd128__ + v128_t p = v128_cvti32x4_i64x2_high(a.val); + return v_float64x2(wasm_f64x2_convert_i64x2(p)); +#else + int a_[4]; + wasm_v128_store(a_, a.val); + double c_[2]; + c_[0] = (double)(a_[2]); + c_[1] = (double)(a_[3]); + return v_float64x2(wasm_v128_load(c_)); +#endif +} + +inline v_float64x2 v_cvt_f64(const v_float32x4& a) +{ + float a_[4]; + wasm_v128_store(a_, a.val); + double c_[2]; + c_[0] = (double)(a_[0]); + c_[1] = (double)(a_[1]); + return v_float64x2(wasm_v128_load(c_)); +} + +inline v_float64x2 v_cvt_f64_high(const v_float32x4& a) +{ + float a_[4]; + wasm_v128_store(a_, a.val); + double c_[2]; + c_[0] = (double)(a_[2]); + c_[1] = (double)(a_[3]); + return v_float64x2(wasm_v128_load(c_)); +} + +inline v_float64x2 v_cvt_f64(const v_int64x2& a) +{ +#ifdef __wasm_unimplemented_simd128__ + return v_float64x2(wasm_f64x2_convert_i64x2(a.val)); +#else + int64 a_[2]; + wasm_v128_store(a_, a.val); + double c_[2]; + c_[0] = (double)(a_[0]); + c_[1] = (double)(a_[1]); + return v_float64x2(wasm_v128_load(c_)); +#endif +} + +////////////// Lookup table access //////////////////// + +inline v_int8x16 v_lut(const schar* tab, const int* idx) +{ + return v_int8x16(tab[idx[0]], tab[idx[1]], tab[idx[ 2]], tab[idx[ 3]], tab[idx[ 4]], tab[idx[ 5]], tab[idx[ 6]], tab[idx[ 7]], + tab[idx[8]], tab[idx[9]], tab[idx[10]], tab[idx[11]], tab[idx[12]], tab[idx[13]], tab[idx[14]], tab[idx[15]]); +} +inline v_int8x16 v_lut_pairs(const schar* tab, const int* idx) +{ + return v_int8x16(tab[idx[0]], tab[idx[0]+1], tab[idx[1]], tab[idx[1]+1], tab[idx[2]], tab[idx[2]+1], tab[idx[3]], tab[idx[3]+1], + tab[idx[4]], tab[idx[4]+1], tab[idx[5]], tab[idx[5]+1], tab[idx[6]], tab[idx[6]+1], tab[idx[7]], tab[idx[7]+1]); +} +inline v_int8x16 v_lut_quads(const schar* tab, const int* idx) +{ + return v_int8x16(tab[idx[0]], tab[idx[0]+1], tab[idx[0]+2], tab[idx[0]+3], tab[idx[1]], tab[idx[1]+1], tab[idx[1]+2], tab[idx[1]+3], + tab[idx[2]], tab[idx[2]+1], tab[idx[2]+2], tab[idx[2]+3], tab[idx[3]], tab[idx[3]+1], tab[idx[3]+2], tab[idx[3]+3]); +} +inline v_uint8x16 v_lut(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut((const schar *)tab, idx)); } +inline v_uint8x16 v_lut_pairs(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut_pairs((const schar *)tab, idx)); } +inline v_uint8x16 v_lut_quads(const uchar* tab, const int* idx) { return 
v_reinterpret_as_u8(v_lut_quads((const schar *)tab, idx)); } + +inline v_int16x8 v_lut(const short* tab, const int* idx) +{ + return v_int16x8(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]], + tab[idx[4]], tab[idx[5]], tab[idx[6]], tab[idx[7]]); +} +inline v_int16x8 v_lut_pairs(const short* tab, const int* idx) +{ + return v_int16x8(tab[idx[0]], tab[idx[0]+1], tab[idx[1]], tab[idx[1]+1], + tab[idx[2]], tab[idx[2]+1], tab[idx[3]], tab[idx[3]+1]); +} +inline v_int16x8 v_lut_quads(const short* tab, const int* idx) +{ + return v_int16x8(tab[idx[0]], tab[idx[0]+1], tab[idx[0]+2], tab[idx[0]+3], + tab[idx[1]], tab[idx[1]+1], tab[idx[1]+2], tab[idx[1]+3]); +} +inline v_uint16x8 v_lut(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut((const short *)tab, idx)); } +inline v_uint16x8 v_lut_pairs(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut_pairs((const short *)tab, idx)); } +inline v_uint16x8 v_lut_quads(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut_quads((const short *)tab, idx)); } + +inline v_int32x4 v_lut(const int* tab, const int* idx) +{ + return v_int32x4(tab[idx[0]], tab[idx[1]], + tab[idx[2]], tab[idx[3]]); +} +inline v_int32x4 v_lut_pairs(const int* tab, const int* idx) +{ + return v_int32x4(tab[idx[0]], tab[idx[0]+1], + tab[idx[1]], tab[idx[1]+1]); +} +inline v_int32x4 v_lut_quads(const int* tab, const int* idx) +{ + return v_int32x4(wasm_v128_load(tab + idx[0])); +} +inline v_uint32x4 v_lut(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut((const int *)tab, idx)); } +inline v_uint32x4 v_lut_pairs(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut_pairs((const int *)tab, idx)); } +inline v_uint32x4 v_lut_quads(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut_quads((const int *)tab, idx)); } + +inline v_int64x2 v_lut(const int64_t* tab, const int* idx) +{ + return v_int64x2(tab[idx[0]], tab[idx[1]]); +} +inline v_int64x2 v_lut_pairs(const int64_t* tab, const int* idx) +{ + return v_int64x2(wasm_v128_load(tab + idx[0])); +} +inline v_uint64x2 v_lut(const uint64_t* tab, const int* idx) { return v_reinterpret_as_u64(v_lut((const int64_t *)tab, idx)); } +inline v_uint64x2 v_lut_pairs(const uint64_t* tab, const int* idx) { return v_reinterpret_as_u64(v_lut_pairs((const int64_t *)tab, idx)); } + +inline v_float32x4 v_lut(const float* tab, const int* idx) +{ + return v_float32x4(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]); +} +inline v_float32x4 v_lut_pairs(const float* tab, const int* idx) { return v_reinterpret_as_f32(v_lut_pairs((const int *)tab, idx)); } +inline v_float32x4 v_lut_quads(const float* tab, const int* idx) { return v_reinterpret_as_f32(v_lut_quads((const int *)tab, idx)); } + +inline v_float64x2 v_lut(const double* tab, const int* idx) +{ + return v_float64x2(tab[idx[0]], tab[idx[1]]); +} +inline v_float64x2 v_lut_pairs(const double* tab, const int* idx) +{ + return v_float64x2(wasm_v128_load(tab + idx[0])); +} + +inline v_int32x4 v_lut(const int* tab, const v_int32x4& idxvec) +{ + return v_int32x4(tab[wasm_i32x4_extract_lane(idxvec.val, 0)], + tab[wasm_i32x4_extract_lane(idxvec.val, 1)], + tab[wasm_i32x4_extract_lane(idxvec.val, 2)], + tab[wasm_i32x4_extract_lane(idxvec.val, 3)]); +} + +inline v_uint32x4 v_lut(const unsigned* tab, const v_int32x4& idxvec) +{ + return v_reinterpret_as_u32(v_lut((const int *)tab, idxvec)); +} + +inline v_float32x4 v_lut(const float* tab, const v_int32x4& idxvec) +{ + return 
v_float32x4(tab[wasm_i32x4_extract_lane(idxvec.val, 0)],
+                      tab[wasm_i32x4_extract_lane(idxvec.val, 1)],
+                      tab[wasm_i32x4_extract_lane(idxvec.val, 2)],
+                      tab[wasm_i32x4_extract_lane(idxvec.val, 3)]);
+}
+
+inline v_float64x2 v_lut(const double* tab, const v_int32x4& idxvec)
+{
+    return v_float64x2(tab[wasm_i32x4_extract_lane(idxvec.val, 0)],
+                       tab[wasm_i32x4_extract_lane(idxvec.val, 1)]);
+}
+
+// loads pairs from the table and deinterleaves them, e.g. returns:
+//   x = (tab[idxvec[0]], tab[idxvec[1]], tab[idxvec[2]], tab[idxvec[3]]),
+//   y = (tab[idxvec[0]+1], tab[idxvec[1]+1], tab[idxvec[2]+1], tab[idxvec[3]+1])
+// note that the indices are float's indices, not the float-pair indices.
+// in theory, this function can be used to implement bilinear interpolation,
+// when idxvec are the offsets within the image
+// (see the illustrative sketch after v_pack_triplets(v_int8x16) below).
+inline void v_lut_deinterleave(const float* tab, const v_int32x4& idxvec, v_float32x4& x, v_float32x4& y)
+{
+    x = v_float32x4(tab[wasm_i32x4_extract_lane(idxvec.val, 0)],
+                    tab[wasm_i32x4_extract_lane(idxvec.val, 1)],
+                    tab[wasm_i32x4_extract_lane(idxvec.val, 2)],
+                    tab[wasm_i32x4_extract_lane(idxvec.val, 3)]);
+    y = v_float32x4(tab[wasm_i32x4_extract_lane(idxvec.val, 0)+1],
+                    tab[wasm_i32x4_extract_lane(idxvec.val, 1)+1],
+                    tab[wasm_i32x4_extract_lane(idxvec.val, 2)+1],
+                    tab[wasm_i32x4_extract_lane(idxvec.val, 3)+1]);
+}
+
+inline void v_lut_deinterleave(const double* tab, const v_int32x4& idxvec, v_float64x2& x, v_float64x2& y)
+{
+    v128_t xy0 = wasm_v128_load(tab + wasm_i32x4_extract_lane(idxvec.val, 0));
+    v128_t xy1 = wasm_v128_load(tab + wasm_i32x4_extract_lane(idxvec.val, 1));
+    x.val = wasm_unpacklo_i64x2(xy0, xy1);
+    y.val = wasm_unpackhi_i64x2(xy0, xy1);
+}
+
+inline v_int8x16 v_interleave_pairs(const v_int8x16& vec)
+{
+    return v_int8x16(wasm_v8x16_shuffle(vec.val, vec.val, 0,2,1,3,4,6,5,7,8,10,9,11,12,14,13,15));
+}
+inline v_uint8x16 v_interleave_pairs(const v_uint8x16& vec) { return v_reinterpret_as_u8(v_interleave_pairs(v_reinterpret_as_s8(vec))); }
+inline v_int8x16 v_interleave_quads(const v_int8x16& vec)
+{
+    return v_int8x16(wasm_v8x16_shuffle(vec.val, vec.val, 0,4,1,5,2,6,3,7,8,12,9,13,10,14,11,15));
+}
+inline v_uint8x16 v_interleave_quads(const v_uint8x16& vec) { return v_reinterpret_as_u8(v_interleave_quads(v_reinterpret_as_s8(vec))); }
+
+inline v_int16x8 v_interleave_pairs(const v_int16x8& vec)
+{
+    return v_int16x8(wasm_v8x16_shuffle(vec.val, vec.val, 0,1,4,5,2,3,6,7,8,9,12,13,10,11,14,15));
+}
+inline v_uint16x8 v_interleave_pairs(const v_uint16x8& vec) { return v_reinterpret_as_u16(v_interleave_pairs(v_reinterpret_as_s16(vec))); }
+inline v_int16x8 v_interleave_quads(const v_int16x8& vec)
+{
+    return v_int16x8(wasm_v8x16_shuffle(vec.val, vec.val, 0,1,8,9,2,3,10,11,4,5,12,13,6,7,14,15));
+}
+inline v_uint16x8 v_interleave_quads(const v_uint16x8& vec) { return v_reinterpret_as_u16(v_interleave_quads(v_reinterpret_as_s16(vec))); }
+
+inline v_int32x4 v_interleave_pairs(const v_int32x4& vec)
+{
+    return v_int32x4(wasm_v8x16_shuffle(vec.val, vec.val, 0,1,2,3,8,9,10,11,4,5,6,7,12,13,14,15));
+}
+inline v_uint32x4 v_interleave_pairs(const v_uint32x4& vec) { return v_reinterpret_as_u32(v_interleave_pairs(v_reinterpret_as_s32(vec))); }
+inline v_float32x4 v_interleave_pairs(const v_float32x4& vec)
+{
+    return v_float32x4(wasm_v8x16_shuffle(vec.val, vec.val, 0,1,2,3,8,9,10,11,4,5,6,7,12,13,14,15));
+}
+
+inline v_int8x16 v_pack_triplets(const v_int8x16& vec)
+{
+    return v_int8x16(wasm_v8x16_shuffle(vec.val, vec.val, 0,1,2,4,5,6,8,9,10,12,13,14,16,16,16,16));
+}
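The bilinear-interpolation remark above v_lut_deinterleave is easier to see with a concrete caller. The sketch below is editorial and illustrative only, not part of the upstream header; it assumes the v_float32x4 arithmetic operators defined earlier in this file, and the names interp_row_sketch, src, ofs and wx are hypothetical.

    // Illustrative sketch (not upstream code): horizontal linear interpolation for
    // four pixels at once. 'ofs' holds the element offsets of the left neighbours in
    // 'src'; 'wx' holds the fractional x weights in [0, 1).
    static inline v_float32x4 interp_row_sketch(const float* src, const v_int32x4& ofs, const v_float32x4& wx)
    {
        v_float32x4 p0, p1;
        v_lut_deinterleave(src, ofs, p0, p1);   // p0[i] = src[ofs[i]], p1[i] = src[ofs[i] + 1]
        return p0 + (p1 - p0) * wx;             // lerp between the two horizontal neighbours
    }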
+inline v_uint8x16 v_pack_triplets(const v_uint8x16& vec) { return v_reinterpret_as_u8(v_pack_triplets(v_reinterpret_as_s8(vec))); }
+
+inline v_int16x8 v_pack_triplets(const v_int16x8& vec)
+{
+    return v_int16x8(wasm_v8x16_shuffle(vec.val, vec.val, 0,1,2,3,4,5,8,9,10,11,12,13,14,15,6,7));
+}
+inline v_uint16x8 v_pack_triplets(const v_uint16x8& vec) { return v_reinterpret_as_u16(v_pack_triplets(v_reinterpret_as_s16(vec))); }
+
+inline v_int32x4 v_pack_triplets(const v_int32x4& vec) { return vec; }
+inline v_uint32x4 v_pack_triplets(const v_uint32x4& vec) { return vec; }
+inline v_float32x4 v_pack_triplets(const v_float32x4& vec) { return vec; }
+
+template<int i, typename _Tp>
+inline typename _Tp::lane_type v_extract_n(const _Tp& a)
+{
+    return v_rotate_right<i>(a).get0();
+}
+
+template<int i>
+inline v_uint32x4 v_broadcast_element(const v_uint32x4& a)
+{
+    return v_setall_u32(v_extract_n<i>(a));
+}
+template<int i>
+inline v_int32x4 v_broadcast_element(const v_int32x4& a)
+{
+    return v_setall_s32(v_extract_n<i>(a));
+}
+template<int i>
+inline v_float32x4 v_broadcast_element(const v_float32x4& a)
+{
+    return v_setall_f32(v_extract_n<i>(a));
+}
+
+
+////////////// FP16 support ///////////////////////////
+
+inline v_float32x4 v_load_expand(const float16_t* ptr)
+{
+    float a[4];
+    for (int i = 0; i < 4; i++)
+        a[i] = ptr[i];
+    return v_float32x4(wasm_v128_load(a));
+}
+
+inline void v_pack_store(float16_t* ptr, const v_float32x4& v)
+{
+    float v_[4];
+    wasm_v128_store(v_, v.val);
+    ptr[0] = float16_t(v_[0]);
+    ptr[1] = float16_t(v_[1]);
+    ptr[2] = float16_t(v_[2]);
+    ptr[3] = float16_t(v_[3]);
+}
+
+inline void v_cleanup() {}
+
+CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END
+
+//! @endcond
+
+}
+
+#endif
diff --git a/3rdparty/opencv/include/opencv2/core/hal/msa_macros.h b/3rdparty/opencv/include/opencv2/core/hal/msa_macros.h
new file mode 100644
index 00000000..bd6ddb12
--- /dev/null
+++ b/3rdparty/opencv/include/opencv2/core/hal/msa_macros.h
@@ -0,0 +1,1558 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_CORE_HAL_MSA_MACROS_H
+#define OPENCV_CORE_HAL_MSA_MACROS_H
+
+#ifdef __mips_msa
+#include "msa.h"
+#include <stdint.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* Define 64 bits vector types */
+typedef signed char v8i8 __attribute__ ((vector_size(8), aligned(8)));
+typedef unsigned char v8u8 __attribute__ ((vector_size(8), aligned(8)));
+typedef short v4i16 __attribute__ ((vector_size(8), aligned(8)));
+typedef unsigned short v4u16 __attribute__ ((vector_size(8), aligned(8)));
+typedef int v2i32 __attribute__ ((vector_size(8), aligned(8)));
+typedef unsigned int v2u32 __attribute__ ((vector_size(8), aligned(8)));
+typedef long long v1i64 __attribute__ ((vector_size(8), aligned(8)));
+typedef unsigned long long v1u64 __attribute__ ((vector_size(8), aligned(8)));
+typedef float v2f32 __attribute__ ((vector_size(8), aligned(8)));
+typedef double v1f64 __attribute__ ((vector_size(8), aligned(8)));
+
+
+/* Load values from the given memory address to a 64-bit vector.
*/ +#define msa_ld1_s8(__a) (*((v8i8*)(__a))) +#define msa_ld1_s16(__a) (*((v4i16*)(__a))) +#define msa_ld1_s32(__a) (*((v2i32*)(__a))) +#define msa_ld1_s64(__a) (*((v1i64*)(__a))) +#define msa_ld1_u8(__a) (*((v8u8*)(__a))) +#define msa_ld1_u16(__a) (*((v4u16*)(__a))) +#define msa_ld1_u32(__a) (*((v2u32*)(__a))) +#define msa_ld1_u64(__a) (*((v1u64*)(__a))) +#define msa_ld1_f32(__a) (*((v2f32*)(__a))) +#define msa_ld1_f64(__a) (*((v1f64*)(__a))) + +/* Load values from the given memory address to a 128-bit vector */ +#define msa_ld1q_s8(__a) ((v16i8)__builtin_msa_ld_b(__a, 0)) +#define msa_ld1q_s16(__a) ((v8i16)__builtin_msa_ld_h(__a, 0)) +#define msa_ld1q_s32(__a) ((v4i32)__builtin_msa_ld_w(__a, 0)) +#define msa_ld1q_s64(__a) ((v2i64)__builtin_msa_ld_d(__a, 0)) +#define msa_ld1q_u8(__a) ((v16u8)__builtin_msa_ld_b(__a, 0)) +#define msa_ld1q_u16(__a) ((v8u16)__builtin_msa_ld_h(__a, 0)) +#define msa_ld1q_u32(__a) ((v4u32)__builtin_msa_ld_w(__a, 0)) +#define msa_ld1q_u64(__a) ((v2u64)__builtin_msa_ld_d(__a, 0)) +#define msa_ld1q_f32(__a) ((v4f32)__builtin_msa_ld_w(__a, 0)) +#define msa_ld1q_f64(__a) ((v2f64)__builtin_msa_ld_d(__a, 0)) + +/* Store 64bits vector elements values to the given memory address. */ +#define msa_st1_s8(__a, __b) (*((v8i8*)(__a)) = __b) +#define msa_st1_s16(__a, __b) (*((v4i16*)(__a)) = __b) +#define msa_st1_s32(__a, __b) (*((v2i32*)(__a)) = __b) +#define msa_st1_s64(__a, __b) (*((v1i64*)(__a)) = __b) +#define msa_st1_u8(__a, __b) (*((v8u8*)(__a)) = __b) +#define msa_st1_u16(__a, __b) (*((v4u16*)(__a)) = __b) +#define msa_st1_u32(__a, __b) (*((v2u32*)(__a)) = __b) +#define msa_st1_u64(__a, __b) (*((v1u64*)(__a)) = __b) +#define msa_st1_f32(__a, __b) (*((v2f32*)(__a)) = __b) +#define msa_st1_f64(__a, __b) (*((v1f64*)(__a)) = __b) + +/* Store the values of elements in the 128 bits vector __a to the given memory address __a. */ +#define msa_st1q_s8(__a, __b) (__builtin_msa_st_b((v16i8)(__b), __a, 0)) +#define msa_st1q_s16(__a, __b) (__builtin_msa_st_h((v8i16)(__b), __a, 0)) +#define msa_st1q_s32(__a, __b) (__builtin_msa_st_w((v4i32)(__b), __a, 0)) +#define msa_st1q_s64(__a, __b) (__builtin_msa_st_d((v2i64)(__b), __a, 0)) +#define msa_st1q_u8(__a, __b) (__builtin_msa_st_b((v16i8)(__b), __a, 0)) +#define msa_st1q_u16(__a, __b) (__builtin_msa_st_h((v8i16)(__b), __a, 0)) +#define msa_st1q_u32(__a, __b) (__builtin_msa_st_w((v4i32)(__b), __a, 0)) +#define msa_st1q_u64(__a, __b) (__builtin_msa_st_d((v2i64)(__b), __a, 0)) +#define msa_st1q_f32(__a, __b) (__builtin_msa_st_w((v4i32)(__b), __a, 0)) +#define msa_st1q_f64(__a, __b) (__builtin_msa_st_d((v2i64)(__b), __a, 0)) + +/* Store the value of the element with the index __c in vector __a to the given memory address __a. 
*/ +#define msa_st1_lane_s8(__a, __b, __c) (*((int8_t*)(__a)) = __b[__c]) +#define msa_st1_lane_s16(__a, __b, __c) (*((int16_t*)(__a)) = __b[__c]) +#define msa_st1_lane_s32(__a, __b, __c) (*((int32_t*)(__a)) = __b[__c]) +#define msa_st1_lane_s64(__a, __b, __c) (*((int64_t*)(__a)) = __b[__c]) +#define msa_st1_lane_u8(__a, __b, __c) (*((uint8_t*)(__a)) = __b[__c]) +#define msa_st1_lane_u16(__a, __b, __c) (*((uint16_t*)(__a)) = __b[__c]) +#define msa_st1_lane_u32(__a, __b, __c) (*((uint32_t*)(__a)) = __b[__c]) +#define msa_st1_lane_u64(__a, __b, __c) (*((uint64_t*)(__a)) = __b[__c]) +#define msa_st1_lane_f32(__a, __b, __c) (*((float*)(__a)) = __b[__c]) +#define msa_st1_lane_f64(__a, __b, __c) (*((double*)(__a)) = __b[__c]) +#define msa_st1q_lane_s8(__a, __b, __c) (*((int8_t*)(__a)) = (int8_t)__builtin_msa_copy_s_b(__b, __c)) +#define msa_st1q_lane_s16(__a, __b, __c) (*((int16_t*)(__a)) = (int16_t)__builtin_msa_copy_s_h(__b, __c)) +#define msa_st1q_lane_s32(__a, __b, __c) (*((int32_t*)(__a)) = __builtin_msa_copy_s_w(__b, __c)) +#define msa_st1q_lane_s64(__a, __b, __c) (*((int64_t*)(__a)) = __builtin_msa_copy_s_d(__b, __c)) +#define msa_st1q_lane_u8(__a, __b, __c) (*((uint8_t*)(__a)) = (uint8_t)__builtin_msa_copy_u_b((v16i8)(__b), __c)) +#define msa_st1q_lane_u16(__a, __b, __c) (*((uint16_t*)(__a)) = (uint16_t)__builtin_msa_copy_u_h((v8i16)(__b), __c)) +#define msa_st1q_lane_u32(__a, __b, __c) (*((uint32_t*)(__a)) = __builtin_msa_copy_u_w((v4i32)(__b), __c)) +#define msa_st1q_lane_u64(__a, __b, __c) (*((uint64_t*)(__a)) = __builtin_msa_copy_u_d((v2i64)(__b), __c)) +#define msa_st1q_lane_f32(__a, __b, __c) (*((float*)(__a)) = __b[__c]) +#define msa_st1q_lane_f64(__a, __b, __c) (*((double*)(__a)) = __b[__c]) + +/* Duplicate elements for 64-bit doubleword vectors */ +#define msa_dup_n_s8(__a) ((v8i8)__builtin_msa_copy_s_d((v2i64)__builtin_msa_fill_b((int32_t)(__a)), 0)) +#define msa_dup_n_s16(__a) ((v4i16)__builtin_msa_copy_s_d((v2i64)__builtin_msa_fill_h((int32_t)(__a)), 0)) +#define msa_dup_n_s32(__a) ((v2i32){__a, __a}) +#define msa_dup_n_s64(__a) ((v1i64){__a}) +#define msa_dup_n_u8(__a) ((v8u8)__builtin_msa_copy_u_d((v2i64)__builtin_msa_fill_b((int32_t)(__a)), 0)) +#define msa_dup_n_u16(__a) ((v4u16)__builtin_msa_copy_u_d((v2i64)__builtin_msa_fill_h((int32_t)(__a)), 0)) +#define msa_dup_n_u32(__a) ((v2u32){__a, __a}) +#define msa_dup_n_u64(__a) ((v1u64){__a}) +#define msa_dup_n_f32(__a) ((v2f32){__a, __a}) +#define msa_dup_n_f64(__a) ((v1f64){__a}) + +/* Duplicate elements for 128-bit quadword vectors */ +#define msa_dupq_n_s8(__a) (__builtin_msa_fill_b((int32_t)(__a))) +#define msa_dupq_n_s16(__a) (__builtin_msa_fill_h((int32_t)(__a))) +#define msa_dupq_n_s32(__a) (__builtin_msa_fill_w((int32_t)(__a))) +#define msa_dupq_n_s64(__a) (__builtin_msa_fill_d((int64_t)(__a))) +#define msa_dupq_n_u8(__a) ((v16u8)__builtin_msa_fill_b((int32_t)(__a))) +#define msa_dupq_n_u16(__a) ((v8u16)__builtin_msa_fill_h((int32_t)(__a))) +#define msa_dupq_n_u32(__a) ((v4u32)__builtin_msa_fill_w((int32_t)(__a))) +#define msa_dupq_n_u64(__a) ((v2u64)__builtin_msa_fill_d((int64_t)(__a))) +#define msa_dupq_n_f32(__a) ((v4f32){__a, __a, __a, __a}) +#define msa_dupq_n_f64(__a) ((v2f64){__a, __a}) +#define msa_dupq_lane_s8(__a, __b) (__builtin_msa_splat_b(__a, __b)) +#define msa_dupq_lane_s16(__a, __b) (__builtin_msa_splat_h(__a, __b)) +#define msa_dupq_lane_s32(__a, __b) (__builtin_msa_splat_w(__a, __b)) +#define msa_dupq_lane_s64(__a, __b) (__builtin_msa_splat_d(__a, __b)) +#define msa_dupq_lane_u8(__a, __b) 
((v16u8)__builtin_msa_splat_b((v16i8)(__a), __b))
+#define msa_dupq_lane_u16(__a, __b) ((v8u16)__builtin_msa_splat_h((v8i16)(__a), __b))
+#define msa_dupq_lane_u32(__a, __b) ((v4u32)__builtin_msa_splat_w((v4i32)(__a), __b))
+#define msa_dupq_lane_u64(__a, __b) ((v2u64)__builtin_msa_splat_d((v2i64)(__a), __b))
+
+/* Create a 64 bits vector */
+#define msa_create_s8(__a)  ((v8i8)((uint64_t)(__a)))
+#define msa_create_s16(__a) ((v4i16)((uint64_t)(__a)))
+#define msa_create_s32(__a) ((v2i32)((uint64_t)(__a)))
+#define msa_create_s64(__a) ((v1i64)((uint64_t)(__a)))
+#define msa_create_u8(__a)  ((v8u8)((uint64_t)(__a)))
+#define msa_create_u16(__a) ((v4u16)((uint64_t)(__a)))
+#define msa_create_u32(__a) ((v2u32)((uint64_t)(__a)))
+#define msa_create_u64(__a) ((v1u64)((uint64_t)(__a)))
+#define msa_create_f32(__a) ((v2f32)((uint64_t)(__a)))
+#define msa_create_f64(__a) ((v1f64)((uint64_t)(__a)))
+
+/* Sign extends or zero extends each element in a 64 bits vector to twice its original length, and places the results in a 128 bits vector. */
+/*Transform v8i8 to v8i16*/
+#define msa_movl_s8(__a) \
+((v8i16){(__a)[0], (__a)[1], (__a)[2], (__a)[3], \
+         (__a)[4], (__a)[5], (__a)[6], (__a)[7]})
+
+/*Transform v8u8 to v8u16*/
+#define msa_movl_u8(__a) \
+((v8u16){(__a)[0], (__a)[1], (__a)[2], (__a)[3], \
+         (__a)[4], (__a)[5], (__a)[6], (__a)[7]})
+
+/*Transform v4i16 to v4i32*/
+#define msa_movl_s16(__a) ((v4i32){(__a)[0], (__a)[1], (__a)[2], (__a)[3]})
+
+/*Transform v2i32 to v2i64*/
+#define msa_movl_s32(__a) ((v2i64){(__a)[0], (__a)[1]})
+
+/*Transform v4u16 to v4u32*/
+#define msa_movl_u16(__a) ((v4u32){(__a)[0], (__a)[1], (__a)[2], (__a)[3]})
+
+/*Transform v2u32 to v2u64*/
+#define msa_movl_u32(__a) ((v2u64){(__a)[0], (__a)[1]})
+
+/* Copies the least significant half of each element of a 128 bits vector into the corresponding elements of a 64 bits vector.
*/ +#define msa_movn_s16(__a) \ +({ \ + v16i8 __d = __builtin_msa_pckev_b(__builtin_msa_fill_b(0), (v16i8)(__a)); \ + (v8i8)__builtin_msa_copy_s_d((v2i64)__d, 0); \ +}) + +#define msa_movn_s32(__a) \ +({ \ + v8i16 __d = __builtin_msa_pckev_h(__builtin_msa_fill_h(0), (v8i16)(__a)); \ + (v4i16)__builtin_msa_copy_s_d((v2i64)__d, 0); \ +}) + +#define msa_movn_s64(__a) \ +({ \ + v4i32 __d = __builtin_msa_pckev_w(__builtin_msa_fill_w(0), (v4i32)(__a)); \ + (v2i32)__builtin_msa_copy_s_d((v2i64)__d, 0); \ +}) + +#define msa_movn_u16(__a) \ +({ \ + v16i8 __d = __builtin_msa_pckev_b(__builtin_msa_fill_b(0), (v16i8)(__a)); \ + (v8u8)__builtin_msa_copy_u_d((v2i64)__d, 0); \ +}) + +#define msa_movn_u32(__a) \ +({ \ + v8i16 __d = __builtin_msa_pckev_h(__builtin_msa_fill_h(0), (v8i16)(__a)); \ + (v4u16)__builtin_msa_copy_u_d((v2i64)__d, 0); \ +}) + +#define msa_movn_u64(__a) \ +({ \ + v4i32 __d = __builtin_msa_pckev_w(__builtin_msa_fill_w(0), (v4i32)(__a)); \ + (v2u32)__builtin_msa_copy_u_d((v2i64)__d, 0); \ +}) + +/* qmovn */ +#define msa_qmovn_s16(__a) \ +({ \ + v16i8 __d = __builtin_msa_pckev_b(__builtin_msa_fill_b(0), (v16i8)__builtin_msa_sat_s_h((v8i16)(__a), 7)); \ + (v8i8)__builtin_msa_copy_s_d((v2i64)__d, 0); \ +}) + +#define msa_qmovn_s32(__a) \ +({ \ + v8i16 __d = __builtin_msa_pckev_h(__builtin_msa_fill_h(0), (v8i16)__builtin_msa_sat_s_w((v4i32)(__a), 15)); \ + (v4i16)__builtin_msa_copy_s_d((v2i64)__d, 0); \ +}) + +#define msa_qmovn_s64(__a) \ +({ \ + v4i32 __d = __builtin_msa_pckev_w(__builtin_msa_fill_w(0), (v4i32)__builtin_msa_sat_s_d((v2i64)(__a), 31)); \ + (v2i32)__builtin_msa_copy_s_d((v2i64)__d, 0); \ +}) + +#define msa_qmovn_u16(__a) \ +({ \ + v16i8 __d = __builtin_msa_pckev_b(__builtin_msa_fill_b(0), (v16i8)__builtin_msa_sat_u_h((v8u16)(__a), 7)); \ + (v8u8)__builtin_msa_copy_u_d((v2i64)__d, 0); \ +}) + +#define msa_qmovn_u32(__a) \ +({ \ + v8i16 __d = __builtin_msa_pckev_h(__builtin_msa_fill_h(0), (v8i16)__builtin_msa_sat_u_w((v4u32)(__a), 15)); \ + (v4u16)__builtin_msa_copy_u_d((v2i64)__d, 0); \ +}) + +#define msa_qmovn_u64(__a) \ +({ \ + v4i32 __d = __builtin_msa_pckev_w(__builtin_msa_fill_w(0), (v4i32)__builtin_msa_sat_u_d((v2u64)(__a), 31)); \ + (v2u32)__builtin_msa_copy_u_d((v2i64)__d, 0); \ +}) + +/* qmovun */ +#define msa_qmovun_s16(__a) \ +({ \ + v8i16 __d = __builtin_msa_max_s_h(__builtin_msa_fill_h(0), (v8i16)(__a)); \ + v16i8 __e = __builtin_msa_pckev_b(__builtin_msa_fill_b(0), (v16i8)__builtin_msa_sat_u_h((v8u16)__d, 7)); \ + (v8u8)__builtin_msa_copy_u_d((v2i64)__e, 0); \ +}) + +#define msa_qmovun_s32(__a) \ +({ \ + v4i32 __d = __builtin_msa_max_s_w(__builtin_msa_fill_w(0), (v4i32)(__a)); \ + v8i16 __e = __builtin_msa_pckev_h(__builtin_msa_fill_h(0), (v8i16)__builtin_msa_sat_u_w((v4u32)__d, 15)); \ + (v4u16)__builtin_msa_copy_u_d((v2i64)__e, 0); \ +}) + +#define msa_qmovun_s64(__a) \ +({ \ + v2i64 __d = __builtin_msa_max_s_d(__builtin_msa_fill_d(0), (v2i64)(__a)); \ + v4i32 __e = __builtin_msa_pckev_w(__builtin_msa_fill_w(0), (v4i32)__builtin_msa_sat_u_d((v2u64)__d, 31)); \ + (v2u32)__builtin_msa_copy_u_d((v2i64)__e, 0); \ +}) + +/* Right shift elements in a 128 bits vector by an immediate value, and places the results in a 64 bits vector. 
*/ +#define msa_shrn_n_s16(__a, __b) \ +({ \ + v16i8 __d = __builtin_msa_pckev_b(__builtin_msa_fill_b(0), (v16i8)__builtin_msa_srai_h((v8i16)(__a), (int)(__b))); \ + (v8i8)__builtin_msa_copy_s_d((v2i64)__d, 0); \ +}) + +#define msa_shrn_n_s32(__a, __b) \ +({ \ + v8i16 __d = __builtin_msa_pckev_h(__builtin_msa_fill_h(0), (v8i16)__builtin_msa_srai_w((v4i32)(__a), (int)(__b))); \ + (v4i16)__builtin_msa_copy_s_d((v2i64)__d, 0); \ +}) + +#define msa_shrn_n_s64(__a, __b) \ +({ \ + v4i32 __d = __builtin_msa_pckev_w(__builtin_msa_fill_w(0), (v4i32)__builtin_msa_srai_d((v2i64)(__a), (int)(__b))); \ + (v2i32)__builtin_msa_copy_s_d((v2i64)__d, 0); \ +}) + +#define msa_shrn_n_u16(__a, __b) \ +({ \ + v16i8 __d = __builtin_msa_pckev_b(__builtin_msa_fill_b(0), (v16i8)__builtin_msa_srli_h((v8i16)(__a), (int)(__b))); \ + (v8u8)__builtin_msa_copy_u_d((v2i64)__d, 0); \ +}) + +#define msa_shrn_n_u32(__a, __b) \ +({ \ + v8i16 __d = __builtin_msa_pckev_h(__builtin_msa_fill_h(0), (v8i16)__builtin_msa_srli_w((v4i32)(__a), (int)(__b))); \ + (v4u16)__builtin_msa_copy_u_d((v2i64)__d, 0); \ +}) + +#define msa_shrn_n_u64(__a, __b) \ +({ \ + v4i32 __d = __builtin_msa_pckev_w(__builtin_msa_fill_w(0), (v4i32)__builtin_msa_srli_d((v2i64)(__a), (int)(__b))); \ + (v2u32)__builtin_msa_copy_u_d((v2i64)__d, 0); \ +}) + +/* Right shift elements in a 128 bits vector by an immediate value, and places the results in a 64 bits vector. */ +#define msa_rshrn_n_s16(__a, __b) \ +({ \ + v16i8 __d = __builtin_msa_pckev_b(__builtin_msa_fill_b(0), (v16i8)__builtin_msa_srari_h((v8i16)(__a), (int)__b)); \ + (v8i8)__builtin_msa_copy_s_d((v2i64)__d, 0); \ +}) + +#define msa_rshrn_n_s32(__a, __b) \ +({ \ + v8i16 __d = __builtin_msa_pckev_h(__builtin_msa_fill_h(0), (v8i16)__builtin_msa_srari_w((v4i32)(__a), (int)__b)); \ + (v4i16)__builtin_msa_copy_s_d((v2i64)__d, 0); \ +}) + +#define msa_rshrn_n_s64(__a, __b) \ +({ \ + v4i32 __d = __builtin_msa_pckev_w(__builtin_msa_fill_w(0), (v4i32)__builtin_msa_srari_d((v2i64)(__a), (int)__b)); \ + (v2i32)__builtin_msa_copy_s_d((v2i64)__d, 0); \ +}) + +#define msa_rshrn_n_u16(__a, __b) \ +({ \ + v16i8 __d = __builtin_msa_pckev_b(__builtin_msa_fill_b(0), (v16i8)__builtin_msa_srlri_h((v8i16)(__a), (int)__b)); \ + (v8u8)__builtin_msa_copy_u_d((v2i64)__d, 0); \ +}) + +#define msa_rshrn_n_u32(__a, __b) \ +({ \ + v8i16 __d = __builtin_msa_pckev_h(__builtin_msa_fill_h(0), (v8i16)__builtin_msa_srlri_w((v4i32)(__a), (int)__b)); \ + (v4u16)__builtin_msa_copy_u_d((v2i64)__d, 0); \ +}) + +#define msa_rshrn_n_u64(__a, __b) \ +({ \ + v4i32 __d = __builtin_msa_pckev_w(__builtin_msa_fill_w(0), (v4i32)__builtin_msa_srlri_d((v2i64)(__a), (int)__b)); \ + (v2u32)__builtin_msa_copy_u_d((v2i64)__d, 0); \ +}) + +/* Right shift elements in a 128 bits vector by an immediate value, saturate the results and them in a 64 bits vector. 
*/ +#define msa_qrshrn_n_s16(__a, __b) \ +({ \ + v8i16 __d = __builtin_msa_sat_s_h(__builtin_msa_srari_h((v8i16)(__a), (int)(__b)), 7); \ + v16i8 __e = __builtin_msa_pckev_b(__builtin_msa_fill_b(0), (v16i8)__d); \ + (v8i8)__builtin_msa_copy_s_d((v2i64)__e, 0); \ +}) + +#define msa_qrshrn_n_s32(__a, __b) \ +({ \ + v4i32 __d = __builtin_msa_sat_s_w(__builtin_msa_srari_w((v4i32)(__a), (int)(__b)), 15); \ + v8i16 __e = __builtin_msa_pckev_h(__builtin_msa_fill_h(0), (v8i16)__d); \ + (v4i16)__builtin_msa_copy_s_d((v2i64)__e, 0); \ +}) + +#define msa_qrshrn_n_s64(__a, __b) \ +({ \ + v2i64 __d = __builtin_msa_sat_s_d(__builtin_msa_srari_d((v2i64)(__a), (int)(__b)), 31); \ + v4i32 __e = __builtin_msa_pckev_w(__builtin_msa_fill_w(0), (v4i32)__d); \ + (v2i32)__builtin_msa_copy_s_d((v2i64)__e, 0); \ +}) + +#define msa_qrshrn_n_u16(__a, __b) \ +({ \ + v8u16 __d = __builtin_msa_sat_u_h((v8u16)__builtin_msa_srlri_h((v8i16)(__a), (int)(__b)), 7); \ + v16i8 __e = __builtin_msa_pckev_b(__builtin_msa_fill_b(0), (v16i8)__d); \ + (v8u8)__builtin_msa_copy_u_d((v2i64)__e, 0); \ +}) + +#define msa_qrshrn_n_u32(__a, __b) \ +({ \ + v4u32 __d = __builtin_msa_sat_u_w((v4u32)__builtin_msa_srlri_w((v4i32)(__a), (int)(__b)), 15); \ + v8i16 __e = __builtin_msa_pckev_h(__builtin_msa_fill_h(0), (v8i16)__d); \ + (v4u16)__builtin_msa_copy_u_d((v2i64)__e, 0); \ +}) + +#define msa_qrshrn_n_u64(__a, __b) \ +({ \ + v2u64 __d = __builtin_msa_sat_u_d((v2u64)__builtin_msa_srlri_d((v2i64)(__a), (int)(__b)), 31); \ + v4i32 __e = __builtin_msa_pckev_w(__builtin_msa_fill_w(0), (v4i32)__d); \ + (v2u32)__builtin_msa_copy_u_d((v2i64)__e, 0); \ +}) + +/* Right shift elements in a 128 bits vector by an immediate value, saturate the results and them in a 64 bits vector. + Input is signed and output is unsigned. 
*/ +#define msa_qrshrun_n_s16(__a, __b) \ +({ \ + v8i16 __d = __builtin_msa_srlri_h(__builtin_msa_max_s_h(__builtin_msa_fill_h(0), (v8i16)(__a)), (int)(__b)); \ + v16i8 __e = __builtin_msa_pckev_b(__builtin_msa_fill_b(0), (v16i8)__builtin_msa_sat_u_h((v8u16)__d, 7)); \ + (v8u8)__builtin_msa_copy_u_d((v2i64)__e, 0); \ +}) + +#define msa_qrshrun_n_s32(__a, __b) \ +({ \ + v4i32 __d = __builtin_msa_srlri_w(__builtin_msa_max_s_w(__builtin_msa_fill_w(0), (v4i32)(__a)), (int)(__b)); \ + v8i16 __e = __builtin_msa_pckev_h(__builtin_msa_fill_h(0), (v8i16)__builtin_msa_sat_u_w((v4u32)__d, 15)); \ + (v4u16)__builtin_msa_copy_u_d((v2i64)__e, 0); \ +}) + +#define msa_qrshrun_n_s64(__a, __b) \ +({ \ + v2i64 __d = __builtin_msa_srlri_d(__builtin_msa_max_s_d(__builtin_msa_fill_d(0), (v2i64)(__a)), (int)(__b)); \ + v4i32 __e = __builtin_msa_pckev_w(__builtin_msa_fill_w(0), (v4i32)__builtin_msa_sat_u_d((v2u64)__d, 31)); \ + (v2u32)__builtin_msa_copy_u_d((v2i64)__e, 0); \ +}) + +/* pack */ +#define msa_pack_s16(__a, __b) (__builtin_msa_pckev_b((v16i8)(__b), (v16i8)(__a))) +#define msa_pack_s32(__a, __b) (__builtin_msa_pckev_h((v8i16)(__b), (v8i16)(__a))) +#define msa_pack_s64(__a, __b) (__builtin_msa_pckev_w((v4i32)(__b), (v4i32)(__a))) +#define msa_pack_u16(__a, __b) ((v16u8)__builtin_msa_pckev_b((v16i8)(__b), (v16i8)(__a))) +#define msa_pack_u32(__a, __b) ((v8u16)__builtin_msa_pckev_h((v8i16)(__b), (v8i16)(__a))) +#define msa_pack_u64(__a, __b) ((v4u32)__builtin_msa_pckev_w((v4i32)(__b), (v4i32)(__a))) + +/* qpack */ +#define msa_qpack_s16(__a, __b) \ +(__builtin_msa_pckev_b((v16i8)__builtin_msa_sat_s_h((v8i16)(__b), 7), (v16i8)__builtin_msa_sat_s_h((v8i16)(__a), 7))) +#define msa_qpack_s32(__a, __b) \ +(__builtin_msa_pckev_h((v8i16)__builtin_msa_sat_s_w((v4i32)(__b), 15), (v8i16)__builtin_msa_sat_s_w((v4i32)(__a), 15))) +#define msa_qpack_s64(__a, __b) \ +(__builtin_msa_pckev_w((v4i32)__builtin_msa_sat_s_d((v2i64)(__b), 31), (v4i32)__builtin_msa_sat_s_d((v2i64)(__a), 31))) +#define msa_qpack_u16(__a, __b) \ +((v16u8)__builtin_msa_pckev_b((v16i8)__builtin_msa_sat_u_h((v8u16)(__b), 7), (v16i8)__builtin_msa_sat_u_h((v8u16)(__a), 7))) +#define msa_qpack_u32(__a, __b) \ +((v8u16)__builtin_msa_pckev_h((v8i16)__builtin_msa_sat_u_w((v4u32)(__b), 15), (v8i16)__builtin_msa_sat_u_w((v4u32)(__a), 15))) +#define msa_qpack_u64(__a, __b) \ +((v4u32)__builtin_msa_pckev_w((v4i32)__builtin_msa_sat_u_d((v2u64)(__b), 31), (v4i32)__builtin_msa_sat_u_d((v2u64)(__a), 31))) + +/* qpacku */ +#define msa_qpacku_s16(__a, __b) \ +((v16u8)__builtin_msa_pckev_b((v16i8)__builtin_msa_sat_u_h((v8u16)(__builtin_msa_max_s_h(__builtin_msa_fill_h(0), (v8i16)(__b))), 7), \ + (v16i8)__builtin_msa_sat_u_h((v8u16)(__builtin_msa_max_s_h(__builtin_msa_fill_h(0), (v8i16)(__a))), 7))) +#define msa_qpacku_s32(__a, __b) \ +((v8u16)__builtin_msa_pckev_h((v8i16)__builtin_msa_sat_u_w((v4u32)(__builtin_msa_max_s_w(__builtin_msa_fill_w(0), (v4i32)(__b))), 15), \ + (v8i16)__builtin_msa_sat_u_w((v4u32)(__builtin_msa_max_s_w(__builtin_msa_fill_w(0), (v4i32)(__a))), 15))) +#define msa_qpacku_s64(__a, __b) \ +((v4u32)__builtin_msa_pckev_w((v4i32)__builtin_msa_sat_u_d((v2u64)(__builtin_msa_max_s_d(__builtin_msa_fill_d(0), (v2i64)(__b))), 31), \ + (v4i32)__builtin_msa_sat_u_d((v2u64)(__builtin_msa_max_s_d(__builtin_msa_fill_d(0), (v2i64)(__a))), 31))) + +/* packr */ +#define msa_packr_s16(__a, __b, __c) \ +(__builtin_msa_pckev_b((v16i8)__builtin_msa_srai_h((v8i16)(__b), (int)(__c)), (v16i8)__builtin_msa_srai_h((v8i16)(__a), (int)(__c)))) +#define 
msa_packr_s32(__a, __b, __c) \ +(__builtin_msa_pckev_h((v8i16)__builtin_msa_srai_w((v4i32)(__b), (int)(__c)), (v8i16)__builtin_msa_srai_w((v4i32)(__a), (int)(__c)))) +#define msa_packr_s64(__a, __b, __c) \ +(__builtin_msa_pckev_w((v4i32)__builtin_msa_srai_d((v2i64)(__b), (int)(__c)), (v4i32)__builtin_msa_srai_d((v2i64)(__a), (int)(__c)))) +#define msa_packr_u16(__a, __b, __c) \ +((v16u8)__builtin_msa_pckev_b((v16i8)__builtin_msa_srli_h((v8i16)(__b), (int)(__c)), (v16i8)__builtin_msa_srli_h((v8i16)(__a), (int)(__c)))) +#define msa_packr_u32(__a, __b, __c) \ +((v8u16)__builtin_msa_pckev_h((v8i16)__builtin_msa_srli_w((v4i32)(__b), (int)(__c)), (v8i16)__builtin_msa_srli_w((v4i32)(__a), (int)(__c)))) +#define msa_packr_u64(__a, __b, __c) \ +((v4u32)__builtin_msa_pckev_w((v4i32)__builtin_msa_srli_d((v2i64)(__b), (int)(__c)), (v4i32)__builtin_msa_srli_d((v2i64)(__a), (int)(__c)))) + +/* rpackr */ +#define msa_rpackr_s16(__a, __b, __c) \ +(__builtin_msa_pckev_b((v16i8)__builtin_msa_srari_h((v8i16)(__b), (int)(__c)), (v16i8)__builtin_msa_srari_h((v8i16)(__a), (int)(__c)))) +#define msa_rpackr_s32(__a, __b, __c) \ +(__builtin_msa_pckev_h((v8i16)__builtin_msa_srari_w((v4i32)(__b), (int)(__c)), (v8i16)__builtin_msa_srari_w((v4i32)(__a), (int)(__c)))) +#define msa_rpackr_s64(__a, __b, __c) \ +(__builtin_msa_pckev_w((v4i32)__builtin_msa_srari_d((v2i64)(__b), (int)(__c)), (v4i32)__builtin_msa_srari_d((v2i64)(__a), (int)(__c)))) +#define msa_rpackr_u16(__a, __b, __c) \ +((v16u8)__builtin_msa_pckev_b((v16i8)__builtin_msa_srlri_h((v8i16)(__b), (int)(__c)), (v16i8)__builtin_msa_srlri_h((v8i16)(__a), (int)(__c)))) +#define msa_rpackr_u32(__a, __b, __c) \ +((v8u16)__builtin_msa_pckev_h((v8i16)__builtin_msa_srlri_w((v4i32)(__b), (int)(__c)), (v8i16)__builtin_msa_srlri_w((v4i32)(__a), (int)(__c)))) +#define msa_rpackr_u64(__a, __b, __c) \ +((v4u32)__builtin_msa_pckev_w((v4i32)__builtin_msa_srlri_d((v2i64)(__b), (int)(__c)), (v4i32)__builtin_msa_srlri_d((v2i64)(__a), (int)(__c)))) + +/* qrpackr */ +#define msa_qrpackr_s16(__a, __b, __c) \ +(__builtin_msa_pckev_b((v16i8)__builtin_msa_sat_s_h(__builtin_msa_srari_h((v8i16)(__b), (int)(__c)), 7), \ + (v16i8)__builtin_msa_sat_s_h(__builtin_msa_srari_h((v8i16)(__a), (int)(__c)), 7))) +#define msa_qrpackr_s32(__a, __b, __c) \ +(__builtin_msa_pckev_h((v8i16)__builtin_msa_sat_s_w(__builtin_msa_srari_w((v4i32)(__b), (int)(__c)), 15), \ + (v8i16)__builtin_msa_sat_s_w(__builtin_msa_srari_w((v4i32)(__a), (int)(__c)), 15))) +#define msa_qrpackr_s64(__a, __b, __c) \ +(__builtin_msa_pckev_w((v4i32)__builtin_msa_sat_s_d(__builtin_msa_srari_d((v2i64)(__b), (int)(__c)), 31), \ + (v4i32)__builtin_msa_sat_s_d(__builtin_msa_srari_d((v2i64)(__a), (int)(__c)), 31))) +#define msa_qrpackr_u16(__a, __b, __c) \ +((v16u8)__builtin_msa_pckev_b((v16i8)__builtin_msa_sat_u_h((v8u16)__builtin_msa_srlri_h((v8i16)(__b), (int)(__c)), 7), \ + (v16i8)__builtin_msa_sat_u_h((v8u16)__builtin_msa_srlri_h((v8i16)(__a), (int)(__c)), 7))) +#define msa_qrpackr_u32(__a, __b, __c) \ +((v8u16)__builtin_msa_pckev_h((v8i16)__builtin_msa_sat_u_w((v4u32)__builtin_msa_srlri_w((v4i32)(__b), (int)(__c)), 15), \ + (v8i16)__builtin_msa_sat_u_w((v4u32)__builtin_msa_srlri_w((v4i32)(__a), (int)(__c)), 15))) +#define msa_qrpackr_u64(__a, __b, __c) \ +((v4u32)__builtin_msa_pckev_w((v4i32)__builtin_msa_sat_u_d((v2u64)__builtin_msa_srlri_d((v2i64)(__b), (int)(__c)), 31), \ + (v4i32)__builtin_msa_sat_u_d((v2u64)__builtin_msa_srlri_d((v2i64)(__a), (int)(__c)), 31))) + +/* qrpackru */ +#define msa_qrpackru_s16(__a, __b, __c) \ +({ 
\ + v8i16 __d = __builtin_msa_srlri_h(__builtin_msa_max_s_h(__builtin_msa_fill_h(0), (v8i16)(__a)), (int)(__c)); \ + v8i16 __e = __builtin_msa_srlri_h(__builtin_msa_max_s_h(__builtin_msa_fill_h(0), (v8i16)(__b)), (int)(__c)); \ + (v16u8)__builtin_msa_pckev_b((v16i8)__builtin_msa_sat_u_h((v8u16)__e, 7), (v16i8)__builtin_msa_sat_u_h((v8u16)__d, 7)); \ +}) + +#define msa_qrpackru_s32(__a, __b, __c) \ +({ \ + v4i32 __d = __builtin_msa_srlri_w(__builtin_msa_max_s_w(__builtin_msa_fill_w(0), (v4i32)(__a)), (int)(__c)); \ + v4i32 __e = __builtin_msa_srlri_w(__builtin_msa_max_s_w(__builtin_msa_fill_w(0), (v4i32)(__b)), (int)(__c)); \ + (v8u16)__builtin_msa_pckev_h((v8i16)__builtin_msa_sat_u_w((v4u32)__e, 15), (v8i16)__builtin_msa_sat_u_w((v4u32)__d, 15)); \ +}) + +#define msa_qrpackru_s64(__a, __b, __c) \ +({ \ + v2i64 __d = __builtin_msa_srlri_d(__builtin_msa_max_s_d(__builtin_msa_fill_d(0), (v2i64)(__a)), (int)(__c)); \ + v2i64 __e = __builtin_msa_srlri_d(__builtin_msa_max_s_d(__builtin_msa_fill_d(0), (v2i64)(__b)), (int)(__c)); \ + (v4u32)__builtin_msa_pckev_w((v4i32)__builtin_msa_sat_u_d((v2u64)__e, 31), (v4i32)__builtin_msa_sat_u_d((v2u64)__d, 31)); \ +}) + +/* Minimum values between corresponding elements in the two vectors are written to the returned vector. */ +#define msa_minq_s8(__a, __b) (__builtin_msa_min_s_b(__a, __b)) +#define msa_minq_s16(__a, __b) (__builtin_msa_min_s_h(__a, __b)) +#define msa_minq_s32(__a, __b) (__builtin_msa_min_s_w(__a, __b)) +#define msa_minq_s64(__a, __b) (__builtin_msa_min_s_d(__a, __b)) +#define msa_minq_u8(__a, __b) ((v16u8)__builtin_msa_min_u_b(__a, __b)) +#define msa_minq_u16(__a, __b) ((v8u16)__builtin_msa_min_u_h(__a, __b)) +#define msa_minq_u32(__a, __b) ((v4u32)__builtin_msa_min_u_w(__a, __b)) +#define msa_minq_u64(__a, __b) ((v2u64)__builtin_msa_min_u_d(__a, __b)) +#define msa_minq_f32(__a, __b) (__builtin_msa_fmin_w(__a, __b)) +#define msa_minq_f64(__a, __b) (__builtin_msa_fmin_d(__a, __b)) + +/* Maximum values between corresponding elements in the two vectors are written to the returned vector. */ +#define msa_maxq_s8(__a, __b) (__builtin_msa_max_s_b(__a, __b)) +#define msa_maxq_s16(__a, __b) (__builtin_msa_max_s_h(__a, __b)) +#define msa_maxq_s32(__a, __b) (__builtin_msa_max_s_w(__a, __b)) +#define msa_maxq_s64(__a, __b) (__builtin_msa_max_s_d(__a, __b)) +#define msa_maxq_u8(__a, __b) ((v16u8)__builtin_msa_max_u_b(__a, __b)) +#define msa_maxq_u16(__a, __b) ((v8u16)__builtin_msa_max_u_h(__a, __b)) +#define msa_maxq_u32(__a, __b) ((v4u32)__builtin_msa_max_u_w(__a, __b)) +#define msa_maxq_u64(__a, __b) ((v2u64)__builtin_msa_max_u_d(__a, __b)) +#define msa_maxq_f32(__a, __b) (__builtin_msa_fmax_w(__a, __b)) +#define msa_maxq_f64(__a, __b) (__builtin_msa_fmax_d(__a, __b)) + +/* Vector type reinterpretion */ +#define MSA_TPV_REINTERPRET(_Tpv, Vec) ((_Tpv)(Vec)) + +/* Add the odd elements in vector __a with the even elements in vector __b to double width elements in the returned vector. */ +/* v8i16 msa_hadd_s16 ((v16i8)__a, (v16i8)__b) */ +#define msa_hadd_s16(__a, __b) (__builtin_msa_hadd_s_h((v16i8)(__a), (v16i8)(__b))) +/* v4i32 msa_hadd_s32 ((v8i16)__a, (v8i16)__b) */ +#define msa_hadd_s32(__a, __b) (__builtin_msa_hadd_s_w((v8i16)(__a), (v8i16)(__b))) +/* v2i64 msa_hadd_s64 ((v4i32)__a, (v4i32)__b) */ +#define msa_hadd_s64(__a, __b) (__builtin_msa_hadd_s_d((v4i32)(__a), (v4i32)(__b))) + +/* Copy even elements in __a to the left half and even elements in __b to the right half and return the result vector. 
*/ +#define msa_pckev_s8(__a, __b) (__builtin_msa_pckev_b((v16i8)(__a), (v16i8)(__b))) +#define msa_pckev_s16(__a, __b) (__builtin_msa_pckev_h((v8i16)(__a), (v8i16)(__b))) +#define msa_pckev_s32(__a, __b) (__builtin_msa_pckev_w((v4i32)(__a), (v4i32)(__b))) +#define msa_pckev_s64(__a, __b) (__builtin_msa_pckev_d((v2i64)(__a), (v2i64)(__b))) + +/* Copy even elements in __a to the left half and even elements in __b to the right half and return the result vector. */ +#define msa_pckod_s8(__a, __b) (__builtin_msa_pckod_b((v16i8)(__a), (v16i8)(__b))) +#define msa_pckod_s16(__a, __b) (__builtin_msa_pckod_h((v8i16)(__a), (v8i16)(__b))) +#define msa_pckod_s32(__a, __b) (__builtin_msa_pckod_w((v4i32)(__a), (v4i32)(__b))) +#define msa_pckod_s64(__a, __b) (__builtin_msa_pckod_d((v2i64)(__a), (v2i64)(__b))) + +#ifdef _MIPSEB +#define LANE_IMM0_1(x) (0b1 - ((x) & 0b1)) +#define LANE_IMM0_3(x) (0b11 - ((x) & 0b11)) +#define LANE_IMM0_7(x) (0b111 - ((x) & 0b111)) +#define LANE_IMM0_15(x) (0b1111 - ((x) & 0b1111)) +#else +#define LANE_IMM0_1(x) ((x) & 0b1) +#define LANE_IMM0_3(x) ((x) & 0b11) +#define LANE_IMM0_7(x) ((x) & 0b111) +#define LANE_IMM0_15(x) ((x) & 0b1111) +#endif + +#define msa_get_lane_u8(__a, __b) ((uint8_t)(__a)[LANE_IMM0_7(__b)]) +#define msa_get_lane_s8(__a, __b) ((int8_t)(__a)[LANE_IMM0_7(__b)]) +#define msa_get_lane_u16(__a, __b) ((uint16_t)(__a)[LANE_IMM0_3(__b)]) +#define msa_get_lane_s16(__a, __b) ((int16_t)(__a)[LANE_IMM0_3(__b)]) +#define msa_get_lane_u32(__a, __b) ((uint32_t)(__a)[LANE_IMM0_1(__b)]) +#define msa_get_lane_s32(__a, __b) ((int32_t)(__a)[LANE_IMM0_1(__b)]) +#define msa_get_lane_f32(__a, __b) ((float)(__a)[LANE_IMM0_3(__b)]) +#define msa_get_lane_s64(__a, __b) ((int64_t)(__a)[LANE_IMM0_1(__b)]) +#define msa_get_lane_u64(__a, __b) ((uint64_t)(__a)[LANE_IMM0_1(__b)]) +#define msa_get_lane_f64(__a, __b) ((double)(__a)[LANE_IMM0_1(__b)]) +#define msa_getq_lane_u8(__a, imm0_15) ((uint8_t)__builtin_msa_copy_u_b((v16i8)(__a), imm0_15)) +#define msa_getq_lane_s8(__a, imm0_15) ((int8_t)__builtin_msa_copy_s_b(__a, imm0_15)) +#define msa_getq_lane_u16(__a, imm0_7) ((uint16_t)__builtin_msa_copy_u_h((v8i16)(__a), imm0_7)) +#define msa_getq_lane_s16(__a, imm0_7) ((int16_t)__builtin_msa_copy_s_h(__a, imm0_7)) +#define msa_getq_lane_u32(__a, imm0_3) __builtin_msa_copy_u_w((v4i32)(__a), imm0_3) +#define msa_getq_lane_s32 __builtin_msa_copy_s_w +#define msa_getq_lane_f32(__a, __b) ((float)(__a)[LANE_IMM0_3(__b)]) +#define msa_getq_lane_f64(__a, __b) ((double)(__a)[LANE_IMM0_1(__b)]) +#if (__mips == 64) +#define msa_getq_lane_u64(__a, imm0_1) __builtin_msa_copy_u_d((v2i64)(__a), imm0_1) +#define msa_getq_lane_s64 __builtin_msa_copy_s_d +#else +#define msa_getq_lane_u64(__a, imm0_1) ((uint64_t)(__a)[LANE_IMM0_1(imm0_1)]) +#define msa_getq_lane_s64(__a, imm0_1) ((int64_t)(__a)[LANE_IMM0_1(imm0_1)]) +#endif + +/* combine */ +#if (__mips == 64) +#define __COMBINE_64_64(__TYPE, a, b) ((__TYPE)((v2u64){((v1u64)(a))[0], ((v1u64)(b))[0]})) +#else +#define __COMBINE_64_64(__TYPE, a, b) ((__TYPE)((v4u32){((v2u32)(a))[0], ((v2u32)(a))[1], \ + ((v2u32)(b))[0], ((v2u32)(b))[1]})) +#endif + +/* v16i8 msa_combine_s8 (v8i8 __a, v8i8 __b) */ +#define msa_combine_s8(__a, __b) __COMBINE_64_64(v16i8, __a, __b) + +/* v8i16 msa_combine_s16(v4i16 __a, v4i16 __b) */ +#define msa_combine_s16(__a, __b) __COMBINE_64_64(v8i16, __a, __b) + +/* v4i32 msa_combine_s32(v2i32 __a, v2i32 __b) */ +#define msa_combine_s32(__a, __b) __COMBINE_64_64(v4i32, __a, __b) + +/* v2i64 msa_combine_s64(v1i64 __a, v1i64 __b) */ 
+#define msa_combine_s64(__a, __b) __COMBINE_64_64(v2i64, __a, __b) + +/* v4f32 msa_combine_f32(v2f32 __a, v2f32 __b) */ +#define msa_combine_f32(__a, __b) __COMBINE_64_64(v4f32, __a, __b) + +/* v16u8 msa_combine_u8(v8u8 __a, v8u8 __b) */ +#define msa_combine_u8(__a, __b) __COMBINE_64_64(v16u8, __a, __b) + +/* v8u16 msa_combine_u16(v4u16 __a, v4u16 __b) */ +#define msa_combine_u16(__a, __b) __COMBINE_64_64(v8u16, __a, __b) + +/* v4u32 msa_combine_u32(v2u32 __a, v2u32 __b) */ +#define msa_combine_u32(__a, __b) __COMBINE_64_64(v4u32, __a, __b) + +/* v2u64 msa_combine_u64(v1u64 __a, v1u64 __b) */ +#define msa_combine_u64(__a, __b) __COMBINE_64_64(v2u64, __a, __b) + +/* v2f64 msa_combine_f64(v1f64 __a, v1f64 __b) */ +#define msa_combine_f64(__a, __b) __COMBINE_64_64(v2f64, __a, __b) + +/* get_low, get_high */ +#if (__mips == 64) +#define __GET_LOW(__TYPE, a) ((__TYPE)((v1u64)(__builtin_msa_copy_u_d((v2i64)(a), 0)))) +#define __GET_HIGH(__TYPE, a) ((__TYPE)((v1u64)(__builtin_msa_copy_u_d((v2i64)(a), 1)))) +#else +#define __GET_LOW(__TYPE, a) ((__TYPE)(((v2u64)(a))[0])) +#define __GET_HIGH(__TYPE, a) ((__TYPE)(((v2u64)(a))[1])) +#endif + +/* v8i8 msa_get_low_s8(v16i8 __a) */ +#define msa_get_low_s8(__a) __GET_LOW(v8i8, __a) + +/* v4i16 msa_get_low_s16(v8i16 __a) */ +#define msa_get_low_s16(__a) __GET_LOW(v4i16, __a) + +/* v2i32 msa_get_low_s32(v4i32 __a) */ +#define msa_get_low_s32(__a) __GET_LOW(v2i32, __a) + +/* v1i64 msa_get_low_s64(v2i64 __a) */ +#define msa_get_low_s64(__a) __GET_LOW(v1i64, __a) + +/* v8u8 msa_get_low_u8(v16u8 __a) */ +#define msa_get_low_u8(__a) __GET_LOW(v8u8, __a) + +/* v4u16 msa_get_low_u16(v8u16 __a) */ +#define msa_get_low_u16(__a) __GET_LOW(v4u16, __a) + +/* v2u32 msa_get_low_u32(v4u32 __a) */ +#define msa_get_low_u32(__a) __GET_LOW(v2u32, __a) + +/* v1u64 msa_get_low_u64(v2u64 __a) */ +#define msa_get_low_u64(__a) __GET_LOW(v1u64, __a) + +/* v2f32 msa_get_low_f32(v4f32 __a) */ +#define msa_get_low_f32(__a) __GET_LOW(v2f32, __a) + +/* v1f64 msa_get_low_f64(v2f64 __a) */ +#define msa_get_low_f64(__a) __GET_LOW(v1f64, __a) + +/* v8i8 msa_get_high_s8(v16i8 __a) */ +#define msa_get_high_s8(__a) __GET_HIGH(v8i8, __a) + +/* v4i16 msa_get_high_s16(v8i16 __a) */ +#define msa_get_high_s16(__a) __GET_HIGH(v4i16, __a) + +/* v2i32 msa_get_high_s32(v4i32 __a) */ +#define msa_get_high_s32(__a) __GET_HIGH(v2i32, __a) + +/* v1i64 msa_get_high_s64(v2i64 __a) */ +#define msa_get_high_s64(__a) __GET_HIGH(v1i64, __a) + +/* v8u8 msa_get_high_u8(v16u8 __a) */ +#define msa_get_high_u8(__a) __GET_HIGH(v8u8, __a) + +/* v4u16 msa_get_high_u16(v8u16 __a) */ +#define msa_get_high_u16(__a) __GET_HIGH(v4u16, __a) + +/* v2u32 msa_get_high_u32(v4u32 __a) */ +#define msa_get_high_u32(__a) __GET_HIGH(v2u32, __a) + +/* v1u64 msa_get_high_u64(v2u64 __a) */ +#define msa_get_high_u64(__a) __GET_HIGH(v1u64, __a) + +/* v2f32 msa_get_high_f32(v4f32 __a) */ +#define msa_get_high_f32(__a) __GET_HIGH(v2f32, __a) + +/* v1f64 msa_get_high_f64(v2f64 __a) */ +#define msa_get_high_f64(__a) __GET_HIGH(v1f64, __a) + +/* ri = ai * b[lane] */ +/* v4f32 msa_mulq_lane_f32(v4f32 __a, v4f32 __b, const int __lane) */ +#define msa_mulq_lane_f32(__a, __b, __lane) ((__a) * msa_getq_lane_f32(__b, __lane)) + +/* ri = ai + bi * c[lane] */ +/* v4f32 msa_mlaq_lane_f32(v4f32 __a, v4f32 __b, v4f32 __c, const int __lane) */ +#define msa_mlaq_lane_f32(__a, __b, __c, __lane) ((__a) + ((__b) * msa_getq_lane_f32(__c, __lane))) + +/* uint16_t msa_sum_u16(v8u16 __a)*/ +#define msa_sum_u16(__a) \ +({ \ + v4u32 _b; \ + v2u64 _c; \ + _b = 
__builtin_msa_hadd_u_w(__a, __a); \ + _c = __builtin_msa_hadd_u_d(_b, _b); \ + (uint16_t)(_c[0] + _c[1]); \ +}) + +/* int16_t msa_sum_s16(v8i16 __a) */ +#define msa_sum_s16(__a) \ +({ \ + v4i32 _b; \ + v2i64 _c; \ + _b = __builtin_msa_hadd_s_w(__a, __a); \ + _c = __builtin_msa_hadd_s_d(_b, _b); \ + (int16_t)(_c[0] + _c[1]); \ +}) + + +/* uint32_t msa_sum_u32(v4u32 __a)*/ +#define msa_sum_u32(__a) \ +({ \ + v2u64 _b; \ + _b = __builtin_msa_hadd_u_d(__a, __a); \ + (uint32_t)(_b[0] + _b[1]); \ +}) + +/* int32_t msa_sum_s32(v4i32 __a)*/ +#define msa_sum_s32(__a) \ +({ \ + v2i64 _b; \ + _b = __builtin_msa_hadd_s_d(__a, __a); \ + (int32_t)(_b[0] + _b[1]); \ +}) + +/* uint8_t msa_sum_u8(v16u8 __a)*/ +#define msa_sum_u8(__a) \ +({ \ + v8u16 _b16; \ + v4u32 _c32; \ + _b16 = __builtin_msa_hadd_u_h(__a, __a); \ + _c32 = __builtin_msa_hadd_u_w(_b16, _b16); \ + (uint8_t)msa_sum_u32(_c32); \ +}) + +/* int8_t msa_sum_s8(v16s8 __a)*/ +#define msa_sum_s8(__a) \ +({ \ + v8i16 _b16; \ + v4i32 _c32; \ + _b16 = __builtin_msa_hadd_s_h(__a, __a); \ + _c32 = __builtin_msa_hadd_s_w(_b16, _b16); \ + (int8_t)msa_sum_s32(_c32); \ +}) + +/* float msa_sum_f32(v4f32 __a)*/ +#define msa_sum_f32(__a) ((__a)[0] + (__a)[1] + (__a)[2] + (__a)[3]) + +/* v8u16 msa_paddlq_u8(v16u8 __a) */ +#define msa_paddlq_u8(__a) (__builtin_msa_hadd_u_h(__a, __a)) + +/* v8i16 msa_paddlq_s8(v16i8 __a) */ +#define msa_paddlq_s8(__a) (__builtin_msa_hadd_s_h(__a, __a)) + +/* v4u32 msa_paddlq_u16 (v8u16 __a)*/ +#define msa_paddlq_u16(__a) (__builtin_msa_hadd_u_w(__a, __a)) + +/* v4i32 msa_paddlq_s16 (v8i16 __a)*/ +#define msa_paddlq_s16(__a) (__builtin_msa_hadd_s_w(__a, __a)) + +/* v2u64 msa_paddlq_u32(v4u32 __a) */ +#define msa_paddlq_u32(__a) (__builtin_msa_hadd_u_d(__a, __a)) + +/* v2i64 msa_paddlq_s32(v4i32 __a) */ +#define msa_paddlq_s32(__a) (__builtin_msa_hadd_s_d(__a, __a)) + +#define V8U8_2_V8U16(x) {(uint16_t)x[0], (uint16_t)x[1], (uint16_t)x[2], (uint16_t)x[3], \ + (uint16_t)x[4], (uint16_t)x[5], (uint16_t)x[6], (uint16_t)x[7]} +#define V8U8_2_V8I16(x) {(int16_t)x[0], (int16_t)x[1], (int16_t)x[2], (int16_t)x[3], \ + (int16_t)x[4], (int16_t)x[5], (int16_t)x[6], (int16_t)x[7]} +#define V8I8_2_V8I16(x) {(int16_t)x[0], (int16_t)x[1], (int16_t)x[2], (int16_t)x[3], \ + (int16_t)x[4], (int16_t)x[5], (int16_t)x[6], (int16_t)x[7]} +#define V4U16_2_V4U32(x) {(uint32_t)x[0], (uint32_t)x[1], (uint32_t)x[2], (uint32_t)x[3]} +#define V4U16_2_V4I32(x) {(int32_t)x[0], (int32_t)x[1], (int32_t)x[2], (int32_t)x[3]} +#define V4I16_2_V4I32(x) {(int32_t)x[0], (int32_t)x[1], (int32_t)x[2], (int32_t)x[3]} +#define V2U32_2_V2U64(x) {(uint64_t)x[0], (uint64_t)x[1]} +#define V2U32_2_V2I64(x) {(int64_t)x[0], (int64_t)x[1]} + +/* v8u16 msa_mull_u8(v8u8 __a, v8u8 __b) */ +#define msa_mull_u8(__a, __b) ((v8u16)__builtin_msa_mulv_h((v8i16)V8U8_2_V8I16(__a), (v8i16)V8U8_2_V8I16(__b))) + +/* v8i16 msa_mull_s8(v8i8 __a, v8i8 __b)*/ +#define msa_mull_s8(__a, __b) (__builtin_msa_mulv_h((v8i16)V8I8_2_V8I16(__a), (v8i16)V8I8_2_V8I16(__b))) + +/* v4u32 msa_mull_u16(v4u16 __a, v4u16 __b) */ +#define msa_mull_u16(__a, __b) ((v4u32)__builtin_msa_mulv_w((v4i32)V4U16_2_V4I32(__a), (v4i32)V4U16_2_V4I32(__b))) + +/* v4i32 msa_mull_s16(v4i16 __a, v4i16 __b) */ +#define msa_mull_s16(__a, __b) (__builtin_msa_mulv_w((v4i32)V4I16_2_V4I32(__a), (v4i32)V4I16_2_V4I32(__b))) + +/* v2u64 msa_mull_u32(v2u32 __a, v2u32 __b) */ +#define msa_mull_u32(__a, __b) ((v2u64)__builtin_msa_mulv_d((v2i64)V2U32_2_V2I64(__a), (v2i64)V2U32_2_V2I64(__b))) + +/* bitwise and: __builtin_msa_and_v */ +#define 
msa_andq_u8(__a, __b) ((v16u8)__builtin_msa_and_v((v16u8)(__a), (v16u8)(__b))) +#define msa_andq_s8(__a, __b) ((v16i8)__builtin_msa_and_v((v16u8)(__a), (v16u8)(__b))) +#define msa_andq_u16(__a, __b) ((v8u16)__builtin_msa_and_v((v16u8)(__a), (v16u8)(__b))) +#define msa_andq_s16(__a, __b) ((v8i16)__builtin_msa_and_v((v16u8)(__a), (v16u8)(__b))) +#define msa_andq_u32(__a, __b) ((v4u32)__builtin_msa_and_v((v16u8)(__a), (v16u8)(__b))) +#define msa_andq_s32(__a, __b) ((v4i32)__builtin_msa_and_v((v16u8)(__a), (v16u8)(__b))) +#define msa_andq_u64(__a, __b) ((v2u64)__builtin_msa_and_v((v16u8)(__a), (v16u8)(__b))) +#define msa_andq_s64(__a, __b) ((v2i64)__builtin_msa_and_v((v16u8)(__a), (v16u8)(__b))) + +/* bitwise or: __builtin_msa_or_v */ +#define msa_orrq_u8(__a, __b) ((v16u8)__builtin_msa_or_v((v16u8)(__a), (v16u8)(__b))) +#define msa_orrq_s8(__a, __b) ((v16i8)__builtin_msa_or_v((v16u8)(__a), (v16u8)(__b))) +#define msa_orrq_u16(__a, __b) ((v8u16)__builtin_msa_or_v((v16u8)(__a), (v16u8)(__b))) +#define msa_orrq_s16(__a, __b) ((v8i16)__builtin_msa_or_v((v16u8)(__a), (v16u8)(__b))) +#define msa_orrq_u32(__a, __b) ((v4u32)__builtin_msa_or_v((v16u8)(__a), (v16u8)(__b))) +#define msa_orrq_s32(__a, __b) ((v4i32)__builtin_msa_or_v((v16u8)(__a), (v16u8)(__b))) +#define msa_orrq_u64(__a, __b) ((v2u64)__builtin_msa_or_v((v16u8)(__a), (v16u8)(__b))) +#define msa_orrq_s64(__a, __b) ((v2i64)__builtin_msa_or_v((v16u8)(__a), (v16u8)(__b))) + +/* bitwise xor: __builtin_msa_xor_v */ +#define msa_eorq_u8(__a, __b) ((v16u8)__builtin_msa_xor_v((v16u8)(__a), (v16u8)(__b))) +#define msa_eorq_s8(__a, __b) ((v16i8)__builtin_msa_xor_v((v16u8)(__a), (v16u8)(__b))) +#define msa_eorq_u16(__a, __b) ((v8u16)__builtin_msa_xor_v((v16u8)(__a), (v16u8)(__b))) +#define msa_eorq_s16(__a, __b) ((v8i16)__builtin_msa_xor_v((v16u8)(__a), (v16u8)(__b))) +#define msa_eorq_u32(__a, __b) ((v4u32)__builtin_msa_xor_v((v16u8)(__a), (v16u8)(__b))) +#define msa_eorq_s32(__a, __b) ((v4i32)__builtin_msa_xor_v((v16u8)(__a), (v16u8)(__b))) +#define msa_eorq_u64(__a, __b) ((v2u64)__builtin_msa_xor_v((v16u8)(__a), (v16u8)(__b))) +#define msa_eorq_s64(__a, __b) ((v2i64)__builtin_msa_xor_v((v16u8)(__a), (v16u8)(__b))) + +/* bitwise not: v16u8 __builtin_msa_xori_b (v16u8, 0xff) */ +#define msa_mvnq_u8(__a) ((v16u8)__builtin_msa_xori_b((v16u8)(__a), 0xFF)) +#define msa_mvnq_s8(__a) ((v16i8)__builtin_msa_xori_b((v16u8)(__a), 0xFF)) +#define msa_mvnq_u16(__a) ((v8u16)__builtin_msa_xori_b((v16u8)(__a), 0xFF)) +#define msa_mvnq_s16(__a) ((v8i16)__builtin_msa_xori_b((v16u8)(__a), 0xFF)) +#define msa_mvnq_u32(__a) ((v4u32)__builtin_msa_xori_b((v16u8)(__a), 0xFF)) +#define msa_mvnq_s32(__a) ((v4i32)__builtin_msa_xori_b((v16u8)(__a), 0xFF)) +#define msa_mvnq_u64(__a) ((v2u64)__builtin_msa_xori_b((v16u8)(__a), 0xFF)) +#define msa_mvnq_s64(__a) ((v2i64)__builtin_msa_xori_b((v16u8)(__a), 0xFF)) + +/* compare equal: ceq -> ri = ai == bi ? 
1...1:0...0 */ +#define msa_ceqq_u8(__a, __b) ((v16u8)__builtin_msa_ceq_b((v16i8)(__a), (v16i8)(__b))) +#define msa_ceqq_s8(__a, __b) ((v16u8)__builtin_msa_ceq_b((v16i8)(__a), (v16i8)(__b))) +#define msa_ceqq_u16(__a, __b) ((v8u16)__builtin_msa_ceq_h((v8i16)(__a), (v8i16)(__b))) +#define msa_ceqq_s16(__a, __b) ((v8u16)__builtin_msa_ceq_h((v8i16)(__a), (v8i16)(__b))) +#define msa_ceqq_u32(__a, __b) ((v4u32)__builtin_msa_ceq_w((v4i32)(__a), (v4i32)(__b))) +#define msa_ceqq_s32(__a, __b) ((v4u32)__builtin_msa_ceq_w((v4i32)(__a), (v4i32)(__b))) +#define msa_ceqq_f32(__a, __b) ((v4u32)__builtin_msa_fceq_w((v4f32)(__a), (v4f32)(__b))) +#define msa_ceqq_u64(__a, __b) ((v2u64)__builtin_msa_ceq_d((v2i64)(__a), (v2i64)(__b))) +#define msa_ceqq_s64(__a, __b) ((v2u64)__builtin_msa_ceq_d((v2i64)(__a), (v2i64)(__b))) +#define msa_ceqq_f64(__a, __b) ((v2u64)__builtin_msa_fceq_d((v2f64)(__a), (v2f64)(__b))) + +/* Compare less-than: clt -> ri = ai < bi ? 1...1:0...0 */ +#define msa_cltq_u8(__a, __b) ((v16u8)__builtin_msa_clt_u_b((v16u8)(__a), (v16u8)(__b))) +#define msa_cltq_s8(__a, __b) ((v16u8)__builtin_msa_clt_s_b((v16i8)(__a), (v16i8)(__b))) +#define msa_cltq_u16(__a, __b) ((v8u16)__builtin_msa_clt_u_h((v8u16)(__a), (v8u16)(__b))) +#define msa_cltq_s16(__a, __b) ((v8u16)__builtin_msa_clt_s_h((v8i16)(__a), (v8i16)(__b))) +#define msa_cltq_u32(__a, __b) ((v4u32)__builtin_msa_clt_u_w((v4u32)(__a), (v4u32)(__b))) +#define msa_cltq_s32(__a, __b) ((v4u32)__builtin_msa_clt_s_w((v4i32)(__a), (v4i32)(__b))) +#define msa_cltq_f32(__a, __b) ((v4u32)__builtin_msa_fclt_w((v4f32)(__a), (v4f32)(__b))) +#define msa_cltq_u64(__a, __b) ((v2u64)__builtin_msa_clt_u_d((v2u64)(__a), (v2u64)(__b))) +#define msa_cltq_s64(__a, __b) ((v2u64)__builtin_msa_clt_s_d((v2i64)(__a), (v2i64)(__b))) +#define msa_cltq_f64(__a, __b) ((v2u64)__builtin_msa_fclt_d((v2f64)(__a), (v2f64)(__b))) + +/* compare greater-than: cgt -> ri = ai > bi ? 1...1:0...0 */ +#define msa_cgtq_u8(__a, __b) ((v16u8)__builtin_msa_clt_u_b((v16u8)(__b), (v16u8)(__a))) +#define msa_cgtq_s8(__a, __b) ((v16u8)__builtin_msa_clt_s_b((v16i8)(__b), (v16i8)(__a))) +#define msa_cgtq_u16(__a, __b) ((v8u16)__builtin_msa_clt_u_h((v8u16)(__b), (v8u16)(__a))) +#define msa_cgtq_s16(__a, __b) ((v8u16)__builtin_msa_clt_s_h((v8i16)(__b), (v8i16)(__a))) +#define msa_cgtq_u32(__a, __b) ((v4u32)__builtin_msa_clt_u_w((v4u32)(__b), (v4u32)(__a))) +#define msa_cgtq_s32(__a, __b) ((v4u32)__builtin_msa_clt_s_w((v4i32)(__b), (v4i32)(__a))) +#define msa_cgtq_f32(__a, __b) ((v4u32)__builtin_msa_fclt_w((v4f32)(__b), (v4f32)(__a))) +#define msa_cgtq_u64(__a, __b) ((v2u64)__builtin_msa_clt_u_d((v2u64)(__b), (v2u64)(__a))) +#define msa_cgtq_s64(__a, __b) ((v2u64)__builtin_msa_clt_s_d((v2i64)(__b), (v2i64)(__a))) +#define msa_cgtq_f64(__a, __b) ((v2u64)__builtin_msa_fclt_d((v2f64)(__b), (v2f64)(__a))) + +/* compare less-equal: cle -> ri = ai <= bi ? 
1...1:0...0 */ +#define msa_cleq_u8(__a, __b) ((v16u8)__builtin_msa_cle_u_b((v16u8)(__a), (v16u8)(__b))) +#define msa_cleq_s8(__a, __b) ((v16u8)__builtin_msa_cle_s_b((v16i8)(__a), (v16i8)(__b))) +#define msa_cleq_u16(__a, __b) ((v8u16)__builtin_msa_cle_u_h((v8u16)(__a), (v8u16)(__b))) +#define msa_cleq_s16(__a, __b) ((v8u16)__builtin_msa_cle_s_h((v8i16)(__a), (v8i16)(__b))) +#define msa_cleq_u32(__a, __b) ((v4u32)__builtin_msa_cle_u_w((v4u32)(__a), (v4u32)(__b))) +#define msa_cleq_s32(__a, __b) ((v4u32)__builtin_msa_cle_s_w((v4i32)(__a), (v4i32)(__b))) +#define msa_cleq_f32(__a, __b) ((v4u32)__builtin_msa_fcle_w((v4f32)(__a), (v4f32)(__b))) +#define msa_cleq_u64(__a, __b) ((v2u64)__builtin_msa_cle_u_d((v2u64)(__a), (v2u64)(__b))) +#define msa_cleq_s64(__a, __b) ((v2u64)__builtin_msa_cle_s_d((v2i64)(__a), (v2i64)(__b))) +#define msa_cleq_f64(__a, __b) ((v2u64)__builtin_msa_fcle_d((v2f64)(__a), (v2f64)(__b))) + +/* compare greater-equal: cge -> ri = ai >= bi ? 1...1:0...0 */ +#define msa_cgeq_u8(__a, __b) ((v16u8)__builtin_msa_cle_u_b((v16u8)(__b), (v16u8)(__a))) +#define msa_cgeq_s8(__a, __b) ((v16u8)__builtin_msa_cle_s_b((v16i8)(__b), (v16i8)(__a))) +#define msa_cgeq_u16(__a, __b) ((v8u16)__builtin_msa_cle_u_h((v8u16)(__b), (v8u16)(__a))) +#define msa_cgeq_s16(__a, __b) ((v8u16)__builtin_msa_cle_s_h((v8i16)(__b), (v8i16)(__a))) +#define msa_cgeq_u32(__a, __b) ((v4u32)__builtin_msa_cle_u_w((v4u32)(__b), (v4u32)(__a))) +#define msa_cgeq_s32(__a, __b) ((v4u32)__builtin_msa_cle_s_w((v4i32)(__b), (v4i32)(__a))) +#define msa_cgeq_f32(__a, __b) ((v4u32)__builtin_msa_fcle_w((v4f32)(__b), (v4f32)(__a))) +#define msa_cgeq_u64(__a, __b) ((v2u64)__builtin_msa_cle_u_d((v2u64)(__b), (v2u64)(__a))) +#define msa_cgeq_s64(__a, __b) ((v2u64)__builtin_msa_cle_s_d((v2i64)(__b), (v2i64)(__a))) +#define msa_cgeq_f64(__a, __b) ((v2u64)__builtin_msa_fcle_d((v2f64)(__b), (v2f64)(__a))) + +/* Shift Left Logical: shl -> ri = ai << bi; */ +#define msa_shlq_u8(__a, __b) ((v16u8)__builtin_msa_sll_b((v16i8)(__a), (v16i8)(__b))) +#define msa_shlq_s8(__a, __b) ((v16i8)__builtin_msa_sll_b((v16i8)(__a), (v16i8)(__b))) +#define msa_shlq_u16(__a, __b) ((v8u16)__builtin_msa_sll_h((v8i16)(__a), (v8i16)(__b))) +#define msa_shlq_s16(__a, __b) ((v8i16)__builtin_msa_sll_h((v8i16)(__a), (v8i16)(__b))) +#define msa_shlq_u32(__a, __b) ((v4u32)__builtin_msa_sll_w((v4i32)(__a), (v4i32)(__b))) +#define msa_shlq_s32(__a, __b) ((v4i32)__builtin_msa_sll_w((v4i32)(__a), (v4i32)(__b))) +#define msa_shlq_u64(__a, __b) ((v2u64)__builtin_msa_sll_d((v2i64)(__a), (v2i64)(__b))) +#define msa_shlq_s64(__a, __b) ((v2i64)__builtin_msa_sll_d((v2i64)(__a), (v2i64)(__b))) + +/* Immediate Shift Left Logical: shl -> ri = ai << imm; */ +#define msa_shlq_n_u8(__a, __imm) ((v16u8)__builtin_msa_slli_b((v16i8)(__a), __imm)) +#define msa_shlq_n_s8(__a, __imm) ((v16i8)__builtin_msa_slli_b((v16i8)(__a), __imm)) +#define msa_shlq_n_u16(__a, __imm) ((v8u16)__builtin_msa_slli_h((v8i16)(__a), __imm)) +#define msa_shlq_n_s16(__a, __imm) ((v8i16)__builtin_msa_slli_h((v8i16)(__a), __imm)) +#define msa_shlq_n_u32(__a, __imm) ((v4u32)__builtin_msa_slli_w((v4i32)(__a), __imm)) +#define msa_shlq_n_s32(__a, __imm) ((v4i32)__builtin_msa_slli_w((v4i32)(__a), __imm)) +#define msa_shlq_n_u64(__a, __imm) ((v2u64)__builtin_msa_slli_d((v2i64)(__a), __imm)) +#define msa_shlq_n_s64(__a, __imm) ((v2i64)__builtin_msa_slli_d((v2i64)(__a), __imm)) + +/* shift right: shrq -> ri = ai >> bi; */ +#define msa_shrq_u8(__a, __b) ((v16u8)__builtin_msa_srl_b((v16i8)(__a), (v16i8)(__b))) +#define 
msa_shrq_s8(__a, __b) ((v16i8)__builtin_msa_sra_b((v16i8)(__a), (v16i8)(__b))) +#define msa_shrq_u16(__a, __b) ((v8u16)__builtin_msa_srl_h((v8i16)(__a), (v8i16)(__b))) +#define msa_shrq_s16(__a, __b) ((v8i16)__builtin_msa_sra_h((v8i16)(__a), (v8i16)(__b))) +#define msa_shrq_u32(__a, __b) ((v4u32)__builtin_msa_srl_w((v4i32)(__a), (v4i32)(__b))) +#define msa_shrq_s32(__a, __b) ((v4i32)__builtin_msa_sra_w((v4i32)(__a), (v4i32)(__b))) +#define msa_shrq_u64(__a, __b) ((v2u64)__builtin_msa_srl_d((v2i64)(__a), (v2i64)(__b))) +#define msa_shrq_s64(__a, __b) ((v2i64)__builtin_msa_sra_d((v2i64)(__a), (v2i64)(__b))) + +/* Immediate Shift Right: shr -> ri = ai >> imm; */ +#define msa_shrq_n_u8(__a, __imm) ((v16u8)__builtin_msa_srli_b((v16i8)(__a), __imm)) +#define msa_shrq_n_s8(__a, __imm) ((v16i8)__builtin_msa_srai_b((v16i8)(__a), __imm)) +#define msa_shrq_n_u16(__a, __imm) ((v8u16)__builtin_msa_srli_h((v8i16)(__a), __imm)) +#define msa_shrq_n_s16(__a, __imm) ((v8i16)__builtin_msa_srai_h((v8i16)(__a), __imm)) +#define msa_shrq_n_u32(__a, __imm) ((v4u32)__builtin_msa_srli_w((v4i32)(__a), __imm)) +#define msa_shrq_n_s32(__a, __imm) ((v4i32)__builtin_msa_srai_w((v4i32)(__a), __imm)) +#define msa_shrq_n_u64(__a, __imm) ((v2u64)__builtin_msa_srli_d((v2i64)(__a), __imm)) +#define msa_shrq_n_s64(__a, __imm) ((v2i64)__builtin_msa_srai_d((v2i64)(__a), __imm)) + +/* Immediate Shift Right Rounded: shr -> ri = ai >> (rounded)imm; */ +#define msa_rshrq_n_u8(__a, __imm) ((v16u8)__builtin_msa_srlri_b((v16i8)(__a), __imm)) +#define msa_rshrq_n_s8(__a, __imm) ((v16i8)__builtin_msa_srari_b((v16i8)(__a), __imm)) +#define msa_rshrq_n_u16(__a, __imm) ((v8u16)__builtin_msa_srlri_h((v8i16)(__a), __imm)) +#define msa_rshrq_n_s16(__a, __imm) ((v8i16)__builtin_msa_srari_h((v8i16)(__a), __imm)) +#define msa_rshrq_n_u32(__a, __imm) ((v4u32)__builtin_msa_srlri_w((v4i32)(__a), __imm)) +#define msa_rshrq_n_s32(__a, __imm) ((v4i32)__builtin_msa_srari_w((v4i32)(__a), __imm)) +#define msa_rshrq_n_u64(__a, __imm) ((v2u64)__builtin_msa_srlri_d((v2i64)(__a), __imm)) +#define msa_rshrq_n_s64(__a, __imm) ((v2i64)__builtin_msa_srari_d((v2i64)(__a), __imm)) + +/* Vector saturating rounding shift left, qrshl -> ri = ai << bi; */ +#define msa_qrshrq_s32(a, b) ((v4i32)__msa_srar_w((v4i32)(a), (v4i32)(b))) + +/* Rename the msa builtin func to unify the name style for intrin_msa.hpp */ +#define msa_qaddq_u8 __builtin_msa_adds_u_b +#define msa_qaddq_s8 __builtin_msa_adds_s_b +#define msa_qaddq_u16 __builtin_msa_adds_u_h +#define msa_qaddq_s16 __builtin_msa_adds_s_h +#define msa_qaddq_u32 __builtin_msa_adds_u_w +#define msa_qaddq_s32 __builtin_msa_adds_s_w +#define msa_qaddq_u64 __builtin_msa_adds_u_d +#define msa_qaddq_s64 __builtin_msa_adds_s_d +#define msa_addq_u8(a, b) ((v16u8)__builtin_msa_addv_b((v16i8)(a), (v16i8)(b))) +#define msa_addq_s8 __builtin_msa_addv_b +#define msa_addq_u16(a, b) ((v8u16)__builtin_msa_addv_h((v8i16)(a), (v8i16)(b))) +#define msa_addq_s16 __builtin_msa_addv_h +#define msa_addq_u32(a, b) ((v4u32)__builtin_msa_addv_w((v4i32)(a), (v4i32)(b))) +#define msa_addq_s32 __builtin_msa_addv_w +#define msa_addq_f32 __builtin_msa_fadd_w +#define msa_addq_u64(a, b) ((v2u64)__builtin_msa_addv_d((v2i64)(a), (v2i64)(b))) +#define msa_addq_s64 __builtin_msa_addv_d +#define msa_addq_f64 __builtin_msa_fadd_d +#define msa_qsubq_u8 __builtin_msa_subs_u_b +#define msa_qsubq_s8 __builtin_msa_subs_s_b +#define msa_qsubq_u16 __builtin_msa_subs_u_h +#define msa_qsubq_s16 __builtin_msa_subs_s_h +#define msa_subq_u8(a, b) 
((v16u8)__builtin_msa_subv_b((v16i8)(a), (v16i8)(b))) +#define msa_subq_s8 __builtin_msa_subv_b +#define msa_subq_u16(a, b) ((v8u16)__builtin_msa_subv_h((v8i16)(a), (v8i16)(b))) +#define msa_subq_s16 __builtin_msa_subv_h +#define msa_subq_u32(a, b) ((v4u32)__builtin_msa_subv_w((v4i32)(a), (v4i32)(b))) +#define msa_subq_s32 __builtin_msa_subv_w +#define msa_subq_f32 __builtin_msa_fsub_w +#define msa_subq_u64(a, b) ((v2u64)__builtin_msa_subv_d((v2i64)(a), (v2i64)(b))) +#define msa_subq_s64 __builtin_msa_subv_d +#define msa_subq_f64 __builtin_msa_fsub_d +#define msa_mulq_u8(a, b) ((v16u8)__builtin_msa_mulv_b((v16i8)(a), (v16i8)(b))) +#define msa_mulq_s8(a, b) ((v16i8)__builtin_msa_mulv_b((v16i8)(a), (v16i8)(b))) +#define msa_mulq_u16(a, b) ((v8u16)__builtin_msa_mulv_h((v8i16)(a), (v8i16)(b))) +#define msa_mulq_s16(a, b) ((v8i16)__builtin_msa_mulv_h((v8i16)(a), (v8i16)(b))) +#define msa_mulq_u32(a, b) ((v4u32)__builtin_msa_mulv_w((v4i32)(a), (v4i32)(b))) +#define msa_mulq_s32(a, b) ((v4i32)__builtin_msa_mulv_w((v4i32)(a), (v4i32)(b))) +#define msa_mulq_u64(a, b) ((v2u64)__builtin_msa_mulv_d((v2i64)(a), (v2i64)(b))) +#define msa_mulq_s64(a, b) ((v2i64)__builtin_msa_mulv_d((v2i64)(a), (v2i64)(b))) +#define msa_mulq_f32 __builtin_msa_fmul_w +#define msa_mulq_f64 __builtin_msa_fmul_d +#define msa_divq_f32 __builtin_msa_fdiv_w +#define msa_divq_f64 __builtin_msa_fdiv_d +#define msa_dotp_s_h __builtin_msa_dotp_s_h +#define msa_dotp_s_w __builtin_msa_dotp_s_w +#define msa_dotp_s_d __builtin_msa_dotp_s_d +#define msa_dotp_u_h __builtin_msa_dotp_u_h +#define msa_dotp_u_w __builtin_msa_dotp_u_w +#define msa_dotp_u_d __builtin_msa_dotp_u_d +#define msa_dpadd_s_h __builtin_msa_dpadd_s_h +#define msa_dpadd_s_w __builtin_msa_dpadd_s_w +#define msa_dpadd_s_d __builtin_msa_dpadd_s_d +#define msa_dpadd_u_h __builtin_msa_dpadd_u_h +#define msa_dpadd_u_w __builtin_msa_dpadd_u_w +#define msa_dpadd_u_d __builtin_msa_dpadd_u_d + +#define ILVRL_B2(RTYPE, in0, in1, low, hi) do { \ + low = (RTYPE)__builtin_msa_ilvr_b((v16i8)(in0), (v16i8)(in1)); \ + hi = (RTYPE)__builtin_msa_ilvl_b((v16i8)(in0), (v16i8)(in1)); \ + } while (0) +#define ILVRL_B2_UB(...) ILVRL_B2(v16u8, __VA_ARGS__) +#define ILVRL_B2_SB(...) ILVRL_B2(v16i8, __VA_ARGS__) +#define ILVRL_B2_UH(...) ILVRL_B2(v8u16, __VA_ARGS__) +#define ILVRL_B2_SH(...) ILVRL_B2(v8i16, __VA_ARGS__) +#define ILVRL_B2_SW(...) ILVRL_B2(v4i32, __VA_ARGS__) + +#define ILVRL_H2(RTYPE, in0, in1, low, hi) do { \ + low = (RTYPE)__builtin_msa_ilvr_h((v8i16)(in0), (v8i16)(in1)); \ + hi = (RTYPE)__builtin_msa_ilvl_h((v8i16)(in0), (v8i16)(in1)); \ + } while (0) +#define ILVRL_H2_UB(...) ILVRL_H2(v16u8, __VA_ARGS__) +#define ILVRL_H2_SB(...) ILVRL_H2(v16i8, __VA_ARGS__) +#define ILVRL_H2_UH(...) ILVRL_H2(v8u16, __VA_ARGS__) +#define ILVRL_H2_SH(...) ILVRL_H2(v8i16, __VA_ARGS__) +#define ILVRL_H2_SW(...) ILVRL_H2(v4i32, __VA_ARGS__) +#define ILVRL_H2_UW(...) ILVRL_H2(v4u32, __VA_ARGS__) + +#define ILVRL_W2(RTYPE, in0, in1, low, hi) do { \ + low = (RTYPE)__builtin_msa_ilvr_w((v4i32)(in0), (v4i32)(in1)); \ + hi = (RTYPE)__builtin_msa_ilvl_w((v4i32)(in0), (v4i32)(in1)); \ + } while (0) +#define ILVRL_W2_UB(...) ILVRL_W2(v16u8, __VA_ARGS__) +#define ILVRL_W2_SH(...) ILVRL_W2(v8i16, __VA_ARGS__) +#define ILVRL_W2_SW(...) ILVRL_W2(v4i32, __VA_ARGS__) +#define ILVRL_W2_UW(...) 
ILVRL_W2(v4u32, __VA_ARGS__) + +/* absq, qabsq (r = |a|;) */ +#define msa_absq_s8(a) __builtin_msa_add_a_b(a, __builtin_msa_fill_b(0)) +#define msa_absq_s16(a) __builtin_msa_add_a_h(a, __builtin_msa_fill_h(0)) +#define msa_absq_s32(a) __builtin_msa_add_a_w(a, __builtin_msa_fill_w(0)) +#define msa_absq_s64(a) __builtin_msa_add_a_d(a, __builtin_msa_fill_d(0)) +#define msa_absq_f32(a) ((v4f32)__builtin_msa_bclri_w((v4u32)(a), 31)) +#define msa_absq_f64(a) ((v2f64)__builtin_msa_bclri_d((v2u64)(a), 63)) +#define msa_qabsq_s8(a) __builtin_msa_adds_a_b(a, __builtin_msa_fill_b(0)) +#define msa_qabsq_s16(a) __builtin_msa_adds_a_h(a, __builtin_msa_fill_h(0)) +#define msa_qabsq_s32(a) __builtin_msa_adds_a_w(a, __builtin_msa_fill_w(0)) +#define msa_qabsq_s64(a) __builtin_msa_adds_a_d(a, __builtin_msa_fill_d(0)) + +/* abdq, qabdq (r = |a - b|;) */ +#define msa_abdq_u8 __builtin_msa_asub_u_b +#define msa_abdq_s8 __builtin_msa_asub_s_b +#define msa_abdq_u16 __builtin_msa_asub_u_h +#define msa_abdq_s16 __builtin_msa_asub_s_h +#define msa_abdq_u32 __builtin_msa_asub_u_w +#define msa_abdq_s32 __builtin_msa_asub_s_w +#define msa_abdq_u64 __builtin_msa_asub_u_d +#define msa_abdq_s64 __builtin_msa_asub_s_d +#define msa_abdq_f32(a, b) msa_absq_f32(__builtin_msa_fsub_w(a, b)) +#define msa_abdq_f64(a, b) msa_absq_f64(__builtin_msa_fsub_d(a, b)) +#define msa_qabdq_s8(a, b) msa_qabsq_s8(__builtin_msa_subs_s_b(a, b)) +#define msa_qabdq_s16(a, b) msa_qabsq_s16(__builtin_msa_subs_s_h(a, b)) +#define msa_qabdq_s32(a, b) msa_qabsq_s32(__builtin_msa_subs_s_w(a, b)) +#define msa_qabdq_s64(a, b) msa_qabsq_s64(__builtin_msa_subs_s_d(a, b)) + +/* sqrtq, rsqrtq */ +#define msa_sqrtq_f32 __builtin_msa_fsqrt_w +#define msa_sqrtq_f64 __builtin_msa_fsqrt_d +#define msa_rsqrtq_f32 __builtin_msa_frsqrt_w +#define msa_rsqrtq_f64 __builtin_msa_frsqrt_d + + +/* mlaq: r = a + b * c; */ +__extension__ extern __inline v4i32 +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +msa_mlaq_s32(v4i32 __a, v4i32 __b, v4i32 __c) +{ + __asm__ volatile("maddv.w %w[__a], %w[__b], %w[__c]\n" + // Outputs + : [__a] "+f"(__a) + // Inputs + : [__b] "f"(__b), [__c] "f"(__c)); + return __a; +} + +__extension__ extern __inline v2i64 +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +msa_mlaq_s64(v2i64 __a, v2i64 __b, v2i64 __c) +{ + __asm__ volatile("maddv.d %w[__a], %w[__b], %w[__c]\n" + // Outputs + : [__a] "+f"(__a) + // Inputs + : [__b] "f"(__b), [__c] "f"(__c)); + return __a; +} + +__extension__ extern __inline v4f32 +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +msa_mlaq_f32(v4f32 __a, v4f32 __b, v4f32 __c) +{ + __asm__ volatile("fmadd.w %w[__a], %w[__b], %w[__c]\n" + // Outputs + : [__a] "+f"(__a) + // Inputs + : [__b] "f"(__b), [__c] "f"(__c)); + return __a; +} + +__extension__ extern __inline v2f64 +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +msa_mlaq_f64(v2f64 __a, v2f64 __b, v2f64 __c) +{ + __asm__ volatile("fmadd.d %w[__a], %w[__b], %w[__c]\n" + // Outputs + : [__a] "+f"(__a) + // Inputs + : [__b] "f"(__b), [__c] "f"(__c)); + return __a; +} + +/* cntq */ +#define msa_cntq_s8 __builtin_msa_pcnt_b +#define msa_cntq_s16 __builtin_msa_pcnt_h +#define msa_cntq_s32 __builtin_msa_pcnt_w +#define msa_cntq_s64 __builtin_msa_pcnt_d + +/* bslq (a: mask; r = b(if a == 0); r = c(if a == 1);) */ +#define msa_bslq_u8 __builtin_msa_bsel_v + +/* ilvrq, ilvlq (For EL only, ilvrq: b0, a0, b1, a1; ilvlq: b2, a2, b3, a3;) */ +#define msa_ilvrq_s8 __builtin_msa_ilvr_b +#define 
msa_ilvrq_s16 __builtin_msa_ilvr_h +#define msa_ilvrq_s32 __builtin_msa_ilvr_w +#define msa_ilvrq_s64 __builtin_msa_ilvr_d +#define msa_ilvlq_s8 __builtin_msa_ilvl_b +#define msa_ilvlq_s16 __builtin_msa_ilvl_h +#define msa_ilvlq_s32 __builtin_msa_ilvl_w +#define msa_ilvlq_s64 __builtin_msa_ilvl_d + +/* ilvevq, ilvodq (ilvevq: b0, a0, b2, a2; ilvodq: b1, a1, b3, a3; ) */ +#define msa_ilvevq_s8 __builtin_msa_ilvev_b +#define msa_ilvevq_s16 __builtin_msa_ilvev_h +#define msa_ilvevq_s32 __builtin_msa_ilvev_w +#define msa_ilvevq_s64 __builtin_msa_ilvev_d +#define msa_ilvodq_s8 __builtin_msa_ilvod_b +#define msa_ilvodq_s16 __builtin_msa_ilvod_h +#define msa_ilvodq_s32 __builtin_msa_ilvod_w +#define msa_ilvodq_s64 __builtin_msa_ilvod_d + +/* extq (r = (a || b); a concatenation b and get elements from index c) */ +#ifdef _MIPSEB +#define msa_extq_s8(a, b, c) \ +(__builtin_msa_vshf_b(__builtin_msa_subv_b((v16i8)((v2i64){0x1716151413121110, 0x1F1E1D1C1B1A1918}), __builtin_msa_fill_b(c)), a, b)) +#define msa_extq_s16(a, b, c) \ +(__builtin_msa_vshf_h(__builtin_msa_subv_h((v8i16)((v2i64){0x000B000A00090008, 0x000F000E000D000C}), __builtin_msa_fill_h(c)), a, b)) +#define msa_extq_s32(a, b, c) \ +(__builtin_msa_vshf_w(__builtin_msa_subv_w((v4i32)((v2i64){0x0000000500000004, 0x0000000700000006}), __builtin_msa_fill_w(c)), a, b)) +#define msa_extq_s64(a, b, c) \ +(__builtin_msa_vshf_d(__builtin_msa_subv_d((v2i64){0x0000000000000002, 0x0000000000000003}, __builtin_msa_fill_d(c)), a, b)) +#else +#define msa_extq_s8(a, b, c) \ +(__builtin_msa_vshf_b(__builtin_msa_addv_b((v16i8)((v2i64){0x0706050403020100, 0x0F0E0D0C0B0A0908}), __builtin_msa_fill_b(c)), b, a)) +#define msa_extq_s16(a, b, c) \ +(__builtin_msa_vshf_h(__builtin_msa_addv_h((v8i16)((v2i64){0x0003000200010000, 0x0007000600050004}), __builtin_msa_fill_h(c)), b, a)) +#define msa_extq_s32(a, b, c) \ +(__builtin_msa_vshf_w(__builtin_msa_addv_w((v4i32)((v2i64){0x0000000100000000, 0x0000000300000002}), __builtin_msa_fill_w(c)), b, a)) +#define msa_extq_s64(a, b, c) \ +(__builtin_msa_vshf_d(__builtin_msa_addv_d((v2i64){0x0000000000000000, 0x0000000000000001}, __builtin_msa_fill_d(c)), b, a)) +#endif /* _MIPSEB */ + +/* cvttruncq, cvttintq, cvtrintq */ +#define msa_cvttruncq_u32_f32 __builtin_msa_ftrunc_u_w +#define msa_cvttruncq_s32_f32 __builtin_msa_ftrunc_s_w +#define msa_cvttruncq_u64_f64 __builtin_msa_ftrunc_u_d +#define msa_cvttruncq_s64_f64 __builtin_msa_ftrunc_s_d +#define msa_cvttintq_u32_f32 __builtin_msa_ftint_u_w +#define msa_cvttintq_s32_f32 __builtin_msa_ftint_s_w +#define msa_cvttintq_u64_f64 __builtin_msa_ftint_u_d +#define msa_cvttintq_s64_f64 __builtin_msa_ftint_s_d +#define msa_cvtrintq_f32 __builtin_msa_frint_w +#define msa_cvtrintq_f64 __builtin_msa_frint_d + +/* cvtfintq, cvtfq */ +#define msa_cvtfintq_f32_u32 __builtin_msa_ffint_u_w +#define msa_cvtfintq_f32_s32 __builtin_msa_ffint_s_w +#define msa_cvtfintq_f64_u64 __builtin_msa_ffint_u_d +#define msa_cvtfintq_f64_s64 __builtin_msa_ffint_s_d +#define msa_cvtfq_f32_f64 __builtin_msa_fexdo_w +#define msa_cvtflq_f64_f32 __builtin_msa_fexupr_d +#define msa_cvtfhq_f64_f32 __builtin_msa_fexupl_d + +#define msa_addl_u8(a, b) ((v8u16)__builtin_msa_addv_h((v8i16)V8U8_2_V8I16(a), (v8i16)V8U8_2_V8I16(b))) +#define msa_addl_s8(a, b) (__builtin_msa_addv_h((v8i16)V8I8_2_V8I16(a), (v8i16)V8I8_2_V8I16(b))) +#define msa_addl_u16(a, b) ((v4u32)__builtin_msa_addv_w((v4i32)V4U16_2_V4I32(a), (v4i32)V4U16_2_V4I32(b))) +#define msa_addl_s16(a, b) (__builtin_msa_addv_w((v4i32)V4I16_2_V4I32(a), 
(v4i32)V4I16_2_V4I32(b))) +#define msa_subl_s16(a, b) (__builtin_msa_subv_w((v4i32)V4I16_2_V4I32(a), (v4i32)V4I16_2_V4I32(b))) +#define msa_recpeq_f32 __builtin_msa_frcp_w +#define msa_recpsq_f32(a, b) (__builtin_msa_fsub_w(msa_dupq_n_f32(2.0f), __builtin_msa_fmul_w(a, b))) + +#define MSA_INTERLEAVED_IMPL_LOAD2_STORE2(_Tp, _Tpv, _Tpvs, suffix, df, nlanes) \ +__extension__ extern __inline void \ +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) \ +msa_ld2q_##suffix(const _Tp* ptr, _Tpv* a, _Tpv* b) \ +{ \ + _Tpv v0 = msa_ld1q_##suffix(ptr); \ + _Tpv v1 = msa_ld1q_##suffix(ptr + nlanes); \ + *a = (_Tpv)__builtin_msa_pckev_##df((_Tpvs)v1, (_Tpvs)v0); \ + *b = (_Tpv)__builtin_msa_pckod_##df((_Tpvs)v1, (_Tpvs)v0); \ +} \ +__extension__ extern __inline void \ +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) \ +msa_st2q_##suffix(_Tp* ptr, const _Tpv a, const _Tpv b) \ +{ \ + msa_st1q_##suffix(ptr, (_Tpv)__builtin_msa_ilvr_##df((_Tpvs)b, (_Tpvs)a)); \ + msa_st1q_##suffix(ptr + nlanes, (_Tpv)__builtin_msa_ilvl_##df((_Tpvs)b, (_Tpvs)a)); \ +} + +MSA_INTERLEAVED_IMPL_LOAD2_STORE2(uint8_t, v16u8, v16i8, u8, b, 16) +MSA_INTERLEAVED_IMPL_LOAD2_STORE2(int8_t, v16i8, v16i8, s8, b, 16) +MSA_INTERLEAVED_IMPL_LOAD2_STORE2(uint16_t, v8u16, v8i16, u16, h, 8) +MSA_INTERLEAVED_IMPL_LOAD2_STORE2(int16_t, v8i16, v8i16, s16, h, 8) +MSA_INTERLEAVED_IMPL_LOAD2_STORE2(uint32_t, v4u32, v4i32, u32, w, 4) +MSA_INTERLEAVED_IMPL_LOAD2_STORE2(int32_t, v4i32, v4i32, s32, w, 4) +MSA_INTERLEAVED_IMPL_LOAD2_STORE2(float, v4f32, v4i32, f32, w, 4) +MSA_INTERLEAVED_IMPL_LOAD2_STORE2(uint64_t, v2u64, v2i64, u64, d, 2) +MSA_INTERLEAVED_IMPL_LOAD2_STORE2(int64_t, v2i64, v2i64, s64, d, 2) +MSA_INTERLEAVED_IMPL_LOAD2_STORE2(double, v2f64, v2i64, f64, d, 2) + +#ifdef _MIPSEB +#define MSA_INTERLEAVED_IMPL_LOAD3_8(_Tp, _Tpv, _Tpvs, suffix) \ +__extension__ extern __inline void \ +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) \ +msa_ld3q_##suffix(const _Tp* ptr, _Tpv* a, _Tpv* b, _Tpv* c) \ +{ \ + _Tpv v0 = msa_ld1q_##suffix(ptr); \ + _Tpv v1 = msa_ld1q_##suffix(ptr + 16); \ + _Tpv v2 = msa_ld1q_##suffix(ptr + 32); \ + _Tpvs v3 = __builtin_msa_vshf_b((_Tpvs)((v2i64){0x0704011F1F1F1F1F, 0x1F1C191613100D0A}), (_Tpvs)v0, (_Tpvs)v1); \ + *a = (_Tpv)__builtin_msa_vshf_b((_Tpvs)((v2i64){0x1716150E0B080502, 0x1F1E1D1C1B1A1918}), v3, (_Tpvs)v2); \ + v3 = __builtin_msa_vshf_b((_Tpvs)((v2i64){0x0603001F1F1F1F1F, 0x1E1B1815120F0C09}), (_Tpvs)v0, (_Tpvs)v1); \ + *b = (_Tpv)__builtin_msa_vshf_b((_Tpvs)((v2i64){0x1716150D0A070401, 0x1F1E1D1C1B1A1918}), v3, (_Tpvs)v2); \ + v3 = __builtin_msa_vshf_b((_Tpvs)((v2i64){0x05021F1F1F1F1F1F, 0x1D1A1714110E0B08}), (_Tpvs)v0, (_Tpvs)v1); \ + *c = (_Tpv)__builtin_msa_vshf_b((_Tpvs)((v2i64){0x17160F0C09060300, 0x1F1E1D1C1B1A1918}), v3, (_Tpvs)v2); \ +} +#else +#define MSA_INTERLEAVED_IMPL_LOAD3_8(_Tp, _Tpv, _Tpvs, suffix) \ +__extension__ extern __inline void \ +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) \ +msa_ld3q_##suffix(const _Tp* ptr, _Tpv* a, _Tpv* b, _Tpv* c) \ +{ \ + _Tpv v0 = msa_ld1q_##suffix(ptr); \ + _Tpv v1 = msa_ld1q_##suffix(ptr + 16); \ + _Tpv v2 = msa_ld1q_##suffix(ptr + 32); \ + _Tpvs v3 = __builtin_msa_vshf_b((_Tpvs)((v2i64){0x15120F0C09060300, 0x00000000001E1B18}), (_Tpvs)v1, (_Tpvs)v0); \ + *a = (_Tpv)__builtin_msa_vshf_b((_Tpvs)((v2i64){0x0706050403020100, 0x1D1A1714110A0908}), (_Tpvs)v2, v3); \ + v3 = __builtin_msa_vshf_b((_Tpvs)((v2i64){0x1613100D0A070401, 0x00000000001F1C19}), (_Tpvs)v1, (_Tpvs)v0); \ + *b 
= (_Tpv)__builtin_msa_vshf_b((_Tpvs)((v2i64){0x0706050403020100, 0x1E1B1815120A0908}), (_Tpvs)v2, v3); \ + v3 = __builtin_msa_vshf_b((_Tpvs)((v2i64){0x1714110E0B080502, 0x0000000000001D1A}), (_Tpvs)v1, (_Tpvs)v0); \ + *c = (_Tpv)__builtin_msa_vshf_b((_Tpvs)((v2i64){0x0706050403020100, 0x1F1C191613100908}), (_Tpvs)v2, v3); \ +} +#endif + +MSA_INTERLEAVED_IMPL_LOAD3_8(uint8_t, v16u8, v16i8, u8) +MSA_INTERLEAVED_IMPL_LOAD3_8(int8_t, v16i8, v16i8, s8) + +#ifdef _MIPSEB +#define MSA_INTERLEAVED_IMPL_LOAD3_16(_Tp, _Tpv, _Tpvs, suffix) \ +__extension__ extern __inline void \ +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) \ +msa_ld3q_##suffix(const _Tp* ptr, _Tpv* a, _Tpv* b, _Tpv* c) \ +{ \ + _Tpv v0 = msa_ld1q_##suffix(ptr); \ + _Tpv v1 = msa_ld1q_##suffix(ptr + 8); \ + _Tpv v2 = msa_ld1q_##suffix(ptr + 16); \ + _Tpvs v3 = __builtin_msa_vshf_h((_Tpvs)((v2i64){0x00030000000F000F, 0x000F000C00090006}), (_Tpvs)v1, (_Tpvs)v0); \ + *a = (_Tpv)__builtin_msa_vshf_h((_Tpvs)((v2i64){0x000B000A00050002, 0x000F000E000D000C}), (_Tpvs)v2, v3); \ + v3 = __builtin_msa_vshf_h((_Tpvs)((v2i64){0x0002000F000F000F, 0x000E000B00080005}), (_Tpvs)v1, (_Tpvs)v0); \ + *b = (_Tpv)__builtin_msa_vshf_h((_Tpvs)((v2i64){0x000B000700040001, 0x000F000E000D000C}), (_Tpvs)v2, v3); \ + v3 = __builtin_msa_vshf_h((_Tpvs)((v2i64){0x0001000F000F000F, 0x000D000A00070004}), (_Tpvs)v1, (_Tpvs)v0); \ + *c = (_Tpv)__builtin_msa_vshf_h((_Tpvs)((v2i64){0x000B000600030000, 0x000F000E000D000C}), (_Tpvs)v2, v3); \ +} +#else +#define MSA_INTERLEAVED_IMPL_LOAD3_16(_Tp, _Tpv, _Tpvs, suffix) \ +__extension__ extern __inline void \ +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) \ +msa_ld3q_##suffix(const _Tp* ptr, _Tpv* a, _Tpv* b, _Tpv* c) \ +{ \ + _Tpv v0 = msa_ld1q_##suffix(ptr); \ + _Tpv v1 = msa_ld1q_##suffix(ptr + 8); \ + _Tpv v2 = msa_ld1q_##suffix(ptr + 16); \ + _Tpvs v3 = __builtin_msa_vshf_h((_Tpvs)((v2i64){0x0009000600030000, 0x00000000000F000C}), (_Tpvs)v1, (_Tpvs)v0); \ + *a = (_Tpv)__builtin_msa_vshf_h((_Tpvs)((v2i64){0x0003000200010000, 0x000D000A00050004}), (_Tpvs)v2, v3); \ + v3 = __builtin_msa_vshf_h((_Tpvs)((v2i64){0x000A000700040001, 0x000000000000000D}), (_Tpvs)v1, (_Tpvs)v0); \ + *b = (_Tpv)__builtin_msa_vshf_h((_Tpvs)((v2i64){0x0003000200010000, 0x000E000B00080004}), (_Tpvs)v2, v3); \ + v3 = __builtin_msa_vshf_h((_Tpvs)((v2i64){0x000B000800050002, 0x000000000000000E}), (_Tpvs)v1, (_Tpvs)v0); \ + *c = (_Tpv)__builtin_msa_vshf_h((_Tpvs)((v2i64){0x0003000200010000, 0x000F000C00090004}), (_Tpvs)v2, v3); \ +} +#endif + +MSA_INTERLEAVED_IMPL_LOAD3_16(uint16_t, v8u16, v8i16, u16) +MSA_INTERLEAVED_IMPL_LOAD3_16(int16_t, v8i16, v8i16, s16) + +#define MSA_INTERLEAVED_IMPL_LOAD3_32(_Tp, _Tpv, _Tpvs, suffix) \ +__extension__ extern __inline void \ +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) \ +msa_ld3q_##suffix(const _Tp* ptr, _Tpv* a, _Tpv* b, _Tpv* c) \ +{ \ + _Tpv v00 = msa_ld1q_##suffix(ptr); \ + _Tpv v01 = msa_ld1q_##suffix(ptr + 4); \ + _Tpv v02 = msa_ld1q_##suffix(ptr + 8); \ + _Tpvs v10 = __builtin_msa_ilvr_w((_Tpvs)__builtin_msa_ilvl_d((v2i64)v01, (v2i64)v01), (_Tpvs)v00); \ + _Tpvs v11 = __builtin_msa_ilvr_w((_Tpvs)v02, (_Tpvs)__builtin_msa_ilvl_d((v2i64)v00, (v2i64)v00)); \ + _Tpvs v12 = __builtin_msa_ilvr_w((_Tpvs)__builtin_msa_ilvl_d((v2i64)v02, (v2i64)v02), (_Tpvs)v01); \ + *a = (_Tpv)__builtin_msa_ilvr_w((_Tpvs)__builtin_msa_ilvl_d((v2i64)v11, (v2i64)v11), v10); \ + *b = (_Tpv)__builtin_msa_ilvr_w(v12, (_Tpvs)__builtin_msa_ilvl_d((v2i64)v10, (v2i64)v10)); 
\ + *c = (_Tpv)__builtin_msa_ilvr_w((_Tpvs)__builtin_msa_ilvl_d((v2i64)v12, (v2i64)v12), v11); \ +} + +MSA_INTERLEAVED_IMPL_LOAD3_32(uint32_t, v4u32, v4i32, u32) +MSA_INTERLEAVED_IMPL_LOAD3_32(int32_t, v4i32, v4i32, s32) +MSA_INTERLEAVED_IMPL_LOAD3_32(float, v4f32, v4i32, f32) + +#define MSA_INTERLEAVED_IMPL_LOAD3_64(_Tp, _Tpv, suffix) \ +__extension__ extern __inline void \ +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) \ +msa_ld3q_##suffix(const _Tp* ptr, _Tpv* a, _Tpv* b, _Tpv* c) \ +{ \ + *((_Tp*)a) = *ptr; *((_Tp*)b) = *(ptr + 1); *((_Tp*)c) = *(ptr + 2); \ + *((_Tp*)a + 1) = *(ptr + 3); *((_Tp*)b + 1) = *(ptr + 4); *((_Tp*)c + 1) = *(ptr + 5); \ +} + +MSA_INTERLEAVED_IMPL_LOAD3_64(uint64_t, v2u64, u64) +MSA_INTERLEAVED_IMPL_LOAD3_64(int64_t, v2i64, s64) +MSA_INTERLEAVED_IMPL_LOAD3_64(double, v2f64, f64) + +#ifdef _MIPSEB +#define MSA_INTERLEAVED_IMPL_STORE3_8(_Tp, _Tpv, _Tpvs, suffix) \ +__extension__ extern __inline void \ +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) \ +msa_st3q_##suffix(_Tp* ptr, const _Tpv a, const _Tpv b, const _Tpv c) \ +{ \ + _Tpvs v0 = __builtin_msa_vshf_b((_Tpvs)((v2i64){0x0F0E0D0C0B1F1F1F, 0x1F1E1D1C1B1A1F1F}), (_Tpvs)b, (_Tpvs)a); \ + _Tpvs v1 = __builtin_msa_vshf_b((_Tpvs)((v2i64){0x0D1C140C1B130B1A, 0x1F170F1E160E1D15}), (_Tpvs)c, (_Tpvs)v0); \ + msa_st1q_##suffix(ptr, (_Tpv)v1); \ + v0 = __builtin_msa_vshf_b((_Tpvs)((v2i64){0x0A09080706051F1F, 0x19181716151F1F1F}), (_Tpvs)b, (_Tpvs)a); \ + v1 = __builtin_msa_vshf_b((_Tpvs)((v2i64){0x1D14071C13061B12, 0x170A1F16091E1508}), (_Tpvs)c, (_Tpvs)v0); \ + msa_st1q_##suffix(ptr + 16, (_Tpv)v1); \ + v0 = __builtin_msa_vshf_b((_Tpvs)((v2i64){0x04030201001F1F1F, 0x14131211101F1F1F}), (_Tpvs)b, (_Tpvs)a); \ + v1 = __builtin_msa_vshf_b((_Tpvs)((v2i64){0x15021C14011B1300, 0x051F17041E16031D}), (_Tpvs)c, (_Tpvs)v0); \ + msa_st1q_##suffix(ptr + 32, (_Tpv)v1); \ +} +#else +#define MSA_INTERLEAVED_IMPL_STORE3_8(_Tp, _Tpv, _Tpvs, suffix) \ +__extension__ extern __inline void \ +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) \ +msa_st3q_##suffix(_Tp* ptr, const _Tpv a, const _Tpv b, const _Tpv c) \ +{ \ + _Tpvs v0 = __builtin_msa_vshf_b((_Tpvs)((v2i64){0x0000050403020100, 0x0000001413121110}), (_Tpvs)b, (_Tpvs)a); \ + _Tpvs v1 = __builtin_msa_vshf_b((_Tpvs)((v2i64){0x0A02110901100800, 0x05140C04130B0312}), (_Tpvs)c, (_Tpvs)v0); \ + msa_st1q_##suffix(ptr, (_Tpv)v1); \ + v0 = __builtin_msa_vshf_b((_Tpvs)((v2i64){0x0000000A09080706, 0x00001A1918171615}), (_Tpvs)b, (_Tpvs)a); \ + v1 = __builtin_msa_vshf_b((_Tpvs)((v2i64){0x170A011609001508, 0x0D04190C03180B02}), (_Tpvs)c, (_Tpvs)v0); \ + msa_st1q_##suffix(ptr + 16, (_Tpv)v1); \ + v0 = __builtin_msa_vshf_b((_Tpvs)((v2i64){0x0000000F0E0D0C0B, 0x0000001F1E1D1C1B}), (_Tpvs)b, (_Tpvs)a); \ + v1 = __builtin_msa_vshf_b((_Tpvs)((v2i64){0x021C09011B08001A, 0x1F0C041E0B031D0A}), (_Tpvs)c, (_Tpvs)v0); \ + msa_st1q_##suffix(ptr + 32, (_Tpv)v1); \ +} +#endif + +MSA_INTERLEAVED_IMPL_STORE3_8(uint8_t, v16u8, v16i8, u8) +MSA_INTERLEAVED_IMPL_STORE3_8(int8_t, v16i8, v16i8, s8) + +#ifdef _MIPSEB +#define MSA_INTERLEAVED_IMPL_STORE3_16(_Tp, _Tpv, _Tpvs, suffix) \ +__extension__ extern __inline void \ +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) \ +msa_st3q_##suffix(_Tp* ptr, const _Tpv a, const _Tpv b, const _Tpv c) \ +{ \ + _Tpvs v0 = __builtin_msa_vshf_h((_Tpvs)((v2i64){0x000700060005000F, 0x000F000E000D000F}), (_Tpvs)b, (_Tpvs)a); \ + _Tpvs v1 = __builtin_msa_vshf_h((_Tpvs)((v2i64){0x000A0006000D0009, 
0x000F000B0007000E}), (_Tpvs)c, (_Tpvs)v0); \ + msa_st1q_##suffix(ptr, (_Tpv)v1); \ + v0 = __builtin_msa_vshf_h((_Tpvs)((v2i64){0x00040003000F000F, 0x000C000B000A000F}), (_Tpvs)b, (_Tpvs)a); \ + v1 = __builtin_msa_vshf_h((_Tpvs)((v2i64){0x000E000A0003000D, 0x0005000F000B0004}), (_Tpvs)c, (_Tpvs)v0); \ + msa_st1q_##suffix(ptr + 8, (_Tpv)v1); \ + v0 = __builtin_msa_vshf_h((_Tpvs)((v2i64){0x000200010000000F, 0x00090008000F000F}), (_Tpvs)b, (_Tpvs)a); \ + v1 = __builtin_msa_vshf_h((_Tpvs)((v2i64){0x0001000E00090000, 0x000B0002000F000A}), (_Tpvs)c, (_Tpvs)v0); \ + msa_st1q_##suffix(ptr + 16, (_Tpv)v1); \ +} +#else +#define MSA_INTERLEAVED_IMPL_STORE3_16(_Tp, _Tpv, _Tpvs, suffix) \ +__extension__ extern __inline void \ +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) \ +msa_st3q_##suffix(_Tp* ptr, const _Tpv a, const _Tpv b, const _Tpv c) \ +{ \ + _Tpvs v0 = __builtin_msa_vshf_h((_Tpvs)((v2i64){0x0000000200010000, 0x0000000A00090008}), (_Tpvs)b, (_Tpvs)a); \ + _Tpvs v1 = __builtin_msa_vshf_h((_Tpvs)((v2i64){0x0001000800040000, 0x0006000200090005}), (_Tpvs)c, (_Tpvs)v0); \ + msa_st1q_##suffix(ptr, (_Tpv)v1); \ + v0 = __builtin_msa_vshf_h((_Tpvs)((v2i64){0x0000000500040003, 0x00000000000C000B}), (_Tpvs)b, (_Tpvs)a); \ + v1 = __builtin_msa_vshf_h((_Tpvs)((v2i64){0x000B00040000000A, 0x0002000C00050001}), (_Tpvs)c, (_Tpvs)v0); \ + msa_st1q_##suffix(ptr + 8, (_Tpv)v1); \ + v0 = __builtin_msa_vshf_h((_Tpvs)((v2i64){0x0000000000070006, 0x0000000F000E000D}), (_Tpvs)b, (_Tpvs)a); \ + v1 = __builtin_msa_vshf_h((_Tpvs)((v2i64){0x00050000000D0004, 0x000F00060001000E}), (_Tpvs)c, (_Tpvs)v0); \ + msa_st1q_##suffix(ptr + 16, (_Tpv)v1); \ +} +#endif + +MSA_INTERLEAVED_IMPL_STORE3_16(uint16_t, v8u16, v8i16, u16) +MSA_INTERLEAVED_IMPL_STORE3_16(int16_t, v8i16, v8i16, s16) + +#ifdef _MIPSEB +#define MSA_INTERLEAVED_IMPL_STORE3_32(_Tp, _Tpv, _Tpvs, suffix) \ +__extension__ extern __inline void \ +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) \ +msa_st3q_##suffix(_Tp* ptr, const _Tpv a, const _Tpv b, const _Tpv c) \ +{ \ + _Tpvs v0 = __builtin_msa_vshf_w((_Tpvs)((v2i64){0x0000000300000007, 0x0000000700000006}), (_Tpvs)b, (_Tpvs)a); \ + _Tpvs v1 = __builtin_msa_vshf_w((_Tpvs)((v2i64){0x0000000300000006, 0x0000000700000005}), (_Tpvs)c, (_Tpvs)v0); \ + msa_st1q_##suffix(ptr, (_Tpv)v1); \ + v0 = __builtin_msa_vshf_w((_Tpvs)((v2i64){0x0000000200000001, 0x0000000500000007}), (_Tpvs)b, (_Tpvs)a); \ + v1 = __builtin_msa_vshf_w((_Tpvs)((v2i64){0x0000000700000004, 0x0000000500000002}), (_Tpvs)c, (_Tpvs)v0); \ + msa_st1q_##suffix(ptr + 4, (_Tpv)v1); \ + v0 = __builtin_msa_vshf_w((_Tpvs)((v2i64){0x0000000000000007, 0x0000000400000007}), (_Tpvs)b, (_Tpvs)a); \ + v1 = __builtin_msa_vshf_w((_Tpvs)((v2i64){0x0000000500000000, 0x0000000100000007}), (_Tpvs)c, (_Tpvs)v0); \ + msa_st1q_##suffix(ptr + 8, (_Tpv)v1); \ +} +#else +#define MSA_INTERLEAVED_IMPL_STORE3_32(_Tp, _Tpv, _Tpvs, suffix) \ +__extension__ extern __inline void \ +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) \ +msa_st3q_##suffix(_Tp* ptr, const _Tpv a, const _Tpv b, const _Tpv c) \ +{ \ + _Tpvs v0 = __builtin_msa_vshf_w((_Tpvs)((v2i64){0x0000000100000000, 0x0000000000000004}), (_Tpvs)b, (_Tpvs)a); \ + _Tpvs v1 = __builtin_msa_vshf_w((_Tpvs)((v2i64){0x0000000200000000, 0x0000000100000004}), (_Tpvs)c, (_Tpvs)v0); \ + msa_st1q_##suffix(ptr, (_Tpv)v1); \ + v0 = __builtin_msa_vshf_w((_Tpvs)((v2i64){0x0000000000000002, 0x0000000600000005}), (_Tpvs)b, (_Tpvs)a); \ + v1 = 
__builtin_msa_vshf_w((_Tpvs)((v2i64){0x0000000500000002, 0x0000000300000000}), (_Tpvs)c, (_Tpvs)v0); \ + msa_st1q_##suffix(ptr + 4, (_Tpv)v1); \ + v0 = __builtin_msa_vshf_w((_Tpvs)((v2i64){0x0000000000000003, 0x0000000000000007}), (_Tpvs)b, (_Tpvs)a); \ + v1 = __builtin_msa_vshf_w((_Tpvs)((v2i64){0x0000000000000006, 0x0000000700000002}), (_Tpvs)c, (_Tpvs)v0); \ + msa_st1q_##suffix(ptr + 8, (_Tpv)v1); \ +} +#endif + +MSA_INTERLEAVED_IMPL_STORE3_32(uint32_t, v4u32, v4i32, u32) +MSA_INTERLEAVED_IMPL_STORE3_32(int32_t, v4i32, v4i32, s32) +MSA_INTERLEAVED_IMPL_STORE3_32(float, v4f32, v4i32, f32) + +#define MSA_INTERLEAVED_IMPL_STORE3_64(_Tp, _Tpv, suffix) \ +__extension__ extern __inline void \ +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) \ +msa_st3q_##suffix(_Tp* ptr, const _Tpv a, const _Tpv b, const _Tpv c) \ +{ \ + *ptr = a[0]; *(ptr + 1) = b[0]; *(ptr + 2) = c[0]; \ + *(ptr + 3) = a[1]; *(ptr + 4) = b[1]; *(ptr + 5) = c[1]; \ +} + +MSA_INTERLEAVED_IMPL_STORE3_64(uint64_t, v2u64, u64) +MSA_INTERLEAVED_IMPL_STORE3_64(int64_t, v2i64, s64) +MSA_INTERLEAVED_IMPL_STORE3_64(double, v2f64, f64) + +#define MSA_INTERLEAVED_IMPL_LOAD4_STORE4(_Tp, _Tpv, _Tpvs, suffix, df, nlanes) \ +__extension__ extern __inline void \ +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) \ +msa_ld4q_##suffix(const _Tp* ptr, _Tpv* a, _Tpv* b, _Tpv* c, _Tpv* d) \ +{ \ + _Tpv v0 = msa_ld1q_##suffix(ptr); \ + _Tpv v1 = msa_ld1q_##suffix(ptr + nlanes); \ + _Tpv v2 = msa_ld1q_##suffix(ptr + nlanes * 2); \ + _Tpv v3 = msa_ld1q_##suffix(ptr + nlanes * 3); \ + _Tpvs t0 = __builtin_msa_pckev_##df((_Tpvs)v1, (_Tpvs)v0); \ + _Tpvs t1 = __builtin_msa_pckev_##df((_Tpvs)v3, (_Tpvs)v2); \ + _Tpvs t2 = __builtin_msa_pckod_##df((_Tpvs)v1, (_Tpvs)v0); \ + _Tpvs t3 = __builtin_msa_pckod_##df((_Tpvs)v3, (_Tpvs)v2); \ + *a = (_Tpv)__builtin_msa_pckev_##df(t1, t0); \ + *b = (_Tpv)__builtin_msa_pckev_##df(t3, t2); \ + *c = (_Tpv)__builtin_msa_pckod_##df(t1, t0); \ + *d = (_Tpv)__builtin_msa_pckod_##df(t3, t2); \ +} \ +__extension__ extern __inline void \ +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) \ +msa_st4q_##suffix(_Tp* ptr, const _Tpv a, const _Tpv b, const _Tpv c, const _Tpv d) \ +{ \ + _Tpvs v0 = __builtin_msa_ilvr_##df((_Tpvs)c, (_Tpvs)a); \ + _Tpvs v1 = __builtin_msa_ilvr_##df((_Tpvs)d, (_Tpvs)b); \ + _Tpvs v2 = __builtin_msa_ilvl_##df((_Tpvs)c, (_Tpvs)a); \ + _Tpvs v3 = __builtin_msa_ilvl_##df((_Tpvs)d, (_Tpvs)b); \ + msa_st1q_##suffix(ptr, (_Tpv)__builtin_msa_ilvr_##df(v1, v0)); \ + msa_st1q_##suffix(ptr + nlanes, (_Tpv)__builtin_msa_ilvl_##df(v1, v0)); \ + msa_st1q_##suffix(ptr + 2 * nlanes, (_Tpv)__builtin_msa_ilvr_##df(v3, v2)); \ + msa_st1q_##suffix(ptr + 3 * nlanes, (_Tpv)__builtin_msa_ilvl_##df(v3, v2)); \ +} + +MSA_INTERLEAVED_IMPL_LOAD4_STORE4(uint8_t, v16u8, v16i8, u8, b, 16) +MSA_INTERLEAVED_IMPL_LOAD4_STORE4(int8_t, v16i8, v16i8, s8, b, 16) +MSA_INTERLEAVED_IMPL_LOAD4_STORE4(uint16_t, v8u16, v8i16, u16, h, 8) +MSA_INTERLEAVED_IMPL_LOAD4_STORE4(int16_t, v8i16, v8i16, s16, h, 8) +MSA_INTERLEAVED_IMPL_LOAD4_STORE4(uint32_t, v4u32, v4i32, u32, w, 4) +MSA_INTERLEAVED_IMPL_LOAD4_STORE4(int32_t, v4i32, v4i32, s32, w, 4) +MSA_INTERLEAVED_IMPL_LOAD4_STORE4(float, v4f32, v4i32, f32, w, 4) + +#define MSA_INTERLEAVED_IMPL_LOAD4_STORE4_64(_Tp, _Tpv, _Tpvs, suffix) \ +__extension__ extern __inline void \ +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) \ +msa_ld4q_##suffix(const _Tp* ptr, _Tpv* a, _Tpv* b, _Tpv* c, _Tpv* d) \ +{ \ + _Tpv v0 = 
msa_ld1q_##suffix(ptr); \ + _Tpv v1 = msa_ld1q_##suffix(ptr + 2); \ + _Tpv v2 = msa_ld1q_##suffix(ptr + 4); \ + _Tpv v3 = msa_ld1q_##suffix(ptr + 6); \ + *a = (_Tpv)__builtin_msa_ilvr_d((_Tpvs)v2, (_Tpvs)v0); \ + *b = (_Tpv)__builtin_msa_ilvl_d((_Tpvs)v2, (_Tpvs)v0); \ + *c = (_Tpv)__builtin_msa_ilvr_d((_Tpvs)v3, (_Tpvs)v1); \ + *d = (_Tpv)__builtin_msa_ilvl_d((_Tpvs)v3, (_Tpvs)v1); \ +} \ +__extension__ extern __inline void \ +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) \ +msa_st4q_##suffix(_Tp* ptr, const _Tpv a, const _Tpv b, const _Tpv c, const _Tpv d) \ +{ \ + msa_st1q_##suffix(ptr, (_Tpv)__builtin_msa_ilvr_d((_Tpvs)b, (_Tpvs)a)); \ + msa_st1q_##suffix(ptr + 2, (_Tpv)__builtin_msa_ilvr_d((_Tpvs)d, (_Tpvs)c)); \ + msa_st1q_##suffix(ptr + 4, (_Tpv)__builtin_msa_ilvl_d((_Tpvs)b, (_Tpvs)a)); \ + msa_st1q_##suffix(ptr + 6, (_Tpv)__builtin_msa_ilvl_d((_Tpvs)d, (_Tpvs)c)); \ +} + +MSA_INTERLEAVED_IMPL_LOAD4_STORE4_64(uint64_t, v2u64, v2i64, u64) +MSA_INTERLEAVED_IMPL_LOAD4_STORE4_64(int64_t, v2i64, v2i64, s64) +MSA_INTERLEAVED_IMPL_LOAD4_STORE4_64(double, v2f64, v2i64, f64) + +__extension__ extern __inline v8i16 +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +msa_qdmulhq_n_s16(v8i16 a, int16_t b) +{ + v8i16 a_lo, a_hi; + ILVRL_H2_SH(a, msa_dupq_n_s16(0), a_lo, a_hi); + return msa_packr_s32(msa_shlq_n_s32(msa_mulq_s32(msa_paddlq_s16(a_lo), msa_dupq_n_s32(b)), 1), + msa_shlq_n_s32(msa_mulq_s32(msa_paddlq_s16(a_hi), msa_dupq_n_s32(b)), 1), 16); +} + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif /*__mips_msa*/ +#endif /* OPENCV_CORE_MSA_MACROS_H */ diff --git a/3rdparty/opencv/include/opencv2/core/hal/simd_utils.impl.hpp b/3rdparty/opencv/include/opencv2/core/hal/simd_utils.impl.hpp new file mode 100644 index 00000000..fff8f942 --- /dev/null +++ b/3rdparty/opencv/include/opencv2/core/hal/simd_utils.impl.hpp @@ -0,0 +1,146 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html + +// This header is not standalone. Don't include directly, use "intrin.hpp" instead. 
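A minimal usage sketch, not taken from the upstream headers: once user code includes "opencv2/core/hal/intrin.hpp" as the comment above instructs, the width-agnostic helpers this implementation file provides (notably the templated vx_setall dispatch defined further below) combine with the regular universal intrinsics roughly as follows. The function name scale_buffer and the loop structure are assumptions for illustration, and the exact operator set available depends on which SIMD backend the build enables.

    #include <opencv2/core/hal/intrin.hpp>

    // Hypothetical helper: scale a float buffer using whichever register width
    // (128/256/512-bit) the build selects, with a scalar tail for the remainder.
    static void scale_buffer(float* data, int n, float factor)
    {
    #if CV_SIMD
        const int step = cv::v_float32::nlanes;          // lanes of the widest enabled vector type
        cv::v_float32 vfactor = cv::vx_setall(factor);   // templated vx_setall<> from this header
        int i = 0;
        for (; i + step <= n; i += step)
            cv::v_store(data + i, cv::vx_load(data + i) * vfactor);
        for (; i < n; ++i)                               // scalar tail
            data[i] *= factor;
    #else
        for (int i = 0; i < n; ++i)
            data[i] *= factor;
    #endif
    }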
+#ifdef OPENCV_HAL_INTRIN_HPP // defined in intrin.hpp + + +#if CV_SIMD128 || CV_SIMD128_CPP + +template struct Type2Vec128_Traits; +#define CV_INTRIN_DEF_TYPE2VEC128_TRAITS(type_, vec_type_) \ + template<> struct Type2Vec128_Traits \ + { \ + typedef vec_type_ vec_type; \ + } + +CV_INTRIN_DEF_TYPE2VEC128_TRAITS(uchar, v_uint8x16); +CV_INTRIN_DEF_TYPE2VEC128_TRAITS(schar, v_int8x16); +CV_INTRIN_DEF_TYPE2VEC128_TRAITS(ushort, v_uint16x8); +CV_INTRIN_DEF_TYPE2VEC128_TRAITS(short, v_int16x8); +CV_INTRIN_DEF_TYPE2VEC128_TRAITS(unsigned, v_uint32x4); +CV_INTRIN_DEF_TYPE2VEC128_TRAITS(int, v_int32x4); +CV_INTRIN_DEF_TYPE2VEC128_TRAITS(float, v_float32x4); +CV_INTRIN_DEF_TYPE2VEC128_TRAITS(uint64, v_uint64x2); +CV_INTRIN_DEF_TYPE2VEC128_TRAITS(int64, v_int64x2); +#if CV_SIMD128_64F +CV_INTRIN_DEF_TYPE2VEC128_TRAITS(double, v_float64x2); +#endif + +template static inline +typename Type2Vec128_Traits<_T>::vec_type v_setall(const _T& a); + +template<> inline Type2Vec128_Traits< uchar>::vec_type v_setall< uchar>(const uchar& a) { return v_setall_u8(a); } +template<> inline Type2Vec128_Traits< schar>::vec_type v_setall< schar>(const schar& a) { return v_setall_s8(a); } +template<> inline Type2Vec128_Traits::vec_type v_setall(const ushort& a) { return v_setall_u16(a); } +template<> inline Type2Vec128_Traits< short>::vec_type v_setall< short>(const short& a) { return v_setall_s16(a); } +template<> inline Type2Vec128_Traits< uint>::vec_type v_setall< uint>(const uint& a) { return v_setall_u32(a); } +template<> inline Type2Vec128_Traits< int>::vec_type v_setall< int>(const int& a) { return v_setall_s32(a); } +template<> inline Type2Vec128_Traits::vec_type v_setall(const uint64& a) { return v_setall_u64(a); } +template<> inline Type2Vec128_Traits< int64>::vec_type v_setall< int64>(const int64& a) { return v_setall_s64(a); } +template<> inline Type2Vec128_Traits< float>::vec_type v_setall< float>(const float& a) { return v_setall_f32(a); } +#if CV_SIMD128_64F +template<> inline Type2Vec128_Traits::vec_type v_setall(const double& a) { return v_setall_f64(a); } +#endif + +#endif // SIMD128 + + +#if CV_SIMD256 + +template struct Type2Vec256_Traits; +#define CV_INTRIN_DEF_TYPE2VEC256_TRAITS(type_, vec_type_) \ + template<> struct Type2Vec256_Traits \ + { \ + typedef vec_type_ vec_type; \ + } + +CV_INTRIN_DEF_TYPE2VEC256_TRAITS(uchar, v_uint8x32); +CV_INTRIN_DEF_TYPE2VEC256_TRAITS(schar, v_int8x32); +CV_INTRIN_DEF_TYPE2VEC256_TRAITS(ushort, v_uint16x16); +CV_INTRIN_DEF_TYPE2VEC256_TRAITS(short, v_int16x16); +CV_INTRIN_DEF_TYPE2VEC256_TRAITS(unsigned, v_uint32x8); +CV_INTRIN_DEF_TYPE2VEC256_TRAITS(int, v_int32x8); +CV_INTRIN_DEF_TYPE2VEC256_TRAITS(float, v_float32x8); +CV_INTRIN_DEF_TYPE2VEC256_TRAITS(uint64, v_uint64x4); +CV_INTRIN_DEF_TYPE2VEC256_TRAITS(int64, v_int64x4); +#if CV_SIMD256_64F +CV_INTRIN_DEF_TYPE2VEC256_TRAITS(double, v_float64x4); +#endif + +template static inline +typename Type2Vec256_Traits<_T>::vec_type v256_setall(const _T& a); + +template<> inline Type2Vec256_Traits< uchar>::vec_type v256_setall< uchar>(const uchar& a) { return v256_setall_u8(a); } +template<> inline Type2Vec256_Traits< schar>::vec_type v256_setall< schar>(const schar& a) { return v256_setall_s8(a); } +template<> inline Type2Vec256_Traits::vec_type v256_setall(const ushort& a) { return v256_setall_u16(a); } +template<> inline Type2Vec256_Traits< short>::vec_type v256_setall< short>(const short& a) { return v256_setall_s16(a); } +template<> inline Type2Vec256_Traits< uint>::vec_type v256_setall< uint>(const uint& a) { return 
v256_setall_u32(a); } +template<> inline Type2Vec256_Traits< int>::vec_type v256_setall< int>(const int& a) { return v256_setall_s32(a); } +template<> inline Type2Vec256_Traits::vec_type v256_setall(const uint64& a) { return v256_setall_u64(a); } +template<> inline Type2Vec256_Traits< int64>::vec_type v256_setall< int64>(const int64& a) { return v256_setall_s64(a); } +template<> inline Type2Vec256_Traits< float>::vec_type v256_setall< float>(const float& a) { return v256_setall_f32(a); } +#if CV_SIMD256_64F +template<> inline Type2Vec256_Traits::vec_type v256_setall(const double& a) { return v256_setall_f64(a); } +#endif + +#endif // SIMD256 + + +#if CV_SIMD512 + +template struct Type2Vec512_Traits; +#define CV_INTRIN_DEF_TYPE2VEC512_TRAITS(type_, vec_type_) \ + template<> struct Type2Vec512_Traits \ + { \ + typedef vec_type_ vec_type; \ + } + +CV_INTRIN_DEF_TYPE2VEC512_TRAITS(uchar, v_uint8x64); +CV_INTRIN_DEF_TYPE2VEC512_TRAITS(schar, v_int8x64); +CV_INTRIN_DEF_TYPE2VEC512_TRAITS(ushort, v_uint16x32); +CV_INTRIN_DEF_TYPE2VEC512_TRAITS(short, v_int16x32); +CV_INTRIN_DEF_TYPE2VEC512_TRAITS(unsigned, v_uint32x16); +CV_INTRIN_DEF_TYPE2VEC512_TRAITS(int, v_int32x16); +CV_INTRIN_DEF_TYPE2VEC512_TRAITS(float, v_float32x16); +CV_INTRIN_DEF_TYPE2VEC512_TRAITS(uint64, v_uint64x8); +CV_INTRIN_DEF_TYPE2VEC512_TRAITS(int64, v_int64x8); +#if CV_SIMD512_64F +CV_INTRIN_DEF_TYPE2VEC512_TRAITS(double, v_float64x8); +#endif + +template static inline +typename Type2Vec512_Traits<_T>::vec_type v512_setall(const _T& a); + +template<> inline Type2Vec512_Traits< uchar>::vec_type v512_setall< uchar>(const uchar& a) { return v512_setall_u8(a); } +template<> inline Type2Vec512_Traits< schar>::vec_type v512_setall< schar>(const schar& a) { return v512_setall_s8(a); } +template<> inline Type2Vec512_Traits::vec_type v512_setall(const ushort& a) { return v512_setall_u16(a); } +template<> inline Type2Vec512_Traits< short>::vec_type v512_setall< short>(const short& a) { return v512_setall_s16(a); } +template<> inline Type2Vec512_Traits< uint>::vec_type v512_setall< uint>(const uint& a) { return v512_setall_u32(a); } +template<> inline Type2Vec512_Traits< int>::vec_type v512_setall< int>(const int& a) { return v512_setall_s32(a); } +template<> inline Type2Vec512_Traits::vec_type v512_setall(const uint64& a) { return v512_setall_u64(a); } +template<> inline Type2Vec512_Traits< int64>::vec_type v512_setall< int64>(const int64& a) { return v512_setall_s64(a); } +template<> inline Type2Vec512_Traits< float>::vec_type v512_setall< float>(const float& a) { return v512_setall_f32(a); } +#if CV_SIMD512_64F +template<> inline Type2Vec512_Traits::vec_type v512_setall(const double& a) { return v512_setall_f64(a); } +#endif + +#endif // SIMD512 + + +#if CV_SIMD_WIDTH == 16 +template static inline +typename Type2Vec128_Traits<_T>::vec_type vx_setall(const _T& a) { return v_setall(a); } +#elif CV_SIMD_WIDTH == 32 +template static inline +typename Type2Vec256_Traits<_T>::vec_type vx_setall(const _T& a) { return v256_setall(a); } +#elif CV_SIMD_WIDTH == 64 +template static inline +typename Type2Vec512_Traits<_T>::vec_type vx_setall(const _T& a) { return v512_setall(a); } +#else +#error "Build configuration error, unsupported CV_SIMD_WIDTH" +#endif + + +#endif // OPENCV_HAL_INTRIN_HPP diff --git a/3rdparty/opencv/include/opencv2/core/simd_intrinsics.hpp b/3rdparty/opencv/include/opencv2/core/simd_intrinsics.hpp new file mode 100644 index 00000000..309202d1 --- /dev/null +++ b/3rdparty/opencv/include/opencv2/core/simd_intrinsics.hpp @@ 
-0,0 +1,87 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. + +#ifndef OPENCV_CORE_SIMD_INTRINSICS_HPP +#define OPENCV_CORE_SIMD_INTRINSICS_HPP + +/** +Helper header to support SIMD intrinsics (universal intrinsics) in user code. +Intrinsics documentation: https://docs.opencv.org/3.4/df/d91/group__core__hal__intrin.html + + +Checks of target CPU instruction set based on compiler definitions don't work well enough. +More reliable solutions require utilization of configuration systems (like CMake). + +So, probably you need to specify your own configuration. + +You can do that via CMake in this way: + add_definitions(/DOPENCV_SIMD_CONFIG_HEADER=opencv_simd_config_custom.hpp) +or + add_definitions(/DOPENCV_SIMD_CONFIG_INCLUDE_DIR=1) + +Additionally you may need to add include directory to your files: + include_directories("${CMAKE_CURRENT_LIST_DIR}/opencv_config_${MYTARGET}") + +These files can be pre-generated for target configurations of your application +or generated by CMake on the fly (use CMAKE_BINARY_DIR for that). + +Notes: +- H/W capability checks are still responsibility of your application +- runtime dispatching is not covered by this helper header +*/ + +#ifdef __OPENCV_BUILD +#error "Use core/hal/intrin.hpp during OpenCV build" +#endif + +#ifdef OPENCV_HAL_INTRIN_HPP +#error "core/simd_intrinsics.hpp must be included before core/hal/intrin.hpp" +#endif + +#include "opencv2/core/cvdef.h" + +#ifdef OPENCV_SIMD_CONFIG_HEADER +#include CVAUX_STR(OPENCV_SIMD_CONFIG_HEADER) +#elif defined(OPENCV_SIMD_CONFIG_INCLUDE_DIR) +#include "opencv_simd_config.hpp" // corresponding directory should be added via -I compiler parameter +#else // custom config headers + +#if (!defined(CV_AVX_512F) || !CV_AVX_512F) && (defined(__AVX512__) || defined(__AVX512F__)) +# include +# undef CV_AVX_512F +# define CV_AVX_512F 1 +# ifndef OPENCV_SIMD_DONT_ASSUME_SKX // Skylake-X with AVX-512F/CD/BW/DQ/VL +# undef CV_AVX512_SKX +# define CV_AVX512_SKX 1 +# undef CV_AVX_512CD +# define CV_AVX_512CD 1 +# undef CV_AVX_512BW +# define CV_AVX_512BW 1 +# undef CV_AVX_512DQ +# define CV_AVX_512DQ 1 +# undef CV_AVX_512VL +# define CV_AVX_512VL 1 +# endif +#endif // AVX512 + +// GCC/Clang: -mavx2 +// MSVC: /arch:AVX2 +#if defined __AVX2__ +# include +# undef CV_AVX2 +# define CV_AVX2 1 +# if defined __F16C__ +# undef CV_FP16 +# define CV_FP16 1 +# endif +#endif + +#endif + +// SSE / NEON / VSX is handled by cv_cpu_dispatch.h compatibility block +#include "cv_cpu_dispatch.h" + +#include "hal/intrin.hpp" + +#endif // OPENCV_CORE_SIMD_INTRINSICS_HPP diff --git a/3rdparty/opencv/include/opencv2/core/utils/allocator_stats.hpp b/3rdparty/opencv/include/opencv2/core/utils/allocator_stats.hpp new file mode 100644 index 00000000..79e93382 --- /dev/null +++ b/3rdparty/opencv/include/opencv2/core/utils/allocator_stats.hpp @@ -0,0 +1,29 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. 
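A short consumer-side sketch, not taken from the upstream headers: the statistics interface declared just below exposes only read accessors plus resetPeakUsage(), so a typical client looks roughly like this. dumpAllocatorStats and the way the interface reference is obtained are assumptions; concrete AllocatorStatistics instances live in the implementation header that follows, and no accessor is declared here.

    #include <iostream>
    #include "opencv2/core/utils/allocator_stats.hpp"

    // Hypothetical reporting helper built only on the virtual methods declared below.
    static void dumpAllocatorStats(cv::utils::AllocatorStatisticsInterface& stats)
    {
        std::cout << "current bytes: "  << stats.getCurrentUsage()
                  << ", peak bytes: "   << stats.getPeakUsage()
                  << ", total bytes: "  << stats.getTotalUsage()
                  << ", allocations: "  << stats.getNumberOfAllocations() << std::endl;
        stats.resetPeakUsage();  // peak := current usage, as documented on the interface
    }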
diff --git a/3rdparty/opencv/include/opencv2/core/utils/allocator_stats.hpp b/3rdparty/opencv/include/opencv2/core/utils/allocator_stats.hpp
new file mode 100644
index 00000000..79e93382
--- /dev/null
+++ b/3rdparty/opencv/include/opencv2/core/utils/allocator_stats.hpp
@@ -0,0 +1,29 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_CORE_ALLOCATOR_STATS_HPP
+#define OPENCV_CORE_ALLOCATOR_STATS_HPP
+
+#include "../cvdef.h"
+
+namespace cv { namespace utils {
+
+class AllocatorStatisticsInterface
+{
+protected:
+    AllocatorStatisticsInterface() {}
+    virtual ~AllocatorStatisticsInterface() {}
+public:
+    virtual uint64_t getCurrentUsage() const = 0;
+    virtual uint64_t getTotalUsage() const = 0;
+    virtual uint64_t getNumberOfAllocations() const = 0;
+    virtual uint64_t getPeakUsage() const = 0;
+
+    /** set peak usage = current usage */
+    virtual void resetPeakUsage() = 0;
+};
+
+}} // namespace
+
+#endif // OPENCV_CORE_ALLOCATOR_STATS_HPP
diff --git a/3rdparty/opencv/include/opencv2/core/utils/allocator_stats.impl.hpp b/3rdparty/opencv/include/opencv2/core/utils/allocator_stats.impl.hpp
new file mode 100644
index 00000000..eb5ecde1
--- /dev/null
+++ b/3rdparty/opencv/include/opencv2/core/utils/allocator_stats.impl.hpp
@@ -0,0 +1,158 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_CORE_ALLOCATOR_STATS_IMPL_HPP
+#define OPENCV_CORE_ALLOCATOR_STATS_IMPL_HPP
+
+#include "./allocator_stats.hpp"
+
+//#define OPENCV_DISABLE_ALLOCATOR_STATS
+
+#ifdef CV_CXX11
+
+#include <atomic>
+
+#ifndef OPENCV_ALLOCATOR_STATS_COUNTER_TYPE
+#if defined(__GNUC__) && (\
+        (defined(__SIZEOF_POINTER__) && __SIZEOF_POINTER__ == 4) || \
+        (defined(__GCC_HAVE_SYNC_COMPARE_AND_SWAP_4) && !defined(__GCC_HAVE_SYNC_COMPARE_AND_SWAP_8)) \
+    )
+#define OPENCV_ALLOCATOR_STATS_COUNTER_TYPE int
+#endif
+#endif
+
+#ifndef OPENCV_ALLOCATOR_STATS_COUNTER_TYPE
+#define OPENCV_ALLOCATOR_STATS_COUNTER_TYPE long long
+#endif
+
+#else // CV_CXX11
+
+#ifndef OPENCV_ALLOCATOR_STATS_COUNTER_TYPE
+#define OPENCV_ALLOCATOR_STATS_COUNTER_TYPE int  // CV_XADD supports int only
+#endif
+
+#endif // CV_CXX11
+
+namespace cv { namespace utils {
+
+#ifdef CV__ALLOCATOR_STATS_LOG
+namespace {
+#endif
+
+class AllocatorStatistics : public AllocatorStatisticsInterface
+{
+#ifdef OPENCV_DISABLE_ALLOCATOR_STATS
+
+public:
+    AllocatorStatistics() {}
+    ~AllocatorStatistics() CV_OVERRIDE {}
+
+    uint64_t getCurrentUsage() const CV_OVERRIDE { return 0; }
+    uint64_t getTotalUsage() const CV_OVERRIDE { return 0; }
+    uint64_t getNumberOfAllocations() const CV_OVERRIDE { return 0; }
+    uint64_t getPeakUsage() const CV_OVERRIDE { return 0; }
+
+    /** set peak usage = current usage */
+    void resetPeakUsage() CV_OVERRIDE {};
+
+    void onAllocate(size_t /*sz*/) {}
+    void onFree(size_t /*sz*/) {}
+
+#elif defined(CV_CXX11)
+
+protected:
+    typedef OPENCV_ALLOCATOR_STATS_COUNTER_TYPE counter_t;
+    std::atomic<counter_t> curr, total, total_allocs, peak;
+public:
+    AllocatorStatistics() {}
+    ~AllocatorStatistics() CV_OVERRIDE {}
+
+    uint64_t getCurrentUsage() const CV_OVERRIDE { return (uint64_t)curr.load(); }
+    uint64_t getTotalUsage() const CV_OVERRIDE { return (uint64_t)total.load(); }
+    uint64_t getNumberOfAllocations() const CV_OVERRIDE { return (uint64_t)total_allocs.load(); }
+    uint64_t getPeakUsage() const CV_OVERRIDE { return (uint64_t)peak.load(); }
+
+    /** set peak usage = current usage */
+    void resetPeakUsage() CV_OVERRIDE { peak.store(curr.load()); }
+
+    // Controller interface
+    void onAllocate(size_t sz)
+    {
+#ifdef CV__ALLOCATOR_STATS_LOG
+        CV__ALLOCATOR_STATS_LOG(cv::format("allocate: %lld (curr=%lld)", (long long int)sz, (long long int)curr.load()));
+#endif
+
+        counter_t new_curr = curr.fetch_add((counter_t)sz) + (counter_t)sz;
+
+        // peak = std::max((uint64_t)peak, new_curr);
+        auto prev_peak = peak.load();
+        while (prev_peak < new_curr)
+        {
+            if (peak.compare_exchange_weak(prev_peak, new_curr))
+                break;
+        }
+        // end of peak = max(...)
+
+        total += (counter_t)sz;
+        total_allocs++;
+    }
+    void onFree(size_t sz)
+    {
+#ifdef CV__ALLOCATOR_STATS_LOG
+        CV__ALLOCATOR_STATS_LOG(cv::format("free: %lld (curr=%lld)", (long long int)sz, (long long int)curr.load()));
+#endif
+        curr -= (counter_t)sz;
+    }
+
+#else // non C++11
+
+protected:
+    typedef OPENCV_ALLOCATOR_STATS_COUNTER_TYPE counter_t;
+    volatile counter_t curr, total, total_allocs, peak;  // overflow is possible, CV_XADD operates with 'int' only
+public:
+    AllocatorStatistics()
+        : curr(0), total(0), total_allocs(0), peak(0)
+    {}
+    ~AllocatorStatistics() CV_OVERRIDE {}
+
+    uint64_t getCurrentUsage() const CV_OVERRIDE { return (uint64_t)curr; }
+    uint64_t getTotalUsage() const CV_OVERRIDE { return (uint64_t)total; }
+    uint64_t getNumberOfAllocations() const CV_OVERRIDE { return (uint64_t)total_allocs; }
+    uint64_t getPeakUsage() const CV_OVERRIDE { return (uint64_t)peak; }
+
+    void resetPeakUsage() CV_OVERRIDE { peak = curr; }
+
+    // Controller interface
+    void onAllocate(size_t sz)
+    {
+#ifdef CV__ALLOCATOR_STATS_LOG
+        CV__ALLOCATOR_STATS_LOG(cv::format("allocate: %lld (curr=%lld)", (long long int)sz, (long long int)curr));
+#endif
+
+        counter_t new_curr = (counter_t)CV_XADD(&curr, (counter_t)sz) + (counter_t)sz;
+
+        peak = std::max((counter_t)peak, new_curr);  // non-thread safe
+
+        //CV_XADD(&total, (uint64_t)sz);  // overflow with int, non-reliable...
+        total += sz;
+
+        CV_XADD(&total_allocs, (counter_t)1);
+    }
+    void onFree(size_t sz)
+    {
+#ifdef CV__ALLOCATOR_STATS_LOG
+        CV__ALLOCATOR_STATS_LOG(cv::format("free: %lld (curr=%lld)", (long long int)sz, (long long int)curr));
+#endif
+        CV_XADD(&curr, (counter_t)-sz);
+    }
+#endif
+};
+
+#ifdef CV__ALLOCATOR_STATS_LOG
+} // namespace
+#endif
+
+}} // namespace
+
+#endif // OPENCV_CORE_ALLOCATOR_STATS_IMPL_HPP
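A brief usage sketch (illustration only, not part of the patch): a hypothetical malloc/free wrapper that reports to the controller interface above. It assumes a C++11 build (the std::atomic branch) and that CV__ALLOCATOR_STATS_LOG is not defined; tracked_malloc/tracked_free and the global instance are made-up names.

    #include <cstdlib>
    #include "opencv2/core/utils/allocator_stats.impl.hpp"

    static cv::utils::AllocatorStatistics g_stats;   // hypothetical statistics instance

    void* tracked_malloc(size_t sz)
    {
        void* p = std::malloc(sz);
        if (p)
            g_stats.onAllocate(sz);   // bumps curr/total/total_allocs and the lock-free peak
        return p;
    }

    void tracked_free(void* p, size_t sz)
    {
        if (p)
        {
            g_stats.onFree(sz);       // decrements the current-usage counter
            std::free(p);
        }
    }

    // g_stats.getPeakUsage() / g_stats.resetPeakUsage() can then be polled by monitoring code.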
diff --git a/3rdparty/opencv/include/opencv2/core/utils/instrumentation.hpp b/3rdparty/opencv/include/opencv2/core/utils/instrumentation.hpp
new file mode 100644
index 00000000..36398670
--- /dev/null
+++ b/3rdparty/opencv/include/opencv2/core/utils/instrumentation.hpp
@@ -0,0 +1,125 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_UTILS_INSTR_HPP
+#define OPENCV_UTILS_INSTR_HPP
+
+#include <opencv2/core/utility.hpp>
+#include <opencv2/core/utils/tls.hpp>
+
+namespace cv {
+
+//! @addtogroup core_utils
+//! @{
+
+#ifdef CV_COLLECT_IMPL_DATA
+CV_EXPORTS void setImpl(int flags); // set implementation flags and reset storage arrays
+CV_EXPORTS void addImpl(int flag, const char* func = 0); // add implementation and function name to storage arrays
+// Get stored implementation flags and functions names arrays
+// Each implementation entry correspond to function name entry, so you can find which implementation was executed in which function
+CV_EXPORTS int getImpl(std::vector<int> &impl, std::vector<String> &funName);
+
+CV_EXPORTS bool useCollection(); // return implementation collection state
+CV_EXPORTS void setUseCollection(bool flag); // set implementation collection state
+
+#define CV_IMPL_PLAIN 0x01 // native CPU OpenCV implementation
+#define CV_IMPL_OCL   0x02 // OpenCL implementation
+#define CV_IMPL_IPP   0x04 // IPP implementation
+#define CV_IMPL_MT    0x10 // multithreaded implementation
+
+#undef CV_IMPL_ADD
+#define CV_IMPL_ADD(impl) \
+    if(cv::useCollection()) \
+    { \
+        cv::addImpl(impl, CV_Func); \
+    }
+#endif
+
+// Instrumentation external interface
+namespace instr
+{
+
+#if !defined OPENCV_ABI_CHECK
+
+enum TYPE
+{
+    TYPE_GENERAL = 0,   // OpenCV API function, e.g. exported function
+    TYPE_MARKER,        // Information marker
+    TYPE_WRAPPER,       // Wrapper function for implementation
+    TYPE_FUN,           // Simple function call
+};
+
+enum IMPL
+{
+    IMPL_PLAIN = 0,
+    IMPL_IPP,
+    IMPL_OPENCL,
+};
+
+struct NodeDataTls
+{
+    NodeDataTls()
+    {
+        m_ticksTotal = 0;
+    }
+    uint64 m_ticksTotal;
+};
+
+class CV_EXPORTS NodeData
+{
+public:
+    NodeData(const char* funName = 0, const char* fileName = NULL, int lineNum = 0, void* retAddress = NULL, bool alwaysExpand = false, cv::instr::TYPE instrType = TYPE_GENERAL, cv::instr::IMPL implType = IMPL_PLAIN);
+    NodeData(NodeData &ref);
+    ~NodeData();
+    NodeData& operator=(const NodeData&);
+
+    cv::String          m_funName;
+    cv::instr::TYPE     m_instrType;
+    cv::instr::IMPL     m_implType;
+    const char*         m_fileName;
+    int                 m_lineNum;
+    void*               m_retAddress;
+    bool                m_alwaysExpand;
+    bool                m_funError;
+
+    volatile int        m_counter;
+    volatile uint64     m_ticksTotal;
+    TLSDataAccumulator<NodeDataTls*> m_tls;
+    int                 m_threads;
+
+    // No synchronization
+    double getTotalMs() const { return ((double)m_ticksTotal / cv::getTickFrequency()) * 1000; }
+    double getMeanMs()  const { return (((double)m_ticksTotal/m_counter) / cv::getTickFrequency()) * 1000; }
+};
+bool operator==(const NodeData& lhs, const NodeData& rhs);
+
+typedef Node<NodeData> InstrNode;
+
+CV_EXPORTS InstrNode* getTrace();
+
+#endif // !defined OPENCV_ABI_CHECK
+
+
+CV_EXPORTS bool useInstrumentation();
+CV_EXPORTS void setUseInstrumentation(bool flag);
+CV_EXPORTS void resetTrace();
+
+enum FLAGS
+{
+    FLAGS_NONE              = 0,
+    FLAGS_MAPPING           = 0x01,
+    FLAGS_EXPAND_SAME_NAMES = 0x02,
+};
+
+CV_EXPORTS void setFlags(FLAGS modeFlags);
+static inline void setFlags(int modeFlags) { setFlags((FLAGS)modeFlags); }
+CV_EXPORTS FLAGS getFlags();
+
+} // namespace instr
+
+//! @}
+
+} // namespace
+
+#endif // OPENCV_UTILS_INSTR_HPP
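A short sketch of the external interface above (illustration only, not part of the patch): switching trace collection on for a profiling session. Collected data only appears in OpenCV builds that have instrumentation compiled in; the helper name is made up.

    #include <opencv2/core/utils/instrumentation.hpp>

    // Hypothetical helper: arm instrumentation and drop previously collected nodes.
    static void begin_profiling_region()
    {
        cv::instr::setUseInstrumentation(true);
        cv::instr::setFlags(cv::instr::FLAGS_MAPPING | cv::instr::FLAGS_EXPAND_SAME_NAMES);  // uses the int overload above
        cv::instr::resetTrace();
    }

    // Later, cv::instr::getTrace() returns the root of the collected call tree.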
diff --git a/3rdparty/opencv/include/opencv2/core/utils/tls.hpp b/3rdparty/opencv/include/opencv2/core/utils/tls.hpp
new file mode 100644
index 00000000..c0d2962c
--- /dev/null
+++ b/3rdparty/opencv/include/opencv2/core/utils/tls.hpp
@@ -0,0 +1,239 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_UTILS_TLS_HPP
+#define OPENCV_UTILS_TLS_HPP
+
+#ifndef OPENCV_CORE_UTILITY_H
+#error "tls.hpp must be included after opencv2/core/utility.hpp or opencv2/core.hpp"
+#endif
+
+namespace cv {
+
+//! @addtogroup core_utils
+//! @{
+
+namespace details { class TlsStorage; }
+
+/** TLS container base implementation
+ *
+ * Don't use directly.
+ *
+ * @sa TLSData, TLSDataAccumulator templates
+ */
+class CV_EXPORTS TLSDataContainer
+{
+protected:
+    TLSDataContainer();
+    virtual ~TLSDataContainer();
+
+    /// @deprecated use detachData() instead
+    void gatherData(std::vector<void*> &data) const;
+    /// get TLS data and detach all data from threads (similar to cleanup() call)
+    void detachData(std::vector<void*>& data);
+
+    void* getData() const;
+    void release();
+
+protected:
+    virtual void* createDataInstance() const = 0;
+    virtual void deleteDataInstance(void* pData) const = 0;
+
+#if OPENCV_ABI_COMPATIBILITY > 300
+private:
+#else
+public:
+#endif
+    int key_;
+
+    friend class cv::details::TlsStorage;  // core/src/system.cpp
+
+public:
+    void cleanup();  //!< Release created TLS data container objects. It is similar to release() call, but it keeps TLS container valid.
+
+private:
+    // Disable copy/assign (noncopyable pattern)
+    TLSDataContainer(TLSDataContainer &);
+    TLSDataContainer& operator =(const TLSDataContainer &);
+};
+
+
+/** @brief Simple TLS data class
+ *
+ * @sa TLSDataAccumulator
+ */
+template <typename T>
+class TLSData : protected TLSDataContainer
+{
+public:
+    inline TLSData() {}
+    inline ~TLSData() { release(); }
+
+    inline T* get() const    { return (T*)getData(); }  //!< Get data associated with key
+    inline T& getRef() const { T* ptr = (T*)getData(); CV_DbgAssert(ptr); return *ptr; }  //!< Get data associated with key
+
+    /// Release associated thread data
+    inline void cleanup()
+    {
+        TLSDataContainer::cleanup();
+    }
+
+protected:
+    /// Wrapper to allocate data by template
+    virtual void* createDataInstance() const CV_OVERRIDE { return new T; }
+    /// Wrapper to release data by template
+    virtual void deleteDataInstance(void* pData) const CV_OVERRIDE { delete (T*)pData; }
+};
+
+
+/// TLS data accumulator with gathering methods
+template <typename T>
+class TLSDataAccumulator : public TLSData<T>
+{
+    mutable cv::Mutex mutex;
+    mutable std::vector<void*> dataFromTerminatedThreads;
+    std::vector<T*> detachedData;
+    bool cleanupMode;
+public:
+    TLSDataAccumulator() : cleanupMode(false) {}
+    ~TLSDataAccumulator()
+    {
+        release();
+    }
+
+    /** @brief Get data from all threads
+     * @deprecated replaced by detachData()
+     *
+     * Lifetime of vector data is valid until next detachData()/cleanup()/release() calls
+     *
+     * @param[out] data result buffer (should be empty)
+     */
+    void gather(std::vector<T*> &data) const
+    {
+        CV_Assert(cleanupMode == false);  // state is not valid
+        CV_Assert(data.empty());
+        {
+            std::vector<void*> &dataVoid = reinterpret_cast<std::vector<void*>&>(data);
+            TLSDataContainer::gatherData(dataVoid);
+        }
+        {
+            AutoLock lock(mutex);
+            data.reserve(data.size() + dataFromTerminatedThreads.size());
+            for (typename std::vector<void*>::const_iterator i = dataFromTerminatedThreads.begin(); i != dataFromTerminatedThreads.end(); ++i)
+            {
+                data.push_back((T*)*i);
+            }
+        }
+    }
+
+    /** @brief Get and detach data from all threads
+     *
+     * Call cleanupDetachedData() when returned vector is not needed anymore.
+     *
+     * @return Vector with associated data. Content is preserved (including lifetime of attached data pointers) until next detachData()/cleanupDetachedData()/cleanup()/release() calls
+     */
+    std::vector<T*>& detachData()
+    {
+        CV_Assert(cleanupMode == false);  // state is not valid
+        std::vector<void*> dataVoid;
+        {
+            TLSDataContainer::detachData(dataVoid);
+        }
+        {
+            AutoLock lock(mutex);
+            detachedData.reserve(dataVoid.size() + dataFromTerminatedThreads.size());
+            for (typename std::vector<void*>::const_iterator i = dataFromTerminatedThreads.begin(); i != dataFromTerminatedThreads.end(); ++i)
+            {
+                detachedData.push_back((T*)*i);
+            }
+            dataFromTerminatedThreads.clear();
+            for (typename std::vector<void*>::const_iterator i = dataVoid.begin(); i != dataVoid.end(); ++i)
+            {
+                detachedData.push_back((T*)(void*)*i);
+            }
+        }
+        dataVoid.clear();
+        return detachedData;
+    }
+
+    /// Release associated thread data returned by detachData() call
+    void cleanupDetachedData()
+    {
+        AutoLock lock(mutex);
+        cleanupMode = true;
+        _cleanupDetachedData();
+        cleanupMode = false;
+    }
+
+    /// Release associated thread data
+    void cleanup()
+    {
+        cleanupMode = true;
+        TLSDataContainer::cleanup();
+
+        AutoLock lock(mutex);
+        _cleanupDetachedData();
+        _cleanupTerminatedData();
+        cleanupMode = false;
+    }
+
+    /// Release associated thread data and free TLS key
+    void release()
+    {
+        cleanupMode = true;
+        TLSDataContainer::release();
+        {
+            AutoLock lock(mutex);
+            _cleanupDetachedData();
+            _cleanupTerminatedData();
+        }
+    }
+
+protected:
+    // synchronized
+    void _cleanupDetachedData()
+    {
+        for (typename std::vector<T*>::iterator i = detachedData.begin(); i != detachedData.end(); ++i)
+        {
+            deleteDataInstance((T*)*i);
+        }
+        detachedData.clear();
+    }
+
+    // synchronized
+    void _cleanupTerminatedData()
+    {
+        for (typename std::vector<void*>::iterator i = dataFromTerminatedThreads.begin(); i != dataFromTerminatedThreads.end(); ++i)
+        {
+            deleteDataInstance((T*)*i);
+        }
+        dataFromTerminatedThreads.clear();
+    }
+
+protected:
+    virtual void* createDataInstance() const CV_OVERRIDE
+    {
+        // Note: we can collect all allocated data here, but this would require raced mutex locks
+        return new T;
+    }
+    virtual void deleteDataInstance(void* pData) const CV_OVERRIDE
+    {
+        if (cleanupMode)
+        {
+            delete (T*)pData;
+        }
+        else
+        {
+            AutoLock lock(mutex);
+            dataFromTerminatedThreads.push_back((T*)pData);
+        }
+    }
+};
+
+
+//! @}
+
+} // namespace
+
+#endif // OPENCV_UTILS_TLS_HPP
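A compact usage sketch (illustration only, not part of the patch): per-thread counters accumulated with TLSDataAccumulator across a parallel_for_ run. Counter, RowCounter and the global accumulator are made-up names.

    #include <vector>
    #include <opencv2/core.hpp>
    #include <opencv2/core/utils/tls.hpp>

    struct Counter { int hits; Counter() : hits(0) {} };

    static cv::TLSDataAccumulator<Counter> g_counters;

    class RowCounter : public cv::ParallelLoopBody
    {
    public:
        void operator()(const cv::Range& range) const CV_OVERRIDE
        {
            Counter* c = g_counters.get();        // instance created lazily per calling thread
            c->hits += range.end - range.start;
        }
    };

    int total_hits()
    {
        cv::parallel_for_(cv::Range(0, 1000), RowCounter());
        int total = 0;
        std::vector<Counter*>& parts = g_counters.detachData();   // detach per-thread instances
        for (size_t i = 0; i < parts.size(); i++)
            total += parts[i]->hits;
        g_counters.cleanupDetachedData();                         // release detached instances
        return total;
    }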
diff --git a/3rdparty/opencv/include/opencv2/world.hpp b/3rdparty/opencv/include/opencv2/world.hpp
new file mode 100644
index 00000000..4902c2f2
--- /dev/null
+++ b/3rdparty/opencv/include/opencv2/world.hpp
@@ -0,0 +1,58 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                          License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009-2010, Willow Garage Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef OPENCV_WORLD_HPP
+#define OPENCV_WORLD_HPP
+
+#include "opencv2/core.hpp"
+
+#ifdef __cplusplus
+namespace cv
+{
+
+CV_EXPORTS_W bool initAll();
+
+}
+
+#endif
+
+#endif
diff --git a/3rdparty/opencv/x86/lib/opencv_world3414.lib b/3rdparty/opencv/x86/lib/opencv_world3414.lib
new file mode 100644
index 00000000..34984076
Binary files /dev/null and b/3rdparty/opencv/x86/lib/opencv_world3414.lib differ
diff --git a/3rdparty/opencv/x86/lib/opencv_world3414d.lib b/3rdparty/opencv/x86/lib/opencv_world3414d.lib
new file mode 100644
index 00000000..30403cae
Binary files /dev/null and b/3rdparty/opencv/x86/lib/opencv_world3414d.lib differ
diff --git a/3rdparty/opencv/x86/staticlib/debug/opencv_world3414d.lib b/3rdparty/opencv/x86/staticlib/debug/opencv_world3414d.lib
new file mode 100644
index 00000000..a8679629
Binary files /dev/null and b/3rdparty/opencv/x86/staticlib/debug/opencv_world3414d.lib differ
diff --git a/3rdparty/opencv/x86/staticlib/debug/quircd.lib b/3rdparty/opencv/x86/staticlib/debug/quircd.lib
new file mode 100644
index 00000000..0472624b
Binary files /dev/null and b/3rdparty/opencv/x86/staticlib/debug/quircd.lib differ
diff --git a/3rdparty/opencv/x86/staticlib/release/opencv_world3414.lib b/3rdparty/opencv/x86/staticlib/release/opencv_world3414.lib
new file mode 100644
index 00000000..884436be
Binary files /dev/null and b/3rdparty/opencv/x86/staticlib/release/opencv_world3414.lib differ
diff --git a/3rdparty/opencv/x86/staticlib/release/quirc.lib b/3rdparty/opencv/x86/staticlib/release/quirc.lib
new file mode 100644
index 00000000..a84419c8
Binary files /dev/null and b/3rdparty/opencv/x86/staticlib/release/quirc.lib differ