// intrin.hpp — OpenCV universal SIMD intrinsics dispatch header.
  1. /*M///////////////////////////////////////////////////////////////////////////////////////
  2. //
  3. // IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
  4. //
  5. // By downloading, copying, installing or using the software you agree to this license.
  6. // If you do not agree to this license, do not download, install,
  7. // copy or use the software.
  8. //
  9. //
  10. // License Agreement
  11. // For Open Source Computer Vision Library
  12. //
  13. // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
  14. // Copyright (C) 2009, Willow Garage Inc., all rights reserved.
  15. // Copyright (C) 2013, OpenCV Foundation, all rights reserved.
  16. // Copyright (C) 2015, Itseez Inc., all rights reserved.
  17. // Third party copyrights are property of their respective owners.
  18. //
  19. // Redistribution and use in source and binary forms, with or without modification,
  20. // are permitted provided that the following conditions are met:
  21. //
  22. // * Redistribution's of source code must retain the above copyright notice,
  23. // this list of conditions and the following disclaimer.
  24. //
  25. // * Redistribution's in binary form must reproduce the above copyright notice,
  26. // this list of conditions and the following disclaimer in the documentation
  27. // and/or other materials provided with the distribution.
  28. //
  29. // * The name of the copyright holders may not be used to endorse or promote products
  30. // derived from this software without specific prior written permission.
  31. //
  32. // This software is provided by the copyright holders and contributors "as is" and
  33. // any express or implied warranties, including, but not limited to, the implied
  34. // warranties of merchantability and fitness for a particular purpose are disclaimed.
  35. // In no event shall the Intel Corporation or contributors be liable for any direct,
  36. // indirect, incidental, special, exemplary, or consequential damages
  37. // (including, but not limited to, procurement of substitute goods or services;
  38. // loss of use, data, or profits; or business interruption) however caused
  39. // and on any theory of liability, whether in contract, strict liability,
  40. // or tort (including negligence or otherwise) arising in any way out of
  41. // the use of this software, even if advised of the possibility of such damage.
  42. //
  43. //M*/
  44. #ifndef OPENCV_HAL_INTRIN_HPP
  45. #define OPENCV_HAL_INTRIN_HPP
  46. #include <cmath>
  47. #include <float.h>
  48. #include <stdlib.h>
  49. #include "opencv2/core/cvdef.h"
  50. #if defined(__GNUC__) && __GNUC__ == 12
  51. #pragma GCC diagnostic push
  52. #pragma GCC diagnostic ignored "-Wuninitialized"
  53. #pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
  54. #endif
  55. #define OPENCV_HAL_ADD(a, b) ((a) + (b))
  56. #define OPENCV_HAL_AND(a, b) ((a) & (b))
  57. #define OPENCV_HAL_NOP(a) (a)
  58. #define OPENCV_HAL_1ST(a, b) (a)
  59. namespace {
  60. inline unsigned int trailingZeros32(unsigned int value) {
  61. #if defined(_MSC_VER)
  62. #if (_MSC_VER < 1700) || defined(_M_ARM) || defined(_M_ARM64) || defined(_M_ARM64EC)
  63. unsigned long index = 0;
  64. _BitScanForward(&index, value);
  65. return (unsigned int)index;
  66. #elif defined(__clang__)
  67. // clang-cl doesn't export _tzcnt_u32 for non BMI systems
  68. return value ? __builtin_ctz(value) : 32;
  69. #else
  70. return _tzcnt_u32(value);
  71. #endif
  72. #elif defined(__GNUC__) || defined(__GNUG__)
  73. return __builtin_ctz(value);
  74. #elif defined(__ICC) || defined(__INTEL_COMPILER)
  75. return _bit_scan_forward(value);
  76. #elif defined(__clang__)
  77. return llvm.cttz.i32(value, true);
  78. #else
  79. static const int MultiplyDeBruijnBitPosition[32] = {
  80. 0, 1, 28, 2, 29, 14, 24, 3, 30, 22, 20, 15, 25, 17, 4, 8,
  81. 31, 27, 13, 23, 21, 19, 16, 7, 26, 12, 18, 6, 11, 5, 10, 9 };
  82. return MultiplyDeBruijnBitPosition[((uint32_t)((value & -value) * 0x077CB531U)) >> 27];
  83. #endif
  84. }
  85. }
  86. // unlike HAL API, which is in cv::hal,
  87. // we put intrinsics into cv namespace to make its
  88. // access from within opencv code more accessible
  89. namespace cv {
  90. namespace hal {
  91. enum StoreMode
  92. {
  93. STORE_UNALIGNED = 0,
  94. STORE_ALIGNED = 1,
  95. STORE_ALIGNED_NOCACHE = 2
  96. };
  97. }
  98. // TODO FIXIT: Don't use "God" traits. Split on separate cases.
  99. template<typename _Tp> struct V_TypeTraits
  100. {
  101. };
// Generates a V_TypeTraits<type> specialization exposing companion types:
//   int_type / uint_type : same-width signed / unsigned integer types
//   abs_type             : type of |x| for this lane type
//   w_type               : double-width type (for widening operations)
//   q_type               : quad-width type
//   sum_type             : accumulator type for horizontal sums
// plus bit-exact conversions between `type` and its same-width integer type
// implemented via a union (reinterpret_int / reinterpret_from_int).
// NOTE: no comments inside the macro body — a `//` comment before a `\`
// line-continuation would swallow the following line.
#define CV_INTRIN_DEF_TYPE_TRAITS(type, int_type_, uint_type_, abs_type_, w_type_, q_type_, sum_type_) \
template<> struct V_TypeTraits<type> \
{ \
typedef type value_type; \
typedef int_type_ int_type; \
typedef abs_type_ abs_type; \
typedef uint_type_ uint_type; \
typedef w_type_ w_type; \
typedef q_type_ q_type; \
typedef sum_type_ sum_type; \
\
static inline int_type reinterpret_int(type x) \
{ \
union { type l; int_type i; } v; \
v.l = x; \
return v.i; \
} \
\
static inline type reinterpret_from_int(int_type x) \
{ \
union { type l; int_type i; } v; \
v.i = x; \
return v.l; \
} \
}
// Same as CV_INTRIN_DEF_TYPE_TRAITS but omits q_type, for lane types that
// have no quad-width counterpart (32/64-bit integers, float, double).
#define CV_INTRIN_DEF_TYPE_TRAITS_NO_Q_TYPE(type, int_type_, uint_type_, abs_type_, w_type_, sum_type_) \
template<> struct V_TypeTraits<type> \
{ \
typedef type value_type; \
typedef int_type_ int_type; \
typedef abs_type_ abs_type; \
typedef uint_type_ uint_type; \
typedef w_type_ w_type; \
typedef sum_type_ sum_type; \
\
static inline int_type reinterpret_int(type x) \
{ \
union { type l; int_type i; } v; \
v.l = x; \
return v.i; \
} \
\
static inline type reinterpret_from_int(int_type x) \
{ \
union { type l; int_type i; } v; \
v.i = x; \
return v.l; \
} \
}
// V_TypeTraits specializations for every primitive lane type.
// Column order: (type, int_type, uint_type, abs_type, w_type[, q_type], sum_type).
// `void` w_type marks 64-bit and double types that have no wider counterpart.
CV_INTRIN_DEF_TYPE_TRAITS(uchar, schar, uchar, uchar, ushort, unsigned, unsigned);
CV_INTRIN_DEF_TYPE_TRAITS(schar, schar, uchar, uchar, short, int, int);
CV_INTRIN_DEF_TYPE_TRAITS(ushort, short, ushort, ushort, unsigned, uint64, unsigned);
CV_INTRIN_DEF_TYPE_TRAITS(short, short, ushort, ushort, int, int64, int);
CV_INTRIN_DEF_TYPE_TRAITS_NO_Q_TYPE(unsigned, int, unsigned, unsigned, uint64, unsigned);
CV_INTRIN_DEF_TYPE_TRAITS_NO_Q_TYPE(int, int, unsigned, unsigned, int64, int);
CV_INTRIN_DEF_TYPE_TRAITS_NO_Q_TYPE(float, int, unsigned, float, double, float);
CV_INTRIN_DEF_TYPE_TRAITS_NO_Q_TYPE(uint64, int64, uint64, uint64, void, uint64);
CV_INTRIN_DEF_TYPE_TRAITS_NO_Q_TYPE(int64, int64, uint64, uint64, void, int64);
CV_INTRIN_DEF_TYPE_TRAITS_NO_Q_TYPE(double, int64, uint64, double, void, double);
  161. #ifndef CV_DOXYGEN
  162. #ifndef CV_CPU_OPTIMIZATION_HAL_NAMESPACE
  163. #ifdef CV_FORCE_SIMD128_CPP
  164. #define CV_CPU_OPTIMIZATION_HAL_NAMESPACE hal_EMULATOR_CPP
  165. #define CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN namespace hal_EMULATOR_CPP {
  166. #define CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END }
  167. #elif defined(CV_CPU_DISPATCH_MODE)
  168. #define CV_CPU_OPTIMIZATION_HAL_NAMESPACE __CV_CAT(hal_, CV_CPU_DISPATCH_MODE)
  169. #define CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN namespace __CV_CAT(hal_, CV_CPU_DISPATCH_MODE) {
  170. #define CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END }
  171. #else
  172. #define CV_CPU_OPTIMIZATION_HAL_NAMESPACE hal_baseline
  173. #define CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN namespace hal_baseline {
  174. #define CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END }
  175. #endif
  176. #endif // CV_CPU_OPTIMIZATION_HAL_NAMESPACE
  177. CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN
// Forward declarations only: each SIMD backend header included below provides
// the definitions. v_setzero_<V>() creates a zeroed vector; v_setall_<V>(x)
// broadcasts a scalar, with one overload per lane type.
template <typename _VecTp> inline _VecTp v_setzero_();
template <typename _VecTp> inline _VecTp v_setall_(uchar);
template <typename _VecTp> inline _VecTp v_setall_(schar);
template <typename _VecTp> inline _VecTp v_setall_(ushort);
template <typename _VecTp> inline _VecTp v_setall_(short);
template <typename _VecTp> inline _VecTp v_setall_(unsigned);
template <typename _VecTp> inline _VecTp v_setall_(int);
template <typename _VecTp> inline _VecTp v_setall_(uint64);
template <typename _VecTp> inline _VecTp v_setall_(int64);
template <typename _VecTp> inline _VecTp v_setall_(float);
template <typename _VecTp> inline _VecTp v_setall_(double);
  189. CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END
  190. using namespace CV_CPU_OPTIMIZATION_HAL_NAMESPACE;
  191. #endif
  192. }
  193. #ifdef CV_DOXYGEN
  194. # undef CV_AVX2
  195. # undef CV_SSE2
  196. # undef CV_NEON
  197. # undef CV_VSX
  198. # undef CV_FP16
  199. # undef CV_MSA
  200. # undef CV_RVV
  201. #endif
  202. #if (CV_SSE2 || CV_NEON || CV_VSX || CV_MSA || CV_WASM_SIMD || CV_RVV071 || CV_LSX) && !defined(CV_FORCE_SIMD128_CPP)
  203. #define CV__SIMD_FORWARD 128
  204. #include "opencv2/core/hal/intrin_forward.hpp"
  205. #endif
  206. #if CV_SSE2 && !defined(CV_FORCE_SIMD128_CPP)
  207. #include "opencv2/core/hal/intrin_sse_em.hpp"
  208. #include "opencv2/core/hal/intrin_sse.hpp"
  209. #elif CV_NEON && !defined(CV_FORCE_SIMD128_CPP)
  210. #include "opencv2/core/hal/intrin_neon.hpp"
  211. #elif CV_RVV071 && !defined(CV_FORCE_SIMD128_CPP)
  212. #define CV_SIMD128_CPP 0
  213. #include "opencv2/core/hal/intrin_rvv071.hpp"
  214. #elif CV_VSX && !defined(CV_FORCE_SIMD128_CPP)
  215. #include "opencv2/core/hal/intrin_vsx.hpp"
  216. #elif CV_MSA && !defined(CV_FORCE_SIMD128_CPP)
  217. #include "opencv2/core/hal/intrin_msa.hpp"
  218. #elif CV_WASM_SIMD && !defined(CV_FORCE_SIMD128_CPP)
  219. #include "opencv2/core/hal/intrin_wasm.hpp"
  220. #elif CV_RVV && !defined(CV_FORCE_SIMD128_CPP)
  221. #include "opencv2/core/hal/intrin_rvv_scalable.hpp"
  222. #elif CV_LSX && !defined(CV_FORCE_SIMD128_CPP)
  223. #include "opencv2/core/hal/intrin_lsx.hpp"
  224. #else
  225. #include "opencv2/core/hal/intrin_cpp.hpp"
  226. #endif
  227. // AVX2 can be used together with SSE2, so
  228. // we define those two sets of intrinsics at once.
  229. // Most of the intrinsics do not conflict (the proper overloaded variant is
  230. // resolved by the argument types, e.g. v_float32x4 ~ SSE2, v_float32x8 ~ AVX2),
  231. // but some of AVX2 intrinsics get v256_ prefix instead of v_, e.g. v256_load() vs v_load().
  232. // Correspondingly, the wide intrinsics (which are mapped to the "widest"
  233. // available instruction set) will get vx_ prefix
  234. // (and will be mapped to v256_ counterparts) (e.g. vx_load() => v256_load())
  235. #if CV_AVX2
  236. #define CV__SIMD_FORWARD 256
  237. #include "opencv2/core/hal/intrin_forward.hpp"
  238. #include "opencv2/core/hal/intrin_avx.hpp"
  239. #endif
  240. // AVX512 can be used together with SSE2 and AVX2, so
  241. // we define those sets of intrinsics at once.
  242. // For some of AVX512 intrinsics get v512_ prefix instead of v_, e.g. v512_load() vs v_load().
  243. // Wide intrinsics will be mapped to v512_ counterparts in this case(e.g. vx_load() => v512_load())
  244. #if CV_AVX512_SKX
  245. #define CV__SIMD_FORWARD 512
  246. #include "opencv2/core/hal/intrin_forward.hpp"
  247. #include "opencv2/core/hal/intrin_avx512.hpp"
  248. #endif
  249. #if CV_LASX
  250. #define CV__SIMD_FORWARD 256
  251. #include "opencv2/core/hal/intrin_forward.hpp"
  252. #include "opencv2/core/hal/intrin_lasx.hpp"
  253. #endif
  254. //! @cond IGNORED
  255. namespace cv {
  256. #ifndef CV_DOXYGEN
  257. CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN
  258. #endif
  259. #ifndef CV_SIMD128
  260. #define CV_SIMD128 0
  261. #endif
  262. #ifndef CV_SIMD128_CPP
  263. #define CV_SIMD128_CPP 0
  264. #endif
  265. #ifndef CV_SIMD128_64F
  266. #define CV_SIMD128_64F 0
  267. #endif
  268. #ifndef CV_SIMD256
  269. #define CV_SIMD256 0
  270. #endif
  271. #ifndef CV_SIMD256_64F
  272. #define CV_SIMD256_64F 0
  273. #endif
  274. #ifndef CV_SIMD512
  275. #define CV_SIMD512 0
  276. #endif
  277. #ifndef CV_SIMD512_64F
  278. #define CV_SIMD512_64F 0
  279. #endif
  280. #ifndef CV_SIMD128_FP16
  281. #define CV_SIMD128_FP16 0
  282. #endif
  283. #ifndef CV_SIMD256_FP16
  284. #define CV_SIMD256_FP16 0
  285. #endif
  286. #ifndef CV_SIMD512_FP16
  287. #define CV_SIMD512_FP16 0
  288. #endif
  289. #ifndef CV_SIMD_SCALABLE
  290. #define CV_SIMD_SCALABLE 0
  291. #endif
  292. #ifndef CV_SIMD_SCALABLE_64F
  293. #define CV_SIMD_SCALABLE_64F 0
  294. #endif
  295. //==================================================================================================
  296. template<typename _Tp> struct V_RegTraits
  297. {
  298. };
// Generates a V_RegTraits<_reg> specialization mapping a register type to
// its related register types:
//   u_reg     : unsigned counterpart (same lane width)
//   w_reg     : double-lane-width register (widening ops)
//   q_reg     : quad-lane-width register
//   int_reg   : same-width signed-integer register
//   round_reg : register type produced by rounding conversions
// `void` is passed where no such counterpart exists. The `prefix`, lane_type
// and suffix parameters are unused here but kept for table readability.
#define CV_DEF_REG_TRAITS(prefix, _reg, lane_type, suffix, _u_reg, _w_reg, _q_reg, _int_reg, _round_reg) \
template<> struct V_RegTraits<_reg> \
{ \
typedef _reg reg; \
typedef _u_reg u_reg; \
typedef _w_reg w_reg; \
typedef _q_reg q_reg; \
typedef _int_reg int_reg; \
typedef _round_reg round_reg; \
}
// Register-trait tables, one section per enabled SIMD width.
// 128-bit registers (also used by the C++ emulation backend).
#if CV_SIMD128 || CV_SIMD128_CPP
CV_DEF_REG_TRAITS(v, v_uint8x16, uchar, u8, v_uint8x16, v_uint16x8, v_uint32x4, v_int8x16, void);
CV_DEF_REG_TRAITS(v, v_int8x16, schar, s8, v_uint8x16, v_int16x8, v_int32x4, v_int8x16, void);
CV_DEF_REG_TRAITS(v, v_uint16x8, ushort, u16, v_uint16x8, v_uint32x4, v_uint64x2, v_int16x8, void);
CV_DEF_REG_TRAITS(v, v_int16x8, short, s16, v_uint16x8, v_int32x4, v_int64x2, v_int16x8, void);
CV_DEF_REG_TRAITS(v, v_uint32x4, unsigned, u32, v_uint32x4, v_uint64x2, void, v_int32x4, void);
CV_DEF_REG_TRAITS(v, v_int32x4, int, s32, v_uint32x4, v_int64x2, void, v_int32x4, void);
// float32 only gets a w_reg (v_float64x2) when 64-bit float support exists.
#if CV_SIMD128_64F || CV_SIMD128_CPP
CV_DEF_REG_TRAITS(v, v_float32x4, float, f32, v_float32x4, v_float64x2, void, v_int32x4, v_int32x4);
#else
CV_DEF_REG_TRAITS(v, v_float32x4, float, f32, v_float32x4, void, void, v_int32x4, v_int32x4);
#endif
CV_DEF_REG_TRAITS(v, v_uint64x2, uint64, u64, v_uint64x2, void, void, v_int64x2, void);
CV_DEF_REG_TRAITS(v, v_int64x2, int64, s64, v_uint64x2, void, void, v_int64x2, void);
#if CV_SIMD128_64F
CV_DEF_REG_TRAITS(v, v_float64x2, double, f64, v_float64x2, void, void, v_int64x2, v_int32x4);
#endif
#endif
// 256-bit registers (AVX2 / LASX).
#if CV_SIMD256
CV_DEF_REG_TRAITS(v256, v_uint8x32, uchar, u8, v_uint8x32, v_uint16x16, v_uint32x8, v_int8x32, void);
CV_DEF_REG_TRAITS(v256, v_int8x32, schar, s8, v_uint8x32, v_int16x16, v_int32x8, v_int8x32, void);
CV_DEF_REG_TRAITS(v256, v_uint16x16, ushort, u16, v_uint16x16, v_uint32x8, v_uint64x4, v_int16x16, void);
CV_DEF_REG_TRAITS(v256, v_int16x16, short, s16, v_uint16x16, v_int32x8, v_int64x4, v_int16x16, void);
CV_DEF_REG_TRAITS(v256, v_uint32x8, unsigned, u32, v_uint32x8, v_uint64x4, void, v_int32x8, void);
CV_DEF_REG_TRAITS(v256, v_int32x8, int, s32, v_uint32x8, v_int64x4, void, v_int32x8, void);
CV_DEF_REG_TRAITS(v256, v_float32x8, float, f32, v_float32x8, v_float64x4, void, v_int32x8, v_int32x8);
CV_DEF_REG_TRAITS(v256, v_uint64x4, uint64, u64, v_uint64x4, void, void, v_int64x4, void);
CV_DEF_REG_TRAITS(v256, v_int64x4, int64, s64, v_uint64x4, void, void, v_int64x4, void);
CV_DEF_REG_TRAITS(v256, v_float64x4, double, f64, v_float64x4, void, void, v_int64x4, v_int32x8);
#endif
// 512-bit registers (AVX-512).
#if CV_SIMD512
CV_DEF_REG_TRAITS(v512, v_uint8x64, uchar, u8, v_uint8x64, v_uint16x32, v_uint32x16, v_int8x64, void);
CV_DEF_REG_TRAITS(v512, v_int8x64, schar, s8, v_uint8x64, v_int16x32, v_int32x16, v_int8x64, void);
CV_DEF_REG_TRAITS(v512, v_uint16x32, ushort, u16, v_uint16x32, v_uint32x16, v_uint64x8, v_int16x32, void);
CV_DEF_REG_TRAITS(v512, v_int16x32, short, s16, v_uint16x32, v_int32x16, v_int64x8, v_int16x32, void);
CV_DEF_REG_TRAITS(v512, v_uint32x16, unsigned, u32, v_uint32x16, v_uint64x8, void, v_int32x16, void);
CV_DEF_REG_TRAITS(v512, v_int32x16, int, s32, v_uint32x16, v_int64x8, void, v_int32x16, void);
CV_DEF_REG_TRAITS(v512, v_float32x16, float, f32, v_float32x16, v_float64x8, void, v_int32x16, v_int32x16);
CV_DEF_REG_TRAITS(v512, v_uint64x8, uint64, u64, v_uint64x8, void, void, v_int64x8, void);
CV_DEF_REG_TRAITS(v512, v_int64x8, int64, s64, v_uint64x8, void, void, v_int64x8, void);
CV_DEF_REG_TRAITS(v512, v_float64x8, double, f64, v_float64x8, void, void, v_int64x8, v_int32x16);
#endif
// Scalable (length-agnostic) registers, e.g. RISC-V RVV: no lane count in the name.
#if CV_SIMD_SCALABLE
CV_DEF_REG_TRAITS(v, v_uint8, uchar, u8, v_uint8, v_uint16, v_uint32, v_int8, void);
CV_DEF_REG_TRAITS(v, v_int8, schar, s8, v_uint8, v_int16, v_int32, v_int8, void);
CV_DEF_REG_TRAITS(v, v_uint16, ushort, u16, v_uint16, v_uint32, v_uint64, v_int16, void);
CV_DEF_REG_TRAITS(v, v_int16, short, s16, v_uint16, v_int32, v_int64, v_int16, void);
CV_DEF_REG_TRAITS(v, v_uint32, unsigned, u32, v_uint32, v_uint64, void, v_int32, void);
CV_DEF_REG_TRAITS(v, v_int32, int, s32, v_uint32, v_int64, void, v_int32, void);
CV_DEF_REG_TRAITS(v, v_float32, float, f32, v_float32, v_float64, void, v_int32, v_int32);
CV_DEF_REG_TRAITS(v, v_uint64, uint64, u64, v_uint64, void, void, v_int64, void);
CV_DEF_REG_TRAITS(v, v_int64, int64, s64, v_uint64, void, void, v_int64, void);
CV_DEF_REG_TRAITS(v, v_float64, double, f64, v_float64, void, void, v_int64, v_int32);
#endif
  363. //! @endcond
  364. #if CV_SIMD512 && (!defined(CV__SIMD_FORCE_WIDTH) || CV__SIMD_FORCE_WIDTH == 512)
  365. #define CV__SIMD_NAMESPACE simd512
  366. namespace CV__SIMD_NAMESPACE {
  367. #define CV_SIMD 1
  368. #define CV_SIMD_64F CV_SIMD512_64F
  369. #define CV_SIMD_FP16 CV_SIMD512_FP16
  370. #define CV_SIMD_WIDTH 64
  371. //! @addtogroup core_hal_intrin
  372. //! @{
  373. //! @brief Maximum available vector register capacity 8-bit unsigned integer values
  374. typedef v_uint8x64 v_uint8;
  375. //! @brief Maximum available vector register capacity 8-bit signed integer values
  376. typedef v_int8x64 v_int8;
  377. //! @brief Maximum available vector register capacity 16-bit unsigned integer values
  378. typedef v_uint16x32 v_uint16;
  379. //! @brief Maximum available vector register capacity 16-bit signed integer values
  380. typedef v_int16x32 v_int16;
  381. //! @brief Maximum available vector register capacity 32-bit unsigned integer values
  382. typedef v_uint32x16 v_uint32;
  383. //! @brief Maximum available vector register capacity 32-bit signed integer values
  384. typedef v_int32x16 v_int32;
  385. //! @brief Maximum available vector register capacity 64-bit unsigned integer values
  386. typedef v_uint64x8 v_uint64;
  387. //! @brief Maximum available vector register capacity 64-bit signed integer values
  388. typedef v_int64x8 v_int64;
  389. //! @brief Maximum available vector register capacity 32-bit floating point values (single precision)
  390. typedef v_float32x16 v_float32;
  391. #if CV_SIMD512_64F
  392. //! @brief Maximum available vector register capacity 64-bit floating point values (double precision)
  393. typedef v_float64x8 v_float64;
  394. #endif
  395. //! @}
  396. #define VXPREFIX(func) v512##func
  397. } // namespace
  398. using namespace CV__SIMD_NAMESPACE;
  399. #elif CV_SIMD256 && (!defined(CV__SIMD_FORCE_WIDTH) || CV__SIMD_FORCE_WIDTH == 256)
  400. #define CV__SIMD_NAMESPACE simd256
  401. namespace CV__SIMD_NAMESPACE {
  402. #define CV_SIMD 1
  403. #define CV_SIMD_64F CV_SIMD256_64F
  404. #define CV_SIMD_FP16 CV_SIMD256_FP16
  405. #define CV_SIMD_WIDTH 32
  406. //! @addtogroup core_hal_intrin
  407. //! @{
  408. //! @brief Maximum available vector register capacity 8-bit unsigned integer values
  409. typedef v_uint8x32 v_uint8;
  410. //! @brief Maximum available vector register capacity 8-bit signed integer values
  411. typedef v_int8x32 v_int8;
  412. //! @brief Maximum available vector register capacity 16-bit unsigned integer values
  413. typedef v_uint16x16 v_uint16;
  414. //! @brief Maximum available vector register capacity 16-bit signed integer values
  415. typedef v_int16x16 v_int16;
  416. //! @brief Maximum available vector register capacity 32-bit unsigned integer values
  417. typedef v_uint32x8 v_uint32;
  418. //! @brief Maximum available vector register capacity 32-bit signed integer values
  419. typedef v_int32x8 v_int32;
  420. //! @brief Maximum available vector register capacity 64-bit unsigned integer values
  421. typedef v_uint64x4 v_uint64;
  422. //! @brief Maximum available vector register capacity 64-bit signed integer values
  423. typedef v_int64x4 v_int64;
  424. //! @brief Maximum available vector register capacity 32-bit floating point values (single precision)
  425. typedef v_float32x8 v_float32;
  426. #if CV_SIMD256_64F
  427. //! @brief Maximum available vector register capacity 64-bit floating point values (double precision)
  428. typedef v_float64x4 v_float64;
  429. #endif
  430. //! @}
  431. #define VXPREFIX(func) v256##func
  432. } // namespace
  433. using namespace CV__SIMD_NAMESPACE;
  434. #elif (CV_SIMD128 || CV_SIMD128_CPP) && (!defined(CV__SIMD_FORCE_WIDTH) || CV__SIMD_FORCE_WIDTH == 128)
  435. #if defined CV_SIMD128_CPP
  436. #define CV__SIMD_NAMESPACE simd128_cpp
  437. #else
  438. #define CV__SIMD_NAMESPACE simd128
  439. #endif
  440. namespace CV__SIMD_NAMESPACE {
  441. #define CV_SIMD CV_SIMD128
  442. #define CV_SIMD_64F CV_SIMD128_64F
  443. #define CV_SIMD_WIDTH 16
  444. //! @addtogroup core_hal_intrin
  445. //! @{
  446. //! @brief Maximum available vector register capacity 8-bit unsigned integer values
  447. typedef v_uint8x16 v_uint8;
  448. //! @brief Maximum available vector register capacity 8-bit signed integer values
  449. typedef v_int8x16 v_int8;
  450. //! @brief Maximum available vector register capacity 16-bit unsigned integer values
  451. typedef v_uint16x8 v_uint16;
  452. //! @brief Maximum available vector register capacity 16-bit signed integer values
  453. typedef v_int16x8 v_int16;
  454. //! @brief Maximum available vector register capacity 32-bit unsigned integer values
  455. typedef v_uint32x4 v_uint32;
  456. //! @brief Maximum available vector register capacity 32-bit signed integer values
  457. typedef v_int32x4 v_int32;
  458. //! @brief Maximum available vector register capacity 64-bit unsigned integer values
  459. typedef v_uint64x2 v_uint64;
  460. //! @brief Maximum available vector register capacity 64-bit signed integer values
  461. typedef v_int64x2 v_int64;
  462. //! @brief Maximum available vector register capacity 32-bit floating point values (single precision)
  463. typedef v_float32x4 v_float32;
  464. #if CV_SIMD128_64F
  465. //! @brief Maximum available vector register capacity 64-bit floating point values (double precision)
  466. typedef v_float64x2 v_float64;
  467. #endif
  468. //! @}
  469. #define VXPREFIX(func) v##func
  470. } // namespace
  471. using namespace CV__SIMD_NAMESPACE;
  472. #elif CV_SIMD_SCALABLE
  473. #define CV__SIMD_NAMESPACE simd
  474. namespace CV__SIMD_NAMESPACE {
  475. #define CV_SIMD 0
  476. #define CV_SIMD_WIDTH 128 /* 1024/8 */
  477. #define VXPREFIX(func) v##func
  478. } // namespace
  479. using namespace CV__SIMD_NAMESPACE;
  480. #endif
  481. //! @cond IGNORED
  482. #ifndef CV_SIMD_64F
  483. #define CV_SIMD_64F 0
  484. #endif
  485. namespace CV__SIMD_NAMESPACE {
  486. //! @addtogroup core_hal_intrin
  487. //! @{
  488. //! @name Wide init with value
  489. //! @{
  490. //! @brief Create maximum available capacity vector with elements set to a specific value
// Thin wrappers over the widest enabled backend: VXPREFIX expands to
// v / v256 / v512 so vx_setall_* dispatches to v_setall_* / v256_setall_* /
// v512_setall_* accordingly.
inline v_uint8 vx_setall_u8(uchar v) { return VXPREFIX(_setall_u8)(v); }
inline v_int8 vx_setall_s8(schar v) { return VXPREFIX(_setall_s8)(v); }
inline v_uint16 vx_setall_u16(ushort v) { return VXPREFIX(_setall_u16)(v); }
inline v_int16 vx_setall_s16(short v) { return VXPREFIX(_setall_s16)(v); }
inline v_int32 vx_setall_s32(int v) { return VXPREFIX(_setall_s32)(v); }
inline v_uint32 vx_setall_u32(unsigned v) { return VXPREFIX(_setall_u32)(v); }
inline v_float32 vx_setall_f32(float v) { return VXPREFIX(_setall_f32)(v); }
inline v_int64 vx_setall_s64(int64 v) { return VXPREFIX(_setall_s64)(v); }
inline v_uint64 vx_setall_u64(uint64 v) { return VXPREFIX(_setall_u64)(v); }
// double lanes only exist when a 64-bit-float-capable backend is enabled
#if CV_SIMD_64F || CV_SIMD_SCALABLE_64F
inline v_float64 vx_setall_f64(double v) { return VXPREFIX(_setall_f64)(v); }
#endif
  503. //! @}
  504. //! @name Wide init with zero
  505. //! @{
  506. //! @brief Create maximum available capacity vector with elements set to zero
// Zero-initialized widest-register constructors; VXPREFIX selects the
// v_/v256_/v512_ backend implementation.
inline v_uint8 vx_setzero_u8() { return VXPREFIX(_setzero_u8)(); }
inline v_int8 vx_setzero_s8() { return VXPREFIX(_setzero_s8)(); }
inline v_uint16 vx_setzero_u16() { return VXPREFIX(_setzero_u16)(); }
inline v_int16 vx_setzero_s16() { return VXPREFIX(_setzero_s16)(); }
inline v_int32 vx_setzero_s32() { return VXPREFIX(_setzero_s32)(); }
inline v_uint32 vx_setzero_u32() { return VXPREFIX(_setzero_u32)(); }
inline v_float32 vx_setzero_f32() { return VXPREFIX(_setzero_f32)(); }
inline v_int64 vx_setzero_s64() { return VXPREFIX(_setzero_s64)(); }
inline v_uint64 vx_setzero_u64() { return VXPREFIX(_setzero_u64)(); }
// double lanes only exist when a 64-bit-float-capable backend is enabled
#if CV_SIMD_64F || CV_SIMD_SCALABLE_64F
inline v_float64 vx_setzero_f64() { return VXPREFIX(_setzero_f64)(); }
#endif
  519. //! @}
  520. //! @name Wide load from memory
  521. //! @{
  522. //! @brief Load maximum available capacity register contents from memory
// Unaligned loads into the widest enabled register; overload resolution on
// the pointer's lane type picks the result vector type.
inline v_uint8 vx_load(const uchar * ptr) { return VXPREFIX(_load)(ptr); }
inline v_int8 vx_load(const schar * ptr) { return VXPREFIX(_load)(ptr); }
inline v_uint16 vx_load(const ushort * ptr) { return VXPREFIX(_load)(ptr); }
inline v_int16 vx_load(const short * ptr) { return VXPREFIX(_load)(ptr); }
inline v_int32 vx_load(const int * ptr) { return VXPREFIX(_load)(ptr); }
inline v_uint32 vx_load(const unsigned * ptr) { return VXPREFIX(_load)(ptr); }
inline v_float32 vx_load(const float * ptr) { return VXPREFIX(_load)(ptr); }
inline v_int64 vx_load(const int64 * ptr) { return VXPREFIX(_load)(ptr); }
inline v_uint64 vx_load(const uint64 * ptr) { return VXPREFIX(_load)(ptr); }
// double lanes only exist when a 64-bit-float-capable backend is enabled
#if CV_SIMD_64F || CV_SIMD_SCALABLE_64F
inline v_float64 vx_load(const double * ptr) { return VXPREFIX(_load)(ptr); }
#endif
  535. //! @}
  536. //! @name Wide load from memory(aligned)
  537. //! @{
  538. //! @brief Load maximum available capacity register contents from memory(aligned)
// Aligned loads: `ptr` must satisfy the backend's alignment requirement
// (presumably CV_SIMD_WIDTH bytes — defined per backend, confirm there).
inline v_uint8 vx_load_aligned(const uchar * ptr) { return VXPREFIX(_load_aligned)(ptr); }
inline v_int8 vx_load_aligned(const schar * ptr) { return VXPREFIX(_load_aligned)(ptr); }
inline v_uint16 vx_load_aligned(const ushort * ptr) { return VXPREFIX(_load_aligned)(ptr); }
inline v_int16 vx_load_aligned(const short * ptr) { return VXPREFIX(_load_aligned)(ptr); }
inline v_int32 vx_load_aligned(const int * ptr) { return VXPREFIX(_load_aligned)(ptr); }
inline v_uint32 vx_load_aligned(const unsigned * ptr) { return VXPREFIX(_load_aligned)(ptr); }
inline v_float32 vx_load_aligned(const float * ptr) { return VXPREFIX(_load_aligned)(ptr); }
inline v_int64 vx_load_aligned(const int64 * ptr) { return VXPREFIX(_load_aligned)(ptr); }
inline v_uint64 vx_load_aligned(const uint64 * ptr) { return VXPREFIX(_load_aligned)(ptr); }
// double lanes only exist when a 64-bit-float-capable backend is enabled
#if CV_SIMD_64F || CV_SIMD_SCALABLE_64F
inline v_float64 vx_load_aligned(const double * ptr) { return VXPREFIX(_load_aligned)(ptr); }
#endif
  551. //! @}
  552. //! @name Wide load lower half from memory
  553. //! @{
  554. //! @brief Load lower half of maximum available capacity register from memory
// Loads only the lower half of the widest register from memory (see the
// backend's v_load_low for the contents of the upper half).
inline v_uint8 vx_load_low(const uchar * ptr) { return VXPREFIX(_load_low)(ptr); }
inline v_int8 vx_load_low(const schar * ptr) { return VXPREFIX(_load_low)(ptr); }
inline v_uint16 vx_load_low(const ushort * ptr) { return VXPREFIX(_load_low)(ptr); }
inline v_int16 vx_load_low(const short * ptr) { return VXPREFIX(_load_low)(ptr); }
inline v_int32 vx_load_low(const int * ptr) { return VXPREFIX(_load_low)(ptr); }
inline v_uint32 vx_load_low(const unsigned * ptr) { return VXPREFIX(_load_low)(ptr); }
inline v_float32 vx_load_low(const float * ptr) { return VXPREFIX(_load_low)(ptr); }
inline v_int64 vx_load_low(const int64 * ptr) { return VXPREFIX(_load_low)(ptr); }
inline v_uint64 vx_load_low(const uint64 * ptr) { return VXPREFIX(_load_low)(ptr); }
// double lanes only exist when a 64-bit-float-capable backend is enabled
#if CV_SIMD_64F || CV_SIMD_SCALABLE_64F
inline v_float64 vx_load_low(const double * ptr) { return VXPREFIX(_load_low)(ptr); }
#endif
  567. //! @}
//! @name Wide load halfs from memory
//! @{
//! @brief Load maximum available capacity register contents from two memory blocks
// ptr0 fills the lower half, ptr1 the upper half (per the v_load_halves contract
// of the underlying backend).
inline v_uint8 vx_load_halves(const uchar * ptr0, const uchar * ptr1) { return VXPREFIX(_load_halves)(ptr0, ptr1); }
inline v_int8 vx_load_halves(const schar * ptr0, const schar * ptr1) { return VXPREFIX(_load_halves)(ptr0, ptr1); }
inline v_uint16 vx_load_halves(const ushort * ptr0, const ushort * ptr1) { return VXPREFIX(_load_halves)(ptr0, ptr1); }
inline v_int16 vx_load_halves(const short * ptr0, const short * ptr1) { return VXPREFIX(_load_halves)(ptr0, ptr1); }
inline v_int32 vx_load_halves(const int * ptr0, const int * ptr1) { return VXPREFIX(_load_halves)(ptr0, ptr1); }
inline v_uint32 vx_load_halves(const unsigned * ptr0, const unsigned * ptr1) { return VXPREFIX(_load_halves)(ptr0, ptr1); }
inline v_float32 vx_load_halves(const float * ptr0, const float * ptr1) { return VXPREFIX(_load_halves)(ptr0, ptr1); }
inline v_int64 vx_load_halves(const int64 * ptr0, const int64 * ptr1) { return VXPREFIX(_load_halves)(ptr0, ptr1); }
inline v_uint64 vx_load_halves(const uint64 * ptr0, const uint64 * ptr1) { return VXPREFIX(_load_halves)(ptr0, ptr1); }
// double lanes exist only when the backend provides 64-bit float support
#if CV_SIMD_64F || CV_SIMD_SCALABLE_64F
inline v_float64 vx_load_halves(const double * ptr0, const double * ptr1) { return VXPREFIX(_load_halves)(ptr0, ptr1); }
#endif
//! @}
//! @name Wide LUT of elements
//! @{
//! @brief Load maximum available capacity register contents with array elements by provided indexes
// Gather: lane i of the result is ptr[idx[i]]. idx must hold one index per lane
// of the destination vector type.
inline v_uint8 vx_lut(const uchar * ptr, const int* idx) { return VXPREFIX(_lut)(ptr, idx); }
inline v_int8 vx_lut(const schar * ptr, const int* idx) { return VXPREFIX(_lut)(ptr, idx); }
inline v_uint16 vx_lut(const ushort * ptr, const int* idx) { return VXPREFIX(_lut)(ptr, idx); }
inline v_int16 vx_lut(const short* ptr, const int* idx) { return VXPREFIX(_lut)(ptr, idx); }
inline v_int32 vx_lut(const int* ptr, const int* idx) { return VXPREFIX(_lut)(ptr, idx); }
inline v_uint32 vx_lut(const unsigned* ptr, const int* idx) { return VXPREFIX(_lut)(ptr, idx); }
inline v_float32 vx_lut(const float* ptr, const int* idx) { return VXPREFIX(_lut)(ptr, idx); }
inline v_int64 vx_lut(const int64 * ptr, const int* idx) { return VXPREFIX(_lut)(ptr, idx); }
inline v_uint64 vx_lut(const uint64 * ptr, const int* idx) { return VXPREFIX(_lut)(ptr, idx); }
// double lanes exist only when the backend provides 64-bit float support
#if CV_SIMD_64F || CV_SIMD_SCALABLE_64F
inline v_float64 vx_lut(const double* ptr, const int* idx) { return VXPREFIX(_lut)(ptr, idx); }
#endif
//! @}
//! @name Wide LUT of element pairs
//! @{
//! @brief Load maximum available capacity register contents with array element pairs by provided indexes
// Gather in pairs: each index selects two consecutive elements, so idx needs
// only nlanes/2 entries.
inline v_uint8 vx_lut_pairs(const uchar * ptr, const int* idx) { return VXPREFIX(_lut_pairs)(ptr, idx); }
inline v_int8 vx_lut_pairs(const schar * ptr, const int* idx) { return VXPREFIX(_lut_pairs)(ptr, idx); }
inline v_uint16 vx_lut_pairs(const ushort * ptr, const int* idx) { return VXPREFIX(_lut_pairs)(ptr, idx); }
inline v_int16 vx_lut_pairs(const short* ptr, const int* idx) { return VXPREFIX(_lut_pairs)(ptr, idx); }
inline v_int32 vx_lut_pairs(const int* ptr, const int* idx) { return VXPREFIX(_lut_pairs)(ptr, idx); }
inline v_uint32 vx_lut_pairs(const unsigned* ptr, const int* idx) { return VXPREFIX(_lut_pairs)(ptr, idx); }
inline v_float32 vx_lut_pairs(const float* ptr, const int* idx) { return VXPREFIX(_lut_pairs)(ptr, idx); }
inline v_int64 vx_lut_pairs(const int64 * ptr, const int* idx) { return VXPREFIX(_lut_pairs)(ptr, idx); }
inline v_uint64 vx_lut_pairs(const uint64 * ptr, const int* idx) { return VXPREFIX(_lut_pairs)(ptr, idx); }
// double lanes exist only when the backend provides 64-bit float support
#if CV_SIMD_64F || CV_SIMD_SCALABLE_64F
inline v_float64 vx_lut_pairs(const double* ptr, const int* idx) { return VXPREFIX(_lut_pairs)(ptr, idx); }
#endif
//! @}
//! @name Wide LUT of element quads
//! @{
//! @brief Load maximum available capacity register contents with array element quads by provided indexes
// Gather in quads: each index selects four consecutive elements (nlanes/4 indexes).
// Only lane types up to 32 bits are provided here — no 64-bit/double quad LUT
// overloads exist in this group.
inline v_uint8 vx_lut_quads(const uchar* ptr, const int* idx) { return VXPREFIX(_lut_quads)(ptr, idx); }
inline v_int8 vx_lut_quads(const schar* ptr, const int* idx) { return VXPREFIX(_lut_quads)(ptr, idx); }
inline v_uint16 vx_lut_quads(const ushort* ptr, const int* idx) { return VXPREFIX(_lut_quads)(ptr, idx); }
inline v_int16 vx_lut_quads(const short* ptr, const int* idx) { return VXPREFIX(_lut_quads)(ptr, idx); }
inline v_int32 vx_lut_quads(const int* ptr, const int* idx) { return VXPREFIX(_lut_quads)(ptr, idx); }
inline v_uint32 vx_lut_quads(const unsigned* ptr, const int* idx) { return VXPREFIX(_lut_quads)(ptr, idx); }
inline v_float32 vx_lut_quads(const float* ptr, const int* idx) { return VXPREFIX(_lut_quads)(ptr, idx); }
//! @}
//! @name Wide load with double expansion
//! @{
//! @brief Load maximum available capacity register contents from memory with double expand
// Loads narrow elements and widens each lane to twice its bit width
// (e.g. uchar -> v_uint16). The hfloat overload widens half floats to v_float32.
inline v_uint16 vx_load_expand(const uchar * ptr) { return VXPREFIX(_load_expand)(ptr); }
inline v_int16 vx_load_expand(const schar * ptr) { return VXPREFIX(_load_expand)(ptr); }
inline v_uint32 vx_load_expand(const ushort * ptr) { return VXPREFIX(_load_expand)(ptr); }
inline v_int32 vx_load_expand(const short* ptr) { return VXPREFIX(_load_expand)(ptr); }
inline v_int64 vx_load_expand(const int* ptr) { return VXPREFIX(_load_expand)(ptr); }
inline v_uint64 vx_load_expand(const unsigned* ptr) { return VXPREFIX(_load_expand)(ptr); }
inline v_float32 vx_load_expand(const hfloat * ptr) { return VXPREFIX(_load_expand)(ptr); }
//! @}
//! @name Wide load with quad expansion
//! @{
//! @brief Load maximum available capacity register contents from memory with quad expand
// Widens each 8-bit element to a 32-bit lane (four-fold expansion).
inline v_uint32 vx_load_expand_q(const uchar * ptr) { return VXPREFIX(_load_expand_q)(ptr); }
inline v_int32 vx_load_expand_q(const schar * ptr) { return VXPREFIX(_load_expand_q)(ptr); }
//! @}
/** @brief SIMD processing state cleanup call */
// Forwards to the backend's cleanup hook; what it does is backend-specific
// (defined elsewhere in this file's includes).
inline void vx_cleanup() { VXPREFIX(_cleanup)(); }
#if !CV_SIMD_SCALABLE
// Compatibility layer
// Gives fixed-width universal-intrinsic types the same free-function API that
// the scalable-SIMD path exposes (VTraits, v_get0, variadic v_add/v_mul, ...).
#if !(CV_NEON && !defined(CV_FORCE_SIMD128_CPP))
// VTraits<T>: compile-time lane information for a fixed-size vector type.
// For non-scalable backends the lane count is a constant, so vlanes(),
// nlanes and max_nlanes all derive from T::nlanes.
template<typename T> struct VTraits {
static inline int vlanes() { return T::nlanes; }
enum { nlanes = T::nlanes, max_nlanes = T::nlanes };
using lane_type = typename T::lane_type;
};
//////////// get0 ////////////
// Exposes the member accessor v.get0() (first lane) as a free function v_get0().
#define OPENCV_HAL_WRAP_GRT0(_Tpvec) \
inline typename VTraits<_Tpvec>::lane_type v_get0(const _Tpvec& v) \
{ \
return v.get0(); \
}
OPENCV_HAL_WRAP_GRT0(v_uint8)
OPENCV_HAL_WRAP_GRT0(v_int8)
OPENCV_HAL_WRAP_GRT0(v_uint16)
OPENCV_HAL_WRAP_GRT0(v_int16)
OPENCV_HAL_WRAP_GRT0(v_uint32)
OPENCV_HAL_WRAP_GRT0(v_int32)
OPENCV_HAL_WRAP_GRT0(v_uint64)
OPENCV_HAL_WRAP_GRT0(v_int64)
OPENCV_HAL_WRAP_GRT0(v_float32)
#if CV_SIMD_64F
OPENCV_HAL_WRAP_GRT0(v_float64)
#endif
// Also wrap the explicit 128-bit types when the default width is wider
// (see the CV_SIMD128-with-wider-SIMD note in the v_add section below... same
// rationale: with 256/512-bit defaults, v_uint8x16 etc. are distinct types).
#if CV_SIMD_WIDTH != 16/*128*/ && CV_SIMD128
OPENCV_HAL_WRAP_GRT0(v_uint8x16)
OPENCV_HAL_WRAP_GRT0(v_uint16x8)
OPENCV_HAL_WRAP_GRT0(v_uint32x4)
OPENCV_HAL_WRAP_GRT0(v_uint64x2)
OPENCV_HAL_WRAP_GRT0(v_int8x16)
OPENCV_HAL_WRAP_GRT0(v_int16x8)
OPENCV_HAL_WRAP_GRT0(v_int32x4)
OPENCV_HAL_WRAP_GRT0(v_int64x2)
OPENCV_HAL_WRAP_GRT0(v_float32x4)
#if CV_SIMD_64F
OPENCV_HAL_WRAP_GRT0(v_float64x2)
#endif
#endif
// Likewise for explicit 256-bit types under 512-bit defaults.
#if CV_SIMD_WIDTH != 32/*256*/ && CV_SIMD256
OPENCV_HAL_WRAP_GRT0(v_uint8x32)
OPENCV_HAL_WRAP_GRT0(v_uint16x16)
OPENCV_HAL_WRAP_GRT0(v_uint32x8)
OPENCV_HAL_WRAP_GRT0(v_uint64x4)
OPENCV_HAL_WRAP_GRT0(v_int8x32)
OPENCV_HAL_WRAP_GRT0(v_int16x16)
OPENCV_HAL_WRAP_GRT0(v_int32x8)
OPENCV_HAL_WRAP_GRT0(v_int64x4)
OPENCV_HAL_WRAP_GRT0(v_float32x8)
#if CV_SIMD_64F
OPENCV_HAL_WRAP_GRT0(v_float64x4)
#endif
#endif
#endif
// Variadic v_add: accepts 3 or more operands and folds them left-to-right
// through the binary v_add. NOTE(review): despite the ADDSUB name, only v_add
// is wrapped here; no variadic v_sub is defined in this section — presumably
// the name is historical. Confirm against the binary v_add/v_sub definitions
// elsewhere in this header.
#define OPENCV_HAL_WRAP_BIN_OP_ADDSUB(_Tpvec) \
template<typename... Args> \
inline _Tpvec v_add(const _Tpvec& f1, const _Tpvec& f2, const _Tpvec& f3, const Args&... vf) { \
return v_add(v_add(f1, f2), f3, vf...); \
}
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint8)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint16)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint32)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint64)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int8)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int16)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int32)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int64)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_float32)
#if CV_SIMD_64F
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_float64)
#endif
#if CV_SIMD_WIDTH != 16/*128*/ && CV_SIMD128
// when we use CV_SIMD128 with 256/512 bit SIMD (e.g. AVX2 or AVX512)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint8x16)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint16x8)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint32x4)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint64x2)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int8x16)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int16x8)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int32x4)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int64x2)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_float32x4)
#if CV_SIMD_64F
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_float64x2)
#endif
#endif
#if CV_SIMD_WIDTH != 32/*256*/ && CV_SIMD256
// when we use CV_SIMD256 with 512 bit SIMD (e.g. AVX512)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint8x32)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint16x16)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint32x8)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_uint64x4)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int8x32)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int16x16)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int32x8)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_int64x4)
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_float32x8)
#if CV_SIMD_64F
OPENCV_HAL_WRAP_BIN_OP_ADDSUB(v_float64x4)
#endif
#endif
// Variadic v_mul: accepts 3 or more operands and folds them left-to-right
// through the binary v_mul. Note: unlike the v_add list above, no 64-bit
// integer types are instantiated here.
#define OPENCV_HAL_WRAP_BIN_OP_MUL(_Tpvec) \
template<typename... Args> \
inline _Tpvec v_mul(const _Tpvec& f1, const _Tpvec& f2, const _Tpvec& f3, const Args&... vf) { \
return v_mul(v_mul(f1, f2), f3, vf...); \
}
OPENCV_HAL_WRAP_BIN_OP_MUL(v_uint8)
OPENCV_HAL_WRAP_BIN_OP_MUL(v_int8)
OPENCV_HAL_WRAP_BIN_OP_MUL(v_uint16)
OPENCV_HAL_WRAP_BIN_OP_MUL(v_uint32)
OPENCV_HAL_WRAP_BIN_OP_MUL(v_int16)
OPENCV_HAL_WRAP_BIN_OP_MUL(v_int32)
OPENCV_HAL_WRAP_BIN_OP_MUL(v_float32)
#if CV_SIMD_64F
OPENCV_HAL_WRAP_BIN_OP_MUL(v_float64)
#endif
// Explicit 128-bit types when the default width is wider (cf. v_add section).
#if CV_SIMD_WIDTH != 16/*128*/ && CV_SIMD128
OPENCV_HAL_WRAP_BIN_OP_MUL(v_uint8x16)
OPENCV_HAL_WRAP_BIN_OP_MUL(v_uint16x8)
OPENCV_HAL_WRAP_BIN_OP_MUL(v_uint32x4)
OPENCV_HAL_WRAP_BIN_OP_MUL(v_int8x16)
OPENCV_HAL_WRAP_BIN_OP_MUL(v_int16x8)
OPENCV_HAL_WRAP_BIN_OP_MUL(v_int32x4)
OPENCV_HAL_WRAP_BIN_OP_MUL(v_float32x4)
#if CV_SIMD_64F
OPENCV_HAL_WRAP_BIN_OP_MUL(v_float64x2)
#endif
#endif
// Explicit 256-bit types under 512-bit defaults.
#if CV_SIMD_WIDTH != 32/*256*/ && CV_SIMD256
OPENCV_HAL_WRAP_BIN_OP_MUL(v_uint8x32)
OPENCV_HAL_WRAP_BIN_OP_MUL(v_uint16x16)
OPENCV_HAL_WRAP_BIN_OP_MUL(v_uint32x8)
OPENCV_HAL_WRAP_BIN_OP_MUL(v_int8x32)
OPENCV_HAL_WRAP_BIN_OP_MUL(v_int16x16)
OPENCV_HAL_WRAP_BIN_OP_MUL(v_int32x8)
OPENCV_HAL_WRAP_BIN_OP_MUL(v_float32x8)
#if CV_SIMD_64F
OPENCV_HAL_WRAP_BIN_OP_MUL(v_float64x4)
#endif
#endif
// v_extract_highest: returns the last lane of the vector, using the
// compile-time index nlanes-1 with v_extract_n.
#define OPENCV_HAL_WRAP_EXTRACT(_Tpvec) \
inline typename VTraits<_Tpvec>::lane_type v_extract_highest(const _Tpvec& v) \
{ \
return v_extract_n<VTraits<_Tpvec>::nlanes-1>(v); \
}
OPENCV_HAL_WRAP_EXTRACT(v_uint8)
OPENCV_HAL_WRAP_EXTRACT(v_int8)
OPENCV_HAL_WRAP_EXTRACT(v_uint16)
OPENCV_HAL_WRAP_EXTRACT(v_int16)
OPENCV_HAL_WRAP_EXTRACT(v_uint32)
OPENCV_HAL_WRAP_EXTRACT(v_int32)
OPENCV_HAL_WRAP_EXTRACT(v_uint64)
OPENCV_HAL_WRAP_EXTRACT(v_int64)
OPENCV_HAL_WRAP_EXTRACT(v_float32)
#if CV_SIMD_64F
OPENCV_HAL_WRAP_EXTRACT(v_float64)
#endif
// Explicit 128-bit types when the default width is wider (cf. v_add section).
#if CV_SIMD_WIDTH != 16/*128*/ && CV_SIMD128
OPENCV_HAL_WRAP_EXTRACT(v_uint8x16)
OPENCV_HAL_WRAP_EXTRACT(v_uint16x8)
OPENCV_HAL_WRAP_EXTRACT(v_uint32x4)
OPENCV_HAL_WRAP_EXTRACT(v_uint64x2)
OPENCV_HAL_WRAP_EXTRACT(v_int8x16)
OPENCV_HAL_WRAP_EXTRACT(v_int16x8)
OPENCV_HAL_WRAP_EXTRACT(v_int32x4)
OPENCV_HAL_WRAP_EXTRACT(v_int64x2)
OPENCV_HAL_WRAP_EXTRACT(v_float32x4)
#if CV_SIMD_64F
OPENCV_HAL_WRAP_EXTRACT(v_float64x2)
#endif
#endif
// Explicit 256-bit types under 512-bit defaults.
#if CV_SIMD_WIDTH != 32/*256*/ && CV_SIMD256
OPENCV_HAL_WRAP_EXTRACT(v_uint8x32)
OPENCV_HAL_WRAP_EXTRACT(v_uint16x16)
OPENCV_HAL_WRAP_EXTRACT(v_uint32x8)
OPENCV_HAL_WRAP_EXTRACT(v_uint64x4)
OPENCV_HAL_WRAP_EXTRACT(v_int8x32)
OPENCV_HAL_WRAP_EXTRACT(v_int16x16)
OPENCV_HAL_WRAP_EXTRACT(v_int32x8)
OPENCV_HAL_WRAP_EXTRACT(v_int64x4)
OPENCV_HAL_WRAP_EXTRACT(v_float32x8)
#if CV_SIMD_64F
OPENCV_HAL_WRAP_EXTRACT(v_float64x4)
#endif
#endif
// v_broadcast_highest: replicates the last lane (index nlanes-1) across the
// whole vector. Only 32-bit lane types are instantiated in this group.
#define OPENCV_HAL_WRAP_BROADCAST(_Tpvec) \
inline _Tpvec v_broadcast_highest(const _Tpvec& v) \
{ \
return v_broadcast_element<VTraits<_Tpvec>::nlanes-1>(v) \
}
OPENCV_HAL_WRAP_BROADCAST(v_uint32)
OPENCV_HAL_WRAP_BROADCAST(v_int32)
OPENCV_HAL_WRAP_BROADCAST(v_float32)
// Explicit 128-bit types when the default width is wider (cf. v_add section).
#if CV_SIMD_WIDTH != 16/*128*/ && CV_SIMD128
OPENCV_HAL_WRAP_BROADCAST(v_uint32x4)
OPENCV_HAL_WRAP_BROADCAST(v_int32x4)
OPENCV_HAL_WRAP_BROADCAST(v_float32x4)
#endif
// Explicit 256-bit types under 512-bit defaults.
#if CV_SIMD_WIDTH != 32/*256*/ && CV_SIMD256
OPENCV_HAL_WRAP_BROADCAST(v_uint32x8)
OPENCV_HAL_WRAP_BROADCAST(v_int32x8)
OPENCV_HAL_WRAP_BROADCAST(v_float32x8)
#endif
#endif //!CV_SIMD_SCALABLE
  851. //! @cond IGNORED
  852. // backward compatibility
  853. template<typename _Tp, typename _Tvec> static inline
  854. void vx_store(_Tp* dst, const _Tvec& v) { return v_store(dst, v); }
  855. // backward compatibility
  856. template<typename _Tp, typename _Tvec> static inline
  857. void vx_store_aligned(_Tp* dst, const _Tvec& v) { return v_store_aligned(dst, v); }
  858. //! @endcond
  859. //! @}
  860. #undef VXPREFIX
  861. } // namespace
  862. #ifndef CV_SIMD_FP16
  863. #define CV_SIMD_FP16 0 //!< Defined to 1 on native support of operations with float16x8_t / float16x16_t (SIMD256) types
  864. #endif
  865. #ifndef CV_SIMD
  866. #define CV_SIMD 0
  867. #endif
  868. #include "simd_utils.impl.hpp"
  869. #ifndef CV_DOXYGEN
  870. CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END
  871. #endif
  872. } // cv::
  873. //! @endcond
  874. #if defined(__GNUC__) && __GNUC__ == 12
  875. #pragma GCC diagnostic pop
  876. #endif
  877. #endif