Adjust and update some functionality

masayume 2022-06-28 09:24:29 +08:00
parent 35a9c7b602
commit 898cb00afa
683 changed files with 33817 additions and 367556 deletions

3rdparty/nick/StopWatch.h (vendored, new file, 34 lines added)

@ -0,0 +1,34 @@
#pragma once
#include <chrono>
class StopWatch
{
public:
StopWatch() {
_start = std::chrono::steady_clock::now();
}
void reset() {
_start = std::chrono::steady_clock::now();
}
double elapsed_s() {
return std::chrono::duration<double>(std::chrono::steady_clock::now() - _start).count();
}
double elapsed_ms() {
return std::chrono::duration<double, std::milli>(std::chrono::steady_clock::now() - _start).count();
}
double elapsed_us() {
return std::chrono::duration<double, std::micro>(std::chrono::steady_clock::now() - _start).count();
}
double elapsed_ns() {
return std::chrono::duration<double, std::nano>(std::chrono::steady_clock::now() - _start).count();
}
private:
std::chrono::steady_clock::time_point _start;
};
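A minimal usage sketch for the timer above (not part of the commit; the call site and printf formatting are illustrative):
#include <cstdio>
#include "StopWatch.h"
void time_two_stages()
{
    StopWatch sw;                                   // starts timing on construction
    /* ... first stage of work ... */
    printf("stage 1 took %.3f ms\n", sw.elapsed_ms());
    sw.reset();                                     // restart the timer for the next stage
    /* ... second stage of work ... */
    printf("stage 2 took %.3f us\n", sw.elapsed_us());
}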

3rdparty/nick/callbackdefines.h (vendored, new file, 12 lines added)

@ -0,0 +1,12 @@
#ifndef CALLBACKDEFINESH
#define CALLBACKDEFINESH
#include <type_traits>
typedef void(*usbreport_callback)(int conditioncode,void* usrdata);
typedef void(*usbcallback)(int conditioncode,void* usrdata);
//typedef void(*onimagecallback)(void* mat, int bpp, int statuscode);
typedef std::decay<void(void*,int,int)>::type onimagecallback;
typedef std::decay<void(int,void*)>::type usbcallback;
#endif
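A sketch of handlers matching the callback types above; the registry struct and register_callbacks() are hypothetical names added for illustration, only the typedefs come from the header:
#include <cstdio>
#include "callbackdefines.h"
/* matches usbcallback, i.e. void(*)(int, void*) */
static void on_usb_event(int conditioncode, void* usrdata)
{
    printf("usb event, condition=%d\n", conditioncode);
}
/* matches onimagecallback: std::decay<void(void*,int,int)>::type is void(*)(void*, int, int) */
static void on_image(void* mat, int bpp, int statuscode)
{
    printf("image received, bpp=%d status=%d\n", bpp, statuscode);
}
/* hypothetical registry that stores the pointers for later invocation by the USB layer */
struct CallbackRegistry
{
    usbcallback usb = nullptr;
    onimagecallback image = nullptr;
    void* usrdata = nullptr;
};
static void register_callbacks(CallbackRegistry& r, void* user)
{
    r.usb = &on_usb_event;
    r.image = &on_image;
    r.usrdata = user;
}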

3rdparty/nick/common.h (vendored, new file, 280 lines added)

@ -0,0 +1,280 @@
#ifndef COMMON_H
#define COMMON_H
#ifdef __cplusplus
extern "C"{
#endif
typedef struct hg_tag_SIZE
{
long cx;
long cy;
}CSSIZE, *PCSSIZE, *LPCSSIZE;
typedef CSSIZE CSSIZEL;
typedef CSSIZE *PCSSIZEL, *LPCSSIZEL;
enum hg_tagUsbSupported
{
/* stop scanning */
SCAN_STOP = -1,
/* error */
HAVE_ERROR = -2,
/* normal state */
NORMAL = 0,
/* cover open */
OPEN_COVER = 1,
/* no paper */
NO_FEED = 2,
/* paper feed-in failed */
FEED_IN_ERROR = 4,
/* paper jam */
PAPER_JAM = 8,
/* double feed detected */
DETECT_DOUBLE_FEED = 16,
/* staple detected */
DETECT_STAPLE = 32,
/* paper skew */
PAPER_SKEW = 64,
/* automatic mode */
AUTO_SCAN_MODE = 65,
/* manual mode */
MANAUL_SCAN_MODE = 66,
/* counting mode */
COUNT_MODE = 67,
/* hardware error */
HARDWARE_ERROR = 68,
/* FPGA crash (shares the value of HARDWARE_ERROR) */
FPGA_ERROR = 68,
/* start */
START_SCAN = 69,
/* stop */
STOP_SCAN = 70,
/* image available */
HAVE_IMAGE = 71,
/* update scan parameters */
UPDATE_SCAN_PARAMETER = 72,
/* PC busy or error */
PC_SCAN_BUSY_or_ERROR = 73,
/* USB connection lost */
DEVICE_OFF_LINE = 74
};
typedef enum hg_tagUsbSupported tagUsbSupported;
enum hg_twSS
{
None = 0,
A4Letter = 1,
A4 = 1,
B5Letter = 2,
JISB5 = 2,
B5 = 2,
USLetter = 3,
USLegal = 4,
A5 = 5,
B4 = 6,
ISOB4 = 6,
B6 = 7,
ISOB6 = 7,
USLedger = 9,
USExecutive = 10,
A3 = 11,
B3 = 12,
ISOB3 = 12,
A6 = 13,
C4 = 14,
C5 = 15,
C6 = 16,
_4A0 = 17,
_2A0 = 18,
A0 = 19,
A1 = 20,
A2 = 21,
A7 = 22,
A8 = 23,
A9 = 24,
A10 = 25,
ISOB0 = 26,
ISOB1 = 27,
ISOB2 = 28,
ISOB5 = 29,
ISOB7 = 30,
ISOB8 = 31,
ISOB9 = 32,
ISOB10 = 33,
JISB0 = 34,
JISB1 = 35,
JISB2 = 36,
JISB3 = 37,
JISB4 = 38,
JISB6 = 39,
JISB7 = 40,
JISB8 = 41,
JISB9 = 42,
JISB10 = 43,
C0 = 44,
C1 = 45,
C2 = 46,
C3 = 47,
C7 = 48,
C8 = 49,
C9 = 50,
C10 = 51,
USStatement = 52,
BusinessCard = 53,
MaxSize = 54,
};
typedef enum hg_twSS TwSS;
enum hg_tagFrontBack
{
FRONT_PAGE = 0,
BACK_PAGE
};
typedef enum hg_tagFrontBack FRONTBACK;
enum hg_tagFilter
{
FILTER_RED,
FILTER_GREEN,
FILTER_BLUE,
FILTER_ALL,
FILTER_NONE,
ENHANCE_RED,
ENHANCE_GREEN,
ENHANCE_BLUE
};
typedef enum hg_tagFilter Filter;
enum hg_tagOrentations
{
ROTATE_NONE = 0,
ROTATE_90,
ROTATE_180,
ROTATE_270,
AUTOTEXT_DETECT
};
typedef enum hg_tagOrentations Orentations;
struct hg_tagOutHoleParam
{
int OutHole;
int OutHoleValue;/*1~50;*/
};
typedef struct hg_tagOutHoleParam OutHoleParams;
struct hg_tagCropRect
{
int enable;
int x; /*x coordinate of the top-left corner of the custom crop region*/
int y; /*y coordinate of the top-left corner of the custom crop region*/
int width; /*width of the custom crop region*/
int height; /*height of the custom crop region*/
};
typedef struct hg_tagCropRect CropRect;
struct hg_tagCustomGamma
{
int isDefined;
unsigned char* table;
int tableLength;
};
typedef struct hg_tagCustomGamma CustomGamma;
enum hg_PaperAlign {
Rot0 = 0,
Rot270 = 3,
AutoTextOrientation = 5
};
typedef enum hg_PaperAlign PaperAlign;
enum hg_Multi_output {
Unused = -1,
All,
ColorGray,
ColorBw,
GrayBw
};
typedef enum hg_Multi_output Multi_output;
struct hg_tagImageProcessParams
{
int PixType; /*same as color*/
int DestResulution; /*same as resolution*/
int NativeResulution; /*fixed 200 for now*/
int AutoDiscardBlank; /*skip blank pages (general)*/
int AutoDiscardBlankVince; /*skip blank pages (invoices)*/
int IsDuplex; /*false: single-sided*/
int IsFold; /*fold in half*/
int AutoDescrew;
int AutoCrop;
int FillBlackRect;
int Filter; /*decolor ,0:red 1:green 2:blue 3:none encolor 5:red 6:green 7:blue*/
OutHoleParams OutHoleParam;
int Orentation; /*0:none 1:90 2:180 3:270 4:auto*/
int BackRotate180;
int Brightness; /*1~255*/
int Contrast; /*1~7*/
float Gamma; /*0.1f~5.0f*/
int MultiOutRed;
int MultiOutputType;/*-1:none 0:all 1:COLORGRAY 2:COLORBW 3:GRAYBW*/
CropRect cropRect;
CustomGamma customGamma;/*tone curve*/
int RefuseInflow;/*prevent bleed-through*/
int ColorCorrection;/*color correction*/
int RemoveMorr; /*remove moire pattern*/
int ErrorExtention; /*error diffusion*/
int TextureRemove;/*remove screen/texture pattern*/
int imageSharpen;/*0:none 1:sharpen 2:sharpen_more 3:blur 4:blur_more*/
int SplitImage;
int AnswerSheetFilter;
int NosieDetach;
int AutoDetctOrentation;
};
typedef struct hg_tagImageProcessParams ImageProcessParams;
enum hg_color_mode {
BW,
Gray,
Color
};
typedef enum hg_color_mode ColorMode;
struct hg_tagScanParams
{
int colorMode;/*2:color 1:gray 0:bw*/
int papertype;
PaperAlign paperAlign;
int Resolution;/*fixed 200 for now*/
int UltrasonicDetect;/*double check*/
int BindingDetect;/*staple check*/
int ScrewDetect;
int ScrewTopLevel;/*1-5,1 easiest*/
int ScanCount;/*1-500*/
ImageProcessParams ImageProcessParam;
};
typedef struct hg_tagScanParams ScanParam;
struct hg_tagImageInfo
{
int Width;
int Height;
int bpp;
};
typedef struct hg_tagImageInfo ImageInfo;
enum hg_sharpenType
{
SharpenNone,
Sharpen, /*sharpen*/
SharpenMore, /*sharpen more*/
Blur, /*blur*/
BlurMore /*blur more*/
};
typedef enum hg_sharpenType ImageSharpen;
#ifdef __cplusplus
}
#endif
#endif
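A hedged sketch of populating the scan parameters defined above before handing them to the scanner layer; the chosen values are examples, not defaults documented by the header:
#include <cstring>
#include "common.h"
ScanParam make_default_scan_param()
{
    ScanParam p;
    std::memset(&p, 0, sizeof(p));
    p.colorMode = 2;                       /* 2: color, per the colorMode comment */
    p.papertype = A4;                      /* TwSS paper-size code */
    p.paperAlign = Rot0;
    p.Resolution = 200;                    /* fixed 200 for now, per the header */
    p.UltrasonicDetect = 1;                /* double-feed check */
    p.ScanCount = 10;                      /* 1-500 */
    ImageProcessParams& ip = p.ImageProcessParam;
    ip.PixType = 2;
    ip.DestResulution = 200;
    ip.NativeResulution = 200;
    ip.IsDuplex = 1;
    ip.AutoCrop = 1;
    ip.Filter = 3;                         /* 3: none, per the Filter comment */
    ip.Brightness = 128;                   /* 1~255 */
    ip.Contrast = 4;                       /* 1~7 */
    ip.Gamma = 1.0f;                       /* 0.1f~5.0f */
    ip.MultiOutputType = -1;               /* -1: none */
    return p;
}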

3rdparty/nick/predefine.h (vendored, new file, 155 lines added)

@ -0,0 +1,155 @@
#ifndef PRE_DEFINE_H
#define PRE_DEFINE_H
#define G100SCANNER
#define EN_LOG
#include <vector>
#ifndef _WIN32
typedef bool BOOL;
typedef unsigned int UINT32;
#endif
typedef unsigned int u32;
typedef struct
{
u32 gainF[6];
u32 gainB[6];
u32 offsetsF[6];
u32 offsetsB[6];
u32 expF[3];
u32 expB[3];
u32 sp;
}HGCISConfig;
typedef struct
{
HGCISConfig colorCorrect;
HGCISConfig color;
HGCISConfig grayCorrect;
HGCISConfig gray;
}HGCorrectConfigs;
struct SPSET
{
unsigned int FSP;
unsigned int BSP;
};
typedef struct CorrectParam {
unsigned int Exposures[6];
unsigned int Gain[12];
unsigned int Offset[12];
} CorrectParam;
typedef struct CaptureParams
{
int correctColorExposure[6];
int correctColorGain[12];
int correctColorOffset[12];
int correctGrayExposure[6];
int correctGrayGain[12];
int correctGrayOffset[12];
int colorExposure[6];
int colorGain[12];
int colorOffset[12];
int grayExposure[6];
int grayGain[12];
int grayOffset[12];
int uvCorrectColorExposure[2];
int uvCorrectGrayExposure[2];
int uvColorExposure[2];
int uvGrayExposure[2];
} CaptureParams;
typedef struct hgsize{
hgsize(){}
template<typename T1, typename T2>
hgsize(T1 x,T2 y)
{
cy = y;
cx = x;
}
bool operator == (hgsize s)
{
if(s.cx == this->cx && s.cy == this->cy)
return true;
return false;
}
bool isempty()
{
return (this->cy*this->cx)?0:1;
}
int cy;
int cx;
}HgSize,HGSIZE;
enum ScannerSerial: unsigned char
{
G100Serial,
G200Serial,
G300Serial,
G400Serial,
G10039Serial,
G20039Serial,
};
struct Vid_pid
{
Vid_pid(unsigned short set_vid, unsigned short set_pid) :
vid(set_vid),
pid(set_pid) {}
bool operator == (Vid_pid sre)
{
if (sre.pid == this->pid && sre.vid == this->vid)
return true;
return false;
}
unsigned short vid;
unsigned short pid;
};
///#define LANXUMVERSION
#define HGVERSION
#ifdef EN_LOG
#define LOG printf
#else
#define LOG
#endif
#ifdef HGVERSION
#ifdef G100SCANNER
static std::vector<Vid_pid> DEVICE_ID={
{0x3072,0x100},
{0x3072,0x139}
};
#elif defined(G200SCANNER)
static std::vector<Vid_pid> DEVICE_ID={
{0x3072,0x200},
{0x3072,0x239}
};
#elif defined(G300SCANNER)
static std::vector<Vid_pid> DEVICE_ID={
{0x3072,0x300},
};
#else
static std::vector<Vid_pid> DEVICE_ID={
{0x3072,0x400},
};
#endif
#elif defined(LANXUMVERSION)
static std::vector<Vid_pid> DEVICE_ID={
{0x31c9,0x8730},
};
#endif
#endif
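A sketch showing how the Vid_pid table above can be used to recognise a device from USB descriptor values; the lookup function name is an illustration, only Vid_pid and DEVICE_ID come from the header:
#include "predefine.h"
/* true when idVendor/idProduct matches one of the compiled-in DEVICE_ID entries */
static bool is_supported_device(unsigned short idVendor, unsigned short idProduct)
{
    Vid_pid probe(idVendor, idProduct);
    for (size_t i = 0; i < DEVICE_ID.size(); ++i)
    {
        if (DEVICE_ID[i] == probe)   /* Vid_pid::operator== compares vid and pid */
            return true;
    }
    return false;
}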

3rdparty/nick/sane_common.h (vendored, new file, 467 lines added)

@ -0,0 +1,467 @@
#ifndef SANE_COMMON_H
#define SANE_COMMON_H
#ifdef __cplusplus
#include <stdint.h>
#ifdef __linux__
typedef unsigned char byte;
#endif // __linux__
extern "C"{
#endif
typedef struct hg_tag_SIZE
{
long cx;
long cy;
}CSSIZE, *PCSSIZE, *LPCSSIZE;
typedef CSSIZE CSSIZEL;
typedef CSSIZE *PCSSIZEL, *LPCSSIZEL;
enum hg_tagUsbSupported
{
/* stop scanning */
SCAN_STOP = -1,
/* error */
HAVE_ERROR = -2,
/* normal state */
NORMAL = 0,
/* cover open */
OPEN_COVER = 1,
/* no paper */
NO_FEED = 2,
/* paper feed-in failed */
FEED_IN_ERROR = 4,
/* paper jam */
PAPER_JAM = 8,
/* double feed detected */
DETECT_DOUBLE_FEED = 16,
/* staple detected */
DETECT_STAPLE = 32,
/* paper skew */
PAPER_SKEW = 64,
/* automatic mode */
AUTO_SCAN_MODE = 65,
/* manual mode */
MANAUL_SCAN_MODE = 66,
/* counting mode */
COUNT_MODE = 67,
/* hardware error */
HARDWARE_ERROR = 68,
/* FPGA crash (shares the value of HARDWARE_ERROR) */
FPGA_ERROR = 68,
/* start */
START_SCAN = 69,
/* stop */
STOP_SCAN = 70,
/* image available */
HAVE_IMAGE = 71,
/* update scan parameters */
UPDATE_SCAN_PARAMETER = 72,
/* PC busy or error */
PC_SCAN_BUSY_or_ERROR = 73,
/* USB connection lost */
DEVICE_OFF_LINE = 74,
/* size error */
SIZE_ERROR = 75,
// image acquisition timed out
AQUIRE_IMAGE_TIMEOUT = 76,
// number of received images does not match the number of scanned sheets
LOSE_IMAGE = 77,
// USB bulk read error
USB_BULK_ERROR = 78,
// V4L2 image acquisition failed
V4L2_AQULRE_ERROR = 79,
// image lost inside the scanner
V4L2_IMAGE_EMPTY = 80,
// device is sleeping
SLEEPING = 81,
// dog-ear (folded corner) detected
HAVE_DOGEAR = 82,
// automatic flat-field correction in progress
AUTO_FLATTING = 198,
// USB not connected
USB_DISCONNECTED = 200,
// user pressed stop
USER_STOP = 201,
// automatic flat-field correction finished
AUTO_FLAT_FINISHED = 202
};
typedef enum tagtwSS
{
None = 0,
A4Letter = 1,
A4 = 1,
B5Letter = 2,
JISB5 = 2,
B5 = 2,
USLetter = 3,
USLegal = 4,
A5 = 5,
B4 = 6,
ISOB4 = 6,
B6 = 7,
ISOB6 = 7,
USLedger = 9,
USExecutive = 10,
A3 = 11,
B3 = 12,
ISOB3 = 12,
A6 = 13,
C4 = 14,
C5 = 15,
C6 = 16,
_4A0 = 17,
_2A0 = 18,
A0 = 19,
A1 = 20,
A2 = 21,
A7 = 22,
A8 = 23,
A9 = 24,
A10 = 25,
ISOB0 = 26,
ISOB1 = 27,
ISOB2 = 28,
ISOB5 = 29,
ISOB7 = 30,
ISOB8 = 31,
ISOB9 = 32,
ISOB10 = 33,
JISB0 = 34,
JISB1 = 35,
JISB2 = 36,
JISB3 = 37,
JISB4 = 38,
JISB6 = 39,
JISB7 = 40,
JISB8 = 41,
JISB9 = 42,
JISB10 = 43,
C0 = 44,
C1 = 45,
C2 = 46,
C3 = 47,
C7 = 48,
C8 = 49,
C9 = 50,
C10 = 51,
USStatement = 52,
BusinessCard = 53,
MaxSize = 54
}TwSS;
typedef enum hg_tagUsbSupported tagUsbSupported;
#pragma pack(push)
#pragma pack(4)
typedef struct tagImageInfo
{
int Width;
int Height;
int bpp;
} ImageInfo;
typedef struct Scan_Rect {
int width;
int height;
int x;
int y;
}ScanRect;
/*********************************************************************************/
//basic parameters
typedef enum tagColorMode {
BlackWhite,
Gray,
RGB
}ColorMode;
typedef enum tagMulti_Output {
Unused = -1,
All,
ColorGray,
ColorBw,
GrayBw
}MultiOutput;
typedef enum tagPaper_Align :unsigned char {
Rot0 = 0,
Rot270 = 3
}PaperAlign;
typedef struct tagCrop_Rect
{
int enable;
int x; /*x coordinate of the top-left corner of the custom crop region*/
int y; /*y coordinate of the top-left corner of the custom crop region*/
int width; /*width of the custom crop region*/
int height; /*height of the custom crop region*/
}CropRect;
typedef struct tagScan_Side {
int duplex; /*0: both sides; 1: front only*/
int discardBlank; /*skip blank pages (general)*/
int discardBlankVince; /*skip blank pages (invoices)*/
int fold; /*fold in half*/
int switchFrontBack; /*swap front and back sides*/
}ScanSide;
typedef struct tagSkew_Detection {
int enable;
int level;
}SkewDetection;
typedef struct tagHhardware_Params
{
int capturepixtype;
int sizeDetection;
int doubleFeedDetection;
int bindingDetection;
SkewDetection skewDetection;
}HardwareCaps;
//image processing parameters
typedef struct tagCcustom_Gamma
{
int enable;
unsigned char table[768];
int tableLength;
}CustomGamma;
typedef struct tagFill_Hole
{
uint8_t enable;
int ratio;/*1~50;*/
}FillHole;
typedef enum tagColor_Filter
{
FILTER_RED,
FILTER_GREEN,
FILTER_BLUE,
FILTER_NONE,
FILTER_ALL,
ENHANCE_RED,
ENHANCE_GREEN,
ENHANCE_BLUE
}ColorFilter;
typedef enum tagSharpen_Type
{
STNone,
Sharpen,
SharpenMore,
Blur,
BlurMore
}SharpenType;
typedef enum tagOrentation
{
ROTATE_NONE = 0,
ROTATE_90,
ROTATE_180,
ROTATE_270,
AUTOTEXT_DETECT
}Orentation;
typedef struct tagjpegCompress {
int enable;
int ratio;
}JpegCompress;
typedef struct tagImage_Process
{
int autoCrop; /*auto-crop to paper size*/
//brightness, contrast and gamma
int brightness; /*1~255*/
int contrast; /*1~7*/
float gamma; /*0.1f~5.0f*/
CustomGamma customGamma;
//image processing
int fillBlackRect;
int autoDescrew;
int refuseInflow;/*prevent bleed-through*/
FillHole fillHole;
ColorFilter filter;
int colorCorrection;/*color correction*/
int removeMorr; /*remove moire pattern*/
int errorExtention; /*error diffusion*/
int nosieDetach;/*noise reduction*/
int NosieDetachEnable;
int textureRemove;/*remove screen/texture pattern*/
int indent;/*edge indent in pixels*/
int noise;/*noise-removal pixel width*/
int AutoCrop_threshold;/*binarization threshold for auto-crop*/
bool is_convex;/*black-border fill mode*/
SharpenType sharpenType;
int multiOutFilterRed;/*remove red in multi-stream output*/
int answerSheetFilterRed;/*remove red for answer sheets*/
//paper feeding
Orentation orentation;
int backRotate180;
//miscellaneous
JpegCompress jpegCompress;
int splitImage;
int discardblank_percent;
}ImageProcess;
/*********************************************************************************/
typedef struct
{
ColorMode pixelType;
MultiOutput multiOutput;
TwSS paperSize;
PaperAlign paperAlign;
CropRect cropRect;
int resolution;
int resolution_native;
ScanSide scanSide;
ImageProcess imageProcess;
int scanCount; /* -1: continuous scanning */
HardwareCaps hardwareParam;
int previewScan;
int threshold;
bool is_correct;
/* save information */
/*std::string Caption;
std::string SavePath;*/
}GScanCap;
/******************
****
*******************/
typedef struct tagCONFIGPARAMS
{
/*basic tab parameters*/
int Pixtype;
int PaperSize;
int EnAutoCrop;
int Resolution;
int EnDuplex;
int EnDiscardBlank;
int EnDiscardBlankVince;
int DBlank_AreaNum;
int DBlank_DevnMax;
int EnFold;
int EnExchangeFrontBack;
/*brightness/contrast tab parameters*/
float Brightness;
int EnAutoContrast;
float Contrast;
float Gamma;
/*image-processing tab parameters*/
int Filter;
int Sharpen;
int EnFillBlack;
int EnAutoDescrew;
int EnOutHole;
int OutHoleRatio;
int EnMultiOutPutR;
int EnAnswerSheetR;
/*paper-feed tab parameters*/
int EnUltrasonicDetect;
int EnBindingDetect;
int ScanCount;
int Orentation;
int EnBackRotate180;
int EnScrewDetect;
int ScrewDetectLevel;
/*save information*/
/*std::string Caption;
std::string SavePath;*/
}CONFIGPARAMS, * PCONFIGPARAMS;
typedef struct tagDetachNoise
{
int8_t is_detachnoise;
int detachnoise;
}DetachNoise;
typedef struct tagHARDWAREPARAMS_39
{
int8_t capturepixtype;
int8_t en_doublefeed;
int8_t en_stapledetect;
int8_t en_skrewdetect;
int8_t skrewdetectlevel;
int lowpowermode;
#ifdef UV
byte en_uv;
#endif
}HardwareCaps_39;
struct GScanCap_3399
{
uint8_t papertype; /**< the current paper source ADF or Flatbed*/
PaperAlign paperAlign;
uint8_t en_sizecheck; /**< size detection*/
float imageRotateDegree;
uint8_t is_duplex; /**< True to use duplex false for simplex, ignored if flatbed*/
uint8_t en_fold; /**< fold in half*/
int pixtype; /**< type of pixels to transfer image as */
int automaticcolor; /**< automatic color detection*/
int automaticcolortype; /**< upload type for non-color pages when automatic color detection is on*/
//ScanRect scanrect;
float resolution_dst; /**< horizontal resolution */
float resolution_native;
float gamma; /**< Gamma */
float contrast; /**< Contrast */
float brightness; /**< Brightness */
float threshold; /**< Threshold */
uint8_t is_autocontrast; /**< automatic contrast*/
uint8_t is_autocrop; /**< automatic cropping*/
uint8_t is_autodiscradblank_normal; /**< automatically discard blank pages (general)*/
int discardblank_percent; /**< blank-page skip threshold*/
uint8_t is_autodiscradblank_vince;/**< automatically discard blank pages (invoices)*/
uint8_t is_switchfrontback; /**< swap front and back sides*/
uint8_t autodescrew; /**< automatic deskew*/
uint8_t multi_output_red; /*multi-stream output (remove red)*/
uint8_t hsvcorrect; /**< remove red for answer sheets*/
uint8_t filter; /**< color dropout*/
uint8_t sharpen;
uint8_t enhance_color; /**< color enhancement*/
uint8_t fillbackground; /**< fill black border*/
bool is_convex; /**< black-border fill mode: true = convex polygon fill, false = concave polygon fill; default true*/
int noise; /**< noise-removal width in pixels; removes vertical background stripes up to this width; default 40*/
int indent; /**< contour indent; when cropping, deskewing or filling the black background, shrink the detected paper contour by this many pixels; default 5*/
int AutoCrop_threshold; /**< binarization threshold for auto-crop, range (0, 255); default 40*/
unsigned short scannum; /**< number of sheets to scan*/
uint8_t is_backrotate180; /**< rotate the back side by 180 degrees*/
uint8_t is_dogeardetection; /**< dog-ear (folded corner) detection*/
HardwareCaps_39 hardwarecaps; /**< hardware scanning parameters*/
FillHole fillhole;
DetachNoise detachnoise; /**< black-and-white noise reduction*/
uint8_t is_autotext; /**< automatic text-orientation detection*/
bool isfillcolor; /**< color fill for auto-crop*/
int refuseInflow; /**< prevent bleed-through*/
int colorCorrection; /**< color correction*/
int removeMorr; /**< remove moire pattern*/
int errorExtention; /**< error diffusion*/
int textureRemove; /**< remove screen/texture pattern*/
int splitImage; /**< split image*/
CropRect cropRect; /**< custom crop*/
MultiOutput multiOutput; /**< multi-stream output*/
bool normalCrop; /**< auto-crop for dark documents*/
uint32_t reserve[1024]; /**< reserve 4096 bytes for protocol extension*/
};
#pragma pack(pop)
/*typedef struct tagCONFIGINFO
{
std::string Caption;
std::string SavePath;
}CONFIGINFO, * PCONFIGINFO;*/
#ifdef __cplusplus
}
#endif
#endif
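A hedged sketch of initialising the GScanCap block defined above; the values are illustrative and follow the inline comments rather than any documented defaults:
#include <cstring>
#include "sane_common.h"
GScanCap make_default_cap()
{
    GScanCap cap;
    std::memset(&cap, 0, sizeof(cap));
    cap.pixelType = RGB;                 /* ColorMode */
    cap.multiOutput = Unused;            /* -1: no multi-stream output */
    cap.paperSize = A4;
    cap.paperAlign = Rot0;
    cap.resolution = 200;
    cap.resolution_native = 200;
    cap.scanSide.duplex = 0;             /* 0: both sides */
    cap.scanCount = -1;                  /* -1: continuous scanning */
    cap.imageProcess.brightness = 128;   /* 1~255 */
    cap.imageProcess.contrast = 4;       /* 1~7 */
    cap.imageProcess.gamma = 1.0f;       /* 0.1f~5.0f */
    cap.imageProcess.filter = FILTER_NONE;
    cap.imageProcess.sharpenType = STNone;
    cap.imageProcess.orentation = ROTATE_NONE;
    return cap;
}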

View File

@ -50,7 +50,6 @@
#endif
#include "opencv2/core/cvdef.h"
#include "opencv2/core/version.hpp"
#include "opencv2/core/base.hpp"
#include "opencv2/core/cvstd.hpp"
#include "opencv2/core/traits.hpp"
@ -68,12 +67,15 @@
@defgroup core_c_glue Connections with C++
@}
@defgroup core_array Operations on arrays
@defgroup core_async Asynchronous API
@defgroup core_xml XML/YAML Persistence
@defgroup core_cluster Clustering
@defgroup core_utils Utility and system functions and macros
@{
@defgroup core_logging Logging facilities
@defgroup core_utils_sse SSE utilities
@defgroup core_utils_neon NEON utilities
@defgroup core_utils_vsx VSX utilities
@defgroup core_utils_softfloat Softfloat support
@defgroup core_utils_samples Utility functions for OpenCV samples
@}
@ -199,6 +201,9 @@ enum CovarFlags {
COVAR_COLS = 16
};
//! @addtogroup core_cluster
//! @{
//! k-Means flags
enum KmeansFlags {
/** Select random initial centers in each attempt.*/
@ -212,6 +217,8 @@ enum KmeansFlags {
KMEANS_USE_INITIAL_LABELS = 1
};
//! @} core_cluster
//! type of line
enum LineTypes {
FILLED = -1,
@ -233,12 +240,16 @@ enum HersheyFonts {
FONT_ITALIC = 16 //!< flag for italic font
};
//! @addtogroup core_array
//! @{
enum ReduceTypes { REDUCE_SUM = 0, //!< the output is the sum of all rows/columns of the matrix.
REDUCE_AVG = 1, //!< the output is the mean vector of all rows/columns of the matrix.
REDUCE_MAX = 2, //!< the output is the maximum (column/row-wise) of all rows/columns of the matrix.
REDUCE_MIN = 3 //!< the output is the minimum (column/row-wise) of all rows/columns of the matrix.
};
//! @} core_array
/** @brief Swaps two matrices
*/
@ -311,9 +322,9 @@ if src was not a ROI, use borderType | #BORDER_ISOLATED.
@param src Source image.
@param dst Destination image of the same type as src and the size Size(src.cols+left+right,
src.rows+top+bottom) .
@param top
@param bottom
@param left
@param top the top pixels
@param bottom the bottom pixels
@param left the left pixels
@param right Parameter specifying how many pixels in each direction from the source image rectangle
to extrapolate. For example, top=1, bottom=1, left=1, right=1 mean that 1 pixel-wide border needs
to be built.
@ -1612,7 +1623,9 @@ elements.
CV_EXPORTS_W bool checkRange(InputArray a, bool quiet = true, CV_OUT Point* pos = 0,
double minVal = -DBL_MAX, double maxVal = DBL_MAX);
/** @brief converts NaN's to the given number
/** @brief converts NaNs to the given number
@param a input/output matrix (CV_32F type).
@param val value to convert the NaNs
*/
CV_EXPORTS_W void patchNaNs(InputOutputArray a, double val = 0);

View File

@ -0,0 +1,105 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
#ifndef OPENCV_CORE_ASYNC_HPP
#define OPENCV_CORE_ASYNC_HPP
#include <opencv2/core/mat.hpp>
#ifdef CV_CXX11
//#include <future>
#include <chrono>
#endif
namespace cv {
/** @addtogroup core_async
@{
*/
/** @brief Returns result of asynchronous operations
Object has attached asynchronous state.
Assignment operator doesn't clone asynchronous state (it is shared between all instances).
Result can be fetched via get() method only once.
*/
class CV_EXPORTS_W AsyncArray
{
public:
~AsyncArray() CV_NOEXCEPT;
CV_WRAP AsyncArray() CV_NOEXCEPT;
AsyncArray(const AsyncArray& o) CV_NOEXCEPT;
AsyncArray& operator=(const AsyncArray& o) CV_NOEXCEPT;
CV_WRAP void release() CV_NOEXCEPT;
/** Fetch the result.
@param[out] dst destination array
Waits for result until container has valid result.
Throws exception if exception was stored as a result.
Throws exception on invalid container state.
@note Result or stored exception can be fetched only once.
*/
CV_WRAP void get(OutputArray dst) const;
/** Retrieving the result with timeout
@param[out] dst destination array
@param[in] timeoutNs timeout in nanoseconds, -1 for infinite wait
@returns true if result is ready, false if the timeout has expired
@note Result or stored exception can be fetched only once.
*/
bool get(OutputArray dst, int64 timeoutNs) const;
CV_WRAP inline
bool get(OutputArray dst, double timeoutNs) const { return get(dst, (int64)timeoutNs); }
bool wait_for(int64 timeoutNs) const;
CV_WRAP inline
bool wait_for(double timeoutNs) const { return wait_for((int64)timeoutNs); }
CV_WRAP bool valid() const CV_NOEXCEPT;
#ifdef CV_CXX11
inline AsyncArray(AsyncArray&& o) { p = o.p; o.p = NULL; }
inline AsyncArray& operator=(AsyncArray&& o) CV_NOEXCEPT { std::swap(p, o.p); return *this; }
template<typename _Rep, typename _Period>
inline bool get(OutputArray dst, const std::chrono::duration<_Rep, _Period>& timeout)
{
return get(dst, (int64)(std::chrono::nanoseconds(timeout).count()));
}
template<typename _Rep, typename _Period>
inline bool wait_for(const std::chrono::duration<_Rep, _Period>& timeout)
{
return wait_for((int64)(std::chrono::nanoseconds(timeout).count()));
}
#if 0
std::future<Mat> getFutureMat() const;
std::future<UMat> getFutureUMat() const;
#endif
#endif
// PImpl
struct Impl; friend struct Impl;
inline void* _getImpl() const CV_NOEXCEPT { return p; }
protected:
Impl* p;
};
//! @}
} // namespace
#endif // OPENCV_CORE_ASYNC_HPP
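A consumer-side sketch of AsyncArray (not part of this commit): the result object is assumed to come from some asynchronous producer, for example one backed by cv::AsyncPromise, and is polled with the std::chrono overload declared above:
#include <opencv2/core/async.hpp>
#include <chrono>
#include <iostream>
void consume(cv::AsyncArray result)
{
    cv::Mat out;
    // the chrono overload forwards to get(dst, timeoutNs); it returns false while
    // the timeout keeps expiring and true once the result is available (a stored
    // exception is re-thrown instead)
    while (!result.get(out, std::chrono::milliseconds(100)))
        std::cout << "still waiting...\n";
    std::cout << "got a " << out.rows << "x" << out.cols << " result\n";
}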

View File

@ -188,7 +188,7 @@ enum NormTypes {
norm = \forkthree
{ \| \texttt{src1} \| _{L_2} ^{2} = \sum_I \texttt{src1}(I)^2} {if \(\texttt{normType} = \texttt{NORM_L2SQR}\)}
{ \| \texttt{src1} - \texttt{src2} \| _{L_2} ^{2} = \sum_I (\texttt{src1}(I) - \texttt{src2}(I))^2 }{if \(\texttt{normType} = \texttt{NORM_L2SQR}\) }
{ \left(\frac{\|\texttt{src1}-\texttt{src2}\|_{L_2} }{\|\texttt{src2}\|_{L_2}}\right)^2 }{if \(\texttt{normType} = \texttt{NORM_RELATIVE | NORM_L2}\) }
{ \left(\frac{\|\texttt{src1}-\texttt{src2}\|_{L_2} }{\|\texttt{src2}\|_{L_2}}\right)^2 }{if \(\texttt{normType} = \texttt{NORM_RELATIVE | NORM_L2SQR}\) }
\f]
*/
NORM_L2SQR = 5,
@ -326,8 +326,8 @@ CV_INLINE CV_NORETURN void errorNoReturn(int _code, const String& _err, const ch
// In practice, some macro are not processed correctly (noreturn is not detected).
// We need to use simplified definition for them.
#define CV_Error(...) do { abort(); } while (0)
#define CV_Error_( code, args ) do { cv::format args; abort(); } while (0)
#define CV_Error(code, msg) do { (void)(code); (void)(msg); abort(); } while (0)
#define CV_Error_(code, args) do { (void)(code); (void)(cv::format args); abort(); } while (0)
#define CV_Assert( expr ) do { if (!(expr)) abort(); } while (0)
#define CV_ErrorNoReturn CV_Error
#define CV_ErrorNoReturn_ CV_Error_
@ -587,6 +587,21 @@ _AccTp normInf(const _Tp* a, const _Tp* b, int n)
*/
CV_EXPORTS_W float cubeRoot(float val);
/** @overload
cubeRoot with argument of `double` type calls `std::cbrt(double)` (C++11) or falls back on `pow()` for C++98 compilation mode.
*/
static inline
double cubeRoot(double val)
{
#ifdef CV_CXX11
return std::cbrt(val);
#else
double v = pow(abs(val), 1/3.); // pow doesn't support negative inputs with fractional exponents
return val >= 0 ? v : -v;
#endif
}
/** @brief Calculates the angle of a 2D vector in degrees.
The function fastAtan2 calculates the full-range angle of an input 2D vector. The angle is measured

View File

@ -0,0 +1,170 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
#ifndef OPENCV_CORE_BINDINGS_UTILS_HPP
#define OPENCV_CORE_BINDINGS_UTILS_HPP
#include <opencv2/core/async.hpp>
#include <opencv2/core/detail/async_promise.hpp>
#include <opencv2/core/utils/logger.hpp>
#include <stdexcept>
namespace cv { namespace utils {
//! @addtogroup core_utils
//! @{
CV_EXPORTS_W String dumpInputArray(InputArray argument);
CV_EXPORTS_W String dumpInputArrayOfArrays(InputArrayOfArrays argument);
CV_EXPORTS_W String dumpInputOutputArray(InputOutputArray argument);
CV_EXPORTS_W String dumpInputOutputArrayOfArrays(InputOutputArrayOfArrays argument);
CV_WRAP static inline
String dumpBool(bool argument)
{
return (argument) ? String("Bool: True") : String("Bool: False");
}
CV_WRAP static inline
String dumpInt(int argument)
{
return cv::format("Int: %d", argument);
}
CV_WRAP static inline
String dumpSizeT(size_t argument)
{
std::ostringstream oss("size_t: ", std::ios::ate);
oss << argument;
return oss.str();
}
CV_WRAP static inline
String dumpFloat(float argument)
{
return cv::format("Float: %.2f", argument);
}
CV_WRAP static inline
String dumpDouble(double argument)
{
return cv::format("Double: %.2f", argument);
}
CV_WRAP static inline
String dumpCString(const char* argument)
{
return cv::format("String: %s", argument);
}
CV_WRAP static inline
String dumpString(const String& argument)
{
return cv::format("String: %s", argument.c_str());
}
CV_WRAP static inline
String testOverloadResolution(int value, const Point& point = Point(42, 24))
{
return format("overload (int=%d, point=(x=%d, y=%d))", value, point.x,
point.y);
}
CV_WRAP static inline
String testOverloadResolution(const Rect& rect)
{
return format("overload (rect=(x=%d, y=%d, w=%d, h=%d))", rect.x, rect.y,
rect.width, rect.height);
}
CV_WRAP static inline
String dumpRect(const Rect& argument)
{
return format("rect: (x=%d, y=%d, w=%d, h=%d)", argument.x, argument.y,
argument.width, argument.height);
}
CV_WRAP static inline
String dumpTermCriteria(const TermCriteria& argument)
{
return format("term_criteria: (type=%d, max_count=%d, epsilon=%lf",
argument.type, argument.maxCount, argument.epsilon);
}
CV_WRAP static inline
String dumpRotatedRect(const RotatedRect& argument)
{
return format("rotated_rect: (c_x=%f, c_y=%f, w=%f, h=%f, a=%f)",
argument.center.x, argument.center.y, argument.size.width,
argument.size.height, argument.angle);
}
CV_WRAP static inline
String dumpRange(const Range& argument)
{
if (argument == Range::all())
{
return "range: all";
}
else
{
return format("range: (s=%d, e=%d)", argument.start, argument.end);
}
}
CV_WRAP static inline
void testRaiseGeneralException()
{
throw std::runtime_error("exception text");
}
CV_WRAP static inline
AsyncArray testAsyncArray(InputArray argument)
{
AsyncPromise p;
p.setValue(argument);
return p.getArrayResult();
}
CV_WRAP static inline
AsyncArray testAsyncException()
{
AsyncPromise p;
try
{
CV_Error(Error::StsOk, "Test: Generated async error");
}
catch (const cv::Exception& e)
{
p.setException(e);
}
return p.getArrayResult();
}
//! @} // core_utils
} // namespace cv::utils
//! @cond IGNORED
CV_WRAP static inline
int setLogLevel(int level)
{
// NB: Binding generators doesn't work with enums properly yet, so we define separate overload here
return cv::utils::logging::setLogLevel((cv::utils::logging::LogLevel)level);
}
CV_WRAP static inline
int getLogLevel()
{
return cv::utils::logging::getLogLevel();
}
//! @endcond IGNORED
} // namespaces cv / utils
#endif // OPENCV_CORE_BINDINGS_UTILS_HPP

View File

@ -63,12 +63,13 @@ struct CheckContext {
#define CV__CHECK_LOCATION_VARNAME(id) CVAUX_CONCAT(CVAUX_CONCAT(__cv_check_, id), __LINE__)
#define CV__DEFINE_CHECK_CONTEXT(id, message, testOp, p1_str, p2_str) \
static const cv::detail::CheckContext CV__CHECK_LOCATION_VARNAME(id) = \
{ CV__CHECK_FUNCTION, CV__CHECK_FILENAME, __LINE__, testOp, message, p1_str, p2_str }
{ CV__CHECK_FUNCTION, CV__CHECK_FILENAME, __LINE__, testOp, "" message, "" p1_str, "" p2_str }
CV_EXPORTS void CV_NORETURN check_failed_auto(const int v1, const int v2, const CheckContext& ctx);
CV_EXPORTS void CV_NORETURN check_failed_auto(const size_t v1, const size_t v2, const CheckContext& ctx);
CV_EXPORTS void CV_NORETURN check_failed_auto(const float v1, const float v2, const CheckContext& ctx);
CV_EXPORTS void CV_NORETURN check_failed_auto(const double v1, const double v2, const CheckContext& ctx);
CV_EXPORTS void CV_NORETURN check_failed_auto(const Size_<int> v1, const Size_<int> v2, const CheckContext& ctx);
CV_EXPORTS void CV_NORETURN check_failed_MatDepth(const int v1, const int v2, const CheckContext& ctx);
CV_EXPORTS void CV_NORETURN check_failed_MatType(const int v1, const int v2, const CheckContext& ctx);
CV_EXPORTS void CV_NORETURN check_failed_MatChannels(const int v1, const int v2, const CheckContext& ctx);
@ -77,6 +78,8 @@ CV_EXPORTS void CV_NORETURN check_failed_auto(const int v, const CheckContext& c
CV_EXPORTS void CV_NORETURN check_failed_auto(const size_t v, const CheckContext& ctx);
CV_EXPORTS void CV_NORETURN check_failed_auto(const float v, const CheckContext& ctx);
CV_EXPORTS void CV_NORETURN check_failed_auto(const double v, const CheckContext& ctx);
CV_EXPORTS void CV_NORETURN check_failed_auto(const Size_<int> v, const CheckContext& ctx);
CV_EXPORTS void CV_NORETURN check_failed_auto(const std::string& v1, const CheckContext& ctx);
CV_EXPORTS void CV_NORETURN check_failed_MatDepth(const int v, const CheckContext& ctx);
CV_EXPORTS void CV_NORETURN check_failed_MatType(const int v, const CheckContext& ctx);
CV_EXPORTS void CV_NORETURN check_failed_MatChannels(const int v, const CheckContext& ctx);

View File

@ -53,7 +53,7 @@
which is incompatible with C
It is OK to disable it because we only extend few plain structures with
C++ construrtors for simpler interoperability with C++ API of the library
C++ constructors for simpler interoperability with C++ API of the library
*/
# pragma warning(disable:4190)
# elif defined __clang__ && __clang_major__ >= 3
@ -579,7 +579,7 @@ CvNArrayIterator;
#define CV_NO_CN_CHECK 2
#define CV_NO_SIZE_CHECK 4
/** initializes iterator that traverses through several arrays simulteneously
/** initializes iterator that traverses through several arrays simultaneously
(the function together with cvNextArraySlice is used for
N-ari element-wise operations) */
CVAPI(int) cvInitNArrayIterator( int count, CvArr** arrs,
@ -1309,7 +1309,7 @@ CVAPI(void) cvMulTransposed( const CvArr* src, CvArr* dst, int order,
const CvArr* delta CV_DEFAULT(NULL),
double scale CV_DEFAULT(1.) );
/** Tranposes matrix. Square matrices can be transposed in-place */
/** Transposes matrix. Square matrices can be transposed in-place */
CVAPI(void) cvTranspose( const CvArr* src, CvArr* dst );
#define cvT cvTranspose

View File

@ -126,7 +126,7 @@ public:
GpuMat(int rows, int cols, int type, Allocator* allocator = defaultAllocator());
GpuMat(Size size, int type, Allocator* allocator = defaultAllocator());
//! constucts GpuMat and fills it with the specified value _s
//! constructs GpuMat and fills it with the specified value _s
GpuMat(int rows, int cols, int type, Scalar s, Allocator* allocator = defaultAllocator());
GpuMat(Size size, int type, Scalar s, Allocator* allocator = defaultAllocator());

View File

@ -101,6 +101,20 @@ namespace cv { namespace cuda
cudaChannelFormatDesc desc = cudaCreateChannelDesc<T>();
cudaSafeCall( cudaBindTexture2D(0, tex, img.ptr(), &desc, img.cols, img.rows, img.step) );
}
template<class T> inline void createTextureObjectPitch2D(cudaTextureObject_t* tex, PtrStepSz<T>& img, const cudaTextureDesc& texDesc)
{
cudaResourceDesc resDesc;
memset(&resDesc, 0, sizeof(resDesc));
resDesc.resType = cudaResourceTypePitch2D;
resDesc.res.pitch2D.devPtr = static_cast<void*>(img.ptr());
resDesc.res.pitch2D.height = img.rows;
resDesc.res.pitch2D.width = img.cols;
resDesc.res.pitch2D.pitchInBytes = img.step;
resDesc.res.pitch2D.desc = cudaCreateChannelDesc<T>();
cudaSafeCall( cudaCreateTextureObject(tex, &resDesc, &texDesc, NULL) );
}
}
}}

View File

@ -106,8 +106,8 @@ namespace cv
size_t step;
__CV_CUDA_HOST_DEVICE__ T* ptr(int y = 0) { return ( T*)( ( char*)DevPtr<T>::data + y * step); }
__CV_CUDA_HOST_DEVICE__ const T* ptr(int y = 0) const { return (const T*)( (const char*)DevPtr<T>::data + y * step); }
__CV_CUDA_HOST_DEVICE__ T* ptr(int y = 0) { return ( T*)( ( char*)(((DevPtr<T>*)this)->data) + y * step); }
__CV_CUDA_HOST_DEVICE__ const T* ptr(int y = 0) const { return (const T*)( (const char*)(((DevPtr<T>*)this)->data) + y * step); }
__CV_CUDA_HOST_DEVICE__ T& operator ()(int y, int x) { return ptr(y)[x]; }
__CV_CUDA_HOST_DEVICE__ const T& operator ()(int y, int x) const { return ptr(y)[x]; }

View File

@ -72,7 +72,7 @@
# define CV_AVX 1
#endif
#ifdef CV_CPU_COMPILE_FP16
# if defined(__arm__) || defined(__aarch64__) || defined(_M_ARM)
# if defined(__arm__) || defined(__aarch64__) || defined(_M_ARM) || defined(_M_ARM64)
# include <arm_neon.h>
# else
# include <immintrin.h>
@ -87,15 +87,53 @@
# include <immintrin.h>
# define CV_AVX_512F 1
#endif
#ifdef CV_CPU_COMPILE_AVX512_COMMON
# define CV_AVX512_COMMON 1
# define CV_AVX_512CD 1
#endif
#ifdef CV_CPU_COMPILE_AVX512_KNL
# define CV_AVX512_KNL 1
# define CV_AVX_512ER 1
# define CV_AVX_512PF 1
#endif
#ifdef CV_CPU_COMPILE_AVX512_KNM
# define CV_AVX512_KNM 1
# define CV_AVX_5124FMAPS 1
# define CV_AVX_5124VNNIW 1
# define CV_AVX_512VPOPCNTDQ 1
#endif
#ifdef CV_CPU_COMPILE_AVX512_SKX
# include <immintrin.h>
# define CV_AVX512_SKX 1
# define CV_AVX_512VL 1
# define CV_AVX_512BW 1
# define CV_AVX_512DQ 1
#endif
#ifdef CV_CPU_COMPILE_AVX512_CNL
# define CV_AVX512_CNL 1
# define CV_AVX_512IFMA 1
# define CV_AVX_512VBMI 1
#endif
#ifdef CV_CPU_COMPILE_AVX512_CLX
# define CV_AVX512_CLX 1
# define CV_AVX_512VNNI 1
#endif
#ifdef CV_CPU_COMPILE_AVX512_ICL
# define CV_AVX512_ICL 1
# undef CV_AVX_512IFMA
# define CV_AVX_512IFMA 1
# undef CV_AVX_512VBMI
# define CV_AVX_512VBMI 1
# undef CV_AVX_512VNNI
# define CV_AVX_512VNNI 1
# define CV_AVX_512VBMI2 1
# define CV_AVX_512BITALG 1
# define CV_AVX_512VPOPCNTDQ 1
#endif
#ifdef CV_CPU_COMPILE_FMA3
# define CV_FMA3 1
#endif
#if defined _WIN32 && defined(_M_ARM)
#if defined _WIN32 && (defined(_M_ARM) || defined(_M_ARM64)) && (defined(CV_CPU_COMPILE_NEON) || !defined(_MSC_VER))
# include <Intrin.h>
# include <arm_neon.h>
# define CV_NEON 1
@ -120,6 +158,16 @@
# define CV_VSX3 1
#endif
#ifdef CV_CPU_COMPILE_MSA
# include "hal/msa_macros.h"
# define CV_MSA 1
#endif
#ifdef __EMSCRIPTEN__
# define CV_WASM_SIMD 1
# include <wasm_simd128.h>
#endif
#endif // CV_ENABLE_INTRINSICS && !CV_DISABLE_OPTIMIZATION && !__CUDACC__
#if defined CV_CPU_COMPILE_AVX && !defined CV_CPU_BASELINE_COMPILE_AVX
@ -153,7 +201,7 @@ struct VZeroUpperGuard {
# define CV_MMX 1
# define CV_SSE 1
# define CV_SSE2 1
#elif defined _WIN32 && defined(_M_ARM)
#elif defined _WIN32 && (defined(_M_ARM) || defined(_M_ARM64)) && (defined(CV_CPU_COMPILE_NEON) || !defined(_MSC_VER))
# include <Intrin.h>
# include <arm_neon.h>
# define CV_NEON 1
@ -168,6 +216,11 @@ struct VZeroUpperGuard {
# define CV_VSX 1
#endif
#ifdef __F16C__
# include <immintrin.h>
# define CV_FP16 1
#endif
#endif // !__OPENCV_BUILD && !__CUDACC (Compatibility code)
@ -223,9 +276,10 @@ struct VZeroUpperGuard {
#ifndef CV_AVX_512ER
# define CV_AVX_512ER 0
#endif
#ifndef CV_AVX_512IFMA512
# define CV_AVX_512IFMA512 0
#ifndef CV_AVX_512IFMA
# define CV_AVX_512IFMA 0
#endif
#define CV_AVX_512IFMA512 CV_AVX_512IFMA // deprecated
#ifndef CV_AVX_512PF
# define CV_AVX_512PF 0
#endif
@ -235,9 +289,45 @@ struct VZeroUpperGuard {
#ifndef CV_AVX_512VL
# define CV_AVX_512VL 0
#endif
#ifndef CV_AVX_5124FMAPS
# define CV_AVX_5124FMAPS 0
#endif
#ifndef CV_AVX_5124VNNIW
# define CV_AVX_5124VNNIW 0
#endif
#ifndef CV_AVX_512VPOPCNTDQ
# define CV_AVX_512VPOPCNTDQ 0
#endif
#ifndef CV_AVX_512VNNI
# define CV_AVX_512VNNI 0
#endif
#ifndef CV_AVX_512VBMI2
# define CV_AVX_512VBMI2 0
#endif
#ifndef CV_AVX_512BITALG
# define CV_AVX_512BITALG 0
#endif
#ifndef CV_AVX512_COMMON
# define CV_AVX512_COMMON 0
#endif
#ifndef CV_AVX512_KNL
# define CV_AVX512_KNL 0
#endif
#ifndef CV_AVX512_KNM
# define CV_AVX512_KNM 0
#endif
#ifndef CV_AVX512_SKX
# define CV_AVX512_SKX 0
#endif
#ifndef CV_AVX512_CNL
# define CV_AVX512_CNL 0
#endif
#ifndef CV_AVX512_CLX
# define CV_AVX512_CLX 0
#endif
#ifndef CV_AVX512_ICL
# define CV_AVX512_ICL 0
#endif
#ifndef CV_NEON
# define CV_NEON 0
@ -250,3 +340,11 @@ struct VZeroUpperGuard {
#ifndef CV_VSX3
# define CV_VSX3 0
#endif
#ifndef CV_MSA
# define CV_MSA 0
#endif
#ifndef CV_WASM_SIMD
# define CV_WASM_SIMD 0
#endif

View File

@ -252,6 +252,69 @@
#endif
#define __CV_CPU_DISPATCH_CHAIN_AVX_512F(fn, args, mode, ...) CV_CPU_CALL_AVX_512F(fn, args); __CV_EXPAND(__CV_CPU_DISPATCH_CHAIN_ ## mode(fn, args, __VA_ARGS__))
#if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_AVX512_COMMON
# define CV_TRY_AVX512_COMMON 1
# define CV_CPU_FORCE_AVX512_COMMON 1
# define CV_CPU_HAS_SUPPORT_AVX512_COMMON 1
# define CV_CPU_CALL_AVX512_COMMON(fn, args) return (cpu_baseline::fn args)
# define CV_CPU_CALL_AVX512_COMMON_(fn, args) return (opt_AVX512_COMMON::fn args)
#elif !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_DISPATCH_COMPILE_AVX512_COMMON
# define CV_TRY_AVX512_COMMON 1
# define CV_CPU_FORCE_AVX512_COMMON 0
# define CV_CPU_HAS_SUPPORT_AVX512_COMMON (cv::checkHardwareSupport(CV_CPU_AVX512_COMMON))
# define CV_CPU_CALL_AVX512_COMMON(fn, args) if (CV_CPU_HAS_SUPPORT_AVX512_COMMON) return (opt_AVX512_COMMON::fn args)
# define CV_CPU_CALL_AVX512_COMMON_(fn, args) if (CV_CPU_HAS_SUPPORT_AVX512_COMMON) return (opt_AVX512_COMMON::fn args)
#else
# define CV_TRY_AVX512_COMMON 0
# define CV_CPU_FORCE_AVX512_COMMON 0
# define CV_CPU_HAS_SUPPORT_AVX512_COMMON 0
# define CV_CPU_CALL_AVX512_COMMON(fn, args)
# define CV_CPU_CALL_AVX512_COMMON_(fn, args)
#endif
#define __CV_CPU_DISPATCH_CHAIN_AVX512_COMMON(fn, args, mode, ...) CV_CPU_CALL_AVX512_COMMON(fn, args); __CV_EXPAND(__CV_CPU_DISPATCH_CHAIN_ ## mode(fn, args, __VA_ARGS__))
#if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_AVX512_KNL
# define CV_TRY_AVX512_KNL 1
# define CV_CPU_FORCE_AVX512_KNL 1
# define CV_CPU_HAS_SUPPORT_AVX512_KNL 1
# define CV_CPU_CALL_AVX512_KNL(fn, args) return (cpu_baseline::fn args)
# define CV_CPU_CALL_AVX512_KNL_(fn, args) return (opt_AVX512_KNL::fn args)
#elif !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_DISPATCH_COMPILE_AVX512_KNL
# define CV_TRY_AVX512_KNL 1
# define CV_CPU_FORCE_AVX512_KNL 0
# define CV_CPU_HAS_SUPPORT_AVX512_KNL (cv::checkHardwareSupport(CV_CPU_AVX512_KNL))
# define CV_CPU_CALL_AVX512_KNL(fn, args) if (CV_CPU_HAS_SUPPORT_AVX512_KNL) return (opt_AVX512_KNL::fn args)
# define CV_CPU_CALL_AVX512_KNL_(fn, args) if (CV_CPU_HAS_SUPPORT_AVX512_KNL) return (opt_AVX512_KNL::fn args)
#else
# define CV_TRY_AVX512_KNL 0
# define CV_CPU_FORCE_AVX512_KNL 0
# define CV_CPU_HAS_SUPPORT_AVX512_KNL 0
# define CV_CPU_CALL_AVX512_KNL(fn, args)
# define CV_CPU_CALL_AVX512_KNL_(fn, args)
#endif
#define __CV_CPU_DISPATCH_CHAIN_AVX512_KNL(fn, args, mode, ...) CV_CPU_CALL_AVX512_KNL(fn, args); __CV_EXPAND(__CV_CPU_DISPATCH_CHAIN_ ## mode(fn, args, __VA_ARGS__))
#if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_AVX512_KNM
# define CV_TRY_AVX512_KNM 1
# define CV_CPU_FORCE_AVX512_KNM 1
# define CV_CPU_HAS_SUPPORT_AVX512_KNM 1
# define CV_CPU_CALL_AVX512_KNM(fn, args) return (cpu_baseline::fn args)
# define CV_CPU_CALL_AVX512_KNM_(fn, args) return (opt_AVX512_KNM::fn args)
#elif !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_DISPATCH_COMPILE_AVX512_KNM
# define CV_TRY_AVX512_KNM 1
# define CV_CPU_FORCE_AVX512_KNM 0
# define CV_CPU_HAS_SUPPORT_AVX512_KNM (cv::checkHardwareSupport(CV_CPU_AVX512_KNM))
# define CV_CPU_CALL_AVX512_KNM(fn, args) if (CV_CPU_HAS_SUPPORT_AVX512_KNM) return (opt_AVX512_KNM::fn args)
# define CV_CPU_CALL_AVX512_KNM_(fn, args) if (CV_CPU_HAS_SUPPORT_AVX512_KNM) return (opt_AVX512_KNM::fn args)
#else
# define CV_TRY_AVX512_KNM 0
# define CV_CPU_FORCE_AVX512_KNM 0
# define CV_CPU_HAS_SUPPORT_AVX512_KNM 0
# define CV_CPU_CALL_AVX512_KNM(fn, args)
# define CV_CPU_CALL_AVX512_KNM_(fn, args)
#endif
#define __CV_CPU_DISPATCH_CHAIN_AVX512_KNM(fn, args, mode, ...) CV_CPU_CALL_AVX512_KNM(fn, args); __CV_EXPAND(__CV_CPU_DISPATCH_CHAIN_ ## mode(fn, args, __VA_ARGS__))
#if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_AVX512_SKX
# define CV_TRY_AVX512_SKX 1
# define CV_CPU_FORCE_AVX512_SKX 1
@ -273,6 +336,69 @@
#endif
#define __CV_CPU_DISPATCH_CHAIN_AVX512_SKX(fn, args, mode, ...) CV_CPU_CALL_AVX512_SKX(fn, args); __CV_EXPAND(__CV_CPU_DISPATCH_CHAIN_ ## mode(fn, args, __VA_ARGS__))
#if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_AVX512_CNL
# define CV_TRY_AVX512_CNL 1
# define CV_CPU_FORCE_AVX512_CNL 1
# define CV_CPU_HAS_SUPPORT_AVX512_CNL 1
# define CV_CPU_CALL_AVX512_CNL(fn, args) return (cpu_baseline::fn args)
# define CV_CPU_CALL_AVX512_CNL_(fn, args) return (opt_AVX512_CNL::fn args)
#elif !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_DISPATCH_COMPILE_AVX512_CNL
# define CV_TRY_AVX512_CNL 1
# define CV_CPU_FORCE_AVX512_CNL 0
# define CV_CPU_HAS_SUPPORT_AVX512_CNL (cv::checkHardwareSupport(CV_CPU_AVX512_CNL))
# define CV_CPU_CALL_AVX512_CNL(fn, args) if (CV_CPU_HAS_SUPPORT_AVX512_CNL) return (opt_AVX512_CNL::fn args)
# define CV_CPU_CALL_AVX512_CNL_(fn, args) if (CV_CPU_HAS_SUPPORT_AVX512_CNL) return (opt_AVX512_CNL::fn args)
#else
# define CV_TRY_AVX512_CNL 0
# define CV_CPU_FORCE_AVX512_CNL 0
# define CV_CPU_HAS_SUPPORT_AVX512_CNL 0
# define CV_CPU_CALL_AVX512_CNL(fn, args)
# define CV_CPU_CALL_AVX512_CNL_(fn, args)
#endif
#define __CV_CPU_DISPATCH_CHAIN_AVX512_CNL(fn, args, mode, ...) CV_CPU_CALL_AVX512_CNL(fn, args); __CV_EXPAND(__CV_CPU_DISPATCH_CHAIN_ ## mode(fn, args, __VA_ARGS__))
#if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_AVX512_CLX
# define CV_TRY_AVX512_CLX 1
# define CV_CPU_FORCE_AVX512_CLX 1
# define CV_CPU_HAS_SUPPORT_AVX512_CLX 1
# define CV_CPU_CALL_AVX512_CLX(fn, args) return (cpu_baseline::fn args)
# define CV_CPU_CALL_AVX512_CLX_(fn, args) return (opt_AVX512_CLX::fn args)
#elif !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_DISPATCH_COMPILE_AVX512_CLX
# define CV_TRY_AVX512_CLX 1
# define CV_CPU_FORCE_AVX512_CLX 0
# define CV_CPU_HAS_SUPPORT_AVX512_CLX (cv::checkHardwareSupport(CV_CPU_AVX512_CLX))
# define CV_CPU_CALL_AVX512_CLX(fn, args) if (CV_CPU_HAS_SUPPORT_AVX512_CLX) return (opt_AVX512_CLX::fn args)
# define CV_CPU_CALL_AVX512_CLX_(fn, args) if (CV_CPU_HAS_SUPPORT_AVX512_CLX) return (opt_AVX512_CLX::fn args)
#else
# define CV_TRY_AVX512_CLX 0
# define CV_CPU_FORCE_AVX512_CLX 0
# define CV_CPU_HAS_SUPPORT_AVX512_CLX 0
# define CV_CPU_CALL_AVX512_CLX(fn, args)
# define CV_CPU_CALL_AVX512_CLX_(fn, args)
#endif
#define __CV_CPU_DISPATCH_CHAIN_AVX512_CLX(fn, args, mode, ...) CV_CPU_CALL_AVX512_CLX(fn, args); __CV_EXPAND(__CV_CPU_DISPATCH_CHAIN_ ## mode(fn, args, __VA_ARGS__))
#if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_AVX512_ICL
# define CV_TRY_AVX512_ICL 1
# define CV_CPU_FORCE_AVX512_ICL 1
# define CV_CPU_HAS_SUPPORT_AVX512_ICL 1
# define CV_CPU_CALL_AVX512_ICL(fn, args) return (cpu_baseline::fn args)
# define CV_CPU_CALL_AVX512_ICL_(fn, args) return (opt_AVX512_ICL::fn args)
#elif !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_DISPATCH_COMPILE_AVX512_ICL
# define CV_TRY_AVX512_ICL 1
# define CV_CPU_FORCE_AVX512_ICL 0
# define CV_CPU_HAS_SUPPORT_AVX512_ICL (cv::checkHardwareSupport(CV_CPU_AVX512_ICL))
# define CV_CPU_CALL_AVX512_ICL(fn, args) if (CV_CPU_HAS_SUPPORT_AVX512_ICL) return (opt_AVX512_ICL::fn args)
# define CV_CPU_CALL_AVX512_ICL_(fn, args) if (CV_CPU_HAS_SUPPORT_AVX512_ICL) return (opt_AVX512_ICL::fn args)
#else
# define CV_TRY_AVX512_ICL 0
# define CV_CPU_FORCE_AVX512_ICL 0
# define CV_CPU_HAS_SUPPORT_AVX512_ICL 0
# define CV_CPU_CALL_AVX512_ICL(fn, args)
# define CV_CPU_CALL_AVX512_ICL_(fn, args)
#endif
#define __CV_CPU_DISPATCH_CHAIN_AVX512_ICL(fn, args, mode, ...) CV_CPU_CALL_AVX512_ICL(fn, args); __CV_EXPAND(__CV_CPU_DISPATCH_CHAIN_ ## mode(fn, args, __VA_ARGS__))
#if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_NEON
# define CV_TRY_NEON 1
# define CV_CPU_FORCE_NEON 1
@ -294,6 +420,27 @@
#endif
#define __CV_CPU_DISPATCH_CHAIN_NEON(fn, args, mode, ...) CV_CPU_CALL_NEON(fn, args); __CV_EXPAND(__CV_CPU_DISPATCH_CHAIN_ ## mode(fn, args, __VA_ARGS__))
#if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_MSA
# define CV_TRY_MSA 1
# define CV_CPU_FORCE_MSA 1
# define CV_CPU_HAS_SUPPORT_MSA 1
# define CV_CPU_CALL_MSA(fn, args) return (cpu_baseline::fn args)
# define CV_CPU_CALL_MSA_(fn, args) return (opt_MSA::fn args)
#elif !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_DISPATCH_COMPILE_MSA
# define CV_TRY_MSA 1
# define CV_CPU_FORCE_MSA 0
# define CV_CPU_HAS_SUPPORT_MSA (cv::checkHardwareSupport(CV_CPU_MSA))
# define CV_CPU_CALL_MSA(fn, args) if (CV_CPU_HAS_SUPPORT_MSA) return (opt_MSA::fn args)
# define CV_CPU_CALL_MSA_(fn, args) if (CV_CPU_HAS_SUPPORT_MSA) return (opt_MSA::fn args)
#else
# define CV_TRY_MSA 0
# define CV_CPU_FORCE_MSA 0
# define CV_CPU_HAS_SUPPORT_MSA 0
# define CV_CPU_CALL_MSA(fn, args)
# define CV_CPU_CALL_MSA_(fn, args)
#endif
#define __CV_CPU_DISPATCH_CHAIN_MSA(fn, args, mode, ...) CV_CPU_CALL_MSA(fn, args); __CV_EXPAND(__CV_CPU_DISPATCH_CHAIN_ ## mode(fn, args, __VA_ARGS__))
#if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_VSX
# define CV_TRY_VSX 1
# define CV_CPU_FORCE_VSX 1

View File

@ -45,9 +45,15 @@
#ifndef OPENCV_CORE_CVDEF_H
#define OPENCV_CORE_CVDEF_H
#include "opencv2/core/version.hpp"
//! @addtogroup core_utils
//! @{
#ifdef OPENCV_INCLUDE_PORT_FILE // User-provided header file with custom platform configuration
#include OPENCV_INCLUDE_PORT_FILE
#endif
#if !defined CV_DOXYGEN && !defined CV_IGNORE_DEBUG_BUILD_GUARD
#if (defined(_MSC_VER) && (defined(DEBUG) || defined(_DEBUG))) || \
(defined(_GLIBCXX_DEBUG) || defined(_GLIBCXX_DEBUG_PEDANTIC))
@ -82,12 +88,24 @@ namespace cv { namespace debug_build_guard { } using namespace debug_build_guard
#define __CV_VA_NUM_ARGS_HELPER(_1, _2, _3, _4, _5, _6, _7, _8, _9, _10, N, ...) N
#define __CV_VA_NUM_ARGS(...) __CV_VA_NUM_ARGS_HELPER(__VA_ARGS__, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)
#if defined __GNUC__
#ifdef CV_Func
// keep current value (through OpenCV port file)
#elif defined __GNUC__ || (defined (__cpluscplus) && (__cpluscplus >= 201103))
#define CV_Func __func__
#elif defined __clang__ && (__clang_minor__ * 100 + __clang_major__ >= 305)
#define CV_Func __func__
#elif defined(__STDC_VERSION__) && (__STDC_VERSION >= 199901)
#define CV_Func __func__
#elif defined _MSC_VER
#define CV_Func __FUNCTION__
#elif defined(__INTEL_COMPILER) && (_INTEL_COMPILER >= 600)
#define CV_Func __FUNCTION__
#elif defined __IBMCPP__ && __IBMCPP__ >=500
#define CV_Func __FUNCTION__
#elif defined __BORLAND__ && (__BORLANDC__ >= 0x550)
#define CV_Func __FUNC__
#else
#define CV_Func ""
#define CV_Func "<unknown>"
#endif
//! @cond IGNORED
@ -118,9 +136,11 @@ namespace cv { namespace debug_build_guard { } using namespace debug_build_guard
# if !defined(__clang__) && defined(__GNUC__) && (__GNUC__*100 + __GNUC_MINOR__ > 302)
# define CV_StaticAssert(condition, reason) ({ extern int __attribute__((error("CV_StaticAssert: " reason " " #condition))) CV_StaticAssert(); ((condition) ? 0 : CV_StaticAssert()); })
# else
namespace cv {
template <bool x> struct CV_StaticAssert_failed;
template <> struct CV_StaticAssert_failed<true> { enum { val = 1 }; };
template<int x> struct CV_StaticAssert_test {};
}
# define CV_StaticAssert(condition, reason)\
typedef cv::CV_StaticAssert_test< sizeof(cv::CV_StaticAssert_failed< static_cast<bool>(condition) >) > CVAUX_CONCAT(CV_StaticAssert_failed_at_, __LINE__)
# endif
@ -175,7 +195,12 @@ namespace cv { namespace debug_build_guard { } using namespace debug_build_guard
#undef abs
#undef Complex
#if defined __cplusplus
#include <limits>
#else
#include <limits.h>
#endif
#include "opencv2/core/hal/interface.h"
#if defined __ICL
@ -249,14 +274,28 @@ namespace cv { namespace debug_build_guard { } using namespace debug_build_guard
#define CV_CPU_AVX_512PF 19
#define CV_CPU_AVX_512VBMI 20
#define CV_CPU_AVX_512VL 21
#define CV_CPU_AVX_512VBMI2 22
#define CV_CPU_AVX_512VNNI 23
#define CV_CPU_AVX_512BITALG 24
#define CV_CPU_AVX_512VPOPCNTDQ 25
#define CV_CPU_AVX_5124VNNIW 26
#define CV_CPU_AVX_5124FMAPS 27
#define CV_CPU_NEON 100
#define CV_CPU_MSA 150
#define CV_CPU_VSX 200
#define CV_CPU_VSX3 201
// CPU features groups
#define CV_CPU_AVX512_SKX 256
#define CV_CPU_AVX512_COMMON 257
#define CV_CPU_AVX512_KNL 258
#define CV_CPU_AVX512_KNM 259
#define CV_CPU_AVX512_CNL 260
#define CV_CPU_AVX512_CLX 261
#define CV_CPU_AVX512_ICL 262
// when adding to this list remember to update the following enum
#define CV_HARDWARE_MAX_FEATURE 512
@ -287,13 +326,27 @@ enum CpuFeatures {
CPU_AVX_512PF = 19,
CPU_AVX_512VBMI = 20,
CPU_AVX_512VL = 21,
CPU_AVX_512VBMI2 = 22,
CPU_AVX_512VNNI = 23,
CPU_AVX_512BITALG = 24,
CPU_AVX_512VPOPCNTDQ= 25,
CPU_AVX_5124VNNIW = 26,
CPU_AVX_5124FMAPS = 27,
CPU_NEON = 100,
CPU_MSA = 150,
CPU_VSX = 200,
CPU_VSX3 = 201,
CPU_AVX512_SKX = 256, //!< Skylake-X with AVX-512F/CD/BW/DQ/VL
CPU_AVX512_COMMON = 257, //!< Common instructions AVX-512F/CD for all CPUs that support AVX-512
CPU_AVX512_KNL = 258, //!< Knights Landing with AVX-512F/CD/ER/PF
CPU_AVX512_KNM = 259, //!< Knights Mill with AVX-512F/CD/ER/PF/4FMAPS/4VNNIW/VPOPCNTDQ
CPU_AVX512_CNL = 260, //!< Cannon Lake with AVX-512F/CD/BW/DQ/VL/IFMA/VBMI
CPU_AVX512_CLX = 261, //!< Cascade Lake with AVX-512F/CD/BW/DQ/VL/VNNI
CPU_AVX512_ICL = 262, //!< Ice Lake with AVX-512F/CD/BW/DQ/VL/IFMA/VBMI/VNNI/VBMI2/BITALG/VPOPCNTDQ
CPU_MAX_FEATURE = 512 // see CV_HARDWARE_MAX_FEATURE
};
@ -301,6 +354,13 @@ enum CpuFeatures {
#include "cv_cpu_dispatch.h"
#if !defined(CV_STRONG_ALIGNMENT) && defined(__arm__) && !(defined(__aarch64__) || defined(_M_ARM64))
// int*, int64* should be propertly aligned pointers on ARMv7
#define CV_STRONG_ALIGNMENT 1
#endif
#if !defined(CV_STRONG_ALIGNMENT)
#define CV_STRONG_ALIGNMENT 0
#endif
/* fundamental constants */
#define CV_PI 3.1415926535897932384626433832795
@ -340,17 +400,19 @@ typedef union Cv64suf
}
Cv64suf;
#ifndef OPENCV_ABI_COMPATIBILITY
#define OPENCV_ABI_COMPATIBILITY 300
#endif
#ifdef __OPENCV_BUILD
# define DISABLE_OPENCV_24_COMPATIBILITY
# define OPENCV_DISABLE_DEPRECATED_COMPATIBILITY
#endif
#ifdef CVAPI_EXPORTS
# if (defined _WIN32 || defined WINCE || defined __CYGWIN__)
#ifndef CV_EXPORTS
# if (defined _WIN32 || defined WINCE || defined __CYGWIN__) && defined(CVAPI_EXPORTS)
# define CV_EXPORTS __declspec(dllexport)
# elif defined __GNUC__ && __GNUC__ >= 4
# elif defined __GNUC__ && __GNUC__ >= 4 && (defined(CVAPI_EXPORTS) || defined(__APPLE__))
# define CV_EXPORTS __attribute__ ((visibility ("default")))
# endif
#endif
@ -491,7 +553,11 @@ Cv64suf;
# include <intrin.h>
# define CV_XADD(addr, delta) (int)_InterlockedExchangeAdd((long volatile*)addr, delta)
#else
#ifdef OPENCV_FORCE_UNSAFE_XADD
CV_INLINE CV_XADD(int* addr, int delta) { int tmp = *addr; *addr += delta; return tmp; }
#else
#error "OpenCV: can't define safe CV_XADD macro for current platform (unsupported). Define CV_XADD macro through custom port header (see OPENCV_INCLUDE_PORT_FILE)"
#endif
#endif
@ -560,6 +626,13 @@ Cv64suf;
# endif
#endif
#ifdef CV_CXX_MOVE_SEMANTICS
#define CV_CXX_MOVE(x) std::move(x)
#else
#define CV_CXX_MOVE(x) (x)
#endif
/****************************************************************************************\
* C++11 std::array *
\****************************************************************************************/
@ -598,6 +671,19 @@ Cv64suf;
# define CV_FINAL
#endif
/****************************************************************************************\
* C++11 noexcept *
\****************************************************************************************/
#ifndef CV_NOEXCEPT
# if __cplusplus >= 201103L || (defined(_MSC_VER) && _MSC_VER >= 1900/*MSVS 2015*/)
# define CV_NOEXCEPT noexcept
# endif
#endif
#ifndef CV_NOEXCEPT
# define CV_NOEXCEPT
#endif
// Integer types portatibility
@ -683,7 +769,7 @@ protected:
float16_t() {}
explicit float16_t(float x)
{
#if CV_AVX2
#if CV_FP16
__m128 v = _mm_load_ss(&x);
w = (ushort)_mm_cvtsi128_si32(_mm_cvtps_ph(v, 0));
#else
@ -714,7 +800,7 @@ protected:
operator float() const
{
#if CV_AVX2
#if CV_FP16
float f;
_mm_store_ss(&f, _mm_cvtph_ps(_mm_cvtsi32_si128(w)));
return f;

View File

@ -1026,6 +1026,40 @@ static inline bool operator>= (const String& lhs, const String& rhs) { return lh
static inline bool operator>= (const char* lhs, const String& rhs) { return rhs.compare(lhs) <= 0; }
static inline bool operator>= (const String& lhs, const char* rhs) { return lhs.compare(rhs) >= 0; }
#ifndef OPENCV_DISABLE_STRING_LOWER_UPPER_CONVERSIONS
//! @cond IGNORED
namespace details {
// std::tolower is int->int
static inline char char_tolower(char ch)
{
return (char)std::tolower((int)ch);
}
// std::toupper is int->int
static inline char char_toupper(char ch)
{
return (char)std::toupper((int)ch);
}
} // namespace details
//! @endcond
static inline std::string toLowerCase(const std::string& str)
{
std::string result(str);
std::transform(result.begin(), result.end(), result.begin(), details::char_tolower);
return result;
}
static inline std::string toUpperCase(const std::string& str)
{
std::string result(str);
std::transform(result.begin(), result.end(), result.begin(), details::char_toupper);
return result;
}
#endif // OPENCV_DISABLE_STRING_LOWER_UPPER_CONVERSIONS
//! @} relates cv::String
} // cv

View File

@ -46,6 +46,7 @@
#include <complex>
#include <ostream>
#include <sstream>
//! @cond IGNORED

View File

@ -0,0 +1,71 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
#ifndef OPENCV_CORE_ASYNC_PROMISE_HPP
#define OPENCV_CORE_ASYNC_PROMISE_HPP
#include "../async.hpp"
#include "exception_ptr.hpp"
namespace cv {
/** @addtogroup core_async
@{
*/
/** @brief Provides result of asynchronous operations
*/
class CV_EXPORTS AsyncPromise
{
public:
~AsyncPromise() CV_NOEXCEPT;
AsyncPromise() CV_NOEXCEPT;
explicit AsyncPromise(const AsyncPromise& o) CV_NOEXCEPT;
AsyncPromise& operator=(const AsyncPromise& o) CV_NOEXCEPT;
void release() CV_NOEXCEPT;
/** Returns associated AsyncArray
@note Can be called once
*/
AsyncArray getArrayResult();
/** Stores asynchronous result.
@param[in] value result
*/
void setValue(InputArray value);
// TODO "move" setters
#if CV__EXCEPTION_PTR
/** Stores exception.
@param[in] exception exception to be raised in AsyncArray
*/
void setException(std::exception_ptr exception);
#endif
/** Stores exception.
@param[in] exception exception to be raised in AsyncArray
*/
void setException(const cv::Exception& exception);
#ifdef CV_CXX11
explicit AsyncPromise(AsyncPromise&& o) { p = o.p; o.p = NULL; }
AsyncPromise& operator=(AsyncPromise&& o) CV_NOEXCEPT { std::swap(p, o.p); return *this; }
#endif
// PImpl
typedef struct AsyncArray::Impl Impl; friend struct AsyncArray::Impl;
inline void* _getImpl() const CV_NOEXCEPT { return p; }
protected:
Impl* p;
};
//! @}
} // namespace
#endif // OPENCV_CORE_ASYNC_PROMISE_HPP
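A producer/consumer sketch pairing AsyncPromise with AsyncArray, mirroring the testAsyncArray helper shown earlier in this commit; the synchronous layout is only for illustration:
#include <opencv2/core.hpp>
#include <opencv2/core/detail/async_promise.hpp>
#include <iostream>
int main()
{
    cv::AsyncPromise promise;
    cv::AsyncArray result = promise.getArrayResult();   // may be fetched only once
    // producer side: store either a value or an exception exactly once
    cv::Mat produced = cv::Mat::eye(3, 3, CV_32F);
    promise.setValue(produced);
    // consumer side: get() copies the stored value out or re-throws a stored exception
    cv::Mat received;
    result.get(received);
    std::cout << "received a " << received.rows << "x" << received.cols << " matrix\n";
    return 0;
}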

View File

@ -0,0 +1,27 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
#ifndef OPENCV_CORE_DETAILS_EXCEPTION_PTR_H
#define OPENCV_CORE_DETAILS_EXCEPTION_PTR_H
#ifndef CV__EXCEPTION_PTR
# if defined(__ANDROID__) && defined(ATOMIC_INT_LOCK_FREE) && ATOMIC_INT_LOCK_FREE < 2
# define CV__EXCEPTION_PTR 0 // Not supported, details: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=58938
# elif defined(CV_CXX11)
# define CV__EXCEPTION_PTR 1
# elif defined(_MSC_VER)
# define CV__EXCEPTION_PTR (_MSC_VER >= 1600)
# elif defined(__clang__)
# define CV__EXCEPTION_PTR 0 // C++11 only (see above)
# elif defined(__GNUC__) && defined(__GXX_EXPERIMENTAL_CXX0X__)
# define CV__EXCEPTION_PTR (__GXX_EXPERIMENTAL_CXX0X__ > 0)
# endif
#endif
#ifndef CV__EXCEPTION_PTR
# define CV__EXCEPTION_PTR 0
#elif CV__EXCEPTION_PTR
# include <exception> // std::exception_ptr
#endif
#endif // OPENCV_CORE_DETAILS_EXCEPTION_PTR_H
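A brief sketch of how the CV__EXCEPTION_PTR feature macro defined above is meant to be consumed (illustrative; the helper name rethrowLater is an assumption):

#include "opencv2/core/detail/exception_ptr.hpp"

#if CV__EXCEPTION_PTR
// std::exception_ptr based error propagation is available on this toolchain.
#include <exception>
static void rethrowLater(std::exception_ptr p) { if (p) std::rethrow_exception(p); }
#else
// Fall back to copying cv::Exception objects instead of carrying std::exception_ptr.
#endif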

View File

@ -45,20 +45,142 @@
#ifndef OPENCV_CORE_EIGEN_HPP
#define OPENCV_CORE_EIGEN_HPP
#ifndef EIGEN_WORLD_VERSION
#error "Wrong usage of OpenCV's Eigen utility header. Include Eigen's headers first. See https://github.com/opencv/opencv/issues/17366"
#endif
#include "opencv2/core.hpp"
#if defined _MSC_VER && _MSC_VER >= 1200
#define NOMINMAX // fix https://github.com/opencv/opencv/issues/17548
#pragma warning( disable: 4714 ) //__forceinline is not inlined
#pragma warning( disable: 4127 ) //conditional expression is constant
#pragma warning( disable: 4244 ) //conversion from '__int64' to 'int', possible loss of data
#endif
#if !defined(OPENCV_DISABLE_EIGEN_TENSOR_SUPPORT)
#if EIGEN_WORLD_VERSION == 3 && EIGEN_MAJOR_VERSION >= 3 \
&& defined(CV_CXX11) && defined(CV_CXX_STD_ARRAY)
#include <unsupported/Eigen/CXX11/Tensor>
#define OPENCV_EIGEN_TENSOR_SUPPORT 1
#endif // EIGEN_WORLD_VERSION == 3 && EIGEN_MAJOR_VERSION >= 3
#endif // !defined(OPENCV_DISABLE_EIGEN_TENSOR_SUPPORT)
namespace cv
{
//! @addtogroup core_eigen
/** @addtogroup core_eigen
These functions are provided for OpenCV-Eigen interoperability. They convert `Mat`
objects to corresponding `Eigen::Matrix` objects and vice-versa. Consult the [Eigen
documentation](https://eigen.tuxfamily.org/dox/group__TutorialMatrixClass.html) for
information about the `Matrix` template type.
@note Using these functions requires the `Eigen/Dense` or similar header to be
included before this header.
*/
//! @{
#if defined(OPENCV_EIGEN_TENSOR_SUPPORT) || defined(CV_DOXYGEN)
/** @brief Converts an Eigen::Tensor to a cv::Mat.
The method converts an Eigen::Tensor with shape (H x W x C) to a cv::Mat where:
H = number of rows
W = number of columns
C = number of channels
Usage:
\code
Eigen::Tensor<float, 3, Eigen::RowMajor> a_tensor(...);
// populate tensor with values
Mat a_mat;
eigen2cv(a_tensor, a_mat);
\endcode
*/
template <typename _Tp, int _layout> static inline
void eigen2cv( const Eigen::Tensor<_Tp, 3, _layout> &src, OutputArray dst )
{
if( !(_layout & Eigen::RowMajorBit) )
{
const std::array<int, 3> shuffle{2, 1, 0};
Eigen::Tensor<_Tp, 3, !_layout> row_major_tensor = src.swap_layout().shuffle(shuffle);
Mat _src(src.dimension(0), src.dimension(1), CV_MAKETYPE(DataType<_Tp>::type, src.dimension(2)), row_major_tensor.data());
_src.copyTo(dst);
}
else
{
Mat _src(src.dimension(0), src.dimension(1), CV_MAKETYPE(DataType<_Tp>::type, src.dimension(2)), (void *)src.data());
_src.copyTo(dst);
}
}
/** @brief Converts a cv::Mat to an Eigen::Tensor.
The method converts a cv::Mat to an Eigen Tensor with shape (H x W x C) where:
H = number of rows
W = number of columns
C = number of channels
Usage:
\code
Mat a_mat(...);
// populate Mat with values
Eigen::Tensor<float, 3, Eigen::RowMajor> a_tensor(...);
cv2eigen(a_mat, a_tensor);
\endcode
*/
template <typename _Tp, int _layout> static inline
void cv2eigen( const Mat &src, Eigen::Tensor<_Tp, 3, _layout> &dst )
{
if( !(_layout & Eigen::RowMajorBit) )
{
Eigen::Tensor<_Tp, 3, !_layout> row_major_tensor(src.rows, src.cols, src.channels());
Mat _dst(src.rows, src.cols, CV_MAKETYPE(DataType<_Tp>::type, src.channels()), row_major_tensor.data());
if (src.type() == _dst.type())
src.copyTo(_dst);
else
src.convertTo(_dst, _dst.type());
const std::array<int, 3> shuffle{2, 1, 0};
dst = row_major_tensor.swap_layout().shuffle(shuffle);
}
else
{
dst.resize(src.rows, src.cols, src.channels());
Mat _dst(src.rows, src.cols, CV_MAKETYPE(DataType<_Tp>::type, src.channels()), dst.data());
if (src.type() == _dst.type())
src.copyTo(_dst);
else
src.convertTo(_dst, _dst.type());
}
}
/** @brief Maps cv::Mat data to an Eigen::TensorMap.
The method wraps an existing Mat data array with an Eigen TensorMap of shape (H x W x C) where:
H = number of rows
W = number of columns
C = number of channels
Explicit instantiation of the return type is required.
@note Caller should be aware of the lifetime of the cv::Mat instance and take appropriate safety measures.
The cv::Mat instance will retain ownership of the data and the Eigen::TensorMap will lose access when the cv::Mat data is deallocated.
The example below initializes a cv::Mat and produces an Eigen::TensorMap:
\code
float arr[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11};
Mat a_mat(2, 2, CV_32FC3, arr);
Eigen::TensorMap<Eigen::Tensor<float, 3, Eigen::RowMajor>> a_tensormap = cv2eigen_tensormap<float>(a_mat);
\endcode
*/
template <typename _Tp> static inline
Eigen::TensorMap<Eigen::Tensor<_Tp, 3, Eigen::RowMajor>> cv2eigen_tensormap(InputArray src)
{
Mat mat = src.getMat();
CV_CheckTypeEQ(mat.type(), CV_MAKETYPE(traits::Type<_Tp>::value, mat.channels()), "");
return Eigen::TensorMap<Eigen::Tensor<_Tp, 3, Eigen::RowMajor>>((_Tp *)mat.data, mat.rows, mat.cols, mat.channels());
}
#endif // OPENCV_EIGEN_TENSOR_SUPPORT
template<typename _Tp, int _rows, int _cols, int _options, int _maxRows, int _maxCols> static inline
void eigen2cv( const Eigen::Matrix<_Tp, _rows, _cols, _options, _maxRows, _maxCols>& src, OutputArray dst )
{

View File

@ -47,12 +47,6 @@
#include "opencv2/core/cvdef.h"
#if ((defined _MSC_VER && defined _M_X64) || (defined __GNUC__ && defined __x86_64__ \
&& defined __SSE2__ && !defined __APPLE__)) && !defined(__CUDACC__)
#include <emmintrin.h>
#endif
//! @addtogroup core_utils
//! @{
@ -70,11 +64,27 @@
# endif
#endif
#ifdef HAVE_TEGRA_OPTIMIZATION
# include "tegra_round.hpp"
#endif
#if defined(__CUDACC__)
// nothing, intrinsics/asm code is not supported
#else
#if ((defined _MSC_VER && defined _M_X64) \
|| (defined __GNUC__ && defined __x86_64__ && defined __SSE2__)) \
&& !defined(OPENCV_SKIP_INCLUDE_EMMINTRIN_H)
#include <emmintrin.h>
#endif
#if defined __GNUC__ && defined __arm__ && (defined __ARM_PCS_VFP || defined __ARM_VFPV3__ || defined __ARM_NEON__) && !defined __SOFTFP__ && !defined(__CUDACC__)
#if defined __PPC64__ && defined __GNUC__ && defined _ARCH_PWR8 \
&& !defined(OPENCV_SKIP_INCLUDE_ALTIVEC_H)
#include <altivec.h>
#undef vector
#undef bool
#undef pixel
#endif
#if defined(CV_INLINE_ROUND_FLT)
// user-specified version
// CV_INLINE_ROUND_DBL should be defined too
#elif defined __GNUC__ && defined __arm__ && (defined __ARM_PCS_VFP || defined __ARM_VFPV3__ || defined __ARM_NEON__) && !defined __SOFTFP__
// 1. general scheme
#define ARM_ROUND(_value, _asm_string) \
int res; \
@ -84,13 +94,102 @@
return res
// 2. version for double
#ifdef __clang__
#define ARM_ROUND_DBL(value) ARM_ROUND(value, "vcvtr.s32.f64 %[temp], %[value] \n vmov %[res], %[temp]")
#define CV_INLINE_ROUND_DBL(value) ARM_ROUND(value, "vcvtr.s32.f64 %[temp], %[value] \n vmov %[res], %[temp]")
#else
#define ARM_ROUND_DBL(value) ARM_ROUND(value, "vcvtr.s32.f64 %[temp], %P[value] \n vmov %[res], %[temp]")
#define CV_INLINE_ROUND_DBL(value) ARM_ROUND(value, "vcvtr.s32.f64 %[temp], %P[value] \n vmov %[res], %[temp]")
#endif
// 3. version for float
#define ARM_ROUND_FLT(value) ARM_ROUND(value, "vcvtr.s32.f32 %[temp], %[value]\n vmov %[res], %[temp]")
#endif
#define CV_INLINE_ROUND_FLT(value) ARM_ROUND(value, "vcvtr.s32.f32 %[temp], %[value]\n vmov %[res], %[temp]")
#elif defined __PPC64__ && defined __GNUC__ && defined _ARCH_PWR8
// P8 and newer machines can convert fp32/64 to int quickly.
#define CV_INLINE_ROUND_DBL(value) \
int out; \
double temp; \
__asm__( "fctiw %[temp],%[in]\n\tmfvsrwz %[out],%[temp]\n\t" : [out] "=r" (out), [temp] "=d" (temp) : [in] "d" ((double)(value)) : ); \
return out;
// FP32 also works with FP64 routine above
#define CV_INLINE_ROUND_FLT(value) CV_INLINE_ROUND_DBL(value)
#endif
#ifdef CV_INLINE_ISINF_FLT
// user-specified version
// CV_INLINE_ISINF_DBL should be defined too
#elif defined __PPC64__ && defined _ARCH_PWR9 && defined(scalar_test_data_class)
#define CV_INLINE_ISINF_DBL(value) return scalar_test_data_class(value, 0x30);
#define CV_INLINE_ISINF_FLT(value) CV_INLINE_ISINF_DBL(value)
#endif
#ifdef CV_INLINE_ISNAN_FLT
// user-specified version
// CV_INLINE_ISNAN_DBL should be defined too
#elif defined __PPC64__ && defined _ARCH_PWR9 && defined(scalar_test_data_class)
#define CV_INLINE_ISNAN_DBL(value) return scalar_test_data_class(value, 0x40);
#define CV_INLINE_ISNAN_FLT(value) CV_INLINE_ISNAN_DBL(value)
#endif
#if !defined(OPENCV_USE_FASTMATH_BUILTINS) \
&& ( \
defined(__x86_64__) || defined(__i686__) \
|| defined(__arm__) \
|| defined(__PPC64__) \
)
/* Use builtin C math functions when available. Dedicated hardware is available to
round and convert FP values. */
#define OPENCV_USE_FASTMATH_BUILTINS 1
#endif
/* Enable builtin math functions if possible, desired, and available.
Note: not all math functions inline equally. E.g. lrint will not inline
without the -fno-math-errno option. */
#if defined(CV_ICC)
// nothing
#elif defined(OPENCV_USE_FASTMATH_BUILTINS) && OPENCV_USE_FASTMATH_BUILTINS
#if defined(__clang__)
#define CV__FASTMATH_ENABLE_CLANG_MATH_BUILTINS
#if !defined(CV_INLINE_ISNAN_DBL) && __has_builtin(__builtin_isnan)
#define CV_INLINE_ISNAN_DBL(value) return __builtin_isnan(value);
#endif
#if !defined(CV_INLINE_ISNAN_FLT) && __has_builtin(__builtin_isnan)
#define CV_INLINE_ISNAN_FLT(value) return __builtin_isnan(value);
#endif
#if !defined(CV_INLINE_ISINF_DBL) && __has_builtin(__builtin_isinf)
#define CV_INLINE_ISINF_DBL(value) return __builtin_isinf(value);
#endif
#if !defined(CV_INLINE_ISINF_FLT) && __has_builtin(__builtin_isinf)
#define CV_INLINE_ISINF_FLT(value) return __builtin_isinf(value);
#endif
#elif defined(__GNUC__)
#define CV__FASTMATH_ENABLE_GCC_MATH_BUILTINS
#if !defined(CV_INLINE_ISNAN_DBL)
#define CV_INLINE_ISNAN_DBL(value) return __builtin_isnan(value);
#endif
#if !defined(CV_INLINE_ISNAN_FLT)
#define CV_INLINE_ISNAN_FLT(value) return __builtin_isnanf(value);
#endif
#if !defined(CV_INLINE_ISINF_DBL)
#define CV_INLINE_ISINF_DBL(value) return __builtin_isinf(value);
#endif
#if !defined(CV_INLINE_ISINF_FLT)
#define CV_INLINE_ISINF_FLT(value) return __builtin_isinff(value);
#endif
#elif defined(_MSC_VER)
#if !defined(CV_INLINE_ISNAN_DBL)
#define CV_INLINE_ISNAN_DBL(value) return isnan(value);
#endif
#if !defined(CV_INLINE_ISNAN_FLT)
#define CV_INLINE_ISNAN_FLT(value) return isnan(value);
#endif
#if !defined(CV_INLINE_ISINF_DBL)
#define CV_INLINE_ISINF_DBL(value) return isinf(value);
#endif
#if !defined(CV_INLINE_ISINF_FLT)
#define CV_INLINE_ISINF_FLT(value) return isinf(value);
#endif
#endif
#endif
#endif // defined(__CUDACC__)
/** @brief Rounds floating-point number to the nearest integer
@ -100,8 +199,11 @@
CV_INLINE int
cvRound( double value )
{
#if ((defined _MSC_VER && defined _M_X64) || (defined __GNUC__ && defined __x86_64__ \
&& defined __SSE2__ && !defined __APPLE__) || CV_SSE2) && !defined(__CUDACC__)
#if defined CV_INLINE_ROUND_DBL
CV_INLINE_ROUND_DBL(value);
#elif ((defined _MSC_VER && defined _M_X64) || (defined __GNUC__ && defined __x86_64__ \
&& defined __SSE2__ && !defined __APPLE__) || CV_SSE2) \
&& !defined(__CUDACC__)
__m128d t = _mm_set_sd( value );
return _mm_cvtsd_si32(t);
#elif defined _MSC_VER && defined _M_IX86
@ -112,15 +214,8 @@ cvRound( double value )
fistp t;
}
return t;
#elif ((defined _MSC_VER && defined _M_ARM) || defined CV_ICC || \
defined __GNUC__) && defined HAVE_TEGRA_OPTIMIZATION
TEGRA_ROUND_DBL(value);
#elif defined CV_ICC || defined __GNUC__
# if defined ARM_ROUND_DBL
ARM_ROUND_DBL(value);
# else
return (int)lrint(value);
# endif
return (int)(lrint(value));
#else
/* it's ok if round does not comply with IEEE754 standard;
the tests should allow +/-1 difference when the tested functions use round */
@ -138,8 +233,15 @@ cvRound( double value )
*/
CV_INLINE int cvFloor( double value )
{
#if (defined CV__FASTMATH_ENABLE_GCC_MATH_BUILTINS || defined CV__FASTMATH_ENABLE_CLANG_MATH_BUILTINS) \
&& ( \
defined(__PPC64__) \
)
return __builtin_floor(value);
#else
int i = (int)value;
return i - (i > value);
#endif
}
/** @brief Rounds floating-point number to the nearest integer not smaller than the original.
@ -151,8 +253,15 @@ CV_INLINE int cvFloor( double value )
*/
CV_INLINE int cvCeil( double value )
{
#if (defined CV__FASTMATH_ENABLE_GCC_MATH_BUILTINS || defined CV__FASTMATH_ENABLE_CLANG_MATH_BUILTINS) \
&& ( \
defined(__PPC64__) \
)
return __builtin_ceil(value);
#else
int i = (int)value;
return i + (i < value);
#endif
}
/** @brief Determines if the argument is Not A Number.
@ -163,10 +272,14 @@ CV_INLINE int cvCeil( double value )
otherwise. */
CV_INLINE int cvIsNaN( double value )
{
#if defined CV_INLINE_ISNAN_DBL
CV_INLINE_ISNAN_DBL(value);
#else
Cv64suf ieee754;
ieee754.f = value;
return ((unsigned)(ieee754.u >> 32) & 0x7fffffff) +
((unsigned)ieee754.u != 0) > 0x7ff00000;
#endif
}
/** @brief Determines if the argument is Infinity.
@ -177,10 +290,19 @@ CV_INLINE int cvIsNaN( double value )
and 0 otherwise. */
CV_INLINE int cvIsInf( double value )
{
#if defined CV_INLINE_ISINF_DBL
CV_INLINE_ISINF_DBL(value);
#elif defined(__x86_64__) || defined(_M_X64) || defined(__aarch64__) || defined(_M_ARM64) || defined(__PPC64__)
Cv64suf ieee754;
ieee754.f = value;
return (ieee754.u & 0x7fffffff00000000) ==
0x7ff0000000000000;
#else
Cv64suf ieee754;
ieee754.f = value;
return ((unsigned)(ieee754.u >> 32) & 0x7fffffff) == 0x7ff00000 &&
(unsigned)ieee754.u == 0;
#endif
}
#ifdef __cplusplus
@ -188,8 +310,11 @@ CV_INLINE int cvIsInf( double value )
/** @overload */
CV_INLINE int cvRound(float value)
{
#if ((defined _MSC_VER && defined _M_X64) || (defined __GNUC__ && defined __x86_64__ \
&& defined __SSE2__ && !defined __APPLE__) || CV_SSE2) && !defined(__CUDACC__)
#if defined CV_INLINE_ROUND_FLT
CV_INLINE_ROUND_FLT(value);
#elif ((defined _MSC_VER && defined _M_X64) || (defined __GNUC__ && defined __x86_64__ \
&& defined __SSE2__ && !defined __APPLE__) || CV_SSE2) \
&& !defined(__CUDACC__)
__m128 t = _mm_set_ss( value );
return _mm_cvtss_si32(t);
#elif defined _MSC_VER && defined _M_IX86
@ -200,15 +325,8 @@ CV_INLINE int cvRound(float value)
fistp t;
}
return t;
#elif ((defined _MSC_VER && defined _M_ARM) || defined CV_ICC || \
defined __GNUC__) && defined HAVE_TEGRA_OPTIMIZATION
TEGRA_ROUND_FLT(value);
#elif defined CV_ICC || defined __GNUC__
# if defined ARM_ROUND_FLT
ARM_ROUND_FLT(value);
# else
return (int)lrintf(value);
# endif
return (int)(lrintf(value));
#else
/* it's ok if round does not comply with IEEE754 standard;
the tests should allow +/-1 difference when the tested functions use round */
@ -225,8 +343,15 @@ CV_INLINE int cvRound( int value )
/** @overload */
CV_INLINE int cvFloor( float value )
{
#if (defined CV__FASTMATH_ENABLE_GCC_MATH_BUILTINS || defined CV__FASTMATH_ENABLE_CLANG_MATH_BUILTINS) \
&& ( \
defined(__PPC64__) \
)
return __builtin_floorf(value);
#else
int i = (int)value;
return i - (i > value);
#endif
}
/** @overload */
@ -238,8 +363,15 @@ CV_INLINE int cvFloor( int value )
/** @overload */
CV_INLINE int cvCeil( float value )
{
#if (defined CV__FASTMATH_ENABLE_GCC_MATH_BUILTINS || defined CV__FASTMATH_ENABLE_CLANG_MATH_BUILTINS) \
&& ( \
defined(__PPC64__) \
)
return __builtin_ceilf(value);
#else
int i = (int)value;
return i + (i < value);
#endif
}
/** @overload */
@ -251,17 +383,25 @@ CV_INLINE int cvCeil( int value )
/** @overload */
CV_INLINE int cvIsNaN( float value )
{
#if defined CV_INLINE_ISNAN_FLT
CV_INLINE_ISNAN_FLT(value);
#else
Cv32suf ieee754;
ieee754.f = value;
return (ieee754.u & 0x7fffffff) > 0x7f800000;
#endif
}
/** @overload */
CV_INLINE int cvIsInf( float value )
{
#if defined CV_INLINE_ISINF_FLT
CV_INLINE_ISINF_FLT(value);
#else
Cv32suf ieee754;
ieee754.f = value;
return (ieee754.u & 0x7fffffff) == 0x7f800000;
#endif
}
#endif // __cplusplus
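A small sanity-check sketch for the fast-math helpers patched above; the exact tie-breaking of cvRound depends on which branch is selected at compile time, so only behaviour that holds on every path is asserted (illustrative, not part of this diff):

#include "opencv2/core/fast_math.hpp"
#include <cassert>
#include <limits>

int main()
{
    assert(cvRound(2.4) == 2 && cvRound(2.6) == 3);   // nearest integer
    assert(cvFloor(-1.2) == -2);                      // largest int not greater than the value
    assert(cvCeil(-1.2) == -1);                       // smallest int not smaller than the value
    assert(cvIsNaN(std::numeric_limits<double>::quiet_NaN()) != 0);
    assert(cvIsInf(std::numeric_limits<double>::infinity()) != 0);
    return 0;
}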

View File

@ -0,0 +1,698 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
// Copyright (C) 2015, Itseez Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#ifndef OPENCV_HAL_INTRIN_HPP
#define OPENCV_HAL_INTRIN_HPP
#include <cmath>
#include <float.h>
#include <stdlib.h>
#include "opencv2/core/cvdef.h"
#define OPENCV_HAL_ADD(a, b) ((a) + (b))
#define OPENCV_HAL_AND(a, b) ((a) & (b))
#define OPENCV_HAL_NOP(a) (a)
#define OPENCV_HAL_1ST(a, b) (a)
namespace {
inline unsigned int trailingZeros32(unsigned int value) {
#if defined(_MSC_VER)
#if (_MSC_VER < 1700) || defined(_M_ARM) || defined(_M_ARM64)
unsigned long index = 0;
_BitScanForward(&index, value);
return (unsigned int)index;
#elif defined(__clang__)
// clang-cl doesn't export _tzcnt_u32 for non BMI systems
return value ? __builtin_ctz(value) : 32;
#else
return _tzcnt_u32(value);
#endif
#elif defined(__GNUC__) || defined(__GNUG__)
return __builtin_ctz(value);
#elif defined(__ICC) || defined(__INTEL_COMPILER)
return _bit_scan_forward(value);
#elif defined(__clang__)
return llvm.cttz.i32(value, true);
#else
static const int MultiplyDeBruijnBitPosition[32] = {
0, 1, 28, 2, 29, 14, 24, 3, 30, 22, 20, 15, 25, 17, 4, 8,
31, 27, 13, 23, 21, 19, 16, 7, 26, 12, 18, 6, 11, 5, 10, 9 };
return MultiplyDeBruijnBitPosition[((uint32_t)((value & -value) * 0x077CB531U)) >> 27];
#endif
}
}
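A quick reference check for trailingZeros32 above; the scalar loop only illustrates what the De Bruijn fallback computes for non-zero inputs (the names here are assumptions, not part of the header):

#include "opencv2/core/hal/intrin.hpp"
#include <cassert>

// Scalar reference: count low-order zero bits of a non-zero value.
static unsigned int trailingZerosRef(unsigned int v)
{
    unsigned int n = 0;
    while ((v & 1u) == 0u) { v >>= 1; ++n; }
    return n;
}

int main()
{
    assert(trailingZeros32(0x8u)  == 3 && trailingZerosRef(0x8u)  == 3);
    assert(trailingZeros32(0x30u) == 4 && trailingZerosRef(0x30u) == 4);
    return 0;
}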
// unlike HAL API, which is in cv::hal,
// we put intrinsics into cv namespace to make its
// access from within opencv code more accessible
namespace cv {
namespace hal {
enum StoreMode
{
STORE_UNALIGNED = 0,
STORE_ALIGNED = 1,
STORE_ALIGNED_NOCACHE = 2
};
}
// TODO FIXIT: Don't use "God" traits. Split on separate cases.
template<typename _Tp> struct V_TypeTraits
{
};
#define CV_INTRIN_DEF_TYPE_TRAITS(type, int_type_, uint_type_, abs_type_, w_type_, q_type_, sum_type_) \
template<> struct V_TypeTraits<type> \
{ \
typedef type value_type; \
typedef int_type_ int_type; \
typedef abs_type_ abs_type; \
typedef uint_type_ uint_type; \
typedef w_type_ w_type; \
typedef q_type_ q_type; \
typedef sum_type_ sum_type; \
\
static inline int_type reinterpret_int(type x) \
{ \
union { type l; int_type i; } v; \
v.l = x; \
return v.i; \
} \
\
static inline type reinterpret_from_int(int_type x) \
{ \
union { type l; int_type i; } v; \
v.i = x; \
return v.l; \
} \
}
#define CV_INTRIN_DEF_TYPE_TRAITS_NO_Q_TYPE(type, int_type_, uint_type_, abs_type_, w_type_, sum_type_) \
template<> struct V_TypeTraits<type> \
{ \
typedef type value_type; \
typedef int_type_ int_type; \
typedef abs_type_ abs_type; \
typedef uint_type_ uint_type; \
typedef w_type_ w_type; \
typedef sum_type_ sum_type; \
\
static inline int_type reinterpret_int(type x) \
{ \
union { type l; int_type i; } v; \
v.l = x; \
return v.i; \
} \
\
static inline type reinterpret_from_int(int_type x) \
{ \
union { type l; int_type i; } v; \
v.i = x; \
return v.l; \
} \
}
CV_INTRIN_DEF_TYPE_TRAITS(uchar, schar, uchar, uchar, ushort, unsigned, unsigned);
CV_INTRIN_DEF_TYPE_TRAITS(schar, schar, uchar, uchar, short, int, int);
CV_INTRIN_DEF_TYPE_TRAITS(ushort, short, ushort, ushort, unsigned, uint64, unsigned);
CV_INTRIN_DEF_TYPE_TRAITS(short, short, ushort, ushort, int, int64, int);
CV_INTRIN_DEF_TYPE_TRAITS_NO_Q_TYPE(unsigned, int, unsigned, unsigned, uint64, unsigned);
CV_INTRIN_DEF_TYPE_TRAITS_NO_Q_TYPE(int, int, unsigned, unsigned, int64, int);
CV_INTRIN_DEF_TYPE_TRAITS_NO_Q_TYPE(float, int, unsigned, float, double, float);
CV_INTRIN_DEF_TYPE_TRAITS_NO_Q_TYPE(uint64, int64, uint64, uint64, void, uint64);
CV_INTRIN_DEF_TYPE_TRAITS_NO_Q_TYPE(int64, int64, uint64, uint64, void, int64);
CV_INTRIN_DEF_TYPE_TRAITS_NO_Q_TYPE(double, int64, uint64, double, void, double);
#ifndef CV_DOXYGEN
#ifndef CV_CPU_OPTIMIZATION_HAL_NAMESPACE
#ifdef CV_FORCE_SIMD128_CPP
#define CV_CPU_OPTIMIZATION_HAL_NAMESPACE hal_EMULATOR_CPP
#define CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN namespace hal_EMULATOR_CPP {
#define CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END }
#elif defined(CV_CPU_DISPATCH_MODE)
#define CV_CPU_OPTIMIZATION_HAL_NAMESPACE __CV_CAT(hal_, CV_CPU_DISPATCH_MODE)
#define CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN namespace __CV_CAT(hal_, CV_CPU_DISPATCH_MODE) {
#define CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END }
#else
#define CV_CPU_OPTIMIZATION_HAL_NAMESPACE hal_baseline
#define CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN namespace hal_baseline {
#define CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END }
#endif
#endif // CV_CPU_OPTIMIZATION_HAL_NAMESPACE
CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN
CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END
using namespace CV_CPU_OPTIMIZATION_HAL_NAMESPACE;
#endif
}
#ifdef CV_DOXYGEN
# undef CV_AVX2
# undef CV_SSE2
# undef CV_NEON
# undef CV_VSX
# undef CV_FP16
# undef CV_MSA
#endif
#if (CV_SSE2 || CV_NEON || CV_VSX || CV_MSA || CV_WASM_SIMD) && !defined(CV_FORCE_SIMD128_CPP)
#define CV__SIMD_FORWARD 128
#include "opencv2/core/hal/intrin_forward.hpp"
#endif
#if CV_SSE2 && !defined(CV_FORCE_SIMD128_CPP)
#include "opencv2/core/hal/intrin_sse_em.hpp"
#include "opencv2/core/hal/intrin_sse.hpp"
#elif CV_NEON && !defined(CV_FORCE_SIMD128_CPP)
#include "opencv2/core/hal/intrin_neon.hpp"
#elif CV_VSX && !defined(CV_FORCE_SIMD128_CPP)
#include "opencv2/core/hal/intrin_vsx.hpp"
#elif CV_MSA && !defined(CV_FORCE_SIMD128_CPP)
#include "opencv2/core/hal/intrin_msa.hpp"
#elif CV_WASM_SIMD && !defined(CV_FORCE_SIMD128_CPP)
#include "opencv2/core/hal/intrin_wasm.hpp"
#else
#include "opencv2/core/hal/intrin_cpp.hpp"
#endif
// AVX2 can be used together with SSE2, so
// we define those two sets of intrinsics at once.
// Most of the intrinsics do not conflict (the proper overloaded variant is
// resolved by the argument types, e.g. v_float32x4 ~ SSE2, v_float32x8 ~ AVX2),
// but some of AVX2 intrinsics get v256_ prefix instead of v_, e.g. v256_load() vs v_load().
// Correspondingly, the wide intrinsics (which are mapped to the "widest"
// available instruction set) will get vx_ prefix
// (and will be mapped to v256_ counterparts) (e.g. vx_load() => v256_load())
#if CV_AVX2
#define CV__SIMD_FORWARD 256
#include "opencv2/core/hal/intrin_forward.hpp"
#include "opencv2/core/hal/intrin_avx.hpp"
#endif
// AVX512 can be used together with SSE2 and AVX2, so
// we define those sets of intrinsics at once.
// For some of AVX512 intrinsics get v512_ prefix instead of v_, e.g. v512_load() vs v_load().
// Wide intrinsics will be mapped to v512_ counterparts in this case(e.g. vx_load() => v512_load())
#if CV_AVX512_SKX
#define CV__SIMD_FORWARD 512
#include "opencv2/core/hal/intrin_forward.hpp"
#include "opencv2/core/hal/intrin_avx512.hpp"
#endif
//! @cond IGNORED
namespace cv {
#ifndef CV_DOXYGEN
CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN
#endif
#ifndef CV_SIMD128
#define CV_SIMD128 0
#endif
#ifndef CV_SIMD128_CPP
#define CV_SIMD128_CPP 0
#endif
#ifndef CV_SIMD128_64F
#define CV_SIMD128_64F 0
#endif
#ifndef CV_SIMD256
#define CV_SIMD256 0
#endif
#ifndef CV_SIMD256_64F
#define CV_SIMD256_64F 0
#endif
#ifndef CV_SIMD512
#define CV_SIMD512 0
#endif
#ifndef CV_SIMD512_64F
#define CV_SIMD512_64F 0
#endif
#ifndef CV_SIMD128_FP16
#define CV_SIMD128_FP16 0
#endif
#ifndef CV_SIMD256_FP16
#define CV_SIMD256_FP16 0
#endif
#ifndef CV_SIMD512_FP16
#define CV_SIMD512_FP16 0
#endif
//==================================================================================================
template<typename _Tp> struct V_RegTraits
{
};
#define CV_DEF_REG_TRAITS(prefix, _reg, lane_type, suffix, _u_reg, _w_reg, _q_reg, _int_reg, _round_reg) \
template<> struct V_RegTraits<_reg> \
{ \
typedef _reg reg; \
typedef _u_reg u_reg; \
typedef _w_reg w_reg; \
typedef _q_reg q_reg; \
typedef _int_reg int_reg; \
typedef _round_reg round_reg; \
}
#if CV_SIMD128 || CV_SIMD128_CPP
CV_DEF_REG_TRAITS(v, v_uint8x16, uchar, u8, v_uint8x16, v_uint16x8, v_uint32x4, v_int8x16, void);
CV_DEF_REG_TRAITS(v, v_int8x16, schar, s8, v_uint8x16, v_int16x8, v_int32x4, v_int8x16, void);
CV_DEF_REG_TRAITS(v, v_uint16x8, ushort, u16, v_uint16x8, v_uint32x4, v_uint64x2, v_int16x8, void);
CV_DEF_REG_TRAITS(v, v_int16x8, short, s16, v_uint16x8, v_int32x4, v_int64x2, v_int16x8, void);
CV_DEF_REG_TRAITS(v, v_uint32x4, unsigned, u32, v_uint32x4, v_uint64x2, void, v_int32x4, void);
CV_DEF_REG_TRAITS(v, v_int32x4, int, s32, v_uint32x4, v_int64x2, void, v_int32x4, void);
#if CV_SIMD128_64F || CV_SIMD128_CPP
CV_DEF_REG_TRAITS(v, v_float32x4, float, f32, v_float32x4, v_float64x2, void, v_int32x4, v_int32x4);
#else
CV_DEF_REG_TRAITS(v, v_float32x4, float, f32, v_float32x4, void, void, v_int32x4, v_int32x4);
#endif
CV_DEF_REG_TRAITS(v, v_uint64x2, uint64, u64, v_uint64x2, void, void, v_int64x2, void);
CV_DEF_REG_TRAITS(v, v_int64x2, int64, s64, v_uint64x2, void, void, v_int64x2, void);
#if CV_SIMD128_64F
CV_DEF_REG_TRAITS(v, v_float64x2, double, f64, v_float64x2, void, void, v_int64x2, v_int32x4);
#endif
#endif
#if CV_SIMD256
CV_DEF_REG_TRAITS(v256, v_uint8x32, uchar, u8, v_uint8x32, v_uint16x16, v_uint32x8, v_int8x32, void);
CV_DEF_REG_TRAITS(v256, v_int8x32, schar, s8, v_uint8x32, v_int16x16, v_int32x8, v_int8x32, void);
CV_DEF_REG_TRAITS(v256, v_uint16x16, ushort, u16, v_uint16x16, v_uint32x8, v_uint64x4, v_int16x16, void);
CV_DEF_REG_TRAITS(v256, v_int16x16, short, s16, v_uint16x16, v_int32x8, v_int64x4, v_int16x16, void);
CV_DEF_REG_TRAITS(v256, v_uint32x8, unsigned, u32, v_uint32x8, v_uint64x4, void, v_int32x8, void);
CV_DEF_REG_TRAITS(v256, v_int32x8, int, s32, v_uint32x8, v_int64x4, void, v_int32x8, void);
CV_DEF_REG_TRAITS(v256, v_float32x8, float, f32, v_float32x8, v_float64x4, void, v_int32x8, v_int32x8);
CV_DEF_REG_TRAITS(v256, v_uint64x4, uint64, u64, v_uint64x4, void, void, v_int64x4, void);
CV_DEF_REG_TRAITS(v256, v_int64x4, int64, s64, v_uint64x4, void, void, v_int64x4, void);
CV_DEF_REG_TRAITS(v256, v_float64x4, double, f64, v_float64x4, void, void, v_int64x4, v_int32x8);
#endif
#if CV_SIMD512
CV_DEF_REG_TRAITS(v512, v_uint8x64, uchar, u8, v_uint8x64, v_uint16x32, v_uint32x16, v_int8x64, void);
CV_DEF_REG_TRAITS(v512, v_int8x64, schar, s8, v_uint8x64, v_int16x32, v_int32x16, v_int8x64, void);
CV_DEF_REG_TRAITS(v512, v_uint16x32, ushort, u16, v_uint16x32, v_uint32x16, v_uint64x8, v_int16x32, void);
CV_DEF_REG_TRAITS(v512, v_int16x32, short, s16, v_uint16x32, v_int32x16, v_int64x8, v_int16x32, void);
CV_DEF_REG_TRAITS(v512, v_uint32x16, unsigned, u32, v_uint32x16, v_uint64x8, void, v_int32x16, void);
CV_DEF_REG_TRAITS(v512, v_int32x16, int, s32, v_uint32x16, v_int64x8, void, v_int32x16, void);
CV_DEF_REG_TRAITS(v512, v_float32x16, float, f32, v_float32x16, v_float64x8, void, v_int32x16, v_int32x16);
CV_DEF_REG_TRAITS(v512, v_uint64x8, uint64, u64, v_uint64x8, void, void, v_int64x8, void);
CV_DEF_REG_TRAITS(v512, v_int64x8, int64, s64, v_uint64x8, void, void, v_int64x8, void);
CV_DEF_REG_TRAITS(v512, v_float64x8, double, f64, v_float64x8, void, void, v_int64x8, v_int32x16);
#endif
//! @endcond
#if CV_SIMD512 && (!defined(CV__SIMD_FORCE_WIDTH) || CV__SIMD_FORCE_WIDTH == 512)
#define CV__SIMD_NAMESPACE simd512
namespace CV__SIMD_NAMESPACE {
#define CV_SIMD 1
#define CV_SIMD_64F CV_SIMD512_64F
#define CV_SIMD_FP16 CV_SIMD512_FP16
#define CV_SIMD_WIDTH 64
//! @addtogroup core_hal_intrin
//! @{
//! @brief Maximum available vector register capacity 8-bit unsigned integer values
typedef v_uint8x64 v_uint8;
//! @brief Maximum available vector register capacity 8-bit signed integer values
typedef v_int8x64 v_int8;
//! @brief Maximum available vector register capacity 16-bit unsigned integer values
typedef v_uint16x32 v_uint16;
//! @brief Maximum available vector register capacity 16-bit signed integer values
typedef v_int16x32 v_int16;
//! @brief Maximum available vector register capacity 32-bit unsigned integer values
typedef v_uint32x16 v_uint32;
//! @brief Maximum available vector register capacity 32-bit signed integer values
typedef v_int32x16 v_int32;
//! @brief Maximum available vector register capacity 64-bit unsigned integer values
typedef v_uint64x8 v_uint64;
//! @brief Maximum available vector register capacity 64-bit signed integer values
typedef v_int64x8 v_int64;
//! @brief Maximum available vector register capacity 32-bit floating point values (single precision)
typedef v_float32x16 v_float32;
#if CV_SIMD512_64F
//! @brief Maximum available vector register capacity 64-bit floating point values (double precision)
typedef v_float64x8 v_float64;
#endif
//! @}
#define VXPREFIX(func) v512##func
} // namespace
using namespace CV__SIMD_NAMESPACE;
#elif CV_SIMD256 && (!defined(CV__SIMD_FORCE_WIDTH) || CV__SIMD_FORCE_WIDTH == 256)
#define CV__SIMD_NAMESPACE simd256
namespace CV__SIMD_NAMESPACE {
#define CV_SIMD 1
#define CV_SIMD_64F CV_SIMD256_64F
#define CV_SIMD_FP16 CV_SIMD256_FP16
#define CV_SIMD_WIDTH 32
//! @addtogroup core_hal_intrin
//! @{
//! @brief Maximum available vector register capacity 8-bit unsigned integer values
typedef v_uint8x32 v_uint8;
//! @brief Maximum available vector register capacity 8-bit signed integer values
typedef v_int8x32 v_int8;
//! @brief Maximum available vector register capacity 16-bit unsigned integer values
typedef v_uint16x16 v_uint16;
//! @brief Maximum available vector register capacity 16-bit signed integer values
typedef v_int16x16 v_int16;
//! @brief Maximum available vector register capacity 32-bit unsigned integer values
typedef v_uint32x8 v_uint32;
//! @brief Maximum available vector register capacity 32-bit signed integer values
typedef v_int32x8 v_int32;
//! @brief Maximum available vector register capacity 64-bit unsigned integer values
typedef v_uint64x4 v_uint64;
//! @brief Maximum available vector register capacity 64-bit signed integer values
typedef v_int64x4 v_int64;
//! @brief Maximum available vector register capacity 32-bit floating point values (single precision)
typedef v_float32x8 v_float32;
#if CV_SIMD256_64F
//! @brief Maximum available vector register capacity 64-bit floating point values (double precision)
typedef v_float64x4 v_float64;
#endif
//! @}
#define VXPREFIX(func) v256##func
} // namespace
using namespace CV__SIMD_NAMESPACE;
#elif (CV_SIMD128 || CV_SIMD128_CPP) && (!defined(CV__SIMD_FORCE_WIDTH) || CV__SIMD_FORCE_WIDTH == 128)
#if defined CV_SIMD128_CPP
#define CV__SIMD_NAMESPACE simd128_cpp
#else
#define CV__SIMD_NAMESPACE simd128
#endif
namespace CV__SIMD_NAMESPACE {
#define CV_SIMD CV_SIMD128
#define CV_SIMD_64F CV_SIMD128_64F
#define CV_SIMD_WIDTH 16
//! @addtogroup core_hal_intrin
//! @{
//! @brief Maximum available vector register capacity 8-bit unsigned integer values
typedef v_uint8x16 v_uint8;
//! @brief Maximum available vector register capacity 8-bit signed integer values
typedef v_int8x16 v_int8;
//! @brief Maximum available vector register capacity 16-bit unsigned integer values
typedef v_uint16x8 v_uint16;
//! @brief Maximum available vector register capacity 16-bit signed integer values
typedef v_int16x8 v_int16;
//! @brief Maximum available vector register capacity 32-bit unsigned integer values
typedef v_uint32x4 v_uint32;
//! @brief Maximum available vector register capacity 32-bit signed integer values
typedef v_int32x4 v_int32;
//! @brief Maximum available vector register capacity 64-bit unsigned integer values
typedef v_uint64x2 v_uint64;
//! @brief Maximum available vector register capacity 64-bit signed integer values
typedef v_int64x2 v_int64;
//! @brief Maximum available vector register capacity 32-bit floating point values (single precision)
typedef v_float32x4 v_float32;
#if CV_SIMD128_64F
//! @brief Maximum available vector register capacity 64-bit floating point values (double precision)
typedef v_float64x2 v_float64;
#endif
//! @}
#define VXPREFIX(func) v##func
} // namespace
using namespace CV__SIMD_NAMESPACE;
#endif
namespace CV__SIMD_NAMESPACE {
//! @addtogroup core_hal_intrin
//! @{
//! @name Wide init with value
//! @{
//! @brief Create maximum available capacity vector with elements set to a specific value
inline v_uint8 vx_setall_u8(uchar v) { return VXPREFIX(_setall_u8)(v); }
inline v_int8 vx_setall_s8(schar v) { return VXPREFIX(_setall_s8)(v); }
inline v_uint16 vx_setall_u16(ushort v) { return VXPREFIX(_setall_u16)(v); }
inline v_int16 vx_setall_s16(short v) { return VXPREFIX(_setall_s16)(v); }
inline v_int32 vx_setall_s32(int v) { return VXPREFIX(_setall_s32)(v); }
inline v_uint32 vx_setall_u32(unsigned v) { return VXPREFIX(_setall_u32)(v); }
inline v_float32 vx_setall_f32(float v) { return VXPREFIX(_setall_f32)(v); }
inline v_int64 vx_setall_s64(int64 v) { return VXPREFIX(_setall_s64)(v); }
inline v_uint64 vx_setall_u64(uint64 v) { return VXPREFIX(_setall_u64)(v); }
#if CV_SIMD_64F
inline v_float64 vx_setall_f64(double v) { return VXPREFIX(_setall_f64)(v); }
#endif
//! @}
//! @name Wide init with zero
//! @{
//! @brief Create maximum available capacity vector with elements set to zero
inline v_uint8 vx_setzero_u8() { return VXPREFIX(_setzero_u8)(); }
inline v_int8 vx_setzero_s8() { return VXPREFIX(_setzero_s8)(); }
inline v_uint16 vx_setzero_u16() { return VXPREFIX(_setzero_u16)(); }
inline v_int16 vx_setzero_s16() { return VXPREFIX(_setzero_s16)(); }
inline v_int32 vx_setzero_s32() { return VXPREFIX(_setzero_s32)(); }
inline v_uint32 vx_setzero_u32() { return VXPREFIX(_setzero_u32)(); }
inline v_float32 vx_setzero_f32() { return VXPREFIX(_setzero_f32)(); }
inline v_int64 vx_setzero_s64() { return VXPREFIX(_setzero_s64)(); }
inline v_uint64 vx_setzero_u64() { return VXPREFIX(_setzero_u64)(); }
#if CV_SIMD_64F
inline v_float64 vx_setzero_f64() { return VXPREFIX(_setzero_f64)(); }
#endif
//! @}
//! @name Wide load from memory
//! @{
//! @brief Load maximum available capacity register contents from memory
inline v_uint8 vx_load(const uchar * ptr) { return VXPREFIX(_load)(ptr); }
inline v_int8 vx_load(const schar * ptr) { return VXPREFIX(_load)(ptr); }
inline v_uint16 vx_load(const ushort * ptr) { return VXPREFIX(_load)(ptr); }
inline v_int16 vx_load(const short * ptr) { return VXPREFIX(_load)(ptr); }
inline v_int32 vx_load(const int * ptr) { return VXPREFIX(_load)(ptr); }
inline v_uint32 vx_load(const unsigned * ptr) { return VXPREFIX(_load)(ptr); }
inline v_float32 vx_load(const float * ptr) { return VXPREFIX(_load)(ptr); }
inline v_int64 vx_load(const int64 * ptr) { return VXPREFIX(_load)(ptr); }
inline v_uint64 vx_load(const uint64 * ptr) { return VXPREFIX(_load)(ptr); }
#if CV_SIMD_64F
inline v_float64 vx_load(const double * ptr) { return VXPREFIX(_load)(ptr); }
#endif
//! @}
//! @name Wide load from memory(aligned)
//! @{
//! @brief Load maximum available capacity register contents from memory(aligned)
inline v_uint8 vx_load_aligned(const uchar * ptr) { return VXPREFIX(_load_aligned)(ptr); }
inline v_int8 vx_load_aligned(const schar * ptr) { return VXPREFIX(_load_aligned)(ptr); }
inline v_uint16 vx_load_aligned(const ushort * ptr) { return VXPREFIX(_load_aligned)(ptr); }
inline v_int16 vx_load_aligned(const short * ptr) { return VXPREFIX(_load_aligned)(ptr); }
inline v_int32 vx_load_aligned(const int * ptr) { return VXPREFIX(_load_aligned)(ptr); }
inline v_uint32 vx_load_aligned(const unsigned * ptr) { return VXPREFIX(_load_aligned)(ptr); }
inline v_float32 vx_load_aligned(const float * ptr) { return VXPREFIX(_load_aligned)(ptr); }
inline v_int64 vx_load_aligned(const int64 * ptr) { return VXPREFIX(_load_aligned)(ptr); }
inline v_uint64 vx_load_aligned(const uint64 * ptr) { return VXPREFIX(_load_aligned)(ptr); }
#if CV_SIMD_64F
inline v_float64 vx_load_aligned(const double * ptr) { return VXPREFIX(_load_aligned)(ptr); }
#endif
//! @}
//! @name Wide load lower half from memory
//! @{
//! @brief Load lower half of maximum available capacity register from memory
inline v_uint8 vx_load_low(const uchar * ptr) { return VXPREFIX(_load_low)(ptr); }
inline v_int8 vx_load_low(const schar * ptr) { return VXPREFIX(_load_low)(ptr); }
inline v_uint16 vx_load_low(const ushort * ptr) { return VXPREFIX(_load_low)(ptr); }
inline v_int16 vx_load_low(const short * ptr) { return VXPREFIX(_load_low)(ptr); }
inline v_int32 vx_load_low(const int * ptr) { return VXPREFIX(_load_low)(ptr); }
inline v_uint32 vx_load_low(const unsigned * ptr) { return VXPREFIX(_load_low)(ptr); }
inline v_float32 vx_load_low(const float * ptr) { return VXPREFIX(_load_low)(ptr); }
inline v_int64 vx_load_low(const int64 * ptr) { return VXPREFIX(_load_low)(ptr); }
inline v_uint64 vx_load_low(const uint64 * ptr) { return VXPREFIX(_load_low)(ptr); }
#if CV_SIMD_64F
inline v_float64 vx_load_low(const double * ptr) { return VXPREFIX(_load_low)(ptr); }
#endif
//! @}
//! @name Wide load halfs from memory
//! @{
//! @brief Load maximum available capacity register contents from two memory blocks
inline v_uint8 vx_load_halves(const uchar * ptr0, const uchar * ptr1) { return VXPREFIX(_load_halves)(ptr0, ptr1); }
inline v_int8 vx_load_halves(const schar * ptr0, const schar * ptr1) { return VXPREFIX(_load_halves)(ptr0, ptr1); }
inline v_uint16 vx_load_halves(const ushort * ptr0, const ushort * ptr1) { return VXPREFIX(_load_halves)(ptr0, ptr1); }
inline v_int16 vx_load_halves(const short * ptr0, const short * ptr1) { return VXPREFIX(_load_halves)(ptr0, ptr1); }
inline v_int32 vx_load_halves(const int * ptr0, const int * ptr1) { return VXPREFIX(_load_halves)(ptr0, ptr1); }
inline v_uint32 vx_load_halves(const unsigned * ptr0, const unsigned * ptr1) { return VXPREFIX(_load_halves)(ptr0, ptr1); }
inline v_float32 vx_load_halves(const float * ptr0, const float * ptr1) { return VXPREFIX(_load_halves)(ptr0, ptr1); }
inline v_int64 vx_load_halves(const int64 * ptr0, const int64 * ptr1) { return VXPREFIX(_load_halves)(ptr0, ptr1); }
inline v_uint64 vx_load_halves(const uint64 * ptr0, const uint64 * ptr1) { return VXPREFIX(_load_halves)(ptr0, ptr1); }
#if CV_SIMD_64F
inline v_float64 vx_load_halves(const double * ptr0, const double * ptr1) { return VXPREFIX(_load_halves)(ptr0, ptr1); }
#endif
//! @}
//! @name Wide LUT of elements
//! @{
//! @brief Load maximum available capacity register contents with array elements by provided indexes
inline v_uint8 vx_lut(const uchar * ptr, const int* idx) { return VXPREFIX(_lut)(ptr, idx); }
inline v_int8 vx_lut(const schar * ptr, const int* idx) { return VXPREFIX(_lut)(ptr, idx); }
inline v_uint16 vx_lut(const ushort * ptr, const int* idx) { return VXPREFIX(_lut)(ptr, idx); }
inline v_int16 vx_lut(const short* ptr, const int* idx) { return VXPREFIX(_lut)(ptr, idx); }
inline v_int32 vx_lut(const int* ptr, const int* idx) { return VXPREFIX(_lut)(ptr, idx); }
inline v_uint32 vx_lut(const unsigned* ptr, const int* idx) { return VXPREFIX(_lut)(ptr, idx); }
inline v_float32 vx_lut(const float* ptr, const int* idx) { return VXPREFIX(_lut)(ptr, idx); }
inline v_int64 vx_lut(const int64 * ptr, const int* idx) { return VXPREFIX(_lut)(ptr, idx); }
inline v_uint64 vx_lut(const uint64 * ptr, const int* idx) { return VXPREFIX(_lut)(ptr, idx); }
#if CV_SIMD_64F
inline v_float64 vx_lut(const double* ptr, const int* idx) { return VXPREFIX(_lut)(ptr, idx); }
#endif
//! @}
//! @name Wide LUT of element pairs
//! @{
//! @brief Load maximum available capacity register contents with array element pairs by provided indexes
inline v_uint8 vx_lut_pairs(const uchar * ptr, const int* idx) { return VXPREFIX(_lut_pairs)(ptr, idx); }
inline v_int8 vx_lut_pairs(const schar * ptr, const int* idx) { return VXPREFIX(_lut_pairs)(ptr, idx); }
inline v_uint16 vx_lut_pairs(const ushort * ptr, const int* idx) { return VXPREFIX(_lut_pairs)(ptr, idx); }
inline v_int16 vx_lut_pairs(const short* ptr, const int* idx) { return VXPREFIX(_lut_pairs)(ptr, idx); }
inline v_int32 vx_lut_pairs(const int* ptr, const int* idx) { return VXPREFIX(_lut_pairs)(ptr, idx); }
inline v_uint32 vx_lut_pairs(const unsigned* ptr, const int* idx) { return VXPREFIX(_lut_pairs)(ptr, idx); }
inline v_float32 vx_lut_pairs(const float* ptr, const int* idx) { return VXPREFIX(_lut_pairs)(ptr, idx); }
inline v_int64 vx_lut_pairs(const int64 * ptr, const int* idx) { return VXPREFIX(_lut_pairs)(ptr, idx); }
inline v_uint64 vx_lut_pairs(const uint64 * ptr, const int* idx) { return VXPREFIX(_lut_pairs)(ptr, idx); }
#if CV_SIMD_64F
inline v_float64 vx_lut_pairs(const double* ptr, const int* idx) { return VXPREFIX(_lut_pairs)(ptr, idx); }
#endif
//! @}
//! @name Wide LUT of element quads
//! @{
//! @brief Load maximum available capacity register contents with array element quads by provided indexes
inline v_uint8 vx_lut_quads(const uchar* ptr, const int* idx) { return VXPREFIX(_lut_quads)(ptr, idx); }
inline v_int8 vx_lut_quads(const schar* ptr, const int* idx) { return VXPREFIX(_lut_quads)(ptr, idx); }
inline v_uint16 vx_lut_quads(const ushort* ptr, const int* idx) { return VXPREFIX(_lut_quads)(ptr, idx); }
inline v_int16 vx_lut_quads(const short* ptr, const int* idx) { return VXPREFIX(_lut_quads)(ptr, idx); }
inline v_int32 vx_lut_quads(const int* ptr, const int* idx) { return VXPREFIX(_lut_quads)(ptr, idx); }
inline v_uint32 vx_lut_quads(const unsigned* ptr, const int* idx) { return VXPREFIX(_lut_quads)(ptr, idx); }
inline v_float32 vx_lut_quads(const float* ptr, const int* idx) { return VXPREFIX(_lut_quads)(ptr, idx); }
//! @}
//! @name Wide load with double expansion
//! @{
//! @brief Load maximum available capacity register contents from memory with double expand
inline v_uint16 vx_load_expand(const uchar * ptr) { return VXPREFIX(_load_expand)(ptr); }
inline v_int16 vx_load_expand(const schar * ptr) { return VXPREFIX(_load_expand)(ptr); }
inline v_uint32 vx_load_expand(const ushort * ptr) { return VXPREFIX(_load_expand)(ptr); }
inline v_int32 vx_load_expand(const short* ptr) { return VXPREFIX(_load_expand)(ptr); }
inline v_int64 vx_load_expand(const int* ptr) { return VXPREFIX(_load_expand)(ptr); }
inline v_uint64 vx_load_expand(const unsigned* ptr) { return VXPREFIX(_load_expand)(ptr); }
inline v_float32 vx_load_expand(const float16_t * ptr) { return VXPREFIX(_load_expand)(ptr); }
//! @}
//! @name Wide load with quad expansion
//! @{
//! @brief Load maximum available capacity register contents from memory with quad expand
inline v_uint32 vx_load_expand_q(const uchar * ptr) { return VXPREFIX(_load_expand_q)(ptr); }
inline v_int32 vx_load_expand_q(const schar * ptr) { return VXPREFIX(_load_expand_q)(ptr); }
//! @}
/** @brief SIMD processing state cleanup call */
inline void vx_cleanup() { VXPREFIX(_cleanup)(); }
//! @cond IGNORED
// backward compatibility
template<typename _Tp, typename _Tvec> static inline
void vx_store(_Tp* dst, const _Tvec& v) { return v_store(dst, v); }
// backward compatibility
template<typename _Tp, typename _Tvec> static inline
void vx_store_aligned(_Tp* dst, const _Tvec& v) { return v_store_aligned(dst, v); }
//! @endcond
//! @}
#undef VXPREFIX
} // namespace
//! @cond IGNORED
#ifndef CV_SIMD_64F
#define CV_SIMD_64F 0
#endif
#ifndef CV_SIMD_FP16
#define CV_SIMD_FP16 0 //!< Defined to 1 on native support of operations with float16x8_t / float16x16_t (SIMD256) types
#endif
#ifndef CV_SIMD
#define CV_SIMD 0
#endif
#include "simd_utils.impl.hpp"
#ifndef CV_DOXYGEN
CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END
#endif
} // cv::
//! @endcond
#endif
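A width-agnostic sketch built on the vx_ wide aliases declared above (v_uint8 maps to 16/32/64 lanes depending on CV_SIMD_WIDTH); the function name and loop layout are assumptions for illustration:

#include "opencv2/core/hal/intrin.hpp"
using namespace cv;

// Sum of absolute differences over two byte buffers using the widest available registers.
static unsigned sadBytes(const uchar* a, const uchar* b, int n)
{
    unsigned sum = 0;
    int i = 0;
#if CV_SIMD
    const int step = v_uint8::nlanes;                 // 16 / 32 / 64 lanes
    for (; i + step <= n; i += step)
        sum += v_reduce_sad(vx_load(a + i), vx_load(b + i));
    vx_cleanup();                                     // SIMD state cleanup after the vector loop
#endif
    for (; i < n; ++i)                                // scalar tail
        sum += (unsigned)(a[i] > b[i] ? a[i] - b[i] : b[i] - a[i]);
    return sum;
}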

View File

@ -90,6 +90,50 @@ inline __m256i _v256_packs_epu32(const __m256i& a, const __m256i& b)
return _mm256_packus_epi32(am, bm);
}
template<int i>
inline int _v256_extract_epi8(const __m256i& a)
{
#if defined(CV__SIMD_HAVE_mm256_extract_epi8) || (CV_AVX2 && (!defined(_MSC_VER) || _MSC_VER >= 1910/*MSVS 2017*/))
return _mm256_extract_epi8(a, i);
#else
__m128i b = _mm256_extractf128_si256(a, ((i) >> 4));
return _mm_extract_epi8(b, i & 15); // SSE4.1
#endif
}
template<int i>
inline int _v256_extract_epi16(const __m256i& a)
{
#if defined(CV__SIMD_HAVE_mm256_extract_epi8) || (CV_AVX2 && (!defined(_MSC_VER) || _MSC_VER >= 1910/*MSVS 2017*/))
return _mm256_extract_epi16(a, i);
#else
__m128i b = _mm256_extractf128_si256(a, ((i) >> 3));
return _mm_extract_epi16(b, i & 7); // SSE2
#endif
}
template<int i>
inline int _v256_extract_epi32(const __m256i& a)
{
#if defined(CV__SIMD_HAVE_mm256_extract_epi8) || (CV_AVX2 && (!defined(_MSC_VER) || _MSC_VER >= 1910/*MSVS 2017*/))
return _mm256_extract_epi32(a, i);
#else
__m128i b = _mm256_extractf128_si256(a, ((i) >> 2));
return _mm_extract_epi32(b, i & 3); // SSE4.1
#endif
}
template<int i>
inline int64 _v256_extract_epi64(const __m256i& a)
{
#if defined(CV__SIMD_HAVE_mm256_extract_epi8) || (CV_AVX2 && (!defined(_MSC_VER) || _MSC_VER >= 1910/*MSVS 2017*/))
return _mm256_extract_epi64(a, i);
#else
__m128i b = _mm256_extractf128_si256(a, ((i) >> 1));
return _mm_extract_epi64(b, i & 1); // SSE4.1
#endif
}
///////// Types ////////////
struct v_uint8x32
@ -115,7 +159,9 @@ struct v_uint8x32
(char)v22, (char)v23, (char)v24, (char)v25, (char)v26, (char)v27,
(char)v28, (char)v29, (char)v30, (char)v31);
}
v_uint8x32() : val(_mm256_setzero_si256()) {}
/* coverity[uninit_ctor]: suppress warning */
v_uint8x32() {}
uchar get0() const { return (uchar)_v_cvtsi256_si32(val); }
};
@ -139,7 +185,9 @@ struct v_int8x32
v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20,
v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31);
}
v_int8x32() : val(_mm256_setzero_si256()) {}
/* coverity[uninit_ctor]: suppress warning */
v_int8x32() {}
schar get0() const { return (schar)_v_cvtsi256_si32(val); }
};
@ -159,7 +207,9 @@ struct v_uint16x16
(short)v4, (short)v5, (short)v6, (short)v7, (short)v8, (short)v9,
(short)v10, (short)v11, (short)v12, (short)v13, (short)v14, (short)v15);
}
v_uint16x16() : val(_mm256_setzero_si256()) {}
/* coverity[uninit_ctor]: suppress warning */
v_uint16x16() {}
ushort get0() const { return (ushort)_v_cvtsi256_si32(val); }
};
@ -178,7 +228,9 @@ struct v_int16x16
val = _mm256_setr_epi16(v0, v1, v2, v3, v4, v5, v6, v7,
v8, v9, v10, v11, v12, v13, v14, v15);
}
v_int16x16() : val(_mm256_setzero_si256()) {}
/* coverity[uninit_ctor]: suppress warning */
v_int16x16() {}
short get0() const { return (short)_v_cvtsi256_si32(val); }
};
@ -195,7 +247,9 @@ struct v_uint32x8
val = _mm256_setr_epi32((unsigned)v0, (unsigned)v1, (unsigned)v2,
(unsigned)v3, (unsigned)v4, (unsigned)v5, (unsigned)v6, (unsigned)v7);
}
v_uint32x8() : val(_mm256_setzero_si256()) {}
/* coverity[uninit_ctor]: suppress warning */
v_uint32x8() {}
unsigned get0() const { return (unsigned)_v_cvtsi256_si32(val); }
};
@ -211,7 +265,9 @@ struct v_int32x8
{
val = _mm256_setr_epi32(v0, v1, v2, v3, v4, v5, v6, v7);
}
v_int32x8() : val(_mm256_setzero_si256()) {}
/* coverity[uninit_ctor]: suppress warning */
v_int32x8() {}
int get0() const { return _v_cvtsi256_si32(val); }
};
@ -227,7 +283,9 @@ struct v_float32x8
{
val = _mm256_setr_ps(v0, v1, v2, v3, v4, v5, v6, v7);
}
v_float32x8() : val(_mm256_setzero_ps()) {}
/* coverity[uninit_ctor]: suppress warning */
v_float32x8() {}
float get0() const { return _mm_cvtss_f32(_mm256_castps256_ps128(val)); }
};
@ -240,7 +298,9 @@ struct v_uint64x4
explicit v_uint64x4(__m256i v) : val(v) {}
v_uint64x4(uint64 v0, uint64 v1, uint64 v2, uint64 v3)
{ val = _mm256_setr_epi64x((int64)v0, (int64)v1, (int64)v2, (int64)v3); }
v_uint64x4() : val(_mm256_setzero_si256()) {}
/* coverity[uninit_ctor]: suppress warning */
v_uint64x4() {}
uint64 get0() const
{
#if defined __x86_64__ || defined _M_X64
@ -262,7 +322,8 @@ struct v_int64x4
explicit v_int64x4(__m256i v) : val(v) {}
v_int64x4(int64 v0, int64 v1, int64 v2, int64 v3)
{ val = _mm256_setr_epi64x(v0, v1, v2, v3); }
v_int64x4() : val(_mm256_setzero_si256()) {}
/* coverity[uninit_ctor]: suppress warning */
v_int64x4() {}
int64 get0() const
{
@ -285,7 +346,9 @@ struct v_float64x4
explicit v_float64x4(__m256d v) : val(v) {}
v_float64x4(double v0, double v1, double v2, double v3)
{ val = _mm256_setr_pd(v0, v1, v2, v3); }
v_float64x4() : val(_mm256_setzero_pd()) {}
/* coverity[uninit_ctor]: suppress warning */
v_float64x4() {}
double get0() const { return _mm_cvtsd_f64(_mm256_castpd256_pd128(val)); }
};
@ -431,19 +494,6 @@ inline v_float64x4 v_reinterpret_as_f64(const v_float64x4& a)
inline v_float64x4 v_reinterpret_as_f64(const v_float32x8& a)
{ return v_float64x4(_mm256_castps_pd(a.val)); }
#if CV_FP16
inline v_float32x8 v256_load_fp16_f32(const short* ptr)
{
return v_float32x8(_mm256_cvtph_ps(_mm_loadu_si128((const __m128i*)ptr)));
}
inline void v_store_fp16(short* ptr, const v_float32x8& a)
{
__m128i fp16_value = _mm256_cvtps_ph(a.val, 0);
_mm_store_si128((__m128i*)ptr, fp16_value);
}
#endif
/* Recombine */
/*#define OPENCV_HAL_IMPL_AVX_COMBINE(_Tpvec, perm) \
inline _Tpvec v_combine_low(const _Tpvec& a, const _Tpvec& b) \
@ -538,7 +588,7 @@ inline v_int64x4 v256_blend(const v_int64x4& a, const v_int64x4& b)
{ return v_int64x4(v256_blend<m>(v_uint64x4(a.val), v_uint64x4(b.val)).val); }
// shuffle
// todo: emluate 64bit
// todo: emulate 64bit
#define OPENCV_HAL_IMPL_AVX_SHUFFLE(_Tpvec, intrin) \
template<int m> \
inline _Tpvec v256_shuffle(const _Tpvec& a) \
@ -1025,9 +1075,85 @@ OPENCV_HAL_IMPL_AVX_ROTATE_CAST(v_rotate_right, v_float32x8, _mm256_castsi256_ps
OPENCV_HAL_IMPL_AVX_ROTATE_CAST(v_rotate_left, v_float64x4, _mm256_castsi256_pd)
OPENCV_HAL_IMPL_AVX_ROTATE_CAST(v_rotate_right, v_float64x4, _mm256_castsi256_pd)
/** Reverse **/
inline v_uint8x32 v_reverse(const v_uint8x32 &a)
{
static const __m256i perm = _mm256_setr_epi8(
15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0,
15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
__m256i vec = _mm256_shuffle_epi8(a.val, perm);
return v_uint8x32(_mm256_permute2x128_si256(vec, vec, 1));
}
inline v_int8x32 v_reverse(const v_int8x32 &a)
{ return v_reinterpret_as_s8(v_reverse(v_reinterpret_as_u8(a))); }
inline v_uint16x16 v_reverse(const v_uint16x16 &a)
{
static const __m256i perm = _mm256_setr_epi8(
14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1,
14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1);
__m256i vec = _mm256_shuffle_epi8(a.val, perm);
return v_uint16x16(_mm256_permute2x128_si256(vec, vec, 1));
}
inline v_int16x16 v_reverse(const v_int16x16 &a)
{ return v_reinterpret_as_s16(v_reverse(v_reinterpret_as_u16(a))); }
inline v_uint32x8 v_reverse(const v_uint32x8 &a)
{
static const __m256i perm = _mm256_setr_epi32(7, 6, 5, 4, 3, 2, 1, 0);
return v_uint32x8(_mm256_permutevar8x32_epi32(a.val, perm));
}
inline v_int32x8 v_reverse(const v_int32x8 &a)
{ return v_reinterpret_as_s32(v_reverse(v_reinterpret_as_u32(a))); }
inline v_float32x8 v_reverse(const v_float32x8 &a)
{ return v_reinterpret_as_f32(v_reverse(v_reinterpret_as_u32(a))); }
inline v_uint64x4 v_reverse(const v_uint64x4 &a)
{
return v_uint64x4(_mm256_permute4x64_epi64(a.val, _MM_SHUFFLE(0, 1, 2, 3)));
}
inline v_int64x4 v_reverse(const v_int64x4 &a)
{ return v_reinterpret_as_s64(v_reverse(v_reinterpret_as_u64(a))); }
inline v_float64x4 v_reverse(const v_float64x4 &a)
{ return v_reinterpret_as_f64(v_reverse(v_reinterpret_as_u64(a))); }
////////// Reduce and mask /////////
/** Reduce **/
inline unsigned v_reduce_sum(const v_uint8x32& a)
{
__m256i half = _mm256_sad_epu8(a.val, _mm256_setzero_si256());
__m128i quarter = _mm_add_epi32(_v256_extract_low(half), _v256_extract_high(half));
return (unsigned)_mm_cvtsi128_si32(_mm_add_epi32(quarter, _mm_unpackhi_epi64(quarter, quarter)));
}
inline int v_reduce_sum(const v_int8x32& a)
{
__m256i half = _mm256_sad_epu8(_mm256_xor_si256(a.val, _mm256_set1_epi8((schar)-128)), _mm256_setzero_si256());
__m128i quarter = _mm_add_epi32(_v256_extract_low(half), _v256_extract_high(half));
return (unsigned)_mm_cvtsi128_si32(_mm_add_epi32(quarter, _mm_unpackhi_epi64(quarter, quarter))) - 4096;
}
#define OPENCV_HAL_IMPL_AVX_REDUCE_32(_Tpvec, sctype, func, intrin) \
inline sctype v_reduce_##func(const _Tpvec& a) \
{ \
__m128i val = intrin(_v256_extract_low(a.val), _v256_extract_high(a.val)); \
val = intrin(val, _mm_srli_si128(val,8)); \
val = intrin(val, _mm_srli_si128(val,4)); \
val = intrin(val, _mm_srli_si128(val,2)); \
val = intrin(val, _mm_srli_si128(val,1)); \
return (sctype)_mm_cvtsi128_si32(val); \
}
OPENCV_HAL_IMPL_AVX_REDUCE_32(v_uint8x32, uchar, min, _mm_min_epu8)
OPENCV_HAL_IMPL_AVX_REDUCE_32(v_int8x32, schar, min, _mm_min_epi8)
OPENCV_HAL_IMPL_AVX_REDUCE_32(v_uint8x32, uchar, max, _mm_max_epu8)
OPENCV_HAL_IMPL_AVX_REDUCE_32(v_int8x32, schar, max, _mm_max_epi8)
#define OPENCV_HAL_IMPL_AVX_REDUCE_16(_Tpvec, sctype, func, intrin) \
inline sctype v_reduce_##func(const _Tpvec& a) \
{ \
@ -1068,38 +1194,13 @@ OPENCV_HAL_IMPL_AVX_REDUCE_8(v_int32x8, int, max, _mm_max_epi32)
__m128 v1 = _v256_extract_high(a.val); \
v0 = intrin(v0, v1); \
v0 = intrin(v0, _mm_permute_ps(v0, _MM_SHUFFLE(0, 0, 3, 2))); \
v0 = intrin(v0, _mm_permute_ps(v0, _MM_SHUFFLE(0, 0, 0, 3))); \
v0 = intrin(v0, _mm_permute_ps(v0, _MM_SHUFFLE(0, 0, 0, 1))); \
return _mm_cvtss_f32(v0); \
}
OPENCV_HAL_IMPL_AVX_REDUCE_FLT(min, _mm_min_ps)
OPENCV_HAL_IMPL_AVX_REDUCE_FLT(max, _mm_max_ps)
inline ushort v_reduce_sum(const v_uint16x16& a)
{
__m128i a0 = _v256_extract_low(a.val);
__m128i a1 = _v256_extract_high(a.val);
__m128i s0 = _mm_adds_epu16(a0, a1);
s0 = _mm_adds_epu16(s0, _mm_srli_si128(s0, 8));
s0 = _mm_adds_epu16(s0, _mm_srli_si128(s0, 4));
s0 = _mm_adds_epu16(s0, _mm_srli_si128(s0, 2));
return (ushort)_mm_cvtsi128_si32(s0);
}
inline short v_reduce_sum(const v_int16x16& a)
{
__m256i s0 = _mm256_hadds_epi16(a.val, a.val);
s0 = _mm256_hadds_epi16(s0, s0);
s0 = _mm256_hadds_epi16(s0, s0);
__m128i s1 = _v256_extract_high(s0);
s1 = _mm_adds_epi16(_v256_extract_low(s0), s1);
return (short)_mm_cvtsi128_si32(s1);
}
inline int v_reduce_sum(const v_int32x8& a)
{
__m256i s0 = _mm256_hadd_epi32(a.val, a.val);
@ -1114,6 +1215,11 @@ inline int v_reduce_sum(const v_int32x8& a)
inline unsigned v_reduce_sum(const v_uint32x8& a)
{ return v_reduce_sum(v_reinterpret_as_s32(a)); }
inline int v_reduce_sum(const v_int16x16& a)
{ return v_reduce_sum(v_expand_low(a) + v_expand_high(a)); }
inline unsigned v_reduce_sum(const v_uint16x16& a)
{ return v_reduce_sum(v_expand_low(a) + v_expand_high(a)); }
inline float v_reduce_sum(const v_float32x8& a)
{
__m256 s0 = _mm256_hadd_ps(a.val, a.val);
@ -1125,6 +1231,18 @@ inline float v_reduce_sum(const v_float32x8& a)
return _mm_cvtss_f32(s1);
}
inline uint64 v_reduce_sum(const v_uint64x4& a)
{
uint64 CV_DECL_ALIGNED(32) idx[2];
_mm_store_si128((__m128i*)idx, _mm_add_epi64(_v256_extract_low(a.val), _v256_extract_high(a.val)));
return idx[0] + idx[1];
}
inline int64 v_reduce_sum(const v_int64x4& a)
{
int64 CV_DECL_ALIGNED(32) idx[2];
_mm_store_si128((__m128i*)idx, _mm_add_epi64(_v256_extract_low(a.val), _v256_extract_high(a.val)));
return idx[0] + idx[1];
}
inline double v_reduce_sum(const v_float64x4& a)
{
__m256d s0 = _mm256_hadd_pd(a.val, a.val);
@ -1141,12 +1259,16 @@ inline v_float32x8 v_reduce_sum4(const v_float32x8& a, const v_float32x8& b,
inline unsigned v_reduce_sad(const v_uint8x32& a, const v_uint8x32& b)
{
return (unsigned)_v_cvtsi256_si32(_mm256_sad_epu8(a.val, b.val));
__m256i half = _mm256_sad_epu8(a.val, b.val);
__m128i quarter = _mm_add_epi32(_v256_extract_low(half), _v256_extract_high(half));
return (unsigned)_mm_cvtsi128_si32(_mm_add_epi32(quarter, _mm_unpackhi_epi64(quarter, quarter)));
}
inline unsigned v_reduce_sad(const v_int8x32& a, const v_int8x32& b)
{
__m256i half = _mm256_set1_epi8(0x7f);
return (unsigned)_v_cvtsi256_si32(_mm256_sad_epu8(_mm256_add_epi8(a.val, half), _mm256_add_epi8(b.val, half)));
half = _mm256_sad_epu8(_mm256_add_epi8(a.val, half), _mm256_add_epi8(b.val, half));
__m128i quarter = _mm_add_epi32(_v256_extract_low(half), _v256_extract_high(half));
return (unsigned)_mm_cvtsi128_si32(_mm_add_epi32(quarter, _mm_unpackhi_epi64(quarter, quarter)));
}
inline unsigned v_reduce_sad(const v_uint16x16& a, const v_uint16x16& b)
{
@ -1175,26 +1297,39 @@ inline float v_reduce_sad(const v_float32x8& a, const v_float32x8& b)
}
/** Popcount **/
#define OPENCV_HAL_IMPL_AVX_POPCOUNT(_Tpvec) \
inline v_uint32x8 v_popcount(const _Tpvec& a) \
{ \
const v_uint32x8 m1 = v256_setall_u32(0x55555555); \
const v_uint32x8 m2 = v256_setall_u32(0x33333333); \
const v_uint32x8 m4 = v256_setall_u32(0x0f0f0f0f); \
v_uint32x8 p = v_reinterpret_as_u32(a); \
p = ((p >> 1) & m1) + (p & m1); \
p = ((p >> 2) & m2) + (p & m2); \
p = ((p >> 4) & m4) + (p & m4); \
p.val = _mm256_sad_epu8(p.val, _mm256_setzero_si256()); \
return p; \
}
OPENCV_HAL_IMPL_AVX_POPCOUNT(v_uint8x32)
OPENCV_HAL_IMPL_AVX_POPCOUNT(v_int8x32)
OPENCV_HAL_IMPL_AVX_POPCOUNT(v_uint16x16)
OPENCV_HAL_IMPL_AVX_POPCOUNT(v_int16x16)
OPENCV_HAL_IMPL_AVX_POPCOUNT(v_uint32x8)
OPENCV_HAL_IMPL_AVX_POPCOUNT(v_int32x8)
inline v_uint8x32 v_popcount(const v_uint8x32& a)
{
__m256i _popcnt_table = _mm256_setr_epi8(0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4);
__m256i _popcnt_mask = _mm256_set1_epi8(0x0F);
return v_uint8x32(_mm256_add_epi8(_mm256_shuffle_epi8(_popcnt_table, _mm256_and_si256( a.val , _popcnt_mask)),
_mm256_shuffle_epi8(_popcnt_table, _mm256_and_si256(_mm256_srli_epi16(a.val, 4), _popcnt_mask))));
}
inline v_uint16x16 v_popcount(const v_uint16x16& a)
{
v_uint8x32 p = v_popcount(v_reinterpret_as_u8(a));
p += v_rotate_right<1>(p);
return v_reinterpret_as_u16(p) & v256_setall_u16(0x00ff);
}
inline v_uint32x8 v_popcount(const v_uint32x8& a)
{
v_uint8x32 p = v_popcount(v_reinterpret_as_u8(a));
p += v_rotate_right<1>(p);
p += v_rotate_right<2>(p);
return v_reinterpret_as_u32(p) & v256_setall_u32(0x000000ff);
}
inline v_uint64x4 v_popcount(const v_uint64x4& a)
{
return v_uint64x4(_mm256_sad_epu8(v_popcount(v_reinterpret_as_u8(a)).val, _mm256_setzero_si256()));
}
inline v_uint8x32 v_popcount(const v_int8x32& a)
{ return v_popcount(v_reinterpret_as_u8(a)); }
inline v_uint16x16 v_popcount(const v_int16x16& a)
{ return v_popcount(v_reinterpret_as_u16(a)); }
inline v_uint32x8 v_popcount(const v_int32x8& a)
{ return v_popcount(v_reinterpret_as_u32(a)); }
inline v_uint64x4 v_popcount(const v_int64x4& a)
{ return v_popcount(v_reinterpret_as_u64(a)); }
/** Mask **/
inline int v_signmask(const v_int8x32& a)
@ -1203,62 +1338,54 @@ inline int v_signmask(const v_uint8x32& a)
{ return v_signmask(v_reinterpret_as_s8(a)); }
inline int v_signmask(const v_int16x16& a)
{
v_int8x32 v = v_int8x32(_mm256_packs_epi16(a.val, a.val));
return v_signmask(v) & 255;
}
{ return v_signmask(v_pack(a, a)) & 0xFFFF; }
inline int v_signmask(const v_uint16x16& a)
{ return v_signmask(v_reinterpret_as_s16(a)); }
inline int v_signmask(const v_int32x8& a)
{
__m256i a16 = _mm256_packs_epi32(a.val, a.val);
v_int8x32 v = v_int8x32(_mm256_packs_epi16(a16, a16));
return v_signmask(v) & 15;
}
inline int v_signmask(const v_uint32x8& a)
{ return v_signmask(v_reinterpret_as_s32(a)); }
inline int v_signmask(const v_float32x8& a)
{ return _mm256_movemask_ps(a.val); }
inline int v_signmask(const v_float64x4& a)
{ return _mm256_movemask_pd(a.val); }
inline int v_signmask(const v_int32x8& a)
{ return v_signmask(v_reinterpret_as_f32(a)); }
inline int v_signmask(const v_uint32x8& a)
{ return v_signmask(v_reinterpret_as_f32(a)); }
inline int v_signmask(const v_int64x4& a)
{ return v_signmask(v_reinterpret_as_f64(a)); }
inline int v_signmask(const v_uint64x4& a)
{ return v_signmask(v_reinterpret_as_f64(a)); }
inline int v_scan_forward(const v_int8x32& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))); }
inline int v_scan_forward(const v_uint8x32& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))); }
inline int v_scan_forward(const v_int16x16& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 2; }
inline int v_scan_forward(const v_uint16x16& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 2; }
inline int v_scan_forward(const v_int32x8& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 4; }
inline int v_scan_forward(const v_uint32x8& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 4; }
inline int v_scan_forward(const v_float32x8& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 4; }
inline int v_scan_forward(const v_int64x4& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 8; }
inline int v_scan_forward(const v_uint64x4& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 8; }
inline int v_scan_forward(const v_float64x4& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 8; }
/** Checks **/
#define OPENCV_HAL_IMPL_AVX_CHECK(_Tpvec, and_op, allmask) \
inline bool v_check_all(const _Tpvec& a) \
{ \
int mask = v_signmask(v_reinterpret_as_s8(a)); \
return and_op(mask, allmask) == allmask; \
} \
inline bool v_check_any(const _Tpvec& a) \
{ \
int mask = v_signmask(v_reinterpret_as_s8(a)); \
return and_op(mask, allmask) != 0; \
}
OPENCV_HAL_IMPL_AVX_CHECK(v_uint8x32, OPENCV_HAL_1ST, -1)
OPENCV_HAL_IMPL_AVX_CHECK(v_int8x32, OPENCV_HAL_1ST, -1)
OPENCV_HAL_IMPL_AVX_CHECK(v_uint16x16, OPENCV_HAL_AND, (int)0xaaaa)
OPENCV_HAL_IMPL_AVX_CHECK(v_int16x16, OPENCV_HAL_AND, (int)0xaaaa)
OPENCV_HAL_IMPL_AVX_CHECK(v_uint32x8, OPENCV_HAL_AND, (int)0x8888)
OPENCV_HAL_IMPL_AVX_CHECK(v_int32x8, OPENCV_HAL_AND, (int)0x8888)
#define OPENCV_HAL_IMPL_AVX_CHECK_FLT(_Tpvec, allmask) \
inline bool v_check_all(const _Tpvec& a) \
{ \
int mask = v_signmask(a); \
return mask == allmask; \
} \
inline bool v_check_any(const _Tpvec& a) \
{ \
int mask = v_signmask(a); \
return mask != 0; \
}
OPENCV_HAL_IMPL_AVX_CHECK_FLT(v_float32x8, 255)
OPENCV_HAL_IMPL_AVX_CHECK_FLT(v_float64x4, 15)
#define OPENCV_HAL_IMPL_AVX_CHECK(_Tpvec, allmask) \
inline bool v_check_all(const _Tpvec& a) { return v_signmask(a) == allmask; } \
inline bool v_check_any(const _Tpvec& a) { return v_signmask(a) != 0; }
OPENCV_HAL_IMPL_AVX_CHECK(v_uint8x32, -1)
OPENCV_HAL_IMPL_AVX_CHECK(v_int8x32, -1)
OPENCV_HAL_IMPL_AVX_CHECK(v_uint32x8, 255)
OPENCV_HAL_IMPL_AVX_CHECK(v_int32x8, 255)
OPENCV_HAL_IMPL_AVX_CHECK(v_uint64x4, 15)
OPENCV_HAL_IMPL_AVX_CHECK(v_int64x4, 15)
OPENCV_HAL_IMPL_AVX_CHECK(v_float32x8, 255)
OPENCV_HAL_IMPL_AVX_CHECK(v_float64x4, 15)
#define OPENCV_HAL_IMPL_AVX_CHECK_SHORT(_Tpvec) \
inline bool v_check_all(const _Tpvec& a) { return (v_signmask(v_reinterpret_as_s8(a)) & 0xaaaaaaaa) == 0xaaaaaaaa; } \
inline bool v_check_any(const _Tpvec& a) { return (v_signmask(v_reinterpret_as_s8(a)) & 0xaaaaaaaa) != 0; }
OPENCV_HAL_IMPL_AVX_CHECK_SHORT(v_uint16x16)
OPENCV_HAL_IMPL_AVX_CHECK_SHORT(v_int16x16)
////////// Other math /////////
@ -1400,7 +1527,7 @@ inline v_float32x8 v_cvt_f32(const v_float64x4& a)
inline v_float32x8 v_cvt_f32(const v_float64x4& a, const v_float64x4& b)
{
__m128 af = _mm256_cvtpd_ps(a.val), bf = _mm256_cvtpd_ps(b.val);
return v_float32x8(_mm256_insertf128_ps(_mm256_castps128_ps256(af), bf, 1));
return v_float32x8(_v256_combine(af, bf));
}
inline v_float64x4 v_cvt_f64(const v_int32x8& a)
@ -1415,6 +1542,28 @@ inline v_float64x4 v_cvt_f64(const v_float32x8& a)
inline v_float64x4 v_cvt_f64_high(const v_float32x8& a)
{ return v_float64x4(_mm256_cvtps_pd(_v256_extract_high(a.val))); }
// from (Mysticial and wim) https://stackoverflow.com/q/41144668
inline v_float64x4 v_cvt_f64(const v_int64x4& v)
{
// constants encoded as floating-point
__m256i magic_i_lo = _mm256_set1_epi64x(0x4330000000000000); // 2^52
__m256i magic_i_hi32 = _mm256_set1_epi64x(0x4530000080000000); // 2^84 + 2^63
__m256i magic_i_all = _mm256_set1_epi64x(0x4530000080100000); // 2^84 + 2^63 + 2^52
__m256d magic_d_all = _mm256_castsi256_pd(magic_i_all);
// Blend the 32 least significant bits of v with magic_i_lo
__m256i v_lo = _mm256_blend_epi32(magic_i_lo, v.val, 0x55);
// Extract the 32 most significant bits of v
__m256i v_hi = _mm256_srli_epi64(v.val, 32);
// Flip the msb of v_hi and blend with 0x45300000
v_hi = _mm256_xor_si256(v_hi, magic_i_hi32);
// Compute in double precision
__m256d v_hi_dbl = _mm256_sub_pd(_mm256_castsi256_pd(v_hi), magic_d_all);
// (v_hi - magic_d_all) + v_lo; do not assume associativity of floating-point addition
__m256d result = _mm256_add_pd(v_hi_dbl, _mm256_castsi256_pd(v_lo));
return v_float64x4(result);
}
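// A minimal scalar sketch of the same magic-constant conversion, assuming
// IEEE-754 binary64 doubles; the helper name below is hypothetical and the
// function is purely illustrative, it is not used anywhere in this header.
//   lo     = 2^52 + low32(v)                                    (exact)
//   hi     = 2^84 + (high32(v) ^ 2^31) * 2^32                   (exact)
//   result = (hi - (2^84 + 2^63 + 2^52)) + lo  ==  (double)v
static inline double int64_to_double_sketch(int64 v)
{
    // bit-level punning through a union, mirroring the v_extract_n helpers above
    union { uint64 u; double d; } lo, hi, magic;
    lo.u    = 0x4330000000000000ULL | (uint64)(unsigned)v;       // 2^52 | low32(v)
    hi.u    = 0x4530000080000000ULL ^ ((uint64)v >> 32);         // (2^84 + 2^63) ^ high32(v)
    magic.u = 0x4530000080100000ULL;                             // 2^84 + 2^63 + 2^52
    return (hi.d - magic.d) + lo.d;  // one rounding step, same result as a plain int64 -> double cast
}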
////////////// Lookup table access ////////////////////
inline v_int8x32 v256_lut(const schar* tab, const int* idx)
@ -1474,7 +1623,7 @@ inline v_int32x8 v256_lut_pairs(const int* tab, const int* idx)
}
inline v_int32x8 v256_lut_quads(const int* tab, const int* idx)
{
return v_int32x8(_mm256_insertf128_si256(_mm256_castsi128_si256(_mm_loadu_si128((const __m128i*)(tab + idx[0]))), _mm_loadu_si128((const __m128i*)(tab + idx[1])), 0x1));
return v_int32x8(_v256_combine(_mm_loadu_si128((const __m128i*)(tab + idx[0])), _mm_loadu_si128((const __m128i*)(tab + idx[1]))));
}
inline v_uint32x8 v256_lut(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v256_lut((const int *)tab, idx)); }
inline v_uint32x8 v256_lut_pairs(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v256_lut_pairs((const int *)tab, idx)); }
@ -1490,7 +1639,7 @@ inline v_int64x4 v256_lut(const int64* tab, const int* idx)
}
inline v_int64x4 v256_lut_pairs(const int64* tab, const int* idx)
{
return v_int64x4(_mm256_insertf128_si256(_mm256_castsi128_si256(_mm_loadu_si128((const __m128i*)(tab + idx[0]))), _mm_loadu_si128((const __m128i*)(tab + idx[1])), 0x1));
return v_int64x4(_v256_combine(_mm_loadu_si128((const __m128i*)(tab + idx[0])), _mm_loadu_si128((const __m128i*)(tab + idx[1]))));
}
inline v_uint64x4 v256_lut(const uint64* tab, const int* idx) { return v_reinterpret_as_u64(v256_lut((const int64 *)tab, idx)); }
inline v_uint64x4 v256_lut_pairs(const uint64* tab, const int* idx) { return v_reinterpret_as_u64(v256_lut_pairs((const int64 *)tab, idx)); }
@ -1506,7 +1655,7 @@ inline v_float64x4 v256_lut(const double* tab, const int* idx)
{
return v_float64x4(_mm256_i32gather_pd(tab, _mm_loadu_si128((const __m128i*)idx), 8));
}
inline v_float64x4 v256_lut_pairs(const double* tab, const int* idx) { return v_float64x4(_mm256_insertf128_pd(_mm256_castpd128_pd256(_mm_loadu_pd(tab + idx[0])), _mm_loadu_pd(tab + idx[1]), 0x1)); }
inline v_float64x4 v256_lut_pairs(const double* tab, const int* idx) { return v_float64x4(_v256_combine(_mm_loadu_pd(tab + idx[0]), _mm_loadu_pd(tab + idx[1]))); }
inline v_int32x8 v_lut(const int* tab, const v_int32x8& idxvec)
{
@ -1622,12 +1771,165 @@ inline v_float32x8 v_pack_triplets(const v_float32x8& vec)
////////// Matrix operations /////////
//////// Dot Product ////////
// 16 >> 32
inline v_int32x8 v_dotprod(const v_int16x16& a, const v_int16x16& b)
{ return v_int32x8(_mm256_madd_epi16(a.val, b.val)); }
inline v_int32x8 v_dotprod(const v_int16x16& a, const v_int16x16& b, const v_int32x8& c)
{ return v_dotprod(a, b) + c; }
// 32 >> 64
inline v_int64x4 v_dotprod(const v_int32x8& a, const v_int32x8& b)
{
__m256i even = _mm256_mul_epi32(a.val, b.val);
__m256i odd = _mm256_mul_epi32(_mm256_srli_epi64(a.val, 32), _mm256_srli_epi64(b.val, 32));
return v_int64x4(_mm256_add_epi64(even, odd));
}
inline v_int64x4 v_dotprod(const v_int32x8& a, const v_int32x8& b, const v_int64x4& c)
{ return v_dotprod(a, b) + c; }
// 8 >> 32
inline v_uint32x8 v_dotprod_expand(const v_uint8x32& a, const v_uint8x32& b)
{
__m256i even_m = _mm256_set1_epi32(0xFF00FF00);
__m256i even_a = _mm256_blendv_epi8(a.val, _mm256_setzero_si256(), even_m);
__m256i odd_a = _mm256_srli_epi16(a.val, 8);
__m256i even_b = _mm256_blendv_epi8(b.val, _mm256_setzero_si256(), even_m);
__m256i odd_b = _mm256_srli_epi16(b.val, 8);
__m256i prod0 = _mm256_madd_epi16(even_a, even_b);
__m256i prod1 = _mm256_madd_epi16(odd_a, odd_b);
return v_uint32x8(_mm256_add_epi32(prod0, prod1));
}
inline v_uint32x8 v_dotprod_expand(const v_uint8x32& a, const v_uint8x32& b, const v_uint32x8& c)
{ return v_dotprod_expand(a, b) + c; }
inline v_int32x8 v_dotprod_expand(const v_int8x32& a, const v_int8x32& b)
{
__m256i even_a = _mm256_srai_epi16(_mm256_bslli_epi128(a.val, 1), 8);
__m256i odd_a = _mm256_srai_epi16(a.val, 8);
__m256i even_b = _mm256_srai_epi16(_mm256_bslli_epi128(b.val, 1), 8);
__m256i odd_b = _mm256_srai_epi16(b.val, 8);
__m256i prod0 = _mm256_madd_epi16(even_a, even_b);
__m256i prod1 = _mm256_madd_epi16(odd_a, odd_b);
return v_int32x8(_mm256_add_epi32(prod0, prod1));
}
inline v_int32x8 v_dotprod_expand(const v_int8x32& a, const v_int8x32& b, const v_int32x8& c)
{ return v_dotprod_expand(a, b) + c; }
// 16 >> 64
inline v_uint64x4 v_dotprod_expand(const v_uint16x16& a, const v_uint16x16& b)
{
__m256i mullo = _mm256_mullo_epi16(a.val, b.val);
__m256i mulhi = _mm256_mulhi_epu16(a.val, b.val);
__m256i mul0 = _mm256_unpacklo_epi16(mullo, mulhi);
__m256i mul1 = _mm256_unpackhi_epi16(mullo, mulhi);
__m256i p02 = _mm256_blend_epi32(mul0, _mm256_setzero_si256(), 0xAA);
__m256i p13 = _mm256_srli_epi64(mul0, 32);
__m256i p46 = _mm256_blend_epi32(mul1, _mm256_setzero_si256(), 0xAA);
__m256i p57 = _mm256_srli_epi64(mul1, 32);
__m256i p15_ = _mm256_add_epi64(p02, p13);
__m256i p9d_ = _mm256_add_epi64(p46, p57);
return v_uint64x4(_mm256_add_epi64(
_mm256_unpacklo_epi64(p15_, p9d_),
_mm256_unpackhi_epi64(p15_, p9d_)
));
}
inline v_uint64x4 v_dotprod_expand(const v_uint16x16& a, const v_uint16x16& b, const v_uint64x4& c)
{ return v_dotprod_expand(a, b) + c; }
inline v_int64x4 v_dotprod_expand(const v_int16x16& a, const v_int16x16& b)
{
__m256i prod = _mm256_madd_epi16(a.val, b.val);
__m256i sign = _mm256_srai_epi32(prod, 31);
__m256i lo = _mm256_unpacklo_epi32(prod, sign);
__m256i hi = _mm256_unpackhi_epi32(prod, sign);
return v_int64x4(_mm256_add_epi64(
_mm256_unpacklo_epi64(lo, hi),
_mm256_unpackhi_epi64(lo, hi)
));
}
inline v_int64x4 v_dotprod_expand(const v_int16x16& a, const v_int16x16& b, const v_int64x4& c)
{ return v_dotprod_expand(a, b) + c; }
// 32 >> 64f
inline v_float64x4 v_dotprod_expand(const v_int32x8& a, const v_int32x8& b)
{ return v_cvt_f64(v_dotprod(a, b)); }
inline v_float64x4 v_dotprod_expand(const v_int32x8& a, const v_int32x8& b, const v_float64x4& c)
{ return v_dotprod_expand(a, b) + c; }
//////// Fast Dot Product ////////
// 16 >> 32
inline v_int32x8 v_dotprod_fast(const v_int16x16& a, const v_int16x16& b)
{ return v_dotprod(a, b); }
inline v_int32x8 v_dotprod_fast(const v_int16x16& a, const v_int16x16& b, const v_int32x8& c)
{ return v_dotprod(a, b, c); }
// 32 >> 64
inline v_int64x4 v_dotprod_fast(const v_int32x8& a, const v_int32x8& b)
{ return v_dotprod(a, b); }
inline v_int64x4 v_dotprod_fast(const v_int32x8& a, const v_int32x8& b, const v_int64x4& c)
{ return v_dotprod(a, b, c); }
// 8 >> 32
inline v_uint32x8 v_dotprod_expand_fast(const v_uint8x32& a, const v_uint8x32& b)
{ return v_dotprod_expand(a, b); }
inline v_uint32x8 v_dotprod_expand_fast(const v_uint8x32& a, const v_uint8x32& b, const v_uint32x8& c)
{ return v_dotprod_expand(a, b, c); }
inline v_int32x8 v_dotprod_expand_fast(const v_int8x32& a, const v_int8x32& b)
{ return v_dotprod_expand(a, b); }
inline v_int32x8 v_dotprod_expand_fast(const v_int8x32& a, const v_int8x32& b, const v_int32x8& c)
{ return v_dotprod_expand(a, b, c); }
// 16 >> 64
inline v_uint64x4 v_dotprod_expand_fast(const v_uint16x16& a, const v_uint16x16& b)
{
__m256i mullo = _mm256_mullo_epi16(a.val, b.val);
__m256i mulhi = _mm256_mulhi_epu16(a.val, b.val);
__m256i mul0 = _mm256_unpacklo_epi16(mullo, mulhi);
__m256i mul1 = _mm256_unpackhi_epi16(mullo, mulhi);
__m256i p02 = _mm256_blend_epi32(mul0, _mm256_setzero_si256(), 0xAA);
__m256i p13 = _mm256_srli_epi64(mul0, 32);
__m256i p46 = _mm256_blend_epi32(mul1, _mm256_setzero_si256(), 0xAA);
__m256i p57 = _mm256_srli_epi64(mul1, 32);
__m256i p15_ = _mm256_add_epi64(p02, p13);
__m256i p9d_ = _mm256_add_epi64(p46, p57);
return v_uint64x4(_mm256_add_epi64(p15_, p9d_));
}
inline v_uint64x4 v_dotprod_expand_fast(const v_uint16x16& a, const v_uint16x16& b, const v_uint64x4& c)
{ return v_dotprod_expand_fast(a, b) + c; }
inline v_int64x4 v_dotprod_expand_fast(const v_int16x16& a, const v_int16x16& b)
{
__m256i prod = _mm256_madd_epi16(a.val, b.val);
__m256i sign = _mm256_srai_epi32(prod, 31);
__m256i lo = _mm256_unpacklo_epi32(prod, sign);
__m256i hi = _mm256_unpackhi_epi32(prod, sign);
return v_int64x4(_mm256_add_epi64(lo, hi));
}
inline v_int64x4 v_dotprod_expand_fast(const v_int16x16& a, const v_int16x16& b, const v_int64x4& c)
{ return v_dotprod_expand_fast(a, b) + c; }
// 32 >> 64f
inline v_float64x4 v_dotprod_expand_fast(const v_int32x8& a, const v_int32x8& b)
{ return v_dotprod_expand(a, b); }
inline v_float64x4 v_dotprod_expand_fast(const v_int32x8& a, const v_int32x8& b, const v_float64x4& c)
{ return v_dotprod_expand(a, b, c); }
#define OPENCV_HAL_AVX_SPLAT2_PS(a, im) \
v_float32x8(_mm256_permute_ps(a.val, _MM_SHUFFLE(im, im, im, im)))
@ -1956,6 +2258,85 @@ OPENCV_HAL_IMPL_AVX_EXTRACT(v_int64x4)
OPENCV_HAL_IMPL_AVX_EXTRACT(v_float32x8)
OPENCV_HAL_IMPL_AVX_EXTRACT(v_float64x4)
template<int i>
inline uchar v_extract_n(v_uint8x32 a)
{
return (uchar)_v256_extract_epi8<i>(a.val);
}
template<int i>
inline schar v_extract_n(v_int8x32 a)
{
return (schar)v_extract_n<i>(v_reinterpret_as_u8(a));
}
template<int i>
inline ushort v_extract_n(v_uint16x16 a)
{
return (ushort)_v256_extract_epi16<i>(a.val);
}
template<int i>
inline short v_extract_n(v_int16x16 a)
{
return (short)v_extract_n<i>(v_reinterpret_as_u16(a));
}
template<int i>
inline uint v_extract_n(v_uint32x8 a)
{
return (uint)_v256_extract_epi32<i>(a.val);
}
template<int i>
inline int v_extract_n(v_int32x8 a)
{
return (int)v_extract_n<i>(v_reinterpret_as_u32(a));
}
template<int i>
inline uint64 v_extract_n(v_uint64x4 a)
{
return (uint64)_v256_extract_epi64<i>(a.val);
}
template<int i>
inline int64 v_extract_n(v_int64x4 v)
{
return (int64)v_extract_n<i>(v_reinterpret_as_u64(v));
}
template<int i>
inline float v_extract_n(v_float32x8 v)
{
union { uint iv; float fv; } d;
d.iv = v_extract_n<i>(v_reinterpret_as_u32(v));
return d.fv;
}
template<int i>
inline double v_extract_n(v_float64x4 v)
{
union { uint64 iv; double dv; } d;
d.iv = v_extract_n<i>(v_reinterpret_as_u64(v));
return d.dv;
}
template<int i>
inline v_uint32x8 v_broadcast_element(v_uint32x8 a)
{
static const __m256i perm = _mm256_set1_epi32((char)i);
return v_uint32x8(_mm256_permutevar8x32_epi32(a.val, perm));
}
template<int i>
inline v_int32x8 v_broadcast_element(const v_int32x8 &a)
{ return v_reinterpret_as_s32(v_broadcast_element<i>(v_reinterpret_as_u32(a))); }
template<int i>
inline v_float32x8 v_broadcast_element(const v_float32x8 &a)
{ return v_reinterpret_as_f32(v_broadcast_element<i>(v_reinterpret_as_u32(a))); }
///////////////////// load deinterleave /////////////////////////////
@ -2740,29 +3121,41 @@ OPENCV_HAL_IMPL_AVX_LOADSTORE_INTERLEAVE(v_float32x8, float, f32, v_uint32x8, un
OPENCV_HAL_IMPL_AVX_LOADSTORE_INTERLEAVE(v_int64x4, int64, s64, v_uint64x4, uint64, u64)
OPENCV_HAL_IMPL_AVX_LOADSTORE_INTERLEAVE(v_float64x4, double, f64, v_uint64x4, uint64, u64)
//
// FP16
//
inline v_float32x8 v256_load_expand(const float16_t* ptr)
{
#if CV_FP16
return v_float32x8(_mm256_cvtph_ps(_mm_loadu_si128((const __m128i*)ptr)));
#else
float CV_DECL_ALIGNED(32) buf[8];
for (int i = 0; i < 8; i++)
buf[i] = (float)ptr[i];
return v256_load_aligned(buf);
#endif
}
inline void v_pack_store(float16_t* ptr, const v_float32x8& a)
{
#if CV_FP16
__m128i ah = _mm256_cvtps_ph(a.val, 0);
_mm_storeu_si128((__m128i*)ptr, ah);
#else
float CV_DECL_ALIGNED(32) buf[8];
v_store_aligned(buf, a);
for (int i = 0; i < 8; i++)
ptr[i] = float16_t(buf[i]);
#endif
}
//
// end of FP16
//
inline void v256_cleanup() { _mm256_zeroall(); }
//! @name Check SIMD256 support
//! @{
//! @brief Check CPU capability of SIMD operation
static inline bool hasSIMD256()
{
return (CV_CPU_HAS_SUPPORT_AVX2) ? true : false;
}
//! @}
CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END
//! @endcond


@ -14,9 +14,32 @@ namespace cv
CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN
/** Types **/
#if CV__SIMD_FORWARD == 512
// [todo] 512
#error "AVX512 Not implemented yet"
#if CV__SIMD_FORWARD == 1024
// [todo] 1024
#error "1024-long ops not implemented yet"
#elif CV__SIMD_FORWARD == 512
// 512
#define __CV_VX(fun) v512_##fun
#define __CV_V_UINT8 v_uint8x64
#define __CV_V_INT8 v_int8x64
#define __CV_V_UINT16 v_uint16x32
#define __CV_V_INT16 v_int16x32
#define __CV_V_UINT32 v_uint32x16
#define __CV_V_INT32 v_int32x16
#define __CV_V_UINT64 v_uint64x8
#define __CV_V_INT64 v_int64x8
#define __CV_V_FLOAT32 v_float32x16
#define __CV_V_FLOAT64 v_float64x8
struct v_uint8x64;
struct v_int8x64;
struct v_uint16x32;
struct v_int16x32;
struct v_uint32x16;
struct v_int32x16;
struct v_uint64x8;
struct v_int64x8;
struct v_float32x16;
struct v_float64x8;
#elif CV__SIMD_FORWARD == 256
// 256
#define __CV_VX(fun) v256_##fun
@ -137,6 +160,16 @@ void v_mul_expand(const __CV_V_UINT32&, const __CV_V_UINT32&, __CV_V_UINT64&, __
void v_mul_expand(const __CV_V_INT32&, const __CV_V_INT32&, __CV_V_INT64&, __CV_V_INT64&);
#endif
// Conversions
__CV_V_FLOAT32 v_cvt_f32(const __CV_V_INT32& a);
__CV_V_FLOAT32 v_cvt_f32(const __CV_V_FLOAT64& a);
__CV_V_FLOAT32 v_cvt_f32(const __CV_V_FLOAT64& a, const __CV_V_FLOAT64& b);
__CV_V_FLOAT64 v_cvt_f64(const __CV_V_INT32& a);
__CV_V_FLOAT64 v_cvt_f64_high(const __CV_V_INT32& a);
__CV_V_FLOAT64 v_cvt_f64(const __CV_V_FLOAT32& a);
__CV_V_FLOAT64 v_cvt_f64_high(const __CV_V_FLOAT32& a);
__CV_V_FLOAT64 v_cvt_f64(const __CV_V_INT64& a);
/** Cleanup **/
#undef CV__SIMD_FORWARD
#undef __CV_VX


@ -56,29 +56,85 @@ namespace cv
CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN
#define CV_SIMD128 1
#if defined(__aarch64__)
#if defined(__aarch64__) || defined(_M_ARM64)
#define CV_SIMD128_64F 1
#else
#define CV_SIMD128_64F 0
#endif
// The following macro checks if the code is being compiled for the
// AArch64 execution state of Armv8, to enable the 128-bit
// intrinsics. The macro `__ARM_64BIT_STATE` is the one recommended by
// the Arm C Language Extension (ACLE) specifications [1] to check the
// availability of 128-bit intrinsics, and it is supported by clang
// and gcc. The macro `_M_ARM64` is the equivalent one for Microsoft
// Visual Studio [2].
//
// [1] https://developer.arm.com/documentation/101028/0012/13--Advanced-SIMD--Neon--intrinsics
// [2] https://docs.microsoft.com/en-us/cpp/preprocessor/predefined-macros
#if defined(__ARM_64BIT_STATE) || defined(_M_ARM64)
#define CV_NEON_AARCH64 1
#else
#define CV_NEON_AARCH64 0
#endif
// TODO
#define CV_NEON_DOT 0
//////////// Utils ////////////
#if CV_SIMD128_64F
#define OPENCV_HAL_IMPL_NEON_UNZIP(_Tpv, _Tpvx2, suffix) \
inline void _v128_unzip(const _Tpv& a, const _Tpv& b, _Tpv& c, _Tpv& d) \
{ c = vuzp1q_##suffix(a, b); d = vuzp2q_##suffix(a, b); }
#define OPENCV_HAL_IMPL_NEON_UNZIP_L(_Tpv, _Tpvx2, suffix) \
inline void _v128_unzip(const _Tpv&a, const _Tpv&b, _Tpv& c, _Tpv& d) \
{ c = vuzp1_##suffix(a, b); d = vuzp2_##suffix(a, b); }
#else
#define OPENCV_HAL_IMPL_NEON_UNZIP(_Tpv, _Tpvx2, suffix) \
inline void _v128_unzip(const _Tpv& a, const _Tpv& b, _Tpv& c, _Tpv& d) \
{ _Tpvx2 ab = vuzpq_##suffix(a, b); c = ab.val[0]; d = ab.val[1]; }
#define OPENCV_HAL_IMPL_NEON_UNZIP_L(_Tpv, _Tpvx2, suffix) \
inline void _v128_unzip(const _Tpv& a, const _Tpv& b, _Tpv& c, _Tpv& d) \
{ _Tpvx2 ab = vuzp_##suffix(a, b); c = ab.val[0]; d = ab.val[1]; }
#endif
#if CV_SIMD128_64F
#define OPENCV_HAL_IMPL_NEON_REINTERPRET(_Tpv, suffix) \
template <typename T> static inline \
_Tpv vreinterpretq_##suffix##_f64(T a) { return (_Tpv) a; } \
template <typename T> static inline \
float64x2_t vreinterpretq_f64_##suffix(T a) { return (float64x2_t) a; }
OPENCV_HAL_IMPL_NEON_REINTERPRET(uint8x16_t, u8)
OPENCV_HAL_IMPL_NEON_REINTERPRET(int8x16_t, s8)
OPENCV_HAL_IMPL_NEON_REINTERPRET(uint16x8_t, u16)
OPENCV_HAL_IMPL_NEON_REINTERPRET(int16x8_t, s16)
OPENCV_HAL_IMPL_NEON_REINTERPRET(uint32x4_t, u32)
OPENCV_HAL_IMPL_NEON_REINTERPRET(int32x4_t, s32)
OPENCV_HAL_IMPL_NEON_REINTERPRET(uint64x2_t, u64)
OPENCV_HAL_IMPL_NEON_REINTERPRET(int64x2_t, s64)
OPENCV_HAL_IMPL_NEON_REINTERPRET(float32x4_t, f32)
template <typename T> static inline \
_Tpv vreinterpretq_##suffix##_f64(T a) { return (_Tpv) a; } \
template <typename T> static inline \
float64x2_t vreinterpretq_f64_##suffix(T a) { return (float64x2_t) a; }
#else
#define OPENCV_HAL_IMPL_NEON_REINTERPRET(_Tpv, suffix)
#endif
#define OPENCV_HAL_IMPL_NEON_UTILS_SUFFIX(_Tpv, _Tpvl, suffix) \
OPENCV_HAL_IMPL_NEON_UNZIP(_Tpv##_t, _Tpv##x2_t, suffix) \
OPENCV_HAL_IMPL_NEON_UNZIP_L(_Tpvl##_t, _Tpvl##x2_t, suffix) \
OPENCV_HAL_IMPL_NEON_REINTERPRET(_Tpv##_t, suffix)
#define OPENCV_HAL_IMPL_NEON_UTILS_SUFFIX_I64(_Tpv, _Tpvl, suffix) \
OPENCV_HAL_IMPL_NEON_REINTERPRET(_Tpv##_t, suffix)
#define OPENCV_HAL_IMPL_NEON_UTILS_SUFFIX_F64(_Tpv, _Tpvl, suffix) \
OPENCV_HAL_IMPL_NEON_UNZIP(_Tpv##_t, _Tpv##x2_t, suffix)
OPENCV_HAL_IMPL_NEON_UTILS_SUFFIX(uint8x16, uint8x8, u8)
OPENCV_HAL_IMPL_NEON_UTILS_SUFFIX(int8x16, int8x8, s8)
OPENCV_HAL_IMPL_NEON_UTILS_SUFFIX(uint16x8, uint16x4, u16)
OPENCV_HAL_IMPL_NEON_UTILS_SUFFIX(int16x8, int16x4, s16)
OPENCV_HAL_IMPL_NEON_UTILS_SUFFIX(uint32x4, uint32x2, u32)
OPENCV_HAL_IMPL_NEON_UTILS_SUFFIX(int32x4, int32x2, s32)
OPENCV_HAL_IMPL_NEON_UTILS_SUFFIX(float32x4, float32x2, f32)
OPENCV_HAL_IMPL_NEON_UTILS_SUFFIX_I64(uint64x2, uint64x1, u64)
OPENCV_HAL_IMPL_NEON_UTILS_SUFFIX_I64(int64x2, int64x1, s64)
#if CV_SIMD128_64F
OPENCV_HAL_IMPL_NEON_UTILS_SUFFIX_F64(float64x2, float64x1,f64)
#endif
//////////// Types ////////////
struct v_uint8x16
{
typedef uchar lane_type;
@ -278,48 +334,6 @@ struct v_float64x2
};
#endif
#if CV_FP16
// Workaround for old compilers
static inline int16x4_t vreinterpret_s16_f16(float16x4_t a) { return (int16x4_t)a; }
static inline float16x4_t vreinterpret_f16_s16(int16x4_t a) { return (float16x4_t)a; }
static inline float16x4_t cv_vld1_f16(const void* ptr)
{
#ifndef vld1_f16 // APPLE compiler defines vld1_f16 as macro
return vreinterpret_f16_s16(vld1_s16((const short*)ptr));
#else
return vld1_f16((const __fp16*)ptr);
#endif
}
static inline void cv_vst1_f16(void* ptr, float16x4_t a)
{
#ifndef vst1_f16 // APPLE compiler defines vst1_f16 as macro
vst1_s16((short*)ptr, vreinterpret_s16_f16(a));
#else
vst1_f16((__fp16*)ptr, a);
#endif
}
#ifndef vdup_n_f16
#define vdup_n_f16(v) (float16x4_t){v, v, v, v}
#endif
#endif // CV_FP16
#if CV_FP16
inline v_float32x4 v128_load_fp16_f32(const short* ptr)
{
float16x4_t a = cv_vld1_f16((const __fp16*)ptr);
return v_float32x4(vcvt_f32_f16(a));
}
inline void v_store_fp16(short* ptr, const v_float32x4& a)
{
float16x4_t fp16 = vcvt_f16_f32(a.val);
cv_vst1_f16((short*)ptr, fp16);
}
#endif
#define OPENCV_HAL_IMPL_NEON_INIT(_Tpv, _Tp, suffix) \
inline v_##_Tpv v_setzero_##suffix() { return v_##_Tpv(vdupq_n_##suffix((_Tp)0)); } \
inline v_##_Tpv v_setall_##suffix(_Tp v) { return v_##_Tpv(vdupq_n_##suffix(v)); } \
@ -570,20 +584,292 @@ inline v_uint16x8 v_mul_hi(const v_uint16x8& a, const v_uint16x8& b)
));
}
//////// Dot Product ////////
// 16 >> 32
inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b)
{
int32x4_t c = vmull_s16(vget_low_s16(a.val), vget_low_s16(b.val));
int32x4_t d = vmull_s16(vget_high_s16(a.val), vget_high_s16(b.val));
int32x4x2_t cd = vuzpq_s32(c, d);
return v_int32x4(vaddq_s32(cd.val[0], cd.val[1]));
int16x8_t uzp1, uzp2;
_v128_unzip(a.val, b.val, uzp1, uzp2);
int16x4_t a0 = vget_low_s16(uzp1);
int16x4_t b0 = vget_high_s16(uzp1);
int16x4_t a1 = vget_low_s16(uzp2);
int16x4_t b1 = vget_high_s16(uzp2);
int32x4_t p = vmull_s16(a0, b0);
return v_int32x4(vmlal_s16(p, a1, b1));
}
inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c)
{
v_int32x4 s = v_dotprod(a, b);
return v_int32x4(vaddq_s32(s.val , c.val));
int16x8_t uzp1, uzp2;
_v128_unzip(a.val, b.val, uzp1, uzp2);
int16x4_t a0 = vget_low_s16(uzp1);
int16x4_t b0 = vget_high_s16(uzp1);
int16x4_t a1 = vget_low_s16(uzp2);
int16x4_t b1 = vget_high_s16(uzp2);
int32x4_t p = vmlal_s16(c.val, a0, b0);
return v_int32x4(vmlal_s16(p, a1, b1));
}
// 32 >> 64
inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b)
{
int32x4_t uzp1, uzp2;
_v128_unzip(a.val, b.val, uzp1, uzp2);
int32x2_t a0 = vget_low_s32(uzp1);
int32x2_t b0 = vget_high_s32(uzp1);
int32x2_t a1 = vget_low_s32(uzp2);
int32x2_t b1 = vget_high_s32(uzp2);
int64x2_t p = vmull_s32(a0, b0);
return v_int64x2(vmlal_s32(p, a1, b1));
}
inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b, const v_int64x2& c)
{
int32x4_t uzp1, uzp2;
_v128_unzip(a.val, b.val, uzp1, uzp2);
int32x2_t a0 = vget_low_s32(uzp1);
int32x2_t b0 = vget_high_s32(uzp1);
int32x2_t a1 = vget_low_s32(uzp2);
int32x2_t b1 = vget_high_s32(uzp2);
int64x2_t p = vmlal_s32(c.val, a0, b0);
return v_int64x2(vmlal_s32(p, a1, b1));
}
// 8 >> 32
inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b)
{
#if CV_NEON_DOT
return v_uint32x4(vdotq_u32(vdupq_n_u32(0), a.val, b.val));
#else
const uint8x16_t zero = vreinterpretq_u8_u32(vdupq_n_u32(0));
const uint8x16_t mask = vreinterpretq_u8_u32(vdupq_n_u32(0x00FF00FF));
const uint16x8_t zero32 = vreinterpretq_u16_u32(vdupq_n_u32(0));
const uint16x8_t mask32 = vreinterpretq_u16_u32(vdupq_n_u32(0x0000FFFF));
uint16x8_t even = vmulq_u16(vreinterpretq_u16_u8(vbslq_u8(mask, a.val, zero)),
vreinterpretq_u16_u8(vbslq_u8(mask, b.val, zero)));
uint16x8_t odd = vmulq_u16(vshrq_n_u16(vreinterpretq_u16_u8(a.val), 8),
vshrq_n_u16(vreinterpretq_u16_u8(b.val), 8));
uint32x4_t s0 = vaddq_u32(vreinterpretq_u32_u16(vbslq_u16(mask32, even, zero32)),
vreinterpretq_u32_u16(vbslq_u16(mask32, odd, zero32)));
uint32x4_t s1 = vaddq_u32(vshrq_n_u32(vreinterpretq_u32_u16(even), 16),
vshrq_n_u32(vreinterpretq_u32_u16(odd), 16));
return v_uint32x4(vaddq_u32(s0, s1));
#endif
}
inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b,
const v_uint32x4& c)
{
#if CV_NEON_DOT
return v_uint32x4(vdotq_u32(c.val, a.val, b.val));
#else
return v_dotprod_expand(a, b) + c;
#endif
}
inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b)
{
#if CV_NEON_DOT
return v_int32x4(vdotq_s32(vdupq_n_s32(0), a.val, b.val));
#else
int16x8_t p0 = vmull_s8(vget_low_s8(a.val), vget_low_s8(b.val));
int16x8_t p1 = vmull_s8(vget_high_s8(a.val), vget_high_s8(b.val));
int16x8_t uzp1, uzp2;
_v128_unzip(p0, p1, uzp1, uzp2);
int16x8_t sum = vaddq_s16(uzp1, uzp2);
int16x4_t uzpl1, uzpl2;
_v128_unzip(vget_low_s16(sum), vget_high_s16(sum), uzpl1, uzpl2);
return v_int32x4(vaddl_s16(uzpl1, uzpl2));
#endif
}
inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b,
const v_int32x4& c)
{
#if CV_NEON_DOT
return v_int32x4(vdotq_s32(c.val, a.val, b.val));
#else
return v_dotprod_expand(a, b) + c;
#endif
}
// 16 >> 64
inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b)
{
const uint16x8_t zero = vreinterpretq_u16_u32(vdupq_n_u32(0));
const uint16x8_t mask = vreinterpretq_u16_u32(vdupq_n_u32(0x0000FFFF));
uint32x4_t even = vmulq_u32(vreinterpretq_u32_u16(vbslq_u16(mask, a.val, zero)),
vreinterpretq_u32_u16(vbslq_u16(mask, b.val, zero)));
uint32x4_t odd = vmulq_u32(vshrq_n_u32(vreinterpretq_u32_u16(a.val), 16),
vshrq_n_u32(vreinterpretq_u32_u16(b.val), 16));
uint32x4_t uzp1, uzp2;
_v128_unzip(even, odd, uzp1, uzp2);
uint64x2_t s0 = vaddl_u32(vget_low_u32(uzp1), vget_high_u32(uzp1));
uint64x2_t s1 = vaddl_u32(vget_low_u32(uzp2), vget_high_u32(uzp2));
return v_uint64x2(vaddq_u64(s0, s1));
}
inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b, const v_uint64x2& c)
{ return v_dotprod_expand(a, b) + c; }
inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b)
{
int32x4_t p0 = vmull_s16(vget_low_s16(a.val), vget_low_s16(b.val));
int32x4_t p1 = vmull_s16(vget_high_s16(a.val), vget_high_s16(b.val));
int32x4_t uzp1, uzp2;
_v128_unzip(p0, p1, uzp1, uzp2);
int32x4_t sum = vaddq_s32(uzp1, uzp2);
int32x2_t uzpl1, uzpl2;
_v128_unzip(vget_low_s32(sum), vget_high_s32(sum), uzpl1, uzpl2);
return v_int64x2(vaddl_s32(uzpl1, uzpl2));
}
inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b,
const v_int64x2& c)
{ return v_dotprod_expand(a, b) + c; }
// 32 >> 64f
#if CV_SIMD128_64F
inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b)
{ return v_cvt_f64(v_dotprod(a, b)); }
inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b,
const v_float64x2& c)
{ return v_dotprod_expand(a, b) + c; }
#endif
//////// Fast Dot Product ////////
// 16 >> 32
inline v_int32x4 v_dotprod_fast(const v_int16x8& a, const v_int16x8& b)
{
#if CV_NEON_AARCH64
int32x4_t p = vmull_s16(vget_low_s16(a.val), vget_low_s16(b.val));
return v_int32x4(vmlal_high_s16(p, a.val, b.val));
#else
int16x4_t a0 = vget_low_s16(a.val);
int16x4_t a1 = vget_high_s16(a.val);
int16x4_t b0 = vget_low_s16(b.val);
int16x4_t b1 = vget_high_s16(b.val);
int32x4_t p = vmull_s16(a0, b0);
return v_int32x4(vmlal_s16(p, a1, b1));
#endif
}
inline v_int32x4 v_dotprod_fast(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c)
{
#if CV_NEON_AARCH64
int32x4_t p = vmlal_s16(c.val, vget_low_s16(a.val), vget_low_s16(b.val));
return v_int32x4(vmlal_high_s16(p, a.val, b.val));
#else
int16x4_t a0 = vget_low_s16(a.val);
int16x4_t a1 = vget_high_s16(a.val);
int16x4_t b0 = vget_low_s16(b.val);
int16x4_t b1 = vget_high_s16(b.val);
int32x4_t p = vmlal_s16(c.val, a0, b0);
return v_int32x4(vmlal_s16(p, a1, b1));
#endif
}
// 32 >> 64
inline v_int64x2 v_dotprod_fast(const v_int32x4& a, const v_int32x4& b)
{
#if CV_NEON_AARCH64
int64x2_t p = vmull_s32(vget_low_s32(a.val), vget_low_s32(b.val));
return v_int64x2(vmlal_high_s32(p, a.val, b.val));
#else
int32x2_t a0 = vget_low_s32(a.val);
int32x2_t a1 = vget_high_s32(a.val);
int32x2_t b0 = vget_low_s32(b.val);
int32x2_t b1 = vget_high_s32(b.val);
int64x2_t p = vmull_s32(a0, b0);
return v_int64x2(vmlal_s32(p, a1, b1));
#endif
}
inline v_int64x2 v_dotprod_fast(const v_int32x4& a, const v_int32x4& b, const v_int64x2& c)
{
#if CV_NEON_AARCH64
int64x2_t p = vmlal_s32(c.val, vget_low_s32(a.val), vget_low_s32(b.val));
return v_int64x2(vmlal_high_s32(p, a.val, b.val));
#else
int32x2_t a0 = vget_low_s32(a.val);
int32x2_t a1 = vget_high_s32(a.val);
int32x2_t b0 = vget_low_s32(b.val);
int32x2_t b1 = vget_high_s32(b.val);
int64x2_t p = vmlal_s32(c.val, a0, b0);
return v_int64x2(vmlal_s32(p, a1, b1));
#endif
}
// 8 >> 32
inline v_uint32x4 v_dotprod_expand_fast(const v_uint8x16& a, const v_uint8x16& b)
{
#if CV_NEON_DOT
return v_uint32x4(vdotq_u32(vdupq_n_u32(0), a.val, b.val));
#else
uint16x8_t p0 = vmull_u8(vget_low_u8(a.val), vget_low_u8(b.val));
uint16x8_t p1 = vmull_u8(vget_high_u8(a.val), vget_high_u8(b.val));
uint32x4_t s0 = vaddl_u16(vget_low_u16(p0), vget_low_u16(p1));
uint32x4_t s1 = vaddl_u16(vget_high_u16(p0), vget_high_u16(p1));
return v_uint32x4(vaddq_u32(s0, s1));
#endif
}
inline v_uint32x4 v_dotprod_expand_fast(const v_uint8x16& a, const v_uint8x16& b, const v_uint32x4& c)
{
#if CV_NEON_DOT
return v_uint32x4(vdotq_u32(c.val, a.val, b.val));
#else
return v_dotprod_expand_fast(a, b) + c;
#endif
}
inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b)
{
#if CV_NEON_DOT
return v_int32x4(vdotq_s32(vdupq_n_s32(0), a.val, b.val));
#else
int16x8_t prod = vmull_s8(vget_low_s8(a.val), vget_low_s8(b.val));
prod = vmlal_s8(prod, vget_high_s8(a.val), vget_high_s8(b.val));
return v_int32x4(vaddl_s16(vget_low_s16(prod), vget_high_s16(prod)));
#endif
}
inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b, const v_int32x4& c)
{
#if CV_NEON_DOT
return v_int32x4(vdotq_s32(c.val, a.val, b.val));
#else
return v_dotprod_expand_fast(a, b) + c;
#endif
}
// 16 >> 64
inline v_uint64x2 v_dotprod_expand_fast(const v_uint16x8& a, const v_uint16x8& b)
{
uint32x4_t p0 = vmull_u16(vget_low_u16(a.val), vget_low_u16(b.val));
uint32x4_t p1 = vmull_u16(vget_high_u16(a.val), vget_high_u16(b.val));
uint64x2_t s0 = vaddl_u32(vget_low_u32(p0), vget_high_u32(p0));
uint64x2_t s1 = vaddl_u32(vget_low_u32(p1), vget_high_u32(p1));
return v_uint64x2(vaddq_u64(s0, s1));
}
inline v_uint64x2 v_dotprod_expand_fast(const v_uint16x8& a, const v_uint16x8& b, const v_uint64x2& c)
{ return v_dotprod_expand_fast(a, b) + c; }
inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b)
{
int32x4_t prod = vmull_s16(vget_low_s16(a.val), vget_low_s16(b.val));
prod = vmlal_s16(prod, vget_high_s16(a.val), vget_high_s16(b.val));
return v_int64x2(vaddl_s32(vget_low_s32(prod), vget_high_s32(prod)));
}
inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b, const v_int64x2& c)
{ return v_dotprod_expand_fast(a, b) + c; }
// 32 >> 64f
#if CV_SIMD128_64F
inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b)
{ return v_cvt_f64(v_dotprod_fast(a, b)); }
inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b, const v_float64x2& c)
{ return v_dotprod_expand_fast(a, b) + c; }
#endif
#define OPENCV_HAL_IMPL_NEON_LOGIC_OP(_Tpvec, suffix) \
OPENCV_HAL_IMPL_NEON_BIN_OP(&, _Tpvec, vandq_##suffix) \
OPENCV_HAL_IMPL_NEON_BIN_OP(|, _Tpvec, vorrq_##suffix) \
@ -917,13 +1203,27 @@ OPENCV_HAL_IMPL_NEON_ROTATE_OP(v_int64x2, s64)
OPENCV_HAL_IMPL_NEON_ROTATE_OP(v_float64x2, f64)
#endif
#if defined(__clang__) && defined(__aarch64__)
// avoid LD2 instruction. details: https://github.com/opencv/opencv/issues/14863
#define OPENCV_HAL_IMPL_NEON_LOAD_LOW_OP(_Tpvec, _Tp, suffix) \
inline _Tpvec v_load_low(const _Tp* ptr) \
{ \
typedef uint64 CV_DECL_ALIGNED(1) unaligned_uint64; \
uint64 v = *(unaligned_uint64*)ptr; \
return _Tpvec(v_reinterpret_as_##suffix(v_uint64x2(v, (uint64)123456))); \
}
#else
#define OPENCV_HAL_IMPL_NEON_LOAD_LOW_OP(_Tpvec, _Tp, suffix) \
inline _Tpvec v_load_low(const _Tp* ptr) \
{ return _Tpvec(vcombine_##suffix(vld1_##suffix(ptr), vdup_n_##suffix((_Tp)0))); }
#endif
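// For reference, the clang/AArch64 branch above expands (illustratively, for
// the uchar/u8 instantiation made further down) to roughly:
//
//   inline v_uint8x16 v_load_low(const uchar* ptr)
//   {
//       typedef uint64 CV_DECL_ALIGNED(1) unaligned_uint64;
//       uint64 v = *(unaligned_uint64*)ptr;             // one scalar 64-bit load
//       return v_uint8x16(v_reinterpret_as_u8(v_uint64x2(v, (uint64)123456)));
//   }
//
// Going through a scalar 64-bit read keeps clang from pairing two vld1 loads
// into an LD2 structure load (see the issue linked above).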
#define OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(_Tpvec, _Tp, suffix) \
inline _Tpvec v_load(const _Tp* ptr) \
{ return _Tpvec(vld1q_##suffix(ptr)); } \
inline _Tpvec v_load_aligned(const _Tp* ptr) \
{ return _Tpvec(vld1q_##suffix(ptr)); } \
inline _Tpvec v_load_low(const _Tp* ptr) \
{ return _Tpvec(vcombine_##suffix(vld1_##suffix(ptr), vdup_n_##suffix((_Tp)0))); } \
OPENCV_HAL_IMPL_NEON_LOAD_LOW_OP(_Tpvec, _Tp, suffix) \
inline _Tpvec v_load_halves(const _Tp* ptr0, const _Tp* ptr1) \
{ return _Tpvec(vcombine_##suffix(vld1_##suffix(ptr0), vld1_##suffix(ptr1))); } \
inline void v_store(_Tp* ptr, const _Tpvec& a) \
@ -952,6 +1252,45 @@ OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_float32x4, float, f32)
OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_float64x2, double, f64)
#endif
inline unsigned v_reduce_sum(const v_uint8x16& a)
{
uint32x4_t t0 = vpaddlq_u16(vpaddlq_u8(a.val));
uint32x2_t t1 = vpadd_u32(vget_low_u32(t0), vget_high_u32(t0));
return vget_lane_u32(vpadd_u32(t1, t1), 0);
}
inline int v_reduce_sum(const v_int8x16& a)
{
int32x4_t t0 = vpaddlq_s16(vpaddlq_s8(a.val));
int32x2_t t1 = vpadd_s32(vget_low_s32(t0), vget_high_s32(t0));
return vget_lane_s32(vpadd_s32(t1, t1), 0);
}
inline unsigned v_reduce_sum(const v_uint16x8& a)
{
uint32x4_t t0 = vpaddlq_u16(a.val);
uint32x2_t t1 = vpadd_u32(vget_low_u32(t0), vget_high_u32(t0));
return vget_lane_u32(vpadd_u32(t1, t1), 0);
}
inline int v_reduce_sum(const v_int16x8& a)
{
int32x4_t t0 = vpaddlq_s16(a.val);
int32x2_t t1 = vpadd_s32(vget_low_s32(t0), vget_high_s32(t0));
return vget_lane_s32(vpadd_s32(t1, t1), 0);
}
#define OPENCV_HAL_IMPL_NEON_REDUCE_OP_16(_Tpvec, _Tpnvec, scalartype, func, vectorfunc, suffix) \
inline scalartype v_reduce_##func(const _Tpvec& a) \
{ \
_Tpnvec##_t a0 = vp##vectorfunc##_##suffix(vget_low_##suffix(a.val), vget_high_##suffix(a.val)); \
a0 = vp##vectorfunc##_##suffix(a0, a0); \
a0 = vp##vectorfunc##_##suffix(a0, a0); \
return (scalartype)vget_lane_##suffix(vp##vectorfunc##_##suffix(a0, a0),0); \
}
OPENCV_HAL_IMPL_NEON_REDUCE_OP_16(v_uint8x16, uint8x8, uchar, max, max, u8)
OPENCV_HAL_IMPL_NEON_REDUCE_OP_16(v_uint8x16, uint8x8, uchar, min, min, u8)
OPENCV_HAL_IMPL_NEON_REDUCE_OP_16(v_int8x16, int8x8, schar, max, max, s8)
OPENCV_HAL_IMPL_NEON_REDUCE_OP_16(v_int8x16, int8x8, schar, min, min, s8)
#define OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(_Tpvec, _Tpnvec, scalartype, func, vectorfunc, suffix) \
inline scalartype v_reduce_##func(const _Tpvec& a) \
{ \
@ -960,10 +1299,8 @@ inline scalartype v_reduce_##func(const _Tpvec& a) \
return (scalartype)vget_lane_##suffix(vp##vectorfunc##_##suffix(a0, a0),0); \
}
OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(v_uint16x8, uint16x4, unsigned short, sum, add, u16)
OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(v_uint16x8, uint16x4, unsigned short, max, max, u16)
OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(v_uint16x8, uint16x4, unsigned short, min, min, u16)
OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(v_int16x8, int16x4, short, sum, add, s16)
OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(v_uint16x8, uint16x4, ushort, max, max, u16)
OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(v_uint16x8, uint16x4, ushort, min, min, u16)
OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(v_int16x8, int16x4, short, max, max, s16)
OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(v_int16x8, int16x4, short, min, min, s16)
@ -984,10 +1321,14 @@ OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_float32x4, float32x2, float, sum, add, f32)
OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_float32x4, float32x2, float, max, max, f32)
OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_float32x4, float32x2, float, min, min, f32)
inline uint64 v_reduce_sum(const v_uint64x2& a)
{ return vget_lane_u64(vadd_u64(vget_low_u64(a.val), vget_high_u64(a.val)),0); }
inline int64 v_reduce_sum(const v_int64x2& a)
{ return vget_lane_s64(vadd_s64(vget_low_s64(a.val), vget_high_s64(a.val)),0); }
#if CV_SIMD128_64F
inline double v_reduce_sum(const v_float64x2& a)
{
return vgetq_lane_f64(a.val, 0) + vgetq_lane_f64(a.val, 1);
return vaddvq_f64(a.val);
}
#endif
@ -1049,21 +1390,22 @@ inline float v_reduce_sad(const v_float32x4& a, const v_float32x4& b)
return vget_lane_f32(vpadd_f32(t1, t1), 0);
}
#define OPENCV_HAL_IMPL_NEON_POPCOUNT(_Tpvec, cast) \
inline v_uint32x4 v_popcount(const _Tpvec& a) \
{ \
uint8x16_t t = vcntq_u8(cast(a.val)); \
uint16x8_t t0 = vpaddlq_u8(t); /* 16 -> 8 */ \
uint32x4_t t1 = vpaddlq_u16(t0); /* 8 -> 4 */ \
return v_uint32x4(t1); \
}
OPENCV_HAL_IMPL_NEON_POPCOUNT(v_uint8x16, OPENCV_HAL_NOP)
OPENCV_HAL_IMPL_NEON_POPCOUNT(v_uint16x8, vreinterpretq_u8_u16)
OPENCV_HAL_IMPL_NEON_POPCOUNT(v_uint32x4, vreinterpretq_u8_u32)
OPENCV_HAL_IMPL_NEON_POPCOUNT(v_int8x16, vreinterpretq_u8_s8)
OPENCV_HAL_IMPL_NEON_POPCOUNT(v_int16x8, vreinterpretq_u8_s16)
OPENCV_HAL_IMPL_NEON_POPCOUNT(v_int32x4, vreinterpretq_u8_s32)
inline v_uint8x16 v_popcount(const v_uint8x16& a)
{ return v_uint8x16(vcntq_u8(a.val)); }
inline v_uint8x16 v_popcount(const v_int8x16& a)
{ return v_uint8x16(vcntq_u8(vreinterpretq_u8_s8(a.val))); }
inline v_uint16x8 v_popcount(const v_uint16x8& a)
{ return v_uint16x8(vpaddlq_u8(vcntq_u8(vreinterpretq_u8_u16(a.val)))); }
inline v_uint16x8 v_popcount(const v_int16x8& a)
{ return v_uint16x8(vpaddlq_u8(vcntq_u8(vreinterpretq_u8_s16(a.val)))); }
inline v_uint32x4 v_popcount(const v_uint32x4& a)
{ return v_uint32x4(vpaddlq_u16(vpaddlq_u8(vcntq_u8(vreinterpretq_u8_u32(a.val))))); }
inline v_uint32x4 v_popcount(const v_int32x4& a)
{ return v_uint32x4(vpaddlq_u16(vpaddlq_u8(vcntq_u8(vreinterpretq_u8_s32(a.val))))); }
inline v_uint64x2 v_popcount(const v_uint64x2& a)
{ return v_uint64x2(vpaddlq_u32(vpaddlq_u16(vpaddlq_u8(vcntq_u8(vreinterpretq_u8_u64(a.val)))))); }
inline v_uint64x2 v_popcount(const v_int64x2& a)
{ return v_uint64x2(vpaddlq_u32(vpaddlq_u16(vpaddlq_u8(vcntq_u8(vreinterpretq_u8_s64(a.val)))))); }
inline int v_signmask(const v_uint8x16& a)
{
@ -1096,17 +1438,32 @@ inline int v_signmask(const v_int32x4& a)
{ return v_signmask(v_reinterpret_as_u32(a)); }
inline int v_signmask(const v_float32x4& a)
{ return v_signmask(v_reinterpret_as_u32(a)); }
#if CV_SIMD128_64F
inline int v_signmask(const v_uint64x2& a)
{
int64x1_t m0 = vdup_n_s64(0);
uint64x2_t v0 = vshlq_u64(vshrq_n_u64(a.val, 63), vcombine_s64(m0, m0));
return (int)vgetq_lane_u64(v0, 0) + ((int)vgetq_lane_u64(v0, 1) << 1);
}
inline int v_signmask(const v_int64x2& a)
{ return v_signmask(v_reinterpret_as_u64(a)); }
#if CV_SIMD128_64F
inline int v_signmask(const v_float64x2& a)
{ return v_signmask(v_reinterpret_as_u64(a)); }
#endif
inline int v_scan_forward(const v_int8x16& a) { return trailingZeros32(v_signmask(a)); }
inline int v_scan_forward(const v_uint8x16& a) { return trailingZeros32(v_signmask(a)); }
inline int v_scan_forward(const v_int16x8& a) { return trailingZeros32(v_signmask(a)); }
inline int v_scan_forward(const v_uint16x8& a) { return trailingZeros32(v_signmask(a)); }
inline int v_scan_forward(const v_int32x4& a) { return trailingZeros32(v_signmask(a)); }
inline int v_scan_forward(const v_uint32x4& a) { return trailingZeros32(v_signmask(a)); }
inline int v_scan_forward(const v_float32x4& a) { return trailingZeros32(v_signmask(a)); }
inline int v_scan_forward(const v_int64x2& a) { return trailingZeros32(v_signmask(a)); }
inline int v_scan_forward(const v_uint64x2& a) { return trailingZeros32(v_signmask(a)); }
#if CV_SIMD128_64F
inline int v_scan_forward(const v_float64x2& a) { return trailingZeros32(v_signmask(a)); }
#endif
#define OPENCV_HAL_IMPL_NEON_CHECK_ALLANY(_Tpvec, suffix, shift) \
inline bool v_check_all(const v_##_Tpvec& a) \
{ \
@ -1124,9 +1481,17 @@ inline bool v_check_any(const v_##_Tpvec& a) \
OPENCV_HAL_IMPL_NEON_CHECK_ALLANY(uint8x16, u8, 7)
OPENCV_HAL_IMPL_NEON_CHECK_ALLANY(uint16x8, u16, 15)
OPENCV_HAL_IMPL_NEON_CHECK_ALLANY(uint32x4, u32, 31)
#if CV_SIMD128_64F
OPENCV_HAL_IMPL_NEON_CHECK_ALLANY(uint64x2, u64, 63)
#endif
inline bool v_check_all(const v_uint64x2& a)
{
uint64x2_t v0 = vshrq_n_u64(a.val, 63);
return (vgetq_lane_u64(v0, 0) & vgetq_lane_u64(v0, 1)) == 1;
}
inline bool v_check_any(const v_uint64x2& a)
{
uint64x2_t v0 = vshrq_n_u64(a.val, 63);
return (vgetq_lane_u64(v0, 0) | vgetq_lane_u64(v0, 1)) != 0;
}
inline bool v_check_all(const v_int8x16& a)
{ return v_check_all(v_reinterpret_as_u8(a)); }
@ -1146,13 +1511,13 @@ inline bool v_check_any(const v_int32x4& a)
inline bool v_check_any(const v_float32x4& a)
{ return v_check_any(v_reinterpret_as_u32(a)); }
#if CV_SIMD128_64F
inline bool v_check_all(const v_int64x2& a)
{ return v_check_all(v_reinterpret_as_u64(a)); }
inline bool v_check_all(const v_float64x2& a)
{ return v_check_all(v_reinterpret_as_u64(a)); }
inline bool v_check_any(const v_int64x2& a)
{ return v_check_any(v_reinterpret_as_u64(a)); }
#if CV_SIMD128_64F
inline bool v_check_all(const v_float64x2& a)
{ return v_check_all(v_reinterpret_as_u64(a)); }
inline bool v_check_any(const v_float64x2& a)
{ return v_check_any(v_reinterpret_as_u64(a)); }
#endif
@ -1174,6 +1539,26 @@ OPENCV_HAL_IMPL_NEON_SELECT(v_float32x4, f32, u32)
OPENCV_HAL_IMPL_NEON_SELECT(v_float64x2, f64, u64)
#endif
#if CV_NEON_AARCH64
#define OPENCV_HAL_IMPL_NEON_EXPAND(_Tpvec, _Tpwvec, _Tp, suffix) \
inline void v_expand(const _Tpvec& a, _Tpwvec& b0, _Tpwvec& b1) \
{ \
b0.val = vmovl_##suffix(vget_low_##suffix(a.val)); \
b1.val = vmovl_high_##suffix(a.val); \
} \
inline _Tpwvec v_expand_low(const _Tpvec& a) \
{ \
return _Tpwvec(vmovl_##suffix(vget_low_##suffix(a.val))); \
} \
inline _Tpwvec v_expand_high(const _Tpvec& a) \
{ \
return _Tpwvec(vmovl_high_##suffix(a.val)); \
} \
inline _Tpwvec v_load_expand(const _Tp* ptr) \
{ \
return _Tpwvec(vmovl_##suffix(vld1_##suffix(ptr))); \
}
#else
#define OPENCV_HAL_IMPL_NEON_EXPAND(_Tpvec, _Tpwvec, _Tp, suffix) \
inline void v_expand(const _Tpvec& a, _Tpwvec& b0, _Tpwvec& b1) \
{ \
@ -1192,6 +1577,7 @@ inline _Tpwvec v_load_expand(const _Tp* ptr) \
{ \
return _Tpwvec(vmovl_##suffix(vld1_##suffix(ptr))); \
}
#endif
OPENCV_HAL_IMPL_NEON_EXPAND(v_uint8x16, v_uint16x8, uchar, u8)
OPENCV_HAL_IMPL_NEON_EXPAND(v_int8x16, v_int16x8, schar, s8)
@ -1216,7 +1602,7 @@ inline v_int32x4 v_load_expand_q(const schar* ptr)
return v_int32x4(vmovl_s16(v1));
}
#if defined(__aarch64__)
#if defined(__aarch64__) || defined(_M_ARM64)
#define OPENCV_HAL_IMPL_NEON_UNPACKS(_Tpvec, suffix) \
inline void v_zip(const v_##_Tpvec& a0, const v_##_Tpvec& a1, v_##_Tpvec& b0, v_##_Tpvec& b1) \
{ \
@ -1270,6 +1656,52 @@ OPENCV_HAL_IMPL_NEON_UNPACKS(float32x4, f32)
OPENCV_HAL_IMPL_NEON_UNPACKS(float64x2, f64)
#endif
inline v_uint8x16 v_reverse(const v_uint8x16 &a)
{
uint8x16_t vec = vrev64q_u8(a.val);
return v_uint8x16(vextq_u8(vec, vec, 8));
}
inline v_int8x16 v_reverse(const v_int8x16 &a)
{ return v_reinterpret_as_s8(v_reverse(v_reinterpret_as_u8(a))); }
inline v_uint16x8 v_reverse(const v_uint16x8 &a)
{
uint16x8_t vec = vrev64q_u16(a.val);
return v_uint16x8(vextq_u16(vec, vec, 4));
}
inline v_int16x8 v_reverse(const v_int16x8 &a)
{ return v_reinterpret_as_s16(v_reverse(v_reinterpret_as_u16(a))); }
inline v_uint32x4 v_reverse(const v_uint32x4 &a)
{
uint32x4_t vec = vrev64q_u32(a.val);
return v_uint32x4(vextq_u32(vec, vec, 2));
}
inline v_int32x4 v_reverse(const v_int32x4 &a)
{ return v_reinterpret_as_s32(v_reverse(v_reinterpret_as_u32(a))); }
inline v_float32x4 v_reverse(const v_float32x4 &a)
{ return v_reinterpret_as_f32(v_reverse(v_reinterpret_as_u32(a))); }
inline v_uint64x2 v_reverse(const v_uint64x2 &a)
{
uint64x2_t vec = a.val;
uint64x1_t vec_lo = vget_low_u64(vec);
uint64x1_t vec_hi = vget_high_u64(vec);
return v_uint64x2(vcombine_u64(vec_hi, vec_lo));
}
inline v_int64x2 v_reverse(const v_int64x2 &a)
{ return v_reinterpret_as_s64(v_reverse(v_reinterpret_as_u64(a))); }
#if CV_SIMD128_64F
inline v_float64x2 v_reverse(const v_float64x2 &a)
{ return v_reinterpret_as_f64(v_reverse(v_reinterpret_as_u64(a))); }
#endif
#define OPENCV_HAL_IMPL_NEON_EXTRACT(_Tpvec, suffix) \
template <int s> \
inline v_##_Tpvec v_extract(const v_##_Tpvec& a, const v_##_Tpvec& b) \
@ -1290,6 +1722,38 @@ OPENCV_HAL_IMPL_NEON_EXTRACT(float32x4, f32)
OPENCV_HAL_IMPL_NEON_EXTRACT(float64x2, f64)
#endif
#define OPENCV_HAL_IMPL_NEON_EXTRACT_N(_Tpvec, _Tp, suffix) \
template<int i> inline _Tp v_extract_n(_Tpvec v) { return vgetq_lane_##suffix(v.val, i); }
OPENCV_HAL_IMPL_NEON_EXTRACT_N(v_uint8x16, uchar, u8)
OPENCV_HAL_IMPL_NEON_EXTRACT_N(v_int8x16, schar, s8)
OPENCV_HAL_IMPL_NEON_EXTRACT_N(v_uint16x8, ushort, u16)
OPENCV_HAL_IMPL_NEON_EXTRACT_N(v_int16x8, short, s16)
OPENCV_HAL_IMPL_NEON_EXTRACT_N(v_uint32x4, uint, u32)
OPENCV_HAL_IMPL_NEON_EXTRACT_N(v_int32x4, int, s32)
OPENCV_HAL_IMPL_NEON_EXTRACT_N(v_uint64x2, uint64, u64)
OPENCV_HAL_IMPL_NEON_EXTRACT_N(v_int64x2, int64, s64)
OPENCV_HAL_IMPL_NEON_EXTRACT_N(v_float32x4, float, f32)
#if CV_SIMD128_64F
OPENCV_HAL_IMPL_NEON_EXTRACT_N(v_float64x2, double, f64)
#endif
#define OPENCV_HAL_IMPL_NEON_BROADCAST(_Tpvec, _Tp, suffix) \
template<int i> inline _Tpvec v_broadcast_element(_Tpvec v) { _Tp t = v_extract_n<i>(v); return v_setall_##suffix(t); }
OPENCV_HAL_IMPL_NEON_BROADCAST(v_uint8x16, uchar, u8)
OPENCV_HAL_IMPL_NEON_BROADCAST(v_int8x16, schar, s8)
OPENCV_HAL_IMPL_NEON_BROADCAST(v_uint16x8, ushort, u16)
OPENCV_HAL_IMPL_NEON_BROADCAST(v_int16x8, short, s16)
OPENCV_HAL_IMPL_NEON_BROADCAST(v_uint32x4, uint, u32)
OPENCV_HAL_IMPL_NEON_BROADCAST(v_int32x4, int, s32)
OPENCV_HAL_IMPL_NEON_BROADCAST(v_uint64x2, uint64, u64)
OPENCV_HAL_IMPL_NEON_BROADCAST(v_int64x2, int64, s64)
OPENCV_HAL_IMPL_NEON_BROADCAST(v_float32x4, float, f32)
#if CV_SIMD128_64F
OPENCV_HAL_IMPL_NEON_BROADCAST(v_float64x2, double, f64)
#endif
#if CV_SIMD128_64F
inline v_int32x4 v_round(const v_float32x4& a)
{
@ -1570,6 +2034,10 @@ inline v_float64x2 v_cvt_f64_high(const v_float32x4& a)
{
return v_float64x2(vcvt_f64_f32(vget_high_f32(a.val)));
}
inline v_float64x2 v_cvt_f64(const v_int64x2& a)
{ return v_float64x2(vcvtq_f64_s64(a.val)); }
#endif
////////////// Lookup table access ////////////////////
@ -1732,10 +2200,12 @@ inline v_float32x4 v_lut(const float* tab, const int* idx)
}
inline v_float32x4 v_lut_pairs(const float* tab, const int* idx)
{
typedef uint64 CV_DECL_ALIGNED(1) unaligned_uint64;
uint64 CV_DECL_ALIGNED(32) elems[2] =
{
*(uint64*)(tab + idx[0]),
*(uint64*)(tab + idx[1])
*(unaligned_uint64*)(tab + idx[0]),
*(unaligned_uint64*)(tab + idx[1])
};
return v_float32x4(vreinterpretq_f32_u64(vld1q_u64(elems)));
}
@ -1924,16 +2394,6 @@ inline void v_pack_store(float16_t* ptr, const v_float32x4& v)
inline void v_cleanup() {}
//! @name Check SIMD support
//! @{
//! @brief Check CPU capability of SIMD operation
static inline bool hasSIMD128()
{
return (CV_CPU_HAS_SUPPORT_NEON) ? true : false;
}
//! @}
CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END
//! @endcond


@ -57,6 +57,14 @@ namespace cv
//! @cond IGNORED
//
// Compilation troubleshooting:
// - MSVC: error C2719: 'a': formal parameter with requested alignment of 16 won't be aligned
// Replace the parameter declaration with a const reference:
// -v_int32x4 a
// +const v_int32x4& a
//
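// For example (hypothetical helper, shown only to illustrate the fix above):
//   before: inline v_int32x4 v_my_op(v_int32x4 a, v_int32x4 b);              // C2719 on 32-bit MSVC
//   after:  inline v_int32x4 v_my_op(const v_int32x4& a, const v_int32x4& b);
//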
CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN
///////// Types ////////////
@ -67,7 +75,8 @@ struct v_uint8x16
typedef __m128i vector_type;
enum { nlanes = 16 };
v_uint8x16() : val(_mm_setzero_si128()) {}
/* coverity[uninit_ctor]: suppress warning */
v_uint8x16() {}
explicit v_uint8x16(__m128i v) : val(v) {}
v_uint8x16(uchar v0, uchar v1, uchar v2, uchar v3, uchar v4, uchar v5, uchar v6, uchar v7,
uchar v8, uchar v9, uchar v10, uchar v11, uchar v12, uchar v13, uchar v14, uchar v15)
@ -77,6 +86,7 @@ struct v_uint8x16
(char)v8, (char)v9, (char)v10, (char)v11,
(char)v12, (char)v13, (char)v14, (char)v15);
}
uchar get0() const
{
return (uchar)_mm_cvtsi128_si32(val);
@ -91,7 +101,8 @@ struct v_int8x16
typedef __m128i vector_type;
enum { nlanes = 16 };
v_int8x16() : val(_mm_setzero_si128()) {}
/* coverity[uninit_ctor]: suppress warning */
v_int8x16() {}
explicit v_int8x16(__m128i v) : val(v) {}
v_int8x16(schar v0, schar v1, schar v2, schar v3, schar v4, schar v5, schar v6, schar v7,
schar v8, schar v9, schar v10, schar v11, schar v12, schar v13, schar v14, schar v15)
@ -101,6 +112,7 @@ struct v_int8x16
(char)v8, (char)v9, (char)v10, (char)v11,
(char)v12, (char)v13, (char)v14, (char)v15);
}
schar get0() const
{
return (schar)_mm_cvtsi128_si32(val);
@ -115,13 +127,15 @@ struct v_uint16x8
typedef __m128i vector_type;
enum { nlanes = 8 };
v_uint16x8() : val(_mm_setzero_si128()) {}
/* coverity[uninit_ctor]: suppress warning */
v_uint16x8() {}
explicit v_uint16x8(__m128i v) : val(v) {}
v_uint16x8(ushort v0, ushort v1, ushort v2, ushort v3, ushort v4, ushort v5, ushort v6, ushort v7)
{
val = _mm_setr_epi16((short)v0, (short)v1, (short)v2, (short)v3,
(short)v4, (short)v5, (short)v6, (short)v7);
}
ushort get0() const
{
return (ushort)_mm_cvtsi128_si32(val);
@ -136,13 +150,15 @@ struct v_int16x8
typedef __m128i vector_type;
enum { nlanes = 8 };
v_int16x8() : val(_mm_setzero_si128()) {}
/* coverity[uninit_ctor]: suppress warning */
v_int16x8() {}
explicit v_int16x8(__m128i v) : val(v) {}
v_int16x8(short v0, short v1, short v2, short v3, short v4, short v5, short v6, short v7)
{
val = _mm_setr_epi16((short)v0, (short)v1, (short)v2, (short)v3,
(short)v4, (short)v5, (short)v6, (short)v7);
}
short get0() const
{
return (short)_mm_cvtsi128_si32(val);
@ -157,12 +173,14 @@ struct v_uint32x4
typedef __m128i vector_type;
enum { nlanes = 4 };
v_uint32x4() : val(_mm_setzero_si128()) {}
/* coverity[uninit_ctor]: suppress warning */
v_uint32x4() {}
explicit v_uint32x4(__m128i v) : val(v) {}
v_uint32x4(unsigned v0, unsigned v1, unsigned v2, unsigned v3)
{
val = _mm_setr_epi32((int)v0, (int)v1, (int)v2, (int)v3);
}
unsigned get0() const
{
return (unsigned)_mm_cvtsi128_si32(val);
@ -177,12 +195,14 @@ struct v_int32x4
typedef __m128i vector_type;
enum { nlanes = 4 };
v_int32x4() : val(_mm_setzero_si128()) {}
/* coverity[uninit_ctor]: suppress warning */
v_int32x4() {}
explicit v_int32x4(__m128i v) : val(v) {}
v_int32x4(int v0, int v1, int v2, int v3)
{
val = _mm_setr_epi32(v0, v1, v2, v3);
}
int get0() const
{
return _mm_cvtsi128_si32(val);
@ -197,12 +217,14 @@ struct v_float32x4
typedef __m128 vector_type;
enum { nlanes = 4 };
v_float32x4() : val(_mm_setzero_ps()) {}
/* coverity[uninit_ctor]: suppress warning */
v_float32x4() {}
explicit v_float32x4(__m128 v) : val(v) {}
v_float32x4(float v0, float v1, float v2, float v3)
{
val = _mm_setr_ps(v0, v1, v2, v3);
}
float get0() const
{
return _mm_cvtss_f32(val);
@ -217,17 +239,23 @@ struct v_uint64x2
typedef __m128i vector_type;
enum { nlanes = 2 };
v_uint64x2() : val(_mm_setzero_si128()) {}
/* coverity[uninit_ctor]: suppress warning */
v_uint64x2() {}
explicit v_uint64x2(__m128i v) : val(v) {}
v_uint64x2(uint64 v0, uint64 v1)
{
val = _mm_setr_epi32((int)v0, (int)(v0 >> 32), (int)v1, (int)(v1 >> 32));
}
uint64 get0() const
{
#if !defined(__x86_64__) && !defined(_M_X64)
int a = _mm_cvtsi128_si32(val);
int b = _mm_cvtsi128_si32(_mm_srli_epi64(val, 32));
return (unsigned)a | ((uint64)(unsigned)b << 32);
#else
return (uint64)_mm_cvtsi128_si64(val);
#endif
}
__m128i val;
@ -239,17 +267,23 @@ struct v_int64x2
typedef __m128i vector_type;
enum { nlanes = 2 };
v_int64x2() : val(_mm_setzero_si128()) {}
/* coverity[uninit_ctor]: suppress warning */
v_int64x2() {}
explicit v_int64x2(__m128i v) : val(v) {}
v_int64x2(int64 v0, int64 v1)
{
val = _mm_setr_epi32((int)v0, (int)(v0 >> 32), (int)v1, (int)(v1 >> 32));
}
int64 get0() const
{
#if !defined(__x86_64__) && !defined(_M_X64)
int a = _mm_cvtsi128_si32(val);
int b = _mm_cvtsi128_si32(_mm_srli_epi64(val, 32));
return (int64)((unsigned)a | ((uint64)(unsigned)b << 32));
#else
return _mm_cvtsi128_si64(val);
#endif
}
__m128i val;
@ -261,12 +295,14 @@ struct v_float64x2
typedef __m128d vector_type;
enum { nlanes = 2 };
v_float64x2() : val(_mm_setzero_pd()) {}
/* coverity[uninit_ctor]: suppress warning */
v_float64x2() {}
explicit v_float64x2(__m128d v) : val(v) {}
v_float64x2(double v0, double v1)
{
val = _mm_setr_pd(v0, v1);
}
double get0() const
{
return _mm_cvtsd_f64(val);
@ -302,8 +338,8 @@ inline _Tpvec v_setall_##suffix(_Tp v) { return _Tpvec(_mm_set1_##ssuffix((_Tps)
template<typename _Tpvec0> inline _Tpvec v_reinterpret_as_##suffix(const _Tpvec0& a) \
{ return _Tpvec(cast(a.val)); }
OPENCV_HAL_IMPL_SSE_INITVEC(v_uint8x16, uchar, u8, si128, epi8, char, OPENCV_HAL_NOP)
OPENCV_HAL_IMPL_SSE_INITVEC(v_int8x16, schar, s8, si128, epi8, char, OPENCV_HAL_NOP)
OPENCV_HAL_IMPL_SSE_INITVEC(v_uint8x16, uchar, u8, si128, epi8, schar, OPENCV_HAL_NOP)
OPENCV_HAL_IMPL_SSE_INITVEC(v_int8x16, schar, s8, si128, epi8, schar, OPENCV_HAL_NOP)
OPENCV_HAL_IMPL_SSE_INITVEC(v_uint16x8, ushort, u16, si128, epi16, short, OPENCV_HAL_NOP)
OPENCV_HAL_IMPL_SSE_INITVEC(v_int16x8, short, s16, si128, epi16, short, OPENCV_HAL_NOP)
OPENCV_HAL_IMPL_SSE_INITVEC(v_uint32x4, unsigned, u32, si128, epi32, int, OPENCV_HAL_NOP)
@ -791,15 +827,195 @@ inline void v_mul_expand(const v_uint32x4& a, const v_uint32x4& b,
inline v_int16x8 v_mul_hi(const v_int16x8& a, const v_int16x8& b) { return v_int16x8(_mm_mulhi_epi16(a.val, b.val)); }
inline v_uint16x8 v_mul_hi(const v_uint16x8& a, const v_uint16x8& b) { return v_uint16x8(_mm_mulhi_epu16(a.val, b.val)); }
inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b)
{
return v_int32x4(_mm_madd_epi16(a.val, b.val));
}
//////// Dot Product ////////
// 16 >> 32
inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b)
{ return v_int32x4(_mm_madd_epi16(a.val, b.val)); }
inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c)
{ return v_dotprod(a, b) + c; }
// 32 >> 64
inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b)
{
return v_int32x4(_mm_add_epi32(_mm_madd_epi16(a.val, b.val), c.val));
#if CV_SSE4_1
__m128i even = _mm_mul_epi32(a.val, b.val);
__m128i odd = _mm_mul_epi32(_mm_srli_epi64(a.val, 32), _mm_srli_epi64(b.val, 32));
return v_int64x2(_mm_add_epi64(even, odd));
#else
__m128i even_u = _mm_mul_epu32(a.val, b.val);
__m128i odd_u = _mm_mul_epu32(_mm_srli_epi64(a.val, 32), _mm_srli_epi64(b.val, 32));
// convert unsigned to signed high multiplication (from: Agner Fog (veclib) and H. S. Warren: Hacker's Delight, 2003, p. 132)
__m128i a_sign = _mm_srai_epi32(a.val, 31);
__m128i b_sign = _mm_srai_epi32(b.val, 31);
// each operand masked by the sign of the other
__m128i axb = _mm_and_si128(a.val, b_sign);
__m128i bxa = _mm_and_si128(b.val, a_sign);
// sum of sign corrections
__m128i ssum = _mm_add_epi32(bxa, axb);
__m128i even_ssum = _mm_slli_epi64(ssum, 32);
__m128i odd_ssum = _mm_and_si128(ssum, _mm_set_epi32(-1, 0, -1, 0));
// convert to signed and prod
return v_int64x2(_mm_add_epi64(_mm_sub_epi64(even_u, even_ssum), _mm_sub_epi64(odd_u, odd_ssum)));
#endif
}
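// Illustrative scalar reference for the SSE2 branch above (hypothetical helper,
// not part of this header): reinterpret the signed 32-bit inputs as unsigned,
// take the unsigned 64-bit product, and subtract the two sign corrections
// shifted into the high half. This is the identity the vector code applies to
// the even and odd 32x32 products:
//   a * b  ==  ua*ub - (((b < 0 ? ua : 0) + (a < 0 ? ub : 0)) << 32)   (mod 2^64)
static inline int64 mul32x32_signed_via_unsigned(int a, int b)
{
    uint64 ua = (unsigned)a, ub = (unsigned)b;    // same bit patterns, zero-extended
    uint64 prod = ua * ub;                        // unsigned 64-bit product
    uint64 corr = ((b < 0 ? ua : 0) + (a < 0 ? ub : 0)) << 32;
    return (int64)(prod - corr);                  // wraps exactly like the vector lanes
}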
inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b, const v_int64x2& c)
{ return v_dotprod(a, b) + c; }
// 8 >> 32
inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b)
{
__m128i a0 = _mm_srli_epi16(_mm_slli_si128(a.val, 1), 8); // even
__m128i a1 = _mm_srli_epi16(a.val, 8); // odd
__m128i b0 = _mm_srli_epi16(_mm_slli_si128(b.val, 1), 8);
__m128i b1 = _mm_srli_epi16(b.val, 8);
__m128i p0 = _mm_madd_epi16(a0, b0);
__m128i p1 = _mm_madd_epi16(a1, b1);
return v_uint32x4(_mm_add_epi32(p0, p1));
}
inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b, const v_uint32x4& c)
{ return v_dotprod_expand(a, b) + c; }
inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b)
{
__m128i a0 = _mm_srai_epi16(_mm_slli_si128(a.val, 1), 8); // even
__m128i a1 = _mm_srai_epi16(a.val, 8); // odd
__m128i b0 = _mm_srai_epi16(_mm_slli_si128(b.val, 1), 8);
__m128i b1 = _mm_srai_epi16(b.val, 8);
__m128i p0 = _mm_madd_epi16(a0, b0);
__m128i p1 = _mm_madd_epi16(a1, b1);
return v_int32x4(_mm_add_epi32(p0, p1));
}
inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b, const v_int32x4& c)
{ return v_dotprod_expand(a, b) + c; }
// 16 >> 64
inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b)
{
v_uint32x4 c, d;
v_mul_expand(a, b, c, d);
v_uint64x2 c0, c1, d0, d1;
v_expand(c, c0, c1);
v_expand(d, d0, d1);
c0 += c1; d0 += d1;
return v_uint64x2(_mm_add_epi64(
_mm_unpacklo_epi64(c0.val, d0.val),
_mm_unpackhi_epi64(c0.val, d0.val)
));
}
inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b, const v_uint64x2& c)
{ return v_dotprod_expand(a, b) + c; }
inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b)
{
v_int32x4 prod = v_dotprod(a, b);
v_int64x2 c, d;
v_expand(prod, c, d);
return v_int64x2(_mm_add_epi64(
_mm_unpacklo_epi64(c.val, d.val),
_mm_unpackhi_epi64(c.val, d.val)
));
}
inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b, const v_int64x2& c)
{ return v_dotprod_expand(a, b) + c; }
// 32 >> 64f
inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b)
{
#if CV_SSE4_1
return v_cvt_f64(v_dotprod(a, b));
#else
v_float64x2 c = v_cvt_f64(a) * v_cvt_f64(b);
v_float64x2 d = v_cvt_f64_high(a) * v_cvt_f64_high(b);
return v_float64x2(_mm_add_pd(
_mm_unpacklo_pd(c.val, d.val),
_mm_unpackhi_pd(c.val, d.val)
));
#endif
}
inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b, const v_float64x2& c)
{ return v_dotprod_expand(a, b) + c; }
//////// Fast Dot Product ////////
// 16 >> 32
inline v_int32x4 v_dotprod_fast(const v_int16x8& a, const v_int16x8& b)
{ return v_dotprod(a, b); }
inline v_int32x4 v_dotprod_fast(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c)
{ return v_dotprod(a, b) + c; }
// 32 >> 64
inline v_int64x2 v_dotprod_fast(const v_int32x4& a, const v_int32x4& b)
{ return v_dotprod(a, b); }
inline v_int64x2 v_dotprod_fast(const v_int32x4& a, const v_int32x4& b, const v_int64x2& c)
{ return v_dotprod_fast(a, b) + c; }
// 8 >> 32
inline v_uint32x4 v_dotprod_expand_fast(const v_uint8x16& a, const v_uint8x16& b)
{
__m128i a0 = v_expand_low(a).val;
__m128i a1 = v_expand_high(a).val;
__m128i b0 = v_expand_low(b).val;
__m128i b1 = v_expand_high(b).val;
__m128i p0 = _mm_madd_epi16(a0, b0);
__m128i p1 = _mm_madd_epi16(a1, b1);
return v_uint32x4(_mm_add_epi32(p0, p1));
}
inline v_uint32x4 v_dotprod_expand_fast(const v_uint8x16& a, const v_uint8x16& b, const v_uint32x4& c)
{ return v_dotprod_expand_fast(a, b) + c; }
inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b)
{
#if CV_SSE4_1
__m128i a0 = _mm_cvtepi8_epi16(a.val);
__m128i a1 = v_expand_high(a).val;
__m128i b0 = _mm_cvtepi8_epi16(b.val);
__m128i b1 = v_expand_high(b).val;
__m128i p0 = _mm_madd_epi16(a0, b0);
__m128i p1 = _mm_madd_epi16(a1, b1);
return v_int32x4(_mm_add_epi32(p0, p1));
#else
return v_dotprod_expand(a, b);
#endif
}
inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b, const v_int32x4& c)
{ return v_dotprod_expand_fast(a, b) + c; }
// 16 >> 64
inline v_uint64x2 v_dotprod_expand_fast(const v_uint16x8& a, const v_uint16x8& b)
{
v_uint32x4 c, d;
v_mul_expand(a, b, c, d);
v_uint64x2 c0, c1, d0, d1;
v_expand(c, c0, c1);
v_expand(d, d0, d1);
c0 += c1; d0 += d1;
return c0 + d0;
}
inline v_uint64x2 v_dotprod_expand_fast(const v_uint16x8& a, const v_uint16x8& b, const v_uint64x2& c)
{ return v_dotprod_expand_fast(a, b) + c; }
inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b)
{
v_int32x4 prod = v_dotprod(a, b);
v_int64x2 c, d;
v_expand(prod, c, d);
return c + d;
}
inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b, const v_int64x2& c)
{ return v_dotprod_expand_fast(a, b) + c; }
// 32 >> 64f
v_float64x2 v_fma(const v_float64x2& a, const v_float64x2& b, const v_float64x2& c);
inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b)
{ return v_fma(v_cvt_f64(a), v_cvt_f64(b), v_cvt_f64_high(a) * v_cvt_f64_high(b)); }
inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b, const v_float64x2& c)
{ return v_fma(v_cvt_f64(a), v_cvt_f64(b), v_fma(v_cvt_f64_high(a), v_cvt_f64_high(b), c)); }
#define OPENCV_HAL_IMPL_SSE_LOGIC_OP(_Tpvec, suffix, not_const) \
OPENCV_HAL_IMPL_SSE_BIN_OP(&, _Tpvec, _mm_and_##suffix) \
@ -1032,14 +1248,23 @@ inline _Tpvec operator >= (const _Tpvec& a, const _Tpvec& b) \
OPENCV_HAL_IMPL_SSE_FLT_CMP_OP(v_float32x4, ps)
OPENCV_HAL_IMPL_SSE_FLT_CMP_OP(v_float64x2, pd)
#define OPENCV_HAL_IMPL_SSE_64BIT_CMP_OP(_Tpvec, cast) \
#if CV_SSE4_1
#define OPENCV_HAL_IMPL_SSE_64BIT_CMP_OP(_Tpvec) \
inline _Tpvec operator == (const _Tpvec& a, const _Tpvec& b) \
{ return cast(v_reinterpret_as_f64(a) == v_reinterpret_as_f64(b)); } \
{ return _Tpvec(_mm_cmpeq_epi64(a.val, b.val)); } \
inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \
{ return cast(v_reinterpret_as_f64(a) != v_reinterpret_as_f64(b)); }
{ return ~(a == b); }
#else
#define OPENCV_HAL_IMPL_SSE_64BIT_CMP_OP(_Tpvec) \
inline _Tpvec operator == (const _Tpvec& a, const _Tpvec& b) \
{ __m128i cmp = _mm_cmpeq_epi32(a.val, b.val); \
return _Tpvec(_mm_and_si128(cmp, _mm_shuffle_epi32(cmp, _MM_SHUFFLE(2, 3, 0, 1)))); } \
inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \
{ return ~(a == b); }
#endif
OPENCV_HAL_IMPL_SSE_64BIT_CMP_OP(v_uint64x2, v_reinterpret_as_u64)
OPENCV_HAL_IMPL_SSE_64BIT_CMP_OP(v_int64x2, v_reinterpret_as_s64)
OPENCV_HAL_IMPL_SSE_64BIT_CMP_OP(v_uint64x2)
OPENCV_HAL_IMPL_SSE_64BIT_CMP_OP(v_int64x2)
inline v_float32x4 v_not_nan(const v_float32x4& a)
{ return v_float32x4(_mm_cmpord_ps(a.val, a.val)); }
@ -1393,6 +1618,41 @@ inline void v_store_high(_Tp* ptr, const _Tpvec& a) \
OPENCV_HAL_IMPL_SSE_LOADSTORE_FLT_OP(v_float32x4, float, ps)
OPENCV_HAL_IMPL_SSE_LOADSTORE_FLT_OP(v_float64x2, double, pd)
inline unsigned v_reduce_sum(const v_uint8x16& a)
{
__m128i half = _mm_sad_epu8(a.val, _mm_setzero_si128());
return (unsigned)_mm_cvtsi128_si32(_mm_add_epi32(half, _mm_unpackhi_epi64(half, half)));
}
inline int v_reduce_sum(const v_int8x16& a)
{
__m128i half = _mm_set1_epi8((schar)-128);
half = _mm_sad_epu8(_mm_xor_si128(a.val, half), _mm_setzero_si128());
return _mm_cvtsi128_si32(_mm_add_epi32(half, _mm_unpackhi_epi64(half, half))) - 2048;
}
#define OPENCV_HAL_IMPL_SSE_REDUCE_OP_16(func) \
inline schar v_reduce_##func(const v_int8x16& a) \
{ \
__m128i val = a.val; \
__m128i smask = _mm_set1_epi8((schar)-128); \
val = _mm_xor_si128(val, smask); \
val = _mm_##func##_epu8(val, _mm_srli_si128(val,8)); \
val = _mm_##func##_epu8(val, _mm_srli_si128(val,4)); \
val = _mm_##func##_epu8(val, _mm_srli_si128(val,2)); \
val = _mm_##func##_epu8(val, _mm_srli_si128(val,1)); \
return (schar)_mm_cvtsi128_si32(val) ^ (schar)-128; \
} \
inline uchar v_reduce_##func(const v_uint8x16& a) \
{ \
__m128i val = a.val; \
val = _mm_##func##_epu8(val, _mm_srli_si128(val,8)); \
val = _mm_##func##_epu8(val, _mm_srli_si128(val,4)); \
val = _mm_##func##_epu8(val, _mm_srli_si128(val,2)); \
val = _mm_##func##_epu8(val, _mm_srli_si128(val,1)); \
return (uchar)_mm_cvtsi128_si32(val); \
}
OPENCV_HAL_IMPL_SSE_REDUCE_OP_16(max)
OPENCV_HAL_IMPL_SSE_REDUCE_OP_16(min)
#define OPENCV_HAL_IMPL_SSE_REDUCE_OP_8(_Tpvec, scalartype, func, suffix, sbit) \
inline scalartype v_reduce_##func(const v_##_Tpvec& a) \
{ \
@ -1412,26 +1672,8 @@ inline unsigned scalartype v_reduce_##func(const v_u##_Tpvec& a) \
val = _mm_##func##_##suffix(val, _mm_srli_si128(val,2)); \
return (unsigned scalartype)(_mm_cvtsi128_si32(val) ^ sbit); \
}
#define OPENCV_HAL_IMPL_SSE_REDUCE_OP_8_SUM(_Tpvec, scalartype, suffix) \
inline scalartype v_reduce_sum(const v_##_Tpvec& a) \
{ \
__m128i val = a.val; \
val = _mm_adds_epi##suffix(val, _mm_srli_si128(val, 8)); \
val = _mm_adds_epi##suffix(val, _mm_srli_si128(val, 4)); \
val = _mm_adds_epi##suffix(val, _mm_srli_si128(val, 2)); \
return (scalartype)_mm_cvtsi128_si32(val); \
} \
inline unsigned scalartype v_reduce_sum(const v_u##_Tpvec& a) \
{ \
__m128i val = a.val; \
val = _mm_adds_epu##suffix(val, _mm_srli_si128(val, 8)); \
val = _mm_adds_epu##suffix(val, _mm_srli_si128(val, 4)); \
val = _mm_adds_epu##suffix(val, _mm_srli_si128(val, 2)); \
return (unsigned scalartype)_mm_cvtsi128_si32(val); \
}
OPENCV_HAL_IMPL_SSE_REDUCE_OP_8(int16x8, short, max, epi16, (short)-32768)
OPENCV_HAL_IMPL_SSE_REDUCE_OP_8(int16x8, short, min, epi16, (short)-32768)
OPENCV_HAL_IMPL_SSE_REDUCE_OP_8_SUM(int16x8, short, 16)
#define OPENCV_HAL_IMPL_SSE_REDUCE_OP_4_SUM(_Tpvec, scalartype, regtype, suffix, cast_from, cast_to, extract) \
inline scalartype v_reduce_sum(const _Tpvec& a) \
@ -1456,6 +1698,23 @@ OPENCV_HAL_IMPL_SSE_REDUCE_OP_4_SUM(v_uint32x4, unsigned, __m128i, epi32, OPENCV
OPENCV_HAL_IMPL_SSE_REDUCE_OP_4_SUM(v_int32x4, int, __m128i, epi32, OPENCV_HAL_NOP, OPENCV_HAL_NOP, si128_si32)
OPENCV_HAL_IMPL_SSE_REDUCE_OP_4_SUM(v_float32x4, float, __m128, ps, _mm_castps_si128, _mm_castsi128_ps, ss_f32)
inline int v_reduce_sum(const v_int16x8& a)
{ return v_reduce_sum(v_expand_low(a) + v_expand_high(a)); }
inline unsigned v_reduce_sum(const v_uint16x8& a)
{ return v_reduce_sum(v_expand_low(a) + v_expand_high(a)); }
inline uint64 v_reduce_sum(const v_uint64x2& a)
{
uint64 CV_DECL_ALIGNED(32) idx[2];
v_store_aligned(idx, a);
return idx[0] + idx[1];
}
inline int64 v_reduce_sum(const v_int64x2& a)
{
int64 CV_DECL_ALIGNED(32) idx[2];
v_store_aligned(idx, a);
return idx[0] + idx[1];
}
inline double v_reduce_sum(const v_float64x2& a)
{
double CV_DECL_ALIGNED(32) idx[2];
@ -1486,13 +1745,14 @@ OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_float32x4, float, min, std::min)
inline unsigned v_reduce_sad(const v_uint8x16& a, const v_uint8x16& b)
{
return (unsigned)_mm_cvtsi128_si32(_mm_sad_epu8(a.val, b.val));
__m128i half = _mm_sad_epu8(a.val, b.val);
return (unsigned)_mm_cvtsi128_si32(_mm_add_epi32(half, _mm_unpackhi_epi64(half, half)));
}
inline unsigned v_reduce_sad(const v_int8x16& a, const v_int8x16& b)
{
__m128i half = _mm_set1_epi8(0x7f);
return (unsigned)_mm_cvtsi128_si32(_mm_sad_epu8(_mm_add_epi8(a.val, half),
_mm_add_epi8(b.val, half)));
half = _mm_sad_epu8(_mm_add_epi8(a.val, half), _mm_add_epi8(b.val, half));
return (unsigned)_mm_cvtsi128_si32(_mm_add_epi32(half, _mm_unpackhi_epi64(half, half)));
}
inline unsigned v_reduce_sad(const v_uint16x8& a, const v_uint16x8& b)
{
@ -1519,53 +1779,73 @@ inline float v_reduce_sad(const v_float32x4& a, const v_float32x4& b)
return v_reduce_sum(v_absdiff(a, b));
}
#define OPENCV_HAL_IMPL_SSE_POPCOUNT(_Tpvec) \
inline v_uint32x4 v_popcount(const _Tpvec& a) \
{ \
__m128i m1 = _mm_set1_epi32(0x55555555); \
__m128i m2 = _mm_set1_epi32(0x33333333); \
__m128i m4 = _mm_set1_epi32(0x0f0f0f0f); \
__m128i p = a.val; \
p = _mm_add_epi32(_mm_and_si128(_mm_srli_epi32(p, 1), m1), _mm_and_si128(p, m1)); \
p = _mm_add_epi32(_mm_and_si128(_mm_srli_epi32(p, 2), m2), _mm_and_si128(p, m2)); \
p = _mm_add_epi32(_mm_and_si128(_mm_srli_epi32(p, 4), m4), _mm_and_si128(p, m4)); \
p = _mm_adds_epi8(p, _mm_srli_si128(p, 1)); \
p = _mm_adds_epi8(p, _mm_srli_si128(p, 2)); \
return v_uint32x4(_mm_and_si128(p, _mm_set1_epi32(0x000000ff))); \
}
OPENCV_HAL_IMPL_SSE_POPCOUNT(v_uint8x16)
OPENCV_HAL_IMPL_SSE_POPCOUNT(v_uint16x8)
OPENCV_HAL_IMPL_SSE_POPCOUNT(v_uint32x4)
OPENCV_HAL_IMPL_SSE_POPCOUNT(v_int8x16)
OPENCV_HAL_IMPL_SSE_POPCOUNT(v_int16x8)
OPENCV_HAL_IMPL_SSE_POPCOUNT(v_int32x4)
#define OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(_Tpvec, suffix, pack_op, and_op, signmask, allmask) \
inline int v_signmask(const _Tpvec& a) \
{ \
return and_op(_mm_movemask_##suffix(pack_op(a.val)), signmask); \
} \
inline bool v_check_all(const _Tpvec& a) \
{ return and_op(_mm_movemask_##suffix(a.val), allmask) == allmask; } \
inline bool v_check_any(const _Tpvec& a) \
{ return and_op(_mm_movemask_##suffix(a.val), allmask) != 0; }
#define OPENCV_HAL_PACKS(a) _mm_packs_epi16(a, a)
inline __m128i v_packq_epi32(__m128i a)
inline v_uint8x16 v_popcount(const v_uint8x16& a)
{
__m128i b = _mm_packs_epi32(a, a);
return _mm_packs_epi16(b, b);
__m128i m1 = _mm_set1_epi32(0x55555555);
__m128i m2 = _mm_set1_epi32(0x33333333);
__m128i m4 = _mm_set1_epi32(0x0f0f0f0f);
__m128i p = a.val;
p = _mm_add_epi32(_mm_and_si128(_mm_srli_epi32(p, 1), m1), _mm_and_si128(p, m1));
p = _mm_add_epi32(_mm_and_si128(_mm_srli_epi32(p, 2), m2), _mm_and_si128(p, m2));
p = _mm_add_epi32(_mm_and_si128(_mm_srli_epi32(p, 4), m4), _mm_and_si128(p, m4));
return v_uint8x16(p);
}
inline v_uint16x8 v_popcount(const v_uint16x8& a)
{
v_uint8x16 p = v_popcount(v_reinterpret_as_u8(a));
p += v_rotate_right<1>(p);
return v_reinterpret_as_u16(p) & v_setall_u16(0x00ff);
}
inline v_uint32x4 v_popcount(const v_uint32x4& a)
{
v_uint8x16 p = v_popcount(v_reinterpret_as_u8(a));
p += v_rotate_right<1>(p);
p += v_rotate_right<2>(p);
return v_reinterpret_as_u32(p) & v_setall_u32(0x000000ff);
}
inline v_uint64x2 v_popcount(const v_uint64x2& a)
{
return v_uint64x2(_mm_sad_epu8(v_popcount(v_reinterpret_as_u8(a)).val, _mm_setzero_si128()));
}
inline v_uint8x16 v_popcount(const v_int8x16& a)
{ return v_popcount(v_reinterpret_as_u8(a)); }
inline v_uint16x8 v_popcount(const v_int16x8& a)
{ return v_popcount(v_reinterpret_as_u16(a)); }
inline v_uint32x4 v_popcount(const v_int32x4& a)
{ return v_popcount(v_reinterpret_as_u32(a)); }
inline v_uint64x2 v_popcount(const v_int64x2& a)
{ return v_popcount(v_reinterpret_as_u64(a)); }
OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_uint8x16, epi8, OPENCV_HAL_NOP, OPENCV_HAL_1ST, 65535, 65535)
OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_int8x16, epi8, OPENCV_HAL_NOP, OPENCV_HAL_1ST, 65535, 65535)
OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_uint16x8, epi8, OPENCV_HAL_PACKS, OPENCV_HAL_AND, 255, (int)0xaaaa)
OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_int16x8, epi8, OPENCV_HAL_PACKS, OPENCV_HAL_AND, 255, (int)0xaaaa)
OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_uint32x4, epi8, v_packq_epi32, OPENCV_HAL_AND, 15, (int)0x8888)
OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_int32x4, epi8, v_packq_epi32, OPENCV_HAL_AND, 15, (int)0x8888)
OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_float32x4, ps, OPENCV_HAL_NOP, OPENCV_HAL_1ST, 15, 15)
OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_float64x2, pd, OPENCV_HAL_NOP, OPENCV_HAL_1ST, 3, 3)
#define OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(_Tpvec, suffix, cast_op, allmask) \
inline int v_signmask(const _Tpvec& a) { return _mm_movemask_##suffix(cast_op(a.val)); } \
inline bool v_check_all(const _Tpvec& a) { return _mm_movemask_##suffix(cast_op(a.val)) == allmask; } \
inline bool v_check_any(const _Tpvec& a) { return _mm_movemask_##suffix(cast_op(a.val)) != 0; }
OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_uint8x16, epi8, OPENCV_HAL_NOP, 65535)
OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_int8x16, epi8, OPENCV_HAL_NOP, 65535)
OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_uint32x4, ps, _mm_castsi128_ps, 15)
OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_int32x4, ps, _mm_castsi128_ps, 15)
OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_uint64x2, pd, _mm_castsi128_pd, 3)
OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_int64x2, pd, _mm_castsi128_pd, 3)
OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_float32x4, ps, OPENCV_HAL_NOP, 15)
OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_float64x2, pd, OPENCV_HAL_NOP, 3)
#define OPENCV_HAL_IMPL_SSE_CHECK_SIGNS_SHORT(_Tpvec) \
inline int v_signmask(const _Tpvec& a) { return _mm_movemask_epi8(_mm_packs_epi16(a.val, a.val)) & 255; } \
inline bool v_check_all(const _Tpvec& a) { return (_mm_movemask_epi8(a.val) & 0xaaaa) == 0xaaaa; } \
inline bool v_check_any(const _Tpvec& a) { return (_mm_movemask_epi8(a.val) & 0xaaaa) != 0; }
OPENCV_HAL_IMPL_SSE_CHECK_SIGNS_SHORT(v_uint16x8)
OPENCV_HAL_IMPL_SSE_CHECK_SIGNS_SHORT(v_int16x8)
inline int v_scan_forward(const v_int8x16& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))); }
inline int v_scan_forward(const v_uint8x16& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))); }
inline int v_scan_forward(const v_int16x8& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 2; }
inline int v_scan_forward(const v_uint16x8& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 2; }
inline int v_scan_forward(const v_int32x4& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 4; }
inline int v_scan_forward(const v_uint32x4& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 4; }
inline int v_scan_forward(const v_float32x4& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 4; }
inline int v_scan_forward(const v_int64x2& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 8; }
inline int v_scan_forward(const v_uint64x2& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 8; }
inline int v_scan_forward(const v_float64x2& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 8; }
#if CV_SSE4_1
#define OPENCV_HAL_IMPL_SSE_SELECT(_Tpvec, cast_ret, cast, suffix) \
@ -1671,6 +1951,59 @@ OPENCV_HAL_IMPL_SSE_UNPACKS(v_int32x4, epi32, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
OPENCV_HAL_IMPL_SSE_UNPACKS(v_float32x4, ps, _mm_castps_si128, _mm_castsi128_ps)
OPENCV_HAL_IMPL_SSE_UNPACKS(v_float64x2, pd, _mm_castpd_si128, _mm_castsi128_pd)
inline v_uint8x16 v_reverse(const v_uint8x16 &a)
{
#if CV_SSSE3
static const __m128i perm = _mm_setr_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
return v_uint8x16(_mm_shuffle_epi8(a.val, perm));
#else
uchar CV_DECL_ALIGNED(32) d[16];
v_store_aligned(d, a);
return v_uint8x16(d[15], d[14], d[13], d[12], d[11], d[10], d[9], d[8], d[7], d[6], d[5], d[4], d[3], d[2], d[1], d[0]);
#endif
}
inline v_int8x16 v_reverse(const v_int8x16 &a)
{ return v_reinterpret_as_s8(v_reverse(v_reinterpret_as_u8(a))); }
inline v_uint16x8 v_reverse(const v_uint16x8 &a)
{
#if CV_SSSE3
static const __m128i perm = _mm_setr_epi8(14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1);
return v_uint16x8(_mm_shuffle_epi8(a.val, perm));
#else
__m128i r = _mm_shuffle_epi32(a.val, _MM_SHUFFLE(0, 1, 2, 3));
r = _mm_shufflelo_epi16(r, _MM_SHUFFLE(2, 3, 0, 1));
r = _mm_shufflehi_epi16(r, _MM_SHUFFLE(2, 3, 0, 1));
return v_uint16x8(r);
#endif
}
inline v_int16x8 v_reverse(const v_int16x8 &a)
{ return v_reinterpret_as_s16(v_reverse(v_reinterpret_as_u16(a))); }
inline v_uint32x4 v_reverse(const v_uint32x4 &a)
{
return v_uint32x4(_mm_shuffle_epi32(a.val, _MM_SHUFFLE(0, 1, 2, 3)));
}
inline v_int32x4 v_reverse(const v_int32x4 &a)
{ return v_reinterpret_as_s32(v_reverse(v_reinterpret_as_u32(a))); }
inline v_float32x4 v_reverse(const v_float32x4 &a)
{ return v_reinterpret_as_f32(v_reverse(v_reinterpret_as_u32(a))); }
inline v_uint64x2 v_reverse(const v_uint64x2 &a)
{
return v_uint64x2(_mm_shuffle_epi32(a.val, _MM_SHUFFLE(1, 0, 3, 2)));
}
inline v_int64x2 v_reverse(const v_int64x2 &a)
{ return v_reinterpret_as_s64(v_reverse(v_reinterpret_as_u64(a))); }
inline v_float64x2 v_reverse(const v_float64x2 &a)
{ return v_reinterpret_as_f64(v_reverse(v_reinterpret_as_u64(a))); }
template<int s, typename _Tpvec>
inline _Tpvec v_extract(const _Tpvec& a, const _Tpvec& b)
{
@ -2684,18 +3017,31 @@ inline v_float64x2 v_cvt_f64_high(const v_float32x4& a)
return v_float64x2(_mm_cvtps_pd(_mm_movehl_ps(a.val, a.val)));
}
#if CV_FP16
inline v_float32x4 v128_load_fp16_f32(const short* ptr)
// from (Mysticial and wim) https://stackoverflow.com/q/41144668
inline v_float64x2 v_cvt_f64(const v_int64x2& v)
{
return v_float32x4(_mm_cvtph_ps(_mm_loadu_si128((const __m128i*)ptr)));
}
inline void v_store_fp16(short* ptr, const v_float32x4& a)
{
__m128i fp16_value = _mm_cvtps_ph(a.val, 0);
_mm_storel_epi64((__m128i*)ptr, fp16_value);
}
// constants encoded as floating-point
__m128i magic_i_hi32 = _mm_set1_epi64x(0x4530000080000000); // 2^84 + 2^63
__m128i magic_i_all = _mm_set1_epi64x(0x4530000080100000); // 2^84 + 2^63 + 2^52
__m128d magic_d_all = _mm_castsi128_pd(magic_i_all);
// Blend the 32 least significant bits of v with magic_i_lo
#if CV_SSE4_1
__m128i magic_i_lo = _mm_set1_epi64x(0x4330000000000000); // 2^52
__m128i v_lo = _mm_blend_epi16(v.val, magic_i_lo, 0xcc);
#else
__m128i magic_i_lo = _mm_set1_epi32(0x43300000); // 2^52
__m128i v_lo = _mm_unpacklo_epi32(_mm_shuffle_epi32(v.val, _MM_SHUFFLE(0, 0, 2, 0)), magic_i_lo);
#endif
// Extract the 32 most significant bits of v
__m128i v_hi = _mm_srli_epi64(v.val, 32);
// Flip the msb of v_hi and blend with 0x45300000
v_hi = _mm_xor_si128(v_hi, magic_i_hi32);
// Compute in double precision
__m128d v_hi_dbl = _mm_sub_pd(_mm_castsi128_pd(v_hi), magic_d_all);
// (v_hi - magic_d_all) + v_lo; do not assume associativity of floating-point addition
__m128d result = _mm_add_pd(v_hi_dbl, _mm_castsi128_pd(v_lo));
return v_float64x2(result);
}
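// Illustrative scalar form of the magic-constant trick above (hypothetical helper,
// not part of this header; assumes the usual IEEE-754 binary64 layout and uses the
// same union-based punning as v_extract_n below). The low 32 bits of v go into the
// mantissa of 2^52, the high 32 bits (with their msb flipped) into the mantissa of
// 2^84 + 2^63; subtracting 2^84 + 2^63 + 2^52 removes both biases at once.
static inline double cvt_int64_to_f64_scalar(int64 v)
{
    union { uint64 iv; double dv; } lo, hi, magic;
    lo.iv    = 0x4330000000000000ULL | (uint64)(unsigned)v;   // 2^52 plus the low 32 bits of v
    hi.iv    = 0x4530000080000000ULL ^ ((uint64)v >> 32);     // 2^84 + 2^63 with the msb-flipped high 32 bits of v
    magic.iv = 0x4530000080100000ULL;                         // 2^84 + 2^63 + 2^52
    return (hi.dv - magic.dv) + lo.dv;  // keep this order: the sum is not associative
}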
////////////// Lookup table access ////////////////////
@ -2952,10 +3298,107 @@ inline v_int32x4 v_pack_triplets(const v_int32x4& vec) { return vec; }
inline v_uint32x4 v_pack_triplets(const v_uint32x4& vec) { return vec; }
inline v_float32x4 v_pack_triplets(const v_float32x4& vec) { return vec; }
template<int i>
inline uchar v_extract_n(const v_uint8x16& v)
{
#if CV_SSE4_1
return (uchar)_mm_extract_epi8(v.val, i);
#else
return v_rotate_right<i>(v).get0();
#endif
}
template<int i>
inline schar v_extract_n(const v_int8x16& v)
{
return (schar)v_extract_n<i>(v_reinterpret_as_u8(v));
}
template<int i>
inline ushort v_extract_n(const v_uint16x8& v)
{
return (ushort)_mm_extract_epi16(v.val, i);
}
template<int i>
inline short v_extract_n(const v_int16x8& v)
{
return (short)v_extract_n<i>(v_reinterpret_as_u16(v));
}
template<int i>
inline uint v_extract_n(const v_uint32x4& v)
{
#if CV_SSE4_1
return (uint)_mm_extract_epi32(v.val, i);
#else
return v_rotate_right<i>(v).get0();
#endif
}
template<int i>
inline int v_extract_n(const v_int32x4& v)
{
return (int)v_extract_n<i>(v_reinterpret_as_u32(v));
}
template<int i>
inline uint64 v_extract_n(const v_uint64x2& v)
{
#ifdef CV__SIMD_NATIVE_mm_extract_epi64
return (uint64)_v128_extract_epi64<i>(v.val);
#else
return v_rotate_right<i>(v).get0();
#endif
}
template<int i>
inline int64 v_extract_n(const v_int64x2& v)
{
return (int64)v_extract_n<i>(v_reinterpret_as_u64(v));
}
template<int i>
inline float v_extract_n(const v_float32x4& v)
{
union { uint iv; float fv; } d;
d.iv = v_extract_n<i>(v_reinterpret_as_u32(v));
return d.fv;
}
template<int i>
inline double v_extract_n(const v_float64x2& v)
{
union { uint64 iv; double dv; } d;
d.iv = v_extract_n<i>(v_reinterpret_as_u64(v));
return d.dv;
}
template<int i>
inline v_int32x4 v_broadcast_element(const v_int32x4& v)
{
return v_int32x4(_mm_shuffle_epi32(v.val, _MM_SHUFFLE(i,i,i,i)));
}
template<int i>
inline v_uint32x4 v_broadcast_element(const v_uint32x4& v)
{
return v_uint32x4(_mm_shuffle_epi32(v.val, _MM_SHUFFLE(i,i,i,i)));
}
template<int i>
inline v_float32x4 v_broadcast_element(const v_float32x4& v)
{
return v_float32x4(_mm_shuffle_ps(v.val, v.val, _MM_SHUFFLE((char)i,(char)i,(char)i,(char)i)));
}
////////////// FP16 support ///////////////////////////
inline v_float32x4 v_load_expand(const float16_t* ptr)
{
#if CV_FP16
return v_float32x4(_mm_cvtph_ps(_mm_loadu_si128((const __m128i*)ptr)));
#else
const __m128i z = _mm_setzero_si128(), delta = _mm_set1_epi32(0x38000000);
const __m128i signmask = _mm_set1_epi32(0x80000000), maxexp = _mm_set1_epi32(0x7c000000);
const __m128 deltaf = _mm_castsi128_ps(_mm_set1_epi32(0x38800000));
@ -2968,10 +3411,15 @@ inline v_float32x4 v_load_expand(const float16_t* ptr)
__m128i zmask = _mm_cmpeq_epi32(e, z);
__m128i ft = v_select_si128(zmask, zt, t);
return v_float32x4(_mm_castsi128_ps(_mm_or_si128(ft, sign)));
#endif
}
inline void v_pack_store(float16_t* ptr, const v_float32x4& v)
{
#if CV_FP16
__m128i fp16_value = _mm_cvtps_ph(v.val, 0);
_mm_storel_epi64((__m128i*)ptr, fp16_value);
#else
const __m128i signmask = _mm_set1_epi32(0x80000000);
const __m128i rval = _mm_set1_epi32(0x3f000000);
@ -2993,20 +3441,11 @@ inline void v_pack_store(float16_t* ptr, const v_float32x4& v)
t = _mm_or_si128(t, sign);
t = _mm_packs_epi32(t, t);
_mm_storel_epi64((__m128i*)ptr, t);
#endif
}
inline void v_cleanup() {}
//! @name Check SIMD support
//! @{
//! @brief Check CPU capability of SIMD operation
static inline bool hasSIMD128()
{
return (CV_CPU_HAS_SUPPORT_SSE2) ? true : false;
}
//! @}
CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END
//! @endcond

View File

@ -158,6 +158,19 @@ inline __m128i _v128_packs_epu32(const __m128i& a, const __m128i& b)
#endif
}
template<int i>
inline int64 _v128_extract_epi64(const __m128i& a)
{
#if defined(CV__SIMD_HAVE_mm_extract_epi64) || (CV_SSE4_1 && (defined(__x86_64__)/*GCC*/ || defined(_M_X64)/*MSVC*/))
#define CV__SIMD_NATIVE_mm_extract_epi64 1
return _mm_extract_epi64(a, i);
#else
CV_DECL_ALIGNED(16) int64 tmp[2];
_mm_store_si128((__m128i*)tmp, a);
return tmp[i];
#endif
}
CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END
//! @endcond

View File

@ -28,7 +28,7 @@ struct v_uint8x16
explicit v_uint8x16(const vec_uchar16& v) : val(v)
{}
v_uint8x16() : val(vec_uchar16_z)
v_uint8x16()
{}
v_uint8x16(vec_bchar16 v) : val(vec_uchar16_c(v))
{}
@ -36,6 +36,9 @@ struct v_uint8x16
uchar v8, uchar v9, uchar v10, uchar v11, uchar v12, uchar v13, uchar v14, uchar v15)
: val(vec_uchar16_set(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15))
{}
static inline v_uint8x16 zero() { return v_uint8x16(vec_uchar16_z); }
uchar get0() const
{ return vec_extract(val, 0); }
};
@ -48,7 +51,7 @@ struct v_int8x16
explicit v_int8x16(const vec_char16& v) : val(v)
{}
v_int8x16() : val(vec_char16_z)
v_int8x16()
{}
v_int8x16(vec_bchar16 v) : val(vec_char16_c(v))
{}
@ -56,6 +59,9 @@ struct v_int8x16
schar v8, schar v9, schar v10, schar v11, schar v12, schar v13, schar v14, schar v15)
: val(vec_char16_set(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15))
{}
static inline v_int8x16 zero() { return v_int8x16(vec_char16_z); }
schar get0() const
{ return vec_extract(val, 0); }
};
@ -68,13 +74,16 @@ struct v_uint16x8
explicit v_uint16x8(const vec_ushort8& v) : val(v)
{}
v_uint16x8() : val(vec_ushort8_z)
v_uint16x8()
{}
v_uint16x8(vec_bshort8 v) : val(vec_ushort8_c(v))
{}
v_uint16x8(ushort v0, ushort v1, ushort v2, ushort v3, ushort v4, ushort v5, ushort v6, ushort v7)
: val(vec_ushort8_set(v0, v1, v2, v3, v4, v5, v6, v7))
{}
static inline v_uint16x8 zero() { return v_uint16x8(vec_ushort8_z); }
ushort get0() const
{ return vec_extract(val, 0); }
};
@ -87,13 +96,16 @@ struct v_int16x8
explicit v_int16x8(const vec_short8& v) : val(v)
{}
v_int16x8() : val(vec_short8_z)
v_int16x8()
{}
v_int16x8(vec_bshort8 v) : val(vec_short8_c(v))
{}
v_int16x8(short v0, short v1, short v2, short v3, short v4, short v5, short v6, short v7)
: val(vec_short8_set(v0, v1, v2, v3, v4, v5, v6, v7))
{}
static inline v_int16x8 zero() { return v_int16x8(vec_short8_z); }
short get0() const
{ return vec_extract(val, 0); }
};
@ -106,12 +118,15 @@ struct v_uint32x4
explicit v_uint32x4(const vec_uint4& v) : val(v)
{}
v_uint32x4() : val(vec_uint4_z)
v_uint32x4()
{}
v_uint32x4(vec_bint4 v) : val(vec_uint4_c(v))
{}
v_uint32x4(unsigned v0, unsigned v1, unsigned v2, unsigned v3) : val(vec_uint4_set(v0, v1, v2, v3))
{}
static inline v_uint32x4 zero() { return v_uint32x4(vec_uint4_z); }
uint get0() const
{ return vec_extract(val, 0); }
};
@ -124,12 +139,15 @@ struct v_int32x4
explicit v_int32x4(const vec_int4& v) : val(v)
{}
v_int32x4() : val(vec_int4_z)
v_int32x4()
{}
v_int32x4(vec_bint4 v) : val(vec_int4_c(v))
{}
v_int32x4(int v0, int v1, int v2, int v3) : val(vec_int4_set(v0, v1, v2, v3))
{}
static inline v_int32x4 zero() { return v_int32x4(vec_int4_z); }
int get0() const
{ return vec_extract(val, 0); }
};
@ -142,12 +160,15 @@ struct v_float32x4
explicit v_float32x4(const vec_float4& v) : val(v)
{}
v_float32x4() : val(vec_float4_z)
v_float32x4()
{}
v_float32x4(vec_bint4 v) : val(vec_float4_c(v))
{}
v_float32x4(float v0, float v1, float v2, float v3) : val(vec_float4_set(v0, v1, v2, v3))
{}
static inline v_float32x4 zero() { return v_float32x4(vec_float4_z); }
float get0() const
{ return vec_extract(val, 0); }
};
@ -160,12 +181,15 @@ struct v_uint64x2
explicit v_uint64x2(const vec_udword2& v) : val(v)
{}
v_uint64x2() : val(vec_udword2_z)
v_uint64x2()
{}
v_uint64x2(vec_bdword2 v) : val(vec_udword2_c(v))
{}
v_uint64x2(uint64 v0, uint64 v1) : val(vec_udword2_set(v0, v1))
{}
static inline v_uint64x2 zero() { return v_uint64x2(vec_udword2_z); }
uint64 get0() const
{ return vec_extract(val, 0); }
};
@ -178,12 +202,15 @@ struct v_int64x2
explicit v_int64x2(const vec_dword2& v) : val(v)
{}
v_int64x2() : val(vec_dword2_z)
v_int64x2()
{}
v_int64x2(vec_bdword2 v) : val(vec_dword2_c(v))
{}
v_int64x2(int64 v0, int64 v1) : val(vec_dword2_set(v0, v1))
{}
static inline v_int64x2 zero() { return v_int64x2(vec_dword2_z); }
int64 get0() const
{ return vec_extract(val, 0); }
};
@ -196,16 +223,33 @@ struct v_float64x2
explicit v_float64x2(const vec_double2& v) : val(v)
{}
v_float64x2() : val(vec_double2_z)
v_float64x2()
{}
v_float64x2(vec_bdword2 v) : val(vec_double2_c(v))
{}
v_float64x2(double v0, double v1) : val(vec_double2_set(v0, v1))
{}
static inline v_float64x2 zero() { return v_float64x2(vec_double2_z); }
double get0() const
{ return vec_extract(val, 0); }
};
#define OPENCV_HAL_IMPL_VSX_EXTRACT_N(_Tpvec, _Tp) \
template<int i> inline _Tp v_extract_n(VSX_UNUSED(_Tpvec v)) { return vec_extract(v.val, i); }
OPENCV_HAL_IMPL_VSX_EXTRACT_N(v_uint8x16, uchar)
OPENCV_HAL_IMPL_VSX_EXTRACT_N(v_int8x16, schar)
OPENCV_HAL_IMPL_VSX_EXTRACT_N(v_uint16x8, ushort)
OPENCV_HAL_IMPL_VSX_EXTRACT_N(v_int16x8, short)
OPENCV_HAL_IMPL_VSX_EXTRACT_N(v_uint32x4, uint)
OPENCV_HAL_IMPL_VSX_EXTRACT_N(v_int32x4, int)
OPENCV_HAL_IMPL_VSX_EXTRACT_N(v_uint64x2, uint64)
OPENCV_HAL_IMPL_VSX_EXTRACT_N(v_int64x2, int64)
OPENCV_HAL_IMPL_VSX_EXTRACT_N(v_float32x4, float)
OPENCV_HAL_IMPL_VSX_EXTRACT_N(v_float64x2, double)
//////////////// Load and store operations ///////////////
/*
@ -215,7 +259,7 @@ struct v_float64x2
* if vec_xxx_c defined as C++ cast, clang-5 will pass it
*/
#define OPENCV_HAL_IMPL_VSX_INITVEC(_Tpvec, _Tp, suffix, cast) \
inline _Tpvec v_setzero_##suffix() { return _Tpvec(); } \
inline _Tpvec v_setzero_##suffix() { return _Tpvec(vec_splats((_Tp)0)); } \
inline _Tpvec v_setall_##suffix(_Tp v) { return _Tpvec(vec_splats((_Tp)v));} \
template<typename _Tpvec0> inline _Tpvec v_reinterpret_as_##suffix(const _Tpvec0 &a) \
{ return _Tpvec((cast)a.val); }
@ -332,11 +376,37 @@ OPENCV_HAL_IMPL_VSX_EXPAND(v_int16x8, v_int32x4, short, vec_unpackl, vec_unpackh
OPENCV_HAL_IMPL_VSX_EXPAND(v_uint32x4, v_uint64x2, uint, vec_unpacklu, vec_unpackhu)
OPENCV_HAL_IMPL_VSX_EXPAND(v_int32x4, v_int64x2, int, vec_unpackl, vec_unpackh)
/* Load and zero-extend a 4-byte value into the second dword; the first dword is don't-care. */
#if !defined(CV_COMPILER_VSX_BROKEN_ASM)
#define _LXSIWZX(out, ptr, T) __asm__ ("lxsiwzx %x0, 0, %1\r\n" : "=wa"(out) : "r" (ptr) : "memory");
#else
/* This is compiler-agnostic, but will introduce an unneeded splat on the critical path. */
#define _LXSIWZX(out, ptr, T) out = (T)vec_udword2_sp(*(uint32_t*)(ptr));
#endif
inline v_uint32x4 v_load_expand_q(const uchar* ptr)
{ return v_uint32x4(vec_uint4_set(ptr[0], ptr[1], ptr[2], ptr[3])); }
{
// Zero-extend the extra 24B instead of unpacking; usually faster in small kernels.
// Likewise note, the value is zero-extended and the upper 4 bytes are zeroed.
vec_uchar16 pmu = {8, 12, 12, 12, 9, 12, 12, 12, 10, 12, 12, 12, 11, 12, 12, 12};
vec_uchar16 out;
_LXSIWZX(out, ptr, vec_uchar16);
out = vec_perm(out, out, pmu);
return v_uint32x4((vec_uint4)out);
}
inline v_int32x4 v_load_expand_q(const schar* ptr)
{ return v_int32x4(vec_int4_set(ptr[0], ptr[1], ptr[2], ptr[3])); }
{
vec_char16 out;
vec_short8 outs;
vec_int4 outw;
_LXSIWZX(out, ptr, vec_char16);
outs = vec_unpackl(out);
outw = vec_unpackh(outs);
return v_int32x4(outw);
}
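// Illustrative scalar reference for the two overloads above (hypothetical helpers,
// not part of this header): whatever load/permute sequence is used, the result is
// simply four consecutive bytes widened to four 32-bit lanes, as in the previous
// straightforward vec_uint4_set / vec_int4_set implementation.
static inline void load_expand_q_ref_u(const uchar* ptr, unsigned out[4])
{
    for (int i = 0; i < 4; ++i)
        out[i] = ptr[i];              // zero-extend each byte
}
static inline void load_expand_q_ref_s(const schar* ptr, int out[4])
{
    for (int i = 0; i < 4; ++i)
        out[i] = ptr[i];              // sign-extend each byte
}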
/* pack */
#define OPENCV_HAL_IMPL_VSX_PACK(_Tpvec, _Tp, _Tpwvec, _Tpvn, _Tpdel, sfnc, pkfnc, addfnc, pack) \
@ -499,12 +569,6 @@ inline void v_mul_expand(const Tvec& a, const Tvec& b, Twvec& c, Twvec& d)
v_zip(p0, p1, c, d);
}
inline void v_mul_expand(const v_uint32x4& a, const v_uint32x4& b, v_uint64x2& c, v_uint64x2& d)
{
c.val = vec_mul(vec_unpackhu(a.val), vec_unpackhu(b.val));
d.val = vec_mul(vec_unpacklu(a.val), vec_unpacklu(b.val));
}
inline v_int16x8 v_mul_hi(const v_int16x8& a, const v_int16x8& b)
{
vec_int4 p0 = vec_mule(a.val, b.val);
@ -626,7 +690,7 @@ inline _Tpvec v_rotate_##suffix(const _Tpvec& a)
{ \
const int wd = imm * sizeof(typename _Tpvec::lane_type); \
if (wd > 15) \
return _Tpvec(); \
return _Tpvec::zero(); \
return _Tpvec((cast)shf(vec_uchar16_c(a.val), vec_uchar16_sp(wd << 3))); \
}
@ -684,6 +748,53 @@ OPENCV_IMPL_VSX_ROTATE_64_2RG_LR(v_float64x2)
OPENCV_IMPL_VSX_ROTATE_64_2RG_LR(v_uint64x2)
OPENCV_IMPL_VSX_ROTATE_64_2RG_LR(v_int64x2)
/* Reverse */
inline v_uint8x16 v_reverse(const v_uint8x16 &a)
{
static const vec_uchar16 perm = {15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0};
vec_uchar16 vec = (vec_uchar16)a.val;
return v_uint8x16(vec_perm(vec, vec, perm));
}
inline v_int8x16 v_reverse(const v_int8x16 &a)
{ return v_reinterpret_as_s8(v_reverse(v_reinterpret_as_u8(a))); }
inline v_uint16x8 v_reverse(const v_uint16x8 &a)
{
static const vec_uchar16 perm = {14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1};
vec_uchar16 vec = (vec_uchar16)a.val;
return v_reinterpret_as_u16(v_uint8x16(vec_perm(vec, vec, perm)));
}
inline v_int16x8 v_reverse(const v_int16x8 &a)
{ return v_reinterpret_as_s16(v_reverse(v_reinterpret_as_u16(a))); }
inline v_uint32x4 v_reverse(const v_uint32x4 &a)
{
static const vec_uchar16 perm = {12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3};
vec_uchar16 vec = (vec_uchar16)a.val;
return v_reinterpret_as_u32(v_uint8x16(vec_perm(vec, vec, perm)));
}
inline v_int32x4 v_reverse(const v_int32x4 &a)
{ return v_reinterpret_as_s32(v_reverse(v_reinterpret_as_u32(a))); }
inline v_float32x4 v_reverse(const v_float32x4 &a)
{ return v_reinterpret_as_f32(v_reverse(v_reinterpret_as_u32(a))); }
inline v_uint64x2 v_reverse(const v_uint64x2 &a)
{
static const vec_uchar16 perm = {8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7};
vec_uchar16 vec = (vec_uchar16)a.val;
return v_reinterpret_as_u64(v_uint8x16(vec_perm(vec, vec, perm)));
}
inline v_int64x2 v_reverse(const v_int64x2 &a)
{ return v_reinterpret_as_s64(v_reverse(v_reinterpret_as_u64(a))); }
inline v_float64x2 v_reverse(const v_float64x2 &a)
{ return v_reinterpret_as_f64(v_reverse(v_reinterpret_as_u64(a))); }
/* Extract */
template<int s, typename _Tpvec>
inline _Tpvec v_extract(const _Tpvec& a, const _Tpvec& b)
@ -692,15 +803,27 @@ inline _Tpvec v_extract(const _Tpvec& a, const _Tpvec& b)
////////// Reduce and mask /////////
/** Reduce **/
inline short v_reduce_sum(const v_int16x8& a)
inline uint v_reduce_sum(const v_uint8x16& a)
{
const vec_uint4 zero4 = vec_uint4_z;
vec_uint4 sum4 = vec_sum4s(a.val, zero4);
return (uint)vec_extract(vec_sums(vec_int4_c(sum4), vec_int4_c(zero4)), 3);
}
inline int v_reduce_sum(const v_int8x16& a)
{
const vec_int4 zero4 = vec_int4_z;
vec_int4 sum4 = vec_sum4s(a.val, zero4);
return (int)vec_extract(vec_sums(sum4, zero4), 3);
}
inline int v_reduce_sum(const v_int16x8& a)
{
const vec_int4 zero = vec_int4_z;
return saturate_cast<short>(vec_extract(vec_sums(vec_sum4s(a.val, zero), zero), 3));
return saturate_cast<int>(vec_extract(vec_sums(vec_sum4s(a.val, zero), zero), 3));
}
inline ushort v_reduce_sum(const v_uint16x8& a)
inline uint v_reduce_sum(const v_uint16x8& a)
{
const vec_int4 v4 = vec_int4_c(vec_unpackhu(vec_adds(a.val, vec_sld(a.val, a.val, 8))));
return saturate_cast<ushort>(vec_extract(vec_sums(v4, vec_int4_z), 3));
return saturate_cast<uint>(vec_extract(vec_sums(v4, vec_int4_z), 3));
}
#define OPENCV_HAL_IMPL_VSX_REDUCE_OP_4(_Tpvec, _Tpvec2, scalartype, suffix, func) \
@ -719,6 +842,14 @@ OPENCV_HAL_IMPL_VSX_REDUCE_OP_4(v_float32x4, vec_float4, float, sum, vec_add)
OPENCV_HAL_IMPL_VSX_REDUCE_OP_4(v_float32x4, vec_float4, float, max, vec_max)
OPENCV_HAL_IMPL_VSX_REDUCE_OP_4(v_float32x4, vec_float4, float, min, vec_min)
inline uint64 v_reduce_sum(const v_uint64x2& a)
{
return vec_extract(vec_add(a.val, vec_permi(a.val, a.val, 3)), 0);
}
inline int64 v_reduce_sum(const v_int64x2& a)
{
return vec_extract(vec_add(a.val, vec_permi(a.val, a.val, 3)), 0);
}
inline double v_reduce_sum(const v_float64x2& a)
{
return vec_extract(vec_add(a.val, vec_permi(a.val, a.val, 3)), 0);
@ -736,6 +867,19 @@ OPENCV_HAL_IMPL_VSX_REDUCE_OP_8(v_uint16x8, vec_ushort8, ushort, min, vec_min)
OPENCV_HAL_IMPL_VSX_REDUCE_OP_8(v_int16x8, vec_short8, short, max, vec_max)
OPENCV_HAL_IMPL_VSX_REDUCE_OP_8(v_int16x8, vec_short8, short, min, vec_min)
#define OPENCV_HAL_IMPL_VSX_REDUCE_OP_16(_Tpvec, _Tpvec2, scalartype, suffix, func) \
inline scalartype v_reduce_##suffix(const _Tpvec& a) \
{ \
_Tpvec2 rs = func(a.val, vec_sld(a.val, a.val, 8)); \
rs = func(rs, vec_sld(rs, rs, 4)); \
rs = func(rs, vec_sld(rs, rs, 2)); \
return vec_extract(func(rs, vec_sld(rs, rs, 1)), 0); \
}
OPENCV_HAL_IMPL_VSX_REDUCE_OP_16(v_uint8x16, vec_uchar16, uchar, max, vec_max)
OPENCV_HAL_IMPL_VSX_REDUCE_OP_16(v_uint8x16, vec_uchar16, uchar, min, vec_min)
OPENCV_HAL_IMPL_VSX_REDUCE_OP_16(v_int8x16, vec_char16, schar, max, vec_max)
OPENCV_HAL_IMPL_VSX_REDUCE_OP_16(v_int8x16, vec_char16, schar, min, vec_min)
inline v_float32x4 v_reduce_sum4(const v_float32x4& a, const v_float32x4& b,
const v_float32x4& c, const v_float32x4& d)
{
@ -763,7 +907,7 @@ inline unsigned v_reduce_sad(const v_int8x16& a, const v_int8x16& b)
inline unsigned v_reduce_sad(const v_uint16x8& a, const v_uint16x8& b)
{
vec_ushort8 ad = vec_absd(a.val, b.val);
VSX_UNUSED(vec_int4) sum = vec_sums(vec_int4_c(vec_unpackhu(ad)), vec_int4_c(vec_unpacklu(ad)));
VSX_UNUSED(vec_int4) sum = vec_sums(vec_int4_c(vec_unpackhu(ad)) + vec_int4_c(vec_unpacklu(ad)), vec_int4_z);
return (unsigned)vec_extract(sum, 3);
}
inline unsigned v_reduce_sad(const v_int16x8& a, const v_int16x8& b)
@ -792,43 +936,44 @@ inline float v_reduce_sad(const v_float32x4& a, const v_float32x4& b)
}
/** Popcount **/
template<typename _Tpvec>
inline v_uint32x4 v_popcount(const _Tpvec& a)
{ return v_uint32x4(vec_popcntu(vec_uint4_c(a.val))); }
inline v_uint8x16 v_popcount(const v_uint8x16& a)
{ return v_uint8x16(vec_popcntu(a.val)); }
inline v_uint8x16 v_popcount(const v_int8x16& a)
{ return v_uint8x16(vec_popcntu(a.val)); }
inline v_uint16x8 v_popcount(const v_uint16x8& a)
{ return v_uint16x8(vec_popcntu(a.val)); }
inline v_uint16x8 v_popcount(const v_int16x8& a)
{ return v_uint16x8(vec_popcntu(a.val)); }
inline v_uint32x4 v_popcount(const v_uint32x4& a)
{ return v_uint32x4(vec_popcntu(a.val)); }
inline v_uint32x4 v_popcount(const v_int32x4& a)
{ return v_uint32x4(vec_popcntu(a.val)); }
inline v_uint64x2 v_popcount(const v_uint64x2& a)
{ return v_uint64x2(vec_popcntu(a.val)); }
inline v_uint64x2 v_popcount(const v_int64x2& a)
{ return v_uint64x2(vec_popcntu(a.val)); }
/** Mask **/
inline int v_signmask(const v_uint8x16& a)
{
vec_uchar16 sv = vec_sr(a.val, vec_uchar16_sp(7));
static const vec_uchar16 slm = {0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7};
sv = vec_sl(sv, slm);
vec_uint4 sv4 = vec_sum4s(sv, vec_uint4_z);
static const vec_uint4 slm4 = {0, 0, 8, 8};
sv4 = vec_sl(sv4, slm4);
return vec_extract(vec_sums((vec_int4) sv4, vec_int4_z), 3);
static const vec_uchar16 qperm = {120, 112, 104, 96, 88, 80, 72, 64, 56, 48, 40, 32, 24, 16, 8, 0};
return vec_extract((vec_int4)vec_vbpermq(v_reinterpret_as_u8(a).val, qperm), 2);
}
inline int v_signmask(const v_int8x16& a)
{ return v_signmask(v_reinterpret_as_u8(a)); }
inline int v_signmask(const v_int16x8& a)
{
static const vec_ushort8 slm = {0, 1, 2, 3, 4, 5, 6, 7};
vec_short8 sv = vec_sr(a.val, vec_ushort8_sp(15));
sv = vec_sl(sv, slm);
vec_int4 svi = vec_int4_z;
svi = vec_sums(vec_sum4s(sv, svi), svi);
return vec_extract(svi, 3);
static const vec_uchar16 qperm = {112, 96, 80, 64, 48, 32, 16, 0, 128, 128, 128, 128, 128, 128, 128, 128};
return vec_extract((vec_int4)vec_vbpermq(v_reinterpret_as_u8(a).val, qperm), 2);
}
inline int v_signmask(const v_uint16x8& a)
{ return v_signmask(v_reinterpret_as_s16(a)); }
inline int v_signmask(const v_int32x4& a)
{
static const vec_uint4 slm = {0, 1, 2, 3};
vec_int4 sv = vec_sr(a.val, vec_uint4_sp(31));
sv = vec_sl(sv, slm);
sv = vec_sums(sv, vec_int4_z);
return vec_extract(sv, 3);
static const vec_uchar16 qperm = {96, 64, 32, 0, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128};
return vec_extract((vec_int4)vec_vbpermq(v_reinterpret_as_u8(a).val, qperm), 2);
}
inline int v_signmask(const v_uint32x4& a)
{ return v_signmask(v_reinterpret_as_s32(a)); }
@ -845,15 +990,28 @@ inline int v_signmask(const v_uint64x2& a)
inline int v_signmask(const v_float64x2& a)
{ return v_signmask(v_reinterpret_as_s64(a)); }
inline int v_scan_forward(const v_int8x16& a) { return trailingZeros32(v_signmask(a)); }
inline int v_scan_forward(const v_uint8x16& a) { return trailingZeros32(v_signmask(a)); }
inline int v_scan_forward(const v_int16x8& a) { return trailingZeros32(v_signmask(a)); }
inline int v_scan_forward(const v_uint16x8& a) { return trailingZeros32(v_signmask(a)); }
inline int v_scan_forward(const v_int32x4& a) { return trailingZeros32(v_signmask(a)); }
inline int v_scan_forward(const v_uint32x4& a) { return trailingZeros32(v_signmask(a)); }
inline int v_scan_forward(const v_float32x4& a) { return trailingZeros32(v_signmask(a)); }
inline int v_scan_forward(const v_int64x2& a) { return trailingZeros32(v_signmask(a)); }
inline int v_scan_forward(const v_uint64x2& a) { return trailingZeros32(v_signmask(a)); }
inline int v_scan_forward(const v_float64x2& a) { return trailingZeros32(v_signmask(a)); }
template<typename _Tpvec>
inline bool v_check_all(const _Tpvec& a)
{ return vec_all_lt(a.val, _Tpvec().val); }
{ return vec_all_lt(a.val, _Tpvec::zero().val); }
inline bool v_check_all(const v_uint8x16& a)
{ return v_check_all(v_reinterpret_as_s8(a)); }
inline bool v_check_all(const v_uint16x8& a)
{ return v_check_all(v_reinterpret_as_s16(a)); }
inline bool v_check_all(const v_uint32x4& a)
{ return v_check_all(v_reinterpret_as_s32(a)); }
inline bool v_check_all(const v_uint64x2& a)
{ return v_check_all(v_reinterpret_as_s64(a)); }
inline bool v_check_all(const v_float32x4& a)
{ return v_check_all(v_reinterpret_as_s32(a)); }
inline bool v_check_all(const v_float64x2& a)
@ -861,13 +1019,15 @@ inline bool v_check_all(const v_float64x2& a)
template<typename _Tpvec>
inline bool v_check_any(const _Tpvec& a)
{ return vec_any_lt(a.val, _Tpvec().val); }
{ return vec_any_lt(a.val, _Tpvec::zero().val); }
inline bool v_check_any(const v_uint8x16& a)
{ return v_check_any(v_reinterpret_as_s8(a)); }
inline bool v_check_any(const v_uint16x8& a)
{ return v_check_any(v_reinterpret_as_s16(a)); }
inline bool v_check_any(const v_uint32x4& a)
{ return v_check_any(v_reinterpret_as_s32(a)); }
inline bool v_check_any(const v_uint64x2& a)
{ return v_check_any(v_reinterpret_as_s64(a)); }
inline bool v_check_any(const v_float32x4& a)
{ return v_check_any(v_reinterpret_as_s32(a)); }
inline bool v_check_any(const v_float64x2& a)
@ -994,6 +1154,9 @@ inline v_float64x2 v_cvt_f64(const v_float32x4& a)
inline v_float64x2 v_cvt_f64_high(const v_float32x4& a)
{ return v_float64x2(vec_cvfo(vec_mergel(a.val, a.val))); }
inline v_float64x2 v_cvt_f64(const v_int64x2& a)
{ return v_float64x2(vec_ctd(a.val)); }
////////////// Lookup table access ////////////////////
inline v_int8x16 v_lut(const schar* tab, const int* idx)
@ -1205,7 +1368,7 @@ inline v_float32x4 v_load_expand(const float16_t* ptr)
return v_float32x4(vec_extract_fp_from_shorth(vf16));
#elif CV_VSX3 && !defined(CV_COMPILER_VSX_BROKEN_ASM)
vec_float4 vf32;
__asm__ __volatile__ ("xvcvhpsp %x0,%x1" : "=wf" (vf32) : "wa" (vec_mergeh(vf16, vf16)));
__asm__ __volatile__ ("xvcvhpsp %x0,%x1" : "=wa" (vf32) : "wa" (vec_mergeh(vf16, vf16)));
return v_float32x4(vf32);
#else
const vec_int4 z = vec_int4_z, delta = vec_int4_sp(0x38000000);
@ -1227,10 +1390,10 @@ inline v_float32x4 v_load_expand(const float16_t* ptr)
inline void v_pack_store(float16_t* ptr, const v_float32x4& v)
{
// fixme: Is there any buitin op or intrinsic that cover "xvcvsphp"?
// fixme: Is there any builtin op or intrinsic that covers "xvcvsphp"?
#if CV_VSX3 && !defined(CV_COMPILER_VSX_BROKEN_ASM)
vec_ushort8 vf16;
__asm__ __volatile__ ("xvcvsphp %x0,%x1" : "=wa" (vf16) : "wf" (v.val));
__asm__ __volatile__ ("xvcvsphp %x0,%x1" : "=wa" (vf16) : "wa" (v.val));
vec_st_l8(vec_mergesqe(vf16, vf16), ptr);
#else
const vec_int4 signmask = vec_int4_sp(0x80000000);
@ -1264,12 +1427,134 @@ inline void v_cleanup() {}
////////// Matrix operations /////////
//////// Dot Product ////////
// 16 >> 32
inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b)
{ return v_int32x4(vec_msum(a.val, b.val, vec_int4_z)); }
inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c)
{ return v_int32x4(vec_msum(a.val, b.val, c.val)); }
// 32 >> 64
inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b)
{
vec_dword2 even = vec_mule(a.val, b.val);
vec_dword2 odd = vec_mulo(a.val, b.val);
return v_int64x2(vec_add(even, odd));
}
inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b, const v_int64x2& c)
{ return v_dotprod(a, b) + c; }
// 8 >> 32
inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b, const v_uint32x4& c)
{ return v_uint32x4(vec_msum(a.val, b.val, c.val)); }
inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b)
{ return v_uint32x4(vec_msum(a.val, b.val, vec_uint4_z)); }
inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b)
{
const vec_ushort8 eight = vec_ushort8_sp(8);
vec_short8 a0 = vec_sra((vec_short8)vec_sld(a.val, a.val, 1), eight); // even
vec_short8 a1 = vec_sra((vec_short8)a.val, eight); // odd
vec_short8 b0 = vec_sra((vec_short8)vec_sld(b.val, b.val, 1), eight);
vec_short8 b1 = vec_sra((vec_short8)b.val, eight);
return v_int32x4(vec_msum(a0, b0, vec_msum(a1, b1, vec_int4_z)));
}
inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b, const v_int32x4& c)
{
const vec_ushort8 eight = vec_ushort8_sp(8);
vec_short8 a0 = vec_sra((vec_short8)vec_sld(a.val, a.val, 1), eight); // even
vec_short8 a1 = vec_sra((vec_short8)a.val, eight); // odd
vec_short8 b0 = vec_sra((vec_short8)vec_sld(b.val, b.val, 1), eight);
vec_short8 b1 = vec_sra((vec_short8)b.val, eight);
return v_int32x4(vec_msum(a0, b0, vec_msum(a1, b1, c.val)));
}
// 16 >> 64
inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b)
{
const vec_uint4 zero = vec_uint4_z;
vec_uint4 even = vec_mule(a.val, b.val);
vec_uint4 odd = vec_mulo(a.val, b.val);
vec_udword2 e0 = (vec_udword2)vec_mergee(even, zero);
vec_udword2 e1 = (vec_udword2)vec_mergeo(even, zero);
vec_udword2 o0 = (vec_udword2)vec_mergee(odd, zero);
vec_udword2 o1 = (vec_udword2)vec_mergeo(odd, zero);
vec_udword2 s0 = vec_add(e0, o0);
vec_udword2 s1 = vec_add(e1, o1);
return v_uint64x2(vec_add(s0, s1));
}
inline v_uint64x2 v_dotprod_expand(const v_uint16x8& a, const v_uint16x8& b, const v_uint64x2& c)
{ return v_dotprod_expand(a, b) + c; }
inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b)
{
v_int32x4 prod = v_dotprod(a, b);
v_int64x2 c, d;
v_expand(prod, c, d);
return v_int64x2(vec_add(vec_mergeh(c.val, d.val), vec_mergel(c.val, d.val)));
}
inline v_int64x2 v_dotprod_expand(const v_int16x8& a, const v_int16x8& b, const v_int64x2& c)
{ return v_dotprod_expand(a, b) + c; }
// 32 >> 64f
inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b)
{ return v_cvt_f64(v_dotprod(a, b)); }
inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b, const v_float64x2& c)
{ return v_dotprod_expand(a, b) + c; }
//////// Fast Dot Product ////////
// 16 >> 32
inline v_int32x4 v_dotprod_fast(const v_int16x8& a, const v_int16x8& b)
{ return v_dotprod(a, b); }
inline v_int32x4 v_dotprod_fast(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c)
{ return v_int32x4(vec_msum(a.val, b.val, vec_int4_z)) + c; }
// 32 >> 64
inline v_int64x2 v_dotprod_fast(const v_int32x4& a, const v_int32x4& b)
{ return v_dotprod(a, b); }
inline v_int64x2 v_dotprod_fast(const v_int32x4& a, const v_int32x4& b, const v_int64x2& c)
{ return v_dotprod(a, b, c); }
// 8 >> 32
inline v_uint32x4 v_dotprod_expand_fast(const v_uint8x16& a, const v_uint8x16& b)
{ return v_dotprod_expand(a, b); }
inline v_uint32x4 v_dotprod_expand_fast(const v_uint8x16& a, const v_uint8x16& b, const v_uint32x4& c)
{ return v_uint32x4(vec_msum(a.val, b.val, vec_uint4_z)) + c; }
inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b)
{
vec_short8 a0 = vec_unpackh(a.val);
vec_short8 a1 = vec_unpackl(a.val);
vec_short8 b0 = vec_unpackh(b.val);
vec_short8 b1 = vec_unpackl(b.val);
return v_int32x4(vec_msum(a0, b0, vec_msum(a1, b1, vec_int4_z)));
}
inline v_int32x4 v_dotprod_expand_fast(const v_int8x16& a, const v_int8x16& b, const v_int32x4& c)
{ return v_dotprod_expand_fast(a, b) + c; }
// 16 >> 64
inline v_uint64x2 v_dotprod_expand_fast(const v_uint16x8& a, const v_uint16x8& b)
{ return v_dotprod_expand(a, b); }
inline v_uint64x2 v_dotprod_expand_fast(const v_uint16x8& a, const v_uint16x8& b, const v_uint64x2& c)
{ return v_dotprod_expand(a, b, c); }
inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b)
{
v_int32x4 prod = v_dotprod(a, b);
v_int64x2 c, d;
v_expand(prod, c, d);
return c + d;
}
inline v_int64x2 v_dotprod_expand_fast(const v_int16x8& a, const v_int16x8& b, const v_int64x2& c)
{ return v_dotprod_expand_fast(a, b) + c; }
// 32 >> 64f
inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b)
{ return v_dotprod_expand(a, b); }
inline v_float64x2 v_dotprod_expand_fast(const v_int32x4& a, const v_int32x4& b, const v_float64x2& c)
{ return v_dotprod_expand(a, b, c); }
inline v_float32x4 v_matmul(const v_float32x4& v, const v_float32x4& m0,
const v_float32x4& m1, const v_float32x4& m2,
const v_float32x4& m3)
@ -1309,15 +1594,10 @@ OPENCV_HAL_IMPL_VSX_TRANSPOSE4x4(v_uint32x4, vec_uint4)
OPENCV_HAL_IMPL_VSX_TRANSPOSE4x4(v_int32x4, vec_int4)
OPENCV_HAL_IMPL_VSX_TRANSPOSE4x4(v_float32x4, vec_float4)
//! @name Check SIMD support
//! @{
//! @brief Check CPU capability of SIMD operation
static inline bool hasSIMD128()
{
return (CV_CPU_HAS_SUPPORT_VSX) ? true : false;
}
template<int i, typename Tvec>
inline Tvec v_broadcast_element(const Tvec& v)
{ return Tvec(vec_splat(v.val, i)); }
//! @}
CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END

File diff suppressed because it is too large

File diff suppressed because it is too large

View File

@ -0,0 +1,146 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html
// This header is not standalone. Don't include directly, use "intrin.hpp" instead.
#ifdef OPENCV_HAL_INTRIN_HPP // defined in intrin.hpp
#if CV_SIMD128 || CV_SIMD128_CPP
template<typename _T> struct Type2Vec128_Traits;
#define CV_INTRIN_DEF_TYPE2VEC128_TRAITS(type_, vec_type_) \
template<> struct Type2Vec128_Traits<type_> \
{ \
typedef vec_type_ vec_type; \
}
CV_INTRIN_DEF_TYPE2VEC128_TRAITS(uchar, v_uint8x16);
CV_INTRIN_DEF_TYPE2VEC128_TRAITS(schar, v_int8x16);
CV_INTRIN_DEF_TYPE2VEC128_TRAITS(ushort, v_uint16x8);
CV_INTRIN_DEF_TYPE2VEC128_TRAITS(short, v_int16x8);
CV_INTRIN_DEF_TYPE2VEC128_TRAITS(unsigned, v_uint32x4);
CV_INTRIN_DEF_TYPE2VEC128_TRAITS(int, v_int32x4);
CV_INTRIN_DEF_TYPE2VEC128_TRAITS(float, v_float32x4);
CV_INTRIN_DEF_TYPE2VEC128_TRAITS(uint64, v_uint64x2);
CV_INTRIN_DEF_TYPE2VEC128_TRAITS(int64, v_int64x2);
#if CV_SIMD128_64F
CV_INTRIN_DEF_TYPE2VEC128_TRAITS(double, v_float64x2);
#endif
template<typename _T> static inline
typename Type2Vec128_Traits<_T>::vec_type v_setall(const _T& a);
template<> inline Type2Vec128_Traits< uchar>::vec_type v_setall< uchar>(const uchar& a) { return v_setall_u8(a); }
template<> inline Type2Vec128_Traits< schar>::vec_type v_setall< schar>(const schar& a) { return v_setall_s8(a); }
template<> inline Type2Vec128_Traits<ushort>::vec_type v_setall<ushort>(const ushort& a) { return v_setall_u16(a); }
template<> inline Type2Vec128_Traits< short>::vec_type v_setall< short>(const short& a) { return v_setall_s16(a); }
template<> inline Type2Vec128_Traits< uint>::vec_type v_setall< uint>(const uint& a) { return v_setall_u32(a); }
template<> inline Type2Vec128_Traits< int>::vec_type v_setall< int>(const int& a) { return v_setall_s32(a); }
template<> inline Type2Vec128_Traits<uint64>::vec_type v_setall<uint64>(const uint64& a) { return v_setall_u64(a); }
template<> inline Type2Vec128_Traits< int64>::vec_type v_setall< int64>(const int64& a) { return v_setall_s64(a); }
template<> inline Type2Vec128_Traits< float>::vec_type v_setall< float>(const float& a) { return v_setall_f32(a); }
#if CV_SIMD128_64F
template<> inline Type2Vec128_Traits<double>::vec_type v_setall<double>(const double& a) { return v_setall_f64(a); }
#endif
#endif // SIMD128
#if CV_SIMD256
template<typename _T> struct Type2Vec256_Traits;
#define CV_INTRIN_DEF_TYPE2VEC256_TRAITS(type_, vec_type_) \
template<> struct Type2Vec256_Traits<type_> \
{ \
typedef vec_type_ vec_type; \
}
CV_INTRIN_DEF_TYPE2VEC256_TRAITS(uchar, v_uint8x32);
CV_INTRIN_DEF_TYPE2VEC256_TRAITS(schar, v_int8x32);
CV_INTRIN_DEF_TYPE2VEC256_TRAITS(ushort, v_uint16x16);
CV_INTRIN_DEF_TYPE2VEC256_TRAITS(short, v_int16x16);
CV_INTRIN_DEF_TYPE2VEC256_TRAITS(unsigned, v_uint32x8);
CV_INTRIN_DEF_TYPE2VEC256_TRAITS(int, v_int32x8);
CV_INTRIN_DEF_TYPE2VEC256_TRAITS(float, v_float32x8);
CV_INTRIN_DEF_TYPE2VEC256_TRAITS(uint64, v_uint64x4);
CV_INTRIN_DEF_TYPE2VEC256_TRAITS(int64, v_int64x4);
#if CV_SIMD256_64F
CV_INTRIN_DEF_TYPE2VEC256_TRAITS(double, v_float64x4);
#endif
template<typename _T> static inline
typename Type2Vec256_Traits<_T>::vec_type v256_setall(const _T& a);
template<> inline Type2Vec256_Traits< uchar>::vec_type v256_setall< uchar>(const uchar& a) { return v256_setall_u8(a); }
template<> inline Type2Vec256_Traits< schar>::vec_type v256_setall< schar>(const schar& a) { return v256_setall_s8(a); }
template<> inline Type2Vec256_Traits<ushort>::vec_type v256_setall<ushort>(const ushort& a) { return v256_setall_u16(a); }
template<> inline Type2Vec256_Traits< short>::vec_type v256_setall< short>(const short& a) { return v256_setall_s16(a); }
template<> inline Type2Vec256_Traits< uint>::vec_type v256_setall< uint>(const uint& a) { return v256_setall_u32(a); }
template<> inline Type2Vec256_Traits< int>::vec_type v256_setall< int>(const int& a) { return v256_setall_s32(a); }
template<> inline Type2Vec256_Traits<uint64>::vec_type v256_setall<uint64>(const uint64& a) { return v256_setall_u64(a); }
template<> inline Type2Vec256_Traits< int64>::vec_type v256_setall< int64>(const int64& a) { return v256_setall_s64(a); }
template<> inline Type2Vec256_Traits< float>::vec_type v256_setall< float>(const float& a) { return v256_setall_f32(a); }
#if CV_SIMD256_64F
template<> inline Type2Vec256_Traits<double>::vec_type v256_setall<double>(const double& a) { return v256_setall_f64(a); }
#endif
#endif // SIMD256
#if CV_SIMD512
template<typename _T> struct Type2Vec512_Traits;
#define CV_INTRIN_DEF_TYPE2VEC512_TRAITS(type_, vec_type_) \
template<> struct Type2Vec512_Traits<type_> \
{ \
typedef vec_type_ vec_type; \
}
CV_INTRIN_DEF_TYPE2VEC512_TRAITS(uchar, v_uint8x64);
CV_INTRIN_DEF_TYPE2VEC512_TRAITS(schar, v_int8x64);
CV_INTRIN_DEF_TYPE2VEC512_TRAITS(ushort, v_uint16x32);
CV_INTRIN_DEF_TYPE2VEC512_TRAITS(short, v_int16x32);
CV_INTRIN_DEF_TYPE2VEC512_TRAITS(unsigned, v_uint32x16);
CV_INTRIN_DEF_TYPE2VEC512_TRAITS(int, v_int32x16);
CV_INTRIN_DEF_TYPE2VEC512_TRAITS(float, v_float32x16);
CV_INTRIN_DEF_TYPE2VEC512_TRAITS(uint64, v_uint64x8);
CV_INTRIN_DEF_TYPE2VEC512_TRAITS(int64, v_int64x8);
#if CV_SIMD512_64F
CV_INTRIN_DEF_TYPE2VEC512_TRAITS(double, v_float64x8);
#endif
template<typename _T> static inline
typename Type2Vec512_Traits<_T>::vec_type v512_setall(const _T& a);
template<> inline Type2Vec512_Traits< uchar>::vec_type v512_setall< uchar>(const uchar& a) { return v512_setall_u8(a); }
template<> inline Type2Vec512_Traits< schar>::vec_type v512_setall< schar>(const schar& a) { return v512_setall_s8(a); }
template<> inline Type2Vec512_Traits<ushort>::vec_type v512_setall<ushort>(const ushort& a) { return v512_setall_u16(a); }
template<> inline Type2Vec512_Traits< short>::vec_type v512_setall< short>(const short& a) { return v512_setall_s16(a); }
template<> inline Type2Vec512_Traits< uint>::vec_type v512_setall< uint>(const uint& a) { return v512_setall_u32(a); }
template<> inline Type2Vec512_Traits< int>::vec_type v512_setall< int>(const int& a) { return v512_setall_s32(a); }
template<> inline Type2Vec512_Traits<uint64>::vec_type v512_setall<uint64>(const uint64& a) { return v512_setall_u64(a); }
template<> inline Type2Vec512_Traits< int64>::vec_type v512_setall< int64>(const int64& a) { return v512_setall_s64(a); }
template<> inline Type2Vec512_Traits< float>::vec_type v512_setall< float>(const float& a) { return v512_setall_f32(a); }
#if CV_SIMD512_64F
template<> inline Type2Vec512_Traits<double>::vec_type v512_setall<double>(const double& a) { return v512_setall_f64(a); }
#endif
#endif // SIMD512
#if CV_SIMD_WIDTH == 16
template<typename _T> static inline
typename Type2Vec128_Traits<_T>::vec_type vx_setall(const _T& a) { return v_setall(a); }
#elif CV_SIMD_WIDTH == 32
template<typename _T> static inline
typename Type2Vec256_Traits<_T>::vec_type vx_setall(const _T& a) { return v256_setall(a); }
#elif CV_SIMD_WIDTH == 64
template<typename _T> static inline
typename Type2Vec512_Traits<_T>::vec_type vx_setall(const _T& a) { return v512_setall(a); }
#else
#error "Build configuration error, unsupported CV_SIMD_WIDTH"
#endif
#endif // OPENCV_HAL_INTRIN_HPP
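A minimal usage sketch of the width-agnostic dispatch above, assuming an OpenCV translation unit where the universal-intrinsic typedefs (v_float32, v_int32) from "intrin.hpp" are in scope:
// vx_setall() resolves to v_setall / v256_setall / v512_setall depending on
// CV_SIMD_WIDTH, so the same source builds for 128-, 256- and 512-bit SIMD.
float scale = 2.5f;
v_float32 vscale = vx_setall(scale);   // every lane holds 2.5f
v_int32   vfill  = vx_setall(7);       // every lane holds 7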


@ -151,7 +151,7 @@ number of components (vectors/matrices) of the outer vector.
In general, type support is limited to cv::Mat types. Other types are forbidden.
But in some cases we need to support passing of custom non-general Mat types, like arrays of cv::KeyPoint, cv::DMatch, etc.
This data is not intented to be interpreted as an image data, or processed somehow like regular cv::Mat.
This data is not intended to be interpreted as an image data, or processed somehow like regular cv::Mat.
To pass such custom type use rawIn() / rawOut() / rawInOut() wrappers.
Custom type is wrapped as Mat-compatible `CV_8UC<N>` values (N = sizeof(T), N <= CV_CN_MAX).
*/
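A hedged sketch of the raw wrappers described above; consumeMatches() is a hypothetical function taking cv::InputArray, and the std::vector overload of rawIn() is assumed:
// Pass custom element types (cv::DMatch here) without image-style interpretation;
// the data is wrapped as CV_8UC<sizeof(cv::DMatch)>.
std::vector<cv::DMatch> matches;
consumeMatches(cv::_InputArray::rawIn(matches));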
@ -170,7 +170,9 @@ public:
STD_VECTOR = 3 << KIND_SHIFT,
STD_VECTOR_VECTOR = 4 << KIND_SHIFT,
STD_VECTOR_MAT = 5 << KIND_SHIFT,
EXPR = 6 << KIND_SHIFT,
#if OPENCV_ABI_COMPATIBILITY < 500
EXPR = 6 << KIND_SHIFT, //!< removed: https://github.com/opencv/opencv/pull/17046
#endif
OPENGL_BUFFER = 7 << KIND_SHIFT,
CUDA_HOST_MEM = 8 << KIND_SHIFT,
CUDA_GPU_MAT = 9 << KIND_SHIFT,
@ -178,7 +180,9 @@ public:
STD_VECTOR_UMAT =11 << KIND_SHIFT,
STD_BOOL_VECTOR =12 << KIND_SHIFT,
STD_VECTOR_CUDA_GPU_MAT = 13 << KIND_SHIFT,
STD_ARRAY =14 << KIND_SHIFT,
#if OPENCV_ABI_COMPATIBILITY < 500
STD_ARRAY =14 << KIND_SHIFT, //!< removed: https://github.com/opencv/opencv/issues/18897
#endif
STD_ARRAY_MAT =15 << KIND_SHIFT
};
@ -377,6 +381,9 @@ public:
void assign(const std::vector<UMat>& v) const;
void assign(const std::vector<Mat>& v) const;
void move(UMat& u) const;
void move(Mat& m) const;
};
@ -576,24 +583,24 @@ struct CV_EXPORTS UMatData
struct CV_EXPORTS MatSize
{
explicit MatSize(int* _p);
int dims() const;
explicit MatSize(int* _p) CV_NOEXCEPT;
int dims() const CV_NOEXCEPT;
Size operator()() const;
const int& operator[](int i) const;
int& operator[](int i);
operator const int*() const; // TODO OpenCV 4.0: drop this
bool operator == (const MatSize& sz) const;
bool operator != (const MatSize& sz) const;
operator const int*() const CV_NOEXCEPT; // TODO OpenCV 4.0: drop this
bool operator == (const MatSize& sz) const CV_NOEXCEPT;
bool operator != (const MatSize& sz) const CV_NOEXCEPT;
int* p;
};
struct CV_EXPORTS MatStep
{
MatStep();
explicit MatStep(size_t s);
const size_t& operator[](int i) const;
size_t& operator[](int i);
MatStep() CV_NOEXCEPT;
explicit MatStep(size_t s) CV_NOEXCEPT;
const size_t& operator[](int i) const CV_NOEXCEPT;
size_t& operator[](int i) CV_NOEXCEPT;
operator size_t() const;
MatStep& operator = (size_t s);
@ -699,11 +706,16 @@ sub-matrices.
-# Process "foreign" data using OpenCV (for example, when you implement a DirectShow\* filter or
a processing module for gstreamer, and so on). For example:
@code
void process_video_frame(const unsigned char* pixels,
Mat process_video_frame(const unsigned char* pixels,
int width, int height, int step)
{
Mat img(height, width, CV_8UC3, pixels, step);
GaussianBlur(img, img, Size(7,7), 1.5, 1.5);
// wrap input buffer
Mat img(height, width, CV_8UC3, (unsigned char*)pixels, step);
Mat result;
GaussianBlur(img, result, Size(7, 7), 1.5, 1.5);
return result;
}
@endcode
-# Quickly initialize small matrices and/or get a super-fast element access.
@ -807,7 +819,7 @@ public:
The constructed matrix can further be assigned to another matrix or matrix expression or can be
allocated with Mat::create . In the former case, the old content is de-referenced.
*/
Mat();
Mat() CV_NOEXCEPT;
/** @overload
@param rows Number of rows in a 2D array.
@ -2208,7 +2220,7 @@ public:
typedef MatConstIterator_<_Tp> const_iterator;
//! default constructor
Mat_();
Mat_() CV_NOEXCEPT;
//! equivalent to Mat(_rows, _cols, DataType<_Tp>::type)
Mat_(int _rows, int _cols);
//! constructor that sets each matrix element to specified value
@ -2408,12 +2420,12 @@ class CV_EXPORTS UMat
{
public:
//! default constructor
UMat(UMatUsageFlags usageFlags = USAGE_DEFAULT);
UMat(UMatUsageFlags usageFlags = USAGE_DEFAULT) CV_NOEXCEPT;
//! constructs 2D matrix of the specified size and type
// (_type is CV_8UC1, CV_64FC3, CV_32SC(12) etc.)
UMat(int rows, int cols, int type, UMatUsageFlags usageFlags = USAGE_DEFAULT);
UMat(Size size, int type, UMatUsageFlags usageFlags = USAGE_DEFAULT);
//! constucts 2D matrix and fills it with the specified value _s.
//! constructs 2D matrix and fills it with the specified value _s.
UMat(int rows, int cols, int type, const Scalar& s, UMatUsageFlags usageFlags = USAGE_DEFAULT);
UMat(Size size, int type, const Scalar& s, UMatUsageFlags usageFlags = USAGE_DEFAULT);
@ -2429,20 +2441,11 @@ public:
UMat(const UMat& m, const Rect& roi);
UMat(const UMat& m, const Range* ranges);
UMat(const UMat& m, const std::vector<Range>& ranges);
// FIXIT copyData=false is not implemented, drop this in favor of cv::Mat (OpenCV 5.0)
//! builds matrix from std::vector with or without copying the data
template<typename _Tp> explicit UMat(const std::vector<_Tp>& vec, bool copyData=false);
//! builds matrix from cv::Vec; the data is copied by default
template<typename _Tp, int n> explicit UMat(const Vec<_Tp, n>& vec, bool copyData=true);
//! builds matrix from cv::Matx; the data is copied by default
template<typename _Tp, int m, int n> explicit UMat(const Matx<_Tp, m, n>& mtx, bool copyData=true);
//! builds matrix from a 2D point
template<typename _Tp> explicit UMat(const Point_<_Tp>& pt, bool copyData=true);
//! builds matrix from a 3D point
template<typename _Tp> explicit UMat(const Point3_<_Tp>& pt, bool copyData=true);
//! builds matrix from comma initializer
template<typename _Tp> explicit UMat(const MatCommaInitializer_<_Tp>& commaInitializer);
//! destructor - calls release()
~UMat();
//! assignment operators
@ -2860,7 +2863,7 @@ public:
`ref<_Tp>(i0,...[,hashval])` is equivalent to `*(_Tp*)ptr(i0,...,true[,hashval])`.
The methods always return a valid reference.
If the element did not exist, it is created and initialiazed with 0.
If the element did not exist, it is created and initialized with 0.
*/
//! returns reference to the specified element (1D case)
template<typename _Tp> _Tp& ref(int i0, size_t* hashval=0);
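A brief usage sketch of ref() as documented above; the element is created and zero-initialized on first access:
int sizes[] = { 16, 16 };
cv::SparseMat counts(2, sizes, CV_32F);
counts.ref<float>(3, 7) += 1.f;   // creates element (3,7) if absent, then increments it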
@ -3577,6 +3580,8 @@ public:
Mat cross(const Mat& m) const;
double dot(const Mat& m) const;
void swap(MatExpr& b);
const MatOp* op;
int flags;


@ -54,6 +54,21 @@
#pragma warning( disable: 4127 )
#endif
#if defined(CV_SKIP_DISABLE_CLANG_ENUM_WARNINGS)
// nothing
#elif defined(CV_FORCE_DISABLE_CLANG_ENUM_WARNINGS)
#define CV_DISABLE_CLANG_ENUM_WARNINGS
#elif defined(__clang__) && defined(__has_warning)
#if __has_warning("-Wdeprecated-enum-enum-conversion") && __has_warning("-Wdeprecated-anon-enum-enum-conversion")
#define CV_DISABLE_CLANG_ENUM_WARNINGS
#endif
#endif
#ifdef CV_DISABLE_CLANG_ENUM_WARNINGS
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wdeprecated-enum-enum-conversion"
#pragma clang diagnostic ignored "-Wdeprecated-anon-enum-enum-conversion"
#endif
namespace cv
{
CV__DEBUG_NS_BEGIN
@ -97,7 +112,7 @@ _InputArray::_InputArray(const std::vector<_Tp>& vec)
#ifdef CV_CXX_STD_ARRAY
template<typename _Tp, std::size_t _Nm> inline
_InputArray::_InputArray(const std::array<_Tp, _Nm>& arr)
{ init(FIXED_TYPE + FIXED_SIZE + STD_ARRAY + traits::Type<_Tp>::value + ACCESS_READ, arr.data(), Size(1, _Nm)); }
{ init(FIXED_TYPE + FIXED_SIZE + MATX + traits::Type<_Tp>::value + ACCESS_READ, arr.data(), Size(1, _Nm)); }
template<std::size_t _Nm> inline
_InputArray::_InputArray(const std::array<Mat, _Nm>& arr)
@ -135,9 +150,6 @@ _InputArray::_InputArray(const Mat_<_Tp>& m)
inline _InputArray::_InputArray(const double& val)
{ init(FIXED_TYPE + FIXED_SIZE + MATX + CV_64F + ACCESS_READ, &val, Size(1,1)); }
inline _InputArray::_InputArray(const MatExpr& expr)
{ init(FIXED_TYPE + FIXED_SIZE + EXPR + ACCESS_READ, &expr); }
inline _InputArray::_InputArray(const cuda::GpuMat& d_mat)
{ init(CUDA_GPU_MAT + ACCESS_READ, &d_mat); }
@ -164,7 +176,7 @@ template<typename _Tp, std::size_t _Nm> inline
_InputArray _InputArray::rawIn(const std::array<_Tp, _Nm>& arr)
{
_InputArray v;
v.flags = FIXED_TYPE + FIXED_SIZE + STD_ARRAY + traits::Type<_Tp>::value + ACCESS_READ;
v.flags = FIXED_TYPE + FIXED_SIZE + MATX + traits::Type<_Tp>::value + ACCESS_READ;
v.obj = (void*)arr.data();
v.sz = Size(1, _Nm);
return v;
@ -187,7 +199,7 @@ inline bool _InputArray::isUMatVector() const { return kind() == _InputArray::S
inline bool _InputArray::isMatx() const { return kind() == _InputArray::MATX; }
inline bool _InputArray::isVector() const { return kind() == _InputArray::STD_VECTOR ||
kind() == _InputArray::STD_BOOL_VECTOR ||
kind() == _InputArray::STD_ARRAY; }
(kind() == _InputArray::MATX && (sz.width <= 1 || sz.height <= 1)); }
inline bool _InputArray::isGpuMat() const { return kind() == _InputArray::CUDA_GPU_MAT; }
inline bool _InputArray::isGpuMatVector() const { return kind() == _InputArray::STD_VECTOR_CUDA_GPU_MAT; }
@ -207,7 +219,7 @@ _OutputArray::_OutputArray(std::vector<_Tp>& vec)
#ifdef CV_CXX_STD_ARRAY
template<typename _Tp, std::size_t _Nm> inline
_OutputArray::_OutputArray(std::array<_Tp, _Nm>& arr)
{ init(FIXED_TYPE + FIXED_SIZE + STD_ARRAY + traits::Type<_Tp>::value + ACCESS_WRITE, arr.data(), Size(1, _Nm)); }
{ init(FIXED_TYPE + FIXED_SIZE + MATX + traits::Type<_Tp>::value + ACCESS_WRITE, arr.data(), Size(1, _Nm)); }
template<std::size_t _Nm> inline
_OutputArray::_OutputArray(std::array<Mat, _Nm>& arr)
@ -249,7 +261,7 @@ _OutputArray::_OutputArray(const std::vector<_Tp>& vec)
#ifdef CV_CXX_STD_ARRAY
template<typename _Tp, std::size_t _Nm> inline
_OutputArray::_OutputArray(const std::array<_Tp, _Nm>& arr)
{ init(FIXED_TYPE + FIXED_SIZE + STD_ARRAY + traits::Type<_Tp>::value + ACCESS_WRITE, arr.data(), Size(1, _Nm)); }
{ init(FIXED_TYPE + FIXED_SIZE + MATX + traits::Type<_Tp>::value + ACCESS_WRITE, arr.data(), Size(1, _Nm)); }
template<std::size_t _Nm> inline
_OutputArray::_OutputArray(const std::array<Mat, _Nm>& arr)
@ -324,7 +336,7 @@ template<typename _Tp, std::size_t _Nm> inline
_OutputArray _OutputArray::rawOut(std::array<_Tp, _Nm>& arr)
{
_OutputArray v;
v.flags = FIXED_TYPE + FIXED_SIZE + STD_ARRAY + traits::Type<_Tp>::value + ACCESS_WRITE;
v.flags = FIXED_TYPE + FIXED_SIZE + MATX + traits::Type<_Tp>::value + ACCESS_WRITE;
v.obj = (void*)arr.data();
v.sz = Size(1, _Nm);
return v;
@ -347,7 +359,7 @@ _InputOutputArray::_InputOutputArray(std::vector<_Tp>& vec)
#ifdef CV_CXX_STD_ARRAY
template<typename _Tp, std::size_t _Nm> inline
_InputOutputArray::_InputOutputArray(std::array<_Tp, _Nm>& arr)
{ init(FIXED_TYPE + FIXED_SIZE + STD_ARRAY + traits::Type<_Tp>::value + ACCESS_RW, arr.data(), Size(1, _Nm)); }
{ init(FIXED_TYPE + FIXED_SIZE + MATX + traits::Type<_Tp>::value + ACCESS_RW, arr.data(), Size(1, _Nm)); }
template<std::size_t _Nm> inline
_InputOutputArray::_InputOutputArray(std::array<Mat, _Nm>& arr)
@ -384,7 +396,7 @@ _InputOutputArray::_InputOutputArray(const std::vector<_Tp>& vec)
#ifdef CV_CXX_STD_ARRAY
template<typename _Tp, std::size_t _Nm> inline
_InputOutputArray::_InputOutputArray(const std::array<_Tp, _Nm>& arr)
{ init(FIXED_TYPE + FIXED_SIZE + STD_ARRAY + traits::Type<_Tp>::value + ACCESS_RW, arr.data(), Size(1, _Nm)); }
{ init(FIXED_TYPE + FIXED_SIZE + MATX + traits::Type<_Tp>::value + ACCESS_RW, arr.data(), Size(1, _Nm)); }
template<std::size_t _Nm> inline
_InputOutputArray::_InputOutputArray(const std::array<Mat, _Nm>& arr)
@ -461,7 +473,7 @@ template<typename _Tp, std::size_t _Nm> inline
_InputOutputArray _InputOutputArray::rawInOut(std::array<_Tp, _Nm>& arr)
{
_InputOutputArray v;
v.flags = FIXED_TYPE + FIXED_SIZE + STD_ARRAY + traits::Type<_Tp>::value + ACCESS_RW;
v.flags = FIXED_TYPE + FIXED_SIZE + MATX + traits::Type<_Tp>::value + ACCESS_RW;
v.obj = (void*)arr.data();
v.sz = Size(1, _Nm);
return v;
@ -477,158 +489,6 @@ CV__DEBUG_NS_END
//////////////////////////////////////////// Mat //////////////////////////////////////////
inline
Mat::Mat()
: flags(MAGIC_VAL), dims(0), rows(0), cols(0), data(0), datastart(0), dataend(0),
datalimit(0), allocator(0), u(0), size(&rows), step(0)
{}
inline
Mat::Mat(int _rows, int _cols, int _type)
: flags(MAGIC_VAL), dims(0), rows(0), cols(0), data(0), datastart(0), dataend(0),
datalimit(0), allocator(0), u(0), size(&rows), step(0)
{
create(_rows, _cols, _type);
}
inline
Mat::Mat(int _rows, int _cols, int _type, const Scalar& _s)
: flags(MAGIC_VAL), dims(0), rows(0), cols(0), data(0), datastart(0), dataend(0),
datalimit(0), allocator(0), u(0), size(&rows), step(0)
{
create(_rows, _cols, _type);
*this = _s;
}
inline
Mat::Mat(Size _sz, int _type)
: flags(MAGIC_VAL), dims(0), rows(0), cols(0), data(0), datastart(0), dataend(0),
datalimit(0), allocator(0), u(0), size(&rows), step(0)
{
create( _sz.height, _sz.width, _type );
}
inline
Mat::Mat(Size _sz, int _type, const Scalar& _s)
: flags(MAGIC_VAL), dims(0), rows(0), cols(0), data(0), datastart(0), dataend(0),
datalimit(0), allocator(0), u(0), size(&rows), step(0)
{
create(_sz.height, _sz.width, _type);
*this = _s;
}
inline
Mat::Mat(int _dims, const int* _sz, int _type)
: flags(MAGIC_VAL), dims(0), rows(0), cols(0), data(0), datastart(0), dataend(0),
datalimit(0), allocator(0), u(0), size(&rows), step(0)
{
create(_dims, _sz, _type);
}
inline
Mat::Mat(int _dims, const int* _sz, int _type, const Scalar& _s)
: flags(MAGIC_VAL), dims(0), rows(0), cols(0), data(0), datastart(0), dataend(0),
datalimit(0), allocator(0), u(0), size(&rows), step(0)
{
create(_dims, _sz, _type);
*this = _s;
}
inline
Mat::Mat(const std::vector<int>& _sz, int _type)
: flags(MAGIC_VAL), dims(0), rows(0), cols(0), data(0), datastart(0), dataend(0),
datalimit(0), allocator(0), u(0), size(&rows), step(0)
{
create(_sz, _type);
}
inline
Mat::Mat(const std::vector<int>& _sz, int _type, const Scalar& _s)
: flags(MAGIC_VAL), dims(0), rows(0), cols(0), data(0), datastart(0), dataend(0),
datalimit(0), allocator(0), u(0), size(&rows), step(0)
{
create(_sz, _type);
*this = _s;
}
inline
Mat::Mat(const Mat& m)
: flags(m.flags), dims(m.dims), rows(m.rows), cols(m.cols), data(m.data),
datastart(m.datastart), dataend(m.dataend), datalimit(m.datalimit), allocator(m.allocator),
u(m.u), size(&rows), step(0)
{
if( u )
CV_XADD(&u->refcount, 1);
if( m.dims <= 2 )
{
step[0] = m.step[0]; step[1] = m.step[1];
}
else
{
dims = 0;
copySize(m);
}
}
inline
Mat::Mat(int _rows, int _cols, int _type, void* _data, size_t _step)
: flags(MAGIC_VAL + (_type & TYPE_MASK)), dims(2), rows(_rows), cols(_cols),
data((uchar*)_data), datastart((uchar*)_data), dataend(0), datalimit(0),
allocator(0), u(0), size(&rows)
{
CV_Assert(total() == 0 || data != NULL);
size_t esz = CV_ELEM_SIZE(_type), esz1 = CV_ELEM_SIZE1(_type);
size_t minstep = cols * esz;
if( _step == AUTO_STEP )
{
_step = minstep;
}
else
{
CV_DbgAssert( _step >= minstep );
if (_step % esz1 != 0)
{
CV_Error(Error::BadStep, "Step must be a multiple of esz1");
}
}
step[0] = _step;
step[1] = esz;
datalimit = datastart + _step * rows;
dataend = datalimit - _step + minstep;
updateContinuityFlag();
}
inline
Mat::Mat(Size _sz, int _type, void* _data, size_t _step)
: flags(MAGIC_VAL + (_type & TYPE_MASK)), dims(2), rows(_sz.height), cols(_sz.width),
data((uchar*)_data), datastart((uchar*)_data), dataend(0), datalimit(0),
allocator(0), u(0), size(&rows)
{
CV_Assert(total() == 0 || data != NULL);
size_t esz = CV_ELEM_SIZE(_type), esz1 = CV_ELEM_SIZE1(_type);
size_t minstep = cols*esz;
if( _step == AUTO_STEP )
{
_step = minstep;
}
else
{
CV_DbgAssert( _step >= minstep );
if (_step % esz1 != 0)
{
CV_Error(Error::BadStep, "Step must be a multiple of esz1");
}
}
step[0] = _step;
step[1] = esz;
datalimit = datastart + _step*rows;
dataend = datalimit - _step + minstep;
updateContinuityFlag();
}
template<typename _Tp> inline
Mat::Mat(const std::vector<_Tp>& vec, bool copyData)
: flags(MAGIC_VAL | traits::Type<_Tp>::value | CV_MAT_CONT_FLAG), dims(2), rows((int)vec.size()),
@ -766,43 +626,6 @@ Mat::Mat(const MatCommaInitializer_<_Tp>& commaInitializer)
*this = commaInitializer.operator Mat_<_Tp>();
}
inline
Mat::~Mat()
{
release();
if( step.p != step.buf )
fastFree(step.p);
}
inline
Mat& Mat::operator = (const Mat& m)
{
if( this != &m )
{
if( m.u )
CV_XADD(&m.u->refcount, 1);
release();
flags = m.flags;
if( dims <= 2 && m.dims <= 2 )
{
dims = m.dims;
rows = m.rows;
cols = m.cols;
step[0] = m.step[0];
step[1] = m.step[1];
}
else
copySize(m);
data = m.data;
datastart = m.datastart;
dataend = m.dataend;
datalimit = m.datalimit;
allocator = m.allocator;
u = m.u;
}
return *this;
}
inline
Mat Mat::row(int y) const
{
@ -839,67 +662,6 @@ Mat Mat::colRange(const Range& r) const
return Mat(*this, Range::all(), r);
}
inline
Mat Mat::clone() const
{
Mat m;
copyTo(m);
return m;
}
inline
void Mat::assignTo( Mat& m, int _type ) const
{
if( _type < 0 )
m = *this;
else
convertTo(m, _type);
}
inline
void Mat::create(int _rows, int _cols, int _type)
{
_type &= TYPE_MASK;
if( dims <= 2 && rows == _rows && cols == _cols && type() == _type && data )
return;
int sz[] = {_rows, _cols};
create(2, sz, _type);
}
inline
void Mat::create(Size _sz, int _type)
{
create(_sz.height, _sz.width, _type);
}
inline
void Mat::addref()
{
if( u )
CV_XADD(&u->refcount, 1);
}
inline
void Mat::release()
{
if( u && CV_XADD(&u->refcount, -1) == 1 )
deallocate();
u = NULL;
datastart = dataend = datalimit = data = 0;
for(int i = 0; i < dims; i++)
size.p[i] = 0;
#ifdef _DEBUG
flags = MAGIC_VAL;
dims = rows = cols = 0;
if(step.p != step.buf)
{
fastFree(step.p);
step.p = step.buf;
size.p = &rows;
}
#endif
}
inline
Mat Mat::operator()( Range _rowRange, Range _colRange ) const
{
@ -968,40 +730,6 @@ int Mat::channels() const
return CV_MAT_CN(flags);
}
inline
size_t Mat::step1(int i) const
{
return step.p[i] / elemSize1();
}
inline
bool Mat::empty() const
{
return data == 0 || total() == 0 || dims == 0;
}
inline
size_t Mat::total() const
{
if( dims <= 2 )
return (size_t)rows * cols;
size_t p = 1;
for( int i = 0; i < dims; i++ )
p *= size[i];
return p;
}
inline
size_t Mat::total(int startDim, int endDim) const
{
CV_Assert( 0 <= startDim && startDim <= endDim);
size_t p = 1;
int endDim_ = endDim <= dims ? endDim : dims;
for( int i = startDim; i < endDim_; i++ )
p *= size[i];
return p;
}
inline
uchar* Mat::ptr(int y)
{
@ -1289,6 +1017,8 @@ const _Tp& Mat::at(const Vec<int, n>& idx) const
template<typename _Tp> inline
MatConstIterator_<_Tp> Mat::begin() const
{
if (empty())
return MatConstIterator_<_Tp>();
CV_DbgAssert( elemSize() == sizeof(_Tp) );
return MatConstIterator_<_Tp>((const Mat_<_Tp>*)this);
}
@ -1296,6 +1026,8 @@ MatConstIterator_<_Tp> Mat::begin() const
template<typename _Tp> inline
MatConstIterator_<_Tp> Mat::end() const
{
if (empty())
return MatConstIterator_<_Tp>();
CV_DbgAssert( elemSize() == sizeof(_Tp) );
MatConstIterator_<_Tp> it((const Mat_<_Tp>*)this);
it += total();
@ -1305,6 +1037,8 @@ MatConstIterator_<_Tp> Mat::end() const
template<typename _Tp> inline
MatIterator_<_Tp> Mat::begin()
{
if (empty())
return MatIterator_<_Tp>();
CV_DbgAssert( elemSize() == sizeof(_Tp) );
return MatIterator_<_Tp>((Mat_<_Tp>*)this);
}
@ -1312,6 +1046,8 @@ MatIterator_<_Tp> Mat::begin()
template<typename _Tp> inline
MatIterator_<_Tp> Mat::end()
{
if (empty())
return MatIterator_<_Tp>();
CV_DbgAssert( elemSize() == sizeof(_Tp) );
MatIterator_<_Tp> it((Mat_<_Tp>*)this);
it += total();
@ -1482,11 +1218,11 @@ Mat& Mat::operator = (Mat&& m)
///////////////////////////// MatSize ////////////////////////////
inline
MatSize::MatSize(int* _p)
MatSize::MatSize(int* _p) CV_NOEXCEPT
: p(_p) {}
inline
int MatSize::dims() const
int MatSize::dims() const CV_NOEXCEPT
{
return (p - 1)[0];
}
@ -1519,29 +1255,13 @@ int& MatSize::operator[](int i)
}
inline
MatSize::operator const int*() const
MatSize::operator const int*() const CV_NOEXCEPT
{
return p;
}
inline
bool MatSize::operator == (const MatSize& sz) const
{
int d = dims();
int dsz = sz.dims();
if( d != dsz )
return false;
if( d == 2 )
return p[0] == sz.p[0] && p[1] == sz.p[1];
for( int i = 0; i < d; i++ )
if( p[i] != sz.p[i] )
return false;
return true;
}
inline
bool MatSize::operator != (const MatSize& sz) const
bool MatSize::operator != (const MatSize& sz) const CV_NOEXCEPT
{
return !(*this == sz);
}
@ -1551,25 +1271,25 @@ bool MatSize::operator != (const MatSize& sz) const
///////////////////////////// MatStep ////////////////////////////
inline
MatStep::MatStep()
MatStep::MatStep() CV_NOEXCEPT
{
p = buf; p[0] = p[1] = 0;
}
inline
MatStep::MatStep(size_t s)
MatStep::MatStep(size_t s) CV_NOEXCEPT
{
p = buf; p[0] = s; p[1] = 0;
}
inline
const size_t& MatStep::operator[](int i) const
const size_t& MatStep::operator[](int i) const CV_NOEXCEPT
{
return p[i];
}
inline
size_t& MatStep::operator[](int i)
size_t& MatStep::operator[](int i) CV_NOEXCEPT
{
return p[i];
}
@ -1592,7 +1312,7 @@ inline MatStep& MatStep::operator = (size_t s)
////////////////////////////// Mat_<_Tp> ////////////////////////////
template<typename _Tp> inline
Mat_<_Tp>::Mat_()
Mat_<_Tp>::Mat_() CV_NOEXCEPT
: Mat()
{
flags = (flags & ~CV_MAT_TYPE_MASK) | traits::Type<_Tp>::value;
@ -1744,6 +1464,11 @@ Mat_<_Tp>::Mat_(const std::array<_Tp, _Nm>& arr, bool copyData)
template<typename _Tp> inline
Mat_<_Tp>& Mat_<_Tp>::operator = (const Mat& m)
{
if (m.empty())
{
release();
return *this;
}
if( traits::Type<_Tp>::value == m.type() )
{
Mat::operator = (m);
@ -1795,9 +1520,7 @@ template<typename _Tp> inline
void Mat_<_Tp>::release()
{
Mat::release();
#ifdef _DEBUG
flags = (flags & ~CV_MAT_TYPE_MASK) | traits::Type<_Tp>::value;
#endif
}
template<typename _Tp> inline
@ -1809,7 +1532,7 @@ Mat_<_Tp> Mat_<_Tp>::cross(const Mat_& m) const
template<typename _Tp> template<typename T2> inline
Mat_<_Tp>::operator Mat_<T2>() const
{
return Mat_<T2>(*this);
return Mat_<T2>(static_cast<const Mat&>(*this));
}
template<typename _Tp> inline
@ -2103,7 +1826,7 @@ void Mat_<_Tp>::forEach(const Functor& operation) const {
template<typename _Tp> inline
Mat_<_Tp>::Mat_(Mat_&& m)
: Mat(m)
: Mat(std::move(m))
{
}
@ -2119,12 +1842,17 @@ Mat_<_Tp>::Mat_(Mat&& m)
: Mat()
{
flags = (flags & ~CV_MAT_TYPE_MASK) | traits::Type<_Tp>::value;
*this = m;
*this = std::move(m);
}
template<typename _Tp> inline
Mat_<_Tp>& Mat_<_Tp>::operator = (Mat&& m)
{
if (m.empty())
{
release();
return *this;
}
if( traits::Type<_Tp>::value == m.type() )
{
Mat::operator = ((Mat&&)m);
@ -2152,51 +1880,6 @@ Mat_<_Tp>::Mat_(MatExpr&& e)
///////////////////////////// SparseMat /////////////////////////////
inline
SparseMat::SparseMat()
: flags(MAGIC_VAL), hdr(0)
{}
inline
SparseMat::SparseMat(int _dims, const int* _sizes, int _type)
: flags(MAGIC_VAL), hdr(0)
{
create(_dims, _sizes, _type);
}
inline
SparseMat::SparseMat(const SparseMat& m)
: flags(m.flags), hdr(m.hdr)
{
addref();
}
inline
SparseMat::~SparseMat()
{
release();
}
inline
SparseMat& SparseMat::operator = (const SparseMat& m)
{
if( this != &m )
{
if( m.hdr )
CV_XADD(&m.hdr->refcount, 1);
release();
flags = m.flags;
hdr = m.hdr;
}
return *this;
}
inline
SparseMat& SparseMat::operator = (const Mat& m)
{
return (*this = SparseMat(m));
}
inline
SparseMat SparseMat::clone() const
{
@ -2205,30 +1888,6 @@ SparseMat SparseMat::clone() const
return temp;
}
inline
void SparseMat::assignTo( SparseMat& m, int _type ) const
{
if( _type < 0 )
m = *this;
else
convertTo(m, _type);
}
inline
void SparseMat::addref()
{
if( hdr )
CV_XADD(&hdr->refcount, 1);
}
inline
void SparseMat::release()
{
if( hdr && CV_XADD(&hdr->refcount, -1) == 1 )
delete hdr;
hdr = 0;
}
inline
size_t SparseMat::elemSize() const
{
@ -2288,36 +1947,6 @@ size_t SparseMat::nzcount() const
return hdr ? hdr->nodeCount : 0;
}
inline
size_t SparseMat::hash(int i0) const
{
return (size_t)i0;
}
inline
size_t SparseMat::hash(int i0, int i1) const
{
return (size_t)(unsigned)i0 * HASH_SCALE + (unsigned)i1;
}
inline
size_t SparseMat::hash(int i0, int i1, int i2) const
{
return ((size_t)(unsigned)i0 * HASH_SCALE + (unsigned)i1) * HASH_SCALE + (unsigned)i2;
}
inline
size_t SparseMat::hash(const int* idx) const
{
size_t h = (unsigned)idx[0];
if( !hdr )
return 0;
int d = hdr->dims;
for(int i = 1; i < d; i++ )
h = h * HASH_SCALE + (unsigned)idx[i];
return h;
}
template<typename _Tp> inline
_Tp& SparseMat::ref(int i0, size_t* hashval)
{
@ -2665,6 +2294,7 @@ MatConstIterator::MatConstIterator(const Mat* _m)
{
if( m && m->isContinuous() )
{
CV_Assert(!m->empty());
sliceStart = m->ptr();
sliceEnd = sliceStart + m->total()*elemSize;
}
@ -2678,6 +2308,7 @@ MatConstIterator::MatConstIterator(const Mat* _m, int _row, int _col)
CV_Assert(m && m->dims <= 2);
if( m->isContinuous() )
{
CV_Assert(!m->empty());
sliceStart = m->ptr();
sliceEnd = sliceStart + m->total()*elemSize;
}
@ -2692,6 +2323,7 @@ MatConstIterator::MatConstIterator(const Mat* _m, Point _pt)
CV_Assert(m && m->dims <= 2);
if( m->isContinuous() )
{
CV_Assert(!m->empty());
sliceStart = m->ptr();
sliceEnd = sliceStart + m->total()*elemSize;
}
@ -3634,74 +3266,6 @@ const Mat_<_Tp>& operator /= (const Mat_<_Tp>& a, const MatExpr& b)
//////////////////////////////// UMat ////////////////////////////////
inline
UMat::UMat(UMatUsageFlags _usageFlags)
: flags(MAGIC_VAL), dims(0), rows(0), cols(0), allocator(0), usageFlags(_usageFlags), u(0), offset(0), size(&rows)
{}
inline
UMat::UMat(int _rows, int _cols, int _type, UMatUsageFlags _usageFlags)
: flags(MAGIC_VAL), dims(0), rows(0), cols(0), allocator(0), usageFlags(_usageFlags), u(0), offset(0), size(&rows)
{
create(_rows, _cols, _type);
}
inline
UMat::UMat(int _rows, int _cols, int _type, const Scalar& _s, UMatUsageFlags _usageFlags)
: flags(MAGIC_VAL), dims(0), rows(0), cols(0), allocator(0), usageFlags(_usageFlags), u(0), offset(0), size(&rows)
{
create(_rows, _cols, _type);
*this = _s;
}
inline
UMat::UMat(Size _sz, int _type, UMatUsageFlags _usageFlags)
: flags(MAGIC_VAL), dims(0), rows(0), cols(0), allocator(0), usageFlags(_usageFlags), u(0), offset(0), size(&rows)
{
create( _sz.height, _sz.width, _type );
}
inline
UMat::UMat(Size _sz, int _type, const Scalar& _s, UMatUsageFlags _usageFlags)
: flags(MAGIC_VAL), dims(0), rows(0), cols(0), allocator(0), usageFlags(_usageFlags), u(0), offset(0), size(&rows)
{
create(_sz.height, _sz.width, _type);
*this = _s;
}
inline
UMat::UMat(int _dims, const int* _sz, int _type, UMatUsageFlags _usageFlags)
: flags(MAGIC_VAL), dims(0), rows(0), cols(0), allocator(0), usageFlags(_usageFlags), u(0), offset(0), size(&rows)
{
create(_dims, _sz, _type);
}
inline
UMat::UMat(int _dims, const int* _sz, int _type, const Scalar& _s, UMatUsageFlags _usageFlags)
: flags(MAGIC_VAL), dims(0), rows(0), cols(0), allocator(0), usageFlags(_usageFlags), u(0), offset(0), size(&rows)
{
create(_dims, _sz, _type);
*this = _s;
}
inline
UMat::UMat(const UMat& m)
: flags(m.flags), dims(m.dims), rows(m.rows), cols(m.cols), allocator(m.allocator),
usageFlags(m.usageFlags), u(m.u), offset(m.offset), size(&rows)
{
addref();
if( m.dims <= 2 )
{
step[0] = m.step[0]; step[1] = m.step[1];
}
else
{
dims = 0;
copySize(m);
}
}
template<typename _Tp> inline
UMat::UMat(const std::vector<_Tp>& vec, bool copyData)
: flags(MAGIC_VAL | traits::Type<_Tp>::value | CV_MAT_CONT_FLAG), dims(2), rows((int)vec.size()),
@ -3718,33 +3282,6 @@ cols(1), allocator(0), usageFlags(USAGE_DEFAULT), u(0), offset(0), size(&rows)
Mat((int)vec.size(), 1, traits::Type<_Tp>::value, (uchar*)&vec[0]).copyTo(*this);
}
inline
UMat& UMat::operator = (const UMat& m)
{
if( this != &m )
{
const_cast<UMat&>(m).addref();
release();
flags = m.flags;
if( dims <= 2 && m.dims <= 2 )
{
dims = m.dims;
rows = m.rows;
cols = m.cols;
step[0] = m.step[0];
step[1] = m.step[1];
}
else
copySize(m);
allocator = m.allocator;
if (usageFlags == USAGE_DEFAULT)
usageFlags = m.usageFlags;
u = m.u;
offset = m.offset;
}
return *this;
}
inline
UMat UMat::row(int y) const
{
@ -3781,55 +3318,6 @@ UMat UMat::colRange(const Range& r) const
return UMat(*this, Range::all(), r);
}
inline
UMat UMat::clone() const
{
UMat m;
copyTo(m);
return m;
}
inline
void UMat::assignTo( UMat& m, int _type ) const
{
if( _type < 0 )
m = *this;
else
convertTo(m, _type);
}
inline
void UMat::create(int _rows, int _cols, int _type, UMatUsageFlags _usageFlags)
{
_type &= TYPE_MASK;
if( dims <= 2 && rows == _rows && cols == _cols && type() == _type && u )
return;
int sz[] = {_rows, _cols};
create(2, sz, _type, _usageFlags);
}
inline
void UMat::create(Size _sz, int _type, UMatUsageFlags _usageFlags)
{
create(_sz.height, _sz.width, _type, _usageFlags);
}
inline
void UMat::addref()
{
if( u )
CV_XADD(&(u->urefcount), 1);
}
inline void UMat::release()
{
if( u && CV_XADD(&(u->urefcount), -1) == 1 )
deallocate();
for(int i = 0; i < dims; i++)
size.p[i] = 0;
u = 0;
}
inline
UMat UMat::operator()( Range _rowRange, Range _colRange ) const
{
@ -3904,23 +3392,6 @@ size_t UMat::step1(int i) const
return step.p[i] / elemSize1();
}
inline
bool UMat::empty() const
{
return u == 0 || total() == 0 || dims == 0;
}
inline
size_t UMat::total() const
{
if( dims <= 2 )
return (size_t)rows * cols;
size_t p = 1;
for( int i = 0; i < dims; i++ )
p *= size[i];
return p;
}
#ifdef CV_CXX_MOVE_SEMANTICS
inline
@ -4018,10 +3489,18 @@ inline void UMatData::markDeviceCopyObsolete(bool flag)
//! @endcond
static inline
void swap(MatExpr& a, MatExpr& b) { a.swap(b); }
} //cv
#ifdef _MSC_VER
#pragma warning( pop )
#endif
#ifdef CV_DISABLE_CLANG_ENUM_WARNINGS
#undef CV_DISABLE_CLANG_ENUM_WARNINGS
#pragma clang diagnostic pop
#endif
#endif


@ -151,7 +151,16 @@ public:
static Matx ones();
static Matx eye();
static Matx diag(const diag_type& d);
/** @brief Generates uniformly distributed random numbers
@param a Range boundary.
@param b The other range boundary (boundaries don't have to be ordered, the lower boundary is inclusive,
the upper one is exclusive).
*/
static Matx randu(_Tp a, _Tp b);
/** @brief Generates normally distributed random numbers
@param a Mean value.
@param b Standard deviation.
*/
static Matx randn(_Tp a, _Tp b);
//! dot product computed with the default precision
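A short sketch of the random-initialization helpers documented above (values are illustrative):
cv::Matx33f u = cv::Matx33f::randu(0.f, 1.f);   // uniform samples in [0, 1)
cv::Matx33f g = cv::Matx33f::randn(0.f, 1.f);   // normal samples, mean 0, stddev 1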
@ -391,6 +400,10 @@ public:
const _Tp& operator ()(int i) const;
_Tp& operator ()(int i);
#ifdef CV_CXX11
Vec<_Tp, cn>& operator=(const Vec<_Tp, cn>& rhs) = default;
#endif
Vec(const Matx<_Tp, cn, 1>& a, const Matx<_Tp, cn, 1>& b, Matx_AddOp);
Vec(const Matx<_Tp, cn, 1>& a, const Matx<_Tp, cn, 1>& b, Matx_SubOp);
template<typename _T2> Vec(const Matx<_Tp, cn, 1>& a, _T2 alpha, Matx_ScaleOp);
@ -1275,6 +1288,34 @@ Matx<_Tp, m, n> operator * (double alpha, const Matx<_Tp, m, n>& a)
return Matx<_Tp, m, n>(a, alpha, Matx_ScaleOp());
}
template<typename _Tp, int m, int n> static inline
Matx<_Tp, m, n>& operator /= (Matx<_Tp, m, n>& a, float alpha)
{
for( int i = 0; i < m*n; i++ )
a.val[i] = a.val[i] / alpha;
return a;
}
template<typename _Tp, int m, int n> static inline
Matx<_Tp, m, n>& operator /= (Matx<_Tp, m, n>& a, double alpha)
{
for( int i = 0; i < m*n; i++ )
a.val[i] = a.val[i] / alpha;
return a;
}
template<typename _Tp, int m, int n> static inline
Matx<_Tp, m, n> operator / (const Matx<_Tp, m, n>& a, float alpha)
{
return Matx<_Tp, m, n>(a, 1.f/alpha, Matx_ScaleOp());
}
template<typename _Tp, int m, int n> static inline
Matx<_Tp, m, n> operator / (const Matx<_Tp, m, n>& a, double alpha)
{
return Matx<_Tp, m, n>(a, 1./alpha, Matx_ScaleOp());
}
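A minimal sketch of the element-wise scaling operators added here:
cv::Matx22f m(2.f, 4.f, 6.f, 8.f);
m /= 2.0;                      // in place: every element divided by alpha
cv::Matx22f q = m / 4.f;       // non-mutating: returns a scaled copy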
template<typename _Tp, int m, int n> static inline
Matx<_Tp, m, n> operator - (const Matx<_Tp, m, n>& a)
{


@ -70,7 +70,7 @@ class CV_EXPORTS Image2D;
class CV_EXPORTS_W_SIMPLE Device
{
public:
CV_WRAP Device();
CV_WRAP Device() CV_NOEXCEPT;
explicit Device(void* d);
Device(const Device& d);
Device& operator = (const Device& d);
@ -238,7 +238,7 @@ protected:
class CV_EXPORTS Context
{
public:
Context();
Context() CV_NOEXCEPT;
explicit Context(int dtype);
~Context();
Context(const Context& c);
@ -269,7 +269,7 @@ public:
class CV_EXPORTS Platform
{
public:
Platform();
Platform() CV_NOEXCEPT;
~Platform();
Platform(const Platform& p);
Platform& operator = (const Platform& p);
@ -324,7 +324,7 @@ void initializeContextFromHandle(Context& ctx, void* platform, void* context, vo
class CV_EXPORTS Queue
{
public:
Queue();
Queue() CV_NOEXCEPT;
explicit Queue(const Context& c, const Device& d=Device());
~Queue();
Queue(const Queue& q);
@ -350,7 +350,7 @@ class CV_EXPORTS KernelArg
public:
enum { LOCAL=1, READ_ONLY=2, WRITE_ONLY=4, READ_WRITE=6, CONSTANT=8, PTR_ONLY = 16, NO_SIZE=256 };
KernelArg(int _flags, UMat* _m, int wscale=1, int iwscale=1, const void* _obj=0, size_t _sz=0);
KernelArg();
KernelArg() CV_NOEXCEPT;
static KernelArg Local(size_t localMemSize)
{ return KernelArg(LOCAL, 0, 1, 1, 0, localMemSize); }
@ -387,7 +387,7 @@ public:
class CV_EXPORTS Kernel
{
public:
Kernel();
Kernel() CV_NOEXCEPT;
Kernel(const char* kname, const Program& prog);
Kernel(const char* kname, const ProgramSource& prog,
const String& buildopts = String(), String* errmsg=0);
@ -597,7 +597,7 @@ protected:
class CV_EXPORTS Program
{
public:
Program();
Program() CV_NOEXCEPT;
Program(const ProgramSource& src,
const String& buildflags, String& errmsg);
Program(const Program& prog);
@ -642,7 +642,7 @@ class CV_EXPORTS ProgramSource
public:
typedef uint64 hash_t; // deprecated
ProgramSource();
ProgramSource() CV_NOEXCEPT;
explicit ProgramSource(const String& module, const String& name, const String& codeStr, const String& codeHash);
explicit ProgramSource(const String& prog); // deprecated
explicit ProgramSource(const char* prog); // deprecated
@ -711,7 +711,7 @@ protected:
class CV_EXPORTS PlatformInfo
{
public:
PlatformInfo();
PlatformInfo() CV_NOEXCEPT;
explicit PlatformInfo(void* id);
~PlatformInfo();
@ -720,7 +720,12 @@ public:
String name() const;
String vendor() const;
/// See CL_PLATFORM_VERSION
String version() const;
int versionMajor() const;
int versionMinor() const;
int deviceNumber() const;
void getDevice(Device& device, int d) const;
@ -771,7 +776,7 @@ CV_EXPORTS void buildOptionsAddMatrixDescription(String& buildOptions, const Str
class CV_EXPORTS Image2D
{
public:
Image2D();
Image2D() CV_NOEXCEPT;
/**
@param src UMat object from which to get image properties and data


@ -47,6 +47,23 @@ static std::string bytesToStringRepr(size_t value)
s = s.substr(0, s.size() - 1);
return s;
}
static String getDeviceTypeString(const cv::ocl::Device& device)
{
if (device.type() == cv::ocl::Device::TYPE_CPU) {
return "CPU";
}
if (device.type() == cv::ocl::Device::TYPE_GPU) {
if (device.hostUnifiedMemory()) {
return "iGPU";
} else {
return "dGPU";
}
}
return "unknown";
}
} // namespace
static void dumpOpenCLInformation()
@ -64,46 +81,36 @@ static void dumpOpenCLInformation()
std::vector<PlatformInfo> platforms;
cv::ocl::getPlatfomsInfo(platforms);
if (platforms.size() > 0)
{
DUMP_MESSAGE_STDOUT("OpenCL Platforms: ");
for (size_t i = 0; i < platforms.size(); i++)
{
const PlatformInfo* platform = &platforms[i];
DUMP_MESSAGE_STDOUT(" " << platform->name().c_str());
Device current_device;
for (int j = 0; j < platform->deviceNumber(); j++)
{
platform->getDevice(current_device, j);
const char* deviceTypeStr = current_device.type() == Device::TYPE_CPU
? ("CPU") : (current_device.type() == Device::TYPE_GPU ? current_device.hostUnifiedMemory() ? "iGPU" : "dGPU" : "unknown");
DUMP_MESSAGE_STDOUT( " " << deviceTypeStr << ": " << current_device.name().c_str() << " (" << current_device.version().c_str() << ")");
DUMP_CONFIG_PROPERTY( cv::format("cv_ocl_platform_%d_device_%d", (int)i, (int)j ),
cv::format("(Platform=%s)(Type=%s)(Name=%s)(Version=%s)",
platform->name().c_str(), deviceTypeStr, current_device.name().c_str(), current_device.version().c_str()) );
}
}
}
else
if (platforms.empty())
{
DUMP_MESSAGE_STDOUT("OpenCL is not available");
DUMP_CONFIG_PROPERTY("cv_ocl", "not available");
return;
}
DUMP_MESSAGE_STDOUT("OpenCL Platforms: ");
for (size_t i = 0; i < platforms.size(); i++)
{
const PlatformInfo* platform = &platforms[i];
DUMP_MESSAGE_STDOUT(" " << platform->name());
Device current_device;
for (int j = 0; j < platform->deviceNumber(); j++)
{
platform->getDevice(current_device, j);
String deviceTypeStr = getDeviceTypeString(current_device);
DUMP_MESSAGE_STDOUT( " " << deviceTypeStr << ": " << current_device.name() << " (" << current_device.version() << ")");
DUMP_CONFIG_PROPERTY( cv::format("cv_ocl_platform_%d_device_%d", (int)i, j ),
cv::format("(Platform=%s)(Type=%s)(Name=%s)(Version=%s)",
platform->name().c_str(), deviceTypeStr.c_str(), current_device.name().c_str(), current_device.version().c_str()) );
}
}
const Device& device = Device::getDefault();
if (!device.available())
CV_Error(Error::OpenCLInitError, "OpenCL device is not available");
DUMP_MESSAGE_STDOUT("Current OpenCL device: ");
#if 0
DUMP_MESSAGE_STDOUT(" Platform = " << device.getPlatform().name());
DUMP_CONFIG_PROPERTY("cv_ocl_current_platformName", device.getPlatform().name());
#endif
const char* deviceTypeStr = device.type() == Device::TYPE_CPU
? ("CPU") : (device.type() == Device::TYPE_GPU ? device.hostUnifiedMemory() ? "iGPU" : "dGPU" : "unknown");
String deviceTypeStr = getDeviceTypeString(device);
DUMP_MESSAGE_STDOUT(" Type = " << deviceTypeStr);
DUMP_CONFIG_PROPERTY("cv_ocl_current_deviceType", deviceTypeStr);
@ -156,7 +163,7 @@ static void dumpOpenCLInformation()
}
pos = pos2 + 1;
}
DUMP_CONFIG_PROPERTY("cv_ocl_current_extensions", extensionsStr.c_str());
DUMP_CONFIG_PROPERTY("cv_ocl_current_extensions", extensionsStr);
const char* haveAmdBlasStr = haveAmdBlas() ? "Yes" : "No";
DUMP_MESSAGE_STDOUT(" Has AMD Blas = " << haveAmdBlasStr);
