前言
仍然是学习Imageshop的文章,https://www.cnblogs.com/Imageshop/p/7285564.html 。做了一个小总结,并且完整实现了这个SSE优化的算法,可以关注我专门用SIMD优化图像处理算法的工程:https://github.com/BBuf/Image-processing-algorithm-Speed
传统的Sobel算法实现
我之前写过Sobel边缘检测算法如何产生X方向和方向的系数,https://blog.csdn.net/just_sort/article/details/85015054 。具体原理就不再赘述,这里原理就不再叙述了,给出指针版本实现的边缘检测算法。
inline unsigned char IM_ClampToByte(int Value)
{
if (Value < 0)
return 0;
else if (Value > 255)
return 255;
else
return (unsigned char)Value;
//return ((Value | ((signed int)(255 - Value) >> 31)) & ~((signed int)Value >> 31));
}
void Sobel_FLOAT(unsigned char *Src, unsigned char *Dest, int Width, int Height, int Stride) {
int Channel = Stride / Width;
unsigned char *RowCopy = (unsigned char*)malloc((Width + 2) * 3 * Channel);
unsigned char *First = RowCopy;
unsigned char *Second = RowCopy + (Width + 2) * Channel;
unsigned char *Third = RowCopy + (Width + 2) * 2 * Channel;
//拷贝第二行数据,边界值填充
memcpy(Second, Src, Channel);
memcpy(Second + Channel, Src, Width*Channel);
memcpy(Second + (Width + 1)*Channel, Src + (Width - 1)*Channel, Channel);
//第一行和第二行一样
memcpy(First, Second, (Width + 2) * Channel);
//拷贝第三行数据,边界值填充
memcpy(Third, Src + Stride, Channel);
memcpy(Third + Channel, Src + Stride, Width * Channel);
memcpy(Third + (Width + 1) * Channel, Src + Stride + (Width - 1) * Channel, Channel);
for (int Y = 0; Y < Height; Y++) {
unsigned char *LinePS = Src + Y * Stride;
unsigned char *LinePD = Dest + Y * Stride;
if (Y != 0) {
unsigned char *Temp = First;
First = Second;
Second = Third;
Third = Temp;
}
if (Y == Height - 1) {
memcpy(Third, Second, (Width + 2) * Channel);
}
else {
memcpy(Third, Src + (Y + 1) * Stride, Channel);
memcpy(Third + Channel, Src + (Y + 1) * Stride, Width * Channel); // 由于备份了前面一行的数据,这里即使Src和Dest相同也是没有问题的
memcpy(Third + (Width + 1) * Channel, Src + (Y + 1) * Stride + (Width - 1) * Channel, Channel);
}
if (Channel == 1) {
for (int X = 0; X < Width; X++)
{
int GX = First[X] - First[X + 2] + (Second[X] - Second[X + 2]) * 2 + Third[X] - Third[X + 2];
int GY = First[X] + First[X + 2] + (First[X + 1] - Third[X + 1]) * 2 - Third[X] - Third[X + 2];
LinePD[X] = IM_ClampToByte(sqrtf(GX * GX + GY * GY + 0.0F));
}
}
else
{
for (int X = 0; X < Width * 3; X++)
{
int GX = First[X] - First[X + 6] + (Second[X] - Second[X + 6]) * 2 + Third[X] - Third[X + 6];
int GY = First[X] + First[X + 6] + (First[X + 3] - Third[X + 3]) * 2 - Third[X] - Third[X + 6];
LinePD[X] = IM_ClampToByte(sqrtf(GX * GX + GY * GY + 0.0F));
}
}
}
free(RowCopy);
}
这个边缘检测算的代码写得很好,因为这个代码不仅仅支持了In-Place操作(就是Src核Dest可以是同一块内存),而且支持图像边界处理。这段代码使用I7-8770CPU测试得速度为192.01ms。
优化1:去掉浮点数
由于该Sobel的过程最后是把数据用图像的方式记录下来,因此,IM_ClampToByte(sqrtf(GX * GX + GY * GY + 0.0F))可以用查表的方式来实现。简单改成如下的版本, 避免了浮点计算。
void Sobel_INT(unsigned char *Src, unsigned char *Dest, int Width, int Height, int Stride) {
int Channel = Stride / Width;
unsigned char *RowCopy = (unsigned char*)malloc((Width + 2) * 3 * Channel);
unsigned char *First = RowCopy;
unsigned char *Second = RowCopy + (Width + 2) * Channel;
unsigned char *Third = RowCopy + (Width + 2) * 2 * Channel;
//拷贝第二行数据,边界值填充
memcpy(Second, Src, Channel);
memcpy(Second + Channel, Src, Width*Channel);
memcpy(Second + (Width + 1)*Channel, Src + (Width - 1)*Channel, Channel);
//第一行和第二行一样
memcpy(First, Second, (Width + 2) * Channel);
//拷贝第三行数据,边界值填充
memcpy(Third, Src + Stride, Channel);
memcpy(Third + Channel, Src + Stride, Width * Channel);
memcpy(Third + (Width + 1) * Channel, Src + Stride + (Width - 1) * Channel, Channel);
unsigned char Table[65026];
for (int Y = 0; Y < 65026; Y++) Table[Y] = (sqrtf(Y + 0.0f) + 0.5f);
for (int Y = 0; Y < Height; Y++) {
unsigned char *LinePS = Src + Y * Stride;
unsigned char *LinePD = Dest + Y * Stride;
if (Y != 0) {
unsigned char *Temp = First;
First = Second;
Second = Third;
Third = Temp;
}
if (Y == Height - 1) {
memcpy(Third, Second, (Width + 2) * Channel);
}
else {
memcpy(Third, Src + (Y + 1) * Stride, Channel);
memcpy(Third + Channel, Src + (Y + 1) * Stride, Width * Channel); // 由于备份了前面一行的数据,这里即使Src和Dest相同也是没有问题的
memcpy(Third + (Width + 1) * Channel, Src + (Y + 1) * Stride + (Width - 1) * Channel, Channel);
}
if (Channel == 1) {
for (int X = 0; X < Width; X++)
{
int GX = First[X] - First[X + 2] + (Second[X] - Second[X + 2]) * 2 + Third[X] - Third[X + 2];
int GY = First[X] + First[X + 2] + (First[X + 1] - Third[X + 1]) * 2 - Third[X] - Third[X + 2];
LinePD[X] = Table[min(GX * GX + GY * GY, 65025)];
}
}
else
{
for (int X = 0; X < Width * 3; X++)
{
int GX = First[X] - First[X + 6] + (Second[X] - Second[X + 6]) * 2 + Third[X] - Third[X + 6];
int GY = First[X] + First[X + 6] + (First[X + 3] - Third[X + 3]) * 2 - Third[X] - Third[X + 6];
LinePD[X] = Table[min(GX * GX + GY * GY, 65025)];
}
}
}
free(RowCopy);
}
速度测试的结果是91.20ms,比浮点数计算快了大概2倍,还是很不错的。
优化2:SSE优化
在第二段代码的基础上,我们来考虑一下SSE优化的位置,对于灰度图的优化是没有必要,因为在计算的时候每次只和3个像素相关。而我们来看BGR图的计算时,
for (int X = 0; X < Width * 3; X++)
{
int GX = First[X] - First[X + 6] + (Second[X] - Second[X + 6]) * 2 + Third[X] - Third[X + 6];
int GY = First[X] + First[X + 6] + (First[X + 3] - Third[X + 3]) * 2 - Third[X] - Third[X + 6];
LinePD[X] = Table[min(GX * GX + GY * GY, 65025)];
}
里面涉及到了8个不同的像素,考虑计算的特性和数据的范围,在内部计算时这个int可以用short代替,也就是要把加载的字节图像数据转换为short类型先,这样SSE优化方式为用8个SSE变量分别记录8个连续的像素风量的颜色值,每个颜色值用16位数据表达。这可以使用_mm_unpacklo_epi8
配合_mm_loadl_epi64
实现:
__m128i FirstP0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(First + X)), Zero);
__m128i FirstP1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(First + X + 3)), Zero);
__m128i FirstP2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(First + X + 6)), Zero);
__m128i SecondP0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(Second + X)), Zero);
__m128i SecondP2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(Second + X + 6)), Zero);
__m128i ThirdP0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(Third + X)), Zero);
__m128i ThirdP1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(Third + X + 3)), Zero);
__m128i ThirdP2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(Third + X + 6)), Zero);
接下来就是对GX和GY的计算:
__m128i GX16 = _mm_abs_epi16(_mm_add_epi16(_mm_add_epi16(_mm_sub_epi16(FirstP0, FirstP2), _mm_slli_epi16(_mm_sub_epi16(SecondP0, SecondP2), 1)), _mm_sub_epi16(ThirdP0, ThirdP2)));
__m128i GY16 = _mm_abs_epi16(_mm_sub_epi16(_mm_add_epi16(_mm_add_epi16(FirstP0, FirstP2), _mm_slli_epi16(_mm_sub_epi16(FirstP1, ThirdP1), 1)), _mm_add_epi16(ThirdP0, ThirdP2)));
这个时候的GX16和GY16里保存的是8个16位的中间结果,由于SSE只提供了浮点数的sqrt操作,我们必须将它们转换为浮点数,那么这个转换的第一步就必须是先将它们转换为int的整形数,这样,就必须一个拆成2个,即:
__m128i GX32L = _mm_unpacklo_epi16(GX16, Zero);
__m128i GX32H = _mm_unpackhi_epi16(GX16, Zero);
__m128i GY32L = _mm_unpacklo_epi16(GY16, Zero);
__m128i GY32H = _mm_unpackhi_epi16(GY16, Zero);
接下来分别对高位低位进行平方运算:
__m128i ResultL = _mm_cvtps_epi32(_mm_sqrt_ps(_mm_cvtepi32_ps(_mm_add_epi32(_mm_mullo_epi32(GX32L, GX32L), _mm_mullo_epi32(GY32L, GY32L)))));
__m128i ResultH = _mm_cvtps_epi32(_mm_sqrt_ps(_mm_cvtepi32_ps(_mm_add_epi32(_mm_mullo_epi32(GX32H, GX32H), _mm_mullo_epi32(GY32H, GY32H)))));
最后一步,得到8个int型的结果,这个结果有要转换为字节类型的,并且这些数据有可能会超出字节所能表达的范围,所以就需要用到SSE自带的抗饱和向下打包函数了,如下所示:
_mm_storel_epi64((__m128i *)(LinePD + X), _mm_packus_epi16(_mm_packus_epi32(ResultL, ResultH), Zero));
这里如ImageShop指出的需要特别注意一个细节。
转自ImageShop:
通常,我们都是对像素的字节数据进行向上扩展,他们都是正数,所以用unpack之类的配合zero把高8位或高16位的数据填充为0就可以了,但是在本例中,GX16或者GY16很有可能是负数,而负数的最高位是符号位,如果都填充为0,则变为正数了,明显改变原始的数据了,所以得到了错误的结果。
那如何解决问题呢,对于本例,很简单,因为后面只有一个平方操作,因此,对GX先取绝对值是不会改变计算的结果的,这样就不会出现负的数据了,修改之后,果然结果正确。
还有更加高级的优化方式,可以减少SIMD指令的个数:
我实现了使用_mm_madd_epi16的优化,代码在github: https://github.com/BBuf/Image-processing-algorithm-Speed/blob/master/speed_sobel_edgedetection_sse.cpp 。
最后的速度测试结果为21.14ms。相对于原始的C语言实现加速了9倍。