前言
这篇文章的原始来源请看我以前写的这篇博客:https://blog.csdn.net/just_sort/article/details/85007555 上面有算法原理和普通的opencv实现。然后这篇文章是在SSE优化系列七: 对盒滤波的SSE优化(https://blog.csdn.net/just_sort/article/details/98075712)的基础上,将多尺度提升算法中的高斯滤波换成高速的SSE优化的盒滤波,并对其他部分的处理代码进行SSE优化。
算法原理
具体见我在2018年12月写的:https://blog.csdn.net/just_sort/article/details/85007555
原始实现
/**
 * Multi-scale detail enhancement, scalar reference version.
 *
 * Produces three blurred copies of Src by calling BoxBlur_SSE (defined
 * elsewhere) with radii Radius, 2 * Radius and 4 * Radius, then for every
 * byte adds a weighted sum of the differences between successive scales
 * back onto the source value and saturates the result to [0, 255].
 *
 * @param Src    Input bytes; Height * Stride bytes are read.
 * @param Dest   Output bytes; Height * Stride bytes are written.
 * @param Width  Image width; used here only to derive Channel = Stride / Width.
 * @param Height Number of rows.
 * @param Stride Bytes per row.
 *               NOTE(review): Channel = Stride / Width is only exact when rows
 *               carry no padding - confirm against callers.
 * @param Radius Base blur radius passed to BoxBlur_SSE.
 *
 * If a dimension is non-positive or a temporary buffer cannot be allocated,
 * the function returns without writing to Dest.
 */
void MultiScaleSharpen(unsigned char *Src, unsigned char *Dest, int Width, int Height, int Stride, int Radius) {
    // Guard the Stride / Width division and the size computation below.
    if (Width <= 0 || Height <= 0 || Stride <= 0) {
        return;
    }

    const int Channel = Stride / Width;
    // Compute the byte count in size_t so large images do not overflow int.
    const size_t Total = (size_t)Height * (size_t)Stride;

    unsigned char *B1 = (unsigned char *)malloc(Total);
    unsigned char *B2 = (unsigned char *)malloc(Total);
    unsigned char *B3 = (unsigned char *)malloc(Total);

    if (B1 != NULL && B2 != NULL && B3 != NULL) {
        BoxBlur_SSE(Src, B1, Width, Height, Channel, Stride, Radius);
        BoxBlur_SSE(Src, B2, Width, Height, Channel, Stride, Radius * 2);
        BoxBlur_SSE(Src, B3, Width, Height, Channel, Stride, Radius * 4);

        for (size_t Y = 0; Y < Total; Y++) {
            const int DiffB1 = Src[Y] - B1[Y];
            const int DiffB2 = B1[Y] - B2[Y];
            const int DiffB3 = B2[Y] - B3[Y];
            // Weights are (1 - 0.5 * sign(D1)), 0.5 and 0.25, scaled by 4 so
            // the arithmetic stays in integers; the division truncates toward 0.
            Dest[Y] = IM_ClampToByte(((4 - 2 * IM_Sign(DiffB1)) * DiffB1 + 2 * DiffB2 + DiffB3) / 4 + Src[Y]);
        }
    }

    // free(NULL) is a no-op, so this also covers a partially failed allocation.
    free(B1);
    free(B2);
    free(B3);
}
这里有 IM_ClampToByte 和 IM_Sign 两个函数需要注意,其他部分都和论文的公式保持一致。这两个函数的源码如下:
/**
 * Returns the sign of X: 1 when X > 0, -1 when X < 0 and 0 when X == 0.
 *
 * Implemented with comparisons instead of negation and shifts, so it is
 * well defined for every int (including INT_MIN, where -X would overflow)
 * and does not depend on the width of int or on how the implementation
 * right-shifts negative values.
 */
int IM_Sign(int X) {
    return (X > 0) - (X < 0);
}
/**
 * Saturates an integer to the unsigned 8-bit range.
 *
 * @param Value Any int.
 * @return 0 when Value is negative, 255 when Value exceeds 255,
 *         otherwise Value itself.
 */
inline unsigned char IM_ClampToByte(int Value)
{
    // Clamp from below first, then from above.
    const int AtLeastZero = (Value < 0) ? 0 : Value;
    const int Saturated = (AtLeastZero > 255) ? 255 : AtLeastZero;
    return (unsigned char)Saturated;
}
SSE优化
重点来看一下对这个算法的SSE优化。第一个优化点在于 IM_Sign 这个函数:当参数大于0时返回1,小于0时返回-1,等于0时返回0。从ImageShop博主的分享中了解到,这个函数可以用下面的指令来实现:
/**
 * Per-lane sign of eight signed 16-bit integers.
 *
 * @param v Eight packed signed 16-bit values.
 * @return  A vector whose lanes are 1, -1 or 0 for positive, negative and
 *          zero input lanes respectively.
 */
inline __m128i _mm_sgn_epi16(__m128i v) {
    const __m128i PlusOne = _mm_set1_epi16(1);
#ifdef __SSSE3__
    // SSSE3 and later: PSIGNW negates, keeps or zeroes the constant 1
    // according to the sign of each lane of v.
    return _mm_sign_epi16(PlusOne, v);
#else
    // SSE2/SSE3 fallback: clamp every lane into [-1, 1] with PMINSW/PMAXSW.
    const __m128i MinusOne = _mm_set1_epi16(-1);
    const __m128i CappedAbove = _mm_min_epi16(v, PlusOne);
    return _mm_max_epi16(CappedAbove, MinusOne);
#endif
}
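注意上面根据是否支持SSSE3(注意不是SSE3,_mm_sign_epi16 是SSSE3指令)分为两种不同的实现:支持SSSE3时使用PSIGNW,否则在SSE2/SSE3上使用PMINSW/PMAXSW。要注意自己的机器是否支持SSSE3,以及编译时是否开启了SSSE3支持(例如GCC/Clang的 -mssse3 选项)。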
第二个点是考虑到数据范围,需要将字节数据扩展到signed short类型,这个操作我们已经非常熟悉了。下面给出原始的SSE代码实现:
/**
 * Multi-scale detail enhancement, SSE version.
 *
 * Produces three blurred copies of Src by calling BoxBlur_SSE (defined
 * elsewhere) with radii Radius, 2 * Radius and 4 * Radius, then combines
 * them with the source eight bytes at a time; the bytes left over after the
 * last full group of eight are handled by a scalar tail loop.
 *
 * @param Src    Input bytes; Height * Stride bytes are read.
 * @param Dest   Output bytes; Height * Stride bytes are written.
 * @param Width  Image width; used here only to derive Channel = Stride / Width.
 * @param Height Number of rows.
 * @param Stride Bytes per row.
 *               NOTE(review): Channel = Stride / Width is only exact when rows
 *               carry no padding - confirm against callers.
 * @param Radius Base blur radius passed to BoxBlur_SSE.
 *
 * If a dimension is non-positive or a temporary buffer cannot be allocated,
 * the function returns without writing to Dest.
 *
 * The vector loop calls _mm_sign_epi16 unconditionally, so this function
 * needs SSSE3 to be enabled at compile time.
 *
 * NOTE(review): the vector loop divides by 4 with an arithmetic right shift
 * (rounds toward negative infinity) while the scalar tail uses C integer
 * division (truncates toward zero), so for negative, non-multiple-of-4
 * numerators the two paths can differ by 1.
 */
void MultiScaleSharpen_SSE(unsigned char *Src, unsigned char *Dest, int Width, int Height, int Stride, int Radius) {
    // Guard the Stride / Width division and the size computation below.
    if (Width <= 0 || Height <= 0 || Stride <= 0) {
        return;
    }

    const int Channel = Stride / Width;
    // Compute the byte count in size_t so large images do not overflow int.
    const size_t Total = (size_t)Height * (size_t)Stride;

    unsigned char *B1 = (unsigned char *)malloc(Total);
    unsigned char *B2 = (unsigned char *)malloc(Total);
    unsigned char *B3 = (unsigned char *)malloc(Total);

    if (B1 != NULL && B2 != NULL && B3 != NULL) {
        BoxBlur_SSE(Src, B1, Width, Height, Channel, Stride, Radius);
        BoxBlur_SSE(Src, B2, Width, Height, Channel, Stride, Radius * 2);
        BoxBlur_SSE(Src, B3, Width, Height, Channel, Stride, Radius * 4);

        // Eight bytes are widened to eight signed 16-bit lanes per iteration.
        const size_t BlockSize = 8;
        const size_t VecEnd = Total - Total % BlockSize;
        const __m128i Zero = _mm_setzero_si128();

        for (size_t Y = 0; Y < VecEnd; Y += BlockSize) {
            // Load 8 bytes from each plane and zero-extend them to 16 bits.
            __m128i SrcV = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(Src + Y)), Zero);
            __m128i SrcB1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(B1 + Y)), Zero);
            __m128i SrcB2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(B2 + Y)), Zero);
            __m128i SrcB3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(B3 + Y)), Zero);
            __m128i DiffB1 = _mm_sub_epi16(SrcV, SrcB1);

            // With D1 = Src - B1, D2 = B1 - B2, D3 = B2 - B3:
            //   (4 - 2 * sign(D1)) * D1 + 2 * D2 + D3
            //     = 4 * D1 + 2 * (B1 - |D1|) - (B2 + B3)
            // _mm_sign_epi16(D1, D1) yields |D1|, and the exact 4 * D1 term is
            // taken outside the shift as a plain + D1.
            __m128i AbsDiffB1 = _mm_sign_epi16(DiffB1, DiffB1);
            __m128i Doubled = _mm_slli_epi16(_mm_sub_epi16(SrcB1, AbsDiffB1), 1);
            __m128i Numerator = _mm_sub_epi16(Doubled, _mm_add_epi16(SrcB2, SrcB3));
            __m128i Offset = _mm_add_epi16(_mm_srai_epi16(Numerator, 2), DiffB1);

            // Saturate back to unsigned bytes and store the low 8 of them.
            _mm_storel_epi64((__m128i *)(Dest + Y), _mm_packus_epi16(_mm_add_epi16(SrcV, Offset), Zero));
        }

        // Scalar tail for the Total % 8 bytes the vector loop did not cover.
        for (size_t Y = VecEnd; Y < Total; Y++) {
            const int DiffB1 = Src[Y] - B1[Y];
            const int DiffB2 = B1[Y] - B2[Y];
            const int DiffB3 = B2[Y] - B3[Y];
            Dest[Y] = IM_ClampToByte(((4 - 2 * IM_Sign(DiffB1)) * DiffB1 + 2 * DiffB2 + DiffB3) / 4 + Src[Y]);
        }
    }

    // free(NULL) is a no-op, so this also covers a partially failed allocation.
    free(B1);
    free(B2);
    free(B3);
}
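注意到计算Offset时,最直接的实现就是注释掉的那一行。但如果把 IM_Sign(DiffB1) * DiffB1 作为一个整体来观察,就会发现它其实就是 |DiffB1|,可以直接用 _mm_sign_epi16(DiffB1, DiffB1) 实现,这样就可以省下一条指令。接着把 2 * DiffB2 + DiffB3 展开为 2 * B1 - B2 - B3,并与 DiffB1 的相关项合并,整个表达式就可以化简为 ((B1 - |DiffB1|) * 2 - (B2 + B3)) / 4 + DiffB1,只需要几条指令,这就是最后的SSE优化版本。需要注意的是,_mm_srai_epi16 的算术右移是向负无穷取整,而C语言的整数除法是向零取整,所以当被除数为负且不能被4整除时,SSE版本的结果会比标量版本小1。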
我们来看下速度对比:
效果
原图
多尺度细节提升后的结果
参考博客
https://www.cnblogs.com/Imageshop/p/7895008.html