2 #ifndef vil_math_sse_hxx_ 3 #define vil_math_sse_hxx_ 6 #error "This header cannot be included directly, only through vil_math_.h" 12 # include <vcl_msvc_warnings.h> 14 #include <vxl_config.h> 16 #include <emmintrin.h> 18 #include <pmmintrin.h> 30 const vxl_byte* pxA,
const vxl_byte* pxB, vxl_byte* pxD,
33 assert(
sizeof(vxl_byte) == 1);
35 const unsigned ni_d_16 = len >> 4;
36 const unsigned ni_m_16 = len & 0x0F;
38 const __m128i* pxAxmm = reinterpret_cast<const __m128i*>(pxA);
39 const __m128i* pxBxmm = reinterpret_cast<const __m128i*>(pxB);
40 __m128i* pxDxmm = reinterpret_cast<__m128i*>(pxD);
43 for (
unsigned i = 0; i < ni_d_16; ++i, ++pxAxmm, ++pxBxmm, ++pxDxmm)
46 __m128i xmmA = _mm_lddqu_si128(pxAxmm);
47 __m128i xmmB = _mm_lddqu_si128(pxBxmm);
49 __m128i xmmA = _mm_loadu_si128(pxAxmm);
50 __m128i xmmB = _mm_loadu_si128(pxBxmm);
53 __m128i xmmMax = _mm_max_epu8(xmmA, xmmB);
54 __m128i xmmMin = _mm_min_epu8(xmmA, xmmB);
55 __m128i xmmD = _mm_subs_epu8(xmmMax, xmmMin);
57 _mm_storeu_si128(pxDxmm, xmmD);
66 __m128i* pxLastAxmm = reinterpret_cast<__m128i*>(pxLastA);
67 __m128i* pxLastBxmm = reinterpret_cast<__m128i*>(pxLastB);
68 __m128i* pxLastDxmm = reinterpret_cast<__m128i*>(pxLastD);
70 std::memcpy(pxLastA, pxAxmm, ni_m_16);
71 std::memcpy(pxLastB, pxBxmm, ni_m_16);
73 __m128i xmmA = _mm_lddqu_si128(pxLastAxmm);
74 __m128i xmmB = _mm_lddqu_si128(pxLastBxmm);
76 __m128i xmmA = _mm_loadu_si128(pxLastAxmm);
77 __m128i xmmB = _mm_loadu_si128(pxLastBxmm);
80 __m128i xmmMax = _mm_max_epu8(xmmA, xmmB);
81 __m128i xmmMin = _mm_min_epu8(xmmA, xmmB);
82 __m128i xmmD = _mm_subs_epu8(xmmMax, xmmMin);
84 _mm_storeu_si128(pxLastDxmm, xmmD);
85 std::memcpy(pxDxmm, pxLastD, ni_m_16);
93 const float* pxA,
const float* pxB,
float* pxD,
96 assert(
sizeof(
float) == 4);
98 const unsigned ni_d_4 = len >> 2;
99 const unsigned ni_m_4_bytes = (len & 0x03) << 2;
102 for (
unsigned i = 0; i < ni_d_4; ++i, pxA += 4, pxB += 4, pxD += 4)
104 __m128 xmmA = _mm_loadu_ps(pxA);
105 __m128 xmmB = _mm_loadu_ps(pxB);
107 __m128 xmmMax = _mm_max_ps(xmmA, xmmB);
108 __m128 xmmMin = _mm_min_ps(xmmA, xmmB);
109 __m128 xmmD = _mm_sub_ps(xmmMax, xmmMin);
111 _mm_storeu_ps(pxD, xmmD);
114 if (ni_m_4_bytes != 0)
122 std::memcpy(pxLastA, pxA, ni_m_4_bytes);
123 std::memcpy(pxLastB, pxB, ni_m_4_bytes);
124 __m128 xmmA = _mm_loadu_ps(pxLastA);
125 __m128 xmmB = _mm_loadu_ps(pxLastB);
127 __m128 xmmMax = _mm_max_ps(xmmA, xmmB);
128 __m128 xmmMin = _mm_min_ps(xmmA, xmmB);
129 __m128 xmmD = _mm_sub_ps(xmmMax, xmmMin);
131 _mm_storeu_ps(pxLastD, xmmD);
133 std::memcpy(pxD, pxLastD, ni_m_4_bytes);
#endif // vil_math_sse_hxx_

// Specialisations documented for this header:
//
//   void vil_math_image_abs_difference_1d_sse< float, float, float >(
//       const float *pxA, const float *pxB, float *pxD, unsigned len)
//     Compute absolute difference of two images (im_sum = |imA-imB|).
//
//   void vil_math_image_abs_difference_1d_sse< vxl_byte, vxl_byte, vxl_byte >(
//       const vxl_byte *pxA, const vxl_byte *pxB, vxl_byte *pxD, unsigned len)
//     Compute absolute difference of two 1D images (im_sum = |imA-imB|).