vil_math_sse.hxx
Go to the documentation of this file.
1 // This is core/vil/vil_math_sse.hxx
2 #ifndef vil_math_sse_hxx_
3 #define vil_math_sse_hxx_
4 
5 #ifndef vil_math_h_
6 #error "This header cannot be included directly, only through vil_math_.h"
7 #endif
8 
9 #include <iostream>
10 #include <cstring>
11 #ifdef _MSC_VER
12 # include <vcl_msvc_warnings.h>
13 #endif
14 #include <vxl_config.h>
15 
16 #include <emmintrin.h>
17 #ifdef __SSE3__
18 #include <pmmintrin.h>
19 #endif
20 
21 //:
22 // \file
23 // \brief Various mathematical manipulations of 2D images implemented with SSE
24 // intrinsic functions
25 // \author Chuck Atkins
26 
27 //: Compute absolute difference of two 1D images (im_sum = |imA-imB|)
28 template<>
30  const vxl_byte* pxA, const vxl_byte* pxB, vxl_byte* pxD,
31  unsigned len)
32 {
33  assert(sizeof(vxl_byte) == 1);
34 
35  const unsigned ni_d_16 = len >> 4;
36  const unsigned ni_m_16 = len & 0x0F;
37 
38  const __m128i* pxAxmm = reinterpret_cast<const __m128i*>(pxA);
39  const __m128i* pxBxmm = reinterpret_cast<const __m128i*>(pxB);
40  __m128i* pxDxmm = reinterpret_cast<__m128i*>(pxD);
41 
42  // Loop through the first set of pxs in groups of 16
43  for (unsigned i = 0; i < ni_d_16; ++i, ++pxAxmm, ++pxBxmm, ++pxDxmm)
44  {
45 #ifdef __SSE3__
46  __m128i xmmA = _mm_lddqu_si128(pxAxmm);
47  __m128i xmmB = _mm_lddqu_si128(pxBxmm);
48 #else // SSE2
49  __m128i xmmA = _mm_loadu_si128(pxAxmm);
50  __m128i xmmB = _mm_loadu_si128(pxBxmm);
51 #endif
52 
53  __m128i xmmMax = _mm_max_epu8(xmmA, xmmB);
54  __m128i xmmMin = _mm_min_epu8(xmmA, xmmB);
55  __m128i xmmD = _mm_subs_epu8(xmmMax, xmmMin);
56 
57  _mm_storeu_si128(pxDxmm, xmmD);
58  }
59 
60  if (ni_m_16 != 0)
61  {
62  // Process the remainder < 16
63  vxl_byte pxLastA[16];
64  vxl_byte pxLastB[16];
65  vxl_byte pxLastD[16];
66  __m128i* pxLastAxmm = reinterpret_cast<__m128i*>(pxLastA);
67  __m128i* pxLastBxmm = reinterpret_cast<__m128i*>(pxLastB);
68  __m128i* pxLastDxmm = reinterpret_cast<__m128i*>(pxLastD);
69 
70  std::memcpy(pxLastA, pxAxmm, ni_m_16);
71  std::memcpy(pxLastB, pxBxmm, ni_m_16);
72 #ifdef __SSE3__
73  __m128i xmmA = _mm_lddqu_si128(pxLastAxmm);
74  __m128i xmmB = _mm_lddqu_si128(pxLastBxmm);
75 #else // SSE2
76  __m128i xmmA = _mm_loadu_si128(pxLastAxmm);
77  __m128i xmmB = _mm_loadu_si128(pxLastBxmm);
78 #endif
79 
80  __m128i xmmMax = _mm_max_epu8(xmmA, xmmB);
81  __m128i xmmMin = _mm_min_epu8(xmmA, xmmB);
82  __m128i xmmD = _mm_subs_epu8(xmmMax, xmmMin);
83 
84  _mm_storeu_si128(pxLastDxmm, xmmD);
85  std::memcpy(pxDxmm, pxLastD, ni_m_16);
86  }
87 }
88 
89 
90 //: Compute absolute difference of two images (im_sum = |imA-imB|)
91 template<>
93  const float* pxA, const float* pxB, float* pxD,
94  unsigned len)
95 {
96  assert(sizeof(float) == 4);
97 
98  const unsigned ni_d_4 = len >> 2;
99  const unsigned ni_m_4_bytes = (len & 0x03) << 2;
100 
101  // Loop through the first set of pxs in groups of 16
102  for (unsigned i = 0; i < ni_d_4; ++i, pxA += 4, pxB += 4, pxD += 4)
103  {
104  __m128 xmmA = _mm_loadu_ps(pxA);
105  __m128 xmmB = _mm_loadu_ps(pxB);
106 
107  __m128 xmmMax = _mm_max_ps(xmmA, xmmB);
108  __m128 xmmMin = _mm_min_ps(xmmA, xmmB);
109  __m128 xmmD = _mm_sub_ps(xmmMax, xmmMin);
110 
111  _mm_storeu_ps(pxD, xmmD);
112  }
113 
114  if (ni_m_4_bytes != 0)
115  {
116  // Process the remainder < 4
117  // Use these for the last set of pxs in each row
118  float pxLastA[4];
119  float pxLastB[4];
120  float pxLastD[4];
121 
122  std::memcpy(pxLastA, pxA, ni_m_4_bytes);
123  std::memcpy(pxLastB, pxB, ni_m_4_bytes);
124  __m128 xmmA = _mm_loadu_ps(pxLastA);
125  __m128 xmmB = _mm_loadu_ps(pxLastB);
126 
127  __m128 xmmMax = _mm_max_ps(xmmA, xmmB);
128  __m128 xmmMin = _mm_min_ps(xmmA, xmmB);
129  __m128 xmmD = _mm_sub_ps(xmmMax, xmmMin);
130 
131  _mm_storeu_ps(pxLastD, xmmD);
132 
133  std::memcpy(pxD, pxLastD, ni_m_4_bytes);
134  }
135 }
136 
137 #endif // vil_math_sse_hxx_
void vil_math_image_abs_difference_1d_sse< float, float, float >(const float *pxA, const float *pxB, float *pxD, unsigned len)
Compute absolute difference of two images (im_sum = |imA-imB|).
void vil_math_image_abs_difference_1d_sse< vxl_byte, vxl_byte, vxl_byte >(const vxl_byte *pxA, const vxl_byte *pxB, vxl_byte *pxD, unsigned len)
Compute absolute difference of two 1D images (im_sum = |imA-imB|).