vnl_sse.h
#ifndef vnl_sse_h_
#define vnl_sse_h_
//:
// \file
// \author Kieran O'Mahony
// \date Sep 2007
// \brief Support for Streaming SIMD Extensions to speed up vector arithmetic
// \verbatim
//  Modifications
//   2009-03-30 Peter Vanroose - Added arg_min() & arg_max() and reimplemented min() & max()
// \endverbatim

#include <vcl_compiler_detection.h> // for macro decisions based on compiler type
#ifdef _MSC_VER
# include <vcl_msvc_warnings.h>
#endif
#include <vxl_config.h> // for checking supported integer data types
#include <cfloat>  // for DBL_MAX and FLT_MAX
#include <cstddef> // for std::size_t and std::ptrdiff_t

#include <vnl/vnl_config.h> // is SSE enabled
#include <vnl/vnl_alloc.h>  // fallback memory allocator
#include "vnl/vnl_export.h"

// Some caveats...
// - Because of the way vnl_matrix is represented in memory, 16-byte alignment
//   cannot be guaranteed, so the slower unaligned load intrinsics have to be
//   used for matrices.
// - The GCC 3.4 intrinsics seem to be horrendously slow...

// - On Mac OS X, in order to support Universal Binaries, we do not consider it a hard
//   error if VNL_CONFIG_ENABLE_SSE2 is true for PowerPC builds. PowerPC obviously does
//   not support SSE2, so we simply redefine it to false.
#if VNL_CONFIG_ENABLE_SSE2
# if defined(__APPLE__) && (defined(__ppc__) || defined(__ppc64__))
#  undef VNL_CONFIG_ENABLE_SSE2
#  define VNL_CONFIG_ENABLE_SSE2 0
# elif !VXL_HAS_EMMINTRIN_H
#  error "Required file emmintrin.h for SSE2 not found"
# else
#  include <emmintrin.h> // SSE 2 intrinsics
# endif
#endif


// Try to use compiler directives to force inlining where possible.
// The directive for aligning stack memory is also compiler dependent.
#if defined(__GNUC__)
// With attribute always_inline, gcc can give an error if a function
// cannot be inlined, so it is disabled.  Problem seen on 64 bit
// platforms with std::vector<vnl_rational>.
# define VNL_SSE_FORCE_INLINE /* __attribute__((always_inline)) */ inline
# define VNL_SSE_STACK_ALIGNED(x) __attribute__((aligned(x))) VNL_EXPORT
#elif defined _MSC_VER
# define VNL_SSE_FORCE_INLINE __forceinline
# define VNL_SSE_STACK_ALIGNED(x) __declspec(align(x)) VNL_EXPORT
#else
# define VNL_SSE_FORCE_INLINE inline
# define VNL_SSE_STACK_ALIGNED(x)
# define VNL_SSE_STACK_STORE(pf) _mm_storeu_##pf // no stack alignment so use unaligned store (slower!)
#endif


// SSE operates faster with 16-byte aligned memory addresses.
// Check what memory alignment function is supported.
#if VNL_CONFIG_ENABLE_SSE2 && VXL_HAS_MM_MALLOC
# define VNL_SSE_ALLOC(n,s,a) _mm_malloc(n*s,a)
# define VNL_SSE_FREE(v,n,s) _mm_free(v)
#elif VNL_CONFIG_ENABLE_SSE2 && VXL_HAS_ALIGNED_MALLOC
# include <malloc.h>
# define VNL_SSE_ALLOC(n,s,a) _aligned_malloc(n*s,a)
# define VNL_SSE_FREE(v,n,s) _aligned_free(v)
#elif VNL_CONFIG_ENABLE_SSE2 && VXL_HAS_MINGW_ALIGNED_MALLOC
# include <malloc.h>
# define VNL_SSE_ALLOC(n,s,a) __mingw_aligned_malloc(n*s,a)
# define VNL_SSE_FREE(v,n,s) __mingw_aligned_free(v)
#elif VNL_CONFIG_ENABLE_SSE2 && VXL_HAS_POSIX_MEMALIGN
# include <cstdlib>
# define VNL_SSE_ALLOC(n,s,a) memalign(a,n*s)
# define VNL_SSE_FREE(v,n,s) std::free(v)
#else // SSE2 disabled or no memory alignment support: fall back to the slower unaligned intrinsics
# define VNL_SSE_HEAP_STORE(pf) _mm_storeu_##pf
# define VNL_SSE_HEAP_LOAD(pf) _mm_loadu_##pf
# if VNL_CONFIG_THREAD_SAFE
#  define VNL_SSE_ALLOC(n,s,a) new char[(n)*(s)]
#  define VNL_SSE_FREE(v,n,s) (delete [] static_cast<char*>(v))
# else
#  define VNL_SSE_ALLOC(n,s,a) vnl_alloc::allocate((n == 0) ? 8 : (n * s));
#  define VNL_SSE_FREE(v,n,s) if (v) vnl_alloc::deallocate(v, (n == 0) ? 8 : (n * s));
# endif
#endif


// Stack memory can be aligned -> use SSE aligned store
#ifndef VNL_SSE_STACK_STORE
# define VNL_SSE_STACK_STORE(pf) _mm_store_##pf
#endif

// Heap memory can be aligned -> use SSE aligned load & store
#ifndef VNL_SSE_HEAP_STORE
# define VNL_SSE_HEAP_STORE(pf) _mm_store_##pf
# define VNL_SSE_HEAP_LOAD(pf) _mm_load_##pf
#endif
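
// Illustrative sketch (not part of the original header): the pf argument is
// token-pasted onto the intrinsic name, so depending on which branch above was
// taken the macros expand to the aligned or the unaligned SSE2 intrinsics.
// \code
//   __m128d t = VNL_SSE_HEAP_LOAD(pd)(p); // -> _mm_load_pd(p)    or _mm_loadu_pd(p)
//   VNL_SSE_HEAP_STORE(pd)(p, t);         // -> _mm_store_pd(p,t) or _mm_storeu_pd(p,t)
// \endcode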

//: Custom memory allocation function to force 16-byte alignment of data
VNL_SSE_FORCE_INLINE void* vnl_sse_alloc(std::size_t n, unsigned size)
{
  return VNL_SSE_ALLOC(n,size,16);
}

//: Custom memory deallocation function to free 16-byte aligned data
VNL_SSE_FORCE_INLINE void vnl_sse_dealloc(void* mem, std::size_t n, unsigned size)
{
  // Variables n and size are not used in all versions of the VNL_SSE_FREE macro.
  // Cast to void here to avoid unused variable warnings.
  (void)n, (void)size;
  VNL_SSE_FREE(mem,n,size);
}
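
// Example usage (an illustrative sketch, not code shipped with the library):
// allocate a 16-byte aligned buffer of n doubles and release it through the
// matching deallocation call, so that the correct VNL_SSE_FREE variant runs.
// \code
//   std::size_t n = 100;
//   double* buf = static_cast<double*>(vnl_sse_alloc(n, sizeof(double)));
//   // ... fill buf and run the SSE routines on it ...
//   vnl_sse_dealloc(buf, n, sizeof(double));
// \endcode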

// avoid inlining when debugging
#ifndef NDEBUG
#undef VNL_SSE_FORCE_INLINE
#define VNL_SSE_FORCE_INLINE
#endif

#if VNL_CONFIG_ENABLE_SSE2
class VNL_EXPORT vnl_sse_supplement
{
public:
  // SSE2 does not have a native _mm_min_epi32 or _mm_max_epi32 (SSE4.1
  // provides these), so the following are SSE2 substitutes, based on the
  // SSEPlus library.
  static VNL_SSE_FORCE_INLINE __m128i vnl_mm_min_epi32(__m128i a, __m128i b)
  {
    __m128i mask = _mm_cmplt_epi32(a, b);
    a = _mm_and_si128(a, mask);
    b = _mm_andnot_si128(mask, b);
    a = _mm_or_si128(a, b);
    return a;
  }

  static VNL_SSE_FORCE_INLINE __m128i vnl_mm_max_epi32(__m128i a, __m128i b)
  {
    __m128i mask = _mm_cmpgt_epi32(a, b);
    a = _mm_and_si128(a, mask);
    b = _mm_andnot_si128(mask, b);
    a = _mm_or_si128(a, b);
    return a;
  }
};
// If SSE4.1 is available, these can be replaced by their native
// implementations.
#define VNL_MM_MIN_EPI32 vnl_sse_supplement::vnl_mm_min_epi32
#define VNL_MM_MAX_EPI32 vnl_sse_supplement::vnl_mm_max_epi32
#endif // VNL_CONFIG_ENABLE_SSE2
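
// Illustrative sketch of the SSE2 substitute in use (the values below are an
// assumption for the example, not taken from the library); only available
// when VNL_CONFIG_ENABLE_SSE2 is set.
// \code
//   __m128i a  = _mm_set_epi32(4, 3, 2, 1);
//   __m128i b  = _mm_set_epi32(1, 2, 3, 4);
//   __m128i lo = VNL_MM_MIN_EPI32(a, b); // per-lane minima: (1, 2, 2, 1)
// \endcode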

//: Bog standard (no SSE) implementation for non-SSE-enabled hardware and any type which doesn't have a template specialisation.
template <class T>
class VNL_EXPORT vnl_sse
{
 public:
  static VNL_SSE_FORCE_INLINE void element_product(const T* x, const T* y, T* r, unsigned n)
  {
    for (unsigned i = 0; i < n; ++i)
      r[i] = x[i] * y[i];
  }

  static VNL_SSE_FORCE_INLINE T dot_product(const T* x, const T* y, unsigned n)
  {
    T sum(0);
    for (unsigned i = 0; i < n; ++i)
      sum += x[i] * y[i];
    return sum;
  }

  static VNL_SSE_FORCE_INLINE T euclid_dist_sq(const T* x, const T* y, unsigned n)
  {
    // IMS: Unable to optimise this any further for MSVC compiler
    T sum(0);
    --x;
    --y;
    while (n!=0)
    {
      const T diff = x[n] - y[n];
      sum += diff*diff;
      --n;
    }
    return sum;
  }

  static VNL_SSE_FORCE_INLINE void vector_x_matrix(const T* v, const T* m, T* r, unsigned rows, unsigned cols)
  {
    for (unsigned int j=0; j<cols; ++j) {
      T som(0);
      for (unsigned int i=0; i<rows; ++i)
        som += (m+i*cols)[j] * v[i];
      r[j] = som;
    }
  }

  static VNL_SSE_FORCE_INLINE void matrix_x_vector(const T* m, const T* v, T* r, unsigned rows, unsigned cols)
  {
    for (unsigned int i=0; i<rows; ++i) {
      T som(0);
      for (unsigned int j=0; j<cols; ++j)
        som += (m+i*cols)[j] * v[j];
      r[i] = som;
    }
  }

  static VNL_SSE_FORCE_INLINE T sum(const T* v, unsigned n)
  {
    T tot(0);
    for (unsigned i = 0; i < n; ++i)
      tot += *v++;
    return tot;
  }

  static VNL_SSE_FORCE_INLINE T max(const T* v, unsigned n)
  {
    if (n==0) return T(0); // the maximum of an empty set is undefined
    T tmp = *v;
    while (--n > 0)
      if (*++v > tmp)
        tmp = *v;
    return tmp;
  }

  static VNL_SSE_FORCE_INLINE T min(const T* v, unsigned n)
  {
    if (n==0) return T(0); // the minimum of an empty set is undefined
    T tmp = *v;
    while (--n > 0)
      if (*++v < tmp)
        tmp = *v;
    return tmp;
  }

  static VNL_SSE_FORCE_INLINE unsigned arg_max(const T* v, unsigned n)
  {
    if (n==0) return unsigned(-1); // the maximum of an empty set is undefined
    T tmp = *v;
    unsigned idx = 0;
    for (unsigned i=1; i<n; ++i)
      if (*++v > tmp)
        tmp = *v, idx = i;
    return idx;
  }

  static VNL_SSE_FORCE_INLINE unsigned arg_min(const T* v, unsigned n)
  {
    if (n==0) return unsigned(-1); // the minimum of an empty set is undefined
    T tmp = *v;
    unsigned idx = 0;
    for (unsigned i=1; i<n; ++i)
      if (*++v < tmp)
        tmp = *v, idx = i;
    return idx;
  }
};
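
// Example usage (an illustrative sketch): any type without a specialisation
// below, such as int, falls back to these scalar loops.
// \code
//   int data[5] = { 3, 1, 4, 1, 5 };
//   int total   = vnl_sse<int>::sum(data, 5);     // 14
//   unsigned at = vnl_sse<int>::arg_max(data, 5); // 4
// \endcode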

#if VNL_CONFIG_ENABLE_SSE2

//: SSE2 implementation for double precision floating point (64 bit)
template <>
class VNL_EXPORT vnl_sse<double>
{
 public:
  static VNL_SSE_FORCE_INLINE void element_product(const double* x, const double* y, double* r, unsigned n)
  {
    switch (n%4)
    {
      // do scalar (single value) load, multiply and store for end elements
      case 3: --n; _mm_store_sd(r+n,_mm_mul_sd(_mm_load_sd(x+n),_mm_load_sd(y+n)));
      case 2: --n; _mm_store_sd(r+n,_mm_mul_sd(_mm_load_sd(x+n),_mm_load_sd(y+n)));
      case 1: --n; _mm_store_sd(r+n,_mm_mul_sd(_mm_load_sd(x+n),_mm_load_sd(y+n)));
      case 0: ;
    }

    // load, multiply and store two doubles at a time
    // loop unroll to handle 4
    if (std::ptrdiff_t(x)%16 || std::ptrdiff_t(y)%16 || std::ptrdiff_t(r)%16)
      // unaligned case
      for (int i = n-4; i >= 0; i-=4)
      {
        _mm_storeu_pd(r+i,_mm_mul_pd(_mm_loadu_pd(x+i),_mm_loadu_pd(y+i)));
        _mm_storeu_pd(r+i+2,_mm_mul_pd(_mm_loadu_pd(x+i+2),_mm_loadu_pd(y+i+2)));
      }
    else // aligned case
      for (int i = n-4; i >= 0; i-=4)
      {
        VNL_SSE_HEAP_STORE(pd)(r+i,_mm_mul_pd(VNL_SSE_HEAP_LOAD(pd)(x+i),VNL_SSE_HEAP_LOAD(pd)(y+i)));
        VNL_SSE_HEAP_STORE(pd)(r+i+2,_mm_mul_pd(VNL_SSE_HEAP_LOAD(pd)(x+i+2),VNL_SSE_HEAP_LOAD(pd)(y+i+2)));
      }
  }

  static VNL_SSE_FORCE_INLINE double dot_product(const double* x, const double* y, unsigned n)
  {
    double ret;
    __m128d sum;
    if (n%2)
    {
      // handle single element at end of odd sized vectors
      n--; sum = _mm_mul_sd(_mm_load_sd(x+n),_mm_load_sd(y+n));
    }
    else
      sum = _mm_setzero_pd();

    if (std::ptrdiff_t(x)%16 || std::ptrdiff_t(y)%16)
      // unaligned case
      for (int i = n-2; i >= 0; i-=2)
        sum = _mm_add_pd(_mm_mul_pd(_mm_loadu_pd(x+i), _mm_loadu_pd(y+i)),sum);
    else // aligned case
      for (int i = n-2; i >= 0; i-=2)
        sum = _mm_add_pd(_mm_mul_pd(VNL_SSE_HEAP_LOAD(pd)(x+i), VNL_SSE_HEAP_LOAD(pd)(y+i)),sum);

    // sum will contain 2 accumulated values, need to add them together
    sum = _mm_add_sd(sum,_mm_unpackhi_pd(sum,_mm_setzero_pd()));
    _mm_store_sd(&ret,sum);
    return ret;
  }

  static VNL_SSE_FORCE_INLINE double euclid_dist_sq(const double* x, const double* y, unsigned n)
  {
    double ret;
    __m128d sum,a;

    if (n%2)
    {
      // handle single element at end of odd sized vectors
      n--; a = _mm_sub_sd(_mm_load_sd(x+n),_mm_load_sd(y+n));
      sum = _mm_mul_sd(a,a);
    }
    else
      sum = _mm_setzero_pd();

    if (std::ptrdiff_t(x)%16 || std::ptrdiff_t(y)%16)
      // unaligned case
      for ( int i = n-2; i >= 0; i-=2 )
      {
        a = _mm_sub_pd(_mm_loadu_pd(x+i),_mm_loadu_pd(y+i));
        sum = _mm_add_pd(_mm_mul_pd(a,a),sum);
      }
    else // aligned case
      for ( int i = n-2; i >= 0; i-=2 )
      {
        a = _mm_sub_pd(VNL_SSE_HEAP_LOAD(pd)(x+i),VNL_SSE_HEAP_LOAD(pd)(y+i));
        sum = _mm_add_pd(_mm_mul_pd(a,a),sum);
      }

    // sum will contain 2 accumulated values, need to add them together
    sum = _mm_add_sd(sum,_mm_unpackhi_pd(sum,_mm_setzero_pd()));
    _mm_store_sd(&ret,sum);
    return ret;
  }

  static VNL_SSE_FORCE_INLINE void vector_x_matrix(const double* v, const double* m, double* r, unsigned rows, unsigned cols)
  {
    __m128d accum, x,y,z,w;

    // calculate if there are any left-over rows/columns
    unsigned r_left = rows%4;
    unsigned r_nice = rows - r_left;
    unsigned c_left = cols%2;
    unsigned c_nice = cols - c_left;

    // handle 2 matrix columns at a time
    for (unsigned j = 0; j < c_nice; j+=2)
    {
      // handle 4 matrix rows at a time
      accum = _mm_setzero_pd();
      unsigned i = 0;
      while ( i < r_nice )
      {
        // load vector data so that
        // y = (v0,v1) , w = (v2,v3)
        y = VNL_SSE_HEAP_LOAD(pd)(v+i);
        w = VNL_SSE_HEAP_LOAD(pd)(v+i+2);

        _mm_prefetch((const char*)(v+i+4), _MM_HINT_NTA);

        // after shuffling
        // x = (v0, v0)
        // y = (v1, v1)
        // z = (v2, v2)
        // w = (v3, v3)
        x = _mm_shuffle_pd(y,y,_MM_SHUFFLE2(0,0));
        y = _mm_shuffle_pd(y,y,_MM_SHUFFLE2(1,1));
        z = _mm_shuffle_pd(w,w,_MM_SHUFFLE2(0,0));
        w = _mm_shuffle_pd(w,w,_MM_SHUFFLE2(1,1));

        // multiply the two matrix columns
        // i.e. x = ( v0 * m00, v0 * m01)
        //      y = ( v1 * m10, v1 * m11)
        //      z = ( v2 * m20, v2 * m21)
        //      w = ( v3 * m30, v3 * m31)
        x = _mm_mul_pd(x,_mm_loadu_pd(i++*cols+m+j));
        y = _mm_mul_pd(y,_mm_loadu_pd(i++*cols+m+j));
        z = _mm_mul_pd(z,_mm_loadu_pd(i++*cols+m+j));
        w = _mm_mul_pd(w,_mm_loadu_pd(i++*cols+m+j));

        // now sum both columns
        accum = _mm_add_pd(x,accum);
        accum = _mm_add_pd(y,accum);
        accum = _mm_add_pd(z,accum);
        accum = _mm_add_pd(w,accum);

        // accum is now ( v0 * m00 + v1 * m10 + v2 * m20 + v3 * m30,
        //                v0 * m01 + v1 * m11 + v2 * m21 + v3 * m31 )
      }

      // handle left-over rows
      switch (r_left)
      {
        case 3: accum = _mm_add_pd(_mm_mul_pd(_mm_load1_pd(v+i),_mm_loadu_pd(m+i*cols+j)), accum); i++;
        case 2: accum = _mm_add_pd(_mm_mul_pd(_mm_load1_pd(v+i),_mm_loadu_pd(m+i*cols+j)), accum); i++;
        case 1: accum = _mm_add_pd(_mm_mul_pd(_mm_load1_pd(v+i),_mm_loadu_pd(m+i*cols+j)), accum);
        case 0: ;
      }

      // store the 2 values of the result vector
      // use stream to avoid polluting the cache
      _mm_stream_pd(r+j,accum);
    }

    // handle the left over columns
    if ( c_left )
    {
      accum = _mm_setzero_pd();
      for (unsigned int i=0; i<rows; ++i)
        accum = _mm_add_sd(_mm_mul_sd(_mm_load_sd(m+i*cols+cols-1),_mm_load_sd(v+i)),accum);
      _mm_store_sd(r+cols-1, accum);
    }
  }

  static VNL_SSE_FORCE_INLINE void matrix_x_vector(const double* m, const double* v, double* r, unsigned rows, unsigned cols)
  {
    __m128d accum, x,y,mxy1,mxy2;

    // calculate if there are any left-over rows/columns
    unsigned r_left = rows%2;
    unsigned r_nice = rows - r_left;
    unsigned c_left = cols%2;
    unsigned c_nice = cols - c_left;

    // handle 2 matrix rows at a time
    for (unsigned i = 0; i < r_nice; i+=2)
    {
      // handle 2 matrix columns at a time
      accum = _mm_setzero_pd();
      const double *r1 = m+i*cols, *r2 = m+(i+1)*cols;
      unsigned j = 0;
      for (; j < c_nice; j+=2)
      {
        // load vector data so that
        // y = (v0, v1)
        y = VNL_SSE_HEAP_LOAD(pd)(v+j);

        // shuffle so that
        // x = (v0,v0)  y = (v1,v1)
        x = _mm_shuffle_pd(y,y,_MM_SHUFFLE2(0,0));
        y = _mm_shuffle_pd(y,y,_MM_SHUFFLE2(1,1));

        // load the matrix data so that
        // mxy1 = (m00,m01), mxy2 = (m10,m11)
        mxy1 = _mm_loadu_pd(r1+j);
        mxy2 = _mm_loadu_pd(r2+j);

        // unpack matrix data to achieve
        // (v0,v0) * (m00,m10)
        // (v1,v1) * (m01,m11)
        x = _mm_mul_pd(x,_mm_unpacklo_pd(mxy1,mxy2));
        y = _mm_mul_pd(y,_mm_unpackhi_pd(mxy1,mxy2));

        // now sum the results
        accum = _mm_add_pd(x,accum);
        accum = _mm_add_pd(y,accum);

        // accum is now ( v0 * m00 + v1 * m01,
        //                v0 * m10 + v1 * m11 )
      }
      // handle the left over columns
      if (c_left)
        accum = _mm_add_pd(_mm_mul_pd(_mm_load1_pd(v+j),_mm_set_pd(*(r2+j),*(r1+j))), accum);

      // store the 2 values of the result vector
      // use stream to avoid polluting the cache
      _mm_stream_pd(r+i,accum);
    }

    // handle the left over rows
    if ( r_left )
    {
      accum = _mm_setzero_pd();
      const double* p = m+(rows-1)*cols;
      for (unsigned int j=0; j<cols; ++j)
        accum = _mm_add_sd(_mm_mul_sd(_mm_load_sd(p+j),_mm_load_sd(v+j)),accum);
      _mm_store_sd(r+rows-1, accum);
    }
  }

  static VNL_SSE_FORCE_INLINE double sum(const double* x, unsigned n)
  {
    double ret;
    // decision logic for odd sized vectors
    __m128d sum = n%2 ? _mm_load_sd(x+--n) : _mm_setzero_pd();

    // sum two elements at a time, sum will contain two running totals
    for (int i = n-2; i >= 0; i-=2)
      sum = _mm_add_pd(VNL_SSE_HEAP_LOAD(pd)(x+i),sum);

    // sum will contain 2 accumulated values, need to add them together
    sum = _mm_add_sd(sum,_mm_unpackhi_pd(sum,_mm_setzero_pd()));
    _mm_store_sd(&ret,sum);
    return ret;
  }

  static VNL_SSE_FORCE_INLINE unsigned arg_max(const double* x, unsigned n)
  {
    if (n == 1)
      return 0;

    __m128d min_double = _mm_set1_pd(- DBL_MAX);
    __m128d max = min_double;
    __m128d input;
    __m128i input_idx = _mm_set_epi32(1, 1, 0, 0);
    __m128i input_idx_increment = _mm_set1_epi32(2);
    __m128i arg_max = _mm_set1_epi32(0);
    union IsMaxMask
    {
      __m128d m128d;
      __m128i m128i;
    };
    IsMaxMask is_max;

    const int n16 = (n/2) * 2;
    // handle two elements at a time, max will contain two max values
    for (int i=0; i<n16; i+=2)
    {
      input = VNL_SSE_HEAP_LOAD(pd)(x+i);
      max = _mm_max_pd(input, max);
      is_max.m128d = _mm_cmpeq_pd(max, input);
      arg_max = VNL_MM_MAX_EPI32(arg_max, _mm_and_si128(is_max.m128i, input_idx));
      input_idx = _mm_add_epi32(input_idx, input_idx_increment);
    }

    // decision logic for odd sized vectors
    if (n%2)
    {
      input_idx = _mm_set1_epi32(--n);
      input = _mm_load1_pd(x+n);
      max = _mm_max_sd(max, input);
      is_max.m128d = _mm_cmpeq_pd(max, input);
      arg_max = VNL_MM_MAX_EPI32(arg_max, _mm_and_si128(is_max.m128i, input_idx));
    }

    // need to store the index of the max value
    is_max.m128d = max;
    max = _mm_max_sd(_mm_unpackhi_pd(max, min_double), max);
    max = _mm_unpacklo_pd(max, max);
    is_max.m128d = _mm_cmpeq_pd(is_max.m128d, max);
    arg_max = _mm_and_si128(is_max.m128i, arg_max);
    arg_max = VNL_MM_MAX_EPI32(arg_max, _mm_unpackhi_epi32(arg_max, _mm_set1_epi32(0)));
    unsigned ret = _mm_cvtsi128_si32(arg_max);
    return ret;
  }

  static VNL_SSE_FORCE_INLINE unsigned arg_min(const double* x, unsigned n)
  {
    if (n == 1)
      return 0;

    __m128d max_double = _mm_set1_pd(DBL_MAX);
    __m128d min = max_double;
    __m128d input;
    __m128i input_idx = _mm_set_epi32(1, 1, 0, 0);
    __m128i input_idx_increment = _mm_set1_epi32(2);
    __m128i arg_min = _mm_set1_epi32(0);
    union IsMinMask
    {
      __m128d m128d;
      __m128i m128i;
    };
    IsMinMask is_min;

    const int n16 = (n/2) * 2;
    // handle two elements at a time, min will contain two min values
    for (int i=0; i<n16; i+=2)
    {
      input = VNL_SSE_HEAP_LOAD(pd)(x+i);
      min = _mm_min_pd(input, min);
      is_min.m128d = _mm_cmpeq_pd(min, input);
      arg_min = VNL_MM_MAX_EPI32(arg_min, _mm_and_si128(is_min.m128i, input_idx));
      input_idx = _mm_add_epi32(input_idx, input_idx_increment);
    }

    // decision logic for odd sized vectors
    if (n%2)
    {
      input_idx = _mm_set1_epi32(--n);
      input = _mm_load1_pd(x+n);
      min = _mm_min_sd(min, input);
      is_min.m128d = _mm_cmpeq_pd(min, input);
      arg_min = VNL_MM_MAX_EPI32(arg_min, _mm_and_si128(is_min.m128i, input_idx));
    }

    // need to store the index of the min value
    is_min.m128d = min;
    min = _mm_min_sd(_mm_unpackhi_pd(min, max_double), min);
    min = _mm_unpacklo_pd(min, min);
    is_min.m128d = _mm_cmpeq_pd(is_min.m128d, min);
    arg_min = _mm_and_si128(is_min.m128i, arg_min);
    arg_min = VNL_MM_MAX_EPI32(arg_min, _mm_unpackhi_epi32(arg_min, _mm_set1_epi32(0)));
    unsigned ret = _mm_cvtsi128_si32(arg_min);
    return ret;
  }

  static VNL_SSE_FORCE_INLINE double max(const double* x, unsigned n)
  {
    double ret;
    __m128d min_double = _mm_set1_pd(- DBL_MAX);
    __m128d max = min_double;

    // decision logic for odd sized vectors
    if (n%2)
      max = _mm_max_sd(max,_mm_load_sd(x+--n));

    // handle two elements at a time, max will contain two max values
    for (int i=n-2; i>=0; i-=2)
      max = _mm_max_pd(VNL_SSE_HEAP_LOAD(pd)(x+i), max);

    // need to compare max's two values
    max = _mm_max_sd(max,_mm_unpackhi_pd(max,min_double));
    _mm_store_sd(&ret,max);
    return ret;
  }

  static VNL_SSE_FORCE_INLINE double min(const double* x, unsigned n)
  {
    double ret;
    __m128d max_double = _mm_set1_pd(DBL_MAX);
    __m128d min = max_double;

    // handle last element if odd sized vector
    if (n%2)
      min = _mm_min_sd(min,_mm_load_sd(x+--n));

    // handle two elements at a time, min will contain two min values
    for (int i=n-2; i>=0; i-=2)
      min = _mm_min_pd(VNL_SSE_HEAP_LOAD(pd)(x+i), min);

    // need to compare min's two values
    min = _mm_min_sd(min,_mm_unpackhi_pd(min,max_double));
    _mm_store_sd(&ret,min);
    return ret;
  }
};
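
// Example usage (an illustrative sketch): a dot product through the SSE2
// double path; buffers from vnl_sse_alloc are 16-byte aligned, so the faster
// aligned load branch is taken.
// \code
//   double* x = static_cast<double*>(vnl_sse_alloc(4, sizeof(double)));
//   double* y = static_cast<double*>(vnl_sse_alloc(4, sizeof(double)));
//   // ... fill x and y ...
//   double d = vnl_sse<double>::dot_product(x, y, 4);
//   vnl_sse_dealloc(x, 4, sizeof(double));
//   vnl_sse_dealloc(y, 4, sizeof(double));
// \endcode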

//: SSE2 implementation for single precision floating point (32 bit)
template <>
class VNL_EXPORT vnl_sse<float>
{
 public:
  static VNL_SSE_FORCE_INLINE void element_product(const float* x, const float* y, float* r, unsigned n)
  {
    switch (n%4)
    {
      // do scalar (single value) load, multiply and store for end elements
      case 3: --n; _mm_store_ss(r+n,_mm_mul_ss(_mm_load_ss(x+n),_mm_load_ss(y+n)));
      case 2: --n; _mm_store_ss(r+n,_mm_mul_ss(_mm_load_ss(x+n),_mm_load_ss(y+n)));
      case 1: --n; _mm_store_ss(r+n,_mm_mul_ss(_mm_load_ss(x+n),_mm_load_ss(y+n)));
      case 0: ;
    }

    // load, multiply and store four floats at a time
    for (int i = n-4; i >= 0; i-=4)
      VNL_SSE_HEAP_STORE(ps)(r+i,_mm_mul_ps(VNL_SSE_HEAP_LOAD(ps)(x+i),VNL_SSE_HEAP_LOAD(ps)(y+i)));
  }

  static VNL_SSE_FORCE_INLINE float dot_product(const float* x, const float* y, unsigned n)
  {
    float ret;
    __m128 sum = _mm_setzero_ps();
    switch (n%4)
    {
      // handle elements at end of vectors with sizes not divisible by 4
      case 3: n--; sum = _mm_mul_ss(_mm_load_ss(x+n), _mm_load_ss(y+n));
      case 2: n--; sum = _mm_add_ss(_mm_mul_ss(_mm_load_ss(x+n), _mm_load_ss(y+n)),sum);
      case 1: n--; sum = _mm_add_ss(_mm_mul_ss(_mm_load_ss(x+n), _mm_load_ss(y+n)),sum);
      case 0: ;
    }

    for (int i = n-4; i >= 0; i-=4)
      sum = _mm_add_ps(_mm_mul_ps(VNL_SSE_HEAP_LOAD(ps)(x+i), VNL_SSE_HEAP_LOAD(ps)(y+i)),sum);

    // sum will contain 4 accumulated values, need to add them together
    sum = _mm_add_ps(sum,_mm_movehl_ps(_mm_setzero_ps(),sum));
    sum = _mm_add_ss(sum,_mm_shuffle_ps(sum,sum,_MM_SHUFFLE(3,2,1,1)));

    _mm_store_ss(&ret,sum);
    return ret;
  }

  static VNL_SSE_FORCE_INLINE float euclid_dist_sq(const float* x, const float* y, unsigned n)
  {
    float ret;
    __m128 sum,a;
    sum = a = _mm_setzero_ps();
    switch (n%4)
    {
      // handle elements at end of vectors with sizes not divisible by 4
      case 3: --n; a = _mm_sub_ss(_mm_load_ss(x+n),_mm_load_ss(y+n));
      case 2: --n; a = _mm_shuffle_ps(_mm_sub_ss(_mm_load_ss(x+n),_mm_load_ss(y+n)), a ,_MM_SHUFFLE(1,0,0,1));
      case 1: --n; a = _mm_move_ss(a,_mm_sub_ss(_mm_load_ss(x+n),_mm_load_ss(y+n)));
              sum = _mm_mul_ps(a,a);
      case 0: ;
    }

    for ( int i = n-4; i >= 0; i-=4 )
    {
      a = _mm_sub_ps(VNL_SSE_HEAP_LOAD(ps)(x+i),VNL_SSE_HEAP_LOAD(ps)(y+i));
      sum = _mm_add_ps(_mm_mul_ps(a,a),sum);
    }

    // sum will contain 4 accumulated values, need to add them together
    sum = _mm_add_ps(sum,_mm_movehl_ps(_mm_setzero_ps(),sum));
    sum = _mm_add_ss(sum,_mm_shuffle_ps(sum,sum,_MM_SHUFFLE(3,2,1,1)));

    _mm_store_ss(&ret,sum);
    return ret;
  }

  static VNL_SSE_FORCE_INLINE void vector_x_matrix(const float* v, const float* m, float* r, unsigned rows, unsigned cols)
  {
    __m128 accum, x,y,z,w;

    // calculate if there are any left-over rows/columns
    unsigned r_left = rows%4;
    unsigned r_nice = rows - r_left;
    unsigned c_left = cols%4;
    unsigned c_nice = cols - c_left;

    // handle 4 matrix columns at a time
    for (unsigned j = 0; j < c_nice; j+=4)
    {
      // handle 4 matrix rows at a time
      accum = _mm_setzero_ps();
      unsigned i = 0;
      while ( i < r_nice )
      {
        // load vector data so that
        // w = (v0,v1,v2,v3)
        w = VNL_SSE_HEAP_LOAD(ps)(v+i);

        // after shuffling
        // x = (v0, v0, v0, v0)
        // y = (v1, v1, v1, v1)
        // z = (v2, v2, v2, v2)
        // w = (v3, v3, v3, v3)
        x = _mm_shuffle_ps(w,w,_MM_SHUFFLE(0,0,0,0));
        y = _mm_shuffle_ps(w,w,_MM_SHUFFLE(1,1,1,1));
        z = _mm_shuffle_ps(w,w,_MM_SHUFFLE(2,2,2,2));
        w = _mm_shuffle_ps(w,w,_MM_SHUFFLE(3,3,3,3));

        // multiply the four matrix columns
        // i.e. x = ( v0 * m00, v0 * m01, v0 * m02, v0 * m03)
        //      y = ( v1 * m10, v1 * m11, v1 * m12, v1 * m13)
        //      z = ( v2 * m20, v2 * m21, v2 * m22, v2 * m23)
        //      w = ( v3 * m30, v3 * m31, v3 * m32, v3 * m33)
        x = _mm_mul_ps(x,_mm_loadu_ps(m+i++*cols+j));
        y = _mm_mul_ps(y,_mm_loadu_ps(m+i++*cols+j));
        z = _mm_mul_ps(z,_mm_loadu_ps(m+i++*cols+j));
        w = _mm_mul_ps(w,_mm_loadu_ps(m+i++*cols+j));

        // now sum the four columns
        accum = _mm_add_ps(x,accum);
        accum = _mm_add_ps(y,accum);
        accum = _mm_add_ps(z,accum);
        accum = _mm_add_ps(w,accum);

        // accum is now ( v0 * m00 + v1 * m10 + v2 * m20 + v3 * m30,
        //                v0 * m01 + v1 * m11 + v2 * m21 + v3 * m31,
        //                v0 * m02 + v1 * m12 + v2 * m22 + v3 * m32,
        //                v0 * m03 + v1 * m13 + v2 * m23 + v3 * m33 )
      }

      // handle left-over rows
      switch (r_left)
      {
        case 3: accum = _mm_add_ps(_mm_mul_ps(_mm_load1_ps(v+i),_mm_loadu_ps(m+i*cols+j)), accum); i++;
        case 2: accum = _mm_add_ps(_mm_mul_ps(_mm_load1_ps(v+i),_mm_loadu_ps(m+i*cols+j)), accum); i++;
        case 1: accum = _mm_add_ps(_mm_mul_ps(_mm_load1_ps(v+i),_mm_loadu_ps(m+i*cols+j)), accum);
        case 0: ;
      }

      // store the 4 values of the result vector
      // use stream to avoid polluting the cache
      _mm_stream_ps(r+j,accum);
    }

    // handle the left over columns
    for (; c_left > 0; --c_left) {
      accum = _mm_setzero_ps();
      for (unsigned int i=0; i<rows; ++i)
        accum = _mm_add_ss(_mm_mul_ss(_mm_load_ss(m+i*cols+cols-c_left), _mm_load_ss(v+i)),accum);
      _mm_store_ss(r+cols-c_left,accum);
    }
  }

  static VNL_SSE_FORCE_INLINE void matrix_x_vector(const float* m, const float* v, float* r, unsigned rows, unsigned cols)
  {
    __m128 accum, x,y,z,w,mxy1,mxy2,mzw1,mzw2, mr1,mr2,mr3,mr4;

    // calculate if there are any left-over rows/columns
    unsigned r_left = rows%4;
    unsigned r_nice = rows - r_left;
    unsigned c_left = cols%4;
    unsigned c_nice = cols - c_left;

    // handle 4 matrix rows at a time
    for (unsigned i = 0; i < r_nice; i+=4)
    {
      // handle 4 matrix columns at a time
      accum = _mm_setzero_ps();
      const float *r1 = m+i*cols, *r2 = m+(i+1)*cols,
                  *r3 = m+(i+2)*cols, *r4 = m+(i+3)*cols;
      unsigned j = 0;
      for (; j < c_nice; j+=4)
      {
        // load vector data so that
        // w = (v0, v1, v2, v3)
        w = VNL_SSE_HEAP_LOAD(ps)(v+j);

        // after shuffling
        // x = (v0, v0, v0, v0)
        // y = (v1, v1, v1, v1)
        // z = (v2, v2, v2, v2)
        // w = (v3, v3, v3, v3)
        x = _mm_shuffle_ps(w,w,_MM_SHUFFLE(0,0,0,0));
        y = _mm_shuffle_ps(w,w,_MM_SHUFFLE(1,1,1,1));
        z = _mm_shuffle_ps(w,w,_MM_SHUFFLE(2,2,2,2));
        w = _mm_shuffle_ps(w,w,_MM_SHUFFLE(3,3,3,3));

        // load from the first two rows of the matrix
        // i.e. mr1 = (m00, m01, m02, m03)
        //      mr2 = (m10, m11, m12, m13)
        mr1 = _mm_loadu_ps(r1+j);
        mr2 = _mm_loadu_ps(r2+j);

        // unpack into xy and zw parts
        // i.e. mxy1 = (m00, m10, m01, m11)
        //      mzw1 = (m02, m12, m03, m13)
        mxy1 = _mm_unpacklo_ps(mr1,mr2);
        mzw1 = _mm_unpackhi_ps(mr1,mr2);

        // similarly for the next two rows
        mr3 = _mm_loadu_ps(r3+j);
        mr4 = _mm_loadu_ps(r4+j);

        // unpack into xy and zw parts
        // i.e. mxy2 = (m20, m30, m21, m31)
        //      mzw2 = (m22, m32, m23, m33)
        mxy2 = _mm_unpacklo_ps(mr3,mr4);
        mzw2 = _mm_unpackhi_ps(mr3,mr4);

        // move around matrix data and multiply so that
        // x = (v0,v0,v0,v0) * (m00,m10,m20,m30)
        // y = (v1,v1,v1,v1) * (m01,m11,m21,m31)
        // z = (v2,v2,v2,v2) * (m02,m12,m22,m32)
        // w = (v3,v3,v3,v3) * (m03,m13,m23,m33)
#if 1
        __m128 mx = _mm_movelh_ps(mxy1,mxy2);
        x = _mm_mul_ps(x, mx);
        __m128 my = _mm_movehl_ps(mxy2,mxy1);
        y = _mm_mul_ps(y, my);
        __m128 mz = _mm_movelh_ps(mzw1,mzw2);
        z = _mm_mul_ps(z, mz);
        __m128 mw = _mm_movehl_ps(mzw2,mzw1);
        w = _mm_mul_ps(w,mw);
#else
        x = _mm_mul_ps(x,_mm_movelh_ps(mxy1,mxy2));
        y = _mm_mul_ps(y,_mm_movehl_ps(mxy1,mxy2));
        z = _mm_mul_ps(z,_mm_movelh_ps(mzw1,mzw2));
        w = _mm_mul_ps(w,_mm_movehl_ps(mzw1,mzw2));
#endif

        // now sum the four results
        accum = _mm_add_ps(x,accum);
        accum = _mm_add_ps(y,accum);
        accum = _mm_add_ps(z,accum);
        accum = _mm_add_ps(w,accum);

        // accum is now ( v0 * m00 + v1 * m01 + v2 * m02 + v3 * m03,
        //                v0 * m10 + v1 * m11 + v2 * m12 + v3 * m13,
        //                v0 * m20 + v1 * m21 + v2 * m22 + v3 * m23,
        //                v0 * m30 + v1 * m31 + v2 * m32 + v3 * m33 )
      }

      // handle the left over columns
      switch (c_left)
      {
        case 3: accum = _mm_add_ps(_mm_mul_ps(_mm_load1_ps(v+j),_mm_set_ps(*(r4+j),*(r3+j),*(r2+j),*(r1+j))), accum); j++;
        case 2: accum = _mm_add_ps(_mm_mul_ps(_mm_load1_ps(v+j),_mm_set_ps(*(r4+j),*(r3+j),*(r2+j),*(r1+j))), accum); j++;
        case 1: accum = _mm_add_ps(_mm_mul_ps(_mm_load1_ps(v+j),_mm_set_ps(*(r4+j),*(r3+j),*(r2+j),*(r1+j))), accum);
        case 0: ;
      }
      // store the 4 values of the result vector
      // use stream to avoid polluting the cache
      _mm_stream_ps(r+i,accum);
    }

    // handle the left over rows
    for (; r_left > 0; --r_left) {
      accum = _mm_setzero_ps();
      const float* p = m+(rows-r_left)*cols;
      for (unsigned int j=0; j<cols; ++j)
        accum = _mm_add_ss(_mm_mul_ss(_mm_load_ss(p+j), _mm_load_ss(v+j)),accum);
      _mm_store_ss(r+rows-r_left,accum);
    }
  }

  static VNL_SSE_FORCE_INLINE float sum(const float* x, unsigned n)
  {
    float ret;
    __m128 sum = _mm_setzero_ps();
    switch (n%4)
    { // handle vector sizes which aren't divisible by 4
      case 3: sum = _mm_load_ss(x+--n);
      case 2: sum = _mm_shuffle_ps(_mm_load_ss(x+--n), sum ,_MM_SHUFFLE(1,0,0,1));
      case 1: sum = _mm_move_ss(sum,_mm_load_ss(x+--n));
      case 0: ;
    }

    // sum four elements at a time, sum will contain four running totals
    for (int i = n-4; i >= 0; i-=4)
      sum = _mm_add_ps(VNL_SSE_HEAP_LOAD(ps)(x+i),sum);

    // sum will contain 4 accumulated values, need to add them together
    sum = _mm_add_ps(sum,_mm_movehl_ps(_mm_setzero_ps(),sum));
    sum = _mm_add_ss(sum,_mm_shuffle_ps(sum,sum,_MM_SHUFFLE(3,2,1,1)));
    _mm_store_ss(&ret,sum);
    return ret;
  }

  static VNL_SSE_FORCE_INLINE float max(const float* x, unsigned n)
  {
    float ret;
    __m128 min_float = _mm_set1_ps(- FLT_MAX);
    __m128 max = min_float;
    switch (n%4)
    { // handle vector sizes which aren't divisible by 4
      case 3: max = _mm_load_ss(x+--n);
      case 2: max = _mm_shuffle_ps(_mm_load_ss(x+--n), max ,_MM_SHUFFLE(1,0,0,1));
      case 1: max = _mm_move_ss(max,_mm_load_ss(x+--n));
      case 0: ;
    }

    // handle four elements at a time, max will contain four max values
    for (int i = n-4; i >= 0; i-=4)
      max = _mm_max_ps(VNL_SSE_HEAP_LOAD(ps)(x+i), max);

    // need to compare max's four values
    max = _mm_max_ps(max,_mm_movehl_ps(min_float,max));
    max = _mm_max_ss(max,_mm_shuffle_ps(max,max,_MM_SHUFFLE(3,2,1,1)));
    _mm_store_ss(&ret,max);

    return ret;
  }

  static VNL_SSE_FORCE_INLINE float min(const float* x, unsigned n)
  {
    float ret;
    __m128 max_float = _mm_set1_ps(FLT_MAX);
    __m128 min = max_float;

    switch (n%4)
    { // handle vector sizes which aren't divisible by 4
      case 3: min = _mm_min_ss(min,_mm_load_ss(x+--n));
      case 2: min = _mm_min_ss(min,_mm_load_ss(x+--n));
      case 1: min = _mm_min_ss(min,_mm_load_ss(x+--n));
      case 0: ;
    }

    // handle four elements at a time, min will contain four min values
    for (int i = n-4; i >= 0; i-=4)
      min = _mm_min_ps(VNL_SSE_HEAP_LOAD(ps)(x+i), min);

    // need to compare min's four values
    min = _mm_min_ps(min,_mm_movehl_ps(max_float,min));
    min = _mm_min_ss(min,_mm_shuffle_ps(min,min,_MM_SHUFFLE(3,2,1,1)));
    _mm_store_ss(&ret,min);

    return ret;
  }

  static VNL_SSE_FORCE_INLINE unsigned arg_max(const float* x, unsigned n)
  {
    __m128 max = _mm_set1_ps(- FLT_MAX);
    __m128 input;
    __m128i input_idx = _mm_set_epi32(3, 2, 1, 0);
    __m128i input_idx_increment = _mm_set1_epi32(4);
    __m128i arg_max = _mm_set1_epi32(0);
    union IsMaxMask
    {
      __m128 m128;
      __m128i m128i;
    };
    IsMaxMask is_max;

    const int n16 = (n/4) * 4;
    // handle four elements at a time, max will contain four max values
    for (int i=0; i<n16; i+=4)
    {
      input = VNL_SSE_HEAP_LOAD(ps)(x+i);
      max = _mm_max_ps(input, max);
      is_max.m128 = _mm_cmpeq_ps(max, input);
      arg_max = VNL_MM_MAX_EPI32(arg_max, _mm_and_si128(is_max.m128i, input_idx));
      input_idx = _mm_add_epi32(input_idx, input_idx_increment);
    }

    // decision logic for vector sizes which aren't divisible by 4
    int mod = n%4;
    n = n-mod;
    input_idx_increment = _mm_set1_epi32(1);
    while (mod != 0)
    {
      input_idx = _mm_set1_epi32(n);
      input = _mm_load1_ps(x+n);
      max = _mm_max_ps(max, input);
      is_max.m128 = _mm_cmpeq_ps(max, input);
      arg_max = VNL_MM_MAX_EPI32(arg_max, _mm_and_si128(is_max.m128i, input_idx));
      --mod;
      ++n;
      input_idx = _mm_add_epi32(input_idx, input_idx_increment);
    }

    // need to store the index of the max value
    is_max.m128 = max;
    max = _mm_max_ps(max, _mm_shuffle_ps(max, max, _MM_SHUFFLE(2,3,0,1)));
    max = _mm_max_ps(max, _mm_movehl_ps(max, max));
    max = _mm_shuffle_ps(max, max, _MM_SHUFFLE(0,0,0,0));
    is_max.m128 = _mm_cmpeq_ps(is_max.m128, max);
    arg_max = _mm_and_si128(is_max.m128i, arg_max);
    arg_max = VNL_MM_MAX_EPI32(arg_max, _mm_unpackhi_epi32(arg_max, _mm_set1_epi32(0)));
    arg_max = VNL_MM_MAX_EPI32(arg_max, _mm_srli_si128(arg_max, 4));
    unsigned ret = _mm_cvtsi128_si32(arg_max);
    return ret;
  }

  static VNL_SSE_FORCE_INLINE unsigned arg_min(const float* x, unsigned n)
  {
    __m128 min = _mm_set1_ps(FLT_MAX);
    __m128 input;
    __m128i input_idx = _mm_set_epi32(3, 2, 1, 0);
    __m128i input_idx_increment = _mm_set1_epi32(4);
    __m128i arg_min = _mm_set1_epi32(0);
    union IsMinMask
    {
      __m128 m128;
      __m128i m128i;
    };
    IsMinMask is_min;

    const int n16 = (n/4) * 4;
    // handle four elements at a time, min will contain four min values
    for (int i=0; i<n16; i+=4)
    {
      input = VNL_SSE_HEAP_LOAD(ps)(x+i);
      min = _mm_min_ps(input, min);
      is_min.m128 = _mm_cmpeq_ps(min, input);
      arg_min = VNL_MM_MAX_EPI32(arg_min, _mm_and_si128(is_min.m128i, input_idx));
      input_idx = _mm_add_epi32(input_idx, input_idx_increment);
    }

    // decision logic for vector sizes which aren't divisible by 4
    int mod = n%4;
    n = n-mod;
    input_idx_increment = _mm_set1_epi32(1);
    while (mod != 0)
    {
      input_idx = _mm_set1_epi32(n);
      input = _mm_load1_ps(x+n);
      min = _mm_min_ps(min, input);
      is_min.m128 = _mm_cmpeq_ps(min, input);
      arg_min = VNL_MM_MAX_EPI32(arg_min, _mm_and_si128(is_min.m128i, input_idx));
      --mod;
      ++n;
      input_idx = _mm_add_epi32(input_idx, input_idx_increment);
    }

    // need to store the index of the min value
    is_min.m128 = min;
    min = _mm_min_ps(min, _mm_shuffle_ps(min, min, _MM_SHUFFLE(2,3,0,1)));
    min = _mm_min_ps(min, _mm_movehl_ps(min, min));
    min = _mm_shuffle_ps(min, min, _MM_SHUFFLE(0,0,0,0));
    is_min.m128 = _mm_cmpeq_ps(is_min.m128, min);
    arg_min = _mm_and_si128(is_min.m128i, arg_min);
    arg_min = VNL_MM_MAX_EPI32(arg_min, _mm_unpackhi_epi32(arg_min, _mm_set1_epi32(0)));
    arg_min = VNL_MM_MAX_EPI32(arg_min, _mm_srli_si128(arg_min, 4));
    unsigned ret = _mm_cvtsi128_si32(arg_min);
    return ret;
  }
};

#endif // VNL_CONFIG_ENABLE_SSE2

#endif // vnl_sse_h_