#ifndef vnl_sse_h_
#define vnl_sse_h_

#include <cstddef> // for std::size_t / std::ptrdiff_t
#include <cfloat>  // for DBL_MAX / FLT_MAX
#include <vcl_compiler_detection.h> // for macro decisions based on compiler type
#ifdef _MSC_VER
#  include <vcl_msvc_warnings.h>
#endif
#include <vxl_config.h>     // for VXL_HAS_EMMINTRIN_H etc.
#include <vnl/vnl_config.h> // is SSE enabled?
#include "vnl/vnl_export.h"

#if VNL_CONFIG_ENABLE_SSE2
#  if defined(__APPLE__) && (defined(__ppc__) || defined(__ppc64__))
// PowerPC Macs have no SSE2; force it off
#    undef VNL_CONFIG_ENABLE_SSE2
#    define VNL_CONFIG_ENABLE_SSE2 0
#  elif !VXL_HAS_EMMINTRIN_H
#    error "Required file emmintrin.h for SSE2 not found"
#  else
#    include <emmintrin.h> // SSE2 intrinsics
#  endif
#endif

// Try to use compiler instructions to force inlining where possible.
// The instruction for aligning stack memory is also compiler dependent.
#if defined(__GNUC__)
#  define VNL_SSE_FORCE_INLINE inline
#  define VNL_SSE_STACK_ALIGNED(x) __attribute__((aligned(x)))
#elif defined _MSC_VER
#  define VNL_SSE_FORCE_INLINE __forceinline
#  define VNL_SSE_STACK_ALIGNED(x) __declspec(align(x))
#else
#  define VNL_SSE_FORCE_INLINE inline
#  define VNL_SSE_STACK_ALIGNED(x)
#  define VNL_SSE_STACK_STORE(pf) _mm_storeu_##pf // no stack alignment so use unaligned store (slower!)
#endif

// SSE operates faster on 16 byte aligned memory addresses.
// Check which aligned allocation function is supported.
#if VNL_CONFIG_ENABLE_SSE2 && VXL_HAS_MM_MALLOC
#  define VNL_SSE_ALLOC(n, s, a) _mm_malloc((n) * (s), a)
#  define VNL_SSE_FREE(v, n, s) _mm_free(v)
#elif VNL_CONFIG_ENABLE_SSE2 && VXL_HAS_ALIGNED_MALLOC
#  include <malloc.h>
#  define VNL_SSE_ALLOC(n, s, a) _aligned_malloc((n) * (s), a)
#  define VNL_SSE_FREE(v, n, s) _aligned_free(v)
#elif VNL_CONFIG_ENABLE_SSE2 && VXL_HAS_MINGW_ALIGNED_MALLOC
#  include <malloc.h>
#  define VNL_SSE_ALLOC(n, s, a) __mingw_aligned_malloc((n) * (s), a)
#  define VNL_SSE_FREE(v, n, s) __mingw_aligned_free(v)
#elif VNL_CONFIG_ENABLE_SSE2 && VXL_HAS_POSIX_MEMALIGN
#  include <cstdlib>
#  define VNL_SSE_ALLOC(n, s, a) memalign(a, (n) * (s))
#  define VNL_SSE_FREE(v, n, s) std::free(v)
#else // SSE2 disabled or no memory alignment support: use slower unaligned intrinsics
#  define VNL_SSE_HEAP_STORE(pf) _mm_storeu_##pf
#  define VNL_SSE_HEAP_LOAD(pf) _mm_loadu_##pf
#  if VNL_CONFIG_THREAD_SAFE
#    define VNL_SSE_ALLOC(n, s, a) new char[(n) * (s)]
#    define VNL_SSE_FREE(v, n, s) (delete[] static_cast<char *>(v))
#  else
#    include <vnl/vnl_alloc.h>
#    define VNL_SSE_ALLOC(n, s, a) vnl_alloc::allocate((n) == 0 ? 8 : (n) * (s))
#    define VNL_SSE_FREE(v, n, s)                      \
      if (v)                                           \
      vnl_alloc::deallocate(v, (n) == 0 ? 8 : (n) * (s))
#  endif
#endif

// Stack memory can be aligned -> use SSE aligned store
#ifndef VNL_SSE_STACK_STORE
#  define VNL_SSE_STACK_STORE(pf) _mm_store_##pf
#endif

// Heap memory can be aligned -> use SSE aligned load & store
#ifndef VNL_SSE_HEAP_STORE
#  define VNL_SSE_HEAP_STORE(pf) _mm_store_##pf
#  define VNL_SSE_HEAP_LOAD(pf) _mm_load_##pf
#endif

//: Custom memory allocation function to force 16 byte alignment of data
VNL_SSE_FORCE_INLINE void *
vnl_sse_alloc(std::size_t n, unsigned size)
{
  return VNL_SSE_ALLOC(n, size, 16);
}

//: Custom memory deallocation function to free 16 byte aligned data
VNL_SSE_FORCE_INLINE void
vnl_sse_dealloc(void * mem, std::size_t n, unsigned size)
{
  VNL_SSE_FREE(mem, n, size);
}

// Avoid inlining when debugging
#ifdef DEBUG
#  undef VNL_SSE_FORCE_INLINE
#  define VNL_SSE_FORCE_INLINE
#endif

#if VNL_CONFIG_ENABLE_SSE2
//: SSE2 lacks the integer intrinsics _mm_min_epi32/_mm_max_epi32 (they only
// arrived with SSE4.1), so supply mask-based replacements.
class VNL_EXPORT vnl_sse_supplement
{
public:
  // Emulation of (SSE4.1) _mm_min_epi32: per lane, select a where a < b, else b.
  static VNL_SSE_FORCE_INLINE __m128i
  vnl_mm_min_epi32(__m128i a, __m128i b)
  {
    __m128i mask = _mm_cmplt_epi32(a, b); // all-ones in lanes where a < b
    a = _mm_and_si128(a, mask);           // keep a in those lanes
    b = _mm_andnot_si128(mask, b);        // keep b in the remaining lanes
    a = _mm_or_si128(a, b);               // combine the two selections
    return a;
  }
  // Emulation of (SSE4.1) _mm_max_epi32: per lane, select a where a > b, else b.
  static VNL_SSE_FORCE_INLINE __m128i
  vnl_mm_max_epi32(__m128i a, __m128i b)
  {
    __m128i mask = _mm_cmpgt_epi32(a, b); // all-ones in lanes where a > b
    a = _mm_and_si128(a, mask);
    b = _mm_andnot_si128(mask, b);
    a = _mm_or_si128(a, b);
    return a;
  }
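  // A minimal standalone sketch (hypothetical, not part of this header) of
  // how the mask trick behaves lane by lane; the demo values below are
  // assumptions chosen for illustration:
  //
  //   #include <emmintrin.h>
  //   #include <cstdio>
  //   int main()
  //   {
  //     __m128i a = _mm_set_epi32(8, -3, 5, 1);
  //     __m128i b = _mm_set_epi32(2, 7, 5, -9);
  //     __m128i m = vnl_sse_supplement::vnl_mm_min_epi32(a, b);
  //     int out[4];
  //     _mm_storeu_si128(reinterpret_cast<__m128i *>(out), m);
  //     std::printf("%d %d %d %d\n", out[0], out[1], out[2], out[3]); // -9 5 -3 2
  //     return 0;
  //   }
  //
  // Equal lanes (5 vs 5) give a false compare mask, so b's copy is kept,
  // which is still the correct minimum.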
};
#define VNL_MM_MIN_EPI32 vnl_sse_supplement::vnl_mm_min_epi32
#define VNL_MM_MAX_EPI32 vnl_sse_supplement::vnl_mm_max_epi32
#endif // VNL_CONFIG_ENABLE_SSE2

//: Bog standard (no SSE) implementation for non-SSE-enabled hardware and
// any type which doesn't have a template specialisation.
template <class T>
class VNL_EXPORT vnl_sse
{
public:
  static VNL_SSE_FORCE_INLINE void
  element_product(const T * x, const T * y, T * r, unsigned n)
  {
    for (unsigned i = 0; i < n; ++i)
      r[i] = x[i] * y[i];
  }

  static VNL_SSE_FORCE_INLINE T
  dot_product(const T * x, const T * y, unsigned n)
  {
    T sum(0);
    for (unsigned i = 0; i < n; ++i)
      sum += x[i] * y[i];
    return sum;
  }

  static VNL_SSE_FORCE_INLINE T
  euclid_dist_sq(const T * x, const T * y, unsigned n)
  {
    T sum(0);
    // decrement the pointers so that x[n], y[n] address the (1-based) n-th
    // element, then walk down to the first
    --x;
    --y;
    while (n != 0)
    {
      const T diff = x[n] - y[n];
      sum += diff * diff;
      --n;
    }
    return sum;
  }
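  // Hypothetical usage sketch (not part of this header): this generic
  // fallback is picked for any element type without a specialisation,
  // e.g. int:
  //
  //   int a[3] = { 1, 2, 3 };
  //   int b[3] = { 4, 5, 6 };
  //   int d = vnl_sse<int>::dot_product(a, b, 3); // 1*4 + 2*5 + 3*6 = 32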
  static VNL_SSE_FORCE_INLINE void
  vector_x_matrix(const T * v, const T * m, T * r, unsigned rows, unsigned cols)
  {
    for (unsigned int j = 0; j < cols; ++j)
    {
      T som(0);
      for (unsigned int i = 0; i < rows; ++i)
        som += (m + i * cols)[j] * v[i];
      r[j] = som;
    }
  }
  static VNL_SSE_FORCE_INLINE void
  matrix_x_vector(const T * m, const T * v, T * r, unsigned rows, unsigned cols)
  {
    for (unsigned int i = 0; i < rows; ++i)
    {
      T som(0);
      for (unsigned int j = 0; j < cols; ++j)
        som += (m + i * cols)[j] * v[j];
      r[i] = som;
    }
  }
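  // Illustrative sketch (hypothetical, not part of this header): m is a
  // row-major rows x cols array, so matrix_x_vector computes r = M * v and
  // vector_x_matrix computes r = v^T * M.  For example:
  //
  //   int M[6] = { 1, 2, 3,    // 2x3 matrix, row major
  //                4, 5, 6 };
  //   int v[3] = { 1, 0, 1 };
  //   int r[2];
  //   vnl_sse<int>::matrix_x_vector(M, v, r, 2, 3); // r = { 4, 10 }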
  static VNL_SSE_FORCE_INLINE T
  sum(const T * v, unsigned n)
  {
    T tot(0);
    for (unsigned i = 0; i < n; ++i)
      tot += v[i];
    return tot;
  }
  static VNL_SSE_FORCE_INLINE T
  max(const T * v, unsigned n)
  {
    if (n == 0)
      return T(0); // the maximum of an empty set is undefined
    T tmp = *v;
    for (unsigned i = 1; i < n; ++i)
      if (v[i] > tmp)
        tmp = v[i];
    return tmp;
  }

  static VNL_SSE_FORCE_INLINE T
  min(const T * v, unsigned n)
  {
    if (n == 0)
      return T(0); // the minimum of an empty set is undefined
    T tmp = *v;
    for (unsigned i = 1; i < n; ++i)
      if (v[i] < tmp)
        tmp = v[i];
    return tmp;
  }
  static VNL_SSE_FORCE_INLINE unsigned
  arg_max(const T * v, unsigned n)
  {
    if (n == 0)
      return unsigned(-1); // the maximum of an empty set is undefined
    T tmp = *v;
    unsigned idx = 0;
    for (unsigned i = 1; i < n; ++i)
      if (v[i] > tmp)
      {
        tmp = v[i];
        idx = i;
      }
    return idx;
  }

  static VNL_SSE_FORCE_INLINE unsigned
  arg_min(const T * v, unsigned n)
  {
    if (n == 0)
      return unsigned(-1); // the minimum of an empty set is undefined
    T tmp = *v;
    unsigned idx = 0;
    for (unsigned i = 1; i < n; ++i)
      if (v[i] < tmp)
      {
        tmp = v[i];
        idx = i;
      }
    return idx;
  }
};

#if VNL_CONFIG_ENABLE_SSE2

//: SSE2 implementation for double precision floating point (64 bit)
template <>
class VNL_EXPORT vnl_sse<double>
{
public:
  static VNL_SSE_FORCE_INLINE void
  element_product(const double * x, const double * y, double * r, unsigned n)
  {
    // do scalar (single value) load, multiply and store for the end elements
    switch (n % 4)
    {
      case 3: --n; _mm_store_sd(r + n, _mm_mul_sd(_mm_load_sd(x + n), _mm_load_sd(y + n)));
      case 2: --n; _mm_store_sd(r + n, _mm_mul_sd(_mm_load_sd(x + n), _mm_load_sd(y + n)));
      case 1: --n; _mm_store_sd(r + n, _mm_mul_sd(_mm_load_sd(x + n), _mm_load_sd(y + n)));
      case 0: ;
    }

    // load, multiply and store two doubles at a time, unrolled to process four
    if (std::ptrdiff_t(x) % 16 || std::ptrdiff_t(y) % 16 || std::ptrdiff_t(r) % 16)
    { // unaligned case
      for (int i = n - 4; i >= 0; i -= 4)
      {
        _mm_storeu_pd(r + i, _mm_mul_pd(_mm_loadu_pd(x + i), _mm_loadu_pd(y + i)));
        _mm_storeu_pd(r + i + 2, _mm_mul_pd(_mm_loadu_pd(x + i + 2), _mm_loadu_pd(y + i + 2)));
      }
    }
    else
    { // aligned case
      for (int i = n - 4; i >= 0; i -= 4)
      {
        VNL_SSE_HEAP_STORE(pd)(r + i, _mm_mul_pd(VNL_SSE_HEAP_LOAD(pd)(x + i), VNL_SSE_HEAP_LOAD(pd)(y + i)));
        VNL_SSE_HEAP_STORE(pd)(r + i + 2, _mm_mul_pd(VNL_SSE_HEAP_LOAD(pd)(x + i + 2), VNL_SSE_HEAP_LOAD(pd)(y + i + 2)));
      }
    }
  }
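  // The switch above relies on deliberate case fall-through: n % 4 == 3 peels
  // three trailing elements, 2 peels two, and so on, leaving n a multiple of
  // four for the vector loop.  A scalar sketch of the same idea (hypothetical,
  // for illustration only):
  //
  //   switch (n % 4)
  //   {
  //     case 3: --n; r[n] = x[n] * y[n]; // falls through
  //     case 2: --n; r[n] = x[n] * y[n]; // falls through
  //     case 1: --n; r[n] = x[n] * y[n];
  //     case 0: ;
  //   }
  //   // here n % 4 == 0, ready for 4-wide processing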
  static VNL_SSE_FORCE_INLINE double
  dot_product(const double * x, const double * y, unsigned n)
  {
    double ret;
    __m128d sum;
    if (n % 2)
    { // handle the single element at the end of an odd-sized vector
      n--;
      sum = _mm_mul_sd(_mm_load_sd(x + n), _mm_load_sd(y + n));
    }
    else
      sum = _mm_setzero_pd();

    if (std::ptrdiff_t(x) % 16 || std::ptrdiff_t(y) % 16)
    { // unaligned case
      for (int i = n - 2; i >= 0; i -= 2)
        sum = _mm_add_pd(_mm_mul_pd(_mm_loadu_pd(x + i), _mm_loadu_pd(y + i)), sum);
    }
    else
    { // aligned case
      for (int i = n - 2; i >= 0; i -= 2)
        sum = _mm_add_pd(_mm_mul_pd(VNL_SSE_HEAP_LOAD(pd)(x + i), VNL_SSE_HEAP_LOAD(pd)(y + i)), sum);
    }

    // sum holds two partial sums; add the high lane into the low lane
    sum = _mm_add_sd(sum, _mm_unpackhi_pd(sum, _mm_setzero_pd()));
    _mm_store_sd(&ret, sum);
    return ret;
  }
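  // The final _mm_add_sd/_mm_unpackhi_pd pair is a horizontal reduction:
  // if sum = (s0, s1), unpackhi with zero gives (s1, 0), and adding the low
  // lanes yields s0 + s1.  A hypothetical check of the whole routine:
  //
  //   double a[4] = { 1, 2, 3, 4 };
  //   double b[4] = { 5, 6, 7, 8 };
  //   // vnl_sse<double>::dot_product(a, b, 4) == 1*5 + 2*6 + 3*7 + 4*8 == 70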
  static VNL_SSE_FORCE_INLINE double
  euclid_dist_sq(const double * x, const double * y, unsigned n)
  {
    double ret;
    __m128d sum, a;
    if (n % 2)
    { // handle the single element at the end of an odd-sized vector
      n--;
      a = _mm_sub_sd(_mm_load_sd(x + n), _mm_load_sd(y + n));
      sum = _mm_mul_sd(a, a);
    }
    else
      sum = _mm_setzero_pd();

    if (std::ptrdiff_t(x) % 16 || std::ptrdiff_t(y) % 16)
    { // unaligned case
      for (int i = n - 2; i >= 0; i -= 2)
      {
        a = _mm_sub_pd(_mm_loadu_pd(x + i), _mm_loadu_pd(y + i));
        sum = _mm_add_pd(_mm_mul_pd(a, a), sum);
      }
    }
    else
    { // aligned case
      for (int i = n - 2; i >= 0; i -= 2)
      {
        a = _mm_sub_pd(VNL_SSE_HEAP_LOAD(pd)(x + i), VNL_SSE_HEAP_LOAD(pd)(y + i));
        sum = _mm_add_pd(_mm_mul_pd(a, a), sum);
      }
    }

    // add the two partial sums together
    sum = _mm_add_sd(sum, _mm_unpackhi_pd(sum, _mm_setzero_pd()));
    _mm_store_sd(&ret, sum);
    return ret;
  }
  static VNL_SSE_FORCE_INLINE void
  vector_x_matrix(const double * v, const double * m, double * r, unsigned rows, unsigned cols)
  {
    __m128d accum, x, y, z, w;

    // split rows and columns into the blocks that can be processed with SSE
    // and the left-over parts
    unsigned r_left = rows % 4;
    unsigned r_nice = rows - r_left;
    unsigned c_left = cols % 2;
    unsigned c_nice = cols - c_left;

    // handle 2 matrix columns at a time
    for (unsigned j = 0; j < c_nice; j += 2)
    {
      // handle 4 matrix rows at a time
      accum = _mm_setzero_pd();
      unsigned i = 0;
      while (i < r_nice)
      {
        // load 4 vector elements: y = (v0, v1), w = (v2, v3)
        y = _mm_loadu_pd(v + i);
        w = _mm_loadu_pd(v + i + 2);

        _mm_prefetch((const char *)(v + i + 4), _MM_HINT_NTA);

        // broadcast each element into its own register:
        // x = (v0, v0), y = (v1, v1), z = (v2, v2), w = (v3, v3)
        x = _mm_shuffle_pd(y, y, _MM_SHUFFLE2(0, 0));
        y = _mm_shuffle_pd(y, y, _MM_SHUFFLE2(1, 1));
        z = _mm_shuffle_pd(w, w, _MM_SHUFFLE2(0, 0));
        w = _mm_shuffle_pd(w, w, _MM_SHUFFLE2(1, 1));

        // multiply the duplicated vector elements by 2 columns of 4 rows
        x = _mm_mul_pd(x, _mm_loadu_pd(i++ * cols + m + j));
        y = _mm_mul_pd(y, _mm_loadu_pd(i++ * cols + m + j));
        z = _mm_mul_pd(z, _mm_loadu_pd(i++ * cols + m + j));
        w = _mm_mul_pd(w, _mm_loadu_pd(i++ * cols + m + j));

        // accumulate the partial products
        accum = _mm_add_pd(x, accum);
        accum = _mm_add_pd(y, accum);
        accum = _mm_add_pd(z, accum);
        accum = _mm_add_pd(w, accum);
      }

      // handle the left-over rows (< 4)
      switch (r_left)
      {
        case 3: accum = _mm_add_pd(_mm_mul_pd(_mm_load1_pd(v + i), _mm_loadu_pd(m + i * cols + j)), accum); i++;
        case 2: accum = _mm_add_pd(_mm_mul_pd(_mm_load1_pd(v + i), _mm_loadu_pd(m + i * cols + j)), accum); i++;
        case 1: accum = _mm_add_pd(_mm_mul_pd(_mm_load1_pd(v + i), _mm_loadu_pd(m + i * cols + j)), accum);
        case 0: ;
      }

      // store 2 values of the result
      _mm_stream_pd(r + j, accum);
    }

    // handle the left-over column, if any
    if (c_left)
    {
      accum = _mm_setzero_pd();
      for (unsigned int i = 0; i < rows; ++i)
        accum = _mm_add_sd(_mm_mul_sd(_mm_load_sd(m + i * cols + cols - 1), _mm_load_sd(v + i)), accum);
      _mm_store_sd(r + cols - 1, accum);
    }
  }
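  // Hypothetical usage sketch (not part of this header): r = v^T * M with M
  // stored row major.  Note that _mm_stream_pd above writes 16 bytes at a
  // time, so r should come from 16-byte-aligned storage such as
  // vnl_sse_alloc:
  //
  //   double M[6] = { 1, 2,   // 3x2 matrix, row major
  //                   3, 4,
  //                   5, 6 };
  //   double v[3] = { 1, 1, 1 };
  //   double * r = static_cast<double *>(vnl_sse_alloc(2, sizeof(double)));
  //   vnl_sse<double>::vector_x_matrix(v, M, r, 3, 2); // r = { 9, 12 }
  //   vnl_sse_dealloc(r, 2, sizeof(double));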
  static VNL_SSE_FORCE_INLINE void
  matrix_x_vector(const double * m, const double * v, double * r, unsigned rows, unsigned cols)
  {
    __m128d accum, x, y, mxy1, mxy2;

    // split rows and columns into the blocks that can be processed with SSE
    // and the left-over parts
    unsigned r_left = rows % 2;
    unsigned r_nice = rows - r_left;
    unsigned c_left = cols % 2;
    unsigned c_nice = cols - c_left;

    // handle 2 matrix rows at a time
    for (unsigned i = 0; i < r_nice; i += 2)
    {
      // handle 2 matrix columns at a time
      accum = _mm_setzero_pd();
      const double *r1 = m + i * cols, *r2 = m + (i + 1) * cols;
      unsigned j = 0;
      for (; j < c_nice; j += 2)
      {
        // load 2 vector elements: y = (v0, v1)
        y = _mm_loadu_pd(v + j);

        // broadcast each element into its own register:
        // x = (v0, v0), y = (v1, v1)
        x = _mm_shuffle_pd(y, y, _MM_SHUFFLE2(0, 0));
        y = _mm_shuffle_pd(y, y, _MM_SHUFFLE2(1, 1));

        // load 2x2 matrix elements from two consecutive rows
        mxy1 = _mm_loadu_pd(r1 + j);
        mxy2 = _mm_loadu_pd(r2 + j);

        // transpose: unpacklo yields column j, unpackhi column j+1
        x = _mm_mul_pd(x, _mm_unpacklo_pd(mxy1, mxy2));
        y = _mm_mul_pd(y, _mm_unpackhi_pd(mxy1, mxy2));

        // accumulate the partial products
        accum = _mm_add_pd(x, accum);
        accum = _mm_add_pd(y, accum);
      }

      // handle the left-over column, if any
      if (c_left)
        accum = _mm_add_pd(_mm_mul_pd(_mm_load1_pd(v + j), _mm_set_pd(*(r2 + j), *(r1 + j))), accum);

      // store 2 values of the result
      _mm_stream_pd(r + i, accum);
    }

    // handle the left-over row, if any
    if (r_left)
    {
      accum = _mm_setzero_pd();
      const double * p = m + (rows - 1) * cols;
      for (unsigned int j = 0; j < cols; ++j)
        accum = _mm_add_sd(_mm_mul_sd(_mm_load_sd(p + j), _mm_load_sd(v + j)), accum);
      _mm_store_sd(r + rows - 1, accum);
    }
  }
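  // Hypothetical usage sketch (not part of this header): r = M * v, again
  // with r taken from 16-byte-aligned storage because of _mm_stream_pd:
  //
  //   double M[4] = { 1, 2,   // 2x2 matrix, row major
  //                   3, 4 };
  //   double v[2] = { 5, 6 };
  //   double * r = static_cast<double *>(vnl_sse_alloc(2, sizeof(double)));
  //   vnl_sse<double>::matrix_x_vector(M, v, r, 2, 2); // r = { 17, 39 }
  //   vnl_sse_dealloc(r, 2, sizeof(double));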
  static VNL_SSE_FORCE_INLINE double
  sum(const double * x, unsigned n)
  {
    double ret;
    // handle the single element at the end of an odd-sized vector
    __m128d sum = n % 2 ? _mm_load_sd(x + --n) : _mm_setzero_pd();

    // accumulate two elements at a time; no need for any unrolling
    for (int i = n - 2; i >= 0; i -= 2)
      sum = _mm_add_pd(VNL_SSE_HEAP_LOAD(pd)(x + i), sum);

    // sum holds two partial sums; add the high lane into the low lane
    sum = _mm_add_sd(sum, _mm_unpackhi_pd(sum, _mm_setzero_pd()));
    _mm_store_sd(&ret, sum);
    return ret;
  }
  static VNL_SSE_FORCE_INLINE unsigned
  arg_max(const double * x, unsigned n)
  {
    if (n == 0)
      return unsigned(-1); // the maximum of an empty set is undefined
    __m128d input;
    __m128d min_double = _mm_set1_pd(-DBL_MAX);
    __m128d max = min_double;
    // each 64 bit lane tracks its index as a duplicated pair of 32 bit ints
    __m128i input_idx = _mm_set_epi32(1, 1, 0, 0);
    __m128i input_idx_increment = _mm_set1_epi32(2);
    __m128i arg_max = _mm_set1_epi32(0);
    // reinterpret the double comparison mask as integers without a move
    union
    {
      __m128d m128d;
      __m128i m128i;
    } is_max;

    // process two doubles at a time
    const int n16 = (n / 2) * 2;
    for (int i = 0; i < n16; i += 2)
    {
      input = _mm_loadu_pd(x + i);
      max = _mm_max_pd(input, max);
      is_max.m128d = _mm_cmpeq_pd(max, input);
      arg_max = VNL_MM_MAX_EPI32(arg_max, _mm_and_si128(is_max.m128i, input_idx));
      input_idx = _mm_add_epi32(input_idx, input_idx_increment);
    }

    // handle the tail element of an odd-sized vector
    if (n % 2)
    {
      input_idx = _mm_set1_epi32(--n);
      input = _mm_load1_pd(x + n);
      max = _mm_max_sd(max, input);
      is_max.m128d = _mm_cmpeq_pd(max, input);
      arg_max = VNL_MM_MAX_EPI32(arg_max, _mm_and_si128(is_max.m128i, input_idx));
    }

    // remember the per-lane maxima, then reduce them to a single value
    is_max.m128d = max;
    max = _mm_max_sd(_mm_unpackhi_pd(max, min_double), max);
    max = _mm_unpacklo_pd(max, max); // broadcast the global maximum
    // keep only the index from the lane holding the global maximum
    is_max.m128d = _mm_cmpeq_pd(is_max.m128d, max);
    arg_max = _mm_and_si128(is_max.m128i, arg_max);
    arg_max = VNL_MM_MAX_EPI32(arg_max, _mm_unpackhi_epi32(arg_max, _mm_set1_epi32(0)));
    unsigned ret = _mm_cvtsi128_si32(arg_max);
    return ret;
  }
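  // Sketch of the index-tracking idea above (hypothetical, scalar form):
  // each SIMD lane keeps its own running maximum plus the index where it
  // last saw it, with equality masks selecting which index to keep.
  //
  //   double best = -DBL_MAX; unsigned arg = 0;
  //   for (unsigned i = 0; i < n; ++i)
  //     if (x[i] >= best) { best = x[i]; arg = i; }
  //
  // The '>=' mirrors the SIMD tie-breaking here: combining indices with
  // VNL_MM_MAX_EPI32 keeps the larger index when values are equal.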
  static VNL_SSE_FORCE_INLINE unsigned
  arg_min(const double * x, unsigned n)
  {
    if (n == 0)
      return unsigned(-1); // the minimum of an empty set is undefined
    __m128d input;
    __m128d max_double = _mm_set1_pd(DBL_MAX);
    __m128d min = max_double;
    __m128i input_idx = _mm_set_epi32(1, 1, 0, 0);
    __m128i input_idx_increment = _mm_set1_epi32(2);
    __m128i arg_min = _mm_set1_epi32(0);
    union
    {
      __m128d m128d;
      __m128i m128i;
    } is_min;

    // process two doubles at a time
    const int n16 = (n / 2) * 2;
    for (int i = 0; i < n16; i += 2)
    {
      input = _mm_loadu_pd(x + i);
      min = _mm_min_pd(input, min);
      is_min.m128d = _mm_cmpeq_pd(min, input);
      // indices are still combined with max: lanes that do not hold the
      // minimum are masked to zero first
      arg_min = VNL_MM_MAX_EPI32(arg_min, _mm_and_si128(is_min.m128i, input_idx));
      input_idx = _mm_add_epi32(input_idx, input_idx_increment);
    }

    // handle the tail element of an odd-sized vector
    if (n % 2)
    {
      input_idx = _mm_set1_epi32(--n);
      input = _mm_load1_pd(x + n);
      min = _mm_min_sd(min, input);
      is_min.m128d = _mm_cmpeq_pd(min, input);
      arg_min = VNL_MM_MAX_EPI32(arg_min, _mm_and_si128(is_min.m128i, input_idx));
    }

    // remember the per-lane minima, then reduce them to a single value
    is_min.m128d = min;
    min = _mm_min_sd(_mm_unpackhi_pd(min, max_double), min);
    min = _mm_unpacklo_pd(min, min); // broadcast the global minimum
    is_min.m128d = _mm_cmpeq_pd(is_min.m128d, min);
    arg_min = _mm_and_si128(is_min.m128i, arg_min);
    arg_min = VNL_MM_MAX_EPI32(arg_min, _mm_unpackhi_epi32(arg_min, _mm_set1_epi32(0)));
    unsigned ret = _mm_cvtsi128_si32(arg_min);
    return ret;
  }
  static VNL_SSE_FORCE_INLINE double
  max(const double * x, unsigned n)
  {
    double ret;
    __m128d min_double = _mm_set1_pd(-DBL_MAX);
    __m128d max = min_double;

    // handle the tail element of an odd-sized vector
    if (n % 2)
      max = _mm_max_sd(max, _mm_load_sd(x + --n));

    // process two doubles at a time
    for (int i = n - 2; i >= 0; i -= 2)
      max = _mm_max_pd(max, VNL_SSE_HEAP_LOAD(pd)(x + i));

    // reduce the two lane maxima to one value
    max = _mm_max_sd(max, _mm_unpackhi_pd(max, min_double));
    _mm_store_sd(&ret, max);
    return ret;
  }
  static VNL_SSE_FORCE_INLINE double
  min(const double * x, unsigned n)
  {
    double ret;
    __m128d max_double = _mm_set1_pd(DBL_MAX);
    __m128d min = max_double;

    // handle the tail element of an odd-sized vector
    if (n % 2)
      min = _mm_min_sd(min, _mm_load_sd(x + --n));

    // process two doubles at a time
    for (int i = n - 2; i >= 0; i -= 2)
      min = _mm_min_pd(min, VNL_SSE_HEAP_LOAD(pd)(x + i));

    // reduce the two lane minima to one value
    min = _mm_min_sd(min, _mm_unpackhi_pd(min, max_double));
    _mm_store_sd(&ret, min);
    return ret;
  }
};

//: SSE2 implementation for single precision floating point (32 bit)
template <>
class VNL_EXPORT vnl_sse<float>
{
public:
  static VNL_SSE_FORCE_INLINE void
  element_product(const float * x, const float * y, float * r, unsigned n)
  {
    // do scalar (single value) load, multiply and store for the end elements
    switch (n % 4)
    {
      case 3: --n; _mm_store_ss(r + n, _mm_mul_ss(_mm_load_ss(x + n), _mm_load_ss(y + n)));
      case 2: --n; _mm_store_ss(r + n, _mm_mul_ss(_mm_load_ss(x + n), _mm_load_ss(y + n)));
      case 1: --n; _mm_store_ss(r + n, _mm_mul_ss(_mm_load_ss(x + n), _mm_load_ss(y + n)));
      case 0: ;
    }

    // load, multiply and store four floats at a time
    for (int i = n - 4; i >= 0; i -= 4)
      VNL_SSE_HEAP_STORE(ps)(r + i, _mm_mul_ps(VNL_SSE_HEAP_LOAD(ps)(x + i), VNL_SSE_HEAP_LOAD(ps)(y + i)));
  }
  static VNL_SSE_FORCE_INLINE float
  dot_product(const float * x, const float * y, unsigned n)
  {
    float ret;
    __m128 sum = _mm_setzero_ps();

    // handle the end elements of a vector whose size is not a multiple of 4
    switch (n % 4)
    {
      case 3: n--; sum = _mm_mul_ss(_mm_load_ss(x + n), _mm_load_ss(y + n));
      case 2: n--; sum = _mm_add_ss(_mm_mul_ss(_mm_load_ss(x + n), _mm_load_ss(y + n)), sum);
      case 1: n--; sum = _mm_add_ss(_mm_mul_ss(_mm_load_ss(x + n), _mm_load_ss(y + n)), sum);
      case 0: ;
    }

    // multiply and accumulate four floats at a time
    for (int i = n - 4; i >= 0; i -= 4)
      sum = _mm_add_ps(_mm_mul_ps(VNL_SSE_HEAP_LOAD(ps)(x + i), VNL_SSE_HEAP_LOAD(ps)(y + i)), sum);

    // sum holds four partial sums; add them together
    sum = _mm_add_ps(sum, _mm_movehl_ps(_mm_setzero_ps(), sum));
    sum = _mm_add_ss(sum, _mm_shuffle_ps(sum, sum, _MM_SHUFFLE(3, 2, 1, 1)));
    _mm_store_ss(&ret, sum);
    return ret;
  }
  static VNL_SSE_FORCE_INLINE float
  euclid_dist_sq(const float * x, const float * y, unsigned n)
  {
    float ret;
    __m128 sum, a;
    sum = a = _mm_setzero_ps();

    // handle the end elements of a vector whose size is not a multiple of 4,
    // packing their differences into separate lanes of a
    switch (n % 4)
    {
      case 3: --n; a = _mm_sub_ss(_mm_load_ss(x + n), _mm_load_ss(y + n));
      case 2: --n; a = _mm_shuffle_ps(_mm_sub_ss(_mm_load_ss(x + n), _mm_load_ss(y + n)), a, _MM_SHUFFLE(1, 0, 0, 1));
      case 1: --n; a = _mm_move_ss(a, _mm_sub_ss(_mm_load_ss(x + n), _mm_load_ss(y + n)));
        sum = _mm_mul_ps(a, a);
      case 0: ;
    }

    // subtract, square and accumulate four floats at a time
    for (int i = n - 4; i >= 0; i -= 4)
    {
      a = _mm_sub_ps(VNL_SSE_HEAP_LOAD(ps)(x + i), VNL_SSE_HEAP_LOAD(ps)(y + i));
      sum = _mm_add_ps(_mm_mul_ps(a, a), sum);
    }

    // sum holds four partial sums; add them together
    sum = _mm_add_ps(sum, _mm_movehl_ps(_mm_setzero_ps(), sum));
    sum = _mm_add_ss(sum, _mm_shuffle_ps(sum, sum, _MM_SHUFFLE(3, 2, 1, 1)));
    _mm_store_ss(&ret, sum);
    return ret;
  }
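  // Lane bookkeeping for the switch above (hypothetical walk-through, lanes
  // listed low to high): entering at case 3 with differences d2, d1, d0
  // computed in that order,
  //
  //   case 3: a = (d2, 0,  0,  0)   // _mm_sub_ss zeroes the upper lanes
  //   case 2: a = (0,  d1, d2, 0)   // shuffle shifts lanes up, inserts d1
  //   case 1: a = (d0, d1, d2, 0)   // _mm_move_ss replaces lane 0
  //
  // so a single _mm_mul_ps squares all the packed differences at once; the
  // zero lane contributes nothing to the sum.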
  static VNL_SSE_FORCE_INLINE void
  vector_x_matrix(const float * v, const float * m, float * r, unsigned rows, unsigned cols)
  {
    __m128 accum, x, y, z, w;

    // split rows and columns into the blocks that can be processed with SSE
    // and the (up to 3) left-over parts
    unsigned r_left = rows % 4;
    unsigned r_nice = rows - r_left;
    unsigned c_left = cols % 4;
    unsigned c_nice = cols - c_left;

    // handle 4 matrix columns at a time
    for (unsigned j = 0; j < c_nice; j += 4)
    {
      // handle 4 matrix rows at a time
      accum = _mm_setzero_ps();
      unsigned i = 0;
      while (i < r_nice)
      {
        // load 4 vector elements: w = (v0, v1, v2, v3)
        w = _mm_loadu_ps(v + i);

        // broadcast each element into its own register:
        // x = (v0, v0, v0, v0) ... w = (v3, v3, v3, v3)
        x = _mm_shuffle_ps(w, w, _MM_SHUFFLE(0, 0, 0, 0));
        y = _mm_shuffle_ps(w, w, _MM_SHUFFLE(1, 1, 1, 1));
        z = _mm_shuffle_ps(w, w, _MM_SHUFFLE(2, 2, 2, 2));
        w = _mm_shuffle_ps(w, w, _MM_SHUFFLE(3, 3, 3, 3));

        // multiply the duplicated vector elements by 4 columns of 4 rows
        x = _mm_mul_ps(x, _mm_loadu_ps(m + i++ * cols + j));
        y = _mm_mul_ps(y, _mm_loadu_ps(m + i++ * cols + j));
        z = _mm_mul_ps(z, _mm_loadu_ps(m + i++ * cols + j));
        w = _mm_mul_ps(w, _mm_loadu_ps(m + i++ * cols + j));

        // accumulate the partial products
        accum = _mm_add_ps(x, accum);
        accum = _mm_add_ps(y, accum);
        accum = _mm_add_ps(z, accum);
        accum = _mm_add_ps(w, accum);
      }

      // handle the left-over rows (< 4)
      switch (r_left)
      {
        case 3: accum = _mm_add_ps(_mm_mul_ps(_mm_load1_ps(v + i), _mm_loadu_ps(m + i * cols + j)), accum); i++;
        case 2: accum = _mm_add_ps(_mm_mul_ps(_mm_load1_ps(v + i), _mm_loadu_ps(m + i * cols + j)), accum); i++;
        case 1: accum = _mm_add_ps(_mm_mul_ps(_mm_load1_ps(v + i), _mm_loadu_ps(m + i * cols + j)), accum);
        case 0: ;
      }

      // store 4 values of the result
      _mm_stream_ps(r + j, accum);
    }

    // handle the left-over columns (< 4), one at a time
    for (; c_left > 0; --c_left)
    {
      accum = _mm_setzero_ps();
      for (unsigned int i = 0; i < rows; ++i)
        accum = _mm_add_ss(_mm_mul_ss(_mm_load_ss(m + i * cols + cols - c_left), _mm_load_ss(v + i)), accum);
      _mm_store_ss(r + cols - c_left, accum);
    }
  }
  static VNL_SSE_FORCE_INLINE void
  matrix_x_vector(const float * m, const float * v, float * r, unsigned rows, unsigned cols)
  {
    __m128 accum, x, y, z, w, mxy1, mxy2, mzw1, mzw2, mr1, mr2, mr3, mr4;

    // split rows and columns into the blocks that can be processed with SSE
    // and the (up to 3) left-over parts
    unsigned r_left = rows % 4;
    unsigned r_nice = rows - r_left;
    unsigned c_left = cols % 4;
    unsigned c_nice = cols - c_left;

    // handle 4 matrix rows at a time
    for (unsigned i = 0; i < r_nice; i += 4)
    {
      // handle 4 matrix columns at a time
      accum = _mm_setzero_ps();
      const float *r1 = m + i * cols, *r2 = m + (i + 1) * cols,
                  *r3 = m + (i + 2) * cols, *r4 = m + (i + 3) * cols;
      unsigned j = 0;
      for (; j < c_nice; j += 4)
      {
        // load 4 vector elements: w = (v0, v1, v2, v3)
        w = _mm_loadu_ps(v + j);

        // broadcast each element into its own register
        x = _mm_shuffle_ps(w, w, _MM_SHUFFLE(0, 0, 0, 0));
        y = _mm_shuffle_ps(w, w, _MM_SHUFFLE(1, 1, 1, 1));
        z = _mm_shuffle_ps(w, w, _MM_SHUFFLE(2, 2, 2, 2));
        w = _mm_shuffle_ps(w, w, _MM_SHUFFLE(3, 3, 3, 3));

        // load a 4x4 block from four consecutive rows
        mr1 = _mm_loadu_ps(r1 + j);
        mr2 = _mm_loadu_ps(r2 + j);

        // interleave rows 1 and 2
        mxy1 = _mm_unpacklo_ps(mr1, mr2);
        mzw1 = _mm_unpackhi_ps(mr1, mr2);

        mr3 = _mm_loadu_ps(r3 + j);
        mr4 = _mm_loadu_ps(r4 + j);

        // interleave rows 3 and 4
        mxy2 = _mm_unpacklo_ps(mr3, mr4);
        mzw2 = _mm_unpackhi_ps(mr3, mr4);

        // complete the 4x4 transpose: each m* register now holds one matrix
        // column, ready to multiply by the matching broadcast vector element
        // (note the movehl argument order, which keeps the rows in sequence)
        __m128 mx = _mm_movelh_ps(mxy1, mxy2);
        x = _mm_mul_ps(x, mx);
        __m128 my = _mm_movehl_ps(mxy2, mxy1);
        y = _mm_mul_ps(y, my);
        __m128 mz = _mm_movelh_ps(mzw1, mzw2);
        z = _mm_mul_ps(z, mz);
        __m128 mw = _mm_movehl_ps(mzw2, mzw1);
        w = _mm_mul_ps(w, mw);

        // accumulate the partial products
        accum = _mm_add_ps(x, accum);
        accum = _mm_add_ps(y, accum);
        accum = _mm_add_ps(z, accum);
        accum = _mm_add_ps(w, accum);
      }

      // handle the left-over columns (< 4)
      switch (c_left)
      {
        case 3: accum = _mm_add_ps(_mm_mul_ps(_mm_load1_ps(v + j), _mm_set_ps(*(r4 + j), *(r3 + j), *(r2 + j), *(r1 + j))), accum); j++;
        case 2: accum = _mm_add_ps(_mm_mul_ps(_mm_load1_ps(v + j), _mm_set_ps(*(r4 + j), *(r3 + j), *(r2 + j), *(r1 + j))), accum); j++;
        case 1: accum = _mm_add_ps(_mm_mul_ps(_mm_load1_ps(v + j), _mm_set_ps(*(r4 + j), *(r3 + j), *(r2 + j), *(r1 + j))), accum);
        case 0: ;
      }

      // store 4 values of the result
      _mm_stream_ps(r + i, accum);
    }

    // handle the left-over rows (< 4), one at a time
    for (; r_left > 0; --r_left)
    {
      accum = _mm_setzero_ps();
      const float * p = m + (rows - r_left) * cols;
      for (unsigned int j = 0; j < cols; ++j)
        accum = _mm_add_ss(_mm_mul_ss(_mm_load_ss(p + j), _mm_load_ss(v + j)), accum);
      _mm_store_ss(r + rows - r_left, accum);
    }
  }
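  // The unpack/movelh/movehl sequence above is a 4x4 in-register transpose.
  // A worked sketch (hypothetical values, lanes listed low to high):
  //
  //   mr1 = (a0, a1, a2, a3)    mr2 = (b0, b1, b2, b3)
  //   mr3 = (c0, c1, c2, c3)    mr4 = (d0, d1, d2, d3)
  //   mxy1 = unpacklo(mr1, mr2) = (a0, b0, a1, b1)
  //   mxy2 = unpacklo(mr3, mr4) = (c0, d0, c1, d1)
  //   movelh(mxy1, mxy2) = (a0, b0, c0, d0)   <- column 0
  //   movehl(mxy2, mxy1) = (a1, b1, c1, d1)   <- column 1 (argument order matters)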
  static VNL_SSE_FORCE_INLINE float
  sum(const float * x, unsigned n)
  {
    float ret;
    __m128 sum = _mm_setzero_ps();

    // handle the end elements of a vector whose size is not a multiple of 4,
    // packing them into separate lanes of sum
    switch (n % 4)
    {
      case 3: sum = _mm_load_ss(x + --n);
      case 2: sum = _mm_shuffle_ps(_mm_load_ss(x + --n), sum, _MM_SHUFFLE(1, 0, 0, 1));
      case 1: sum = _mm_move_ss(sum, _mm_load_ss(x + --n));
      case 0: ;
    }

    // accumulate four floats at a time
    for (int i = n - 4; i >= 0; i -= 4)
      sum = _mm_add_ps(sum, VNL_SSE_HEAP_LOAD(ps)(x + i));

    // sum holds four partial sums; add them together
    sum = _mm_add_ps(sum, _mm_movehl_ps(_mm_setzero_ps(), sum));
    sum = _mm_add_ss(sum, _mm_shuffle_ps(sum, sum, _MM_SHUFFLE(3, 2, 1, 1)));
    _mm_store_ss(&ret, sum);
    return ret;
  }
  static VNL_SSE_FORCE_INLINE float
  max(const float * x, unsigned n)
  {
    float ret;
    __m128 min_float = _mm_set1_ps(-FLT_MAX);
    __m128 max = min_float;

    // handle the end elements, packing them into separate lanes of max;
    // lanes that receive no element must stay at -FLT_MAX (not the zeroes
    // _mm_load_ss would leave) so they cannot win the maximum
    switch (n % 4)
    {
      case 3: max = _mm_move_ss(min_float, _mm_load_ss(x + --n));
      case 2: max = _mm_shuffle_ps(_mm_load_ss(x + --n), max, _MM_SHUFFLE(1, 0, 0, 1));
      case 1: max = _mm_move_ss(max, _mm_load_ss(x + --n));
      case 0: ;
    }

    // process four floats at a time
    for (int i = n - 4; i >= 0; i -= 4)
      max = _mm_max_ps(max, VNL_SSE_HEAP_LOAD(ps)(x + i));

    // reduce the four lane maxima to one value
    max = _mm_max_ps(max, _mm_movehl_ps(min_float, max));
    max = _mm_max_ss(max, _mm_shuffle_ps(max, max, _MM_SHUFFLE(3, 2, 1, 1)));
    _mm_store_ss(&ret, max);
    return ret;
  }
  static VNL_SSE_FORCE_INLINE float
  min(const float * x, unsigned n)
  {
    float ret;
    __m128 max_float = _mm_set1_ps(FLT_MAX);
    __m128 min = max_float;

    // handle the end elements in lane 0, one at a time
    switch (n % 4)
    {
      case 3: min = _mm_min_ss(min, _mm_load_ss(x + --n));
      case 2: min = _mm_min_ss(min, _mm_load_ss(x + --n));
      case 1: min = _mm_min_ss(min, _mm_load_ss(x + --n));
      case 0: ;
    }

    // process four floats at a time
    for (int i = n - 4; i >= 0; i -= 4)
      min = _mm_min_ps(min, VNL_SSE_HEAP_LOAD(ps)(x + i));

    // reduce the four lane minima to one value
    min = _mm_min_ps(min, _mm_movehl_ps(max_float, min));
    min = _mm_min_ss(min, _mm_shuffle_ps(min, min, _MM_SHUFFLE(3, 2, 1, 1)));
    _mm_store_ss(&ret, min);
    return ret;
  }
  static VNL_SSE_FORCE_INLINE unsigned
  arg_max(const float * x, unsigned n)
  {
    if (n == 0)
      return unsigned(-1); // the maximum of an empty set is undefined
    __m128 input;
    __m128 max = _mm_set1_ps(-FLT_MAX);
    // each 32 bit lane tracks its own index
    __m128i input_idx = _mm_set_epi32(3, 2, 1, 0);
    __m128i input_idx_increment = _mm_set1_epi32(4);
    __m128i arg_max = _mm_set1_epi32(0);
    // reinterpret the float comparison mask as integers without a move
    union
    {
      __m128 m128;
      __m128i m128i;
    } is_max;

    // process four floats at a time
    const int n16 = (n / 4) * 4;
    for (int i = 0; i < n16; i += 4)
    {
      input = _mm_loadu_ps(x + i);
      max = _mm_max_ps(input, max);
      is_max.m128 = _mm_cmpeq_ps(max, input);
      arg_max = VNL_MM_MAX_EPI32(arg_max, _mm_and_si128(is_max.m128i, input_idx));
      input_idx = _mm_add_epi32(input_idx, input_idx_increment);
    }

    // handle the (up to 3) trailing elements, one at a time
    input_idx_increment = _mm_set1_epi32(1);
    input_idx = _mm_set1_epi32(n16);
    for (unsigned i = n16; i < n; ++i)
    {
      input = _mm_load1_ps(x + i);
      max = _mm_max_ps(max, input);
      is_max.m128 = _mm_cmpeq_ps(max, input);
      arg_max = VNL_MM_MAX_EPI32(arg_max, _mm_and_si128(is_max.m128i, input_idx));
      input_idx = _mm_add_epi32(input_idx, input_idx_increment);
    }

    // remember the per-lane maxima, then reduce them to a single value
    is_max.m128 = max;
    max = _mm_max_ps(max, _mm_shuffle_ps(max, max, _MM_SHUFFLE(2, 3, 0, 1)));
    max = _mm_max_ps(max, _mm_movehl_ps(max, max));
    max = _mm_shuffle_ps(max, max, _MM_SHUFFLE(0, 0, 0, 0)); // broadcast the global maximum
    // keep only the index from the lane holding the global maximum
    is_max.m128 = _mm_cmpeq_ps(is_max.m128, max);
    arg_max = _mm_and_si128(is_max.m128i, arg_max);
    arg_max = VNL_MM_MAX_EPI32(arg_max, _mm_unpackhi_epi32(arg_max, _mm_set1_epi32(0)));
    arg_max = VNL_MM_MAX_EPI32(arg_max, _mm_srli_si128(arg_max, 4));
    unsigned ret = _mm_cvtsi128_si32(arg_max);
    return ret;
  }
  static VNL_SSE_FORCE_INLINE unsigned
  arg_min(const float * x, unsigned n)
  {
    if (n == 0)
      return unsigned(-1); // the minimum of an empty set is undefined
    __m128 input;
    __m128 min = _mm_set1_ps(FLT_MAX);
    __m128i input_idx = _mm_set_epi32(3, 2, 1, 0);
    __m128i input_idx_increment = _mm_set1_epi32(4);
    __m128i arg_min = _mm_set1_epi32(0);
    union
    {
      __m128 m128;
      __m128i m128i;
    } is_min;

    // process four floats at a time
    const int n16 = (n / 4) * 4;
    for (int i = 0; i < n16; i += 4)
    {
      input = _mm_loadu_ps(x + i);
      min = _mm_min_ps(input, min);
      is_min.m128 = _mm_cmpeq_ps(min, input);
      // indices are still combined with max: lanes that do not hold the
      // minimum are masked to zero first
      arg_min = VNL_MM_MAX_EPI32(arg_min, _mm_and_si128(is_min.m128i, input_idx));
      input_idx = _mm_add_epi32(input_idx, input_idx_increment);
    }

    // handle the (up to 3) trailing elements, one at a time
    input_idx_increment = _mm_set1_epi32(1);
    input_idx = _mm_set1_epi32(n16);
    for (unsigned i = n16; i < n; ++i)
    {
      input = _mm_load1_ps(x + i);
      min = _mm_min_ps(min, input);
      is_min.m128 = _mm_cmpeq_ps(min, input);
      arg_min = VNL_MM_MAX_EPI32(arg_min, _mm_and_si128(is_min.m128i, input_idx));
      input_idx = _mm_add_epi32(input_idx, input_idx_increment);
    }

    // remember the per-lane minima, then reduce them to a single value
    is_min.m128 = min;
    min = _mm_min_ps(min, _mm_shuffle_ps(min, min, _MM_SHUFFLE(2, 3, 0, 1)));
    min = _mm_min_ps(min, _mm_movehl_ps(min, min));
    min = _mm_shuffle_ps(min, min, _MM_SHUFFLE(0, 0, 0, 0)); // broadcast the global minimum
    is_min.m128 = _mm_cmpeq_ps(is_min.m128, min);
    arg_min = _mm_and_si128(is_min.m128i, arg_min);
    arg_min = VNL_MM_MAX_EPI32(arg_min, _mm_unpackhi_epi32(arg_min, _mm_set1_epi32(0)));
    arg_min = VNL_MM_MAX_EPI32(arg_min, _mm_srli_si128(arg_min, 4));
    unsigned ret = _mm_cvtsi128_si32(arg_min);
    return ret;
  }
};

#endif // VNL_CONFIG_ENABLE_SSE2

#endif // vnl_sse_h_