293 BLI_assert(components > 0 && components <= 4);
296 float a_b, ma_b, a_mb, ma_mb;
314 const float *row1, *row2, *row3, *row4;
315 const float empty[4] = {0.0f, 0.0f, 0.0f, 0.0f};
343 row1 = buffer + (
int64_t(width) * y1c + x1c) * components;
344 row2 = buffer + (
int64_t(width) * y2c + x1c) * components;
345 row3 = buffer + (
int64_t(width) * y1c + x2c) * components;
346 row4 = buffer + (
int64_t(width) * y2c + x2c) * components;
353 if (x2 > width - 1) {
363 if (y2 > height - 1) {
373 ma_b = (1.0f - a) *
b;
374 a_mb = a * (1.0f -
b);
375 ma_mb = (1.0f - a) * (1.0f -
b);
377 if (components == 1) {
378 output[0] = ma_mb * row1[0] + a_mb * row3[0] + ma_b * row2[0] + a_b * row4[0];
380 else if (components == 2) {
381 output[0] = ma_mb * row1[0] + a_mb * row3[0] + ma_b * row2[0] + a_b * row4[0];
382 output[1] = ma_mb * row1[1] + a_mb * row3[1] + ma_b * row2[1] + a_b * row4[1];
384 else if (components == 3) {
385 output[0] = ma_mb * row1[0] + a_mb * row3[0] + ma_b * row2[0] + a_b * row4[0];
386 output[1] = ma_mb * row1[1] + a_mb * row3[1] + ma_b * row2[1] + a_b * row4[1];
387 output[2] = ma_mb * row1[2] + a_mb * row3[2] + ma_b * row2[2] + a_b * row4[2];
391 __m128 rgba1 = _mm_loadu_ps(row1);
392 __m128 rgba2 = _mm_loadu_ps(row2);
393 __m128 rgba3 = _mm_loadu_ps(row3);
394 __m128 rgba4 = _mm_loadu_ps(row4);
395 rgba1 = _mm_mul_ps(_mm_set1_ps(ma_mb), rgba1);
396 rgba2 = _mm_mul_ps(_mm_set1_ps(ma_b), rgba2);
397 rgba3 = _mm_mul_ps(_mm_set1_ps(a_mb), rgba3);
398 rgba4 = _mm_mul_ps(_mm_set1_ps(a_b), rgba4);
399 __m128 rgba13 = _mm_add_ps(rgba1, rgba3);
400 __m128 rgba24 = _mm_add_ps(rgba2, rgba4);
401 __m128 rgba = _mm_add_ps(rgba13, rgba24);
402 _mm_storeu_ps(
output, rgba);
404 output[0] = ma_mb * row1[0] + a_mb * row3[0] + ma_b * row2[0] + a_b * row4[0];
405 output[1] = ma_mb * row1[1] + a_mb * row3[1] + ma_b * row2[1] + a_b * row4[1];
406 output[2] = ma_mb * row1[2] + a_mb * row3[2] + ma_b * row2[2] + a_b * row4[2];
407 output[3] = ma_mb * row1[3] + a_mb * row3[3] + ma_b * row2[3] + a_b * row4[3];
419 __m128 uvuv = _mm_set_ps(
v, u,
v, u);
420 __m128 uvuv_floor = _mm_floor_ps(uvuv);
423 __m128i xy12 = _mm_add_epi32(_mm_cvttps_epi32(uvuv_floor), _mm_set_epi32(1, 1, 0, 0));
425 __m128i size_minus_1 = _mm_sub_epi32(_mm_set_epi32(height, width, height, width),
429 __m128i x1234, y1234, invalid_1234;
431 if constexpr (border) {
435 __m128i too_lo_xy12 = _mm_cmplt_epi32(xy12, _mm_setzero_si128());
436 __m128i too_hi_xy12 = _mm_cmplt_epi32(size_minus_1, xy12);
437 __m128i invalid_xy12 = _mm_or_si128(too_lo_xy12, too_hi_xy12);
440 x1234 = _mm_shuffle_epi32(xy12, _MM_SHUFFLE(2, 2, 0, 0));
441 y1234 = _mm_shuffle_epi32(xy12, _MM_SHUFFLE(3, 1, 3, 1));
442 invalid_1234 = _mm_or_si128(_mm_shuffle_epi32(invalid_xy12, _MM_SHUFFLE(2, 2, 0, 0)),
443 _mm_shuffle_epi32(invalid_xy12, _MM_SHUFFLE(3, 1, 3, 1)));
445 x1234 = _mm_andnot_si128(invalid_1234, x1234);
446 y1234 = _mm_andnot_si128(invalid_1234, y1234);
450 __m128i xy12_clamped = _mm_max_epi32(xy12, _mm_setzero_si128());
451 xy12_clamped = _mm_min_epi32(xy12_clamped, size_minus_1);
452 x1234 = _mm_shuffle_epi32(xy12_clamped, _MM_SHUFFLE(2, 2, 0, 0));
453 y1234 = _mm_shuffle_epi32(xy12_clamped, _MM_SHUFFLE(3, 1, 3, 1));
460 _mm_storeu_ps((
float *)xcoord, _mm_castsi128_ps(x1234));
461 _mm_storeu_ps((
float *)ycoord, _mm_castsi128_ps(y1234));
462 int sample1 = ((
const int *)buffer)[ycoord[0] *
int64_t(width) + xcoord[0]];
463 int sample2 = ((
const int *)buffer)[ycoord[1] *
int64_t(width) + xcoord[1]];
464 int sample3 = ((
const int *)buffer)[ycoord[2] *
int64_t(width) + xcoord[2]];
465 int sample4 = ((
const int *)buffer)[ycoord[3] *
int64_t(width) + xcoord[3]];
466 __m128i samples1234 = _mm_set_epi32(sample4, sample3, sample2, sample1);
467 if constexpr (border) {
469 samples1234 = _mm_andnot_si128(invalid_1234, samples1234);
474 __m128i rgba16_12 = _mm_unpacklo_epi8(samples1234, _mm_setzero_si128());
475 __m128i rgba16_34 = _mm_unpackhi_epi8(samples1234, _mm_setzero_si128());
477 __m128 rgba1 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(rgba16_12, _mm_setzero_si128()));
478 __m128 rgba2 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(rgba16_12, _mm_setzero_si128()));
479 __m128 rgba3 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(rgba16_34, _mm_setzero_si128()));
480 __m128 rgba4 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(rgba16_34, _mm_setzero_si128()));
483 __m128 abab = _mm_sub_ps(uvuv, uvuv_floor);
484 __m128 m_abab = _mm_sub_ps(_mm_set1_ps(1.0f), abab);
485 __m128 ab_mab = _mm_shuffle_ps(abab, m_abab, _MM_SHUFFLE(3, 2, 1, 0));
486 __m128 factors = _mm_mul_ps(_mm_shuffle_ps(ab_mab, ab_mab, _MM_SHUFFLE(0, 0, 2, 2)),
487 _mm_shuffle_ps(ab_mab, ab_mab, _MM_SHUFFLE(1, 3, 1, 3)));
490 rgba1 = _mm_mul_ps(_mm_shuffle_ps(factors, factors, _MM_SHUFFLE(0, 0, 0, 0)), rgba1);
491 rgba2 = _mm_mul_ps(_mm_shuffle_ps(factors, factors, _MM_SHUFFLE(1, 1, 1, 1)), rgba2);
492 rgba3 = _mm_mul_ps(_mm_shuffle_ps(factors, factors, _MM_SHUFFLE(2, 2, 2, 2)), rgba3);
493 rgba4 = _mm_mul_ps(_mm_shuffle_ps(factors, factors, _MM_SHUFFLE(3, 3, 3, 3)), rgba4);
494 __m128 rgba13 = _mm_add_ps(rgba1, rgba3);
495 __m128 rgba24 = _mm_add_ps(rgba2, rgba4);
496 __m128 rgba = _mm_add_ps(rgba13, rgba24);
497 rgba = _mm_add_ps(rgba, _mm_set1_ps(0.5f));
500 __m128i rgba32 = _mm_cvttps_epi32(rgba);
501 __m128i rgba16 = _mm_packs_epi32(rgba32, _mm_setzero_si128());
502 __m128i rgba8 = _mm_packus_epi16(rgba16, _mm_setzero_si128());
503 _mm_store_ss((
float *)&res, _mm_castsi128_ps(rgba8));
516 if (border && (x2 < 0 || x1 >= width || y2 < 0 || y1 >= height)) {
521 const uchar *row1, *row2, *row3, *row4;
522 uchar empty[4] = {0, 0, 0, 0};
523 if constexpr (border) {
524 row1 = (x1 < 0 || y1 < 0) ? empty : buffer + (
int64_t(width) * y1 + x1) * 4;
525 row2 = (x1 < 0 || y2 > height - 1) ? empty : buffer + (
int64_t(width) * y2 + x1) * 4;
526 row3 = (x2 > width - 1 || y1 < 0) ? empty : buffer + (
int64_t(width) * y1 + x2) * 4;
527 row4 = (x2 > width - 1 || y2 > height - 1) ? empty : buffer + (
int64_t(width) * y2 + x2) * 4;
534 row1 = buffer + (
int64_t(width) * y1 + x1) * 4;
535 row2 = buffer + (
int64_t(width) * y2 + x1) * 4;
536 row3 = buffer + (
int64_t(width) * y1 + x2) * 4;
537 row4 = buffer + (
int64_t(width) * y2 + x2) * 4;
543 float ma_b = (1.0f - a) *
b;
544 float a_mb = a * (1.0f -
b);
545 float ma_mb = (1.0f - a) * (1.0f -
b);
547 res.x =
uchar(ma_mb * row1[0] + a_mb * row3[0] + ma_b * row2[0] + a_b * row4[0] + 0.5f);
548 res.y =
uchar(ma_mb * row1[1] + a_mb * row3[1] + ma_b * row2[1] + a_b * row4[1] + 0.5f);
549 res.z =
uchar(ma_mb * row1[2] + a_mb * row3[2] + ma_b * row2[2] + a_b * row4[2] + 0.5f);
550 res.w =
uchar(ma_mb * row1[3] + a_mb * row3[3] + ma_b * row2[3] + a_b * row4[3] + 0.5f);
763 1.0f, 0.990965f, 0.982f, 0.973105f, 0.96428f, 0.955524f, 0.946836f,
764 0.938216f, 0.929664f, 0.921178f, 0.912759f, 0.904405f, 0.896117f, 0.887893f,
765 0.879734f, 0.871638f, 0.863605f, 0.855636f, 0.847728f, 0.839883f, 0.832098f,
766 0.824375f, 0.816712f, 0.809108f, 0.801564f, 0.794079f, 0.786653f, 0.779284f,
767 0.771974f, 0.76472f, 0.757523f, 0.750382f, 0.743297f, 0.736267f, 0.729292f,
768 0.722372f, 0.715505f, 0.708693f, 0.701933f, 0.695227f, 0.688572f, 0.68197f,
769 0.67542f, 0.66892f, 0.662471f, 0.656073f, 0.649725f, 0.643426f, 0.637176f,
770 0.630976f, 0.624824f, 0.618719f, 0.612663f, 0.606654f, 0.600691f, 0.594776f,
771 0.588906f, 0.583083f, 0.577305f, 0.571572f, 0.565883f, 0.56024f, 0.55464f,
772 0.549084f, 0.543572f, 0.538102f, 0.532676f, 0.527291f, 0.521949f, 0.516649f,
773 0.511389f, 0.506171f, 0.500994f, 0.495857f, 0.490761f, 0.485704f, 0.480687f,
774 0.475709f, 0.470769f, 0.465869f, 0.461006f, 0.456182f, 0.451395f, 0.446646f,
775 0.441934f, 0.437258f, 0.432619f, 0.428017f, 0.42345f, 0.418919f, 0.414424f,
776 0.409963f, 0.405538f, 0.401147f, 0.39679f, 0.392467f, 0.388178f, 0.383923f,
777 0.379701f, 0.375511f, 0.371355f, 0.367231f, 0.363139f, 0.359079f, 0.355051f,
778 0.351055f, 0.347089f, 0.343155f, 0.339251f, 0.335378f, 0.331535f, 0.327722f,
779 0.323939f, 0.320186f, 0.316461f, 0.312766f, 0.3091f, 0.305462f, 0.301853f,
780 0.298272f, 0.294719f, 0.291194f, 0.287696f, 0.284226f, 0.280782f, 0.277366f,
781 0.273976f, 0.270613f, 0.267276f, 0.263965f, 0.26068f, 0.257421f, 0.254187f,
782 0.250979f, 0.247795f, 0.244636f, 0.241502f, 0.238393f, 0.235308f, 0.232246f,
783 0.229209f, 0.226196f, 0.223206f, 0.220239f, 0.217296f, 0.214375f, 0.211478f,
784 0.208603f, 0.20575f, 0.20292f, 0.200112f, 0.197326f, 0.194562f, 0.191819f,
785 0.189097f, 0.186397f, 0.183718f, 0.18106f, 0.178423f, 0.175806f, 0.17321f,
786 0.170634f, 0.168078f, 0.165542f, 0.163026f, 0.16053f, 0.158053f, 0.155595f,
787 0.153157f, 0.150738f, 0.148337f, 0.145955f, 0.143592f, 0.141248f, 0.138921f,
788 0.136613f, 0.134323f, 0.132051f, 0.129797f, 0.12756f, 0.125341f, 0.123139f,
789 0.120954f, 0.118786f, 0.116635f, 0.114501f, 0.112384f, 0.110283f, 0.108199f,
790 0.106131f, 0.104079f, 0.102043f, 0.100023f, 0.0980186f, 0.09603f, 0.094057f,
791 0.0920994f, 0.0901571f, 0.08823f, 0.0863179f, 0.0844208f, 0.0825384f, 0.0806708f,
792 0.0788178f, 0.0769792f, 0.0751551f, 0.0733451f, 0.0715493f, 0.0697676f, 0.0679997f,
793 0.0662457f, 0.0645054f, 0.0627786f, 0.0610654f, 0.0593655f, 0.0576789f, 0.0560055f,
794 0.0543452f, 0.0526979f, 0.0510634f, 0.0494416f, 0.0478326f, 0.0462361f, 0.0446521f,
795 0.0430805f, 0.0415211f, 0.039974f, 0.0384389f, 0.0369158f, 0.0354046f, 0.0339052f,
796 0.0324175f, 0.0309415f, 0.029477f, 0.0280239f, 0.0265822f, 0.0251517f, 0.0237324f,
797 0.0223242f, 0.020927f, 0.0195408f, 0.0181653f, 0.0168006f, 0.0154466f, 0.0141031f,
798 0.0127701f, 0.0114476f, 0.0101354f, 0.00883339f, 0.00754159f, 0.00625989f, 0.00498819f,
799 0.00372644f, 0.00247454f, 0.00123242f, 0.0f,