|
libflame
revision_anchor
|
Functions | |
| void | bli_sdotv2axpyv2b (int n, float *a1, int inc_a1, float *a2, int inc_a2, float *x, int inc_x, float *kappa1, float *kappa2, float *rho1, float *rho2, float *w, int inc_w) |
| void | bli_ddotv2axpyv2b (int n, double *a1, int inc_a1, double *a2, int inc_a2, double *x, int inc_x, double *kappa1, double *kappa2, double *rho1, double *rho2, double *w, int inc_w) |
| void bli_ddotv2axpyv2b | ( | int | n, |
| double * | a1, | ||
| int | inc_a1, | ||
| double * | a2, | ||
| int | inc_a2, | ||
| double * | x, | ||
| int | inc_x, | ||
| double * | kappa1, | ||
| double * | kappa2, | ||
| double * | rho1, | ||
| double * | rho2, | ||
| double * | w, | ||
| int | inc_w | ||
| ) |
References bli_abort(), v2df_t::d, and v2df_t::v.
Referenced by FLA_Fused_Ahx_Ax_opd_var1() and FLA_Fused_UZhu_ZUhu_opd_var1().
{
    /*
     * Fused double-precision kernel over n unit-stride elements. In a single
     * pass over a1, a2, x and w it
     *   - accumulates the dot products  sum( a1[i] * x[i] )  and
     *     sum( a2[i] * x[i] ), stored to *rho1 and *rho2 on return;
     *   - updates  w[i] += (*kappa1) * a1[i] + (*kappa2) * a2[i].
     *
     * Restrictions enforced here:
     *   - every increment must be 1, otherwise bli_abort() is called;
     *   - if a1 is not 16-byte aligned, bli_abort() is called when any of
     *     a2, x or w IS 16-byte aligned; otherwise one leading element is
     *     handled in scalar code before the vector loop.
     *
     * For n <= 0 nothing is read from a1/a2/x/w, w is left untouched, and
     * *rho1 / *rho2 are set to 0.0.
     *
     * NOTE(review): when a1 is 16-byte aligned, a2, x and w are not checked,
     * yet the vector loop uses aligned loads/stores on them. Callers appear
     * to be responsible for matching alignment in that case -- confirm.
     */
    double* restrict alpha1;
    double* restrict alpha2;
    double* restrict chi1;
    double* restrict omega1;
    double rho1_c;
    double rho2_c;
    int i;
    int n_pre;
    int n_run;
    int n_left;
    v2df_t k1v, rho1v;
    v2df_t k2v, rho2v;
    v2df_t a11v, a12v, x1v, w1v;
    v2df_t a21v, a22v, x2v, w2v;

    /* Only contiguous (unit-stride) vectors are supported. */
    if ( inc_a1 != 1 ||
         inc_a2 != 1 ||
         inc_x  != 1 ||
         inc_w  != 1 ) bli_abort();

    /*
     * Empty problem: return before the alignment peel below. Without this
     * guard a misaligned a1 forced n_pre = 1 even for n == 0, so one element
     * of each vector was read and w[0] was written although the caller
     * asked for zero elements.
     */
    if ( n <= 0 )
    {
        *rho1 = 0.0;
        *rho2 = 0.0;
        return;
    }

    /* Decide whether one scalar iteration is peeled off the front. */
    n_pre = 0;
    if ( ( unsigned long ) a1 % 16 != 0 )
    {
        /* Abort when any of the other vectors is 16-byte aligned while a1 is not. */
        if ( ( unsigned long ) a2 % 16 == 0 ||
             ( unsigned long ) x  % 16 == 0 ||
             ( unsigned long ) w  % 16 == 0 ) bli_abort();

        n_pre = 1;
    }

    /* The vector loop consumes 4 elements per iteration; the rest is scalar. */
    n_run  = ( n - n_pre ) / 4;
    n_left = ( n - n_pre ) % 4;

    alpha1 = a1;
    alpha2 = a2;
    chi1   = x;
    omega1 = w;

    /* Scalar partial sums for the peeled and leftover elements. */
    rho1_c = 0.0;
    rho2_c = 0.0;

    if ( n_pre == 1 )
    {
        /* Peeled leading element, processed in scalar code. */
        double kappa1_c = *kappa1;
        double kappa2_c = *kappa2;
        double alpha1_c = *alpha1;
        double alpha2_c = *alpha2;
        double chi1_c   = *chi1;
        double omega1_c = *omega1;

        rho1_c   += alpha1_c * chi1_c;
        omega1_c += kappa1_c * alpha1_c;

        rho2_c   += alpha2_c * chi1_c;
        omega1_c += kappa2_c * alpha2_c;

        *omega1 = omega1_c;

        alpha1 += inc_a1;
        alpha2 += inc_a2;
        chi1   += inc_x;
        omega1 += inc_w;
    }

    /* Vector accumulators start at zero; kappa1/kappa2 are duplicated into both lanes. */
    rho1v.v = _mm_setzero_pd();
    rho2v.v = _mm_setzero_pd();

    k1v.v = _mm_loaddup_pd( ( double* )kappa1 );
    k2v.v = _mm_loaddup_pd( ( double* )kappa2 );

    for ( i = 0; i < n_run; ++i )
    {
        /* First pair of elements (offsets 0 and 1). */
        a11v.v = _mm_load_pd( ( double* )alpha1 );
        a12v.v = _mm_load_pd( ( double* )alpha2 );
        x1v.v  = _mm_load_pd( ( double* )chi1 );
        w1v.v  = _mm_load_pd( ( double* )omega1 );

        rho1v.v += a11v.v * x1v.v;
        w1v.v   += k1v.v * a11v.v;

        rho2v.v += a12v.v * x1v.v;
        w1v.v   += k2v.v * a12v.v;

        _mm_store_pd( ( double* )omega1, w1v.v );

        /* Second pair of elements (offsets 2 and 3). */
        a21v.v = _mm_load_pd( ( double* )(alpha1 + 2) );
        a22v.v = _mm_load_pd( ( double* )(alpha2 + 2) );
        x2v.v  = _mm_load_pd( ( double* )(chi1 + 2) );
        w2v.v  = _mm_load_pd( ( double* )(omega1 + 2) );

        rho1v.v += a21v.v * x2v.v;
        w2v.v   += k1v.v * a21v.v;

        rho2v.v += a22v.v * x2v.v;
        w2v.v   += k2v.v * a22v.v;

        _mm_store_pd( ( double* )(omega1 + 2), w2v.v );

        alpha1 += 4;
        alpha2 += 4;
        chi1   += 4;
        omega1 += 4;
    }

    /* Trailing 0..3 elements that did not fill a whole vector iteration. */
    if ( n_left > 0 )
    {
        for ( i = 0; i < n_left; ++i )
        {
            double kappa1_c = *kappa1;
            double kappa2_c = *kappa2;
            double alpha1_c = *alpha1;
            double alpha2_c = *alpha2;
            double chi1_c   = *chi1;
            double omega1_c = *omega1;

            rho1_c   += alpha1_c * chi1_c;
            omega1_c += kappa1_c * alpha1_c;

            rho2_c   += alpha2_c * chi1_c;
            omega1_c += kappa2_c * alpha2_c;

            *omega1 = omega1_c;

            alpha1 += inc_a1;
            alpha2 += inc_a2;
            chi1   += inc_x;
            omega1 += inc_w;
        }
    }

    /* Fold both vector lanes into the scalar partial sums and publish the results. */
    rho1_c += rho1v.d[0] + rho1v.d[1];
    rho2_c += rho2v.d[0] + rho2v.d[1];

    *rho1 = rho1_c;
    *rho2 = rho2_c;
}
| void bli_sdotv2axpyv2b | ( | int | n, |
| float * | a1, | ||
| int | inc_a1, | ||
| float * | a2, | ||
| int | inc_a2, | ||
| float * | x, | ||
| int | inc_x, | ||
| float * | kappa1, | ||
| float * | kappa2, | ||
| float * | rho1, | ||
| float * | rho2, | ||
| float * | w, | ||
| int | inc_w | ||
| ) |
References bli_abort().
{
/* The single-precision variant has no computation here: the body's only
   statement is a call to bli_abort(), and none of the arguments are read. */
bli_abort();
}
1.7.6.1