|
libflame
revision_anchor
|
Functions | |
| void | bli_sdotaxmyv2 (int n, float *alpha, float *beta, float *x, int inc_x, float *u, int inc_u, float *rho, float *y, int inc_y, float *z, int inc_z) |
| void | bli_ddotaxmyv2 (int n, double *alpha, double *beta, double *x, int inc_x, double *u, int inc_u, double *rho, double *y, int inc_y, double *z, int inc_z) |
| void bli_ddotaxmyv2 | ( | int | n, |
| double * | alpha, | ||
| double * | beta, | ||
| double * | x, | ||
| int | inc_x, | ||
| double * | u, | ||
| int | inc_u, | ||
| double * | rho, | ||
| double * | y, | ||
| int | inc_y, | ||
| double * | z, | ||
| int | inc_z | ||
| ) |
References bli_abort(), v2df_t::d, and v2df_t::v.
Referenced by FLA_Fused_Uhu_Yhu_Zhu_opd_var1().
{
double* restrict chi1;
double* restrict upsilon1;
double* restrict psi1;
double* restrict zeta1;
double rho_c;
int i;
int n_pre;
int n_run;
int n_left;
v2df_t a1v, b1v;
v2df_t rho1v;
v2df_t x1v, u1v, y1v, z1v;
if ( inc_x != 1 ||
inc_u != 1 ||
inc_y != 1 ||
inc_z != 1 ) bli_abort();
n_pre = 0;
if ( ( unsigned long ) z % 16 != 0 )
{
if ( ( unsigned long ) x % 16 == 0 ||
( unsigned long ) u % 16 == 0 ||
( unsigned long ) y % 16 == 0 ) bli_abort();
n_pre = 1;
}
n_run = ( n - n_pre ) / 2;
n_left = ( n - n_pre ) % 2;
chi1 = x;
upsilon1 = u;
psi1 = y;
zeta1 = z;
rho_c = 0.0;
if ( n_pre == 1 )
{
double alpha_c = *alpha;
double beta_c = *beta;
double chi1_c = *chi1;
double upsilon_c = *upsilon1;
rho_c += chi1_c * upsilon_c;
*psi1 -= alpha_c * chi1_c;
*zeta1 -= beta_c * chi1_c;
chi1 += inc_x;
upsilon1 += inc_u;
psi1 += inc_y;
zeta1 += inc_z;
}
a1v.v = _mm_loaddup_pd( ( double* )alpha );
b1v.v = _mm_loaddup_pd( ( double* )beta );
rho1v.v = _mm_setzero_pd();
for ( i = 0; i < n_run; ++i )
{
x1v.v = _mm_load_pd( ( double* )chi1 );
u1v.v = _mm_load_pd( ( double* )upsilon1 );
y1v.v = _mm_load_pd( ( double* )psi1 );
z1v.v = _mm_load_pd( ( double* )zeta1 );
rho1v.v += x1v.v * u1v.v;
y1v.v -= a1v.v * x1v.v;
z1v.v -= b1v.v * x1v.v;
_mm_store_pd( ( double* )psi1, y1v.v );
_mm_store_pd( ( double* )zeta1, z1v.v );
chi1 += 2;
upsilon1 += 2;
psi1 += 2;
zeta1 += 2;
}
rho_c += rho1v.d[0] + rho1v.d[1];
if ( n_left > 0 )
{
double alpha_c = *alpha;
double beta_c = *beta;
for( i = 0; i < n_left; ++i )
{
double chi1_c = *chi1;
double upsilon_c = *upsilon1;
rho_c += chi1_c * upsilon_c;
*psi1 -= alpha_c * chi1_c;
*zeta1 -= beta_c * chi1_c;
chi1 += inc_x;
upsilon1 += inc_u;
psi1 += inc_y;
zeta1 += inc_z;
}
}
*rho = rho_c;
}
| void bli_sdotaxmyv2 | ( | int | n, |
| float * | alpha, | ||
| float * | beta, | ||
| float * | x, | ||
| int | inc_x, | ||
| float * | u, | ||
| int | inc_u, | ||
| float * | rho, | ||
| float * | y, | ||
| int | inc_y, | ||
| float * | z, | ||
| int | inc_z | ||
| ) |
References bli_abort().
{
bli_abort();
}
1.7.6.1