|
libflame
revision_anchor
|
Functions | |
| void | bli_saxpyv2b (int n, float *alpha1, float *alpha2, float *x1, int inc_x1, float *x2, int inc_x2, float *y, int inc_y) |
| void | bli_daxpyv2b (int n, double *alpha1, double *alpha2, double *x1, int inc_x1, double *x2, int inc_x2, double *y, int inc_y) |
void bli_daxpyv2b( int     n,
                   double* alpha1,
                   double* alpha2,
                   double* x1, int inc_x1,
                   double* x2, int inc_x2,
                   double* y,  int inc_y )
References bli_abort() and v2df_t::v.
Referenced by FLA_Fused_Ahx_Axpy_Ax_opd_var1(), FLA_Fused_Gerc2_Ahx_Axpy_Ax_opd_var1(), FLA_Fused_Gerc2_opd_var1(), and FLA_Fused_UZhu_ZUhu_opd_var1().
{
    // Computes y := y + (*alpha1) * x1 + (*alpha2) * x2 over n doubles.
    //
    // Only unit strides are handled: any inc_x1, inc_x2 or inc_y other than 1
    // calls bli_abort(). The main loop processes four elements per iteration
    // using 16-byte-aligned two-double loads/stores, so one scalar element is
    // peeled off the front when y does not start on a 16-byte boundary, and
    // up to three trailing elements are finished with scalar code.
    double* restrict chi1;
    double* restrict chi2;
    double* restrict psi1;
    int i;
    int n_pre;
    int n_run;
    int n_left;
    v2df_t a1v, a2v;
    v2df_t x11v, x12v;
    v2df_t x21v, x22v;
    v2df_t y1v;
    v2df_t y2v;

    if ( inc_x1 != 1 ||
         inc_x2 != 1 ||
         inc_y  != 1 ) bli_abort();

    // Nothing to update for an empty (or negative) length. Without this
    // guard, a misaligned y would still take the one-element peel below and
    // read x1/x2 and write y even though the vectors have no elements.
    if ( n <= 0 ) return;

    n_pre = 0;
    if ( ( unsigned long ) y % 16 != 0 )
    {
        // y is off the 16-byte boundary, so one element is peeled. x1 and x2
        // must be off the boundary as well; if either is already aligned the
        // three pointers cannot be brought into alignment together.
        // (Presumably all pointers are at least 8-byte aligned - verify
        // against callers.)
        if ( ( unsigned long ) x1 % 16 == 0 ||
             ( unsigned long ) x2 % 16 == 0 ) bli_abort();
        n_pre = 1;
    }
    // NOTE(review): when y is 16-byte aligned but x1 or x2 is not, nothing
    // here rejects the input before the aligned loads in the main loop.

    n_run  = ( n - n_pre ) / 4;
    n_left = ( n - n_pre ) % 4;

    chi1 = x1;
    chi2 = x2;
    psi1 = y;

    if ( n_pre == 1 )
    {
        // Scalar peel of the first element.
        double alpha1_c = *alpha1;
        double alpha2_c = *alpha2;
        double chi11_c  = *chi1;
        double chi12_c  = *chi2;
        double temp1;

        // psi1 = psi1 + alpha1 * chi11 + alpha2 * chi12;
        temp1 = alpha1_c * chi11_c + alpha2_c * chi12_c;
        *psi1 = *psi1 + temp1;

        chi1 += inc_x1;
        chi2 += inc_x2;
        psi1 += inc_y;
    }

    // Broadcast each scalar into both lanes of a vector.
    a1v.v = _mm_loaddup_pd( ( double* )alpha1 );
    a2v.v = _mm_loaddup_pd( ( double* )alpha2 );

    // Main loop: two vectors (four doubles) of x1, x2 and y per iteration.
    for ( i = 0; i < n_run; ++i )
    {
        x11v.v = _mm_load_pd( ( double* )chi1 );
        x12v.v = _mm_load_pd( ( double* )chi2 );
        y1v.v  = _mm_load_pd( ( double* )psi1 );

        x21v.v = _mm_load_pd( ( double* )(chi1 + 2) );
        x22v.v = _mm_load_pd( ( double* )(chi2 + 2) );
        y2v.v  = _mm_load_pd( ( double* )(psi1 + 2) );

        y1v.v += a1v.v * x11v.v + a2v.v * x12v.v;
        y2v.v += a1v.v * x21v.v + a2v.v * x22v.v;

        _mm_store_pd( ( double* )psi1, y1v.v );
        _mm_store_pd( ( double* )(psi1 + 2), y2v.v );

        chi1 += 4;
        chi2 += 4;
        psi1 += 4;
    }

    // Scalar cleanup for the 0-3 elements the vector loop did not cover.
    if ( n_left > 0 )
    {
        double alpha1_c = *alpha1;
        double alpha2_c = *alpha2;

        for ( i = 0; i < n_left; ++i )
        {
            double chi11_c = *chi1;
            double chi12_c = *chi2;
            double psi1_c  = *psi1;
            double temp1;

            temp1 = alpha1_c * chi11_c + alpha2_c * chi12_c;
            *psi1 = psi1_c + temp1;

            chi1 += inc_x1;
            chi2 += inc_x2;
            psi1 += inc_y;
        }
    }
}
void bli_saxpyv2b( int    n,
                   float* alpha1,
                   float* alpha2,
                   float* x1, int inc_x1,
                   float* x2, int inc_x2,
                   float* y,  int inc_y )
References bli_abort().
{
// Single-precision variant: the body does nothing except call bli_abort(),
// so none of the arguments are read and y is never updated here.
bli_abort();
}
1.7.6.1