|
libflame
revision_anchor
|
Go to the source code of this file.
FLA_Error FLA_SA_Apply_pivots( FLA_Obj C, FLA_Obj E, FLA_Obj p )
References bli_cswap(), bli_dswap(), bli_sswap(), bli_zswap(), FLA_Obj_col_stride(), FLA_Obj_datatype(), FLA_Obj_has_zero_dim(), FLA_Obj_length(), and FLA_Obj_width().
Referenced by FLA_SA_FS_blk(), and FLA_SA_LU_blk().
{
  /*
   * Applies the interchanges stored in p to the pair of matrices C and E.
   *
   * For every k in [0, length(p)) whose pivot entry is non-zero, the n_C
   * elements starting at C[k] (stride cs_C) are swapped with the n_C
   * elements starting at E[ pivot - ( m_C - k ) ] (stride cs_E), where
   * m_C and n_C are the length and width of C.  A pivot entry of zero
   * means "no interchange for this row".
   *
   * Only the column strides are used, so rows are addressed as if the row
   * stride were 1 -- NOTE(review): confirm callers only pass column-major
   * objects.
   *
   * Returns FLA_SUCCESS; an object C with a zero dimension is a no-op.
   * Datatypes other than the four handled below fall through untouched.
   */
  FLA_Datatype datatype;
  int          m_C, n_C, cs_C;
  int          cs_E;
  int          m_p;
  int          k;
  int*         pivots;

  if ( FLA_Obj_has_zero_dim( C ) ) return FLA_SUCCESS;

  datatype = FLA_Obj_datatype( C );

  m_C      = FLA_Obj_length( C );
  n_C      = FLA_Obj_width( C );
  cs_C     = FLA_Obj_col_stride( C );

  cs_E     = FLA_Obj_col_stride( E );

  m_p      = FLA_Obj_length( p );

  pivots   = ( int * ) FLA_INT_PTR( p );

  switch ( datatype ){

  case FLA_FLOAT:
  {
    float* c_base = ( float * ) FLA_FLOAT_PTR( C );
    float* e_base = ( float * ) FLA_FLOAT_PTR( E );

    for ( k = 0; k < m_p; ++k )
    {
      const int piv = pivots[ k ];

      /* Zero marks a row that was not interchanged. */
      if ( piv == 0 ) continue;

      bli_sswap( n_C,
                 c_base + k,                     cs_C,
                 e_base + ( piv - ( m_C - k ) ), cs_E );
    }
    break;
  }

  case FLA_DOUBLE:
  {
    double* c_base = ( double * ) FLA_DOUBLE_PTR( C );
    double* e_base = ( double * ) FLA_DOUBLE_PTR( E );

    for ( k = 0; k < m_p; ++k )
    {
      const int piv = pivots[ k ];

      if ( piv == 0 ) continue;

      bli_dswap( n_C,
                 c_base + k,                     cs_C,
                 e_base + ( piv - ( m_C - k ) ), cs_E );
    }
    break;
  }

  case FLA_COMPLEX:
  {
    scomplex* c_base = ( scomplex * ) FLA_COMPLEX_PTR( C );
    scomplex* e_base = ( scomplex * ) FLA_COMPLEX_PTR( E );

    for ( k = 0; k < m_p; ++k )
    {
      const int piv = pivots[ k ];

      if ( piv == 0 ) continue;

      bli_cswap( n_C,
                 c_base + k,                     cs_C,
                 e_base + ( piv - ( m_C - k ) ), cs_E );
    }
    break;
  }

  case FLA_DOUBLE_COMPLEX:
  {
    dcomplex* c_base = ( dcomplex * ) FLA_DOUBLE_COMPLEX_PTR( C );
    dcomplex* e_base = ( dcomplex * ) FLA_DOUBLE_COMPLEX_PTR( E );

    for ( k = 0; k < m_p; ++k )
    {
      const int piv = pivots[ k ];

      if ( piv == 0 ) continue;

      bli_zswap( n_C,
                 c_base + k,                     cs_C,
                 e_base + ( piv - ( m_C - k ) ), cs_E );
    }
    break;
  }

  }

  return FLA_SUCCESS;
}
References FLA_Cont_with_1x3_to_1x2(), FLA_Cont_with_3x1_to_2x1(), FLA_Gemm_external(), FLA_MINUS_ONE, FLA_Obj_length(), FLA_ONE, FLA_Part_1x2(), FLA_Part_2x1(), FLA_Repart_1x2_to_1x3(), FLA_Repart_2x1_to_3x1(), FLA_SA_Apply_pivots(), and FLA_Trsm_external().
Referenced by FLA_SA_FS_task(), and FLASH_FS_incpiv_aux2().
{
  /*
   * Blocked forward-substitution step.  The signature is not visible in
   * this listing; per the cross-references this appears to be the body of
   * FLA_SA_FS_blk( L, D, p, C, E, nb_alg ).
   *
   * L, p and C are traversed top to bottom in row panels and D left to
   * right in column panels, each of size b = min( rows left in L, nb_alg ).
   * For every panel:
   *   1. the pivots in p1 are applied to C1 and E,
   *   2. a left-side, lower-triangular, unit-diagonal, no-transpose
   *      triangular solve with the leading b columns of L1 is applied
   *      to C1,
   *   3. E is updated by FLA_Gemm_external with alpha = -1, beta = 1,
   *      i.e. E := E - D1 * C1.
   *
   * Always returns FLA_SUCCESS.
   */
  FLA_Obj LT,              L0,
          LB,              L1,
                           L2;

  FLA_Obj DL,    DR,       D0,  D1,  D2;

  FLA_Obj pT,              p0,
          pB,              p1,
                           p2;

  FLA_Obj CT,              C0,
          CB,              C1,
                           C2;

  FLA_Obj L1_sqr, L1_rest;

  dim_t b;

  /* Start with empty "done" partitions (top / left of size 0). */
  FLA_Part_2x1( L,    &LT,
                      &LB,            0, FLA_TOP );

  FLA_Part_1x2( D,    &DL,  &DR,      0, FLA_LEFT );

  FLA_Part_2x1( p,    &pT,
                      &pB,            0, FLA_TOP );

  FLA_Part_2x1( C,    &CT,
                      &CB,            0, FLA_TOP );

  while ( FLA_Obj_length( LT ) < FLA_Obj_length( L ) )
  {
    b = min( FLA_Obj_length( LB ), nb_alg );

    FLA_Repart_2x1_to_3x1( LT,                &L0,
                        /* ** */            /* ** */
                                              &L1,
                           LB,                &L2,        b, FLA_BOTTOM );

    FLA_Repart_1x2_to_1x3( DL,  /**/ DR,        &D0, /**/ &D1, &D2,
                           b, FLA_RIGHT );

    FLA_Repart_2x1_to_3x1( pT,                &p0,
                        /* ** */            /* ** */
                                              &p1,
                           pB,                &p2,        b, FLA_BOTTOM );

    FLA_Repart_2x1_to_3x1( CT,                &C0,
                        /* ** */            /* ** */
                                              &C1,
                           CB,                &C2,        b, FLA_BOTTOM );

    /*------------------------------------------------------------*/

    /* Only the leading b columns of L1 hold the triangular factor. */
    FLA_Part_1x2( L1,    &L1_sqr, &L1_rest,    b, FLA_LEFT );

    FLA_SA_Apply_pivots( C1,
                         E, p1 );

    FLA_Trsm_external( FLA_LEFT, FLA_LOWER_TRIANGULAR,
                       FLA_NO_TRANSPOSE, FLA_UNIT_DIAG,
                       FLA_ONE, L1_sqr, C1 );

    FLA_Gemm_external( FLA_NO_TRANSPOSE, FLA_NO_TRANSPOSE,
                       FLA_MINUS_ONE, D1, C1, FLA_ONE, E );

    /*------------------------------------------------------------*/

    FLA_Cont_with_3x1_to_2x1( &LT,                L0,
                                                  L1,
                            /* ** */           /* ** */
                              &LB,                L2,     FLA_TOP );

    FLA_Cont_with_1x3_to_1x2( &DL,  /**/ &DR,        D0, D1, /**/ D2,
                              FLA_LEFT );

    FLA_Cont_with_3x1_to_2x1( &pT,                p0,
                                                  p1,
                            /* ** */           /* ** */
                              &pB,                p2,     FLA_TOP );

    FLA_Cont_with_3x1_to_2x1( &CT,                C0,
                                                  C1,
                            /* ** */           /* ** */
                              &CB,                C2,     FLA_TOP );
  }

  return FLA_SUCCESS;
}
References FLA_Cont_with_1x3_to_1x2(), FLA_Cont_with_3x1_to_2x1(), FLA_Cont_with_3x3_to_2x2(), FLA_Gemm_external(), FLA_MINUS_ONE, FLA_Obj_length(), FLA_ONE, FLA_Part_1x2(), FLA_Part_2x1(), FLA_Part_2x2(), FLA_Repart_1x2_to_1x3(), FLA_Repart_2x1_to_3x1(), FLA_Repart_2x2_to_3x3(), FLA_SA_Apply_pivots(), FLA_SA_LU_unb(), and FLA_Trsm_external().
Referenced by FLA_SA_LU_task().
{
  /*
   * Blocked LU step.  The signature is not visible in this listing; per
   * the cross-references this appears to be the body of
   * FLA_SA_LU_blk( U, D, p, L, nb_alg ).
   *
   * U is traversed along its diagonal, D by column panels, and p and L by
   * row panels, each of size b = min( rows left in U, nb_alg ).  For every
   * panel:
   *   1. FLA_SA_LU_unb factors the pair ( U11, D1 ), writing pivots to p1
   *      and the leading b columns of L1,
   *   2. the pivots in p1 are applied to U12 and D2,
   *   3. a left-side, lower-triangular, unit-diagonal, no-transpose
   *      triangular solve with those b columns of L1 is applied to U12,
   *   4. D2 is updated by FLA_Gemm_external with alpha = -1, beta = 1,
   *      i.e. D2 := D2 - D1 * U12.
   *
   * Always returns FLA_SUCCESS.
   */
  FLA_Obj UTL,   UTR,      U00, U01, U02,
          UBL,   UBR,      U10, U11, U12,
                           U20, U21, U22;

  FLA_Obj DL,    DR,       D0,  D1,  D2;

  FLA_Obj pT,              p0,
          pB,              p1,
                           p2;

  FLA_Obj LT,              L0,
          LB,              L1,
                           L2;

  FLA_Obj L1_sqr, L1_rest;

  dim_t b;

  /* Start with empty "done" partitions. */
  FLA_Part_2x2( U,    &UTL, &UTR,
                      &UBL, &UBR,     0, 0, FLA_TL );

  FLA_Part_1x2( D,    &DL,  &DR,      0, FLA_LEFT );

  FLA_Part_2x1( p,    &pT,
                      &pB,            0, FLA_TOP );

  FLA_Part_2x1( L,    &LT,
                      &LB,            0, FLA_TOP );

  while ( FLA_Obj_length( UTL ) < FLA_Obj_length( U ) )
  {
    b = min( FLA_Obj_length( UBR ), nb_alg );

    FLA_Repart_2x2_to_3x3( UTL, /**/ UTR,       &U00, /**/ &U01, &U02,
                        /* ************* */   /* ******************** */
                                                &U10, /**/ &U11, &U12,
                           UBL, /**/ UBR,       &U20, /**/ &U21, &U22,
                           b, b, FLA_BR );

    FLA_Repart_1x2_to_1x3( DL,  /**/ DR,        &D0, /**/ &D1, &D2,
                           b, FLA_RIGHT );

    FLA_Repart_2x1_to_3x1( pT,                &p0,
                        /* ** */            /* ** */
                                              &p1,
                           pB,                &p2,        b, FLA_BOTTOM );

    FLA_Repart_2x1_to_3x1( LT,                &L0,
                        /* ** */            /* ** */
                                              &L1,
                           LB,                &L2,        b, FLA_BOTTOM );

    /*------------------------------------------------------------*/

    /* Only the leading b columns of L1 receive the triangular factor. */
    FLA_Part_1x2( L1,    &L1_sqr, &L1_rest,    b, FLA_LEFT );

    FLA_SA_LU_unb( U11,
                   D1, p1, L1_sqr );

    FLA_SA_Apply_pivots( U12,
                         D2, p1 );

    FLA_Trsm_external( FLA_LEFT, FLA_LOWER_TRIANGULAR,
                       FLA_NO_TRANSPOSE, FLA_UNIT_DIAG,
                       FLA_ONE, L1_sqr, U12 );

    FLA_Gemm_external( FLA_NO_TRANSPOSE, FLA_NO_TRANSPOSE,
                       FLA_MINUS_ONE, D1, U12, FLA_ONE, D2 );

    /*------------------------------------------------------------*/

    FLA_Cont_with_3x3_to_2x2( &UTL, /**/ &UTR,       U00, U01, /**/ U02,
                                                     U10, U11, /**/ U12,
                            /* ************** */  /* ****************** */
                              &UBL, /**/ &UBR,       U20, U21, /**/ U22,
                              FLA_TL );

    FLA_Cont_with_1x3_to_1x2( &DL,  /**/ &DR,        D0, D1, /**/ D2,
                              FLA_LEFT );

    FLA_Cont_with_3x1_to_2x1( &pT,                p0,
                                                  p1,
                            /* ** */           /* ** */
                              &pB,                p2,     FLA_TOP );

    FLA_Cont_with_3x1_to_2x1( &LT,                L0,
                                                  L1,
                            /* ** */           /* ** */
                              &LB,                L2,     FLA_TOP );
  }

  return FLA_SUCCESS;
}
References bli_camax(), bli_ccopy(), bli_cger(), bli_cscal(), bli_cswap(), bli_damax(), bli_dcopy(), bli_dger(), bli_dscal(), bli_dswap(), bli_samax(), bli_scopy(), bli_sger(), bli_sscal(), bli_sswap(), bli_zamax(), bli_zcopy(), bli_zger(), bli_zscal(), bli_zswap(), BLIS_NO_CONJUGATE, FLA_Copy_external(), FLA_MINUS_ONE, FLA_Obj_col_stride(), FLA_Obj_datatype(), FLA_Obj_has_zero_dim(), FLA_Obj_length(), FLA_Obj_row_stride(), FLA_Triangularize(), scomplex::imag, dcomplex::imag, scomplex::real, and dcomplex::real.
Referenced by FLA_SA_LU_blk().
{
  /*
   * Unblocked LU step on the pair ( U, D ).  The signature is not visible
   * in this listing; per the call site in the blocked variant this appears
   * to be the body of FLA_SA_LU_unb( U, D, p, L ).
   *
   * U is first copied into L and L is passed through FLA_Triangularize as
   * upper triangular, non-unit diagonal.  Then, for each column j of U:
   *   1. bli_?amax finds ipiv, the index of the largest-magnitude entry in
   *      column j of D (m_D elements, stride rs_D).
   *   2. If that entry is larger in magnitude than L(j,j), row j of L is
   *      swapped with row ipiv of D and p[j] = ipiv + m_U - j is recorded;
   *      otherwise p[j] = 0 (no interchange).
   *   3. Column j of D is scaled by 1 / L(j,j).
   *   4. The columns of D to the right of j receive the rank-1 update
   *      D(:,j+1:) -= D(:,j) * L(j,j+1:).
   *   5. Row j of L, from the diagonal rightwards, is copied back into U.
   *
   * For the complex datatypes the magnitude used in step 2 is
   * |real| + |imag|.  Summing the parts before taking the absolute value
   * would let parts of opposite sign cancel (1 - 1i would measure as 0),
   * which can skip a needed interchange and then divide by a zero
   * diagonal.  NOTE(review): |re| + |im| is the reference BLAS i?amax
   * measure; confirm bli_camax / bli_zamax use the same one.
   *
   * NOTE(review): the row swap and the D_tmp lookup index D with an
   * implicit row stride of 1, while amax/scal/ger use rs_D.
   *
   * Returns FLA_SUCCESS; a U with a zero dimension is a no-op.  Datatypes
   * other than the four handled below only get the copy/triangularize.
   */
  FLA_Datatype datatype;
  int          m_U, cs_U;
  int          m_D, cs_D;
  int          cs_L;
  int          rs_D;
  int          m_U_min_j, m_U_min_j_min_1;
  int          j, ipiv;
  int*         buff_p;

  if ( FLA_Obj_has_zero_dim( U ) ) return FLA_SUCCESS;

  datatype = FLA_Obj_datatype( U );

  m_U      = FLA_Obj_length( U );
  cs_U     = FLA_Obj_col_stride( U );

  m_D      = FLA_Obj_length( D );
  rs_D     = FLA_Obj_row_stride( D );
  cs_D     = FLA_Obj_col_stride( D );

  cs_L     = FLA_Obj_col_stride( L );

  /* Work on a triangularized copy of U held in L. */
  FLA_Copy_external( U, L );
  FLA_Triangularize( FLA_UPPER_TRIANGULAR, FLA_NONUNIT_DIAG, L );

  buff_p = ( int * ) FLA_INT_PTR( p );

  switch ( datatype ){

  case FLA_FLOAT:
  {
    float* buff_U      = ( float * ) FLA_FLOAT_PTR( U );
    float* buff_D      = ( float * ) FLA_FLOAT_PTR( D );
    float* buff_L      = ( float * ) FLA_FLOAT_PTR( L );
    float* buff_minus1 = ( float * ) FLA_FLOAT_PTR( FLA_MINUS_ONE );
    float  L_tmp;
    float  D_tmp;
    float  d_inv_Ljj;

    for ( j = 0; j < m_U; ++j )
    {
      /* Candidate pivot: largest entry in column j of D. */
      bli_samax( m_D,
                 buff_D + j*cs_D + 0*rs_D,
                 rs_D,
                 &ipiv );

      L_tmp = buff_L[ j*cs_L + j    ];
      D_tmp = buff_D[ j*cs_D + ipiv ];

      if ( fabsf( L_tmp ) < fabsf( D_tmp ) )
      {
        bli_sswap( m_U,
                   buff_L + 0*cs_L + j,    cs_L,
                   buff_D + 0*cs_D + ipiv, cs_D );

        buff_p[ j ] = ipiv + m_U - j;
      }
      else
      {
        buff_p[ j ] = 0;
      }

      /* Scale column j of D by the reciprocal of the diagonal. */
      d_inv_Ljj = 1.0F / buff_L[ j*cs_L + j ];

      bli_sscal( m_D,
                 &d_inv_Ljj,
                 buff_D + j*cs_D + 0, rs_D );

      m_U_min_j_min_1 = m_U - j - 1;

      if ( m_U_min_j_min_1 > 0 )
      {
        bli_sger( BLIS_NO_CONJUGATE,
                  BLIS_NO_CONJUGATE,
                  m_D,
                  m_U_min_j_min_1,
                  buff_minus1,
                  buff_D + (j+0)*cs_D + 0, rs_D,
                  buff_L + (j+1)*cs_L + j, cs_L,
                  buff_D + (j+1)*cs_D + 0, rs_D, cs_D );
      }

      m_U_min_j = m_U - j;

      if ( m_U_min_j > 0 )
      {
        /* Write the finished row j back to U. */
        bli_scopy( m_U_min_j,
                   buff_L + j*cs_L + j, cs_L,
                   buff_U + j*cs_U + j, cs_U );
      }
    }
    break;
  }

  case FLA_DOUBLE:
  {
    double* buff_U      = ( double * ) FLA_DOUBLE_PTR( U );
    double* buff_D      = ( double * ) FLA_DOUBLE_PTR( D );
    double* buff_L      = ( double * ) FLA_DOUBLE_PTR( L );
    double* buff_minus1 = ( double * ) FLA_DOUBLE_PTR( FLA_MINUS_ONE );
    double  L_tmp;
    double  D_tmp;
    double  d_inv_Ljj;

    for ( j = 0; j < m_U; ++j )
    {
      bli_damax( m_D,
                 buff_D + j*cs_D + 0*rs_D,
                 rs_D,
                 &ipiv );

      L_tmp = buff_L[ j*cs_L + j    ];
      D_tmp = buff_D[ j*cs_D + ipiv ];

      if ( fabs( L_tmp ) < fabs( D_tmp ) )
      {
        bli_dswap( m_U,
                   buff_L + 0*cs_L + j,    cs_L,
                   buff_D + 0*cs_D + ipiv, cs_D );

        buff_p[ j ] = ipiv + m_U - j;
      }
      else
      {
        buff_p[ j ] = 0;
      }

      d_inv_Ljj = 1.0 / buff_L[ j*cs_L + j ];

      bli_dscal( m_D,
                 &d_inv_Ljj,
                 buff_D + j*cs_D + 0, rs_D );

      m_U_min_j_min_1 = m_U - j - 1;

      if ( m_U_min_j_min_1 > 0 )
      {
        bli_dger( BLIS_NO_CONJUGATE,
                  BLIS_NO_CONJUGATE,
                  m_D,
                  m_U_min_j_min_1,
                  buff_minus1,
                  buff_D + (j+0)*cs_D + 0, rs_D,
                  buff_L + (j+1)*cs_L + j, cs_L,
                  buff_D + (j+1)*cs_D + 0, rs_D, cs_D );
      }

      m_U_min_j = m_U - j;

      if ( m_U_min_j > 0 )
      {
        bli_dcopy( m_U_min_j,
                   buff_L + j*cs_L + j, cs_L,
                   buff_U + j*cs_U + j, cs_U );
      }
    }
    break;
  }

  case FLA_COMPLEX:
  {
    scomplex* buff_U      = ( scomplex * ) FLA_COMPLEX_PTR( U );
    scomplex* buff_D      = ( scomplex * ) FLA_COMPLEX_PTR( D );
    scomplex* buff_L      = ( scomplex * ) FLA_COMPLEX_PTR( L );
    scomplex* buff_minus1 = ( scomplex * ) FLA_COMPLEX_PTR( FLA_MINUS_ONE );
    scomplex  L_tmp;
    scomplex  D_tmp;
    scomplex  d_inv_Ljj;
    scomplex  Ljj;
    float     temp;

    for ( j = 0; j < m_U; ++j )
    {
      bli_camax( m_D,
                 buff_D + j*cs_D + 0*rs_D,
                 rs_D,
                 &ipiv );

      L_tmp = buff_L[ j*cs_L + j    ];
      D_tmp = buff_D[ j*cs_D + ipiv ];

      /* Compare |re| + |im| so parts of opposite sign cannot cancel. */
      if ( fabsf( L_tmp.real ) + fabsf( L_tmp.imag ) <
           fabsf( D_tmp.real ) + fabsf( D_tmp.imag ) )
      {
        bli_cswap( m_U,
                   buff_L + 0*cs_L + j,    cs_L,
                   buff_D + 0*cs_D + ipiv, cs_D );

        buff_p[ j ] = ipiv + m_U - j;
      }
      else
      {
        buff_p[ j ] = 0;
      }

      Ljj = buff_L[ j*cs_L + j ];

      /* d_inv_Ljj = 1.0 / Ljj = conj( Ljj ) / |Ljj|^2 */
      temp = 1.0F / ( Ljj.real * Ljj.real +
                      Ljj.imag * Ljj.imag );
      d_inv_Ljj.real = Ljj.real *  temp;
      d_inv_Ljj.imag = Ljj.imag * -temp;

      bli_cscal( m_D,
                 &d_inv_Ljj,
                 buff_D + j*cs_D + 0, rs_D );

      m_U_min_j_min_1 = m_U - j - 1;

      if ( m_U_min_j_min_1 > 0 )
      {
        bli_cger( BLIS_NO_CONJUGATE,
                  BLIS_NO_CONJUGATE,
                  m_D,
                  m_U_min_j_min_1,
                  buff_minus1,
                  buff_D + (j+0)*cs_D + 0, rs_D,
                  buff_L + (j+1)*cs_L + j, cs_L,
                  buff_D + (j+1)*cs_D + 0, rs_D, cs_D );
      }

      m_U_min_j = m_U - j;

      if ( m_U_min_j > 0 )
      {
        bli_ccopy( m_U_min_j,
                   buff_L + j*cs_L + j, cs_L,
                   buff_U + j*cs_U + j, cs_U );
      }
    }
    break;
  }

  case FLA_DOUBLE_COMPLEX:
  {
    dcomplex* buff_U      = ( dcomplex * ) FLA_DOUBLE_COMPLEX_PTR( U );
    dcomplex* buff_D      = ( dcomplex * ) FLA_DOUBLE_COMPLEX_PTR( D );
    dcomplex* buff_L      = ( dcomplex * ) FLA_DOUBLE_COMPLEX_PTR( L );
    dcomplex* buff_minus1 = ( dcomplex * ) FLA_DOUBLE_COMPLEX_PTR( FLA_MINUS_ONE );
    dcomplex  L_tmp;
    dcomplex  D_tmp;
    dcomplex  d_inv_Ljj;
    dcomplex  Ljj;
    double    temp;

    for ( j = 0; j < m_U; ++j )
    {
      bli_zamax( m_D,
                 buff_D + j*cs_D + 0*rs_D,
                 rs_D,
                 &ipiv );

      L_tmp = buff_L[ j*cs_L + j    ];
      D_tmp = buff_D[ j*cs_D + ipiv ];

      /* Compare |re| + |im| so parts of opposite sign cannot cancel. */
      if ( fabs( L_tmp.real ) + fabs( L_tmp.imag ) <
           fabs( D_tmp.real ) + fabs( D_tmp.imag ) )
      {
        bli_zswap( m_U,
                   buff_L + 0*cs_L + j,    cs_L,
                   buff_D + 0*cs_D + ipiv, cs_D );

        buff_p[ j ] = ipiv + m_U - j;
      }
      else
      {
        buff_p[ j ] = 0;
      }

      Ljj = buff_L[ j*cs_L + j ];

      /* d_inv_Ljj = 1.0 / Ljj = conj( Ljj ) / |Ljj|^2 */
      temp = 1.0 / ( Ljj.real * Ljj.real +
                     Ljj.imag * Ljj.imag );
      d_inv_Ljj.real = Ljj.real *  temp;
      d_inv_Ljj.imag = Ljj.imag * -temp;

      bli_zscal( m_D,
                 &d_inv_Ljj,
                 buff_D + j*cs_D + 0, rs_D );

      m_U_min_j_min_1 = m_U - j - 1;

      if ( m_U_min_j_min_1 > 0 )
      {
        bli_zger( BLIS_NO_CONJUGATE,
                  BLIS_NO_CONJUGATE,
                  m_D,
                  m_U_min_j_min_1,
                  buff_minus1,
                  buff_D + (j+0)*cs_D + 0, rs_D,
                  buff_L + (j+1)*cs_L + j, cs_L,
                  buff_D + (j+1)*cs_D + 0, rs_D, cs_D );
      }

      m_U_min_j = m_U - j;

      if ( m_U_min_j > 0 )
      {
        bli_zcopy( m_U_min_j,
                   buff_L + j*cs_L + j, cs_L,
                   buff_U + j*cs_U + j, cs_U );
      }
    }
    break;
  }

  }

  return FLA_SUCCESS;
}
References FLA_Apply_pivots(), FLA_Cont_with_3x1_to_2x1(), FLA_Cont_with_3x3_to_2x2(), FLA_Obj_length(), FLA_Obj_width(), FLA_Part_2x1(), FLA_Part_2x2(), FLA_Repart_2x1_to_3x1(), FLA_Repart_2x2_to_3x3(), FLA_Trsv_external(), and FLASH_FS_incpiv_aux2().
Referenced by FLASH_FS_incpiv().
{
  /*
   * Outer loop of the incremental-pivoting forward substitution.  The
   * signature is not visible in this listing; per the cross-references
   * this appears to be the body of
   * FLASH_FS_incpiv_aux1( A, p, L, b, nb_alg ).
   *
   * A, p and L are traversed along their diagonals and b from top to
   * bottom, one element at a time (these are hierarchical objects whose
   * elements are accessed through FLASH_OBJ_PTR_AT).  For each step:
   *   1. the top length( b1 ) rows of the p11 pivot vector are applied to
   *      b1 (left side, no transpose),
   *   2. b1 is solved in place against A11 as a lower-triangular,
   *      unit-diagonal, no-transpose system,
   *   3. FLASH_FS_incpiv_aux2 propagates the result to b2 using L21, A21
   *      and p21.
   *
   * The loop ends when either dimension of A is exhausted.  Always
   * returns FLA_SUCCESS.
   */
  FLA_Obj ATL,   ATR,      A00, A01, A02,
          ABL,   ABR,      A10, A11, A12,
                           A20, A21, A22;

  FLA_Obj pTL,   pTR,      p00, p01, p02,
          pBL,   pBR,      p10, p11, p12,
                           p20, p21, p22;

  FLA_Obj LTL,   LTR,      L00, L01, L02,
          LBL,   LBR,      L10, L11, L12,
                           L20, L21, L22;

  FLA_Obj bT,              b0,
          bB,              b1,
                           b2;

  FLA_Obj p11_conf,
          p11_rest;

  FLA_Part_2x2( A,    &ATL, &ATR,
                      &ABL, &ABR,     0, 0, FLA_TL );

  FLA_Part_2x2( p,    &pTL, &pTR,
                      &pBL, &pBR,     0, 0, FLA_TL );

  FLA_Part_2x2( L,    &LTL, &LTR,
                      &LBL, &LBR,     0, 0, FLA_TL );

  FLA_Part_2x1( b,    &bT,
                      &bB,            0, FLA_TOP );

  while ( FLA_Obj_length( ATL ) < FLA_Obj_length( A ) &&
          FLA_Obj_width ( ATL ) < FLA_Obj_width ( A ) )
  {
    FLA_Repart_2x2_to_3x3( ATL, /**/ ATR,       &A00, /**/ &A01, &A02,
                        /* ************* */   /* ******************** */
                                                &A10, /**/ &A11, &A12,
                           ABL, /**/ ABR,       &A20, /**/ &A21, &A22,
                           1, 1, FLA_BR );

    FLA_Repart_2x2_to_3x3( pTL, /**/ pTR,       &p00, /**/ &p01, &p02,
                        /* ************* */   /* ******************** */
                                                &p10, /**/ &p11, &p12,
                           pBL, /**/ pBR,       &p20, /**/ &p21, &p22,
                           1, 1, FLA_BR );

    FLA_Repart_2x2_to_3x3( LTL, /**/ LTR,       &L00, /**/ &L01, &L02,
                        /* ************* */   /* ******************** */
                                                &L10, /**/ &L11, &L12,
                           LBL, /**/ LBR,       &L20, /**/ &L21, &L22,
                           1, 1, FLA_BR );

    FLA_Repart_2x1_to_3x1( bT,                &b0,
                        /* ** */            /* ** */
                                              &b1,
                           bB,                &b2,        1, FLA_BOTTOM );

    /*------------------------------------------------------------*/

    /* Trim the pivot vector so it conforms to the length of b1. */
    FLA_Part_2x1( *FLASH_OBJ_PTR_AT( p11 ),    &p11_conf,
                                               &p11_rest,
                  FLA_Obj_length( *FLASH_OBJ_PTR_AT( b1 ) ), FLA_TOP );

    FLA_Apply_pivots( FLA_LEFT, FLA_NO_TRANSPOSE,
                      p11_conf,
                      *FLASH_OBJ_PTR_AT( b1 ) );

    FLA_Trsv_external( FLA_LOWER_TRIANGULAR, FLA_NO_TRANSPOSE, FLA_UNIT_DIAG,
                       *FLASH_OBJ_PTR_AT( A11 ),
                       *FLASH_OBJ_PTR_AT( b1 ) );

    FLASH_FS_incpiv_aux2( L21,
                          A21, p21, b1,
                                    b2, nb_alg );

    /*------------------------------------------------------------*/

    FLA_Cont_with_3x3_to_2x2( &ATL, /**/ &ATR,       A00, A01, /**/ A02,
                                                     A10, A11, /**/ A12,
                            /* ************** */  /* ****************** */
                              &ABL, /**/ &ABR,       A20, A21, /**/ A22,
                              FLA_TL );

    FLA_Cont_with_3x3_to_2x2( &pTL, /**/ &pTR,       p00, p01, /**/ p02,
                                                     p10, p11, /**/ p12,
                            /* ************** */  /* ****************** */
                              &pBL, /**/ &pBR,       p20, p21, /**/ p22,
                              FLA_TL );

    FLA_Cont_with_3x3_to_2x2( &LTL, /**/ &LTR,       L00, L01, /**/ L02,
                                                     L10, L11, /**/ L12,
                            /* ************** */  /* ****************** */
                              &LBL, /**/ &LBR,       L20, L21, /**/ L22,
                              FLA_TL );

    FLA_Cont_with_3x1_to_2x1( &bT,                b0,
                                                  b1,
                            /* ** */           /* ** */
                              &bB,                b2,     FLA_TOP );
  }

  return FLA_SUCCESS;
}
FLA_Error FLASH_FS_incpiv_aux2( FLA_Obj L, FLA_Obj D, FLA_Obj p, FLA_Obj C, FLA_Obj E, dim_t nb_alg )
References FLA_Cont_with_3x1_to_2x1(), FLA_Obj_length(), FLA_Part_2x1(), FLA_Repart_2x1_to_3x1(), and FLA_SA_FS_blk().
Referenced by FLASH_FS_incpiv_aux1().
{
  /*
   * Inner loop of the incremental-pivoting forward substitution.
   *
   * L, D, p and E are traversed from top to bottom one element at a time;
   * C is used whole on every step.  For each step FLA_SA_FS_blk is called
   * on the objects obtained through FLASH_OBJ_PTR_AT from L1, D1, p1, C
   * and E1, with block size nb_alg.
   *
   * The loop runs until all rows of D have been visited.  Always returns
   * FLA_SUCCESS.
   */
  FLA_Obj LT,              L0,
          LB,              L1,
                           L2;

  FLA_Obj DT,              D0,
          DB,              D1,
                           D2;

  FLA_Obj pT,              p0,
          pB,              p1,
                           p2;

  FLA_Obj ET,              E0,
          EB,              E1,
                           E2;

  FLA_Part_2x1( L,    &LT,
                      &LB,            0, FLA_TOP );

  FLA_Part_2x1( D,    &DT,
                      &DB,            0, FLA_TOP );

  FLA_Part_2x1( p,    &pT,
                      &pB,            0, FLA_TOP );

  FLA_Part_2x1( E,    &ET,
                      &EB,            0, FLA_TOP );

  while ( FLA_Obj_length( DT ) < FLA_Obj_length( D ) )
  {
    FLA_Repart_2x1_to_3x1( LT,                &L0,
                        /* ** */            /* ** */
                                              &L1,
                           LB,                &L2,        1, FLA_BOTTOM );

    FLA_Repart_2x1_to_3x1( DT,                &D0,
                        /* ** */            /* ** */
                                              &D1,
                           DB,                &D2,        1, FLA_BOTTOM );

    FLA_Repart_2x1_to_3x1( pT,                &p0,
                        /* ** */            /* ** */
                                              &p1,
                           pB,                &p2,        1, FLA_BOTTOM );

    FLA_Repart_2x1_to_3x1( ET,                &E0,
                        /* ** */            /* ** */
                                              &E1,
                           EB,                &E2,        1, FLA_BOTTOM );

    /*------------------------------------------------------------*/

    FLA_SA_FS_blk( *FLASH_OBJ_PTR_AT( L1 ),
                   *FLASH_OBJ_PTR_AT( D1 ),
                   *FLASH_OBJ_PTR_AT( p1 ),
                   *FLASH_OBJ_PTR_AT( C ),
                   *FLASH_OBJ_PTR_AT( E1 ),
                   nb_alg );

    /*------------------------------------------------------------*/

    FLA_Cont_with_3x1_to_2x1( &LT,                L0,
                                                  L1,
                            /* ** */           /* ** */
                              &LB,                L2,     FLA_TOP );

    FLA_Cont_with_3x1_to_2x1( &DT,                D0,
                                                  D1,
                            /* ** */           /* ** */
                              &DB,                D2,     FLA_TOP );

    FLA_Cont_with_3x1_to_2x1( &pT,                p0,
                                                  p1,
                            /* ** */           /* ** */
                              &pB,                p2,     FLA_TOP );

    FLA_Cont_with_3x1_to_2x1( &ET,                E0,
                                                  E1,
                            /* ** */           /* ** */
                              &EB,                E2,     FLA_TOP );
  }

  return FLA_SUCCESS;
}
References FLA_Cont_with_3x3_to_2x2(), FLA_LU_piv_task(), FLA_Obj_length(), FLA_Obj_width(), FLA_Part_2x2(), FLA_Repart_2x2_to_3x3(), FLASH_Queue_get_enabled(), FLASH_SA_LU(), and FLASH_Trsm_piv().
Referenced by FLASH_LU_incpiv_noopt().
{
  /*
   * LU factorization with incremental pivoting over a hierarchical
   * matrix.  The signature is not visible in this listing; per the
   * cross-references this appears to be the body of
   * FLASH_LU_incpiv_var1( A, p, L, nb_alg, cntl ).
   *
   * A, p and L are traversed along their diagonals one element at a time.
   * For each step:
   *   1. the diagonal element A11 is factored with pivoting into p11,
   *      either enqueued as a task (when FLASH_Queue_get_enabled() is
   *      true) or executed directly via FLA_LU_piv_task,
   *   2. FLASH_Trsm_piv updates the row panel A12 using A11 and p11,
   *   3. FLASH_SA_LU processes the panel below ( A21 ) together with the
   *      trailing matrix ( A22 ), using p21 and L21.
   *
   * The loop ends when either dimension of A is exhausted.  Always
   * returns FLA_SUCCESS.
   */
  FLA_Obj ATL,   ATR,      A00, A01, A02,
          ABL,   ABR,      A10, A11, A12,
                           A20, A21, A22;

  FLA_Obj pTL,   pTR,      p00, p01, p02,
          pBL,   pBR,      p10, p11, p12,
                           p20, p21, p22;

  FLA_Obj LTL,   LTR,      L00, L01, L02,
          LBL,   LBR,      L10, L11, L12,
                           L20, L21, L22;

  FLA_Part_2x2( A,    &ATL, &ATR,
                      &ABL, &ABR,     0, 0, FLA_TL );

  FLA_Part_2x2( p,    &pTL, &pTR,
                      &pBL, &pBR,     0, 0, FLA_TL );

  FLA_Part_2x2( L,    &LTL, &LTR,
                      &LBL, &LBR,     0, 0, FLA_TL );

  while ( FLA_Obj_length( ATL ) < FLA_Obj_length( A ) &&
          FLA_Obj_width ( ATL ) < FLA_Obj_width ( A ) )
  {
    FLA_Repart_2x2_to_3x3( ATL, /**/ ATR,       &A00, /**/ &A01, &A02,
                        /* ************* */   /* ******************** */
                                                &A10, /**/ &A11, &A12,
                           ABL, /**/ ABR,       &A20, /**/ &A21, &A22,
                           1, 1, FLA_BR );

    FLA_Repart_2x2_to_3x3( pTL, /**/ pTR,       &p00, /**/ &p01, &p02,
                        /* ************* */   /* ******************** */
                                                &p10, /**/ &p11, &p12,
                           pBL, /**/ pBR,       &p20, /**/ &p21, &p22,
                           1, 1, FLA_BR );

    FLA_Repart_2x2_to_3x3( LTL, /**/ LTR,       &L00, /**/ &L01, &L02,
                        /* ************* */   /* ******************** */
                                                &L10, /**/ &L11, &L12,
                           LBL, /**/ LBR,       &L20, /**/ &L21, &L22,
                           1, 1, FLA_BR );

    /*------------------------------------------------------------*/

    if ( FLASH_Queue_get_enabled( ) )
    {
      // Enqueue
      ENQUEUE_FLASH_LU_piv( *FLASH_OBJ_PTR_AT( A11 ),
                            *FLASH_OBJ_PTR_AT( p11 ),
                            FLA_Cntl_sub_lu( cntl ) );
    }
    else
    {
      // Execute leaf
      FLA_LU_piv_task( *FLASH_OBJ_PTR_AT( A11 ),
                       *FLASH_OBJ_PTR_AT( p11 ),
                       FLA_Cntl_sub_lu( cntl ) );
    }

    FLASH_Trsm_piv( A11, A12, p11,
                    FLA_Cntl_sub_trsm1( cntl ) );

    FLASH_SA_LU( A11, A12,
                 A21, A22, p21, L21, nb_alg, cntl );

    /*------------------------------------------------------------*/

    FLA_Cont_with_3x3_to_2x2( &ATL, /**/ &ATR,       A00, A01, /**/ A02,
                                                     A10, A11, /**/ A12,
                            /* ************** */  /* ****************** */
                              &ABL, /**/ &ABR,       A20, A21, /**/ A22,
                              FLA_TL );

    FLA_Cont_with_3x3_to_2x2( &pTL, /**/ &pTR,       p00, p01, /**/ p02,
                                                     p10, p11, /**/ p12,
                            /* ************** */  /* ****************** */
                              &pBL, /**/ &pBR,       p20, p21, /**/ p22,
                              FLA_TL );

    FLA_Cont_with_3x3_to_2x2( &LTL, /**/ &LTR,       L00, L01, /**/ L02,
                                                     L10, L11, /**/ L12,
                            /* ************** */  /* ****************** */
                              &LBL, /**/ &LBR,       L20, L21, /**/ L22,
                              FLA_TL );
  }

  return FLA_SUCCESS;
}
FLA_Error FLASH_LU_incpiv_var2( FLA_Obj A, FLA_Obj p, FLA_Obj L, FLA_Obj U, dim_t nb_alg, fla_lu_t * cntl )
References FLA_Cont_with_1x3_to_1x2(), FLA_Cont_with_3x3_to_2x2(), FLA_LU_piv_copy_task(), FLA_Obj_length(), FLA_Obj_width(), FLA_Part_1x2(), FLA_Part_2x2(), FLA_Repart_1x2_to_1x3(), FLA_Repart_2x2_to_3x3(), FLASH_Queue_get_enabled(), FLASH_SA_LU(), and FLASH_Trsm_piv().
Referenced by FLASH_LU_incpiv_opt1().
{
  /*
   * LU factorization with incremental pivoting over a hierarchical
   * matrix, variant that also fills the workspace U.
   *
   * A, p and L are traversed along their diagonals and U from left to
   * right, one element at a time.  For each step:
   *   1. the diagonal element A11 is factored with pivoting into p11 by
   *      the "piv_copy" operation, which also receives U1; it is either
   *      enqueued as a task (when FLASH_Queue_get_enabled() is true) or
   *      executed directly via FLA_LU_piv_copy_task,
   *   2. FLASH_Trsm_piv updates the row panel A12 using U1 (rather than
   *      A11) and p11,
   *   3. FLASH_SA_LU processes the panel below ( A21 ) together with the
   *      trailing matrix ( A22 ), using p21 and L21.
   *
   * The loop ends when either dimension of A is exhausted.  Always
   * returns FLA_SUCCESS.
   */
  FLA_Obj ATL,   ATR,      A00, A01, A02,
          ABL,   ABR,      A10, A11, A12,
                           A20, A21, A22;

  FLA_Obj pTL,   pTR,      p00, p01, p02,
          pBL,   pBR,      p10, p11, p12,
                           p20, p21, p22;

  FLA_Obj LTL,   LTR,      L00, L01, L02,
          LBL,   LBR,      L10, L11, L12,
                           L20, L21, L22;

  FLA_Obj UL,    UR,       U0,  U1,  U2;

  FLA_Part_2x2( A,    &ATL, &ATR,
                      &ABL, &ABR,     0, 0, FLA_TL );

  FLA_Part_2x2( p,    &pTL, &pTR,
                      &pBL, &pBR,     0, 0, FLA_TL );

  FLA_Part_2x2( L,    &LTL, &LTR,
                      &LBL, &LBR,     0, 0, FLA_TL );

  FLA_Part_1x2( U,    &UL,  &UR,      0, FLA_LEFT );

  while ( FLA_Obj_length( ATL ) < FLA_Obj_length( A ) &&
          FLA_Obj_width ( ATL ) < FLA_Obj_width ( A ) )
  {
    FLA_Repart_2x2_to_3x3( ATL, /**/ ATR,       &A00, /**/ &A01, &A02,
                        /* ************* */   /* ******************** */
                                                &A10, /**/ &A11, &A12,
                           ABL, /**/ ABR,       &A20, /**/ &A21, &A22,
                           1, 1, FLA_BR );

    FLA_Repart_2x2_to_3x3( pTL, /**/ pTR,       &p00, /**/ &p01, &p02,
                        /* ************* */   /* ******************** */
                                                &p10, /**/ &p11, &p12,
                           pBL, /**/ pBR,       &p20, /**/ &p21, &p22,
                           1, 1, FLA_BR );

    FLA_Repart_2x2_to_3x3( LTL, /**/ LTR,       &L00, /**/ &L01, &L02,
                        /* ************* */   /* ******************** */
                                                &L10, /**/ &L11, &L12,
                           LBL, /**/ LBR,       &L20, /**/ &L21, &L22,
                           1, 1, FLA_BR );

    FLA_Repart_1x2_to_1x3( UL,  /**/ UR,        &U0, /**/ &U1, &U2,
                           1, FLA_RIGHT );

    /*------------------------------------------------------------*/

    if ( FLASH_Queue_get_enabled( ) )
    {
      // Enqueue
      ENQUEUE_FLASH_LU_piv_copy( *FLASH_OBJ_PTR_AT( A11 ),
                                 *FLASH_OBJ_PTR_AT( p11 ),
                                 *FLASH_OBJ_PTR_AT( U1 ),
                                 FLA_Cntl_sub_lu( cntl ) );
    }
    else
    {
      // Execute leaf
      FLA_LU_piv_copy_task( *FLASH_OBJ_PTR_AT( A11 ),
                            *FLASH_OBJ_PTR_AT( p11 ),
                            *FLASH_OBJ_PTR_AT( U1 ),
                            FLA_Cntl_sub_lu( cntl ) );
    }

    FLASH_Trsm_piv( U1, A12, p11,
                    FLA_Cntl_sub_trsm1( cntl ) );

    FLASH_SA_LU( A11, A12,
                 A21, A22, p21, L21, nb_alg, cntl );

    /*------------------------------------------------------------*/

    FLA_Cont_with_3x3_to_2x2( &ATL, /**/ &ATR,       A00, A01, /**/ A02,
                                                     A10, A11, /**/ A12,
                            /* ************** */  /* ****************** */
                              &ABL, /**/ &ABR,       A20, A21, /**/ A22,
                              FLA_TL );

    FLA_Cont_with_3x3_to_2x2( &pTL, /**/ &pTR,       p00, p01, /**/ p02,
                                                     p10, p11, /**/ p12,
                            /* ************** */  /* ****************** */
                              &pBL, /**/ &pBR,       p20, p21, /**/ p22,
                              FLA_TL );

    FLA_Cont_with_3x3_to_2x2( &LTL, /**/ &LTR,       L00, L01, /**/ L02,
                                                     L10, L11, /**/ L12,
                            /* ************** */  /* ****************** */
                              &LBL, /**/ &LBR,       L20, L21, /**/ L22,
                              FLA_TL );

    FLA_Cont_with_1x3_to_1x2( &UL,  /**/ &UR,        U0, U1, /**/ U2,
                              FLA_LEFT );
  }

  return FLA_SUCCESS;
}
FLA_Error FLASH_SA_FS( FLA_Obj L, FLA_Obj D, FLA_Obj p, FLA_Obj C, FLA_Obj E, dim_t nb_alg, fla_gemm_t * cntl )
References FLA_Cont_with_1x3_to_1x2(), FLA_Obj_width(), FLA_Part_1x2(), FLA_Repart_1x2_to_1x3(), FLA_SA_FS_task(), and FLASH_Queue_get_enabled().
Referenced by FLASH_SA_LU().
{
  /*
   * Sweeps C and E from left to right, one element (column of the
   * hierarchy) at a time, and runs the SA_FS operation on each
   * ( C1, E1 ) pair together with the unchanged L, D and p.
   *
   * Each operation is executed immediately through FLA_SA_FS_task unless
   * FLASH_Queue_get_enabled() reports true, in which case it is enqueued
   * instead.  Both paths receive nb_alg and FLA_Cntl_sub_gemm( cntl ).
   *
   * Always returns FLA_SUCCESS.
   */
  FLA_Obj CL, CR, C0, C1, C2;
  FLA_Obj EL, ER, E0, E1, E2;

  FLA_Part_1x2( C, &CL, &CR, 0, FLA_LEFT );
  FLA_Part_1x2( E, &EL, &ER, 0, FLA_LEFT );

  while ( FLA_Obj_width( CL ) < FLA_Obj_width( C ) )
  {
    /* Expose the next column of C and of E. */
    FLA_Repart_1x2_to_1x3( CL, CR, &C0, &C1, &C2, 1, FLA_RIGHT );
    FLA_Repart_1x2_to_1x3( EL, ER, &E0, &E1, &E2, 1, FLA_RIGHT );

    if ( !FLASH_Queue_get_enabled( ) )
    {
      /* Queue disabled: run the leaf operation right away. */
      FLA_SA_FS_task( *FLASH_OBJ_PTR_AT( L ),
                      *FLASH_OBJ_PTR_AT( D ),
                      *FLASH_OBJ_PTR_AT( p ),
                      *FLASH_OBJ_PTR_AT( C1 ),
                      *FLASH_OBJ_PTR_AT( E1 ),
                      nb_alg,
                      FLA_Cntl_sub_gemm( cntl ) );
    }
    else
    {
      /* Queue enabled: defer the operation as a task. */
      ENQUEUE_FLASH_SA_FS( *FLASH_OBJ_PTR_AT( L ),
                           *FLASH_OBJ_PTR_AT( D ),
                           *FLASH_OBJ_PTR_AT( p ),
                           *FLASH_OBJ_PTR_AT( C1 ),
                           *FLASH_OBJ_PTR_AT( E1 ),
                           nb_alg,
                           FLA_Cntl_sub_gemm( cntl ) );
    }

    /* Fold the processed column into the "done" partition. */
    FLA_Cont_with_1x3_to_1x2( &CL, &CR, C0, C1, C2, FLA_LEFT );
    FLA_Cont_with_1x3_to_1x2( &EL, &ER, E0, E1, E2, FLA_LEFT );
  }

  return FLA_SUCCESS;
}
FLA_Error FLASH_SA_LU( FLA_Obj B, FLA_Obj C, FLA_Obj D, FLA_Obj E, FLA_Obj p, FLA_Obj L, dim_t nb_alg, fla_lu_t * cntl )
References FLA_Cont_with_3x1_to_2x1(), FLA_Obj_length(), FLA_Part_2x1(), FLA_Repart_2x1_to_3x1(), FLA_SA_LU_task(), FLASH_Queue_get_enabled(), and FLASH_SA_FS().
Referenced by FLASH_LU_incpiv_var1(), and FLASH_LU_incpiv_var2().
{
  /*
   * Sweeps D, E, p and L from top to bottom, one element at a time.  B
   * and C are used whole on every step.  For each step:
   *   1. the SA_LU operation is applied to B together with D1, p1 and L1,
   *      either enqueued as a task (when FLASH_Queue_get_enabled() is
   *      true) or executed directly via FLA_SA_LU_task,
   *   2. FLASH_SA_FS then applies the result to C and E1 using L1, D1 and
   *      p1, with the control sub-tree FLA_Cntl_sub_gemm1( cntl ).
   *
   * The loop runs until all rows of D have been visited.  Always returns
   * FLA_SUCCESS.
   */
  FLA_Obj DT,              D0,
          DB,              D1,
                           D2;

  FLA_Obj ET,              E0,
          EB,              E1,
                           E2;

  FLA_Obj pT,              p0,
          pB,              p1,
                           p2;

  FLA_Obj LT,              L0,
          LB,              L1,
                           L2;

  FLA_Part_2x1( D,    &DT,
                      &DB,            0, FLA_TOP );

  FLA_Part_2x1( E,    &ET,
                      &EB,            0, FLA_TOP );

  FLA_Part_2x1( p,    &pT,
                      &pB,            0, FLA_TOP );

  FLA_Part_2x1( L,    &LT,
                      &LB,            0, FLA_TOP );

  while ( FLA_Obj_length( DT ) < FLA_Obj_length( D ) )
  {
    FLA_Repart_2x1_to_3x1( DT,                &D0,
                        /* ** */            /* ** */
                                              &D1,
                           DB,                &D2,        1, FLA_BOTTOM );

    FLA_Repart_2x1_to_3x1( ET,                &E0,
                        /* ** */            /* ** */
                                              &E1,
                           EB,                &E2,        1, FLA_BOTTOM );

    FLA_Repart_2x1_to_3x1( pT,                &p0,
                        /* ** */            /* ** */
                                              &p1,
                           pB,                &p2,        1, FLA_BOTTOM );

    FLA_Repart_2x1_to_3x1( LT,                &L0,
                        /* ** */            /* ** */
                                              &L1,
                           LB,                &L2,        1, FLA_BOTTOM );

    /*------------------------------------------------------------*/

    if ( FLASH_Queue_get_enabled( ) )
    {
      // Enqueue
      ENQUEUE_FLASH_SA_LU( *FLASH_OBJ_PTR_AT( B ),
                           *FLASH_OBJ_PTR_AT( D1 ),
                           *FLASH_OBJ_PTR_AT( p1 ),
                           *FLASH_OBJ_PTR_AT( L1 ),
                           nb_alg,
                           FLA_Cntl_sub_lu( cntl ) );
    }
    else
    {
      // Execute leaf
      FLA_SA_LU_task( *FLASH_OBJ_PTR_AT( B ),
                      *FLASH_OBJ_PTR_AT( D1 ),
                      *FLASH_OBJ_PTR_AT( p1 ),
                      *FLASH_OBJ_PTR_AT( L1 ),
                      nb_alg,
                      FLA_Cntl_sub_lu( cntl ) );
    }

    FLASH_SA_FS( L1,
                 D1, p1, C,
                         E1, nb_alg, FLA_Cntl_sub_gemm1( cntl ) );

    /*------------------------------------------------------------*/

    FLA_Cont_with_3x1_to_2x1( &DT,                D0,
                                                  D1,
                            /* ** */           /* ** */
                              &DB,                D2,     FLA_TOP );

    FLA_Cont_with_3x1_to_2x1( &ET,                E0,
                                                  E1,
                            /* ** */           /* ** */
                              &EB,                E2,     FLA_TOP );

    FLA_Cont_with_3x1_to_2x1( &pT,                p0,
                                                  p1,
                            /* ** */           /* ** */
                              &pB,                p2,     FLA_TOP );

    FLA_Cont_with_3x1_to_2x1( &LT,                L0,
                                                  L1,
                            /* ** */           /* ** */
                              &LB,                L2,     FLA_TOP );
  }

  return FLA_SUCCESS;
}
FLA_Error FLASH_Trsm_piv( FLA_Obj A, FLA_Obj B, FLA_Obj p, fla_trsm_t * cntl )
References FLA_Cont_with_1x3_to_1x2(), FLA_Obj_width(), FLA_Part_1x2(), FLA_Repart_1x2_to_1x3(), FLA_Trsm_piv_task(), and FLASH_Queue_get_enabled().
Referenced by FLASH_LU_incpiv_var1(), and FLASH_LU_incpiv_var2().
{
  /*
   * Sweeps B from left to right, one element (column of the hierarchy)
   * at a time, and runs the Trsm_piv operation on each B1 together with
   * the unchanged A and p.
   *
   * Each operation is executed immediately through FLA_Trsm_piv_task
   * unless FLASH_Queue_get_enabled() reports true, in which case it is
   * enqueued instead.  Both paths receive FLA_Cntl_sub_trsm( cntl ).
   *
   * Always returns FLA_SUCCESS.
   */
  FLA_Obj BL, BR, B0, B1, B2;

  FLA_Part_1x2( B, &BL, &BR, 0, FLA_LEFT );

  while ( FLA_Obj_width( BL ) < FLA_Obj_width( B ) )
  {
    /* Expose the next column of B. */
    FLA_Repart_1x2_to_1x3( BL, BR, &B0, &B1, &B2, 1, FLA_RIGHT );

    if ( !FLASH_Queue_get_enabled( ) )
    {
      /* Queue disabled: run the leaf operation right away. */
      FLA_Trsm_piv_task( *FLASH_OBJ_PTR_AT( A ),
                         *FLASH_OBJ_PTR_AT( B1 ),
                         *FLASH_OBJ_PTR_AT( p ),
                         FLA_Cntl_sub_trsm( cntl ) );
    }
    else
    {
      /* Queue enabled: defer the operation as a task. */
      ENQUEUE_FLASH_Trsm_piv( *FLASH_OBJ_PTR_AT( A ),
                              *FLASH_OBJ_PTR_AT( B1 ),
                              *FLASH_OBJ_PTR_AT( p ),
                              FLA_Cntl_sub_trsm( cntl ) );
    }

    /* Fold the processed column into the "done" partition. */
    FLA_Cont_with_1x3_to_1x2( &BL, &BR, B0, B1, B2, FLA_LEFT );
  }

  return FLA_SUCCESS;
}
1.7.6.1