|
libflame
revision_anchor
|
Go to the source code of this file.
void FLASH_Queue_begin( void )
References FLA_Clock().
Referenced by FLASH_Apply_CAQ_UT_inc(), FLASH_Apply_Q2_UT(), FLASH_Apply_Q_UT(), FLASH_Apply_Q_UT_inc(), FLASH_Apply_QUD_UT_inc(), FLASH_CAQR_UT_inc_noopt(), FLASH_Chol(), FLASH_Copy(), FLASH_Copyr(), FLASH_Eig_gest(), FLASH_Gemm(), FLASH_Hemm(), FLASH_Her2k(), FLASH_Herk(), FLASH_LQ_UT(), FLASH_LU_incpiv_noopt(), FLASH_LU_incpiv_opt1(), FLASH_LU_nopiv(), FLASH_LU_piv(), FLASH_Lyap(), FLASH_QR2_UT(), FLASH_QR_UT(), FLASH_QR_UT_inc_noopt(), FLASH_QR_UT_inc_opt1(), FLASH_SPDinv(), FLASH_Sylv(), FLASH_Symm(), FLASH_Syr2k(), FLASH_Syrk(), FLASH_Trinv(), FLASH_Trmm(), FLASH_Trsm(), FLASH_Ttmm(), and FLASH_UDdate_UT_inc().
{
   // Opens one (possibly nested) parallel region by growing the region stack.
#ifdef FLA_ENABLE_SUPERMATRIX
   // Nothing is on the stack yet, so this is the outermost region: remember
   // the current clock reading as the start of the total execution time.
   if ( flash_queue_stack == 0 )
      flash_queue_total_time = FLA_Clock();
#endif

   // Record one more level of nesting.
   flash_queue_stack += 1;
}
FLA_Bool FLASH_Queue_check_block_gpu( FLA_Obj obj, int thread, void* arg )
References FLA_Obj_view::base, FLA_Obj_gpu_struct::clean, FLA_Lock_acquire(), FLA_Lock_release(), FLASH_Queue_get_gpu_num_blocks(), FLASH_Queue_variables::gpu, FLASH_Queue_variables::gpu_lock, FLA_Obj_gpu_struct::obj, FLA_Obj_gpu_struct::request, and FLASH_Queue_variables::victim.
Referenced by FLASH_Queue_check_gpu().
{
   // Reports whether block `obj` may be used with respect to GPU `thread`.
   //   obj    - block to look for (matched by its base pointer).
   //   thread - index of the GPU/thread whose slots are inspected.
   //   arg    - opaque pointer to the shared FLASH_Queue_vars.
   // Returns FALSE when the block is held dirty in one of that thread's
   // slots (the slot is then flagged as requested) or when it is that
   // thread's victim block; TRUE otherwise.
   FLASH_Queue_vars* args         = ( FLASH_Queue_vars* ) arg;
   dim_t             gpu_n_blocks = FLASH_Queue_get_gpu_num_blocks();
   FLA_Bool          r_val        = TRUE;
   int               k;

#ifdef FLA_ENABLE_MULTITHREADING
   FLA_Lock_acquire( &(args->gpu_lock[thread]) ); // G ***
#endif

   // Walk this thread's slots; only the first slot holding the block matters.
   for ( k = 0; k < gpu_n_blocks; k++ )
   {
      if ( obj.base != args->gpu[thread * gpu_n_blocks + k].obj.base )
         continue;

      // Found it. A dirty copy gets flagged as requested and makes the
      // block unavailable for now.
      if ( !args->gpu[thread * gpu_n_blocks + k].clean )
      {
         args->gpu[thread * gpu_n_blocks + k].request = TRUE;
         r_val = FALSE;
      }

      break;
   }

   // The block is also unavailable while it is this thread's victim block.
   if ( obj.base == args->victim[thread].obj.base )
      r_val = FALSE;

#ifdef FLA_ENABLE_MULTITHREADING
   FLA_Lock_release( &(args->gpu_lock[thread]) ); // G ***
#endif

   return r_val;
}
FLA_Bool FLASH_Queue_check_gpu( FLASH_Task* t, void* arg )
References FLA_Obj_view::base, FLA_Obj_col_stride(), FLA_Obj_elemtype(), FLA_Obj_length(), FLA_Obj_width(), FLASH_Queue_check_block_gpu(), FLASH_Queue_get_num_threads(), FLASH_Task_s::input_arg, FLASH_Task_s::n_input_args, FLASH_Task_s::n_output_args, FLASH_Task_s::output_arg, and FLASH_Task_s::thread.
Referenced by FLASH_Queue_exec_gpu().
{
   // Checks every distinct operand block of task `t` against all GPUs other
   // than the one the task is assigned to (t->thread).
   //   t   - task whose input and output arguments are examined.
   //   arg - opaque pointer forwarded to FLASH_Queue_check_block_gpu().
   // Returns TRUE only if no per-block check reported FALSE.
   int      thread        = t->thread;
   int      n_input_args  = t->n_input_args;
   int      n_output_args = t->n_output_args;
   int      n_threads     = FLASH_Queue_get_num_threads();
   int      i, j, k;
   FLA_Bool r_val = TRUE;
   FLA_Bool t_val;
   FLA_Bool duplicate;
   FLA_Obj  obj;

   // Inputs are visited first, followed by the outputs.
   for ( i = 0; i < n_input_args + n_output_args; i++ )
   {
      duplicate = FALSE;

      if ( i < n_input_args )
      {
         // An input is skipped when any output, or an earlier input,
         // shares its base.
         obj = t->input_arg[i];

         for ( j = 0; j < n_output_args; j++ )
         {
            if ( obj.base == t->output_arg[j].base )
            {
               duplicate = TRUE;
               break;
            }
         }

         if ( !duplicate )
         {
            for ( j = 0; j < i; j++ )
            {
               if ( obj.base == t->input_arg[j].base )
               {
                  duplicate = TRUE;
                  break;
               }
            }
         }
      }
      else
      {
         // An output is skipped when an earlier output shares its base.
         obj = t->output_arg[i - n_input_args];

         for ( j = 0; j < i - n_input_args; j++ )
         {
            if ( obj.base == t->output_arg[j].base )
            {
               duplicate = TRUE;
               break;
            }
         }
      }

      // Already handled through another argument.
      if ( duplicate )
         continue;

      if ( FLA_Obj_elemtype( obj ) == FLA_MATRIX )
      {
         // Macroblock: examine each contained block individually.
         dim_t    jj, kk;
         dim_t    m   = FLA_Obj_length( obj );
         dim_t    n   = FLA_Obj_width( obj );
         dim_t    cs  = FLA_Obj_col_stride( obj );
         FLA_Obj* buf = FLASH_OBJ_PTR_AT( obj );

         for ( jj = 0; jj < n; jj++ )
         {
            for ( kk = 0; kk < m; kk++ )
            {
               obj = buf[ jj * cs + kk ];

               // Ask every other GPU in turn, stopping at the first one
               // that reports the block as not ready.
               t_val = TRUE;
               for ( k = 0; k < n_threads; k++ )
               {
                  if ( k == thread )
                     continue;

                  if ( !FLASH_Queue_check_block_gpu( obj, k, arg ) )
                  {
                     t_val = FALSE;
                     break;
                  }
               }

               r_val = r_val && t_val;
            }
         }
      }
      else
      {
         // Plain block: same per-GPU query as above.
         t_val = TRUE;
         for ( k = 0; k < n_threads; k++ )
         {
            if ( k == thread )
               continue;

            if ( !FLASH_Queue_check_block_gpu( obj, k, arg ) )
            {
               t_val = FALSE;
               break;
            }
         }

         r_val = r_val && t_val;
      }
   }

   return r_val;
}
void FLASH_Queue_create_gpu( int thread, void* arg )
References FLASH_Queue_variables::block_size, FLA_Obj_gpu_struct::buffer_gpu, FLASH_Queue_variables::datatype, FLASH_Queue_alloc_gpu(), FLASH_Queue_bind_gpu(), FLASH_Queue_get_enabled_gpu(), FLASH_Queue_get_gpu_num_blocks(), and FLASH_Queue_variables::gpu.
Referenced by FLASH_Queue_exec_parallel_function().
{
   // Binds `thread` to its GPU and allocates a device buffer for each of
   // that thread's block slots. Does nothing further when GPU execution is
   // not enabled.
   //   thread - index of the calling thread / GPU.
   //   arg    - opaque pointer to the shared FLASH_Queue_vars.
   FLASH_Queue_vars* args         = ( FLASH_Queue_vars* ) arg;
   dim_t             gpu_n_blocks = FLASH_Queue_get_gpu_num_blocks();
   dim_t             block_size   = args->block_size;
   FLA_Datatype      datatype     = args->datatype;
   int               i;

   if ( FLASH_Queue_get_enabled_gpu() )
   {
      // Associate this thread with its GPU before touching device memory.
      FLASH_Queue_bind_gpu( thread );

      // Reserve the device buffers for every slot up front.
      for ( i = 0; i < gpu_n_blocks; i++ )
      {
         FLASH_Queue_alloc_gpu( block_size,
                                datatype,
                                &(args->gpu[thread * gpu_n_blocks + i].buffer_gpu) );
      }
   }
}
void FLASH_Queue_destroy_gpu( int thread, void* arg )
References FLA_Obj_view::base, FLA_Obj_gpu_struct::buffer_gpu, FLA_Obj_gpu_struct::clean, FLASH_Queue_free_gpu(), FLASH_Queue_get_enabled_gpu(), FLASH_Queue_get_gpu_num_blocks(), FLASH_Queue_read_gpu(), FLASH_Queue_variables::gpu, and FLA_Obj_gpu_struct::obj.
Referenced by FLASH_Queue_exec_parallel_function().
{
   // Tears down the GPU block slots belonging to `thread`: dirty blocks are
   // read back first, then every slot's device buffer is freed. Does
   // nothing further when GPU execution is not enabled.
   //   thread - index of the calling thread / GPU.
   //   arg    - opaque pointer to the shared FLASH_Queue_vars.
   FLASH_Queue_vars* args         = ( FLASH_Queue_vars* ) arg;
   dim_t             gpu_n_blocks = FLASH_Queue_get_gpu_num_blocks();
   FLA_Obj_gpu       gpu_obj;
   int               i;

   if ( FLASH_Queue_get_enabled_gpu() )
   {
      for ( i = 0; i < gpu_n_blocks; i++ )
      {
         // Work on a snapshot of the slot.
         gpu_obj = args->gpu[thread * gpu_n_blocks + i];

         // An occupied slot whose contents are dirty is read back
         // before its buffer goes away.
         if ( gpu_obj.obj.base != NULL )
         {
            if ( !gpu_obj.clean )
               FLASH_Queue_read_gpu( gpu_obj.obj, gpu_obj.buffer_gpu );
         }

         // The device buffer is released whether or not the slot was used.
         FLASH_Queue_free_gpu( gpu_obj.buffer_gpu );
      }
   }
}
FLA_Error FLASH_Queue_disable( void )
Referenced by FLASH_Apply_pivots(), FLASH_Axpy(), FLASH_Axpyt(), FLASH_Copyt(), FLASH_FS_incpiv(), FLASH_Gemv(), FLASH_Scal(), FLASH_Scalr(), and FLASH_Trsv().
{
   // Turns task enqueuing off.
   // Returns FLA_SUCCESS when the flag was cleared, or FLA_FAILURE when
   // SuperMatrix is configured and a parallel region is currently open.
#ifdef FLA_ENABLE_SUPERMATRIX
   // The status may not change while inside a parallel region.
   if ( flash_queue_stack != 0 )
      return FLA_FAILURE;
#endif

   // Outside any parallel region, or SuperMatrix not configured at all:
   // disabling is always permitted.
   flash_queue_enabled = FALSE;

   return FLA_SUCCESS;
}
FLA_Error FLASH_Queue_enable( void )
Referenced by FLASH_Apply_pivots(), FLASH_Axpy(), FLASH_Axpyt(), FLASH_Copyt(), FLASH_FS_incpiv(), FLASH_Gemv(), FLASH_Scal(), FLASH_Scalr(), and FLASH_Trsv().
{
   // Turns task enqueuing on.
   // Returns FLA_SUCCESS when the flag was set. Returns FLA_FAILURE when a
   // parallel region is currently open, or (after raising
   // FLA_SUPERMATRIX_NOT_ENABLED) when SuperMatrix is not configured.
#ifdef FLA_ENABLE_SUPERMATRIX
   // The status may not change while inside a parallel region.
   if ( flash_queue_stack != 0 )
      return FLA_FAILURE;

   // No parallel region has begun yet, so enabling is allowed.
   flash_queue_enabled = TRUE;

   return FLA_SUCCESS;
#else
   // Enqueuing cannot be enabled without SuperMatrix; report the error.
   FLA_Check_error_code( FLA_SUPERMATRIX_NOT_ENABLED );

   return FLA_FAILURE;
#endif
}
void FLASH_Queue_end( void )
References FLA_Clock(), and FLASH_Queue_exec().
Referenced by FLASH_Apply_CAQ_UT_inc(), FLASH_Apply_Q2_UT(), FLASH_Apply_Q_UT(), FLASH_Apply_Q_UT_inc(), FLASH_Apply_QUD_UT_inc(), FLASH_CAQR_UT_inc_noopt(), FLASH_Chol(), FLASH_Copy(), FLASH_Copyr(), FLASH_Eig_gest(), FLASH_Gemm(), FLASH_Hemm(), FLASH_Her2k(), FLASH_Herk(), FLASH_LQ_UT(), FLASH_LU_incpiv_noopt(), FLASH_LU_incpiv_opt1(), FLASH_LU_nopiv(), FLASH_LU_piv(), FLASH_Lyap(), FLASH_QR2_UT(), FLASH_QR_UT(), FLASH_QR_UT_inc_noopt(), FLASH_QR_UT_inc_opt1(), FLASH_SPDinv(), FLASH_Sylv(), FLASH_Symm(), FLASH_Syr2k(), FLASH_Syrk(), FLASH_Trinv(), FLASH_Trmm(), FLASH_Trsm(), FLASH_Ttmm(), and FLASH_UDdate_UT_inc().
{
   // Closes one (possibly nested) parallel region. When the outermost
   // region closes and SuperMatrix is configured, the queued tasks are
   // executed and the total execution time is finalized.

   // Leave the current nesting level.
   flash_queue_stack -= 1;

#ifdef FLA_ENABLE_SUPERMATRIX
   // Inner regions only unwind the stack.
   if ( flash_queue_stack != 0 )
      return;

   // Outermost region closed: run everything that was enqueued.
   FLASH_Queue_exec();

   // Turn the saved start time into the elapsed total.
   flash_queue_total_time = FLA_Clock() - flash_queue_total_time;
#endif
}
void FLASH_Queue_exec( void )
References FLASH_Queue_variables::all_lock, FLA_Obj_view::base, FLA_Obj_gpu_struct::buffer_gpu, FLASH_Queue_variables::cac_lock, FLASH_Queue_variables::cache, FLA_Obj_gpu_struct::clean, FLASH_Queue_variables::dep_lock, FLA_Clock(), FLA_free(), FLA_is_owner(), FLA_Lock_destroy(), FLA_Lock_init(), FLA_malloc(), FLA_shfree(), FLA_shmalloc(), FLASH_Queue_exec_parallel(), FLASH_Queue_exec_parallel_function(), FLASH_Queue_exec_simulation(), FLASH_Queue_get_block_size(), FLASH_Queue_get_cache_size(), FLASH_Queue_get_caching(), FLASH_Queue_get_cores_per_cache(), FLASH_Queue_get_cores_per_queue(), FLASH_Queue_get_data_affinity(), FLASH_Queue_get_gpu_num_blocks(), FLASH_Queue_get_num_tasks(), FLASH_Queue_get_num_threads(), FLASH_Queue_get_verbose_output(), FLASH_Queue_get_work_stealing(), FLASH_Queue_init_tasks(), FLASH_Queue_reset(), FLASH_Queue_set_caching(), FLASH_Queue_set_data_affinity(), FLASH_Queue_set_parallel_time(), FLASH_Queue_set_work_stealing(), FLASH_Queue_verbose_output(), FLASH_Task_free(), FLASH_Queue_variables::gpu, FLASH_Queue_variables::gpu_lock, FLASH_Queue_variables::gpu_log, FLASH_Queue_s::head, FLASH_Queue_variables::n_caches, FLASH_Queue_variables::n_queues, FLASH_Queue_variables::n_ready, FLASH_Queue_s::n_tasks, FLASH_Queue_variables::n_wait, FLA_Obj_gpu_struct::obj, FLASH_Queue_variables::pc, FLASH_Queue_variables::prefetch, RCCE_wtime(), FLA_Obj_gpu_struct::request, FLASH_Queue_variables::run_lock, FLASH_Queue_variables::size, Synch_all(), FLASH_Queue_s::tail, FLASH_Queue_variables::task_queue, FLASH_Queue_variables::victim, FLASH_Queue_variables::wait_queue, and FLASH_Queue_variables::war_lock.
Referenced by FLASH_Queue_end().
{
   // Executes all currently enqueued tasks: sets up the bookkeeping arrays,
   // initializes the tasks, runs the worker function, records the elapsed
   // parallel time, then frees everything and resets the queue.
   // Returns immediately when no tasks are enqueued.
   int             n_tasks = FLASH_Queue_get_num_tasks();
   int             idx;
   double          t_start;
   FLASH_Queue_vars args;   // State shared with the SuperMatrix mechanism.

   // Nothing enqueued, nothing to do.
   if ( n_tasks == 0 )
      return;

   // Multiple-queue features and cache affinity are switched off here.
   FLASH_Queue_set_data_affinity( FLASH_QUEUE_AFFINITY_NONE );
   FLASH_Queue_set_work_stealing( FALSE );
   FLASH_Queue_set_caching( FALSE );

   // Bookkeeping storage: the task array comes from FLA_malloc(), the
   // remaining arrays and counters from FLA_shmalloc().
   args.task_queue = ( FLASH_Task** ) FLA_malloc( n_tasks * sizeof( FLASH_Task* ) );
   args.n_ready    = ( int* ) FLA_shmalloc( n_tasks * sizeof( int ) );
   args.wait_queue = ( int* ) FLA_shmalloc( n_tasks * sizeof( int ) );
   args.n_wait     = ( int* ) FLA_shmalloc( sizeof( int ) );
   args.pc         = ( int* ) FLA_shmalloc( sizeof( int ) );

   // Only the owner zeroes the two counters; everyone then synchronizes.
   if ( FLA_is_owner() )
   {
      *args.n_wait = 0;
      *args.pc     = 0;
   }
   Synch_all();

   // Fill in the per-task information needed for execution.
   FLASH_Queue_init_tasks( ( void* ) &args );

   // Verbose output has to be produced while the tasks still exist.
   if ( FLASH_Queue_get_verbose_output() )
      FLASH_Queue_verbose_output();

   // Time the execution of the worker function.
   t_start = RCCE_wtime();
   FLASH_Queue_exec_parallel_function( ( void* ) &args );
   FLASH_Queue_set_parallel_time( RCCE_wtime() - t_start );

   // Release the tasks one by one.
   for ( idx = 0; idx < n_tasks; idx++ )
      FLASH_Task_free( args.task_queue[idx] );

   // Release the bookkeeping storage.
   FLA_free( args.task_queue );
   FLA_shfree( args.n_ready );
   FLA_shfree( args.wait_queue );
   FLA_shfree( args.n_wait );
   FLA_shfree( args.pc );

   // Leave the queue ready for the next call to FLASH_Queue_exec().
   FLASH_Queue_reset();
}
FLA_Bool FLASH_Queue_exec_gpu( FLASH_Task* t, void* arg )
References FLA_Obj_view::base, FLASH_Task_s::enabled_gpu, FLA_free(), FLA_Lock_acquire(), FLA_Lock_release(), FLA_malloc(), FLA_Obj_col_stride(), FLA_Obj_elemtype(), FLA_Obj_length(), FLA_Obj_width(), FLASH_Queue_check_gpu(), FLASH_Queue_exec_task(), FLASH_Queue_exec_task_gpu(), FLASH_Queue_flush_block_gpu(), FLASH_Queue_get_enabled_gpu(), FLASH_Queue_get_num_threads(), FLASH_Queue_invalidate_block_gpu(), FLASH_Queue_mark_gpu(), FLASH_Queue_update_gpu(), FLASH_Queue_wait_enqueue(), FLASH_Task_s::hit, FLASH_Task_s::input_arg, FLASH_Task_s::n_input_args, FLASH_Task_s::n_output_args, FLASH_Task_s::output_arg, FLASH_Task_s::queue, FLASH_Queue_variables::run_lock, and FLASH_Task_s::thread.
Referenced by FLASH_Queue_exec_parallel_function().
{
// Executes task t, on the GPU when possible and on the CPU otherwise.
//   t   - task to execute; a NULL task is accepted and treated as done.
//   arg - opaque pointer to the shared FLASH_Queue_vars, forwarded to the
//         GPU helper routines.
// Returns TRUE when the task was executed (or t was NULL), and FALSE when
// FLASH_Queue_check_gpu() reported FALSE and the task was put back on the
// waiting queue instead.
void** input_arg;
void** output_arg;
// Nothing to execute.
if ( t == NULL )
return TRUE;
// If not using the GPU, then execute on CPU.
if ( !FLASH_Queue_get_enabled_gpu() )
{
FLASH_Queue_exec_task( t );
return TRUE;
}
// Check if all the operands are ready and up to date.
if ( !FLASH_Queue_check_gpu( t, arg ) )
{
FLASH_Queue_vars* args = ( FLASH_Queue_vars* ) arg;
int queue = t->queue;
// Record on the task that this attempt did not go through.
t->hit = FALSE;
#ifdef FLA_ENABLE_MULTITHREADING
FLA_Lock_acquire( &(args->run_lock[queue]) ); // R ***
#endif
// Reenqueue the task if the blocks are not all flushed.
FLASH_Queue_wait_enqueue( t, arg );
#ifdef FLA_ENABLE_MULTITHREADING
FLA_Lock_release( &(args->run_lock[queue]) ); // R ***
#endif
return FALSE;
}
// If GPU is enabled, but the task is not supported for GPU execution.
if ( !t->enabled_gpu )
{
int i, j, k;
int thread = t->thread;
int n_input_args = t->n_input_args;
int n_output_args = t->n_output_args;
int n_threads = FLASH_Queue_get_num_threads();
FLA_Bool duplicate;
FLA_Obj obj;
// Check the blocks on each GPU.
for ( k = 0; k < n_threads; k++ )
{
// Check the input and output arguments on the GPUs.
// Indices below n_input_args are inputs; the remainder are outputs.
for ( i = 0; i < n_input_args + n_output_args; i++ )
{
// Check for duplicate blocks.
duplicate = FALSE;
// Find the correct input or output argument.
if ( i < n_input_args )
{
obj = t->input_arg[i];
// An input is a duplicate if any output shares its base ...
for ( j = 0; j < n_output_args && !duplicate; j++ )
{
if ( obj.base == t->output_arg[j].base )
duplicate = TRUE;
}
// ... or if an earlier input shares its base.
for ( j = 0; j < i && !duplicate; j++ )
{
if ( obj.base == t->input_arg[j].base )
duplicate = TRUE;
}
}
else
{
obj = t->output_arg[i - n_input_args];
// An output is a duplicate if an earlier output shares its base.
for ( j = 0; j < i - n_input_args && !duplicate; j++ )
{
if ( obj.base == t->output_arg[j].base )
duplicate = TRUE;
}
}
// If the block has not been processed before.
if ( !duplicate )
{
// Macroblock is used.
if ( FLA_Obj_elemtype( obj ) == FLA_MATRIX )
{
dim_t jj, kk;
dim_t m = FLA_Obj_length( obj );
dim_t n = FLA_Obj_width( obj );
dim_t cs = FLA_Obj_col_stride( obj );
FLA_Obj* buf = FLASH_OBJ_PTR_AT( obj );
// Clear each block in macroblock.
for ( jj = 0; jj < n; jj++ )
{
for ( kk = 0; kk < m; kk++ )
{
// obj is reused for the contained block; buf was taken above.
obj = *( buf + jj * cs + kk );
// Flush the block to main memory if it is on the GPU.
// The flush is only issued for the task's own thread.
if ( k == thread )
FLASH_Queue_flush_block_gpu( obj, k, arg );
// Invalidate output block on all GPUs.
if ( i >= n_input_args )
FLASH_Queue_invalidate_block_gpu( obj, k, arg );
}
}
}
else
{
// Flush the block to main memory if it is on the GPU.
// The flush is only issued for the task's own thread.
if ( k == thread )
FLASH_Queue_flush_block_gpu( obj, k, arg );
// Invalidate output block on all GPUs.
if ( i >= n_input_args )
FLASH_Queue_invalidate_block_gpu( obj, k, arg );
}
}
}
}
// Execute the task on CPU instead of GPU.
FLASH_Queue_exec_task( t );
return TRUE;
}
// Gather the pointers for the data on the GPU.
// One pointer slot per input argument and per output argument.
input_arg = ( void** ) FLA_malloc( t->n_input_args * sizeof( void* ) );
output_arg = ( void** ) FLA_malloc( t->n_output_args * sizeof( void* ) );
// Bring all the blocks to GPU.
FLASH_Queue_update_gpu( t, input_arg, output_arg, arg );
// Execute the task on GPU.
FLASH_Queue_exec_task_gpu( t, input_arg, output_arg );
// Mark all the output blocks as dirty.
FLASH_Queue_mark_gpu( t, arg );
// Free memory.
FLA_free( input_arg );
FLA_free( output_arg );
return TRUE;
}
void FLASH_Queue_exec_parallel( void* arg )
References FLA_Check_pthread_create_result(), FLA_Check_pthread_join_result(), FLA_free(), FLA_malloc(), FLASH_Queue_exec_parallel_function(), and FLASH_Queue_get_num_threads().
Referenced by FLASH_Queue_exec().
{
// Runs FLASH_Queue_exec_parallel_function() once per thread, handing each
// invocation its own FLASH_Thread descriptor (id plus the shared arg).
//   arg - opaque pointer stored in every descriptor's args field.
// The threading mechanism is chosen at compile time through
// FLA_MULTITHREADING_MODEL; if it matches neither FLA_OPENMP nor
// FLA_PTHREADS, the descriptors are built and freed without any call to
// the worker function.
int i;
int n_threads = FLASH_Queue_get_num_threads();
void* (*thread_entry_point)( void* );
// Allocate the thread structures array. Here, an array of FLASH_Thread
// structures of length n_threads is allocated and the fields of each
// structure set to appropriate values.
FLASH_Thread* thread = ( FLASH_Thread* ) FLA_malloc( n_threads * sizeof( FLASH_Thread ) );
// Initialize the thread structures array.
for ( i = 0; i < n_threads; i++ )
{
// Save the thread's identifier.
thread[i].id = i;
// Save the pointer to the necessary variables with the thread.
thread[i].args = arg;
// The pthread object, if it was even compiled into the FLASH_Thread
// structure, will be initialized by the pthread implementation when we
// call pthread_create() and does not need to be touched at this time.
}
// Determine which function to send threads to.
thread_entry_point = FLASH_Queue_exec_parallel_function;
#if FLA_MULTITHREADING_MODEL == FLA_OPENMP
// An OpenMP parallel for region spawns n_threads threads. Each thread
// executes the work function with a different FLASH_Thread argument.
// An implicit synchronization point exists at the end of the curly
// brace scope.
#pragma omp parallel for \
private( i ) \
shared( thread, n_threads, thread_entry_point ) \
schedule( static, 1 ) \
num_threads( n_threads )
for ( i = 0; i < n_threads; ++i )
{
thread_entry_point( ( void* ) &thread[i] );
}
#elif FLA_MULTITHREADING_MODEL == FLA_PTHREADS
// Create each POSIX thread needed in addition to the main thread.
// Index 0 is skipped here because the calling thread takes that role below.
for ( i = 1; i < n_threads; i++ )
{
int pthread_e_val;
// Create thread i with default attributes.
pthread_e_val = pthread_create( &(thread[i].pthread_obj),
NULL,
thread_entry_point,
( void* ) &thread[i] );
#ifdef FLA_ENABLE_INTERNAL_ERROR_CHECKING
// The creation result is only examined when internal error checking
// is compiled in.
FLA_Error e_val = FLA_Check_pthread_create_result( pthread_e_val );
FLA_Check_error_code( e_val );
#endif
}
// The main thread is assigned the role of thread 0. Here we manually
// execute it as a worker thread.
thread_entry_point( ( void* ) &thread[0] );
// Wait for non-main threads to finish.
for ( i = 1; i < n_threads; i++ )
{
// These two variables are declared local to this for loop since this
// is the only place they are needed, and since they would show up as
// unused variables if they were declared at function scope and the
// pthreads model were not the one selected.
// Strangely, the Intel compiler produces code that results in an
// "unaligned access" runtime message if thread_status is declared as
// an int. Declaring it as a long or void* appears to force the
// compiler (not surprisingly) into aligning it to an 8-byte boundary.
int pthread_e_val;
void* thread_status;
// Wait for thread i to invoke its respective pthread_exit().
// The return value passed to pthread_exit() is provided to us
// via status, if one was given.
pthread_e_val = pthread_join( thread[i].pthread_obj,
( void** ) &thread_status );
#ifdef FLA_ENABLE_INTERNAL_ERROR_CHECKING
// The join result is only examined when internal error checking is
// compiled in.
FLA_Error e_val = FLA_Check_pthread_join_result( pthread_e_val );
FLA_Check_error_code( e_val );
#endif
}
#endif
// All workers are done (or none were started); release the descriptors.
FLA_free( thread );
return;
}
void* FLASH_Queue_exec_parallel_function( void* arg )
References FLASH_Thread_s::args, FLASH_Task_s::cache, FLA_Lock_acquire(), FLA_Lock_release(), FLASH_Queue_create_gpu(), FLASH_Queue_destroy_gpu(), FLASH_Queue_exec_gpu(), FLASH_Queue_exec_task(), FLASH_Queue_flush_gpu(), FLASH_Queue_get_caching(), FLASH_Queue_get_cores_per_cache(), FLASH_Queue_get_enabled_gpu(), FLASH_Queue_get_num_tasks(), FLASH_Queue_get_num_threads(), FLASH_Queue_get_work_stealing(), FLASH_Queue_prefetch(), FLASH_Queue_update_cache(), FLASH_Queue_wait_dequeue(), FLASH_Queue_work_stealing(), FLASH_Task_free_parallel(), FLASH_Task_update_dependencies(), FLASH_Thread_s::id, FLASH_Queue_variables::n_queues, RCCE_acquire_lock(), RCCE_release_lock(), RCCE_ue(), and FLASH_Task_s::thread.
Referenced by FLASH_Queue_exec(), and FLASH_Queue_exec_parallel().
{
   // Worker loop: repeatedly takes a task from the waiting queue, executes
   // it and updates its dependents, until the shared program counter
   // reaches the number of tasks.
   //   arg - opaque pointer to the shared FLASH_Queue_vars.
   // Always returns NULL.
   FLASH_Queue_vars* args      = ( FLASH_Queue_vars* ) arg;
   int               i         = RCCE_ue();
   int               queue     = 0;
   int               cache     = 0;
   int               n_tasks   = FLASH_Queue_get_num_tasks();
   int               n_threads = FLASH_Queue_get_num_threads();
   FLASH_Task*       t         = NULL;

   // Cores whose id is not below the thread count never enter the loop.
   FLA_Bool condition = ( i < n_threads ) ? TRUE : FALSE;

   while ( condition )
   {
      // Take a task from the waiting queue while holding lock 0.
      RCCE_acquire_lock( 0 );
      t = FLASH_Queue_wait_dequeue( queue, cache, ( void* ) args );
      RCCE_release_lock( 0 );

      if ( t != NULL )
      {
         // Remember which thread and cache ran the task.
         t->thread = i;
         t->cache  = cache;

         // Run it, then update the tasks that depend on it.
         FLASH_Queue_exec_task( t );
         FLASH_Task_update_dependencies( t, ( void* ) args );
      }

      // Read the shared program counter while holding lock 0 and stop
      // once every task has been accounted for.
      RCCE_acquire_lock( 0 );
      if ( args->pc[0] >= n_tasks )
         condition = FALSE;
      RCCE_release_lock( 0 );
   }

   return ( void* ) NULL;
}
void FLASH_Queue_exec_simulation( void* arg )
References FLASH_Task_s::cache, FLASH_Task_s::dep_arg_head, FLA_free(), FLA_malloc(), FLASH_Queue_exec_task(), FLASH_Queue_get_cores_per_cache(), FLASH_Queue_get_num_tasks(), FLASH_Queue_get_num_threads(), FLASH_Queue_get_verbose_output(), FLASH_Queue_prefetch(), FLASH_Queue_update_cache(), FLASH_Queue_wait_dequeue(), FLASH_Queue_wait_enqueue(), FLASH_Task_free(), FLASH_Task_s::n_dep_args, FLASH_Queue_variables::n_queues, FLASH_Task_s::n_ready, FLASH_Task_s::name, FLASH_Dep_s::next_dep, FLASH_Queue_variables::pc, FLASH_Dep_s::task, and FLASH_Task_s::thread.
Referenced by FLASH_Queue_exec().
{
// Executes the enqueued tasks sequentially in "stages": in each stage at
// most one task per simulated thread is dequeued and executed, and the
// tasks that finished in the previous stage release their dependents.
//   arg - opaque pointer to the shared FLASH_Queue_vars.
// When the verbose setting evaluates to zero, one row per stage is printed
// with the stage number followed by the name of the task each thread ran.
FLASH_Queue_vars* args = ( FLASH_Queue_vars* ) arg;
int i, j;
int queue;
int cache;
int n_stages = 0;
int n_queues = args->n_queues;
int n_tasks = FLASH_Queue_get_num_tasks();
int n_threads = FLASH_Queue_get_num_threads();
int n_cores = FLASH_Queue_get_cores_per_cache();
FLASH_Verbose verbose = FLASH_Queue_get_verbose_output();
FLASH_Task* task;
FLASH_Task* t;
FLASH_Dep* d;
// An array to hold tasks to be executed during the simulation.
// It is heap-allocated for Windows builds and a variable-length array
// otherwise.
#ifdef FLA_ENABLE_WINDOWS_BUILD
FLASH_Task** exec_array = ( FLASH_Task** ) FLA_malloc( n_threads * sizeof( FLASH_Task* ) );
#else
FLASH_Task* exec_array[n_threads];
#endif
for ( i = 0; i < n_threads; i++ )
{
// Initialize all exec_array to NULL.
exec_array[i] = NULL;
// Prefetch blocks into the cache before execution.
// Only the first thread of each group of n_cores threads does this.
if ( i % n_cores == 0 )
FLASH_Queue_prefetch( i, arg );
}
// Loop until all the tasks have committed.
// NOTE(review): pc is compared and incremented as a scalar in this
// function, while FLASH_Queue_exec() in this listing indexes it as pc[0];
// presumably the two come from different build variants - confirm.
while ( args->pc < n_tasks )
{
// Phase 1: retire the tasks executed in the previous stage.
for ( i = 0; i < n_threads; i++ )
{
// Update waiting queue with ready tasks.
t = exec_array[i];
if ( t != NULL )
{
// Check each dependent task.
d = t->dep_arg_head;
for ( j = 0; j < t->n_dep_args; j++ )
{
task = d->task;
// One fewer outstanding dependency for this dependent.
task->n_ready--;
// Place newly ready tasks on waiting queue.
if ( task->n_ready == 0 )
{
FLASH_Queue_wait_enqueue( task, arg );
}
// Go to the next dep.
d = d->next_dep;
}
// Free the task.
FLASH_Task_free( t );
}
}
n_stages++;
if ( !verbose )
printf( "%7d", n_stages );
// Phase 2: pick at most one task per thread for this stage.
// Move ready tasks from the waiting queue to execution queue.
for ( i = 0; i < n_threads; i++ )
{
// Determine to which queue this thread belongs.
// NOTE(review): n_threads / n_queues is an integer division that yields
// 0 when n_queues > n_threads, which would make this a division by
// zero - confirm that callers guarantee n_queues <= n_threads.
queue = i / ( n_threads / n_queues );
// Determine to which cache this thread belongs.
cache = i / n_cores;
// Dequeue a task.
t = FLASH_Queue_wait_dequeue( queue, cache, arg );
// Save the task for execution.
// A NULL entry means this thread has nothing to run in this stage.
exec_array[i] = t;
if ( t != NULL )
{
// Save the thread and cache that executes the task.
t->thread = i;
t->cache = cache;
// Increment program counter.
args->pc++;
}
}
// Phase 3: run what was picked.
// Execute independent tasks.
for ( i = 0; i < n_threads; i++ )
{
t = exec_array[i];
// t may be NULL here; FLASH_Queue_exec_task() returns early for a NULL task.
FLASH_Queue_update_cache( t, arg );
FLASH_Queue_exec_task( t );
if ( !verbose )
printf( "%7s", ( t == NULL ? " " : t->name ) );
// Free the task if this is the last stage.
// The loop will not run again, so phase 1 would never free it.
if ( args->pc == n_tasks && t != NULL )
FLASH_Task_free( t );
}
if ( !verbose )
printf( "\n" );
}
if ( !verbose )
printf( "\n" );
#ifdef FLA_ENABLE_WINDOWS_BUILD
FLA_free( exec_array );
#endif
return;
}
void FLASH_Queue_exec_task( FLASH_Task* t )
References FLASH_Task_s::cntl, FLA_Apply_CAQ2_UT_task(), FLA_Apply_pivots_macro_task(), FLA_Apply_Q2_UT_task(), FLA_Apply_Q_UT_task(), FLA_Apply_QUD_UT_task(), FLASH_Task_s::fla_arg, FLA_Axpy_task(), FLA_Axpyt_task(), FLA_CAQR2_UT_task(), FLA_Chol_task(), FLA_Copy_task(), FLA_Copyr_task(), FLA_Copyt_task(), FLA_Eig_gest_task(), FLA_Gemm_task(), FLA_Gemv_task(), FLA_Hemm_task(), FLA_Her2k_task(), FLA_Herk_task(), FLA_LQ_UT_macro_task(), FLA_LU_nopiv_task(), FLA_LU_piv_copy_task(), FLA_LU_piv_macro_task(), FLA_LU_piv_task(), FLA_Lyap_task(), FLA_Obj_create_buffer_task(), FLA_Obj_free_buffer_task(), FLA_QR2_UT_task(), FLA_QR_UT_copy_task(), FLA_QR_UT_macro_task(), FLA_QR_UT_task(), FLA_SA_FS_task(), FLA_SA_LU_task(), FLA_Scal_task(), FLA_Scalr_task(), FLA_Sylv_task(), FLA_Symm_task(), FLA_Syr2k_task(), FLA_Syrk_task(), FLA_Trinv_task(), FLA_Trmm_task(), FLA_Trsm_piv_task(), FLA_Trsm_task(), FLA_Trsv_task(), FLA_Ttmm_task(), FLA_UDdate_UT_task(), FLASH_Task_s::func, FLASH_Task_s::input_arg, FLASH_Task_s::int_arg, and FLASH_Task_s::output_arg.
Referenced by FLASH_Queue_exec_gpu(), FLASH_Queue_exec_parallel_function(), and FLASH_Queue_exec_simulation().
{
// Define local function pointer types.
// LAPACK-level
typedef FLA_Error(*flash_lu_piv_macro_p)(FLA_Obj A, FLA_Obj p, fla_lu_t* cntl );
typedef FLA_Error(*flash_apply_pivots_macro_p)(FLA_Side side, FLA_Trans trans, FLA_Obj p, FLA_Obj A, fla_appiv_t* cntl);
typedef FLA_Error(*flash_lu_piv_p)(FLA_Obj A, FLA_Obj p, fla_lu_t* cntl);
typedef FLA_Error(*flash_lu_piv_copy_p)(FLA_Obj A, FLA_Obj p, FLA_Obj U, fla_lu_t* cntl);
typedef FLA_Error(*flash_trsm_piv_p)(FLA_Obj A, FLA_Obj C, FLA_Obj p, fla_trsm_t* cntl);
typedef FLA_Error(*flash_sa_lu_p)(FLA_Obj U, FLA_Obj D, FLA_Obj p, FLA_Obj L, int nb_alg, fla_lu_t* cntl);
typedef FLA_Error(*flash_sa_fs_p)(FLA_Obj L, FLA_Obj D, FLA_Obj p, FLA_Obj C, FLA_Obj E, int nb_alg, fla_gemm_t* cntl);
typedef FLA_Error(*flash_lu_nopiv_p)(FLA_Obj A, fla_lu_t* cntl);
typedef FLA_Error(*flash_trinv_p)(FLA_Uplo uplo, FLA_Diag diag, FLA_Obj A, fla_trinv_t* cntl);
typedef FLA_Error(*flash_ttmm_p)(FLA_Uplo uplo, FLA_Obj A, fla_ttmm_t* cntl);
typedef FLA_Error(*flash_chol_p)(FLA_Uplo uplo, FLA_Obj A, fla_chol_t* cntl);
typedef FLA_Error(*flash_sylv_p)(FLA_Trans transa, FLA_Trans transb, FLA_Obj isgn, FLA_Obj A, FLA_Obj B, FLA_Obj C, FLA_Obj scale, fla_sylv_t* cntl);
typedef FLA_Error(*flash_lyap_p)(FLA_Trans trans, FLA_Obj isgn, FLA_Obj A, FLA_Obj C, FLA_Obj scale, fla_lyap_t* cntl);
typedef FLA_Error(*flash_qrut_macro_p)(FLA_Obj A, FLA_Obj T, fla_qrut_t* cntl);
typedef FLA_Error(*flash_qrut_p)(FLA_Obj A, FLA_Obj T, fla_qrut_t* cntl);
typedef FLA_Error(*flash_qrutc_p)(FLA_Obj A, FLA_Obj T, FLA_Obj U, fla_qrut_t* cntl);
typedef FLA_Error(*flash_qr2ut_p)(FLA_Obj B, FLA_Obj D, FLA_Obj T, fla_qr2ut_t* cntl);
typedef FLA_Error(*flash_lqut_macro_p)(FLA_Obj A, FLA_Obj T, fla_lqut_t* cntl);
typedef FLA_Error(*flash_caqr2ut_p)(FLA_Obj B, FLA_Obj D, FLA_Obj T, fla_caqr2ut_t* cntl);
typedef FLA_Error(*flash_uddateut_p)(FLA_Obj R, FLA_Obj C, FLA_Obj D, FLA_Obj T, fla_uddateut_t* cntl);
typedef FLA_Error(*flash_apqut_p)(FLA_Side side, FLA_Trans trans, FLA_Direct direct, FLA_Store storev, FLA_Obj A, FLA_Obj T, FLA_Obj W, FLA_Obj B, fla_apqut_t* cntl);
typedef FLA_Error(*flash_apq2ut_p)(FLA_Side side, FLA_Trans trans, FLA_Direct direct, FLA_Store storev, FLA_Obj D, FLA_Obj T, FLA_Obj W, FLA_Obj C, FLA_Obj E, fla_apq2ut_t* cntl);
typedef FLA_Error(*flash_apcaq2ut_p)(FLA_Side side, FLA_Trans trans, FLA_Direct direct, FLA_Store storev, FLA_Obj D, FLA_Obj T, FLA_Obj W, FLA_Obj C, FLA_Obj E, fla_apcaq2ut_t* cntl);
typedef FLA_Error(*flash_apqudut_p)(FLA_Side side, FLA_Trans trans, FLA_Direct direct, FLA_Store storev, FLA_Obj T, FLA_Obj W, FLA_Obj R, FLA_Obj U, FLA_Obj C, FLA_Obj V, FLA_Obj D, fla_apqudut_t* cntl);
typedef FLA_Error(*flash_eig_gest_p)(FLA_Inv inv, FLA_Uplo uplo, FLA_Obj A, FLA_Obj Y, FLA_Obj B, fla_eig_gest_t* cntl);
// Level-3 BLAS
typedef FLA_Error(*flash_gemm_p)(FLA_Trans transa, FLA_Trans transb, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_gemm_t* cntl);
typedef FLA_Error(*flash_hemm_p)(FLA_Side side, FLA_Uplo uplo, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_hemm_t* cntl);
typedef FLA_Error(*flash_herk_p)(FLA_Uplo uplo, FLA_Trans transa, FLA_Obj alpha, FLA_Obj A, FLA_Obj beta, FLA_Obj C, fla_herk_t* cntl);
typedef FLA_Error(*flash_her2k_p)(FLA_Uplo uplo, FLA_Trans transa, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_her2k_t* cntl);
typedef FLA_Error(*flash_symm_p)(FLA_Side side, FLA_Uplo uplo, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_symm_t* cntl);
typedef FLA_Error(*flash_syrk_p)(FLA_Uplo uplo, FLA_Trans transa, FLA_Obj alpha, FLA_Obj A, FLA_Obj beta, FLA_Obj C, fla_syrk_t* cntl);
typedef FLA_Error(*flash_syr2k_p)(FLA_Uplo uplo, FLA_Trans transa, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, FLA_Obj beta, FLA_Obj C, fla_syr2k_t* cntl);
typedef FLA_Error(*flash_trmm_p)(FLA_Side side, FLA_Uplo uplo, FLA_Trans trans, FLA_Diag diag, FLA_Obj alpha, FLA_Obj A, FLA_Obj C, fla_trmm_t* cntl);
typedef FLA_Error(*flash_trsm_p)(FLA_Side side, FLA_Uplo uplo, FLA_Trans trans, FLA_Diag diag, FLA_Obj alpha, FLA_Obj A, FLA_Obj C, fla_trsm_t* cntl);
// Level-2 BLAS
typedef FLA_Error(*flash_gemv_p)(FLA_Trans transa, FLA_Obj alpha, FLA_Obj A, FLA_Obj x, FLA_Obj beta, FLA_Obj y, fla_gemv_t* cntl);
typedef FLA_Error(*flash_trsv_p)(FLA_Uplo uplo, FLA_Trans trans, FLA_Diag diag, FLA_Obj A, FLA_Obj x, fla_trsv_t* cntl);
// Level-1 BLAS
typedef FLA_Error(*flash_axpy_p)(FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_axpy_t* cntl);
typedef FLA_Error(*flash_axpyt_p)(FLA_Trans trans, FLA_Obj alpha, FLA_Obj A, FLA_Obj B, fla_axpyt_t* cntl);
typedef FLA_Error(*flash_copy_p)(FLA_Obj A, FLA_Obj B, fla_copy_t* cntl);
typedef FLA_Error(*flash_copyt_p)(FLA_Trans trans, FLA_Obj A, FLA_Obj B, fla_copyt_t* cntl);
typedef FLA_Error(*flash_copyr_p)(FLA_Uplo uplo, FLA_Obj A, FLA_Obj B, fla_copyr_t* cntl);
typedef FLA_Error(*flash_scal_p)(FLA_Obj alpha, FLA_Obj A, fla_scal_t* cntl);
typedef FLA_Error(*flash_scalr_p)(FLA_Uplo uplo, FLA_Obj alpha, FLA_Obj A, fla_scalr_t* cntl);
// Base
typedef FLA_Error(*flash_obj_create_buffer_p)(dim_t rs, dim_t cs, FLA_Obj A, void* cntl);
typedef FLA_Error(*flash_obj_free_buffer_p)(FLA_Obj A, void* cntl);
// Only execute task if it is not NULL.
if ( t == NULL )
return;
// Now "switch" between the various possible task functions.
// FLA_LU_piv_macro
if ( t->func == (void *) FLA_LU_piv_macro_task )
{
flash_lu_piv_macro_p func;
func = (flash_lu_piv_macro_p) t->func;
func( t->output_arg[0],
t->output_arg[1],
( fla_lu_t* ) t->cntl );
}
// FLA_Apply_pivots_macro
else if ( t->func == (void *) FLA_Apply_pivots_macro_task )
{
flash_apply_pivots_macro_p func;
func = (flash_apply_pivots_macro_p) t->func;
func( ( FLA_Side ) t->int_arg[0],
( FLA_Trans ) t->int_arg[1],
t->input_arg[0],
t->output_arg[0],
( fla_appiv_t* ) t->cntl );
}
// FLA_LU_piv
else if ( t->func == (void *) FLA_LU_piv_task )
{
flash_lu_piv_p func;
func = (flash_lu_piv_p) t->func;
func( t->output_arg[0],
t->fla_arg[0],
( fla_lu_t* ) t->cntl );
}
// FLA_LU_piv_copy
else if ( t->func == (void *) FLA_LU_piv_copy_task )
{
flash_lu_piv_copy_p func;
func = (flash_lu_piv_copy_p) t->func;
func( t->output_arg[0],
t->fla_arg[0],
t->output_arg[1],
( fla_lu_t* ) t->cntl );
}
// FLA_Trsm_piv
else if ( t->func == (void *) FLA_Trsm_piv_task )
{
flash_trsm_piv_p func;
func = (flash_trsm_piv_p) t->func;
func( t->input_arg[0],
t->output_arg[0],
t->fla_arg[0],
( fla_trsm_t* ) t->cntl );
}
// FLA_SA_LU
else if ( t->func == (void *) FLA_SA_LU_task )
{
flash_sa_lu_p func;
func = (flash_sa_lu_p) t->func;
func( t->output_arg[1],
t->output_arg[0],
t->fla_arg[0],
t->fla_arg[1],
t->int_arg[0],
( fla_lu_t* ) t->cntl );
}
// FLA_SA_FS
else if ( t->func == (void *) FLA_SA_FS_task )
{
flash_sa_fs_p func;
func = (flash_sa_fs_p) t->func;
func( t->fla_arg[0],
t->input_arg[0],
t->fla_arg[1],
t->output_arg[1],
t->output_arg[0],
t->int_arg[0],
( fla_gemm_t* ) t->cntl );
}
// FLA_LU_nopiv
else if ( t->func == (void *) FLA_LU_nopiv_task )
{
flash_lu_nopiv_p func;
func = (flash_lu_nopiv_p) t->func;
func( t->output_arg[0],
( fla_lu_t* ) t->cntl );
}
// FLA_Trinv
else if ( t->func == (void *) FLA_Trinv_task )
{
flash_trinv_p func;
func = (flash_trinv_p) t->func;
func( ( FLA_Uplo ) t->int_arg[0],
( FLA_Diag ) t->int_arg[1],
t->output_arg[0],
( fla_trinv_t* ) t->cntl );
}
// FLA_Ttmm
else if ( t->func == (void *) FLA_Ttmm_task )
{
flash_ttmm_p func;
func = (flash_ttmm_p) t->func;
func( ( FLA_Uplo ) t->int_arg[0],
t->output_arg[0],
( fla_ttmm_t* ) t->cntl );
}
// FLA_Chol
else if ( t->func == (void *) FLA_Chol_task )
{
flash_chol_p func;
func = (flash_chol_p) t->func;
func( ( FLA_Uplo ) t->int_arg[0],
t->output_arg[0],
( fla_chol_t* ) t->cntl );
}
// FLA_Sylv
else if ( t->func == (void *) FLA_Sylv_task )
{
flash_sylv_p func;
func = (flash_sylv_p) t->func;
func( ( FLA_Trans ) t->int_arg[0],
( FLA_Trans ) t->int_arg[1],
t->fla_arg[0],
t->input_arg[0],
t->input_arg[1],
t->output_arg[0],
t->fla_arg[1],
( fla_sylv_t* ) t->cntl );
}
// FLA_Lyap
else if ( t->func == (void *) FLA_Lyap_task )
{
flash_lyap_p func;
func = (flash_lyap_p) t->func;
func( ( FLA_Trans ) t->int_arg[0],
t->fla_arg[0],
t->input_arg[0],
t->output_arg[0],
t->fla_arg[1],
( fla_lyap_t* ) t->cntl );
}
// FLA_QR_UT_macro
else if ( t->func == (void *) FLA_QR_UT_macro_task )
{
flash_qrut_macro_p func;
func = (flash_qrut_macro_p) t->func;
func( t->output_arg[0],
t->output_arg[1],
( fla_qrut_t* ) t->cntl );
}
// FLA_QR_UT
else if ( t->func == (void *) FLA_QR_UT_task )
{
flash_qrut_p func;
func = (flash_qrut_p) t->func;
func( t->output_arg[0],
t->fla_arg[0],
( fla_qrut_t* ) t->cntl );
}
// FLA_QR_UT_copy
else if ( t->func == (void *) FLA_QR_UT_copy_task )
{
flash_qrutc_p func;
func = (flash_qrutc_p) t->func;
func( t->output_arg[0],
t->fla_arg[0],
t->output_arg[1],
( fla_qrut_t* ) t->cntl );
}
// FLA_QR2_UT
else if ( t->func == (void *) FLA_QR2_UT_task )
{
flash_qr2ut_p func;
func = (flash_qr2ut_p) t->func;
func( t->output_arg[1],
t->output_arg[0],
t->fla_arg[0],
( fla_qr2ut_t* ) t->cntl );
}
// FLA_LQ_UT_macro
else if ( t->func == (void *) FLA_LQ_UT_macro_task )
{
flash_lqut_macro_p func;
func = (flash_lqut_macro_p) t->func;
func( t->output_arg[0],
t->output_arg[1],
( fla_lqut_t* ) t->cntl );
}
// FLA_CAQR2_UT
else if ( t->func == (void *) FLA_CAQR2_UT_task )
{
flash_caqr2ut_p func;
func = (flash_caqr2ut_p) t->func;
func( t->output_arg[1],
t->output_arg[0],
t->fla_arg[0],
( fla_caqr2ut_t* ) t->cntl );
}
// FLA_UDdate_UT
else if ( t->func == (void *) FLA_UDdate_UT_task )
{
flash_uddateut_p func;
func = (flash_uddateut_p) t->func;
func( t->output_arg[0],
t->output_arg[1],
t->output_arg[2],
t->output_arg[3],
( fla_uddateut_t* ) t->cntl );
}
// FLA_Apply_Q_UT
else if ( t->func == (void *) FLA_Apply_Q_UT_task )
{
flash_apqut_p func;
func = (flash_apqut_p) t->func;
func( ( FLA_Side ) t->int_arg[0],
( FLA_Trans ) t->int_arg[1],
( FLA_Direct ) t->int_arg[2],
( FLA_Store ) t->int_arg[3],
t->input_arg[0],
t->fla_arg[0],
t->output_arg[1],
t->output_arg[0],
( fla_apqut_t* ) t->cntl );
}
// FLA_Apply_Q2_UT
else if ( t->func == (void *) FLA_Apply_Q2_UT_task )
{
flash_apq2ut_p func;
func = (flash_apq2ut_p) t->func;
func( ( FLA_Side ) t->int_arg[0],
( FLA_Trans ) t->int_arg[1],
( FLA_Direct ) t->int_arg[2],
( FLA_Store ) t->int_arg[3],
t->input_arg[0],
t->fla_arg[0],
t->output_arg[2],
t->output_arg[1],
t->output_arg[0],
( fla_apq2ut_t* ) t->cntl );
}
// FLA_Apply_CAQ2_UT
else if ( t->func == (void *) FLA_Apply_CAQ2_UT_task )
{
flash_apcaq2ut_p func;
func = (flash_apcaq2ut_p) t->func;
func( ( FLA_Side ) t->int_arg[0],
( FLA_Trans ) t->int_arg[1],
( FLA_Direct ) t->int_arg[2],
( FLA_Store ) t->int_arg[3],
t->input_arg[0],
t->fla_arg[0],
t->output_arg[2],
t->output_arg[1],
t->output_arg[0],
( fla_apcaq2ut_t* ) t->cntl );
}
// FLA_Apply_QUD_UT
else if ( t->func == (void *) FLA_Apply_QUD_UT_task )
{
flash_apqudut_p func;
func = (flash_apqudut_p) t->func;
func( ( FLA_Side ) t->int_arg[0],
( FLA_Trans ) t->int_arg[1],
( FLA_Direct ) t->int_arg[2],
( FLA_Store ) t->int_arg[3],
t->input_arg[0],
t->output_arg[0],
t->output_arg[1],
t->input_arg[1],
t->output_arg[2],
t->input_arg[2],
t->output_arg[3],
( fla_apqudut_t* ) t->cntl );
}
// FLA_Eig_gest
else if ( t->func == (void *) FLA_Eig_gest_task )
{
flash_eig_gest_p func;
func = (flash_eig_gest_p) t->func;
func( ( FLA_Inv ) t->int_arg[0],
( FLA_Uplo ) t->int_arg[1],
t->output_arg[1],
t->output_arg[0],
t->input_arg[0],
( fla_eig_gest_t* ) t->cntl );
}
// FLA_Gemm
else if ( t->func == (void *) FLA_Gemm_task )
{
flash_gemm_p func;
func = (flash_gemm_p) t->func;
func( ( FLA_Trans ) t->int_arg[0],
( FLA_Trans ) t->int_arg[1],
t->fla_arg[0],
t->input_arg[0],
t->input_arg[1],
t->fla_arg[1],
t->output_arg[0],
( fla_gemm_t* ) t->cntl );
}
// FLA_Hemm
else if ( t->func == (void *) FLA_Hemm_task )
{
flash_hemm_p func;
func = (flash_hemm_p) t->func;
func( ( FLA_Side ) t->int_arg[0],
( FLA_Uplo ) t->int_arg[1],
t->fla_arg[0],
t->input_arg[0],
t->input_arg[1],
t->fla_arg[1],
t->output_arg[0],
( fla_hemm_t* ) t->cntl );
}
// FLA_Herk
else if ( t->func == (void *) FLA_Herk_task )
{
flash_herk_p func;
func = (flash_herk_p) t->func;
func( ( FLA_Uplo ) t->int_arg[0],
( FLA_Trans ) t->int_arg[1],
t->fla_arg[0],
t->input_arg[0],
t->fla_arg[1],
t->output_arg[0],
( fla_herk_t* ) t->cntl );
}
// FLA_Her2k
else if ( t->func == (void *) FLA_Her2k_task )
{
flash_her2k_p func;
func = (flash_her2k_p) t->func;
func( ( FLA_Uplo ) t->int_arg[0],
( FLA_Trans ) t->int_arg[1],
t->fla_arg[0],
t->input_arg[0],
t->input_arg[1],
t->fla_arg[1],
t->output_arg[0],
( fla_her2k_t* ) t->cntl );
}
// FLA_Symm
else if ( t->func == (void *) FLA_Symm_task )
{
flash_symm_p func;
func = (flash_symm_p) t->func;
func( ( FLA_Side ) t->int_arg[0],
( FLA_Uplo ) t->int_arg[1],
t->fla_arg[0],
t->input_arg[0],
t->input_arg[1],
t->fla_arg[1],
t->output_arg[0],
( fla_symm_t* ) t->cntl );
}
// FLA_Syrk
else if ( t->func == (void *) FLA_Syrk_task )
{
flash_syrk_p func;
func = (flash_syrk_p) t->func;
func( ( FLA_Uplo ) t->int_arg[0],
( FLA_Trans ) t->int_arg[1],
t->fla_arg[0],
t->input_arg[0],
t->fla_arg[1],
t->output_arg[0],
( fla_syrk_t* ) t->cntl );
}
// FLA_Syr2k
else if ( t->func == (void *) FLA_Syr2k_task )
{
flash_syr2k_p func;
func = (flash_syr2k_p) t->func;
func( ( FLA_Uplo ) t->int_arg[0],
( FLA_Trans ) t->int_arg[1],
t->fla_arg[0],
t->input_arg[0],
t->input_arg[1],
t->fla_arg[1],
t->output_arg[0],
( fla_syr2k_t* ) t->cntl );
}
// FLA_Trmm
else if ( t->func == (void *) FLA_Trmm_task )
{
flash_trmm_p func;
func = (flash_trmm_p) t->func;
func( ( FLA_Side ) t->int_arg[0],
( FLA_Uplo ) t->int_arg[1],
( FLA_Trans ) t->int_arg[2],
( FLA_Diag ) t->int_arg[3],
t->fla_arg[0],
t->input_arg[0],
t->output_arg[0],
( fla_trmm_t* ) t->cntl );
}
// FLA_Trsm
else if ( t->func == (void *) FLA_Trsm_task )
{
flash_trsm_p func;
func = (flash_trsm_p) t->func;
func( ( FLA_Side ) t->int_arg[0],
( FLA_Uplo ) t->int_arg[1],
( FLA_Trans ) t->int_arg[2],
( FLA_Diag ) t->int_arg[3],
t->fla_arg[0],
t->input_arg[0],
t->output_arg[0],
( fla_trsm_t* ) t->cntl );
}
// FLA_Gemv
else if ( t->func == (void *) FLA_Gemv_task )
{
flash_gemv_p func;
func = (flash_gemv_p) t->func;
func( ( FLA_Trans ) t->int_arg[0],
t->fla_arg[0],
t->input_arg[0],
t->input_arg[1],
t->fla_arg[1],
t->output_arg[0],
( fla_gemv_t* ) t->cntl );
}
// FLA_Trsv
else if ( t->func == (void *) FLA_Trsv_task )
{
flash_trsv_p func;
func = (flash_trsv_p) t->func;
func( ( FLA_Uplo ) t->int_arg[0],
( FLA_Trans ) t->int_arg[1],
( FLA_Diag ) t->int_arg[2],
t->input_arg[0],
t->output_arg[0],
( fla_trsv_t* ) t->cntl );
}
// FLA_Axpy
else if ( t->func == (void *) FLA_Axpy_task )
{
flash_axpy_p func;
func = (flash_axpy_p) t->func;
func( t->fla_arg[0],
t->input_arg[0],
t->output_arg[0],
( fla_axpy_t* ) t->cntl );
}
// FLA_Axpyt
else if ( t->func == (void *) FLA_Axpyt_task )
{
flash_axpyt_p func;
func = (flash_axpyt_p) t->func;
func( ( FLA_Trans ) t->int_arg[0],
t->fla_arg[0],
t->input_arg[0],
t->output_arg[0],
( fla_axpyt_t* ) t->cntl );
}
// FLA_Copy
else if ( t->func == (void *) FLA_Copy_task )
{
flash_copy_p func;
func = (flash_copy_p) t->func;
func( t->input_arg[0],
t->output_arg[0],
( fla_copy_t* ) t->cntl );
}
// FLA_Copyt
else if ( t->func == (void *) FLA_Copyt_task )
{
flash_copyt_p func;
func = (flash_copyt_p) t->func;
func( ( FLA_Trans ) t->int_arg[0],
t->input_arg[0],
t->output_arg[0],
( fla_copyt_t* ) t->cntl );
}
// FLA_Copyr
else if ( t->func == (void *) FLA_Copyr_task )
{
flash_copyr_p func;
func = (flash_copyr_p) t->func;
func( ( FLA_Uplo ) t->int_arg[0],
t->input_arg[0],
t->output_arg[0],
( fla_copyr_t* ) t->cntl );
}
// FLA_Scal
else if ( t->func == (void *) FLA_Scal_task )
{
flash_scal_p func;
func = (flash_scal_p) t->func;
func( t->fla_arg[0],
t->output_arg[0],
( fla_scal_t* ) t->cntl );
}
// FLA_Scalr
else if ( t->func == (void *) FLA_Scalr_task )
{
flash_scalr_p func;
func = (flash_scalr_p) t->func;
func( ( FLA_Uplo ) t->int_arg[0],
t->fla_arg[0],
t->output_arg[0],
( fla_scalr_t* ) t->cntl );
}
// FLA_Obj_create_buffer
else if ( t->func == (void *) FLA_Obj_create_buffer_task )
{
flash_obj_create_buffer_p func;
func = (flash_obj_create_buffer_p) t->func;
func( ( dim_t ) t->int_arg[0],
( dim_t ) t->int_arg[1],
t->output_arg[0],
t->cntl );
}
// FLA_Obj_free_buffer
else if ( t->func == (void *) FLA_Obj_free_buffer_task )
{
flash_obj_free_buffer_p func;
func = (flash_obj_free_buffer_p) t->func;
func( t->output_arg[0],
t->cntl );
}
else
{
FLA_Check_error_code( FLA_NOT_YET_IMPLEMENTED );
}
return;
}
| void FLASH_Queue_finalize | ( | void | ) |
References FLASH_Queue_finalize_gpu().
Referenced by FLA_Finalize().
{
  // Tear down the queue state; a no-op when the queue was never initialized.
  if ( flash_queue_initialized != FALSE )
  {
    // Mark the queue as no longer initialized before releasing GPU state.
    flash_queue_initialized = FALSE;

#ifdef FLA_ENABLE_GPU
    FLASH_Queue_finalize_gpu();
#endif
  }

  return;
}
| void FLASH_Queue_flush_block_gpu | ( | FLA_Obj | obj, |
| int | thread, | ||
| void * | arg | ||
| ) |
References FLA_Obj_view::base, FLA_Obj_gpu_struct::buffer_gpu, FLA_Obj_gpu_struct::clean, FLA_Lock_acquire(), FLA_Lock_release(), FLASH_Queue_get_gpu_num_blocks(), FLASH_Queue_read_gpu(), FLASH_Queue_variables::gpu, FLASH_Queue_variables::gpu_lock, FLA_Obj_gpu_struct::obj, and FLA_Obj_gpu_struct::request.
Referenced by FLASH_Queue_exec_gpu().
{
  // Flush one block from this thread's GPU slots if the block is present and
  // dirty, then mark it clean. The transfer itself happens with the lock
  // released; the slot is looked up again afterwards under the lock.
  FLASH_Queue_vars* args     = ( FLASH_Queue_vars* ) arg;
  dim_t             n_blocks = FLASH_Queue_get_gpu_num_blocks();
  dim_t             first    = thread * n_blocks;   // index of this thread's first slot
  FLA_Bool          dirty    = FALSE;
  FLA_Obj_gpu       snapshot;
  int               slot;

#ifdef FLA_ENABLE_MULTITHREADING
  FLA_Lock_acquire( &(args->gpu_lock[thread]) ); // G ***
#endif

  // Search this thread's slots for the block.
  slot = 0;
  while ( slot < n_blocks && obj.base != args->gpu[first + slot].obj.base )
    slot++;

  if ( slot < n_blocks )
  {
    // Keep a copy of the slot so it can be used once the lock is dropped.
    snapshot = args->gpu[first + slot];

    // Only a valid, dirty block needs to be transferred.
    dirty = ( snapshot.obj.base != NULL && !snapshot.clean ) ? TRUE : FALSE;
  }

#ifdef FLA_ENABLE_MULTITHREADING
  FLA_Lock_release( &(args->gpu_lock[thread]) ); // G ***
#endif

  // Nothing to transfer.
  if ( !dirty )
    return;

  // Perform the transfer outside the critical section.
  FLASH_Queue_read_gpu( snapshot.obj, snapshot.buffer_gpu );

#ifdef FLA_ENABLE_MULTITHREADING
  FLA_Lock_acquire( &(args->gpu_lock[thread]) ); // G ***
#endif

  // Look the block up again before updating its bits.
  slot = 0;
  while ( slot < n_blocks && obj.base != args->gpu[first + slot].obj.base )
    slot++;

  if ( slot < n_blocks )
  {
    args->gpu[first + slot].clean   = TRUE;
    args->gpu[first + slot].request = FALSE;
  }

#ifdef FLA_ENABLE_MULTITHREADING
  FLA_Lock_release( &(args->gpu_lock[thread]) ); // G ***
#endif

  return;
}
| void FLASH_Queue_flush_gpu | ( | int | thread, |
| void * | arg | ||
| ) |
References FLA_Obj_view::base, FLA_Obj_gpu_struct::buffer_gpu, FLA_Obj_gpu_struct::clean, FLA_Lock_acquire(), FLA_Lock_release(), FLASH_Queue_get_enabled_gpu(), FLASH_Queue_get_gpu_num_blocks(), FLASH_Queue_read_gpu(), FLASH_Queue_variables::gpu, FLASH_Queue_variables::gpu_lock, FLASH_Queue_variables::gpu_log, FLA_Obj_gpu_struct::obj, and FLA_Obj_gpu_struct::request.
Referenced by FLASH_Queue_exec_parallel_function().
{
  // Flush every block in this thread's GPU slots that is valid, dirty, and
  // requested. Candidates are collected into gpu_log under the lock,
  // transferred with the lock released, and then marked clean under the lock.
  FLASH_Queue_vars* args      = ( FLASH_Queue_vars* ) arg;
  dim_t             n_blocks  = FLASH_Queue_get_gpu_num_blocks();
  dim_t             first     = thread * n_blocks;   // index of this thread's first slot
  int               n_pending = 0;
  int               entry, slot;

  // Nothing to do when the GPU is not in use.
  if ( !FLASH_Queue_get_enabled_gpu() )
    return;

#ifdef FLA_ENABLE_MULTITHREADING
  FLA_Lock_acquire( &(args->gpu_lock[thread]) ); // G ***
#endif

  // Collect the blocks to flush.
  for ( slot = 0; slot < n_blocks; slot++ )
  {
    FLA_Obj_gpu* blk = &args->gpu[first + slot];

    // Skip empty, clean, or unrequested slots.
    if ( blk->obj.base == NULL || blk->clean || !blk->request )
      continue;

    // Log a copy for the transfer performed outside the critical section.
    args->gpu_log[first + n_pending] = *blk;
    n_pending++;
  }

#ifdef FLA_ENABLE_MULTITHREADING
  FLA_Lock_release( &(args->gpu_lock[thread]) ); // G ***
#endif

  // Nothing was logged, so nothing needs to be transferred.
  if ( n_pending == 0 )
    return;

  // Transfer each logged block outside the critical section.
  for ( entry = 0; entry < n_pending; entry++ )
    FLASH_Queue_read_gpu( args->gpu_log[first + entry].obj,
                          args->gpu_log[first + entry].buffer_gpu );

#ifdef FLA_ENABLE_MULTITHREADING
  FLA_Lock_acquire( &(args->gpu_lock[thread]) ); // G ***
#endif

  // Mark each flushed block clean, locating its current slot first.
  for ( entry = 0; entry < n_pending; entry++ )
  {
    slot = 0;
    while ( slot < n_blocks &&
            args->gpu_log[first + entry].obj.base != args->gpu[first + slot].obj.base )
      slot++;

    if ( slot < n_blocks )
    {
      args->gpu[first + slot].clean   = TRUE;
      args->gpu[first + slot].request = FALSE;
    }
  }

#ifdef FLA_ENABLE_MULTITHREADING
  FLA_Lock_release( &(args->gpu_lock[thread]) ); // G ***
#endif

  return;
}
| dim_t FLASH_Queue_get_block_size | ( | void | ) |
Referenced by FLASH_Queue_exec().
{
// Report the current value of the flash_queue_block_size variable.
return flash_queue_block_size;
}
| dim_t FLASH_Queue_get_cache_line_size | ( | void | ) |
Referenced by FLASH_Queue_prefetch_block().
{
// Report the current value of the flash_queue_cache_line_size variable.
return flash_queue_cache_line_size;
}
| dim_t FLASH_Queue_get_cache_size | ( | void | ) |
Referenced by FLASH_Queue_exec().
{
// Report the current value of the flash_queue_cache_size variable.
return flash_queue_cache_size;
}
| FLA_Bool FLASH_Queue_get_caching | ( | void | ) |
Referenced by FLASH_Queue_exec(), FLASH_Queue_exec_parallel_function(), FLASH_Queue_wait_dequeue(), and FLASH_Task_update_dependencies().
{
// Report the current value of the flash_queue_caching flag.
return flash_queue_caching;
}
| int FLASH_Queue_get_cores_per_cache | ( | void | ) |
Referenced by FLASH_Queue_exec(), FLASH_Queue_exec_parallel_function(), and FLASH_Queue_exec_simulation().
{
// Report the current value of the flash_queue_cores_per_cache variable.
return flash_queue_cores_per_cache;
}
| int FLASH_Queue_get_cores_per_queue | ( | void | ) |
Referenced by FLASH_Queue_exec().
{
// Report the current value of the flash_queue_cores_per_queue variable.
return flash_queue_cores_per_queue;
}
Referenced by FLASH_Queue_exec(), FLASH_Queue_init_tasks(), and FLASH_Queue_verbose_output().
{
// Report the current value of the flash_queue_data_affinity variable.
return flash_queue_data_affinity;
}
| FLA_Bool FLASH_Queue_get_enabled | ( | void | ) |
Referenced by FLA_Apply_CAQ2_UT_internal(), FLA_Apply_pivots_internal(), FLA_Apply_Q2_UT_internal(), FLA_Apply_Q_UT_internal(), FLA_Apply_QUD_UT_internal(), FLA_Axpy_internal(), FLA_Axpyt_internal(), FLA_CAQR2_UT_internal(), FLA_Chol_internal(), FLA_Copy_internal(), FLA_Copyr_internal(), FLA_Copyt_internal(), FLA_Eig_gest_internal(), FLA_Gemm_internal(), FLA_Gemv_internal(), FLA_Hemm_internal(), FLA_Her2k_internal(), FLA_Herk_internal(), FLA_LQ_UT_internal(), FLA_LU_nopiv_internal(), FLA_LU_piv_internal(), FLA_Lyap_internal(), FLA_QR2_UT_internal(), FLA_QR_UT_copy_internal(), FLA_QR_UT_internal(), FLA_Scal_internal(), FLA_Scalr_internal(), FLA_Sylv_internal(), FLA_Symm_internal(), FLA_Syr2k_internal(), FLA_Syrk_internal(), FLA_Trinv_internal(), FLA_Trmm_internal(), FLA_Trsm_internal(), FLA_Trsv_internal(), FLA_Ttmm_internal(), FLA_UDdate_UT_internal(), FLASH_Apply_pivots(), FLASH_Axpy(), FLASH_Axpyt(), FLASH_Copyt(), FLASH_FS_incpiv(), FLASH_Gemv(), FLASH_LU_incpiv_var1(), FLASH_LU_incpiv_var2(), FLASH_Queue_enable_gpu(), FLASH_Queue_get_enabled_gpu(), FLASH_SA_FS(), FLASH_SA_LU(), FLASH_Scal(), FLASH_Scalr(), FLASH_Trsm_piv(), and FLASH_Trsv().
{
  // Without SuperMatrix support compiled in, the queue is never reported as
  // enabled; otherwise the current flash_queue_enabled flag is reported.
#ifndef FLA_ENABLE_SUPERMATRIX
  return FALSE;
#else
  return flash_queue_enabled;
#endif
}
| FLASH_Task* FLASH_Queue_get_head_task | ( | void | ) |
References FLASH_Queue_s::head.
Referenced by FLASH_Queue_init_tasks(), and FLASH_Queue_verbose_output().
| unsigned int FLASH_Queue_get_num_tasks | ( | void | ) |
References FLASH_Queue_s::n_tasks.
Referenced by FLASH_Queue_exec(), FLASH_Queue_exec_parallel_function(), FLASH_Queue_exec_simulation(), FLASH_Queue_init_tasks(), and FLASH_Queue_verbose_output().
| unsigned int FLASH_Queue_get_num_threads | ( | void | ) |
Referenced by FLASH_Queue_check_gpu(), FLASH_Queue_exec(), FLASH_Queue_exec_gpu(), FLASH_Queue_exec_parallel(), FLASH_Queue_exec_parallel_function(), FLASH_Queue_exec_simulation(), FLASH_Queue_update_gpu(), FLASH_Queue_verbose_output(), FLASH_Task_free_parallel(), and FLASH_Task_update_dependencies().
{
// Report the current value of the flash_queue_n_threads variable.
return flash_queue_n_threads;
}
| double FLASH_Queue_get_parallel_time | ( | void | ) |
{
  // The recorded parallel time is only reported while the queue stack is
  // empty (outside a parallel region); otherwise zero is returned.
  return ( flash_queue_stack == 0 ) ? flash_queue_parallel_time : 0.0;
}
| FLA_Bool FLASH_Queue_get_sorting | ( | void | ) |
Referenced by FLASH_Queue_wait_enqueue(), and FLASH_Task_update_binding().
{
// Report the current value of the flash_queue_sorting flag.
return flash_queue_sorting;
}
| FLASH_Task* FLASH_Queue_get_tail_task | ( | void | ) |
| double FLASH_Queue_get_total_time | ( | void | ) |
{
  // Inside a parallel region (non-empty queue stack) no time is reported.
  if ( flash_queue_stack != 0 )
    return 0.0;

  // Outside a parallel region, report the recorded total time.
  return flash_queue_total_time;
}
Referenced by FLASH_Queue_exec(), FLASH_Queue_exec_simulation(), and FLASH_Queue_verbose_output().
{
// Report the current value of the flash_queue_verbose variable.
return flash_queue_verbose;
}
| FLA_Bool FLASH_Queue_get_work_stealing | ( | void | ) |
Referenced by FLASH_Queue_exec(), FLASH_Queue_exec_parallel_function(), and FLASH_Task_update_dependencies().
{
// Report the current value of the flash_queue_work_stealing flag.
return flash_queue_work_stealing;
}
| void FLASH_Queue_init | ( | void | ) |
References FLASH_Queue_init_gpu(), and FLASH_Queue_reset().
Referenced by FLA_Init().
{
  // Initialize the queue state once; repeated calls are no-ops.
  if ( flash_queue_initialized != TRUE )
  {
    // Restore every setting to its initial value.
    FLASH_Queue_reset();

    // Record that initialization has taken place.
    flash_queue_initialized = TRUE;

#ifdef FLA_ENABLE_GPU
    FLASH_Queue_init_gpu();
#endif
  }

  return;
}
| void FLASH_Queue_init_tasks | ( | void * | arg | ) |
References FLA_Obj_view::base, FLASH_Queue_variables::block_size, FLASH_Queue_variables::datatype, FLASH_Task_s::dep_arg_head, FLA_is_owner(), FLA_Obj_col_stride(), FLA_Obj_datatype(), FLA_Obj_datatype_size(), FLA_Obj_elemtype(), FLA_Obj_free_buffer_task(), FLA_Obj_length(), FLA_Obj_width(), FLASH_Queue_get_data_affinity(), FLASH_Queue_get_head_task(), FLASH_Queue_get_num_tasks(), FLASH_Queue_get_tail_task(), FLASH_Queue_wait_enqueue(), FLASH_Task_s::func, FLASH_Task_s::height, FLASH_Task_s::input_arg, FLA_Obj_struct::m_index, FLASH_Task_s::n_dep_args, FLA_Obj_struct::n_index, FLASH_Task_s::n_input_args, FLASH_Task_s::n_macro_args, FLASH_Task_s::n_output_args, FLASH_Queue_variables::n_queues, FLASH_Task_s::n_ready, FLASH_Queue_variables::n_ready, FLASH_Task_s::n_war_args, FLA_Obj_struct::n_write_blocks, FLASH_Dep_s::next_dep, FLASH_Task_s::next_task, FLASH_Task_s::output_arg, FLASH_Queue_variables::prefetch, FLASH_Task_s::prev_task, FLASH_Task_s::queue, RCCE_acquire_lock(), RCCE_release_lock(), FLASH_Queue_variables::size, FLASH_Dep_s::task, and FLASH_Queue_variables::task_queue.
Referenced by FLASH_Queue_exec().
{
  // Pass 1 (tail to head): record each task pointer, assign its queue and
  // DAG height, and finish its ready count. Pass 2 (head to tail, owner
  // only): enqueue the tasks whose ready count is zero.
  FLASH_Queue_vars* args        = ( FLASH_Queue_vars* ) arg;
  int               idx, dep_idx;
  int               total_tasks = FLASH_Queue_get_num_tasks();
  int               n_runnable  = 0;
  int               tallest;
  FLASH_Task*       task;
  FLASH_Dep*        dep;

  // Walk the task list backward, starting from the tail.
  for ( idx = total_tasks - 1, task = FLASH_Queue_get_tail_task();
        idx >= 0;
        idx--, task = task->prev_task )
  {
    // Remember the task pointer at its position in the queue.
    args->task_queue[idx] = task;

    // A single queue is used.
    task->queue = 0;

    // The height is one more than the tallest dependent task.
    tallest = 0;
    for ( dep_idx = 0, dep = task->dep_arg_head;
          dep_idx < task->n_dep_args;
          dep_idx++, dep = dep->next_dep )
    {
      tallest = max( tallest, dep->task->height );
    }
    task->height = tallest + 1;

    // Buffer-freeing tasks are leaves; raising their height makes them run
    // earlier so that memory is reclaimed sooner.
    if ( task->func == (void *) FLA_Obj_free_buffer_task )
      task->height += total_tasks;

    // Complete the ready count with the number of operands of each kind.
    task->n_ready += task->n_input_args + task->n_output_args +
                     task->n_macro_args + task->n_war_args;

    // Count the tasks that can run immediately.
    if ( task->n_ready == 0 )
      n_runnable++;

    // Only the owner records the ready values.
    if ( FLA_is_owner() )
      args->n_ready[idx] = task->n_ready;
  }

  // Only the owner enqueues the initially ready tasks.
  if ( !FLA_is_owner() )
    return;

  // Walk forward from the head until every runnable task has been enqueued.
  for ( idx = 0, task = FLASH_Queue_get_head_task();
        idx < total_tasks && n_runnable > 0;
        idx++, task = task->next_task )
  {
    if ( task->n_ready != 0 )
      continue;

    RCCE_acquire_lock( 0 );
    // Place the ready task on the waiting queue.
    FLASH_Queue_wait_enqueue( task, arg );
    RCCE_release_lock( 0 );

    // One fewer runnable task remains to be enqueued.
    n_runnable--;
  }

  return;
}
| void FLASH_Queue_invalidate_block_gpu | ( | FLA_Obj | obj, |
| int | thread, | ||
| void * | arg | ||
| ) |
References FLA_Obj_view::base, FLA_Obj_gpu_struct::clean, FLA_Lock_acquire(), FLA_Lock_release(), FLASH_Queue_get_gpu_num_blocks(), FLASH_Queue_variables::gpu, FLASH_Queue_variables::gpu_lock, FLA_Obj_gpu_struct::obj, and FLA_Obj_gpu_struct::request.
Referenced by FLASH_Queue_exec_gpu(), and FLASH_Queue_update_gpu().
{
  // If the block occupies one of this thread's GPU slots, clear that slot
  // and rotate it to the last position, shifting the later slots up by one.
  FLASH_Queue_vars* args     = ( FLASH_Queue_vars* ) arg;
  dim_t             n_blocks = FLASH_Queue_get_gpu_num_blocks();
  dim_t             first    = thread * n_blocks;   // index of this thread's first slot
  FLA_Obj_gpu       cleared;
  int               slot, pos;

#ifdef FLA_ENABLE_MULTITHREADING
  FLA_Lock_acquire( &(args->gpu_lock[thread]) ); // G ***
#endif

  // Search this thread's slots for the block.
  slot = 0;
  while ( slot < n_blocks && obj.base != args->gpu[first + slot].obj.base )
    slot++;

  if ( slot < n_blocks )
  {
    // Build the invalidated entry from the slot's current contents.
    cleared          = args->gpu[first + slot];
    cleared.obj.base = NULL;
    cleared.clean    = TRUE;
    cleared.request  = FALSE;

    // Close the gap left by the invalidated slot.
    for ( pos = slot; pos < n_blocks - 1; pos++ )
      args->gpu[first + pos] = args->gpu[first + pos + 1];

    // The invalidated entry becomes the LRU (last) slot.
    args->gpu[first + n_blocks - 1] = cleared;
  }

#ifdef FLA_ENABLE_MULTITHREADING
  FLA_Lock_release( &(args->gpu_lock[thread]) ); // G ***
#endif

  return;
}
| void FLASH_Queue_mark_gpu | ( | FLASH_Task * | t, |
| void * | arg | ||
| ) |
References FLA_Obj_view::base, FLA_Obj_gpu_struct::clean, FLA_Lock_acquire(), FLA_Lock_release(), FLASH_Queue_get_gpu_num_blocks(), FLASH_Queue_variables::gpu, FLASH_Queue_variables::gpu_lock, FLASH_Task_s::n_output_args, FLA_Obj_gpu_struct::obj, FLASH_Task_s::output_arg, FLA_Obj_gpu_struct::request, and FLASH_Task_s::thread.
Referenced by FLASH_Queue_exec_gpu().
{
  // For each distinct output block of the task, mark the matching slot in
  // the task's thread as dirty and clear any pending request on it.
  FLASH_Queue_vars* args     = ( FLASH_Queue_vars* ) arg;
  int               out, prev, slot;
  int               thread   = t->thread;
  dim_t             n_blocks = FLASH_Queue_get_gpu_num_blocks();
  dim_t             first    = thread * n_blocks;   // index of this thread's first slot
  FLA_Bool          repeated;
  FLA_Obj           block;

  // Visit the output operands from last to first.
  for ( out = t->n_output_args - 1; out >= 0; out-- )
  {
    block = t->output_arg[out];

    // An operand that also appears at a lower index is handled there.
    repeated = FALSE;
    for ( prev = 0; prev < out; prev++ )
    {
      if ( block.base == t->output_arg[prev].base )
      {
        repeated = TRUE;
        break;
      }
    }
    if ( repeated )
      continue;

#ifdef FLA_ENABLE_MULTITHREADING
    FLA_Lock_acquire( &(args->gpu_lock[thread]) ); // G ***
#endif

    // Search this thread's slots for the block.
    slot = 0;
    while ( slot < n_blocks && block.base != args->gpu[first + slot].obj.base )
      slot++;

    if ( slot < n_blocks )
    {
      // The block is now dirty and has no outstanding request.
      args->gpu[first + slot].clean   = FALSE;
      args->gpu[first + slot].request = FALSE;
    }

#ifdef FLA_ENABLE_MULTITHREADING
    FLA_Lock_release( &(args->gpu_lock[thread]) ); // G ***
#endif
  }

  return;
}
| void FLASH_Queue_prefetch | ( | int | cache, |
| void * | arg | ||
| ) |
References FLA_Obj_view::base, FLASH_Queue_variables::cache, FLASH_Queue_prefetch_block(), FLASH_Queue_variables::prefetch, and FLASH_Queue_variables::size.
Referenced by FLASH_Queue_exec_parallel_function(), and FLASH_Queue_exec_simulation().
{
  // Prefetch every valid entry of args->prefetch and record it in the slot
  // of the given cache with the same index.
  FLASH_Queue_vars* args    = ( FLASH_Queue_vars* ) arg;
  int               n_slots = args->size;
  int               slot    = n_slots;
  FLA_Obj           block;

  // Go from the last entry to the first so that LRU order is maintained.
  while ( slot-- > 0 )
  {
    block = args->prefetch[slot];

    // Entries without a base object are not valid blocks.
    if ( block.base == NULL )
      continue;

    // Bring the block into the cache and note where it now lives.
    FLASH_Queue_prefetch_block( block );
    args->cache[cache * n_slots + slot] = block;
  }

  return;
}
| void FLASH_Queue_prefetch_block | ( | FLA_Obj | obj | ) |
References FLA_Obj_datatype(), FLA_Obj_elem_size(), FLA_Obj_length(), FLA_Obj_width(), FLASH_Queue_get_cache_line_size(), scomplex::real, and dcomplex::real.
Referenced by FLASH_Queue_prefetch().
{
  // Touch one element per cache line of the block so that its data is
  // loaded. The element stride is the cache line size divided by the
  // element size.
  //
  // Changes relative to the previous version:
  //  - The loads go through volatile-qualified pointers. The old trick of
  //    adding 1.0 to a local afterwards did not stop an optimizing compiler
  //    from removing the whole loop, since the local was never used.
  //  - The touched value is initialized, so an empty block (length * width
  //    of zero) no longer reads an uninitialized variable.
  //  - The stride is clamped to at least one element, so an element larger
  //    than a cache line (or a zero element size) can no longer cause an
  //    endless loop or a division by zero.
  int          i, inc;
  int          line_size = FLASH_Queue_get_cache_line_size();
  int          elem_size = FLA_Obj_elem_size( obj );
  int          length    = FLA_Obj_length( obj );
  int          width     = FLA_Obj_width( obj );
  int          n_elem    = length * width;
  FLA_Datatype datatype  = FLA_Obj_datatype( obj );
  double       touched   = 0.0;

  // Determine stride to prefetch block into cache (never less than one).
  inc = ( elem_size > 0 ) ? line_size / elem_size : 1;
  if ( inc < 1 )
    inc = 1;

  // Switch between the supported datatypes.
  switch ( datatype )
  {
    case FLA_FLOAT:
    {
      const volatile float* buffer = ( const volatile float* ) FLA_FLOAT_PTR( obj );

      // Access each cache line of the block.
      for ( i = 0; i < n_elem; i += inc )
        touched = ( double ) buffer[i];
      break;
    }
    case FLA_DOUBLE:
    {
      const volatile double* buffer = ( const volatile double* ) FLA_DOUBLE_PTR( obj );

      // Access each cache line of the block.
      for ( i = 0; i < n_elem; i += inc )
        touched = buffer[i];
      break;
    }
    case FLA_COMPLEX:
    {
      const volatile scomplex* buffer = ( const volatile scomplex* ) FLA_COMPLEX_PTR( obj );

      // Reading the real part is enough to touch the element's cache line.
      for ( i = 0; i < n_elem; i += inc )
        touched = ( double ) buffer[i].real;
      break;
    }
    case FLA_DOUBLE_COMPLEX:
    {
      const volatile dcomplex* buffer = ( const volatile dcomplex* ) FLA_DOUBLE_COMPLEX_PTR( obj );

      // Reading the real part is enough to touch the element's cache line.
      for ( i = 0; i < n_elem; i += inc )
        touched = ( double ) buffer[i].real;
      break;
    }
    case FLA_INT:
    {
      const volatile int* buffer = ( const volatile int* ) FLA_INT_PTR( obj );

      // Access each cache line of the block.
      for ( i = 0; i < n_elem; i += inc )
        touched = ( double ) buffer[i];
      break;
    }
    default:
      // This default case should never execute.
      FLA_Check_error_code( FLA_INVALID_DATATYPE );
  }

  // The loaded value itself is not needed; only the memory access matters.
  ( void ) touched;

  return;
}
| void FLASH_Queue_push | ( | void * | func, |
| void * | cntl, | ||
| char * | name, | ||
| FLA_Bool | enabled_gpu, | ||
| int | n_int_args, | ||
| int | n_fla_args, | ||
| int | n_input_args, | ||
| int | n_output_args, | ||
| ... | |||
| ) |
References FLA_Obj_view::base, FLASH_Task_s::fla_arg, FLA_Obj_col_stride(), FLA_Obj_elemtype(), FLA_Obj_length(), FLA_Obj_width(), FLASH_Queue_push_input(), FLASH_Queue_push_output(), FLASH_Task_alloc(), FLASH_Queue_s::head, FLASH_Task_s::input_arg, FLASH_Task_s::int_arg, FLASH_Task_s::n_macro_args, FLASH_Queue_s::n_tasks, FLASH_Task_s::next_task, FLASH_Task_s::order, FLASH_Task_s::output_arg, FLASH_Task_s::prev_task, FLASH_Task_s::queue, FLASH_Queue_s::tail, and FLA_Obj_struct::write_task.
{
// Create a task for func, fill its argument arrays from the variable
// argument list, run dependence analysis on every input and output operand,
// and append the task to the tail of the global task queue _tq.
//
// The variable arguments are consumed in this fixed order:
//   n_int_args    values read as int,
//   n_fla_args    values read as FLA_Obj,
//   n_input_args  values read as FLA_Obj (input operands),
//   n_output_args values read as FLA_Obj (output operands).
int i;
va_list var_arg_list;
FLASH_Task* t;
FLA_Obj obj;
// Allocate a new FLASH_Task and populate its fields with appropriate values.
t = FLASH_Task_alloc( func, cntl, name, enabled_gpu,
n_int_args, n_fla_args,
n_input_args, n_output_args );
// Initialize the variable argument environment. The second argument of
// va_start() must be the parameter that immediately precedes the variable
// argument list (i.e. the ... in the signature), which is n_output_args.
va_start( var_arg_list, n_output_args );
// Extract the integer arguments.
for ( i = 0; i < n_int_args; i++ )
t->int_arg[i] = va_arg( var_arg_list, int );
// Extract the FLA_Obj arguments.
for ( i = 0; i < n_fla_args; i++ )
t->fla_arg[i] = va_arg( var_arg_list, FLA_Obj );
// Extract the input FLA_Obj arguments.
for ( i = 0; i < n_input_args; i++ )
{
obj = va_arg( var_arg_list, FLA_Obj );
t->input_arg[i] = obj;
// Macroblock is used: the operand's elements are themselves FLA_Obj blocks.
if ( FLA_Obj_elemtype( obj ) == FLA_MATRIX )
{
dim_t jj, kk;
dim_t m = FLA_Obj_length( obj );
dim_t n = FLA_Obj_width( obj );
dim_t cs = FLA_Obj_col_stride( obj );
FLA_Obj* buf = FLASH_OBJ_PTR_AT( obj );
// Dependence analysis for each input block in macroblock, visiting the
// blocks column by column (block (kk, jj) is at buf + jj * cs + kk).
for ( jj = 0; jj < n; jj++ )
for ( kk = 0; kk < m; kk++ )
FLASH_Queue_push_input( *( buf + jj * cs + kk ), t );
// Set the number of blocks in the macroblock subtracted by one
// since we do not want to recount an operand for each n_input_arg.
t->n_macro_args += m * n - 1;
}
else // Regular block.
{
// Dependence analysis for input operand.
FLASH_Queue_push_input( obj, t );
}
}
// Extract the output FLA_Obj arguments.
for ( i = 0; i < n_output_args; i++ )
{
obj = va_arg( var_arg_list, FLA_Obj );
t->output_arg[i] = obj;
// Only assign data affinity to the first output block.
if ( i == 0 )
{
FLA_Obj buf = obj;
// Use the top left block of the macroblock.
if ( FLA_Obj_elemtype( obj ) == FLA_MATRIX )
buf = *FLASH_OBJ_PTR_AT( obj );
// A block with no previous writer gets the current write-block counter as
// its queue; otherwise the task inherits the queue of the previous writer.
if ( buf.base->write_task == NULL )
t->queue = flash_queue_n_write_blocks;
else
t->queue = buf.base->write_task->queue;
}
// Macroblock is used.
if ( FLA_Obj_elemtype( obj ) == FLA_MATRIX )
{
dim_t jj, kk;
dim_t m = FLA_Obj_length( obj );
dim_t n = FLA_Obj_width( obj );
dim_t cs = FLA_Obj_col_stride( obj );
FLA_Obj* buf = FLASH_OBJ_PTR_AT( obj );
// Dependence analysis for each output block in macroblock.
for ( jj = 0; jj < n; jj++ )
for ( kk = 0; kk < m; kk++ )
FLASH_Queue_push_output( *( buf + jj * cs + kk ), t );
// Set the number of blocks in the macroblock subtracted by one
// since we do not want to recount an operand for each n_output_arg.
t->n_macro_args += m * n - 1;
}
else // Regular block.
{
// Dependence analysis for output operand.
FLASH_Queue_push_output( obj, t );
}
}
// Finalize the variable argument environment.
va_end( var_arg_list );
// Add the task to the tail of the queue (and the head if queue is empty).
if ( _tq.n_tasks == 0 )
{
_tq.head = t;
_tq.tail = t;
}
else
{
t->prev_task = _tq.tail;
_tq.tail->next_task = t;
_tq.tail = t;
// Determine the index of the task in the task queue: one more than the
// index of the task it follows.
t->order = t->prev_task->order + 1;
}
// Increment the number of tasks.
_tq.n_tasks++;
return;
}
| void FLASH_Queue_push_input | ( | FLA_Obj | obj, |
| FLASH_Task * | t | ||
| ) |
References FLA_Obj_view::base, FLASH_Task_s::dep_arg_head, FLASH_Task_s::dep_arg_tail, FLA_malloc(), FLASH_Task_s::n_dep_args, FLA_Obj_struct::n_read_blocks, FLA_Obj_struct::n_read_tasks, FLASH_Task_s::n_ready, FLASH_Dep_s::next_dep, FLA_Obj_struct::read_task_head, FLA_Obj_struct::read_task_tail, FLASH_Dep_s::task, and FLA_Obj_struct::write_task.
Referenced by FLASH_Queue_push().
{
  // Dependence analysis for one input block of task t:
  //  - if an earlier task writes the block, t is appended to that writer's
  //    dependent list (flow dependence);
  //  - otherwise t's ready count is decremented and a block that has not
  //    been read before is given a read-block id;
  //  - finally t is appended to the block's reader list unless it is
  //    already the most recent reader.
  FLASH_Task* writer = obj.base->write_task;
  FLASH_Dep*  link;

  if ( writer != NULL )
  {
    // Flow dependence: the writer must notify t.
    link = (FLASH_Dep *) FLA_malloc( sizeof(FLASH_Dep) );

    link->task     = t;
    link->next_dep = NULL;

    // Append to the writer's list of dependent tasks.
    if ( writer->n_dep_args != 0 )
      writer->dep_arg_tail->next_dep = link;
    else
      writer->dep_arg_head = link;

    writer->dep_arg_tail = link;
    writer->n_dep_args++;
  }
  else
  {
    // No writer precedes this read.
    t->n_ready--;

    // Add to number of blocks read if not written and not read before.
    if ( obj.base->n_read_tasks == 0 )
    {
      // Identify each read block with an id for freeing.
      obj.base->n_read_blocks = flash_queue_n_read_blocks;
      flash_queue_n_read_blocks++;
    }
  }

  // Nothing more to do when t is already the most recent reader.
  if ( obj.base->n_read_tasks != 0 &&
       obj.base->read_task_tail->task == t )
    return;

  // Record t as a reader (a potential anti-dependence for a later writer).
  link = (FLASH_Dep *) FLA_malloc( sizeof(FLASH_Dep) );

  link->task     = t;
  link->next_dep = NULL;

  if ( obj.base->n_read_tasks != 0 )
    obj.base->read_task_tail->next_dep = link;
  else
    obj.base->read_task_head = link;

  obj.base->read_task_tail = link;
  obj.base->n_read_tasks++;

  return;
}
| void FLASH_Queue_push_output | ( | FLA_Obj | obj, |
| FLASH_Task * | t | ||
| ) |
References FLA_Obj_view::base, FLASH_Task_s::dep_arg_head, FLASH_Task_s::dep_arg_tail, FLA_free(), FLA_malloc(), FLASH_Task_s::n_dep_args, FLA_Obj_struct::n_read_blocks, FLA_Obj_struct::n_read_tasks, FLASH_Task_s::n_ready, FLASH_Task_s::n_war_args, FLA_Obj_struct::n_write_blocks, FLASH_Dep_s::next_dep, FLA_Obj_struct::read_task_head, FLA_Obj_struct::read_task_tail, FLASH_Dep_s::task, and FLA_Obj_struct::write_task.
Referenced by FLASH_Queue_push().
{
// Registers task t as the new writer of the block obj.
//   1. Links t behind the block's previous writer (flow/output dependence),
//      or, when there is none, numbers the block as a newly written block.
//   2. Converts every entry of the block's reader list into a dependence of
//      that reader on t (anti-dependence), reusing the list nodes.
//   3. Empties the reader list and records t as the block's last writer.
int i;
FLASH_Task* task;
FLASH_Dep* d;
FLASH_Dep* next_dep;
// No task is currently recorded as writing this block.
if ( obj.base->write_task == NULL )
{
// This operand does not hold t back, so lower its n_ready counter.
t->n_ready--;
// Save index in which this output block is first encountered.
obj.base->n_write_blocks = flash_queue_n_write_blocks;
// Number of blocks written if not written before.
flash_queue_n_write_blocks++;
// Add to number of blocks read if not written or read before.
if ( obj.base->n_read_tasks == 0 )
{
// Identify each read block with an id for freeing.
obj.base->n_read_blocks = flash_queue_n_read_blocks;
flash_queue_n_read_blocks++;
}
}
else
{ // Flow dependence potentially.
// The last task to overwrite this block is not itself.
if ( obj.base->write_task != t )
{
// Create dependency from task that last wrote the block:
// append a new node naming t to the tail of that task's dependence list.
task = obj.base->write_task;
d = (FLASH_Dep *) FLA_malloc( sizeof(FLASH_Dep) );
d->task = t;
d->next_dep = NULL;
if ( task->n_dep_args == 0 )
{
task->dep_arg_head = d;
task->dep_arg_tail = d;
}
else
{
task->dep_arg_tail->next_dep = d;
task->dep_arg_tail = d;
}
task->n_dep_args++;
}
else
{
// No need to notify task twice for output block already seen.
t->n_ready--;
}
}
// Clear read task for next set of reads and record the anti-dependence.
// Each node of the reader list is either re-linked into the reader's own
// dependence list (now naming t) or freed, so no node survives in the list.
d = obj.base->read_task_head;
for ( i = 0; i < obj.base->n_read_tasks; i++ )
{
// Capture the reader and the successor before the node is rewritten or freed.
task = d->task;
next_dep = d->next_dep;
// If the last task to read is not the current task, add dependence.
if ( task != t )
{
// Reuse the reader-list node as a dependence node: the reader must
// finish before t may overwrite the block.
d->task = t;
d->next_dep = NULL;
if ( task->n_dep_args == 0 )
{
task->dep_arg_head = d;
task->dep_arg_tail = d;
}
else
{
task->dep_arg_tail->next_dep = d;
task->dep_arg_tail = d;
}
task->n_dep_args++;
// Count the write-after-read dependence on t.
t->n_war_args++;
}
else
{
// t itself was the reader; no self-dependence is recorded.
FLA_free( d );
}
d = next_dep;
}
// The reader list is now empty.
obj.base->n_read_tasks = 0;
obj.base->read_task_head = NULL;
obj.base->read_task_tail = NULL;
// Record this task as the last to write to this block.
obj.base->write_task = t;
return;
}
| void FLASH_Queue_reset | ( | void | ) |
References FLASH_Queue_s::head, FLASH_Queue_s::n_tasks, and FLASH_Queue_s::tail.
Referenced by FLASH_Queue_exec(), and FLASH_Queue_init().
| void FLASH_Queue_set_block_size | ( | dim_t | size | ) |
Referenced by FLASH_Obj_create_hierarchy().
{
   // The recorded block size only ever grows: a request that is not larger
   // than the current value leaves it untouched.
   if ( size > flash_queue_block_size )
   {
      flash_queue_block_size = size;
   }
}
| void FLASH_Queue_set_cache_line_size | ( | dim_t | size | ) |
{
   // Overwrite the stored cache line size with the caller's value.
   flash_queue_cache_line_size = size;
}
| void FLASH_Queue_set_cache_size | ( | dim_t | size | ) |
{
   // Overwrite the stored cache size with the caller's value.
   flash_queue_cache_size = size;
}
| void FLASH_Queue_set_caching | ( | FLA_Bool | caching | ) |
Referenced by FLASH_Queue_exec().
{
   // Remember the caller's choice for the caching flag.
   flash_queue_caching = caching;
}
| void FLASH_Queue_set_cores_per_cache | ( | int | cores | ) |
{
   // Overwrite the stored cores-per-cache count with the caller's value.
   flash_queue_cores_per_cache = cores;
}
| void FLASH_Queue_set_cores_per_queue | ( | int | cores | ) |
{
   // Overwrite the stored cores-per-queue count with the caller's value.
   flash_queue_cores_per_queue = cores;
}
| void FLASH_Queue_set_data_affinity | ( | FLASH_Data_aff | data_affinity | ) |
Referenced by FLASH_Queue_exec().
{
   // Remember the caller's data affinity selection.
   flash_queue_data_affinity = data_affinity;
}
| void FLASH_Queue_set_num_threads | ( | unsigned int | n_threads | ) |
References FLA_Check_num_threads().
{
   // Validate the requested thread count before storing it; the result of
   // the check is handed to the library's error-code handler.
   FLA_Error e_val = FLA_Check_num_threads( n_threads );
   FLA_Check_error_code( e_val );

   // Store the thread count used internally by the queue.
   flash_queue_n_threads = n_threads;

#if FLA_MULTITHREADING_MODEL == FLA_OPENMP
   // OpenMP: nothing further to do here. The thread count is applied at the
   // parallel for loop through a num_threads() clause, so the user remains
   // free to control the global OpenMP thread count independently (via
   // OMP_NUM_THREADS or omp_set_num_threads()).
#elif FLA_MULTITHREADING_MODEL == FLA_PTHREADS
   // pthreads: nothing further to do here either, since thread creation is
   // handled entirely by this library.
#endif
}
| void FLASH_Queue_set_parallel_time | ( | double | dtime | ) |
Referenced by FLASH_Queue_exec().
{
   // Store the caller-supplied parallel time value.
   flash_queue_parallel_time = dtime;
}
| void FLASH_Queue_set_sorting | ( | FLA_Bool | sorting | ) |
{
   // Remember the caller's choice for the sorting flag.
   flash_queue_sorting = sorting;
}
| void FLASH_Queue_set_verbose_output | ( | FLASH_Verbose | verbose | ) |
{
   // Remember the caller's verbosity selection.
   flash_queue_verbose = verbose;
}
| void FLASH_Queue_set_work_stealing | ( | FLA_Bool | work_stealing | ) |
Referenced by FLASH_Queue_exec().
{
   // Remember the caller's choice for the work-stealing flag.
   flash_queue_work_stealing = work_stealing;
}
| unsigned int FLASH_Queue_stack_depth | ( | void | ) |
Referenced by FLASH_Eig_gest(), FLASH_LU_incpiv(), FLASH_QR_UT_inc(), FLASH_Queue_disable_gpu(), and FLASH_Queue_enable_gpu().
{
   // Report the current value of the counter that FLASH_Queue_begin()
   // increments on every call.
   unsigned int depth = flash_queue_stack;

   return depth;
}
| void FLASH_Queue_update_block_gpu | ( | FLA_Obj | obj, |
| void ** | buffer_gpu, | ||
| int | thread, | ||
| void * | arg | ||
| ) |
References FLA_Obj_view::base, FLA_Obj_gpu_struct::buffer_gpu, FLA_Obj_gpu_struct::clean, FLA_Lock_acquire(), FLA_Lock_release(), FLASH_Queue_get_gpu_num_blocks(), FLASH_Queue_read_gpu(), FLASH_Queue_write_gpu(), FLASH_Queue_variables::gpu, FLASH_Queue_variables::gpu_lock, FLA_Obj_gpu_struct::obj, FLA_Obj_gpu_struct::request, and FLASH_Queue_variables::victim.
Referenced by FLASH_Queue_update_gpu().
{
// Makes the block obj resident in the block table kept for `thread` and
// returns the table slot's GPU buffer pointer through buffer_gpu[0].
// The table args->gpu[thread * gpu_n_blocks ...] is kept in most-recently-used
// order: the requested block is moved to position 0 on every call.
// Bookkeeping is done while holding gpu_lock[thread]; the actual transfers
// (FLASH_Queue_read_gpu / FLASH_Queue_write_gpu) are issued after the lock
// has been released.
FLASH_Queue_vars* args = ( FLASH_Queue_vars* ) arg;
int j, k;
dim_t gpu_n_blocks = FLASH_Queue_get_gpu_num_blocks();
FLA_Bool transfer = FALSE;
FLA_Bool evict = FALSE;
FLA_Obj_gpu evict_obj;
FLA_Obj_gpu gpu_obj;
#ifdef FLA_ENABLE_MULTITHREADING
FLA_Lock_acquire( &(args->gpu_lock[thread]) ); // G ***
#endif
// Locate the position of the block on GPU.
// The scan stops at the last slot when no match is found, so k then names
// the last (least recently used) entry of the table.
for ( k = 0; k < gpu_n_blocks - 1; k++ )
if ( obj.base == args->gpu[thread * gpu_n_blocks + k].obj.base )
break;
// Save the pointer to the data on the GPU.
buffer_gpu[0] = args->gpu[thread * gpu_n_blocks + k].buffer_gpu;
// Save the victim block.
// (Snapshot of slot k before it is possibly overwritten below.)
evict_obj = args->gpu[thread * gpu_n_blocks + k];
// The block is not already in the GPU.
if ( obj.base != args->gpu[thread * gpu_n_blocks + k].obj.base )
{
// Save for data transfer outside of critical section.
transfer = TRUE;
// Save for eviction outside of critical section.
// Only an occupied slot whose contents are not clean needs flushing; the
// evicted entry is published in victim[thread] until the flush completes.
if ( evict_obj.obj.base != NULL && !evict_obj.clean )
{
evict = TRUE;
args->victim[thread] = evict_obj;
}
// Save the block in the data structure.
// The slot's buffer_gpu field is left as is, so the new block takes over
// the buffer previously associated with this slot.
args->gpu[thread * gpu_n_blocks + k].obj = obj;
// Make sure the new block is clean.
args->gpu[thread * gpu_n_blocks + k].clean = TRUE;
args->gpu[thread * gpu_n_blocks + k].request = FALSE;
}
// Use the block on the GPU that is a hit or LRU.
gpu_obj = args->gpu[thread * gpu_n_blocks + k];
// Shift all the previous tasks for LRU replacement.
// Entries 0..k-1 each move down one position, freeing position 0.
for ( j = k; j > 0; j-- )
args->gpu[thread * gpu_n_blocks + j] = args->gpu[thread * gpu_n_blocks + j - 1];
// Place the block on the cache as the most recently used.
args->gpu[thread * gpu_n_blocks] = gpu_obj;
#ifdef FLA_ENABLE_MULTITHREADING
FLA_Lock_release( &(args->gpu_lock[thread]) ); // G ***
#endif
// Evict and flush the LRU dirty block.
if ( evict )
{
FLASH_Queue_read_gpu( evict_obj.obj, evict_obj.buffer_gpu );
// The flush is done; withdraw the victim entry under the lock.
#ifdef FLA_ENABLE_MULTITHREADING
FLA_Lock_acquire( &(args->gpu_lock[thread]) ); // G ***
#endif
args->victim[thread].obj.base = NULL;
#ifdef FLA_ENABLE_MULTITHREADING
FLA_Lock_release( &(args->gpu_lock[thread]) ); // G ***
#endif
}
// Move the block to the GPU.
// This runs after any eviction, which used the same slot's buffer.
if ( transfer )
FLASH_Queue_write_gpu( gpu_obj.obj, gpu_obj.buffer_gpu );
return;
}
| void FLASH_Queue_update_cache | ( | FLASH_Task * | t, |
| void * | arg | ||
| ) |
References FLA_Obj_view::base, FLASH_Task_s::cache, FLA_Obj_col_stride(), FLA_Obj_elemtype(), FLA_Obj_length(), FLA_Obj_width(), FLASH_Queue_update_cache_block(), FLASH_Task_s::input_arg, FLASH_Task_s::n_input_args, FLASH_Task_s::n_output_args, and FLASH_Task_s::output_arg.
Referenced by FLASH_Queue_exec_parallel_function(), and FLASH_Queue_exec_simulation().
{
   // Pushes every distinct operand block of task t through
   // FLASH_Queue_update_cache_block() for the cache t->cache: inputs first
   // (flagged FALSE), then outputs (flagged TRUE). Operands are visited from
   // the highest index down, and a block is skipped when the same base
   // appears at a lower index (or, for inputs, among the outputs).
   int      idx, other;
   FLA_Bool seen;
   FLA_Obj  operand;

   if ( t == NULL )
      return;

   // Input operands.
   for ( idx = t->n_input_args - 1; idx >= 0; idx-- )
   {
      seen = FALSE;

      // Does this input also appear as an output?
      for ( other = 0; other < t->n_output_args && !seen; other++ )
         if ( t->input_arg[idx].base == t->output_arg[other].base )
            seen = TRUE;

      // Does this input also appear as an earlier input?
      for ( other = 0; other < idx && !seen; other++ )
         if ( t->input_arg[idx].base == t->input_arg[other].base )
            seen = TRUE;

      if ( seen )
         continue;

      operand = t->input_arg[idx];

      if ( FLA_Obj_elemtype( operand ) != FLA_MATRIX )
      {
         // Regular block.
         FLASH_Queue_update_cache_block( operand, t->cache, FALSE, arg );
      }
      else
      {
         // Macroblock: update the cache with each constituent block,
         // walking column by column.
         dim_t    row, col;
         dim_t    m      = FLA_Obj_length( operand );
         dim_t    n      = FLA_Obj_width( operand );
         dim_t    cs     = FLA_Obj_col_stride( operand );
         FLA_Obj* blocks = FLASH_OBJ_PTR_AT( operand );

         for ( col = 0; col < n; col++ )
            for ( row = 0; row < m; row++ )
               FLASH_Queue_update_cache_block( blocks[ col * cs + row ],
                                               t->cache, FALSE, arg );
      }
   }

   // Output operands.
   for ( idx = t->n_output_args - 1; idx >= 0; idx-- )
   {
      seen = FALSE;

      // Does this output also appear as an earlier output?
      for ( other = 0; other < idx && !seen; other++ )
         if ( t->output_arg[idx].base == t->output_arg[other].base )
            seen = TRUE;

      if ( seen )
         continue;

      operand = t->output_arg[idx];

      if ( FLA_Obj_elemtype( operand ) != FLA_MATRIX )
      {
         // Regular block.
         FLASH_Queue_update_cache_block( operand, t->cache, TRUE, arg );
      }
      else
      {
         // Macroblock: update the cache with each constituent block,
         // walking column by column.
         dim_t    row, col;
         dim_t    m      = FLA_Obj_length( operand );
         dim_t    n      = FLA_Obj_width( operand );
         dim_t    cs     = FLA_Obj_col_stride( operand );
         FLA_Obj* blocks = FLASH_OBJ_PTR_AT( operand );

         for ( col = 0; col < n; col++ )
            for ( row = 0; row < m; row++ )
               FLASH_Queue_update_cache_block( blocks[ col * cs + row ],
                                               t->cache, TRUE, arg );
      }
   }

   return;
}
| void FLASH_Queue_update_cache_block | ( | FLA_Obj | obj, |
| int | cache, | ||
| FLA_Bool | output, | ||
| void * | arg | ||
| ) |
References FLA_Obj_view::base, FLASH_Queue_variables::cac_lock, FLASH_Queue_variables::cache, FLA_Lock_acquire(), FLA_Lock_release(), FLASH_Queue_variables::n_caches, and FLASH_Queue_variables::size.
Referenced by FLASH_Queue_update_cache().
{
   // Moves obj to the front (most recently used position) of the block list
   // kept for `cache`; when `output` is set, also removes obj from the block
   // list of every other cache.
   FLASH_Queue_vars* args = ( FLASH_Queue_vars* ) arg;
   int other, pos, slot;
   int n_caches = args->n_caches;
   int size     = args->size;

#ifdef FLA_ENABLE_MULTITHREADING
   FLA_Lock_acquire( &(args->cac_lock[cache]) ); // C ***
#endif

   // Find obj in this cache's list; if absent, pos ends on the last slot,
   // whose entry is the one dropped by the shift below.
   pos = 0;
   while ( pos < size - 1 && obj.base != args->cache[cache * size + pos].base )
      pos++;

   // Slide entries 0..pos-1 down by one position.
   for ( slot = pos; slot > 0; slot-- )
      args->cache[cache * size + slot] = args->cache[cache * size + slot - 1];

   // obj becomes the most recently used entry.
   args->cache[cache * size] = obj;

#ifdef FLA_ENABLE_MULTITHREADING
   FLA_Lock_release( &(args->cac_lock[cache]) ); // C ***
#endif

   // Only a written block has to be invalidated elsewhere.
   if ( !output )
      return;

   for ( other = 0; other < n_caches; other++ )
   {
      if ( other == cache )
         continue;

#ifdef FLA_ENABLE_MULTITHREADING
      FLA_Lock_acquire( &(args->cac_lock[other]) ); // C ***
#endif

      // Look for obj anywhere in the other cache's list.
      pos = 0;
      while ( pos < size && obj.base != args->cache[other * size + pos].base )
         pos++;

      if ( pos < size )
      {
         // Close the gap left by the invalidated block ...
         for ( slot = pos; slot < size - 1; slot++ )
            args->cache[other * size + slot] = args->cache[other * size + slot + 1];

         // ... and mark the last slot as empty.
         args->cache[other * size + size - 1].base = NULL;
      }

#ifdef FLA_ENABLE_MULTITHREADING
      FLA_Lock_release( &(args->cac_lock[other]) ); // C ***
#endif
   }

   return;
}
| void FLASH_Queue_update_gpu | ( | FLASH_Task * | t, |
| void ** | input_arg, | ||
| void ** | output_arg, | ||
| void * | arg | ||
| ) |
References FLA_Obj_view::base, FLASH_Queue_get_num_threads(), FLASH_Queue_invalidate_block_gpu(), FLASH_Queue_update_block_gpu(), FLASH_Task_s::input_arg, FLASH_Task_s::n_input_args, FLASH_Task_s::n_output_args, FLASH_Task_s::output_arg, and FLASH_Task_s::thread.
Referenced by FLASH_Queue_exec_gpu().
{
   // Brings the operand blocks of task t onto the block table of t->thread
   // and fills input_arg[] / output_arg[] with the pointers reported by
   // FLASH_Queue_update_block_gpu(). A block that occurs more than once
   // among the operands is brought over once; its other occurrences are
   // first set to NULL and then filled in from the matching operand.
   // None of the arguments can be macroblocks yet: copying a macroblock into
   // contiguous memory on the GPU is the complicating factor.
   int      cur, other, dev;
   int      thread    = t->thread;
   int      n_threads = FLASH_Queue_get_num_threads();
   FLA_Bool repeated;

   // Inputs, from last to first.
   for ( cur = t->n_input_args - 1; cur >= 0; cur-- )
   {
      repeated = FALSE;

      for ( other = 0; other < t->n_output_args && !repeated; other++ )
         if ( t->input_arg[cur].base == t->output_arg[other].base )
            repeated = TRUE;

      for ( other = 0; other < cur && !repeated; other++ )
         if ( t->input_arg[cur].base == t->input_arg[other].base )
            repeated = TRUE;

      if ( repeated )
         input_arg[cur] = NULL;
      else
         FLASH_Queue_update_block_gpu( t->input_arg[cur], &input_arg[cur], thread, arg );
   }

   // Outputs, from last to first.
   for ( cur = t->n_output_args - 1; cur >= 0; cur-- )
   {
      repeated = FALSE;

      for ( other = 0; other < cur && !repeated; other++ )
         if ( t->output_arg[cur].base == t->output_arg[other].base )
            repeated = TRUE;

      if ( repeated )
      {
         output_arg[cur] = NULL;
      }
      else
      {
         FLASH_Queue_update_block_gpu( t->output_arg[cur], &output_arg[cur], thread, arg );

         // The block is about to be written here, so invalidate it for
         // every other thread.
         for ( dev = 0; dev < n_threads; dev++ )
            if ( dev != thread )
               FLASH_Queue_invalidate_block_gpu( t->output_arg[cur], dev, arg );
      }
   }

   // Resolve inputs left NULL above: take the pointer of a matching output
   // first, otherwise that of a matching lower-indexed input.
   for ( cur = t->n_input_args - 1; cur >= 0; cur-- )
   {
      for ( other = 0; other < t->n_output_args && input_arg[cur] == NULL; other++ )
         if ( t->input_arg[cur].base == t->output_arg[other].base )
            input_arg[cur] = output_arg[other];

      for ( other = 0; other < cur && input_arg[cur] == NULL; other++ )
         if ( t->input_arg[cur].base == t->input_arg[other].base )
            input_arg[cur] = input_arg[other];
   }

   // Resolve outputs left NULL above from a matching lower-indexed output.
   for ( cur = t->n_output_args - 1; cur >= 0; cur-- )
   {
      for ( other = 0; other < cur && output_arg[cur] == NULL; other++ )
         if ( t->output_arg[cur].base == t->output_arg[other].base )
            output_arg[cur] = output_arg[other];
   }

   return;
}
| void FLASH_Queue_verbose_output | ( | void | ) |
References FLA_Obj_view::base, FLASH_Task_s::dep_arg_head, FLASH_Queue_get_data_affinity(), FLASH_Queue_get_head_task(), FLASH_Queue_get_num_tasks(), FLASH_Queue_get_num_threads(), FLASH_Queue_get_verbose_output(), FLA_Obj_struct::id, FLASH_Task_s::input_arg, FLA_Obj_struct::m_index, FLASH_Task_s::n_dep_args, FLA_Obj_struct::n_index, FLASH_Task_s::n_input_args, FLASH_Task_s::n_output_args, FLASH_Task_s::name, FLASH_Dep_s::next_dep, FLASH_Task_s::next_task, FLASH_Task_s::order, FLASH_Task_s::output_arg, FLASH_Task_s::queue, and FLASH_Dep_s::task.
Referenced by FLASH_Queue_exec().
{
   // Prints the task queue to stdout. In the "readable" verbosity mode each
   // task is listed as "order name outputs := outputs inputs"; in any other
   // mode the dependence graph is emitted as a "digraph SuperMatrix" block,
   // with one "subgraph cluster" per thread when data affinity is in use.
   int           i, j, k;
   int           n_threads = FLASH_Queue_get_num_threads();
   int           n_tasks   = FLASH_Queue_get_num_tasks();
   FLASH_Verbose verbose   = FLASH_Queue_get_verbose_output();
   FLASH_Task*   t;
   FLASH_Dep*    d;

   // Start from the head of the task queue.
   t = FLASH_Queue_get_head_task();

   if ( verbose == FLASH_QUEUE_VERBOSE_READABLE )
   {
      // One line per task, in queue order.
      for ( i = 0; i < n_tasks; i++, t = t->next_task )
      {
         printf( "%d\t%s\t", t->order, t->name );

         // Left-hand side: the output operands.
         for ( j = 0; j < t->n_output_args; j++ )
            printf( "%lu[%u,%u] ", t->output_arg[j].base->id,
                    t->output_arg[j].base->m_index,
                    t->output_arg[j].base->n_index );

         printf( ":= " );

         // Right-hand side: the output operands again, then the inputs.
         for ( j = 0; j < t->n_output_args; j++ )
            printf( "%lu[%u,%u] ", t->output_arg[j].base->id,
                    t->output_arg[j].base->m_index,
                    t->output_arg[j].base->n_index );

         for ( j = 0; j < t->n_input_args; j++ )
            printf( "%lu[%u,%u] ", t->input_arg[j].base->id,
                    t->input_arg[j].base->m_index,
                    t->input_arg[j].base->n_index );

         printf( "\n" );
      }

      printf( "\n" );

      return;
   }

   // Graph output.
   printf( "digraph SuperMatrix {\n" );

   if ( FLASH_Queue_get_data_affinity() == FLASH_QUEUE_AFFINITY_NONE )
   {
      // Each task: its label followed by the edges to its dependents.
      for ( i = 0; i < n_tasks; i++, t = t->next_task )
      {
         printf( "%d [label=\"%s\"]; %d -> {", t->order, t->name, t->order);

         for ( j = 0, d = t->dep_arg_head; j < t->n_dep_args; j++, d = d->next_dep )
            printf( "%d;", d->task->order );

         printf( "};\n" );
      }
   }
   else
   {
      // One cluster per thread, listing the tasks whose queue field equals
      // that thread's index.
      for ( k = 0; k < n_threads; k++ )
      {
         printf( "subgraph cluster%d {\nlabel=\"%d\"\n", k, k );

         for ( i = 0; i < n_tasks; i++, t = t->next_task )
         {
            if ( t->queue == k )
               printf( "%d [label=\"%s\"];\n", t->order, t->name );
         }

         printf( "}\n" );

         // Rewind to the head of the task queue for the next pass.
         t = FLASH_Queue_get_head_task();
      }

      // The edges from every task to its dependents.
      for ( i = 0; i < n_tasks; i++, t = t->next_task )
      {
         printf( "%d -> {", t->order );

         for ( j = 0, d = t->dep_arg_head; j < t->n_dep_args; j++, d = d->next_dep )
            printf( "%d;", d->task->order );

         printf( "};\n" );
      }
   }

   printf( "}\n\n" );

   return;
}
| FLASH_Task * FLASH_Queue_wait_dequeue | ( | int | queue, |
| int | cache, | ||
| void * | arg | ||
| ) |
References FLASH_Queue_variables::cac_lock, FLA_Lock_acquire(), FLA_Lock_release(), FLASH_Queue_get_caching(), FLASH_Queue_get_enabled_gpu(), FLASH_Queue_wait_dequeue_block(), FLASH_Queue_variables::gpu_lock, FLASH_Queue_s::head, FLASH_Queue_s::n_tasks, FLASH_Queue_variables::n_wait, FLASH_Task_s::next_wait, FLASH_Queue_variables::pc, FLASH_Task_s::prev_wait, FLASH_Queue_s::tail, FLASH_Queue_variables::task_queue, and FLASH_Queue_variables::wait_queue.
Referenced by FLASH_Queue_exec_parallel_function(), FLASH_Queue_exec_simulation(), and FLASH_Task_update_dependencies().
{
   // Removes and returns the task at the front of the waiting queue, or
   // NULL when the waiting queue is empty. The `queue` and `cache`
   // parameters are not used by this implementation.
   FLASH_Queue_vars* args = ( FLASH_Queue_vars* ) arg;
   FLASH_Task*       front;

   // Nothing is waiting.
   if ( !( args->n_wait[0] > 0 ) )
      return NULL;

   // wait_queue holds task indices; pc[0] is the index of the current front.
   front = args->task_queue[args->wait_queue[args->pc[0]]];

   // One fewer task is waiting ...
   args->n_wait[0]--;

   // ... and the front advances to the next entry.
   args->pc[0]++;

   return front;
}
| FLASH_Task* FLASH_Queue_wait_dequeue_block | ( | int | queue, |
| int | cache, | ||
| void * | arg | ||
| ) |
References FLA_Obj_view::base, FLASH_Queue_variables::cache, FLA_Obj_elemtype(), FLASH_Queue_get_enabled_gpu(), FLASH_Queue_get_gpu_num_blocks(), FLASH_Queue_variables::gpu, FLASH_Queue_s::head, FLASH_Task_s::hit, FLASH_Task_s::n_output_args, FLASH_Queue_s::n_tasks, FLASH_Task_s::next_wait, FLA_Obj_gpu_struct::obj, FLASH_Task_s::output_arg, FLASH_Queue_variables::size, and FLASH_Queue_variables::wait_queue.
Referenced by FLASH_Queue_wait_dequeue().
{
   // Scans waiting queue `queue` for the first task that has an output block
   // currently held in the block list of `cache` (or in the GPU block table
   // when GPU execution is enabled). Such a task is flagged as a hit and
   // returned; if none qualifies, the head of the waiting queue is returned.
   FLASH_Queue_vars* args = ( FLASH_Queue_vars* ) arg;
   int         task_idx, slot, out;
   int         n_slots   = args->size;
   int         n_waiting = args->wait_queue[queue].n_tasks;
   FLA_Bool    use_gpu   = FALSE;
   FLASH_Task* cand;
   FLA_Obj     wanted;
   FLA_Obj     resident;

#ifdef FLA_ENABLE_GPU
   use_gpu = FLASH_Queue_get_enabled_gpu();

   // With GPUs in use, only the GPU block table is consulted.
   if ( use_gpu )
      n_slots = FLASH_Queue_get_gpu_num_blocks();
#endif

   for ( task_idx = 0, cand = args->wait_queue[queue].head;
         task_idx < n_waiting;
         task_idx++, cand = cand->next_wait )
   {
      for ( slot = 0; slot < n_slots; slot++ )
      {
         // Default to "no block" in case neither source below is read.
         resident.base = NULL;

         if ( !use_gpu )
         {
            resident = args->cache[cache * n_slots + slot];
         }
         else
         {
#ifdef FLA_ENABLE_GPU
            resident = args->gpu[cache * n_slots + slot].obj;
#endif
         }

         for ( out = 0; out < cand->n_output_args; out++ )
         {
            wanted = cand->output_arg[out];

            // For a macroblock, compare against the object that
            // FLASH_OBJ_PTR_AT yields for it.
            if ( FLA_Obj_elemtype( wanted ) == FLA_MATRIX )
               wanted = *FLASH_OBJ_PTR_AT( wanted );

            // The candidate's output block is resident: take this task.
            if ( resident.base == wanted.base )
            {
               cand->hit = TRUE;
               return cand;
            }
         }
      }
   }

   // No task had a resident output block.
   return args->wait_queue[queue].head;
}
| void FLASH_Queue_wait_enqueue | ( | FLASH_Task * | t, |
| void * | arg | ||
| ) |
References FLASH_Queue_get_sorting(), FLASH_Queue_s::head, FLASH_Task_s::height, FLASH_Queue_s::n_tasks, FLASH_Queue_variables::n_wait, FLASH_Task_s::next_wait, FLASH_Task_s::order, FLASH_Queue_variables::pc, FLASH_Task_s::prev_wait, FLASH_Task_s::queue, FLASH_Queue_s::tail, FLASH_Queue_variables::task_queue, and FLASH_Queue_variables::wait_queue.
Referenced by FLASH_Queue_exec_gpu(), FLASH_Queue_exec_simulation(), FLASH_Queue_init_tasks(), FLASH_Task_update_binding(), and FLASH_Task_update_dependencies().
{
   // Adds task t (by its order index) to the waiting queue. Without sorting
   // the task goes to the end; with sorting it is inserted ahead of every
   // waiting task whose height is not greater than its own.
   FLASH_Queue_vars* args = ( FLASH_Queue_vars* ) arg;

   // First free position past the end of the waiting entries.
   int slot = args->n_wait[0] + args->pc[0];

   if ( FLASH_Queue_get_sorting() )
   {
      // Insertion step: move entries up by one position until an entry with
      // a strictly greater height (or the front of the queue) is reached.
      while ( slot > args->pc[0] &&
              !( args->task_queue[args->wait_queue[slot-1]]->height >
                 args->task_queue[t->order]->height ) )
      {
         args->wait_queue[slot] = args->wait_queue[slot-1];
         slot--;
      }
   }

   args->wait_queue[slot] = t->order;

   // One more task is waiting.
   args->n_wait[0]++;
}
| FLASH_Task* FLASH_Queue_work_stealing | ( | int | queue, |
| void * | arg | ||
| ) |
References FLA_Lock_acquire(), FLA_Lock_release(), FLASH_Queue_s::head, FLASH_Queue_variables::n_queues, FLASH_Queue_s::n_tasks, FLASH_Task_s::next_wait, FLASH_Task_s::prev_wait, FLASH_Task_s::queue, FLASH_Queue_variables::run_lock, FLASH_Queue_s::tail, and FLASH_Queue_variables::wait_queue.
Referenced by FLASH_Queue_exec_parallel_function().
{
   // Attempts to steal one task for waiting queue `queue` from the tail of a
   // randomly chosen other waiting queue. Returns the stolen task (with its
   // queue field set to `queue` and its wait links cleared), or NULL when
   // there is at most one queue or the chosen victim queue is empty.
   FLASH_Queue_vars* args = ( FLASH_Queue_vars* ) arg;
   int         q;
   int         n_queues = args->n_queues;
   FLASH_Task* t = NULL;
#ifdef FLA_ENABLE_WINDOWS_BUILD
   unsigned int r;
#endif

   // Do not perform work stealing unless there is another queue to steal
   // from. (Also keeps the modulo below away from a zero divisor.)
   if ( n_queues <= 1 )
      return t;

   // Find a random queue not equal to the current queue.
   do
   {
#ifdef FLA_ENABLE_WINDOWS_BUILD
      // rand_s() writes an unsigned int covering the full unsigned range.
      // Receiving it in a signed int can yield a negative value, and hence a
      // negative queue index after the modulo, so reduce it while unsigned.
      // If rand_s() reports failure, fall back to the neighbouring queue so
      // that the loop still terminates.
      if ( rand_s( &r ) != 0 )
         r = ( unsigned int ) queue + 1;
      q = ( int ) ( r % ( unsigned int ) n_queues );
#else
#ifdef FLA_ENABLE_TIDSP
      q = rand() % n_queues;
#else
      q = lrand48() % n_queues;
#endif
#endif
   }
   while ( q == queue );

#ifdef FLA_ENABLE_MULTITHREADING
   FLA_Lock_acquire( &(args->run_lock[q]) ); // R ***
#endif

   // If there are tasks that this thread can steal.
   if ( args->wait_queue[q].n_tasks > 0 )
   {
      // Dequeue the last task.
      t = args->wait_queue[q].tail;

      if ( args->wait_queue[q].n_tasks == 1 )
      {
         // Clear the queue of its only task.
         args->wait_queue[q].head = NULL;
         args->wait_queue[q].tail = NULL;
      }
      else
      {
         // Adjust pointers in waiting queue.
         args->wait_queue[q].tail = t->prev_wait;
         args->wait_queue[q].tail->next_wait = NULL;
      }

      // Reset waiting queue data about the stolen task.
      t->queue = queue;
      t->prev_wait = NULL;
      t->next_wait = NULL;

      args->wait_queue[q].n_tasks--;
   }

#ifdef FLA_ENABLE_MULTITHREADING
   FLA_Lock_release( &(args->run_lock[q]) ); // R ***
#endif

   return t;
}
| FLASH_Task* FLASH_Task_alloc | ( | void * | func, |
| void * | cntl, | ||
| char * | name, | ||
| FLA_Bool | enabled_gpu, | ||
| int | n_int_args, | ||
| int | n_fla_args, | ||
| int | n_input_args, | ||
| int | n_output_args | ||
| ) |
References FLASH_Task_s::cache, FLASH_Task_s::cntl, FLASH_Task_s::dep_arg_head, FLASH_Task_s::dep_arg_tail, FLASH_Task_s::enabled_gpu, FLASH_Task_s::fla_arg, FLA_malloc(), FLASH_Task_s::func, FLASH_Task_s::height, FLASH_Task_s::hit, FLASH_Task_s::input_arg, FLASH_Task_s::int_arg, FLASH_Task_s::n_dep_args, FLASH_Task_s::n_fla_args, FLASH_Task_s::n_input_args, FLASH_Task_s::n_int_args, FLASH_Task_s::n_macro_args, FLASH_Task_s::n_output_args, FLASH_Task_s::n_ready, FLASH_Task_s::n_war_args, FLASH_Task_s::name, FLASH_Task_s::next_task, FLASH_Task_s::next_wait, FLASH_Task_s::order, FLASH_Task_s::output_arg, FLASH_Task_s::prev_task, FLASH_Task_s::prev_wait, FLASH_Task_s::queue, and FLASH_Task_s::thread.
Referenced by FLASH_Queue_push().
{
   // Allocates a task structure together with its four argument arrays
   // (sized by the n_*_args counts), stores the caller's values, and resets
   // every other field to zero / FALSE / NULL.
   FLASH_Task* task = (FLASH_Task *) FLA_malloc( sizeof(FLASH_Task) );

   // Argument arrays: integer, FLA_Obj, input FLA_Obj, and output FLA_Obj.
   task->int_arg    = (int *)     FLA_malloc( n_int_args    * sizeof(int) );
   task->fla_arg    = (FLA_Obj *) FLA_malloc( n_fla_args    * sizeof(FLA_Obj) );
   task->input_arg  = (FLA_Obj *) FLA_malloc( n_input_args  * sizeof(FLA_Obj) );
   task->output_arg = (FLA_Obj *) FLA_malloc( n_output_args * sizeof(FLA_Obj) );

   // Values supplied by the caller.
   task->func          = func;
   task->cntl          = cntl;
   task->name          = name;
   task->enabled_gpu   = enabled_gpu;
   task->n_int_args    = n_int_args;
   task->n_fla_args    = n_fla_args;
   task->n_input_args  = n_input_args;
   task->n_output_args = n_output_args;

   // Scheduling fields start out cleared.
   task->n_ready = 0;
   task->order   = 0;
   task->queue   = 0;
   task->height  = 0;
   task->thread  = 0;
   task->cache   = 0;
   task->hit     = FALSE;

   // No macroblock, write-after-read, or dependence entries yet.
   task->n_macro_args = 0;
   task->n_war_args   = 0;
   task->n_dep_args   = 0;
   task->dep_arg_head = NULL;
   task->dep_arg_tail = NULL;

   // Not linked into the task list or a waiting list yet.
   task->prev_task = NULL;
   task->next_task = NULL;
   task->prev_wait = NULL;
   task->next_wait = NULL;

   return task;
}
| void FLASH_Task_free | ( | FLASH_Task * | t | ) |
References FLA_Obj_view::base, FLASH_Task_s::dep_arg_head, FLASH_Task_s::fla_arg, FLA_free(), FLA_Obj_col_stride(), FLA_Obj_elemtype(), FLA_Obj_length(), FLA_Obj_width(), FLASH_Task_s::input_arg, FLASH_Task_s::int_arg, FLASH_Task_s::n_dep_args, FLASH_Task_s::n_input_args, FLASH_Task_s::n_output_args, FLA_Obj_struct::n_read_tasks, FLASH_Dep_s::next_dep, FLASH_Task_s::output_arg, FLA_Obj_struct::read_task_head, FLA_Obj_struct::read_task_tail, and FLA_Obj_struct::write_task.
Referenced by FLASH_Queue_exec(), and FLASH_Queue_exec_simulation().
{
   // Releases task t: clears the write_task field of every block it wrote,
   // empties and frees the reader list of every block it read, frees its own
   // dependence list and argument arrays, and finally the task itself.
   int        i;
   int        remaining;
   FLA_Obj    obj;
   FLASH_Dep* node;
   FLASH_Dep* after;

   // Output operands: forget the recorded last writer.
   for ( i = 0; i < t->n_output_args; i++ )
   {
      obj = t->output_arg[i];

      if ( FLA_Obj_elemtype( obj ) != FLA_MATRIX )
      {
         // Regular block.
         obj.base->write_task = NULL;
      }
      else
      {
         // Macroblock: clear every constituent block.
         dim_t    row, col;
         dim_t    m      = FLA_Obj_length( obj );
         dim_t    n      = FLA_Obj_width( obj );
         dim_t    cs     = FLA_Obj_col_stride( obj );
         FLA_Obj* blocks = FLASH_OBJ_PTR_AT( obj );

         for ( col = 0; col < n; col++ )
            for ( row = 0; row < m; row++ )
               blocks[ col * cs + row ].base->write_task = NULL;
      }
   }

   // Input operands: detach and free the reader list of each block.
   for ( i = 0; i < t->n_input_args; i++ )
   {
      obj = t->input_arg[i];

      if ( FLA_Obj_elemtype( obj ) != FLA_MATRIX )
      {
         // Regular block: detach the list first, then free its nodes.
         remaining = obj.base->n_read_tasks;
         node      = obj.base->read_task_head;

         obj.base->n_read_tasks   = 0;
         obj.base->read_task_head = NULL;
         obj.base->read_task_tail = NULL;

         while ( remaining-- > 0 )
         {
            after = node->next_dep;
            FLA_free( node );
            node = after;
         }
      }
      else
      {
         // Macroblock: do the same for every constituent block.
         dim_t    row, col;
         dim_t    m      = FLA_Obj_length( obj );
         dim_t    n      = FLA_Obj_width( obj );
         dim_t    cs     = FLA_Obj_col_stride( obj );
         FLA_Obj* blocks = FLASH_OBJ_PTR_AT( obj );

         for ( col = 0; col < n; col++ )
         {
            for ( row = 0; row < m; row++ )
            {
               obj = blocks[ col * cs + row ];

               remaining = obj.base->n_read_tasks;
               node      = obj.base->read_task_head;

               obj.base->n_read_tasks   = 0;
               obj.base->read_task_head = NULL;
               obj.base->read_task_tail = NULL;

               while ( remaining-- > 0 )
               {
                  after = node->next_dep;
                  FLA_free( node );
                  node = after;
               }
            }
         }
      }
   }

   // The task's own list of dependent tasks.
   node = t->dep_arg_head;
   for ( i = 0; i < t->n_dep_args; i++ )
   {
      after = node->next_dep;
      FLA_free( node );
      node = after;
   }

   // Argument arrays, then the structure itself.
   FLA_free( t->int_arg );
   FLA_free( t->fla_arg );
   FLA_free( t->input_arg );
   FLA_free( t->output_arg );
   FLA_free( t );
}
| void FLASH_Task_free_parallel | ( | FLASH_Task * | t, |
| void * | arg | ||
| ) |
References FLA_Obj_view::base, FLASH_Task_s::dep_arg_head, FLASH_Task_s::fla_arg, FLA_free(), FLA_Lock_acquire(), FLA_Lock_release(), FLA_Obj_col_stride(), FLA_Obj_elemtype(), FLA_Obj_length(), FLA_Obj_width(), FLASH_Queue_get_num_threads(), FLASH_Task_s::input_arg, FLASH_Task_s::int_arg, FLASH_Task_s::n_dep_args, FLASH_Task_s::n_input_args, FLASH_Task_s::n_output_args, FLA_Obj_struct::n_read_blocks, FLA_Obj_struct::n_read_tasks, FLASH_Dep_s::next_dep, FLASH_Task_s::output_arg, FLA_Obj_struct::read_task_head, FLA_Obj_struct::read_task_tail, FLASH_Queue_variables::war_lock, and FLA_Obj_struct::write_task.
Referenced by FLASH_Queue_exec_parallel_function().
{
   // Variant of task release that takes a war_lock while touching a block's
   // reader list. Clears write_task on every block t wrote, detaches (under
   // the lock) and frees (outside the lock) the reader list of every block t
   // read, then frees t's dependence list, argument arrays, and t itself.
   FLASH_Queue_vars* args = ( FLASH_Queue_vars* ) arg;
   int        i;
   int        remaining;
   int        lock_id;
   int        n_threads = FLASH_Queue_get_num_threads();
   FLASH_Dep* node;
   FLASH_Dep* after;
   FLA_Obj    obj;

   // Output operands: forget the recorded last writer.
   for ( i = 0; i < t->n_output_args; i++ )
   {
      obj = t->output_arg[i];

      if ( FLA_Obj_elemtype( obj ) != FLA_MATRIX )
      {
         // Regular block.
         obj.base->write_task = NULL;
      }
      else
      {
         // Macroblock: clear every constituent block.
         dim_t    row, col;
         dim_t    m      = FLA_Obj_length( obj );
         dim_t    n      = FLA_Obj_width( obj );
         dim_t    cs     = FLA_Obj_col_stride( obj );
         FLA_Obj* blocks = FLASH_OBJ_PTR_AT( obj );

         for ( col = 0; col < n; col++ )
            for ( row = 0; row < m; row++ )
               blocks[ col * cs + row ].base->write_task = NULL;
      }
   }

   // Input operands: detach and free the reader list of each block.
   for ( i = 0; i < t->n_input_args; i++ )
   {
      obj = t->input_arg[i];

      if ( FLA_Obj_elemtype( obj ) != FLA_MATRIX )
      {
         // Regular block. The lock is selected by the block's read-block id.
         lock_id = obj.base->n_read_blocks % n_threads;

         FLA_Lock_acquire( &(args->war_lock[lock_id]) ); // W ***
         remaining = obj.base->n_read_tasks;
         node      = obj.base->read_task_head;
         obj.base->n_read_tasks   = 0;
         obj.base->read_task_head = NULL;
         obj.base->read_task_tail = NULL;
         FLA_Lock_release( &(args->war_lock[lock_id]) ); // W ***

         // The detached nodes are freed without holding the lock.
         while ( remaining-- > 0 )
         {
            after = node->next_dep;
            FLA_free( node );
            node = after;
         }
      }
      else
      {
         // Macroblock: do the same for every constituent block.
         dim_t    row, col;
         dim_t    m      = FLA_Obj_length( obj );
         dim_t    n      = FLA_Obj_width( obj );
         dim_t    cs     = FLA_Obj_col_stride( obj );
         FLA_Obj* blocks = FLASH_OBJ_PTR_AT( obj );

         for ( col = 0; col < n; col++ )
         {
            for ( row = 0; row < m; row++ )
            {
               obj = blocks[ col * cs + row ];

               lock_id = obj.base->n_read_blocks % n_threads;

               FLA_Lock_acquire( &(args->war_lock[lock_id]) ); // W ***
               remaining = obj.base->n_read_tasks;
               node      = obj.base->read_task_head;
               obj.base->n_read_tasks   = 0;
               obj.base->read_task_head = NULL;
               obj.base->read_task_tail = NULL;
               FLA_Lock_release( &(args->war_lock[lock_id]) ); // W ***

               // The detached nodes are freed without holding the lock.
               while ( remaining-- > 0 )
               {
                  after = node->next_dep;
                  FLA_free( node );
                  node = after;
               }
            }
         }
      }
   }

   // The task's own list of dependent tasks.
   node = t->dep_arg_head;
   for ( i = 0; i < t->n_dep_args; i++ )
   {
      after = node->next_dep;
      FLA_free( node );
      node = after;
   }

   // Argument arrays, then the structure itself.
   FLA_free( t->int_arg );
   FLA_free( t->fla_arg );
   FLA_free( t->input_arg );
   FLA_free( t->output_arg );
   FLA_free( t );
}
| FLASH_Task* FLASH_Task_update_binding | ( | FLASH_Task * | t, |
| FLASH_Task * | r, | ||
| void * | arg | ||
| ) |
References FLA_Lock_acquire(), FLA_Lock_release(), FLASH_Queue_get_sorting(), FLASH_Queue_wait_enqueue(), FLASH_Task_s::height, FLASH_Task_s::hit, FLASH_Task_s::queue, and FLASH_Queue_variables::run_lock.
Referenced by FLASH_Task_update_dependencies().
{
   // Decides which of two ready tasks stays bound: the currently bound task r
   // (possibly NULL) or the newly ready task t. The task that is not kept is
   // placed on the waiting queue; the kept task is returned.
   FLASH_Queue_vars* args = ( FLASH_Queue_vars* ) arg;
   int queue;

   if ( r != NULL )
   {
      // r keeps its binding only if it is flagged as a hit and, when sorting
      // is enabled, its height is not smaller than t's.
      int keep_bound = r->hit &&
                       !( FLASH_Queue_get_sorting() && r->height < t->height );

      if ( keep_bound )
      {
         // The new ready task goes onto the waiting queue instead.
         queue = t->queue;

         FLA_Lock_acquire( &(args->run_lock[queue]) ); // R ***
         FLASH_Queue_wait_enqueue( t, arg );
         FLA_Lock_release( &(args->run_lock[queue]) ); // R ***

         return r;
      }

      // Swap: the previously bound task loses its hit flag and returns to
      // the waiting queue.
      queue  = r->queue;
      r->hit = FALSE;

      FLA_Lock_acquire( &(args->run_lock[queue]) ); // R ***
      FLASH_Queue_wait_enqueue( r, arg );
      FLA_Lock_release( &(args->run_lock[queue]) ); // R ***
   }

   // Either nothing was bound before or the bound task was swapped out:
   // bind the new ready task.
   t->hit = TRUE;

   return t;
}
FLASH_Task* FLASH_Task_update_dependencies( FLASH_Task* t, void* arg )
References FLASH_Task_s::cache, FLASH_Task_s::dep_arg_head, FLASH_Queue_variables::dep_lock, FLA_Lock_acquire(), FLA_Lock_release(), FLASH_Queue_get_caching(), FLASH_Queue_get_num_threads(), FLASH_Queue_get_work_stealing(), FLASH_Queue_wait_dequeue(), FLASH_Queue_wait_enqueue(), FLASH_Task_update_binding(), FLASH_Task_s::n_dep_args, FLASH_Task_s::n_ready, FLASH_Queue_variables::n_ready, FLASH_Dep_s::next_dep, FLASH_Task_s::order, FLASH_Task_s::queue, RCCE_acquire_lock(), RCCE_release_lock(), FLASH_Queue_variables::run_lock, and FLASH_Dep_s::task.
Referenced by FLASH_Queue_exec_parallel_function().
{
   // Visits every dependent task recorded in the dep_arg list of t,
   // decrements that task's entry in the n_ready array, and enqueues the
   // task on the waiting queue once its entry reaches zero.
   //   t   - the task whose dependents are to be updated.
   //   arg - opaque pointer to the FLASH_Queue_vars holding n_ready.
   // This variant never binds a task, so it always returns NULL.
   FLASH_Queue_vars* vars      = ( FLASH_Queue_vars* ) arg;
   int               n_threads = FLASH_Queue_get_num_threads();
   int               visited   = 0;
   int               lock_id;
   FLA_Bool          is_ready;
   FLASH_Task*       dependent;
   FLASH_Dep*        dep       = t->dep_arg_head;

   while ( visited < t->n_dep_args )
   {
      dependent = dep->task;

      // Counter updates use locks 1 .. n_threads-1; lock 0 is left for the
      // enqueue below. With a single thread only lock 0 is used.
      lock_id = 0;
      if ( n_threads > 1 )
         lock_id = dependent->order % ( n_threads - 1 ) + 1;

      // Decrement the counter and test it while holding the lock.
      RCCE_acquire_lock( lock_id );
      args_n_ready_update:
      vars->n_ready[dependent->order]--;
      is_ready = ( vars->n_ready[dependent->order] == 0 );
      RCCE_release_lock( lock_id );

      // A task whose counter reached zero goes onto the waiting queue,
      // with the enqueue wrapped in lock 0.
      if ( is_ready )
      {
         RCCE_acquire_lock( 0 );
         FLASH_Queue_wait_enqueue( dependent, arg );
         RCCE_release_lock( 0 );
      }

      // Advance to the next dependency entry.
      dep = dep->next_dep;
      visited++;
   }

   return NULL;
}
1.7.6.1