PAPI  5.7.0.0
simpleMultiGPU.h
Go to the documentation of this file.
1 /*
2  * PAPI Multiple GPU example. This example is taken from the NVIDIA
3  * documentation (Copyright 1993-2013 NVIDIA Corporation) and has been
4  * adapted to show the use of CUPTI and PAPI in collecting event
5  * counters for multiple GPU contexts. PAPI Team (2015)
6  */
7 
8 /*
9  * This software contains source code provided by NVIDIA Corporation
10  *
11  * According to the Nvidia EULA (compute 5.5 version)
12  * http://developer.download.nvidia.com/compute/cuda/5_5/rel/docs/EULA.pdf
13  *
14  * Chapter 2. NVIDIA CORPORATION CUDA SAMPLES END USER LICENSE AGREEMENT
15  * 2.1.1. Source Code
16  * Developer shall have the right to modify and create derivative works with the Source
17  * Code. Developer shall own any derivative works ("Derivatives") it creates to the Source
18  * Code, provided that Developer uses the Materials in accordance with the terms and
19  * conditions of this Agreement. Developer may distribute the Derivatives, provided that
20  * all NVIDIA copyright notices and trademarks are propagated and used properly and
21  * the Derivatives include the following statement: “This software contains source code
22  * provided by NVIDIA Corporation.”
23  */
24 
25 /*
26  * This application demonstrates how to use the CUDA API to use multiple GPUs.
27  *
28  * Note that in order to detect multiple GPUs in your system you have to disable
29  * SLI in the nvidia control panel. Otherwise only one GPU is visible to the
30  * application. On the other side, you can still extend your desktop to screens
31  * attached to both GPUs.
32  */
33 
34 #ifndef SIMPLEMULTIGPU_H
35 #define SIMPLEMULTIGPU_H
36 
/*
 * Per-GPU work descriptor: one TGPUplan is filled in for each device so the
 * host can drive several GPUs concurrently, each on its own stream.
 */
typedef struct
{
    // Host-side input data
    int dataN;          // number of input elements assigned to this GPU
    float *h_Data;      // host pointer to this GPU's slice of the input

    // Partial sum for this GPU
    float *h_Sum;

    // Device buffers
    float *d_Data, *d_Sum;

    // Reduction copied back from GPU
    // NOTE(review): this member was missing in the garbled copy although the
    // comment above and the generated member index both reference it; restored
    // to match the original NVIDIA/PAPI sample.
    float *h_Sum_from_device;

    // Stream for asynchronous command execution
    cudaStream_t stream;

} TGPUplan;
56 
/*
 * Host wrapper (implemented in the companion .cu file) that launches the
 * reduction kernel on the given stream.
 *
 *   d_Result - device output buffer (presumably one partial sum per block,
 *              i.e. BLOCK_N floats — TODO confirm against the kernel source)
 *   d_Input  - device input buffer of N floats
 *   N        - number of input elements to reduce
 *   BLOCK_N  - grid dimension (number of thread blocks)
 *   THREAD_N - block dimension (threads per block)
 *   s        - CUDA stream the kernel is enqueued on (asynchronous launch)
 *
 * extern "C" gives the wrapper unmangled linkage so it can be resolved from
 * separately compiled translation units.
 */
extern "C"
void launch_reduceKernel(float *d_Result, float *d_Input, int N, int BLOCK_N, int THREAD_N, cudaStream_t &s);
59 
60 #endif
float * h_Data
float * d_Sum
cudaStream_t stream
float * h_Sum
double s
Definition: byte_profile.c:36
void launch_reduceKernel(float *d_Result, float *d_Input, int N, int BLOCK_N, int THREAD_N, cudaStream_t &s)
float * h_Sum_from_device
#define N
Definition: byte_profile.c:32