Blender  V2.93
kernel_config.h
Go to the documentation of this file.
1 /*
2  * Copyright 2011-2013 Blender Foundation
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 /* device data taken from CUDA occupancy calculator */
18 
19 /* 3.0 and 3.5 */
20 #if __CUDA_ARCH__ == 300 || __CUDA_ARCH__ == 350
21 # define CUDA_MULTIPRESSOR_MAX_REGISTERS 65536
22 # define CUDA_MULTIPROCESSOR_MAX_BLOCKS 16
23 # define CUDA_BLOCK_MAX_THREADS 1024
24 # define CUDA_THREAD_MAX_REGISTERS 63
25 
26 /* tunable parameters */
27 # define CUDA_THREADS_BLOCK_WIDTH 16
28 # define CUDA_KERNEL_MAX_REGISTERS 63
29 # define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 63
30 
31 /* 3.2 */
32 #elif __CUDA_ARCH__ == 320
33 # define CUDA_MULTIPRESSOR_MAX_REGISTERS 32768
34 # define CUDA_MULTIPROCESSOR_MAX_BLOCKS 16
35 # define CUDA_BLOCK_MAX_THREADS 1024
36 # define CUDA_THREAD_MAX_REGISTERS 63
37 
38 /* tunable parameters */
39 # define CUDA_THREADS_BLOCK_WIDTH 16
40 # define CUDA_KERNEL_MAX_REGISTERS 63
41 # define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 63
42 
43 /* 3.7 */
44 #elif __CUDA_ARCH__ == 370
45 # define CUDA_MULTIPRESSOR_MAX_REGISTERS 65536
46 # define CUDA_MULTIPROCESSOR_MAX_BLOCKS 16
47 # define CUDA_BLOCK_MAX_THREADS 1024
48 # define CUDA_THREAD_MAX_REGISTERS 255
49 
50 /* tunable parameters */
51 # define CUDA_THREADS_BLOCK_WIDTH 16
52 # define CUDA_KERNEL_MAX_REGISTERS 63
53 # define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 63
54 
55 /* 5.x, 6.x */
56 #elif __CUDA_ARCH__ <= 699
57 # define CUDA_MULTIPRESSOR_MAX_REGISTERS 65536
58 # define CUDA_MULTIPROCESSOR_MAX_BLOCKS 32
59 # define CUDA_BLOCK_MAX_THREADS 1024
60 # define CUDA_THREAD_MAX_REGISTERS 255
61 
62 /* tunable parameters */
63 # define CUDA_THREADS_BLOCK_WIDTH 16
64 /* CUDA 9.0 seems to cause slowdowns on high-end Pascal cards unless we increase the number of
65  * registers */
66 # if __CUDACC_VER_MAJOR__ >= 9 && __CUDA_ARCH__ >= 600
67 # define CUDA_KERNEL_MAX_REGISTERS 64
68 # else
69 # define CUDA_KERNEL_MAX_REGISTERS 48
70 # endif
71 # define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 63
72 
73 /* 7.x, 8.x */
74 #elif __CUDA_ARCH__ <= 899
75 # define CUDA_MULTIPRESSOR_MAX_REGISTERS 65536
76 # define CUDA_MULTIPROCESSOR_MAX_BLOCKS 32
77 # define CUDA_BLOCK_MAX_THREADS 1024
78 # define CUDA_THREAD_MAX_REGISTERS 255
79 
80 /* tunable parameters */
81 # define CUDA_THREADS_BLOCK_WIDTH 16
82 # define CUDA_KERNEL_MAX_REGISTERS 64
83 # define CUDA_KERNEL_BRANCHED_MAX_REGISTERS 72
84 
85 /* unknown architecture */
86 #else
87 # error "Unknown or unsupported CUDA architecture, can't determine launch bounds"
88 #endif
89 
90 /* For split kernel using all registers seems fastest for now, but this
91  * is unlikely to be optimal once we resolve other bottlenecks. */
92 
93 #define CUDA_KERNEL_SPLIT_MAX_REGISTERS CUDA_THREAD_MAX_REGISTERS
94 
95 /* Compute number of threads per block and minimum blocks per multiprocessor
96  * given the maximum number of registers per thread. */
97 
98 #define CUDA_LAUNCH_BOUNDS(threads_block_width, thread_num_registers) \
99  __launch_bounds__(threads_block_width *threads_block_width, \
100  CUDA_MULTIPRESSOR_MAX_REGISTERS / \
101  (threads_block_width * threads_block_width * thread_num_registers))
102 
103 /* sanity checks */
104 
105 #if CUDA_THREADS_BLOCK_WIDTH * CUDA_THREADS_BLOCK_WIDTH > CUDA_BLOCK_MAX_THREADS
106 # error "Maximum number of threads per block exceeded"
107 #endif
108 
109 #if CUDA_MULTIPRESSOR_MAX_REGISTERS / \
110  (CUDA_THREADS_BLOCK_WIDTH * CUDA_THREADS_BLOCK_WIDTH * CUDA_KERNEL_MAX_REGISTERS) > \
111  CUDA_MULTIPROCESSOR_MAX_BLOCKS
112 # error "Maximum number of blocks per multiprocessor exceeded"
113 #endif
114 
115 #if CUDA_KERNEL_MAX_REGISTERS > CUDA_THREAD_MAX_REGISTERS
116 # error "Maximum number of registers per thread exceeded"
117 #endif
118 
119 #if CUDA_KERNEL_BRANCHED_MAX_REGISTERS > CUDA_THREAD_MAX_REGISTERS
120 # error "Maximum number of registers per thread exceeded"
121 #endif