35 #ifndef WIN32_LEAN_AND_MEAN
36 # define WIN32_LEAN_AND_MEAN
47 # include <VersionHelpers.h>
// Module handle for "Kernel32.dll"; assigned from LoadLibraryA() elsewhere in
// this file and handed to GetProcAddress() through the KERNEL_LIBRARY_FIND
// macro.
54 static HMODULE kernel_lib;
// Function types for the NUMA topology queries that are looked up by name at
// run time. The _LIBRARY_FIND macro casts the GetProcAddress() result to
// `t_<name> *`, so each typedef is named after the symbol it is resolved from.
// NOTE(review): no calling convention (e.g. WINAPI) is spelled out on these
// types -- confirm that is acceptable for every architecture this is built for.
59 typedef BOOL t_GetNumaHighestNodeNumber(PULONG highest_node_number);
60 typedef BOOL t_GetNumaNodeProcessorMask(
UCHAR node, ULONGLONG* processor_mask);
// "Ex" variant: takes a USHORT node index and fills a GROUP_AFFINITY (callers
// in this file read its Mask field) instead of a plain ULONGLONG mask.
61 typedef BOOL t_GetNumaNodeProcessorMaskEx(
USHORT node,
62 GROUP_AFFINITY* processor_mask);
63 typedef BOOL t_GetNumaProcessorNode(
UCHAR processor,
UCHAR* node_number);
64 typedef void* t_VirtualAllocExNuma(HANDLE process_handle,
67 DWORD allocation_type,
// Function type used when resolving VirtualFree by name. The call site visible
// in this file passes (start, size, MEM_RELEASE) and branches on a zero
// (FALSE) result.
70 typedef BOOL t_VirtualFree(
void* address, SIZE_T
size, DWORD free_type);
// Function types for the affinity and processor-number entry points that are
// resolved by name at run time (cast target of GetProcAddress() in
// _LIBRARY_FIND).

// Called in this file with GetCurrentProcess() and the Mask field of a
// GROUP_AFFINITY.
72 typedef BOOL t_SetProcessAffinityMask(HANDLE process_handle,
73 DWORD_PTR process_affinity_mask);
// Called in this file with NULL for PreviousGroupAffinity.
74 typedef BOOL t_SetThreadGroupAffinity(HANDLE thread_handle,
75 const GROUP_AFFINITY* group_affinity,
76 GROUP_AFFINITY* PreviousGroupAffinity);
74 typedef BOOL t_GetThreadGroupAffinity(HANDLE thread_handle,
78 GROUP_AFFINITY* group_affinity);
// The caller in this file narrows the DWORD result to UCHAR before use.
79 typedef DWORD t_GetCurrentProcessorNumber(
void);
80 typedef void t_GetCurrentProcessorNumberEx(PROCESSOR_NUMBER* proc_number);
// Called in this file with the Group field of a thread's GROUP_AFFINITY.
81 typedef DWORD t_GetActiveProcessorCount(WORD group_number);
// File-local pointers to the dynamically resolved functions. Each `_<name>` is
// assigned by KERNEL_LIBRARY_FIND(<name>), which stores the result of
// GetProcAddress(kernel_lib, "<name>") cast to `t_<name> *`.
// NOTE(review): identifiers that start with an underscore followed by an
// uppercase letter are reserved in C; a rename would have to cover the macro
// (which builds the name via `_##name`) and every call site.

// NUMA topology queries.
85 static t_GetNumaHighestNodeNumber* _GetNumaHighestNodeNumber;
86 static t_GetNumaNodeProcessorMask* _GetNumaNodeProcessorMask;
87 static t_GetNumaNodeProcessorMaskEx* _GetNumaNodeProcessorMaskEx;
88 static t_GetNumaProcessorNode* _GetNumaProcessorNode;
// Memory allocation / release.
89 static t_VirtualAllocExNuma* _VirtualAllocExNuma;
90 static t_VirtualFree* _VirtualFree;
// Process / thread affinity and processor-number queries.
92 static t_SetProcessAffinityMask* _SetProcessAffinityMask;
93 static t_SetThreadGroupAffinity* _SetThreadGroupAffinity;
94 static t_GetThreadGroupAffinity* _GetThreadGroupAffinity;
95 static t_GetCurrentProcessorNumber* _GetCurrentProcessorNumber;
96 static t_GetCurrentProcessorNumberEx* _GetCurrentProcessorNumberEx;
97 static t_GetActiveProcessorCount* _GetActiveProcessorCount;
99 static void numaExit(
void) {
112 const int error = atexit(numaExit);
118 kernel_lib = LoadLibraryA(
"Kernel32.dll");
121 #define _LIBRARY_FIND(lib, name) \
123 _##name = (t_##name *)GetProcAddress(lib, #name); \
125 #define KERNEL_LIBRARY_FIND(name) _LIBRARY_FIND(kernel_lib, name)
128 KERNEL_LIBRARY_FIND(GetNumaHighestNodeNumber);
129 KERNEL_LIBRARY_FIND(GetNumaNodeProcessorMask);
130 KERNEL_LIBRARY_FIND(GetNumaNodeProcessorMaskEx);
131 KERNEL_LIBRARY_FIND(GetNumaProcessorNode);
132 KERNEL_LIBRARY_FIND(VirtualAllocExNuma);
133 KERNEL_LIBRARY_FIND(VirtualFree);
135 KERNEL_LIBRARY_FIND(SetProcessAffinityMask);
136 KERNEL_LIBRARY_FIND(SetThreadGroupAffinity);
137 KERNEL_LIBRARY_FIND(GetThreadGroupAffinity);
138 KERNEL_LIBRARY_FIND(GetCurrentProcessorNumber);
139 KERNEL_LIBRARY_FIND(GetCurrentProcessorNumberEx);
140 KERNEL_LIBRARY_FIND(GetActiveProcessorCount);
142 #undef KERNEL_LIBRARY_FIND
150 #if !ARCH_CPU_64_BITS
154 if (!IsWindows7OrGreater()) {
166 static int countNumSetBits(ULONGLONG
mask) {
171 num_bits += (
mask & 1);
181 ULONG highest_node_number;
182 if (!_GetNumaHighestNodeNumber(&highest_node_number)) {
187 return (
int)highest_node_number + 1;
196 GROUP_AFFINITY processor_mask = { 0 };
197 if (!_GetNumaNodeProcessorMaskEx(
node, &processor_mask)) {
200 if (processor_mask.Mask == 0) {
207 GROUP_AFFINITY processor_mask = { 0 };
208 if (!_GetNumaNodeProcessorMaskEx(
node, &processor_mask)) {
211 return countNumSetBits(processor_mask.Mask);
218 HANDLE thread_handle = GetCurrentThread();
219 GROUP_AFFINITY group_affinity;
221 if (!_GetThreadGroupAffinity(thread_handle, &group_affinity)) {
225 const int num_processors = countNumSetBits(group_affinity.Mask);
227 const int num_group_processors =
228 _GetActiveProcessorCount(group_affinity.Group);
229 if (num_group_processors < num_processors) {
230 return num_group_processors;
232 return num_processors;
241 HANDLE process_handle = GetCurrentProcess();
242 GROUP_AFFINITY processor_mask = { 0 };
243 if (_GetNumaNodeProcessorMaskEx(
node, &processor_mask) == 0) {
247 if (_SetProcessAffinityMask(process_handle, processor_mask.Mask) == 0) {
254 HANDLE thread_handle = GetCurrentThread();
255 GROUP_AFFINITY group_affinity = { 0 };
256 if (_GetNumaNodeProcessorMaskEx(
node, &group_affinity) == 0) {
259 if (_SetThreadGroupAffinity(thread_handle, &group_affinity,
NULL) == 0) {
269 return _VirtualAllocExNuma(GetCurrentProcess(),
272 MEM_RESERVE | MEM_COMMIT,
278 UCHAR current_processor = (
UCHAR)_GetCurrentProcessorNumber();
280 if (!_GetNumaProcessorNode(current_processor, &
node)) {
287 if (!_VirtualFree(start,
size, MEM_RELEASE)) {
static DBVT_INLINE btScalar size(const btDbvtVolume &a)
static void error(const char *str)
bool numaAPI_IsNodeAvailable(int node)
bool numaAPI_RunProcessOnNode(int node)
void * numaAPI_AllocateOnNode(size_t size, int node)
NUMAAPI_Result numaAPI_Initialize(void)
void * numaAPI_AllocateLocal(size_t size)
void numaAPI_Free(void *start, size_t size)
int numaAPI_GetNumCurrentNodesProcessors(void)
int numaAPI_GetNumNodeProcessors(int node)
int numaAPI_GetNumNodes(void)
bool numaAPI_RunThreadOnNode(int node)
ccl_device_inline float4 mask(const int4 &mask, const float4 &a)