PAPI  5.7.0.0
linux-nvml.c File Reference

This is an NVML component; it demonstrates the component interface and implements a number of counters from the NVIDIA Management Library. Please refer to the NVML documentation for details about nvmlDeviceGetPowerUsage and nvmlDeviceGetTemperature. Power is reported in mW and temperature in degrees Celsius. The counter descriptions should contain the units that the measurement returns. More...

Include dependency graph for linux-nvml.c:

Go to the source code of this file.

Data Structures

struct  nvml_context_t
 

Macros

#define CUDAAPI   __attribute__((weak))
 
#define CUDARTAPI   __attribute__((weak))
 
#define DECLDIR   __attribute__((weak))
 
#define NVML_MAX_COUNTERS   100
 

Functions

unsigned long long getClockSpeed (nvmlDevice_t dev, nvmlClockType_t which_one)
 
unsigned long long getEccLocalErrors (nvmlDevice_t dev, nvmlEccBitType_t bits, int which_one)
 
unsigned long long getFanSpeed (nvmlDevice_t dev)
 
unsigned long long getMaxClockSpeed (nvmlDevice_t dev, nvmlClockType_t which_one)
 
unsigned long long getMemoryInfo (nvmlDevice_t dev, int which_one)
 
unsigned long long getPState (nvmlDevice_t dev)
 
unsigned long long getPowerUsage (nvmlDevice_t dev)
 
unsigned long long getTemperature (nvmlDevice_t dev)
 
unsigned long long getTotalEccErrors (nvmlDevice_t dev, nvmlEccBitType_t bits)
 
unsigned long long getUtilization (nvmlDevice_t dev, int which_one)
 
unsigned long long getPowerManagementLimit (nvmlDevice_t dev)
 
static void nvml_hardware_reset ()
 
static int nvml_hardware_read (long long *value, int which_one)
 
static int nvml_hardware_write (long long *value, int which_one)
 
int _papi_nvml_init_thread (hwd_context_t *ctx)
 
static int detectDevices ()
 
static void createNativeEvents ()
 
int _papi_nvml_shutdown_component ()
 
int _papi_nvml_init_component (int cidx)
 
static int linkCudaLibraries ()
 
int _papi_nvml_init_control_state (hwd_control_state_t *ctl)
 
int _papi_nvml_update_control_state (hwd_control_state_t *ctl, NativeInfo_t *native, int count, hwd_context_t *ctx)
 
int _papi_nvml_start (hwd_context_t *ctx, hwd_control_state_t *ctl)
 
int _papi_nvml_stop (hwd_context_t *ctx, hwd_control_state_t *ctl)
 
int _papi_nvml_read (hwd_context_t *ctx, hwd_control_state_t *ctl, long long **events, int flags)
 
int _papi_nvml_write (hwd_context_t *ctx, hwd_control_state_t *ctl, long long *events)
 
int _papi_nvml_reset (hwd_context_t *ctx, hwd_control_state_t *ctl)
 
int _papi_nvml_shutdown_thread (hwd_context_t *ctx)
 
int _papi_nvml_ctl (hwd_context_t *ctx, int code, _papi_int_option_t *option)
 
int _papi_nvml_set_domain (hwd_control_state_t *cntrl, int domain)
 
int _papi_nvml_ntv_enum_events (unsigned int *EventCode, int modifier)
 
int _papi_nvml_ntv_code_to_name (unsigned int EventCode, char *name, int len)
 
int _papi_nvml_ntv_code_to_descr (unsigned int EventCode, char *descr, int len)
 
int _papi_nvml_ntv_code_to_info (unsigned int EventCode, PAPI_event_info_t *info)
 

Variables

void(* _dl_non_dynamic_init )(void)
 
 nvml_control_state_t
 
static nvml_native_event_entry_t * nvml_native_table = NULL
 
static int device_count = 0
 
static int num_events = 0
 
static nvmlDevice_t * devices = NULL
 
static int * features = NULL
 
static unsigned int * power_management_initial_limit = NULL
 
static unsigned int * power_management_limit_constraint_min = NULL
 
static unsigned int * power_management_limit_constraint_max = NULL
 
papi_vector_t _nvml_vector
 

Detailed Description

Author
Kiran Kumar Kasichayanula kkasi.nosp@m.cha@.nosp@m.utk.e.nosp@m.du
James Ralph ralph.nosp@m.@eec.nosp@m.s.utk.nosp@m..edu

Definition in file linux-nvml.c.

Macro Definition Documentation

◆ CUDAAPI

#define CUDAAPI   __attribute__((weak))

◆ CUDARTAPI

#define CUDARTAPI   __attribute__((weak))

◆ DECLDIR

#define DECLDIR   __attribute__((weak))

◆ NVML_MAX_COUNTERS

#define NVML_MAX_COUNTERS   100

Function Documentation

◆ _papi_nvml_ctl()

int _papi_nvml_ctl ( hwd_context_t *  ctx,
int  code,
_papi_int_option_t *  option 
)

This function sets various options in the component

Parameters
code: valid values are PAPI_SET_DEFDOM, PAPI_SET_DOMAIN, PAPI_SETDEFGRN, PAPI_SET_GRANUL and PAPI_SET_INHERIT

Definition at line 1471 of file linux-nvml.c.

1472 {
1473  SUBDBG("Enter: ctx: %p, code: %d\n", ctx, code);
1474 
1475  (void) ctx;
1476  (void) code;
1477  (void) option;
1478 
1479  /* FIXME. This should maybe set up more state, such as which counters are active and */
1480  /* counter mappings. */
1481 
1482  return PAPI_OK;
1483 }
#define PAPI_OK
Definition: fpapi.h:105
#define SUBDBG(format, args...)
Definition: papi_debug.h:63

◆ _papi_nvml_init_component()

int _papi_nvml_init_component ( int  cidx)

Initialize hardware counters, set up the function vector table, and get hardware information. This routine is called when the PAPI process is initialized (i.e., by PAPI_library_init).

Definition at line 1050 of file linux-nvml.c.

1051 {
1052  SUBDBG("Entry: cidx: %d\n", cidx);
1053  nvmlReturn_t ret;
1054  cudaError_t cuerr;
1055  int papi_errorcode;
1056 
1057  int cuda_count = 0;
1058  unsigned int nvml_count = 0;
1059 
1060  /* link in the cuda and nvml libraries and resolve the symbols we need to use */
1061  if (linkCudaLibraries() != PAPI_OK) {
1062  SUBDBG("Dynamic link of CUDA libraries failed, component will be disabled.\n");
1063  SUBDBG("See disable reason in papi_component_avail output for more details.\n");
1064  _papi_nvml_shutdown_component(); // clean up any open dynLibs, mallocs, etc.
1065  return (PAPI_ENOSUPP);
1066  }
1067 
1068  ret = (*nvmlInitPtr)();
1069  if (NVML_SUCCESS != ret) {
1070  strcpy(_nvml_vector.cmp_info.disabled_reason, "The NVIDIA managament library failed to initialize.");
1071  _papi_nvml_shutdown_component(); // clean up any open dynLibs, mallocs, etc.
1072  return PAPI_ENOSUPP;
1073  }
1074 
1075  cuerr = (*cuInitPtr)(0);
1076  if (cudaSuccess != cuerr) {
1077  strcpy(_nvml_vector.cmp_info.disabled_reason, "The CUDA library failed to initialize.");
1078  _papi_nvml_shutdown_component(); // clean up any open dynLibs, mallocs, etc.
1079  return PAPI_ENOSUPP;
1080  }
1081 
1082  /* Figure out the number of CUDA devices in the system */
1083  ret = (*nvmlDeviceGetCountPtr)(&nvml_count);
1084  if (NVML_SUCCESS != ret) {
1085  strcpy(_nvml_vector.cmp_info.disabled_reason, "Unable to get a count of devices from the NVIDIA managament library.");
1086  _papi_nvml_shutdown_component(); // clean up any open dynLibs, mallocs, etc.
1087  return PAPI_ENOSUPP;
1088  }
1089 
1090  cuerr = (*cudaGetDeviceCountPtr)(&cuda_count);
1091  if (cudaSuccess != cuerr) {
1092  strcpy(_nvml_vector.cmp_info.disabled_reason, "Unable to get a device count from CUDA.");
1093  _papi_nvml_shutdown_component(); // clean up any open dynLibs, mallocs, etc.
1094  return PAPI_ENOSUPP;
1095  }
1096 
1097  /* We can probably recover from this, when we're clever */
1098  if ((cuda_count > 0) && (nvml_count != (unsigned int)cuda_count)) {
1099  strcpy(_nvml_vector.cmp_info.disabled_reason, "CUDA and the NVIDIA managament library have different device counts.");
1100  _papi_nvml_shutdown_component(); // clean up any open dynLibs, mallocs, etc.
1101  return PAPI_ENOSUPP;
1102  }
1103 
1104  device_count = cuda_count;
1105  SUBDBG("Need to setup NVML with %d devices\n", device_count);
1106 
1107  /* A per device representation of what events are present */
1108  features = (int*)papi_malloc(sizeof(int) * device_count);
1109 
1110  /* Handles to each device */
1111  devices = (nvmlDevice_t*)papi_malloc(sizeof(nvmlDevice_t) * device_count);
1112 
1113  /* For each device, store the intial power value to enable reset if power is altered */
1114  power_management_initial_limit = (unsigned int*)papi_malloc(sizeof(unsigned int) * device_count);
1115  power_management_limit_constraint_min = (unsigned int*)papi_malloc(sizeof(unsigned int) * device_count);
1116  power_management_limit_constraint_max = (unsigned int*)papi_malloc(sizeof(unsigned int) * device_count);
1117 
1118  /* Figure out what events are supported on each card. */
1119  if ((papi_errorcode = detectDevices()) != PAPI_OK) {
1121  papi_free(devices);
1122  sprintf(_nvml_vector.cmp_info.disabled_reason, "An error occured in device feature detection, please check your NVIDIA Management Library and CUDA install.");
1123  _papi_nvml_shutdown_component(); // clean up any open dynLibs, mallocs, etc.
1124  return PAPI_ENOSUPP;
1125  }
1126 
1127  /* The assumption is that if everything went swimmingly in detectDevices,
1128  all nvml calls here should be fine. */
1130 
1131  /* Export the total number of events available */
1133 
1134  /* Export the component id */
1136 
1137  /* Export the number of 'counters' */
1140 
1141  return PAPI_OK;
1142 }
#define PAPI_OK
Definition: fpapi.h:105
char disabled_reason[PAPI_MAX_STR_LEN]
Definition: papi.h:637
static unsigned int * power_management_limit_constraint_min
Definition: linux-nvml.c:166
static unsigned int * power_management_initial_limit
Definition: linux-nvml.c:165
static int linkCudaLibraries()
Definition: linux-nvml.c:1151
#define papi_free(a)
Definition: papi_memory.h:35
#define PAPI_ENOSUPP
Definition: fpapi.h:123
#define papi_malloc(a)
Definition: papi_memory.h:34
static int num_events
Definition: linux-nvml.c:161
PAPI_component_info_t cmp_info
Definition: papi_vector.h:20
int _papi_nvml_shutdown_component()
Definition: linux-nvml.c:1020
papi_vector_t _nvml_vector
Definition: linux-nvml.c:1637
static int cidx
static nvmlDevice_t * devices
Definition: linux-nvml.c:163
static int device_count
Definition: linux-nvml.c:158
static unsigned int * power_management_limit_constraint_max
Definition: linux-nvml.c:167
#define SUBDBG(format, args...)
Definition: papi_debug.h:63
static int * features
Definition: linux-nvml.c:164
long long ret
Definition: iozone.c:1346
static int detectDevices()
Definition: linux-nvml.c:592
static void createNativeEvents()
Definition: linux-nvml.c:751
Here is the call graph for this function:

◆ _papi_nvml_init_control_state()

int _papi_nvml_init_control_state ( hwd_control_state_t *  ctl)

Setup a counter control state. In general a control state holds the hardware info for an EventSet.

Definition at line 1311 of file linux-nvml.c.

1312 {
1313  SUBDBG("nvml_init_control_state... %p\n", ctl);
1314  nvml_control_state_t *nvml_ctl = (nvml_control_state_t *) ctl;
1315  memset(nvml_ctl, 0, sizeof(nvml_control_state_t));
1316 
1317  return PAPI_OK;
1318 }
#define PAPI_OK
Definition: fpapi.h:105
nvml_control_state_t
Definition: linux-nvml.c:147
#define SUBDBG(format, args...)
Definition: papi_debug.h:63

◆ _papi_nvml_init_thread()

int _papi_nvml_init_thread ( hwd_context_t *  ctx)

This is called whenever a thread is initialized

Definition at line 582 of file linux-nvml.c.

583 {
584  (void) ctx;
585 
586  SUBDBG("Enter: ctx: %p\n", ctx);
587 
588  return PAPI_OK;
589 }
#define PAPI_OK
Definition: fpapi.h:105
#define SUBDBG(format, args...)
Definition: papi_debug.h:63

◆ _papi_nvml_ntv_code_to_descr()

int _papi_nvml_ntv_code_to_descr ( unsigned int  EventCode,
char *  descr,
int  len 
)

Takes a native event code and passes back the event description

Parameters
EventCode: is the native event code
descr: is a pointer for the description to be copied to
len: is the size of the descr string

Definition at line 1598 of file linux-nvml.c.

1599 {
1600  int index;
1601  index = EventCode;
1602 
1603  if (index >= num_events) return PAPI_ENOEVNT;
1604 
1605  strncpy(descr, nvml_native_table[index].description, len);
1606 
1607  return PAPI_OK;
1608 }
#define PAPI_OK
Definition: fpapi.h:105
static int num_events
Definition: linux-nvml.c:161
static nvml_native_event_entry_t * nvml_native_table
Definition: linux-nvml.c:155
char description[PAPI_MAX_STR_LEN]
#define PAPI_ENOEVNT
Definition: fpapi.h:112

◆ _papi_nvml_ntv_code_to_info()

int _papi_nvml_ntv_code_to_info ( unsigned int  EventCode,
PAPI_event_info_t *  info 
)

Takes a native event code and passes back the event info

Parameters
EventCode: is the native event code
info: is a pointer for the info to be copied to

Definition at line 1615 of file linux-nvml.c.

1616 {
1617 
1618  int index = EventCode;
1619 
1620  if ((index < 0) || (index >= num_events)) return PAPI_ENOEVNT;
1621 
1622  strncpy(info->symbol, nvml_native_table[index].name, sizeof(info->symbol) - 1);
1623  info->symbol[sizeof(info->symbol) - 1] = '\0';
1624 
1625  strncpy(info->units, nvml_native_table[index].units, sizeof(info->units) - 1);
1626  info->units[sizeof(info->units) - 1] = '\0';
1627 
1628  strncpy(info->long_descr, nvml_native_table[index].description, sizeof(info->long_descr) - 1);
1629  info->long_descr[sizeof(info->long_descr) - 1] = '\0';
1630 
1631 // info->data_type = nvml_native_table[index].return_type;
1632 
1633  return PAPI_OK;
1634 }
#define PAPI_OK
Definition: fpapi.h:105
char units[PAPI_MIN_STR_LEN]
Definition: papi.h:976
static int num_events
Definition: linux-nvml.c:161
char long_descr[PAPI_HUGE_STR_LEN]
Definition: papi.h:970
char name[PAPI_MAX_STR_LEN]
Definition: linux-nvml.h:50
static nvml_native_event_entry_t * nvml_native_table
Definition: linux-nvml.c:155
char symbol[PAPI_HUGE_STR_LEN]
Definition: papi.h:967
char units[PAPI_MIN_STR_LEN]
Definition: linux-nvml.h:51
char description[PAPI_MAX_STR_LEN]
Definition: linux-nvml.h:52
#define PAPI_ENOEVNT
Definition: fpapi.h:112

◆ _papi_nvml_ntv_code_to_name()

int _papi_nvml_ntv_code_to_name ( unsigned int  EventCode,
char *  name,
int  len 
)

Takes a native event code and passes back the name

Parameters
EventCode: is the native event code
name: is a pointer for the name to be copied to
len: is the size of the name string

Definition at line 1577 of file linux-nvml.c.

1578 {
1579  SUBDBG("Entry: EventCode: %#x, name: %s, len: %d\n", EventCode, name, len);
1580  int index;
1581 
1582  index = EventCode;
1583 
1584  /* Make sure we are in range */
1585  if (index >= num_events) return PAPI_ENOEVNT;
1586 
1587  strncpy(name, nvml_native_table[index].name, len);
1588 
1589  return PAPI_OK;
1590 }
#define PAPI_OK
Definition: fpapi.h:105
static const char * name
Definition: fork_overflow.c:31
static int num_events
Definition: linux-nvml.c:161
static nvml_native_event_entry_t * nvml_native_table
Definition: linux-nvml.c:155
#define SUBDBG(format, args...)
Definition: papi_debug.h:63
#define PAPI_ENOEVNT
Definition: fpapi.h:112

◆ _papi_nvml_ntv_enum_events()

int _papi_nvml_ntv_enum_events ( unsigned int *  EventCode,
int  modifier 
)

Enumerate Native Events

Parameters
EventCode: is the event of interest
modifier: is one of PAPI_ENUM_FIRST, PAPI_ENUM_EVENTS. If your component has attribute masks then these need to be handled here as well.

Definition at line 1536 of file linux-nvml.c.

1537 {
1538  int index;
1539 
1540  switch (modifier) {
1541 
1542  /* return EventCode of first event */
1543  case PAPI_ENUM_FIRST:
1544  /* return the first event that we support */
1545 
1546  *EventCode = 0;
1547  return PAPI_OK;
1548 
1549  /* return EventCode of next available event */
1550  case PAPI_ENUM_EVENTS:
1551  index = *EventCode;
1552 
1553  /* Make sure we are in range */
1554  if (index < num_events - 1) {
1555 
1556  /* This assumes a non-sparse mapping of the events */
1557  *EventCode = *EventCode + 1;
1558  return PAPI_OK;
1559  } else {
1560  return PAPI_ENOEVNT;
1561  }
1562  break;
1563 
1564  default:
1565  return PAPI_EINVAL;
1566  }
1567 
1568  return PAPI_EINVAL;
1569 }
#define PAPI_OK
Definition: fpapi.h:105
#define PAPI_EINVAL
Definition: fpapi.h:106
static int num_events
Definition: linux-nvml.c:161
#define PAPI_ENOEVNT
Definition: fpapi.h:112

◆ _papi_nvml_read()

int _papi_nvml_read ( hwd_context_t *  ctx,
hwd_control_state_t *  ctl,
long long **  events,
int  flags 
)

Triggered by PAPI_read()

Definition at line 1389 of file linux-nvml.c.

1391 {
1392  SUBDBG("Enter: ctx: %p, flags: %d\n", ctx, flags);
1393 
1394  (void) ctx;
1395  (void) flags;
1396  int i;
1397  int ret;
1398  nvml_control_state_t* nvml_ctl = (nvml_control_state_t*) ctl;
1399 
1400  for (i = 0; i < nvml_ctl->num_events; i++) {
1401  if (PAPI_OK !=
1402  (ret = nvml_hardware_read(&nvml_ctl->counter[i],
1403  nvml_ctl->which_counter[i])))
1404  return ret;
1405 
1406  }
1407  /* return pointer to the values we read */
1408  *events = nvml_ctl->counter;
1409  return PAPI_OK;
1410 }
#define PAPI_OK
Definition: fpapi.h:105
char events[MAX_EVENTS][BUFSIZ]
nvml_control_state_t
Definition: linux-nvml.c:147
#define SUBDBG(format, args...)
Definition: papi_debug.h:63
long long ret
Definition: iozone.c:1346
int i
Definition: fileop.c:140
static int nvml_hardware_read(long long *value, int which_one)
Definition: linux-nvml.c:449
Here is the call graph for this function:

◆ _papi_nvml_reset()

int _papi_nvml_reset ( hwd_context_t *  ctx,
hwd_control_state_t *  ctl 
)

Triggered by PAPI_reset() but only if the EventSet is currently running

Definition at line 1441 of file linux-nvml.c.

1442 {
1443  SUBDBG("Enter: ctx: %p, ctl: %p\n", ctx, ctl);
1444 
1445  (void) ctx;
1446  (void) ctl;
1447 
1448  /* Reset the hardware */
1450 
1451  return PAPI_OK;
1452 }
#define PAPI_OK
Definition: fpapi.h:105
#define SUBDBG(format, args...)
Definition: papi_debug.h:63
static void nvml_hardware_reset()
Definition: linux-nvml.c:416
Here is the call graph for this function:

◆ _papi_nvml_set_domain()

int _papi_nvml_set_domain ( hwd_control_state_t *  cntrl,
int  domain 
)

This function has to set the bits needed to count different domains, in particular PAPI_DOM_USER, PAPI_DOM_KERNEL and PAPI_DOM_OTHER. By default it returns PAPI_EINVAL if none of those are specified and PAPI_OK on success. PAPI_DOM_USER: only the user context is counted. PAPI_DOM_KERNEL: only the kernel/OS context is counted. PAPI_DOM_OTHER: exception/transient mode (like user TLB misses). PAPI_DOM_ALL: all of the domains.

Definition at line 1495 of file linux-nvml.c.

1496 {
1497  SUBDBG("Enter: cntrl: %p, domain: %d\n", cntrl, domain);
1498 
1499  (void) cntrl;
1500 
1501  int found = 0;
1502 
1503  if (PAPI_DOM_USER & domain) {
1504  SUBDBG(" PAPI_DOM_USER \n");
1505  found = 1;
1506  }
1507  if (PAPI_DOM_KERNEL & domain) {
1508  SUBDBG(" PAPI_DOM_KERNEL \n");
1509  found = 1;
1510  }
1511  if (PAPI_DOM_OTHER & domain) {
1512  SUBDBG(" PAPI_DOM_OTHER \n");
1513  found = 1;
1514  }
1515  if (PAPI_DOM_ALL & domain) {
1516  SUBDBG(" PAPI_DOM_ALL \n");
1517  found = 1;
1518  }
1519  if (!found)
1520  return (PAPI_EINVAL);
1521 
1522  return PAPI_OK;
1523 }
#define PAPI_OK
Definition: fpapi.h:105
#define PAPI_DOM_KERNEL
Definition: fpapi.h:22
#define PAPI_EINVAL
Definition: fpapi.h:106
#define PAPI_DOM_OTHER
Definition: fpapi.h:23
#define PAPI_DOM_USER
Definition: fpapi.h:21
#define SUBDBG(format, args...)
Definition: papi_debug.h:63
#define PAPI_DOM_ALL
Definition: fpapi.h:25

◆ _papi_nvml_shutdown_component()

int _papi_nvml_shutdown_component ( )

Definition at line 1020 of file linux-nvml.c.

1021 {
1022  SUBDBG("Enter:\n");
1025  if (devices != NULL) papi_free(devices);
1026  if (features != NULL) papi_free(features);
1030  if (nvmlShutdownPtr) (*nvmlShutdownPtr)(); // Call nvml shutdown if we got that far.
1031 
1032  device_count = 0;
1033  num_events = 0;
1034 
1035  // close the dynamic libraries needed by this component (opened in the init component call)
1036  if (dl3) {dlclose(dl3); dl3=NULL;}
1037  if (dl2) {dlclose(dl2); dl2=NULL;}
1038  if (dl1) {dlclose(dl1); dl1=NULL;}
1039 
1040  return PAPI_OK;
1041 }
#define PAPI_OK
Definition: fpapi.h:105
static unsigned int * power_management_limit_constraint_min
Definition: linux-nvml.c:166
static unsigned int * power_management_initial_limit
Definition: linux-nvml.c:165
#define papi_free(a)
Definition: papi_memory.h:35
static int num_events
Definition: linux-nvml.c:161
static nvml_native_event_entry_t * nvml_native_table
Definition: linux-nvml.c:155
static void * dl1
Definition: linux-cuda.c:110
static void * dl2
Definition: linux-cuda.c:111
static nvmlDevice_t * devices
Definition: linux-nvml.c:163
static int device_count
Definition: linux-nvml.c:158
static unsigned int * power_management_limit_constraint_max
Definition: linux-nvml.c:167
#define SUBDBG(format, args...)
Definition: papi_debug.h:63
static int * features
Definition: linux-nvml.c:164
static void * dl3
Definition: linux-cuda.c:112
nvmlReturn_t(* nvmlShutdownPtr)(void)
Definition: benchSANVML.c:84
static void nvml_hardware_reset()
Definition: linux-nvml.c:416
Here is the call graph for this function:
Here is the caller graph for this function:

◆ _papi_nvml_shutdown_thread()

int _papi_nvml_shutdown_thread ( hwd_context_t *  ctx)

Called at thread shutdown

Definition at line 1456 of file linux-nvml.c.

1457 {
1458  SUBDBG("Enter: ctx: %p\n", ctx);
1459 
1460  (void) ctx;
1461 
1462  /* Last chance to clean up thread */
1463 
1464  return PAPI_OK;
1465 }
#define PAPI_OK
Definition: fpapi.h:105
#define SUBDBG(format, args...)
Definition: papi_debug.h:63

◆ _papi_nvml_start()

int _papi_nvml_start ( hwd_context_t *  ctx,
hwd_control_state_t *  ctl 
)

Triggered by PAPI_start()

Definition at line 1348 of file linux-nvml.c.

1349 {
1350  SUBDBG("Enter: ctx: %p, ctl: %p\n", ctx, ctl);
1351 
1352  (void) ctx;
1353  (void) ctl;
1354 
1355  /* anything that would need to be set at counter start time */
1356 
1357  /* reset */
1358  /* start the counting */
1359 
1360  return PAPI_OK;
1361 }
#define PAPI_OK
Definition: fpapi.h:105
#define SUBDBG(format, args...)
Definition: papi_debug.h:63

◆ _papi_nvml_stop()

int _papi_nvml_stop ( hwd_context_t *  ctx,
hwd_control_state_t *  ctl 
)

Triggered by PAPI_stop()

Definition at line 1365 of file linux-nvml.c.

1366 {
1367  SUBDBG("Enter: ctx: %p, ctl: %p\n", ctx, ctl);
1368 
1369  int i;
1370  (void) ctx;
1371  (void) ctl;
1372  int ret;
1373 
1374  nvml_control_state_t* nvml_ctl = (nvml_control_state_t*) ctl;
1375 
1376  for (i = 0; i < nvml_ctl->num_events; i++) {
1377  if (PAPI_OK !=
1378  (ret = nvml_hardware_read(&nvml_ctl->counter[i],
1379  nvml_ctl->which_counter[i])))
1380  return ret;
1381 
1382  }
1383 
1384  return PAPI_OK;
1385 }
#define PAPI_OK
Definition: fpapi.h:105
nvml_control_state_t
Definition: linux-nvml.c:147
#define SUBDBG(format, args...)
Definition: papi_debug.h:63
long long ret
Definition: iozone.c:1346
int i
Definition: fileop.c:140
static int nvml_hardware_read(long long *value, int which_one)
Definition: linux-nvml.c:449
Here is the call graph for this function:

◆ _papi_nvml_update_control_state()

int _papi_nvml_update_control_state ( hwd_control_state_t *  ctl,
NativeInfo_t *  native,
int  count,
hwd_context_t *  ctx 
)

Triggered by eventset operations like add or remove

Definition at line 1322 of file linux-nvml.c.

1326 {
1327  SUBDBG("Enter: ctl: %p, ctx: %p\n", ctl, ctx);
1328  int i, index;
1329 
1330  nvml_control_state_t *nvml_ctl = (nvml_control_state_t *) ctl;
1331  (void) ctx;
1332 
1333  /* if no events, return */
1334  if (count == 0) return PAPI_OK;
1335 
1336  for (i = 0; i < count; i++) {
1337  index = native[i].ni_event;
1338  nvml_ctl->which_counter[i] = index;
1339  /* We have no constraints on event position, so any event */
1340  /* can be in any slot. */
1341  native[i].ni_position = i;
1342  }
1343  nvml_ctl->num_events = count;
1344  return PAPI_OK;
1345 }
#define PAPI_OK
Definition: fpapi.h:105
nvml_control_state_t
Definition: linux-nvml.c:147
static int native
#define SUBDBG(format, args...)
Definition: papi_debug.h:63
static long count
int i
Definition: fileop.c:140

◆ _papi_nvml_write()

int _papi_nvml_write ( hwd_context_t *  ctx,
hwd_control_state_t *  ctl,
long long *  events 
)

Triggered by PAPI_write(), but only if the counters are running

Definition at line 1415 of file linux-nvml.c.

1416 {
1417  SUBDBG("Enter: ctx: %p, ctl: %p\n", ctx, ctl);
1418  (void) ctx;
1419  nvml_control_state_t* nvml_ctl = (nvml_control_state_t*) ctl;
1420  int i;
1421  int ret;
1422 
1423  /* You can change ECC mode and compute exclusivity modes on the cards */
1424  /* But I don't see this as a function of a PAPI component at this time */
1425  /* All implementation issues aside. */
1426 
1427  // Currently POWER_MANAGEMENT can be written
1428  for (i = 0; i < nvml_ctl->num_events; i++) {
1429  if (PAPI_OK != (ret = nvml_hardware_write(&events[i], nvml_ctl->which_counter[i])))
1430  return ret;
1431  }
1432 
1433  /* return pointer to the values we read */
1434  return PAPI_OK;
1435 }
#define PAPI_OK
Definition: fpapi.h:105
static int nvml_hardware_write(long long *value, int which_one)
Definition: linux-nvml.c:531
char events[MAX_EVENTS][BUFSIZ]
nvml_control_state_t
Definition: linux-nvml.c:147
#define SUBDBG(format, args...)
Definition: papi_debug.h:63
long long ret
Definition: iozone.c:1346
int i
Definition: fileop.c:140
Here is the call graph for this function:

◆ createNativeEvents()

static void createNativeEvents ( void  )
static

Definition at line 751 of file linux-nvml.c.

752 {
753  char name[64];
754  char sanitized_name[PAPI_MAX_STR_LEN];
755  char names[device_count][64];
756 
757  int i, nameLen = 0, j;
758 
760  nvmlReturn_t ret;
761 
765  entry = &nvml_native_table[0];
766 
767  for (i = 0; i < device_count; i++) {
768  memset(names[i], 0x0, 64);
769  ret = (*nvmlDeviceGetNamePtr)(devices[i], name, sizeof(name) - 1);
770  if (NVML_SUCCESS != ret) {
771  SUBDBG("nvmlDeviceGetName failed \n");
772  strncpy(name, "deviceNameUnknown", 17);
773  }
774  name[sizeof(name) - 1] = '\0'; // to safely use strlen operation below, the variable 'name' must be null terminated
775 
776  nameLen = strlen(name);
777  strncpy(sanitized_name, name, PAPI_MAX_STR_LEN);
778 
779  int retval = snprintf(sanitized_name, sizeof(name), "%s:device_%d", name, i);
780  if (retval > (int)sizeof(name)) {
781  SUBDBG("Device name is too long %s:device%d", name, i);
782  return;
783  }
784  sanitized_name[sizeof(name) - 1] = '\0';
785 
786  for (j = 0; j < nameLen; j++)
787  if (' ' == sanitized_name[j])
788  sanitized_name[j] = '_';
789 
791  sprintf(entry->name, "%s:graphics_clock", sanitized_name);
792  strncpy(entry->description, "Graphics clock domain (MHz).", PAPI_MAX_STR_LEN);
793  entry->options.clock = NVML_CLOCK_GRAPHICS;
794  entry->type = FEATURE_CLOCK_INFO;
795  entry++;
796 
797  sprintf(entry->name, "%s:sm_clock", sanitized_name);
798  strncpy(entry->description, "SM clock domain (MHz).", PAPI_MAX_STR_LEN);
799  entry->options.clock = NVML_CLOCK_SM;
800  entry->type = FEATURE_CLOCK_INFO;
801  entry++;
802 
803  sprintf(entry->name, "%s:memory_clock", sanitized_name);
804  strncpy(entry->description, "Memory clock domain (MHz).", PAPI_MAX_STR_LEN);
805  entry->options.clock = NVML_CLOCK_MEM;
806  entry->type = FEATURE_CLOCK_INFO;
807  entry++;
808  }
809 
811  sprintf(entry->name, "%s:l1_single_ecc_errors", sanitized_name);
812  strncpy(entry->description, "L1 cache single bit ECC", PAPI_MAX_STR_LEN);
813  entry->options.ecc_opts = (struct local_ecc) {
814  .bits = NVML_SINGLE_BIT_ECC,
815  .which_one = LOCAL_ECC_L1,
816  };
818  entry++;
819 
820  sprintf(entry->name, "%s:l2_single_ecc_errors", sanitized_name);
821  strncpy(entry->description, "L2 cache single bit ECC", PAPI_MAX_STR_LEN);
822  entry->options.ecc_opts = (struct local_ecc) {
823  .bits = NVML_SINGLE_BIT_ECC,
824  .which_one = LOCAL_ECC_L2,
825  };
827  entry++;
828 
829  sprintf(entry->name, "%s:memory_single_ecc_errors", sanitized_name);
830  strncpy(entry->description, "Device memory single bit ECC", PAPI_MAX_STR_LEN);
831  entry->options.ecc_opts = (struct local_ecc) {
832  .bits = NVML_SINGLE_BIT_ECC,
833  .which_one = LOCAL_ECC_MEM,
834  };
836  entry++;
837 
838  sprintf(entry->name, "%s:regfile_single_ecc_errors", sanitized_name);
839  strncpy(entry->description, "Register file single bit ECC", PAPI_MAX_STR_LEN);
840  entry->options.ecc_opts = (struct local_ecc) {
841  .bits = NVML_SINGLE_BIT_ECC,
842  .which_one = LOCAL_ECC_REGFILE,
843  };
845  entry++;
846 
847  sprintf(entry->name, "%s:1l_double_ecc_errors", sanitized_name);
848  strncpy(entry->description, "L1 cache double bit ECC", PAPI_MAX_STR_LEN);
849  entry->options.ecc_opts = (struct local_ecc) {
850  .bits = NVML_DOUBLE_BIT_ECC,
851  .which_one = LOCAL_ECC_L1,
852  };
854  entry++;
855 
856  sprintf(entry->name, "%s:l2_double_ecc_errors", sanitized_name);
857  strncpy(entry->description, "L2 cache double bit ECC", PAPI_MAX_STR_LEN);
858  entry->options.ecc_opts = (struct local_ecc) {
859  .bits = NVML_DOUBLE_BIT_ECC,
860  .which_one = LOCAL_ECC_L2,
861  };
863  entry++;
864 
865  sprintf(entry->name, "%s:memory_double_ecc_errors", sanitized_name);
866  strncpy(entry->description, "Device memory double bit ECC", PAPI_MAX_STR_LEN);
867  entry->options.ecc_opts = (struct local_ecc) {
868  .bits = NVML_DOUBLE_BIT_ECC,
869  .which_one = LOCAL_ECC_MEM,
870  };
872  entry++;
873 
874  sprintf(entry->name, "%s:regfile_double_ecc_errors", sanitized_name);
875  strncpy(entry->description, "Register file double bit ECC", PAPI_MAX_STR_LEN);
876  entry->options.ecc_opts = (struct local_ecc) {
877  .bits = NVML_DOUBLE_BIT_ECC,
878  .which_one = LOCAL_ECC_REGFILE,
879  };
881  entry++;
882  }
883 
885  sprintf(entry->name, "%s:fan_speed", sanitized_name);
886  strncpy(entry->description, "The fan speed expressed as a percent of the maximum, i.e. full speed is 100%", PAPI_MAX_STR_LEN);
887  entry->type = FEATURE_FAN_SPEED;
888  entry++;
889  }
890 
892  sprintf(entry->name, "%s:graphics_max_clock", sanitized_name);
893  strncpy(entry->description, "Maximal Graphics clock domain (MHz).", PAPI_MAX_STR_LEN);
894  entry->options.clock = NVML_CLOCK_GRAPHICS;
895  entry->type = FEATURE_MAX_CLOCK;
896  entry++;
897 
898  sprintf(entry->name, "%s:sm_max_clock", sanitized_name);
899  strncpy(entry->description, "Maximal SM clock domain (MHz).", PAPI_MAX_STR_LEN);
900  entry->options.clock = NVML_CLOCK_SM;
901  entry->type = FEATURE_MAX_CLOCK;
902  entry++;
903 
904  sprintf(entry->name, "%s:memory_max_clock", sanitized_name);
905  strncpy(entry->description, "Maximal Memory clock domain (MHz).", PAPI_MAX_STR_LEN);
906  entry->options.clock = NVML_CLOCK_MEM;
907  entry->type = FEATURE_MAX_CLOCK;
908  entry++;
909  }
910 
912  sprintf(entry->name, "%s:total_memory", sanitized_name);
913  strncpy(entry->description, "Total installed FB memory (in bytes).", PAPI_MAX_STR_LEN);
915  entry->type = FEATURE_MEMORY_INFO;
916  entry++;
917 
918  sprintf(entry->name, "%s:unallocated_memory", sanitized_name);
919  strncpy(entry->description, "Uncallocated FB memory (in bytes).", PAPI_MAX_STR_LEN);
921  entry->type = FEATURE_MEMORY_INFO;
922  entry++;
923 
924  sprintf(entry->name, "%s:allocated_memory", sanitized_name);
925  strncpy(entry->description, "Allocated FB memory (in bytes). Note that the driver/GPU always sets aside a small amount of memory for bookkeeping.", PAPI_MAX_STR_LEN);
927  entry->type = FEATURE_MEMORY_INFO;
928  entry++;
929  }
930 
932  sprintf(entry->name, "%s:pstate", sanitized_name);
933  strncpy(entry->description, "The performance state of the device.", PAPI_MAX_STR_LEN);
934  entry->type = FEATURE_PERF_STATES;
935  entry++;
936  }
937 
939  sprintf(entry->name, "%s:power", sanitized_name);
940  // set the power event units value to "mW" for miliwatts
941  strncpy(entry->units, "mW", PAPI_MIN_STR_LEN);
942  strncpy(entry->description, "Power usage reading for the device, in miliwatts. This is the power draw (+/-5 watts) for the entire board: GPU, memory, etc.", PAPI_MAX_STR_LEN);
943  entry->type = FEATURE_POWER;
944  entry++;
945  }
946 
948  sprintf(entry->name, "%s:temperature", sanitized_name);
949  strncpy(entry->description, "Current temperature readings for the device, in degrees C.", PAPI_MAX_STR_LEN);
950  entry->type = FEATURE_TEMP;
951  entry++;
952  }
953 
955  sprintf(entry->name, "%s:total_ecc_errors", sanitized_name);
956  strncpy(entry->description, "Total single bit errors.", PAPI_MAX_STR_LEN);
957  entry->options.ecc_opts = (struct local_ecc) {
958  .bits = NVML_SINGLE_BIT_ECC,
959  };
961  entry++;
962 
963  sprintf(entry->name, "%s:total_ecc_errors", sanitized_name);
964  strncpy(entry->description, "Total double bit errors.", PAPI_MAX_STR_LEN);
965  entry->options.ecc_opts = (struct local_ecc) {
966  .bits = NVML_DOUBLE_BIT_ECC,
967  };
969  entry++;
970  }
971 
973  sprintf(entry->name, "%s:gpu_utilization", sanitized_name);
974  strncpy(entry->description, "Percent of time over the past second during which one or more kernels was executing on the GPU.", PAPI_MAX_STR_LEN);
976  entry->type = FEATURE_UTILIZATION;
977  entry++;
978 
979  sprintf(entry->name, "%s:memory_utilization", sanitized_name);
980  strncpy(entry->description, "Percent of time over the past second during which global (device) memory was being read or written.", PAPI_MAX_STR_LEN);
982  entry->type = FEATURE_UTILIZATION;
983  entry++;
984  }
985 
987  sprintf(entry->name, "%s:power_management_limit", sanitized_name);
988  // set the power event units value to "mW" for milliwatts
989  strncpy(entry->units, "mW", PAPI_MIN_STR_LEN);
990  strncpy(entry->description, "Power management limit in milliwatts associated with the device. The power limit defines the upper boundary for the cards power draw. If the cards total power draw reaches this limit the power management algorithm kicks in. This should be writable (with appropriate privileges) on supported Kepler or later (unit milliWatts). ", PAPI_MAX_STR_LEN);
992  entry++;
993  }
995  sprintf(entry->name, "%s:power_management_limit_constraint_min", sanitized_name);
996  strncpy(entry->units, "mW", PAPI_MIN_STR_LEN);
997  strncpy(entry->description, "The minimum power management limit in milliwatts.", PAPI_MAX_STR_LEN);
999  entry++;
1000  }
1001 
1003  sprintf(entry->name, "%s:power_management_limit_constraint_max", sanitized_name);
1004  strncpy(entry->units, "mW", PAPI_MIN_STR_LEN);
1005  strncpy(entry->description, "The maximum power management limit in milliwatts.", PAPI_MAX_STR_LEN);
1007  entry++;
1008  }
1009 
1010  strncpy(names[i], name, sizeof(names[0]) - 1);
1011  names[i][sizeof(names[0]) - 1] = '\0';
1012  }
1013 } // create native events.
static const char * name
Definition: fork_overflow.c:31
#define FEATURE_NVML_POWER_MANAGEMENT_LIMIT_CONSTRAINT_MIN
Definition: linux-nvml.h:17
#define FEATURE_ECC_LOCAL_ERRORS
Definition: linux-nvml.h:7
int type
Definition: linux-nvml.h:53
#define papi_malloc(a)
Definition: papi_memory.h:34
#define MEMINFO_TOTAL_MEMORY
Definition: linux-nvml.h:22
#define FEATURE_ECC_TOTAL_ERRORS
Definition: linux-nvml.h:14
static int num_events
Definition: linux-nvml.c:161
struct local_ecc ecc_opts
Definition: linux-nvml.h:44
#define FEATURE_FAN_SPEED
Definition: linux-nvml.h:8
nvmlEccBitType_t bits
Definition: linux-nvml.h:38
#define LOCAL_ECC_MEM
Definition: linux-nvml.h:29
int retval
Definition: zero_fork.c:53
char name[PAPI_MAX_STR_LEN]
Definition: linux-nvml.h:50
static nvml_native_event_entry_t * nvml_native_table
Definition: linux-nvml.c:155
#define MEMINFO_ALLOCED
Definition: linux-nvml.h:24
#define FEATURE_UTILIZATION
Definition: linux-nvml.h:15
#define FEATURE_CLOCK_INFO
Definition: linux-nvml.h:6
nvml_resource_options_t options
Definition: linux-nvml.h:49
static nvmlDevice_t * devices
Definition: linux-nvml.c:163
#define FEATURE_MAX_CLOCK
Definition: linux-nvml.h:9
Definition: linux-nvml.h:48
static int device_count
Definition: linux-nvml.c:158
#define MEMORY_UTILIZATION
Definition: linux-nvml.h:32
#define FEATURE_PERF_STATES
Definition: linux-nvml.h:11
#define PAPI_MIN_STR_LEN
Definition: fpapi.h:41
nvmlClockType_t clock
Definition: linux-nvml.h:43
#define FEATURE_TEMP
Definition: linux-nvml.h:13
#define FEATURE_NVML_POWER_MANAGEMENT_LIMIT_CONSTRAINT_MAX
Definition: linux-nvml.h:18
#define SUBDBG(format, args...)
Definition: papi_debug.h:63
static int * features
Definition: linux-nvml.c:164
long long ret
Definition: iozone.c:1346
#define FEATURE_POWER
Definition: linux-nvml.h:12
char units[PAPI_MIN_STR_LEN]
Definition: linux-nvml.h:51
char description[PAPI_MAX_STR_LEN]
Definition: linux-nvml.h:52
#define LOCAL_ECC_L1
Definition: linux-nvml.h:27
#define HAS_FEATURE(features, query)
Definition: linux-nvml.h:20
#define GPU_UTILIZATION
Definition: linux-nvml.h:31
#define FEATURE_POWER_MANAGEMENT
Definition: linux-nvml.h:16
#define LOCAL_ECC_REGFILE
Definition: linux-nvml.h:26
#define FEATURE_MEMORY_INFO
Definition: linux-nvml.h:10
#define LOCAL_ECC_L2
Definition: linux-nvml.h:28
const char * names[NUM_EVENTS]
#define MEMINFO_UNALLOCED
Definition: linux-nvml.h:23
int i
Definition: fileop.c:140
#define PAPI_MAX_STR_LEN
Definition: fpapi.h:43
Here is the caller graph for this function:

◆ detectDevices()

static int detectDevices ( )
static

Definition at line 592 of file linux-nvml.c.

/*
 * detectDevices - probe each of the device_count NVML devices and count the
 * events they can provide.
 *
 * For every index i the loop stores the device handle in devices[i], reads the
 * device name and the ECC inforom version, and then tries each query in turn
 * (clock, ECC mode, fan speed, max clock, performance state, power usage,
 * temperature, power management limit and its constraints, utilization).
 * Every query that works adds to the file-level num_events counter; the memory
 * info events (total, free, used) are added unconditionally.
 *
 * Returns PAPI_ESYS if a device handle cannot be obtained, PAPI_EINVAL if the
 * "<name>:device:<i>" label does not fit in the 64-byte slot, PAPI_OK otherwise.
 *
 * NOTE(review): this listing has no source lines 636, 648, 652, 662, 668, 673,
 * 678, 689, 708, 711, 721, 722, 724, 725, 728, 729 and 736. They presumably
 * hold the remaining features[i] |= FEATURE_* updates (only the FEATURE_TEMP
 * one at 697 is present) and the constraint min/max bookkeeping - confirm
 * against linux-nvml.c before relying on this listing.
 * NOTE(review): ecc_version is assigned only when the inforom query succeeds
 * and is never reset inside the loop, so a device whose query fails is judged
 * by the value left over from the previous device.
 */
593 {
594  nvmlReturn_t ret;
595  nvmlEnableState_t mode = NVML_FEATURE_DISABLED;
596  nvmlEnableState_t pendingmode = NVML_FEATURE_DISABLED;
597 
598  char name[64];
599  char inforomECC[16];
600  char names[device_count][64];
601 
602  float ecc_version = 0.0;
603 
604  int i = 0;
605 
606  unsigned int temp = 0;
607 
608  memset(names, 0x0, device_count * 64);
609 
610  /* So for each card, check what's queryable */
611  for (i = 0; i < device_count; i++) {
612  features[i] = 0;
613 
614  ret = (*nvmlDeviceGetHandleByIndexPtr)(i, &devices[i]);
615  if (NVML_SUCCESS != ret) {
616  SUBDBG("nvmlDeviceGetHandleByIndex(%d, &devices[%d]) failed.\n", i, i);
617  return PAPI_ESYS;
618  }
619 
620  ret = (*nvmlDeviceGetNamePtr)(devices[i], name, sizeof(name) - 1);
621  if (NVML_SUCCESS != ret) {
622  SUBDBG("nvmlDeviceGetName failed \n");
     /* Bound the copy by the buffer, not by the literal's length: strncpy then
        zero-fills the tail, whereas a 17-byte copy writes no terminator and
        leaves whatever was in name[17..] attached to the fallback name. */
623  strncpy(name, "deviceNameUnknown", sizeof(name) - 1);
624  }
625 
626  name[sizeof(name) - 1] = '\0'; // to safely use strstr operation below, the variable 'name' must be null terminated
627 
628  ret = (*nvmlDeviceGetInforomVersionPtr)(devices[i], NVML_INFOROM_ECC, inforomECC, 16);
629  if (NVML_SUCCESS != ret) {
630  SUBDBG("nvmlGetInforomVersion fails %s\n", (*nvmlErrorStringPtr)(ret));
631  } else {
632  ecc_version = strtof(inforomECC, NULL);
633  }
634 
635  if (getClockSpeed(devices[i], NVML_CLOCK_GRAPHICS) != (unsigned long long) - 1) {
637  num_events += 3;
638  }
639 
640  /* For Tesla and Quadro products from Fermi and Kepler families.
641  requires NVML_INFOROM_ECC 2.0 or higher for location-based counts
642  requires NVML_INFOROM_ECC 1.0 or higher for all other ECC counts
643  requires ECC mode to be enabled. */
644  ret = (*nvmlDeviceGetEccModePtr)(devices[i], &mode, &pendingmode);
645  if (NVML_SUCCESS == ret) {
646  if (NVML_FEATURE_ENABLED == mode) {
647  if (ecc_version >= 2.0) {
649  num_events += 8; /* {single bit, two bit errors} x { reg, l1, l2, memory } */
650  }
651  if (ecc_version >= 1.0) {
653  num_events += 2; /* single bit errors, double bit errors */
654  }
655  }
656  } else {
657  SUBDBG("nvmlDeviceGetEccMode does not appear to be supported. (nvml return code %d)\n", ret);
658  }
659 
660  /* Check if fan speed is available */
661  if (getFanSpeed(devices[i]) != (unsigned long long) - 1) {
663  num_events++;
664  }
665 
666  /* Check if clock data are available */
667  if (getMaxClockSpeed(devices[i], NVML_CLOCK_GRAPHICS) != (unsigned long long) - 1) {
669  num_events += 3;
670  }
671 
672  /* For all products */
674  num_events += 3; /* total, free, used */
675 
676  /* Check if performance state is available */
677  if (getPState(devices[i]) != (unsigned long long) - 1) {
679  num_events++;
680  }
681 
682  /* For "GF11x" Tesla and Quadro products from the Fermi family
683  requires NVML_INFOROM_POWER 3.0 or higher
684  For Tesla and Quadro products from the Kepler family
685  does not require NVML_INFOROM_POWER */
686  /* Just try reading power, if it works, enable it*/
687  ret = (*nvmlDeviceGetPowerUsagePtr)(devices[i], &temp);
688  if (NVML_SUCCESS == ret) {
690  num_events++;
691  } else {
692  SUBDBG("nvmlDeviceGetPowerUsage does not appear to be supported on this card. (nvml return code %d)\n", ret);
693  }
694 
695  /* Check if temperature data are available */
696  if (getTemperature(devices[i]) != (unsigned long long) - 1) {
697  features[i] |= FEATURE_TEMP;
698  num_events++;
699  }
700 
701  // For power_management_limit
702  {
703  // Just try the call to see if it works
704  unsigned int templimit = 0;
705  ret = (*nvmlDeviceGetPowerManagementLimitPtr)(devices[i], &templimit);
706  if (ret == NVML_SUCCESS && templimit > 0) {
707  power_management_initial_limit[i] = templimit;
709  num_events += 1;
710  } else {
712  SUBDBG("nvmlDeviceGetPowerManagementLimit not appear to be supported on this card. (NVML code %d)\n", ret);
713  }
714  }
715 
716  // For power_management_limit_constraints, minimum and maximum
717  {
718  unsigned int minLimit = 0, maxLimit = 0;
719  ret = (*nvmlDeviceGetPowerManagementLimitConstraintsPtr)(devices[i], &minLimit, &maxLimit);
720  if (ret == NVML_SUCCESS) {
723  num_events += 1;
726  num_events += 1;
727  } else {
730  }
731  SUBDBG("Done nvmlDeviceGetPowerManagementLimitConstraintsPtr\n");
732  }
733 
734  /* Check if utilization data are available */
735  if (getUtilization(devices[i], GPU_UTILIZATION) != (unsigned long long) - 1) {
737  num_events += 2;
738  }
739 
740  int retval = snprintf(names[i], sizeof(name), "%s:device:%d", name, i);
     /* snprintf returns the length the full label needs (without the NUL), so
        any value >= the buffer size means it was cut short; a negative value
        is an output error. */
741  if (retval < 0 || retval >= (int)sizeof(name)) {
742  SUBDBG("Device name is too long %s:device%d", name, i);
743  return (PAPI_EINVAL);
744  }
745  names[i][sizeof(name) - 1] = '\0';
746  }
747  return PAPI_OK;
748 }
#define PAPI_OK
Definition: fpapi.h:105
unsigned long long getPState(nvmlDevice_t dev)
Definition: linux-nvml.c:265
static const char * name
Definition: fork_overflow.c:31
#define FEATURE_NVML_POWER_MANAGEMENT_LIMIT_CONSTRAINT_MIN
Definition: linux-nvml.h:17
#define PAPI_EINVAL
Definition: fpapi.h:106
static unsigned int * power_management_limit_constraint_min
Definition: linux-nvml.c:166
static unsigned int * power_management_initial_limit
Definition: linux-nvml.c:165
#define FEATURE_ECC_LOCAL_ERRORS
Definition: linux-nvml.h:7
#define FEATURE_ECC_TOTAL_ERRORS
Definition: linux-nvml.h:14
static int num_events
Definition: linux-nvml.c:161
#define FEATURE_FAN_SPEED
Definition: linux-nvml.h:8
int retval
Definition: zero_fork.c:53
#define PAPI_ESYS
Definition: fpapi.h:108
#define FEATURE_UTILIZATION
Definition: linux-nvml.h:15
#define FEATURE_CLOCK_INFO
Definition: linux-nvml.h:6
static nvmlDevice_t * devices
Definition: linux-nvml.c:163
#define FEATURE_MAX_CLOCK
Definition: linux-nvml.h:9
static int device_count
Definition: linux-nvml.c:158
#define FEATURE_PERF_STATES
Definition: linux-nvml.h:11
#define FEATURE_TEMP
Definition: linux-nvml.h:13
#define FEATURE_NVML_POWER_MANAGEMENT_LIMIT_CONSTRAINT_MAX
Definition: linux-nvml.h:18
static unsigned int * power_management_limit_constraint_max
Definition: linux-nvml.c:167
#define SUBDBG(format, args...)
Definition: papi_debug.h:63
static int * features
Definition: linux-nvml.c:164
long long ret
Definition: iozone.c:1346
#define FEATURE_POWER
Definition: linux-nvml.h:12
unsigned long long getClockSpeed(nvmlDevice_t dev, nvmlClockType_t which_one)
Definition: linux-nvml.c:170
unsigned long long getTemperature(nvmlDevice_t dev)
Definition: linux-nvml.c:349
unsigned long long getFanSpeed(nvmlDevice_t dev)
Definition: linux-nvml.c:212
char *(* nvmlErrorStringPtr)(nvmlReturn_t)
Definition: benchSANVML.c:64
#define GPU_UTILIZATION
Definition: linux-nvml.h:31
#define FEATURE_POWER_MANAGEMENT
Definition: linux-nvml.h:16
unsigned long long getUtilization(nvmlDevice_t dev, int which_one)
Definition: linux-nvml.c:380
#define FEATURE_MEMORY_INFO
Definition: linux-nvml.h:10
const char * names[NUM_EVENTS]
int i
Definition: fileop.c:140
unsigned long long getMaxClockSpeed(nvmlDevice_t dev, nvmlClockType_t which_one)
Definition: linux-nvml.c:226
Here is the call graph for this function:
Here is the caller graph for this function:

◆ getClockSpeed()

unsigned long long getClockSpeed ( nvmlDevice_t  dev,
nvmlClockType_t  which_one 
)

Definition at line 170 of file linux-nvml.c.

/*
 * getClockSpeed - ask NVML, through nvmlDeviceGetClockInfoPtr, for the clock
 * of type which_one on device dev.
 *
 * Returns the reading widened to unsigned long long, or (unsigned long long)-1
 * when the NVML call does not return NVML_SUCCESS (the NVML error string is
 * sent to SUBDBG in that case).
 */
171 {
172  unsigned int ret = 0;
173  nvmlReturn_t bad;
174  bad = (*nvmlDeviceGetClockInfoPtr)(dev, which_one, &ret);
175 
176  if (NVML_SUCCESS != bad) {
177  SUBDBG("something went wrong %s\n", (*nvmlErrorStringPtr)(bad));
178  return (unsigned long long) - 1;
179  }
180 
181  return (unsigned long long)ret;
182 }
#define SUBDBG(format, args...)
Definition: papi_debug.h:63
long long ret
Definition: iozone.c:1346
char *(* nvmlErrorStringPtr)(nvmlReturn_t)
Definition: benchSANVML.c:64
Here is the caller graph for this function:

◆ getEccLocalErrors()

unsigned long long getEccLocalErrors ( nvmlDevice_t  dev,
nvmlEccBitType_t  bits,
int  which_one 
)

Definition at line 185 of file linux-nvml.c.

/*
 * getEccLocalErrors - fetch the detailed ECC error counts of kind bits from
 * device dev (counter type NVML_VOLATILE_ECC) and return one of them.
 *
 * which_one selects the field: LOCAL_ECC_REGFILE, LOCAL_ECC_L1, LOCAL_ECC_L2
 * or LOCAL_ECC_MEM.
 *
 * Returns the selected count, or (unsigned long long)-1 when the NVML call
 * fails or which_one matches none of the four selectors.
 */
186 {
187  nvmlEccErrorCounts_t counts;
188 
189  nvmlReturn_t bad;
190  bad = (*nvmlDeviceGetDetailedEccErrorsPtr)(dev, bits, NVML_VOLATILE_ECC , &counts);
191 
192  if (NVML_SUCCESS != bad) {
193  SUBDBG("something went wrong %s\n", (*nvmlErrorStringPtr)(bad));
194  return (unsigned long long) - 1;
195  }
196  switch (which_one) {
197  case LOCAL_ECC_REGFILE:
198  return counts.registerFile;
199  case LOCAL_ECC_L1:
200  return counts.l1Cache;
201  case LOCAL_ECC_L2:
202  return counts.l2Cache;
203  case LOCAL_ECC_MEM:
204  return counts.deviceMemory;
205  default:
206  ;
207  }
     /* Unknown selector: report the same sentinel as a failed query. */
208  return (unsigned long long) - 1;
209 }
#define LOCAL_ECC_MEM
Definition: linux-nvml.h:29
#define SUBDBG(format, args...)
Definition: papi_debug.h:63
#define LOCAL_ECC_L1
Definition: linux-nvml.h:27
char *(* nvmlErrorStringPtr)(nvmlReturn_t)
Definition: benchSANVML.c:64
#define LOCAL_ECC_REGFILE
Definition: linux-nvml.h:26
#define LOCAL_ECC_L2
Definition: linux-nvml.h:28
Here is the caller graph for this function:

◆ getFanSpeed()

unsigned long long getFanSpeed ( nvmlDevice_t  dev)

Definition at line 212 of file linux-nvml.c.

/*
 * getFanSpeed - read the fan speed of device dev through
 * nvmlDeviceGetFanSpeedPtr.
 *
 * Returns the reading widened to unsigned long long, or (unsigned long long)-1
 * when the NVML call does not return NVML_SUCCESS.
 */
213 {
214  unsigned int ret = 0;
215  nvmlReturn_t bad;
216  bad = (*nvmlDeviceGetFanSpeedPtr)(dev, &ret);
217 
218  if (NVML_SUCCESS != bad) {
219  SUBDBG("something went wrong %s\n", (*nvmlErrorStringPtr)(bad));
220  return (unsigned long long) - 1;
221  }
222  return (unsigned long long)ret;
223 }
#define SUBDBG(format, args...)
Definition: papi_debug.h:63
long long ret
Definition: iozone.c:1346
char *(* nvmlErrorStringPtr)(nvmlReturn_t)
Definition: benchSANVML.c:64
Here is the caller graph for this function:

◆ getMaxClockSpeed()

unsigned long long getMaxClockSpeed ( nvmlDevice_t  dev,
nvmlClockType_t  which_one 
)

Definition at line 226 of file linux-nvml.c.

/*
 * getMaxClockSpeed - return a clock reading of type which_one for device dev.
 *
 * Returns the reading widened to unsigned long long, or (unsigned long long)-1
 * when the NVML call does not return NVML_SUCCESS.
 *
 * NOTE(review): the query goes through nvmlDeviceGetClockInfoPtr, the same
 * entry point getClockSpeed uses, so as written both functions return the same
 * reading. NVML's nvmlDeviceGetMaxClockInfo looks like the intended call, but
 * no pointer for it is loaded in linkCudaLibraries - confirm before changing.
 */
227 {
228  unsigned int ret = 0;
229  nvmlReturn_t bad;
230  bad = (*nvmlDeviceGetClockInfoPtr)(dev, which_one, &ret);
231 
232  if (NVML_SUCCESS != bad) {
233  SUBDBG("something went wrong %s\n", (*nvmlErrorStringPtr)(bad));
234  return (unsigned long long) - 1;
235  }
236  return (unsigned long long) ret;
237 }
#define SUBDBG(format, args...)
Definition: papi_debug.h:63
long long ret
Definition: iozone.c:1346
char *(* nvmlErrorStringPtr)(nvmlReturn_t)
Definition: benchSANVML.c:64
Here is the caller graph for this function:

◆ getMemoryInfo()

unsigned long long getMemoryInfo ( nvmlDevice_t  dev,
int  which_one 
)

Definition at line 240 of file linux-nvml.c.

/*
 * getMemoryInfo - fetch the memory information of device dev through
 * nvmlDeviceGetMemoryInfoPtr and return one field of it.
 *
 * which_one selects the field: MEMINFO_UNALLOCED returns meminfo.free and
 * MEMINFO_ALLOCED returns meminfo.used.
 *
 * Returns the selected field, or (unsigned long long)-1 when the NVML call
 * fails or which_one matches no selector.
 *
 * NOTE(review): source line 252 is missing from this listing; it presumably
 * carries the "case MEMINFO_TOTAL_MEMORY:" label for the meminfo.total return
 * on line 253 - confirm against linux-nvml.c.
 */
241 {
242  nvmlMemory_t meminfo;
243  nvmlReturn_t bad;
244  bad = (*nvmlDeviceGetMemoryInfoPtr)(dev, &meminfo);
245 
246  if (NVML_SUCCESS != bad) {
247  SUBDBG("something went wrong %s\n", (*nvmlErrorStringPtr)(bad));
248  return (unsigned long long) - 1;
249  }
250 
251  switch (which_one) {
253  return meminfo.total;
254  case MEMINFO_UNALLOCED:
255  return meminfo.free;
256  case MEMINFO_ALLOCED:
257  return meminfo.used;
258  default:
259  ;
260  }
     /* Unknown selector: report the same sentinel as a failed query. */
261  return (unsigned long long) - 1;
262 }
#define MEMINFO_TOTAL_MEMORY
Definition: linux-nvml.h:22
#define MEMINFO_ALLOCED
Definition: linux-nvml.h:24
#define SUBDBG(format, args...)
Definition: papi_debug.h:63
char *(* nvmlErrorStringPtr)(nvmlReturn_t)
Definition: benchSANVML.c:64
#define MEMINFO_UNALLOCED
Definition: linux-nvml.h:23
Here is the caller graph for this function:

◆ getPowerManagementLimit()

unsigned long long getPowerManagementLimit ( nvmlDevice_t  dev)

Definition at line 403 of file linux-nvml.c.

/*
 * getPowerManagementLimit - read the power management limit of device dev
 * through nvmlDeviceGetPowerManagementLimitPtr.
 *
 * Returns the limit widened to unsigned long long, or 0 when the NVML call
 * does not return NVML_SUCCESS. The event description elsewhere in this file
 * gives the unit as milliwatts.
 *
 * NOTE(review): the failure value here is 0, while the other get* helpers in
 * this file return (unsigned long long)-1. nvml_hardware_read only treats -1
 * as an error, so a failed query is reported to its caller as a successful
 * reading of 0 - confirm whether that is intended.
 */
404 {
405  unsigned int limit;
406  nvmlReturn_t rv;
407  rv = (*nvmlDeviceGetPowerManagementLimitPtr)(dev, &limit);
408  if (NVML_SUCCESS != rv) {
409  SUBDBG("something went wrong %s\n", (*nvmlErrorStringPtr)(rv));
410  return (unsigned long long) 0;
411  }
412  return (unsigned long long) limit;
413 }
#define SUBDBG(format, args...)
Definition: papi_debug.h:63
char *(* nvmlErrorStringPtr)(nvmlReturn_t)
Definition: benchSANVML.c:64
Here is the caller graph for this function:

◆ getPowerUsage()

unsigned long long getPowerUsage ( nvmlDevice_t  dev)

Definition at line 335 of file linux-nvml.c.

/*
 * getPowerUsage - read the power draw of device dev through
 * nvmlDeviceGetPowerUsagePtr. The file overview states that power is reported
 * in mW.
 *
 * Returns the reading widened to unsigned long long, or (unsigned long long)-1
 * when the NVML call does not return NVML_SUCCESS.
 */
336 {
337  unsigned int power;
338  nvmlReturn_t bad;
339  bad = (*nvmlDeviceGetPowerUsagePtr)(dev, &power);
340 
341  if (NVML_SUCCESS != bad) {
342  SUBDBG("something went wrong %s\n", (*nvmlErrorStringPtr)(bad));
343  return (unsigned long long) - 1;
344  }
345  return (unsigned long long) power;
346 }
#define SUBDBG(format, args...)
Definition: papi_debug.h:63
char *(* nvmlErrorStringPtr)(nvmlReturn_t)
Definition: benchSANVML.c:64
Here is the caller graph for this function:

◆ getPState()

unsigned long long getPState ( nvmlDevice_t  dev)

Definition at line 265 of file linux-nvml.c.

/*
 * getPState - read the performance state of device dev through
 * nvmlDeviceGetPerformanceStatePtr and convert it to a plain number.
 *
 * The switch enters at the reported state and falls through every lower case,
 * adding one per case, so NVML_PSTATE_n yields n (NVML_PSTATE_0 yields 0,
 * NVML_PSTATE_15 yields 15).
 *
 * Returns that number, or (unsigned long long)-1 when the NVML call fails or
 * the state is NVML_PSTATE_UNKNOWN or any value not listed.
 */
266 {
267  unsigned int ret = 0;
268  nvmlPstates_t state = NVML_PSTATE_15;
269  nvmlReturn_t bad;
270  bad = (*nvmlDeviceGetPerformanceStatePtr)(dev, &state);
271 
272  if (NVML_SUCCESS != bad) {
273  SUBDBG("something went wrong %s\n", (*nvmlErrorStringPtr)(bad));
274  return (unsigned long long) - 1;
275  }
276  switch (state) {
277  case NVML_PSTATE_15:
278  ret++;
279  // fall through
280  case NVML_PSTATE_14:
281  ret++;
282  // fall through
283  case NVML_PSTATE_13:
284  ret++;
285  // fall through
286  case NVML_PSTATE_12:
287  ret++;
288  // fall through
289  case NVML_PSTATE_11:
290  ret++;
291  // fall through
292  case NVML_PSTATE_10:
293  ret++;
294  // fall through
295  case NVML_PSTATE_9:
296  ret++;
297  // fall through
298  case NVML_PSTATE_8:
299  ret++;
300  // fall through
301  case NVML_PSTATE_7:
302  ret++;
303  // fall through
304  case NVML_PSTATE_6:
305  ret++;
306  // fall through
307  case NVML_PSTATE_5:
308  ret++;
309  // fall through
310  case NVML_PSTATE_4:
311  ret++;
312  // fall through
313  case NVML_PSTATE_3:
314  ret++;
315  // fall through
316  case NVML_PSTATE_2:
317  ret++;
318  // fall through
319  case NVML_PSTATE_1:
320  ret++;
321  // fall through
322  case NVML_PSTATE_0:
323  break;
324  // no fall-through here: the break above ends the chain before the error cases
325  case NVML_PSTATE_UNKNOWN:
326  default:
327  /* This should never happen?
328  * The API docs just state Unknown performance state... */
329  return (unsigned long long) - 1;
330  }
331  return (unsigned long long)ret;
332 }
#define SUBDBG(format, args...)
Definition: papi_debug.h:63
long long ret
Definition: iozone.c:1346
char *(* nvmlErrorStringPtr)(nvmlReturn_t)
Definition: benchSANVML.c:64
Here is the caller graph for this function:

◆ getTemperature()

unsigned long long getTemperature ( nvmlDevice_t  dev)

Definition at line 349 of file linux-nvml.c.

/*
 * getTemperature - read the NVML_TEMPERATURE_GPU sensor of device dev through
 * nvmlDeviceGetTemperaturePtr. The file overview gives the unit as Celsius.
 *
 * Returns the reading widened to unsigned long long, or (unsigned long long)-1
 * when the NVML call does not return NVML_SUCCESS.
 */
350 {
351  unsigned int ret = 0;
352  nvmlReturn_t bad;
353  bad = (*nvmlDeviceGetTemperaturePtr)(dev, NVML_TEMPERATURE_GPU, &ret);
354 
355  if (NVML_SUCCESS != bad) {
356  SUBDBG("something went wrong %s\n", (*nvmlErrorStringPtr)(bad));
357  return (unsigned long long) - 1;
358  }
359  return (unsigned long long)ret;
360 }
#define SUBDBG(format, args...)
Definition: papi_debug.h:63
long long ret
Definition: iozone.c:1346
char *(* nvmlErrorStringPtr)(nvmlReturn_t)
Definition: benchSANVML.c:64
Here is the caller graph for this function:

◆ getTotalEccErrors()

unsigned long long getTotalEccErrors ( nvmlDevice_t  dev,
nvmlEccBitType_t  bits 
)

Definition at line 363 of file linux-nvml.c.

/*
 * getTotalEccErrors - read the total ECC error count of kind bits for device
 * dev through nvmlDeviceGetTotalEccErrorsPtr (counter type NVML_VOLATILE_ECC).
 *
 * Returns the count, or (unsigned long long)-1 when the NVML call does not
 * return NVML_SUCCESS.
 */
364 {
365  unsigned long long counts = 0;
366  nvmlReturn_t bad;
367  bad = (*nvmlDeviceGetTotalEccErrorsPtr)(dev, bits, NVML_VOLATILE_ECC , &counts);
368 
369  if (NVML_SUCCESS != bad) {
370  SUBDBG("something went wrong %s\n", (*nvmlErrorStringPtr)(bad));
371  return (unsigned long long) - 1;
372  }
373  return counts;
374 }
#define SUBDBG(format, args...)
Definition: papi_debug.h:63
char *(* nvmlErrorStringPtr)(nvmlReturn_t)
Definition: benchSANVML.c:64
Here is the caller graph for this function:

◆ getUtilization()

unsigned long long getUtilization ( nvmlDevice_t  dev,
int  which_one 
)

Definition at line 380 of file linux-nvml.c.

/*
 * getUtilization - fetch the utilization rates of device dev through
 * nvmlDeviceGetUtilizationRatesPtr and return one of them.
 *
 * which_one selects the field: GPU_UTILIZATION returns util.gpu and
 * MEMORY_UTILIZATION returns util.memory.
 *
 * Returns the selected rate widened to unsigned long long, or
 * (unsigned long long)-1 when the NVML call fails or which_one matches
 * neither selector.
 */
381 {
382  nvmlUtilization_t util;
383  nvmlReturn_t bad;
384  bad = (*nvmlDeviceGetUtilizationRatesPtr)(dev, &util);
385 
386  if (NVML_SUCCESS != bad) {
387  SUBDBG("something went wrong %s\n", (*nvmlErrorStringPtr)(bad));
388  return (unsigned long long) - 1;
389  }
390 
391  switch (which_one) {
392  case GPU_UTILIZATION:
393  return (unsigned long long) util.gpu;
394  case MEMORY_UTILIZATION:
395  return (unsigned long long) util.memory;
396  default:
397  ;
398  }
399 
     /* Unknown selector: report the same sentinel as a failed query. */
400  return (unsigned long long) - 1;
401 }
#define MEMORY_UTILIZATION
Definition: linux-nvml.h:32
#define SUBDBG(format, args...)
Definition: papi_debug.h:63
char *(* nvmlErrorStringPtr)(nvmlReturn_t)
Definition: benchSANVML.c:64
#define GPU_UTILIZATION
Definition: linux-nvml.h:31
Here is the caller graph for this function:

◆ linkCudaLibraries()

static int linkCudaLibraries ( )
static

Definition at line 1151 of file linux-nvml.c.

/*
 * linkCudaLibraries - load the CUDA driver, CUDA runtime and NVML shared
 * libraries at run time and resolve every entry point this component calls.
 *
 * The libraries are opened in the order libcuda.so (dl1), libcudart.so (dl2)
 * and libnvidia-ml.so (dl3); each symbol is looked up with dlsym and stored in
 * the matching file-level *Ptr function pointer.
 *
 * Returns PAPI_OK when everything resolved. On the first failure it writes a
 * short explanation to _nvml_vector.cmp_info.disabled_reason and returns
 * PAPI_ENOSUPP; that also happens up front when _dl_non_dynamic_init is
 * non-NULL, which the code takes as a sign of a statically linked libc.
 *
 * NOTE(review): each lookup is judged by dlerror() != NULL without a dlerror()
 * call beforehand to clear earlier state. If some earlier dl* call left an
 * error pending, the first check here would report a missing symbol that was
 * in fact found - confirm whether callers guarantee a clean state.
 * NOTE(review): the early returns do not dlclose() the handles opened so far;
 * whether another routine releases dl1/dl2/dl3 is not visible here.
 */
1152 {
1153  /* Attempt to guess if we were statically linked to libc, if so bail */
1154  if (_dl_non_dynamic_init != NULL) {
1155  strncpy(_nvml_vector.cmp_info.disabled_reason, "NVML component does not support statically linking of libc.", PAPI_MAX_STR_LEN);
1156  return PAPI_ENOSUPP;
1157  }
1158 
1159  /* Need to link in the cuda libraries, if not found disable the component */
1160  dl1 = dlopen("libcuda.so", RTLD_NOW | RTLD_GLOBAL);
1161  if (!dl1) {
1162  strncpy(_nvml_vector.cmp_info.disabled_reason, "CUDA library libcuda.so not found.", PAPI_MAX_STR_LEN);
1163  return (PAPI_ENOSUPP);
1164  }
1165  cuInitPtr = dlsym(dl1, "cuInit");
1166  if (dlerror() != NULL) {
1167  strncpy(_nvml_vector.cmp_info.disabled_reason, "CUDA function cuInit not found.", PAPI_MAX_STR_LEN);
1168  return (PAPI_ENOSUPP);
1169  }
1170 
      /* CUDA runtime: device selection and enumeration entry points. */
1171  dl2 = dlopen("libcudart.so", RTLD_NOW | RTLD_GLOBAL | RTLD_NODELETE);
1172  if (!dl2) {
1173  strncpy(_nvml_vector.cmp_info.disabled_reason, "CUDA runtime library libcudart.so not found.", PAPI_MAX_STR_LEN);
1174  return (PAPI_ENOSUPP);
1175  }
1176  cudaGetDevicePtr = dlsym(dl2, "cudaGetDevice");
1177  if (dlerror() != NULL) {
1178  strncpy(_nvml_vector.cmp_info.disabled_reason, "CUDART function cudaGetDevice not found.", PAPI_MAX_STR_LEN);
1179  return (PAPI_ENOSUPP);
1180  }
1181  cudaGetDeviceCountPtr = dlsym(dl2, "cudaGetDeviceCount");
1182  if (dlerror() != NULL) {
1183  strncpy(_nvml_vector.cmp_info.disabled_reason, "CUDART function cudaGetDeviceCount not found.", PAPI_MAX_STR_LEN);
1184  return (PAPI_ENOSUPP);
1185  }
1186  cudaDeviceGetPCIBusIdPtr = dlsym(dl2, "cudaDeviceGetPCIBusId");
1187  if (dlerror() != NULL) {
1188  strncpy(_nvml_vector.cmp_info.disabled_reason, "CUDART function cudaDeviceGetPCIBusId not found.", PAPI_MAX_STR_LEN);
1189  return (PAPI_ENOSUPP);
1190  }
1191 
      /* NVML: every query, init and shutdown entry point used by the component. */
1192  dl3 = dlopen("libnvidia-ml.so", RTLD_NOW | RTLD_GLOBAL);
1193  if (!dl3) {
1194  strncpy(_nvml_vector.cmp_info.disabled_reason, "NVML runtime library libnvidia-ml.so not found.", PAPI_MAX_STR_LEN);
1195  return (PAPI_ENOSUPP);
1196  }
1197  nvmlDeviceGetClockInfoPtr = dlsym(dl3, "nvmlDeviceGetClockInfo");
1198  if (dlerror() != NULL) {
1199  strncpy(_nvml_vector.cmp_info.disabled_reason, "NVML function nvmlDeviceGetClockInfo not found.", PAPI_MAX_STR_LEN);
1200  return (PAPI_ENOSUPP);
1201  }
1202  nvmlErrorStringPtr = dlsym(dl3, "nvmlErrorString");
1203  if (dlerror() != NULL) {
1204  strncpy(_nvml_vector.cmp_info.disabled_reason, "NVML function nvmlErrorString not found.", PAPI_MAX_STR_LEN);
1205  return (PAPI_ENOSUPP);
1206  }
1207  nvmlDeviceGetDetailedEccErrorsPtr = dlsym(dl3, "nvmlDeviceGetDetailedEccErrors");
1208  if (dlerror() != NULL) {
1209  strncpy(_nvml_vector.cmp_info.disabled_reason, "NVML function nvmlDeviceGetDetailedEccErrors not found.", PAPI_MAX_STR_LEN);
1210  return (PAPI_ENOSUPP);
1211  }
1212  nvmlDeviceGetFanSpeedPtr = dlsym(dl3, "nvmlDeviceGetFanSpeed");
1213  if (dlerror() != NULL) {
1214  strncpy(_nvml_vector.cmp_info.disabled_reason, "NVML function nvmlDeviceGetFanSpeed not found.", PAPI_MAX_STR_LEN);
1215  return (PAPI_ENOSUPP);
1216  }
1217  nvmlDeviceGetMemoryInfoPtr = dlsym(dl3, "nvmlDeviceGetMemoryInfo");
1218  if (dlerror() != NULL) {
1219  strncpy(_nvml_vector.cmp_info.disabled_reason, "NVML function nvmlDeviceGetMemoryInfo not found.", PAPI_MAX_STR_LEN);
1220  return (PAPI_ENOSUPP);
1221  }
1222  nvmlDeviceGetPerformanceStatePtr = dlsym(dl3, "nvmlDeviceGetPerformanceState");
1223  if (dlerror() != NULL) {
1224  strncpy(_nvml_vector.cmp_info.disabled_reason, "NVML function nvmlDeviceGetPerformanceState not found.", PAPI_MAX_STR_LEN);
1225  return (PAPI_ENOSUPP);
1226  }
1227  nvmlDeviceGetPowerUsagePtr = dlsym(dl3, "nvmlDeviceGetPowerUsage");
1228  if (dlerror() != NULL) {
1229  strncpy(_nvml_vector.cmp_info.disabled_reason, "NVML function nvmlDeviceGetPowerUsage not found.", PAPI_MAX_STR_LEN);
1230  return (PAPI_ENOSUPP);
1231  }
1232  nvmlDeviceGetTemperaturePtr = dlsym(dl3, "nvmlDeviceGetTemperature");
1233  if (dlerror() != NULL) {
1234  strncpy(_nvml_vector.cmp_info.disabled_reason, "NVML function nvmlDeviceGetTemperature not found.", PAPI_MAX_STR_LEN);
1235  return (PAPI_ENOSUPP);
1236  }
1237  nvmlDeviceGetTotalEccErrorsPtr = dlsym(dl3, "nvmlDeviceGetTotalEccErrors");
1238  if (dlerror() != NULL) {
1239  strncpy(_nvml_vector.cmp_info.disabled_reason, "NVML function nvmlDeviceGetTotalEccErrors not found.", PAPI_MAX_STR_LEN);
1240  return (PAPI_ENOSUPP);
1241  }
1242  nvmlDeviceGetUtilizationRatesPtr = dlsym(dl3, "nvmlDeviceGetUtilizationRates");
1243  if (dlerror() != NULL) {
1244  strncpy(_nvml_vector.cmp_info.disabled_reason, "NVML function nvmlDeviceGetUtilizationRates not found.", PAPI_MAX_STR_LEN);
1245  return (PAPI_ENOSUPP);
1246  }
1247  nvmlDeviceGetHandleByIndexPtr = dlsym(dl3, "nvmlDeviceGetHandleByIndex");
1248  if (dlerror() != NULL) {
1249  strncpy(_nvml_vector.cmp_info.disabled_reason, "NVML function nvmlDeviceGetHandleByIndex not found.", PAPI_MAX_STR_LEN);
1250  return (PAPI_ENOSUPP);
1251  }
1252  nvmlDeviceGetPciInfoPtr = dlsym(dl3, "nvmlDeviceGetPciInfo");
1253  if (dlerror() != NULL) {
1254  strncpy(_nvml_vector.cmp_info.disabled_reason, "NVML function nvmlDeviceGetPciInfo not found.", PAPI_MAX_STR_LEN);
1255  return (PAPI_ENOSUPP);
1256  }
1257  nvmlDeviceGetNamePtr = dlsym(dl3, "nvmlDeviceGetName");
1258  if (dlerror() != NULL) {
1259  strncpy(_nvml_vector.cmp_info.disabled_reason, "NVML function nvmlDeviceGetName not found.", PAPI_MAX_STR_LEN);
1260  return (PAPI_ENOSUPP);
1261  }
1262  nvmlDeviceGetInforomVersionPtr = dlsym(dl3, "nvmlDeviceGetInforomVersion");
1263  if (dlerror() != NULL) {
1264  strncpy(_nvml_vector.cmp_info.disabled_reason, "NVML function nvmlDeviceGetInforomVersion not found.", PAPI_MAX_STR_LEN);
1265  return (PAPI_ENOSUPP);
1266  }
1267  nvmlDeviceGetEccModePtr = dlsym(dl3, "nvmlDeviceGetEccMode");
1268  if (dlerror() != NULL) {
1269  strncpy(_nvml_vector.cmp_info.disabled_reason, "NVML function nvmlDeviceGetEccMode not found.", PAPI_MAX_STR_LEN);
1270  return (PAPI_ENOSUPP);
1271  }
1272  nvmlInitPtr = dlsym(dl3, "nvmlInit");
1273  if (dlerror() != NULL) {
1274  strncpy(_nvml_vector.cmp_info.disabled_reason, "NVML function nvmlInit not found.", PAPI_MAX_STR_LEN);
1275  return (PAPI_ENOSUPP);
1276  }
1277  nvmlDeviceGetCountPtr = dlsym(dl3, "nvmlDeviceGetCount");
1278  if (dlerror() != NULL) {
1279  strncpy(_nvml_vector.cmp_info.disabled_reason, "NVML function nvmlDeviceGetCount not found.", PAPI_MAX_STR_LEN);
1280  return (PAPI_ENOSUPP);
1281  }
1282  nvmlShutdownPtr = dlsym(dl3, "nvmlShutdown");
1283  if (dlerror() != NULL) {
1284  strncpy(_nvml_vector.cmp_info.disabled_reason, "NVML function nvmlShutdown not found.", PAPI_MAX_STR_LEN);
1285  return (PAPI_ENOSUPP);
1286  }
1287  nvmlDeviceGetPowerManagementLimitPtr = dlsym(dl3, "nvmlDeviceGetPowerManagementLimit");
1288  if (dlerror() != NULL) {
1289  strncpy(_nvml_vector.cmp_info.disabled_reason, "NVML function nvmlDeviceGetPowerManagementLimit not found.", PAPI_MAX_STR_LEN);
1290  return (PAPI_ENOSUPP);
1291  }
1292  nvmlDeviceSetPowerManagementLimitPtr = dlsym(dl3, "nvmlDeviceSetPowerManagementLimit");
1293  if (dlerror() != NULL) {
1294  strncpy(_nvml_vector.cmp_info.disabled_reason, "NVML function nvmlDeviceSetPowerManagementLimit not found.", PAPI_MAX_STR_LEN);
1295  return (PAPI_ENOSUPP);
1296  }
1297  nvmlDeviceGetPowerManagementLimitConstraintsPtr = dlsym(dl3, "nvmlDeviceGetPowerManagementLimitConstraints");
1298  if (dlerror() != NULL) {
1299  strncpy(_nvml_vector.cmp_info.disabled_reason, "NVML function nvmlDeviceGetPowerManagementLimitConstraints not found.", PAPI_MAX_STR_LEN);
1300  return (PAPI_ENOSUPP);
1301  }
1302  return (PAPI_OK);
1303 }
#define PAPI_OK
Definition: fpapi.h:105
char disabled_reason[PAPI_MAX_STR_LEN]
Definition: papi.h:637
void(* _dl_non_dynamic_init)(void)
Definition: linux-nvml.c:48
nvmlReturn_t(* nvmlDeviceGetCountPtr)(unsigned int *dest)
Definition: benchSANVML.c:66
nvmlReturn_t(* nvmlDeviceGetNamePtr)(nvmlDevice_t, char *, unsigned int)
Definition: benchSANVML.c:73
#define PAPI_ENOSUPP
Definition: fpapi.h:123
nvmlReturn_t(* nvmlDeviceGetHandleByIndexPtr)(unsigned int, nvmlDevice_t *)
Definition: benchSANVML.c:70
nvmlReturn_t(* nvmlDeviceGetEccModePtr)(nvmlDevice_t, nvmlEnableState_t *, nvmlEnableState_t *)
Definition: benchSANVML.c:68
nvmlReturn_t(* nvmlDeviceGetInforomVersionPtr)(nvmlDevice_t, nvmlInforomObject_t, char *, unsigned int)
Definition: benchSANVML.c:71
nvmlReturn_t(* nvmlDeviceGetPciInfoPtr)(nvmlDevice_t, nvmlPciInfo_t *)
Definition: benchSANVML.c:74
nvmlReturn_t(* nvmlDeviceGetPowerManagementLimitConstraintsPtr)(nvmlDevice_t device, unsigned int *minLimit, unsigned int *maxLimit)
Definition: benchSANVML.c:76
PAPI_component_info_t cmp_info
Definition: papi_vector.h:20
cudaError_t(* cudaDeviceGetPCIBusIdPtr)(char *, int, int)
Definition: benchSANVML.c:55
static void * dl1
Definition: linux-cuda.c:110
papi_vector_t _nvml_vector
Definition: linux-nvml.c:1637
nvmlReturn_t(* nvmlDeviceGetClockInfoPtr)(nvmlDevice_t, nvmlClockType_t, unsigned int *)
Definition: benchSANVML.c:65
nvmlReturn_t(* nvmlDeviceGetDetailedEccErrorsPtr)(nvmlDevice_t, nvmlEccBitType_t, nvmlEccCounterType_t, nvmlEccErrorCounts_t *)
Definition: benchSANVML.c:67
nvmlReturn_t(* nvmlDeviceGetPowerManagementLimitPtr)(nvmlDevice_t device, unsigned int *limit)
Definition: benchSANVML.c:77
static void * dl2
Definition: linux-cuda.c:111
nvmlReturn_t(* nvmlDeviceGetTotalEccErrorsPtr)(nvmlDevice_t, nvmlEccBitType_t, nvmlEccCounterType_t, unsigned long long *)
Definition: benchSANVML.c:80
cudaError_t(* cudaGetDeviceCountPtr)(int *)
Definition: benchSANVML.c:54
nvmlReturn_t(* nvmlDeviceGetUtilizationRatesPtr)(nvmlDevice_t, nvmlUtilization_t *)
Definition: benchSANVML.c:81
static void * dl3
Definition: linux-cuda.c:112
nvmlReturn_t(* nvmlDeviceGetPowerUsagePtr)(nvmlDevice_t, unsigned int *)
Definition: benchSANVML.c:78
nvmlReturn_t(* nvmlDeviceGetMemoryInfoPtr)(nvmlDevice_t, nvmlMemory_t *)
Definition: benchSANVML.c:72
nvmlReturn_t(* nvmlDeviceGetTemperaturePtr)(nvmlDevice_t, nvmlTemperatureSensors_t, unsigned int *)
Definition: benchSANVML.c:79
nvmlReturn_t(* nvmlShutdownPtr)(void)
Definition: benchSANVML.c:84
CUresult CUDAAPI(* cuInitPtr)(unsigned int)
Definition: benchSANVML.c:47
nvmlReturn_t(* nvmlDeviceSetPowerManagementLimitPtr)(nvmlDevice_t device, unsigned int limit)
Definition: benchSANVML.c:82
nvmlReturn_t(* nvmlDeviceGetFanSpeedPtr)(nvmlDevice_t, unsigned int *)
Definition: benchSANVML.c:69
char *(* nvmlErrorStringPtr)(nvmlReturn_t)
Definition: benchSANVML.c:64
nvmlReturn_t(* nvmlInitPtr)(void)
Definition: benchSANVML.c:83
nvmlReturn_t(* nvmlDeviceGetPerformanceStatePtr)(nvmlDevice_t, nvmlPstates_t *)
Definition: benchSANVML.c:75
cudaError_t(* cudaGetDevicePtr)(int *)
Definition: benchSANVML.c:53
#define PAPI_MAX_STR_LEN
Definition: fpapi.h:43
Here is the caller graph for this function:

◆ nvml_hardware_read()

static int nvml_hardware_read ( long long *  value,
int  which_one 
)
static

Code that reads event values.

Definition at line 449 of file linux-nvml.c.

/*
 * nvml_hardware_read - read the current value of native event which_one.
 *
 * value:     receives the reading; it is preset to -1 before any query runs.
 * which_one: index into nvml_native_table.
 *
 * The device queried is the one whose index (*cudaGetDevicePtr) reports; that
 * index selects the entries of features[] and devices[]. The event's type
 * then picks the helper (or stored constraint value) that supplies the
 * reading.
 *
 * Returns PAPI_EINVAL when the device index is out of range, the device lacks
 * the event's feature, the event type is not recognised, or the reading came
 * back as -1; returns PAPI_OK otherwise.
 *
 * NOTE(review): this listing has no source lines 452, 474, 499, 507, 511 and
 * 515. They presumably hold the declaration of entry and the case labels for
 * FEATURE_ECC_LOCAL_ERRORS, FEATURE_ECC_TOTAL_ERRORS, FEATURE_POWER_MANAGEMENT
 * and the two FEATURE_NVML_POWER_MANAGEMENT_LIMIT_CONSTRAINT_MIN/MAX macros -
 * confirm against linux-nvml.c.
 */
451 {
453  nvmlDevice_t handle;
454  int cudaIdx = -1;
455 
456  entry = &nvml_native_table[which_one];
457  *value = (long long) - 1;
458  /* replace entry->resources with the current cuda_device->nvml device */
459  (*cudaGetDevicePtr)(&cudaIdx);
460 
     /* detectDevices fills features[] and devices[] for indices
        0 .. device_count-1 only, so device_count itself must be rejected. */
461  if (cudaIdx < 0 || cudaIdx >= device_count)
462  return PAPI_EINVAL;
463 
464  /* Make sure the device we are running on has the requested event */
465  if (!HAS_FEATURE(features[cudaIdx] , entry->type))
466  return PAPI_EINVAL;
467 
468  handle = devices[cudaIdx];
469 
470  switch (entry->type) {
471  case FEATURE_CLOCK_INFO:
472  *value = getClockSpeed(handle, (nvmlClockType_t)entry->options.clock);
473  break;
475  *value = getEccLocalErrors(handle,
476  (nvmlEccBitType_t)entry->options.ecc_opts.bits,
477  (int)entry->options.ecc_opts.which_one);
478  break;
479  case FEATURE_FAN_SPEED:
480  *value = getFanSpeed(handle);
481  break;
482  case FEATURE_MAX_CLOCK:
483  *value = getMaxClockSpeed(handle,
484  (nvmlClockType_t)entry->options.clock);
485  break;
486  case FEATURE_MEMORY_INFO:
487  *value = getMemoryInfo(handle,
488  (int)entry->options.which_one);
489  break;
490  case FEATURE_PERF_STATES:
491  *value = getPState(handle);
492  break;
493  case FEATURE_POWER:
494  *value = getPowerUsage(handle);
495  break;
496  case FEATURE_TEMP:
497  *value = getTemperature(handle);
498  break;
500  *value = getTotalEccErrors(handle,
501  (nvmlEccBitType_t)entry->options.ecc_opts.bits);
502  break;
503  case FEATURE_UTILIZATION:
504  *value = getUtilization(handle,
505  (int)entry->options.which_one);
506  break;
508  *value = getPowerManagementLimit(handle);
509  break;
510 
512  *value = power_management_limit_constraint_min[cudaIdx];
513  break;
514 
516  *value = power_management_limit_constraint_max[cudaIdx];
517  break;
518 
519  default:
520  return PAPI_EINVAL;
521  }
     /* The get* helpers signal failure with (unsigned long long)-1. */
522  if (*value == (long long)(unsigned long long) - 1)
523  return PAPI_EINVAL;
524 
525  return PAPI_OK;
526 }
#define PAPI_OK
Definition: fpapi.h:105
unsigned long long getPState(nvmlDevice_t dev)
Definition: linux-nvml.c:265
#define FEATURE_NVML_POWER_MANAGEMENT_LIMIT_CONSTRAINT_MIN
Definition: linux-nvml.h:17
#define PAPI_EINVAL
Definition: fpapi.h:106
static unsigned int * power_management_limit_constraint_min
Definition: linux-nvml.c:166
unsigned long long getTotalEccErrors(nvmlDevice_t dev, nvmlEccBitType_t bits)
Definition: linux-nvml.c:363
#define FEATURE_ECC_LOCAL_ERRORS
Definition: linux-nvml.h:7
int type
Definition: linux-nvml.h:53
unsigned long long getPowerUsage(nvmlDevice_t dev)
Definition: linux-nvml.c:335
#define FEATURE_ECC_TOTAL_ERRORS
Definition: linux-nvml.h:14
struct local_ecc ecc_opts
Definition: linux-nvml.h:44
#define FEATURE_FAN_SPEED
Definition: linux-nvml.h:8
unsigned long long getMemoryInfo(nvmlDevice_t dev, int which_one)
Definition: linux-nvml.c:240
nvmlEccBitType_t bits
Definition: linux-nvml.h:38
static nvml_native_event_entry_t * nvml_native_table
Definition: linux-nvml.c:155
unsigned long long getPowerManagementLimit(nvmlDevice_t dev)
Definition: linux-nvml.c:403
#define FEATURE_UTILIZATION
Definition: linux-nvml.h:15
#define FEATURE_CLOCK_INFO
Definition: linux-nvml.h:6
nvml_resource_options_t options
Definition: linux-nvml.h:49
static nvmlDevice_t * devices
Definition: linux-nvml.c:163
#define FEATURE_MAX_CLOCK
Definition: linux-nvml.h:9
Definition: linux-nvml.h:48
static int device_count
Definition: linux-nvml.c:158
unsigned long long getEccLocalErrors(nvmlDevice_t dev, nvmlEccBitType_t bits, int which_one)
Definition: linux-nvml.c:185
#define FEATURE_PERF_STATES
Definition: linux-nvml.h:11
nvmlClockType_t clock
Definition: linux-nvml.h:43
#define FEATURE_TEMP
Definition: linux-nvml.h:13
#define FEATURE_NVML_POWER_MANAGEMENT_LIMIT_CONSTRAINT_MAX
Definition: linux-nvml.h:18
static unsigned int * power_management_limit_constraint_max
Definition: linux-nvml.c:167
static int * features
Definition: linux-nvml.c:164
#define FEATURE_POWER
Definition: linux-nvml.h:12
unsigned long long getClockSpeed(nvmlDevice_t dev, nvmlClockType_t which_one)
Definition: linux-nvml.c:170
unsigned long long getTemperature(nvmlDevice_t dev)
Definition: linux-nvml.c:349
unsigned long long getFanSpeed(nvmlDevice_t dev)
Definition: linux-nvml.c:212
#define HAS_FEATURE(features, query)
Definition: linux-nvml.h:20
int which_one
Definition: linux-nvml.h:39
#define FEATURE_POWER_MANAGEMENT
Definition: linux-nvml.h:16
unsigned long long getUtilization(nvmlDevice_t dev, int which_one)
Definition: linux-nvml.c:380
#define FEATURE_MEMORY_INFO
Definition: linux-nvml.h:10
unsigned long long getMaxClockSpeed(nvmlDevice_t dev, nvmlClockType_t which_one)
Definition: linux-nvml.c:226
Here is the call graph for this function:
Here is the caller graph for this function:

◆ nvml_hardware_reset()

static void nvml_hardware_reset ( )
static

Definition at line 416 of file linux-nvml.c.

417 {
418  /* nvmlDeviceSet* and nvmlDeviceClear* calls require root/admin access, so while
419  * possible to implement a reset on the ECC counters, we pass */
420  /*
421  for ( i=0; i < device_count; i++ )
422  nvmlDeviceClearEccErrorCounts( device[i], NVML_VOLATILE_ECC );
423  */
424  int i;
425  nvmlReturn_t ret;
426  unsigned int templimit = 0;
427  for (i = 0; i < device_count; i++) {
429  // if power management is available
430  if (power_management_initial_limit[i] != 0) {
431  ret = (*nvmlDeviceGetPowerManagementLimitPtr)(devices[i], &templimit);
432  if ((ret == NVML_SUCCESS) && (templimit != power_management_initial_limit[i])) {
433  SUBDBG("Reset power_management_limit on device %d to initial value of %d \n", i, power_management_initial_limit[i]);
434  // if power is not at its initial value
435  // reset to initial value
436  ret = (*nvmlDeviceSetPowerManagementLimitPtr)(devices[i], power_management_initial_limit[i]);
437  if (ret != NVML_SUCCESS)
438  SUBDBG("Unable to reset the NVML power management limit on device %i to %ull (return code %d) \n", i, power_management_initial_limit[i] , ret);
439  }
440  }
441  }
442  }
443 }
static unsigned int * power_management_initial_limit
Definition: linux-nvml.c:165
static nvmlDevice_t * devices
Definition: linux-nvml.c:163
static int device_count
Definition: linux-nvml.c:158
#define SUBDBG(format, args...)
Definition: papi_debug.h:63
static int * features
Definition: linux-nvml.c:164
long long ret
Definition: iozone.c:1346
#define HAS_FEATURE(features, query)
Definition: linux-nvml.h:20
#define FEATURE_POWER_MANAGEMENT
Definition: linux-nvml.h:16
int i
Definition: fileop.c:140
Here is the caller graph for this function:

◆ nvml_hardware_write()

static int nvml_hardware_write ( long long *  value,
int  which_one 
)
static

Code that writes event values.

Definition at line 531 of file linux-nvml.c.

532 {
534  nvmlDevice_t handle;
535  int cudaIdx = -1;
536  nvmlReturn_t nvret;
537 
538  entry = &nvml_native_table[which_one];
539  /* replace entry->resources with the current cuda_device->nvml device */
540  (*cudaGetDevicePtr)(&cudaIdx);
541 
542  if (cudaIdx < 0 || cudaIdx > device_count)
543  return PAPI_EINVAL;
544 
545  /* Make sure the device we are running on has the requested event */
546  if (!HAS_FEATURE(features[cudaIdx] , entry->type))
547  return PAPI_EINVAL;
548 
549  handle = devices[cudaIdx];
550 
551  switch (entry->type) {
553  unsigned int setToPower = (unsigned int) * value;
554  if (setToPower < power_management_limit_constraint_min[cudaIdx]) {
555  SUBDBG("Error: Desired power %u mW < minimum %u mW on device %d\n", setToPower, power_management_limit_constraint_min[cudaIdx], cudaIdx);
556  return PAPI_EINVAL;
557  }
558  if (setToPower > power_management_limit_constraint_max[cudaIdx]) {
559  SUBDBG("Error: Desired power %u mW > maximum %u mW on device %d\n", setToPower, power_management_limit_constraint_max[cudaIdx], cudaIdx);
560  return PAPI_EINVAL;
561  }
562  if ((nvret = (*nvmlDeviceSetPowerManagementLimitPtr)(handle, setToPower)) != NVML_SUCCESS) {
563  SUBDBG("Error: %s\n", (*nvmlErrorStringPtr)(nvret));
564  return PAPI_EINVAL;
565  }
566  }
567  break;
568 
569  default:
570  return PAPI_EINVAL;
571  }
572 
573  return PAPI_OK;
574 }
#define PAPI_OK
Definition: fpapi.h:105
#define PAPI_EINVAL
Definition: fpapi.h:106
static unsigned int * power_management_limit_constraint_min
Definition: linux-nvml.c:166
int type
Definition: linux-nvml.h:53
static nvml_native_event_entry_t * nvml_native_table
Definition: linux-nvml.c:155
static nvmlDevice_t * devices
Definition: linux-nvml.c:163
Definition: linux-nvml.h:48
static int device_count
Definition: linux-nvml.c:158
static unsigned int * power_management_limit_constraint_max
Definition: linux-nvml.c:167
#define SUBDBG(format, args...)
Definition: papi_debug.h:63
static int * features
Definition: linux-nvml.c:164
#define HAS_FEATURE(features, query)
Definition: linux-nvml.h:20
nvmlReturn_t(* nvmlDeviceSetPowerManagementLimitPtr)(nvmlDevice_t device, unsigned int limit)
Definition: benchSANVML.c:82
char *(* nvmlErrorStringPtr)(nvmlReturn_t)
Definition: benchSANVML.c:64
#define FEATURE_POWER_MANAGEMENT
Definition: linux-nvml.h:16
Here is the caller graph for this function:

Variable Documentation

◆ _dl_non_dynamic_init

void(* _dl_non_dynamic_init) (void)

Holds control flags. Usually there's one of these per event-set. Usually this is out-of-band configuration of the hardware.

Copy of counts; holds results when stopped.

Definition at line 48 of file linux-nvml.c.

143  {
144  int num_events;
145  int which_counter[NVML_MAX_COUNTERS];
146  long long counter[NVML_MAX_COUNTERS];
static int num_events
Definition: linux-nvml.c:161
#define NVML_MAX_COUNTERS
nvml_control_state_t
Definition: linux-nvml.c:147

◆ _nvml_vector

papi_vector_t _nvml_vector

Vector that points to entry points for our component

Definition at line 1637 of file linux-nvml.c.

◆ device_count

int device_count = 0
static

Number of devices detected at component_init time

Definition at line 158 of file linux-nvml.c.

◆ devices

nvmlDevice_t* devices = NULL
static

Definition at line 163 of file linux-nvml.c.

◆ features

int* features = NULL
static

Definition at line 164 of file linux-nvml.c.

◆ num_events

int num_events = 0
static

number of events in the table

Definition at line 161 of file linux-nvml.c.

◆ nvml_control_state_t

nvml_control_state_t

Definition at line 147 of file linux-nvml.c.

◆ nvml_native_table

nvml_native_event_entry_t* nvml_native_table = NULL
static

This table contains the native events

Definition at line 155 of file linux-nvml.c.

◆ power_management_initial_limit

unsigned int* power_management_initial_limit = NULL
static

Definition at line 165 of file linux-nvml.c.

◆ power_management_limit_constraint_max

unsigned int* power_management_limit_constraint_max = NULL
static

Definition at line 167 of file linux-nvml.c.

◆ power_management_limit_constraint_min

unsigned int* power_management_limit_constraint_min = NULL
static

Definition at line 166 of file linux-nvml.c.