Skip to content

Commit

Permalink
cleanup dcgm_sampler dev cruft
Browse files Browse the repository at this point in the history
This brings the top-of-tree dcgm_sampler in line with the ovis-4.4.2 version.
changes:
eliminate code generator
eliminate file-scope variables use_base, termed, and pthread usage.
  • Loading branch information
baallan authored and tom95858 committed Feb 13, 2024
1 parent d570f51 commit 08bec37
Show file tree
Hide file tree
Showing 4 changed files with 101 additions and 160 deletions.
25 changes: 5 additions & 20 deletions ldms/src/sampler/dcgm_sampler/Makefile.am
Original file line number Diff line number Diff line change
@@ -1,34 +1,19 @@
bin_PROGRAMS = ldms-dcgm-list-fields

libdcgm_sampler_la_SOURCES = \
dcgm_sampler.c

dcgm_sampler.c
libdcgm_sampler_la_LIBADD = \
$(top_builddir)/ldms/src/sampler/libsampler_base.la \
$(top_builddir)/ldms/src/core/libldms.la \
$(top_builddir)/lib/src/coll/libcoll.la \
$(top_builddir)/ldms/src/sampler/libjobid_helper.la \
$(top_builddir)/lib/src/ovis_util/libovis_util.la \
$(top_builddir)/ldms/src/sampler/libjobid_helper.la \
-ldcgm

libdcgm_sampler_la_LDFLAGS = \
-no-undefined \
-export-symbols-regex 'get_plugin' \
-version-info 1:0:0
-export-symbols-regex 'get_plugin' \
-version-info 1:0:0
libdcgm_sampler_la_CPPFLAGS = \
@OVIS_INCLUDE_ABS@

pkglib_LTLIBRARIES = libdcgm_sampler.la

dist_man7_MANS = Plugin_dcgm_sampler.man

dist_noinst_SCRIPTS = gen-ldms-dcgm-list-fields

ldms-dcgm-list-fields.c: $(srcdir)/gen-ldms-dcgm-list-fields
$(srcdir)/gen-ldms-dcgm-list-fields > ldms-dcgm-list-fields.c

ldms_dcgm_list_fields_SOURCES = ldms-dcgm-list-fields.c
ldms_dcgm_list_fields_CPPFLAGS = @OVIS_INCLUDE_ABS@
ldms_dcgm_list_fields_LDADD = -ldcgm

clean-local::
$(RM) $(builddir)/ldms_dcgm_list_fields.c
12 changes: 6 additions & 6 deletions ldms/src/sampler/dcgm_sampler/Plugin_dcgm_sampler.man
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ Plugin_dcgm_sampler - man page for the LDMS dcgm_sampler plugin
.SH SYNOPSIS
Within ldmsd_controller or a configuration file:
.br
config name=dcgm_sampler [ <attr>=<value> ] [use_base=1]
config name=dcgm_sampler [ <attr>=<value> ] [use_base=<*>]

.SH DESCRIPTION
With LDMS (Lightweight Distributed Metric Service), plugins for the ldmsd (ldms daemon) are configured via ldmsd_controller
Expand All @@ -17,7 +17,7 @@ The schema is named "dcgm" by default.

.TP
.BR config
name=<plugin_name> interval=<interval(us)> [fields=<fields>] [schema=<schema_name>] [job_set=<metric set name>] [use_base=1 [uid=<int>] [gid=<int>] [perm=<octal>] [instance=<name>] [producer=<name>] [job_id=<metric name in job_set set>]]
name=<plugin_name> interval=<interval(us)> [fields=<fields>] [schema=<schema_name>] [job_set=<metric set name>] [use_base=<*> [uid=<int>] [gid=<int>] [perm=<octal>] [instance=<name>] [producer=<name>] [job_id=<metric name in job_set set>]]
.br
configuration line
.RS
Expand All @@ -26,9 +26,9 @@ name=<plugin_name>
.br
This MUST be dcgm_sampler.
.TP
use_base=1
use_base=<*>
.br
This enables the sampler_base configuration option processing (see ldms_sampler_base(7)). If not given, the options not
Any value given enables the sampler_base configuration option processing (see ldms_sampler_base(7)). If not given, the options not
listed below are ignored.
.TP
interval=<interval(us)>
Expand All @@ -42,8 +42,8 @@ fields=<fields>
identifiers that the plugin should watch. By default the plugin will
watch fields 150,155. The field identifier meanings are defined in dcgm_fields.h
and the DCGM Library API Reference Manual and may vary with DCGM release version.
The ldms-dcgm-list-fields command provides a table of fields, subject to hardware
support.
The plugin usage message provides a table of fields, subject to hardware
support; see the output of 'ldms-plugins.sh dcgm_sampler'.
.TP
schema=<schema_name>
.br
Expand Down
134 changes: 90 additions & 44 deletions ldms/src/sampler/dcgm_sampler/dcgm_sampler.c
Original file line number Diff line number Diff line change
Expand Up @@ -19,13 +19,10 @@
#include "config.h"
#include "jobid_helper.h"
#include "sampler_base.h"
#include <pthread.h>
#include "dstring.h"

#define _GNU_SOURCE


static pthread_mutex_t cfg_lock = PTHREAD_MUTEX_INITIALIZER;

#define SAMP "dcgm_sampler"

static unsigned short default_fields[] = {
Expand Down Expand Up @@ -72,9 +69,8 @@ static ldms_schema_t gpu_schema;
/* NOTE: we are assuming here that GPU ids will start at zero and
not exceed the DCGM_MAX_NUM_DEVICES count in value */
static ldms_set_t gpu_sets[DCGM_MAX_NUM_DEVICES];
static int use_base;
static base_data_t base;
static int termed;
static char *field_help;

/* We won't use many of the entries in this array, but DCGM_FI_MAX_FIELDS
is only around 1000. We trade off memory usage to allow quick translation of
Expand Down Expand Up @@ -243,7 +239,7 @@ static ldms_set_t gpu_metric_set_create(int gpu_id)
char instance_name[256];

ovis_log(mylog, OVIS_LDEBUG, "gpu_metric_set_create() (gpu %d)\n", gpu_id);
if (use_base) {
if (base) {
char *tmp = base->instance_name;
size_t len = strlen(tmp);
base->instance_name = malloc( len + 20);
Expand Down Expand Up @@ -294,7 +290,7 @@ static int gpu_schema_create()
int i;

ovis_log(mylog, OVIS_LDEBUG, "gpu_schema_create()\n");
if (!use_base) {
if (!base) {
sch = ldms_schema_new(conf.schema_name);
if (sch == NULL)
goto err1;
Expand Down Expand Up @@ -343,7 +339,7 @@ static int gpu_schema_create()

return 0;
err2:
if (use_base)
if (base)
base_schema_delete(base);
else
ldms_schema_delete(sch);
Expand All @@ -354,7 +350,7 @@ static int gpu_schema_create()

static void gpu_schema_destroy()
{
if (use_base)
if (base)
base_schema_delete(base);
else
ldms_schema_delete(gpu_schema);
Expand Down Expand Up @@ -435,6 +431,62 @@ static int parse_fields_value(const char *fields_str, unsigned short **fields_ou
return -1;
}

/*
 * Translate a DCGM field type code into a short printable type name.
 *
 * ft: one of the DCGM_FT_* field type values (the caller in this file
 *     passes field_meta->fieldType).
 *
 * Returns a pointer to a string literal, so the caller must not free or
 * modify it. Any type other than double, int64, string, or timestamp is
 * reported as "unsupported_data_type".
 */
const char *typeString(int ft)
{
	if (ft == DCGM_FT_DOUBLE)
		return "double";
	if (ft == DCGM_FT_INT64)
		return "int64_t";
	if (ft == DCGM_FT_STRING)
		return "string";
	if (ft == DCGM_FT_TIMESTAMP)
		return "timestamp";

	return "unsupported_data_type";
}

/* Size passed to dstr_init2() when creating the help-text buffer. */
#define NUSAGE 20480

/*
 * Build the extended usage text and store it in the file-scope field_help.
 *
 * The text consists of `preamble`, a column header line, and then one line
 * for every field id in [0, DCGM_FI_MAX_FIELDS) for which DcgmFieldGetById()
 * returns metadata, formatted as "<id>\t<tag>\t<type>\t(<unit>)\n". A field
 * whose metadata has no valueFormat is shown with the unit "no_format".
 *
 * When dcgm_initialized is false, dcgmInit() is called first and
 * dcgmShutdown() is called before returning. If that dcgmInit() fails, the
 * function returns immediately and field_help is left unchanged.
 *
 * preamble: text placed at the start of the generated help string.
 */
static void init_field_help(char *preamble)
{
	/* Nonzero when this call has to bring DCGM up (and down) on its own. */
	const int temporary_init = !dcgm_initialized;

	if (temporary_init && dcgmInit() != DCGM_ST_OK)
		return;

	dstring_t help;
	dstr_init2(&help, NUSAGE);

	dstrcat(&help, preamble, DSTRING_ALL);
	dstrcat(&help, "field_id\ttag/metric\t\ttype\t(units)\n", DSTRING_ALL);

	for (int id = 0; id < DCGM_FI_MAX_FIELDS; id++) {
		dcgm_field_meta_p meta = DcgmFieldGetById(id);

		/* Ids with no metadata are simply left out of the table. */
		if (!meta)
			continue;

		const char *unit = "no_format";
		if (meta->valueFormat)
			unit = meta->valueFormat->unit;

		dstrcat_int(&help, (int64_t)meta->fieldId);
		dstrcat(&help, "\t", 1);
		dstrcat(&help, meta->tag, DSTRING_ALL);
		dstrcat(&help, "\t", 1);
		dstrcat(&help, typeString(meta->fieldType), DSTRING_ALL);
		dstrcat(&help, "\t(", 2);
		dstrcat(&help, unit, DSTRING_ALL);
		dstrcat(&help, ")\n", 2);
	}

	/* Hand the finished text to field_help, then release the dstring. */
	field_help = dstr_extract(&help);
	dstr_free(&help);

	if (temporary_init)
		dcgmShutdown();
}

/**************************************************************************
* Externally accessed functions
**************************************************************************/
Expand All @@ -446,15 +498,12 @@ static int config(struct ldmsd_plugin *self,
int rc = -1;
int i;

pthread_mutex_lock(&cfg_lock);
if (termed)
termed = 0;
ovis_log(mylog, OVIS_LDEBUG, "config() called\n");
if (dcgm_initialized) {
ovis_log(mylog, OVIS_LERROR, "config() called twice. Stop it first.\n");
pthread_mutex_unlock(&cfg_lock);
return EINVAL;
}
int use_base = 0;
value = av_value(avl, "use_base");
if (value != NULL) {
use_base = 1;
Expand All @@ -463,17 +512,17 @@ static int config(struct ldmsd_plugin *self,
ovis_log(mylog, OVIS_LDEBUG, "Ignoring sampler_base\n");
}

value = av_value(avl, "interval");
if (value == NULL) {
ovis_log(mylog, OVIS_LERROR, "config() \"interval\" option missing\n");
goto err0;
}
errno = 0;
conf.interval = strtol(value, NULL, 10);
if (errno != 0) {
ovis_log(mylog, OVIS_LERROR, "config() \"interval\" value conversion error: %d\n", errno);
goto err0;
}
value = av_value(avl, "interval");
if (value == NULL) {
ovis_log(mylog, OVIS_LERROR, "config() \"interval\" option missing\n");
goto err0;
}
errno = 0;
conf.interval = strtol(value, NULL, 10);
if (errno != 0) {
ovis_log(mylog, OVIS_LERROR, "config() \"interval\" value conversion error: %d\n", errno);
goto err0;
}

if (! use_base) {
int jc = jobid_helper_config(avl);
Expand All @@ -494,7 +543,7 @@ static int config(struct ldmsd_plugin *self,
goto err0;
}
} else {
base_config(avl, SAMP, "dcgm", mylog);
base = base_config(avl, SAMP, "dcgm", mylog);
conf.schema_name = strdup(base->schema_name);
}

Expand Down Expand Up @@ -531,15 +580,14 @@ static int config(struct ldmsd_plugin *self,
gpu_sets[gpu_ids[i]] = gpu_metric_set_create(gpu_ids[i]);
}

pthread_mutex_unlock(&cfg_lock);
return 0;

err4:
for (i = i-1; i >= 0; i--) {
gpu_metric_set_destroy(gpu_sets[gpu_ids[i]]);
}
gpu_schema_destroy();
if (use_base) {
if (base) {
free(base->instance_name);
base->instance_name = NULL;
base_del(base);
Expand All @@ -556,17 +604,13 @@ static int config(struct ldmsd_plugin *self,
free(conf.schema_name);
conf.schema_name = NULL;
err0:
pthread_mutex_unlock(&cfg_lock);
return rc;
}

static int sample(struct ldmsd_sampler *self)
{
ovis_log(mylog, OVIS_LDEBUG, SAMP" sample() called\n");
pthread_mutex_lock(&cfg_lock);
if (!termed)
gpu_sample();
pthread_mutex_unlock(&cfg_lock);
gpu_sample();
return 0;
}

Expand All @@ -575,9 +619,8 @@ static void term(struct ldmsd_plugin *self)
int i;

ovis_log(mylog, OVIS_LDEBUG, "term() called\n");
pthread_mutex_lock(&cfg_lock);
gpu_schema_destroy();
if (use_base) {
if (base) {
free(base->instance_name);
base->instance_name = NULL;
base_del(base);
Expand All @@ -593,13 +636,13 @@ static void term(struct ldmsd_plugin *self)
gpu_metric_set_destroy(gpu_sets[gpu_ids[i]]);
}
dcgm_fini();
termed = 1;
use_base = 0;
free(field_help);
field_help = NULL;
if (mylog) {
ovis_log_destroy(mylog);
mylog = NULL;
}
pthread_mutex_unlock(&cfg_lock);

}

static ldms_set_t get_set(struct ldmsd_sampler *self)
Expand All @@ -610,20 +653,20 @@ static ldms_set_t get_set(struct ldmsd_sampler *self)
static const char *usage(struct ldmsd_plugin *self)
{
ovis_log(mylog, OVIS_LDEBUG, "usage() called\n");
return "config name=" SAMP
char *preamble = "config name=" SAMP
" interval=<interval(us)> [fields=<fields>]\n"
" [schema=<schema_name>] [job_set=<metric set name>]\n"
" [use_base=1\n"
" [use_base=<*>\n"
" [uid=<int>] [gid=<int>] [perm=<octal>] [instance=<name>]\n"
" [producer=<name>] [job_id=<metric name in job_set set>]\n"
" [producer=<name>] [job_id=<metric name in job_set set>]\n"
" ]\n"
" name=<plugin_name>\n"
" interval=<interval(us)> DCGM query interval (microsecond)\n"
" must match dcgm_sampler interval for plugin start\n"
" fields=<fields> list of DCGM field_ids\n"
" schema=<schema_name> default " SAMP "\n"
" job_set=<job metric set name>\n"
" If use_base=1 is given, the additional parameters are applied\n"
" If use_base=<*> is given, the additional parameters are applied\n"
" (see ldms_sampler_base).\n"
" producer A unique name for the host providing the timing data\n"
" (default $HOSTNAME)\n"
Expand All @@ -636,8 +679,11 @@ static const char *usage(struct ldmsd_plugin *self)
" uid The user-id of the set's owner\n"
" gid The group id of the set's owner\n"
" perm The set's access permissions\n"
" See ldms-dcgm-list-fields for input values to fields\n"
;
" The field numbers are tabulated:\n"
" (Not all can be ldms metrics, as indicated by 'unsupported_data_type')\n";
if (!field_help)
init_field_help(preamble);
return field_help ? field_help : preamble;
}

static struct ldmsd_sampler nvidia_dcgm_plugin = {
Expand Down
Loading

0 comments on commit 08bec37

Please sign in to comment.