Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

cleanup dcgm_sampler dev cruft #1359

Merged
merged 1 commit into from
Feb 13, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 5 additions & 20 deletions ldms/src/sampler/dcgm_sampler/Makefile.am
Original file line number Diff line number Diff line change
@@ -1,34 +1,19 @@
bin_PROGRAMS = ldms-dcgm-list-fields

libdcgm_sampler_la_SOURCES = \
dcgm_sampler.c

dcgm_sampler.c
libdcgm_sampler_la_LIBADD = \
$(top_builddir)/ldms/src/sampler/libsampler_base.la \
$(top_builddir)/ldms/src/core/libldms.la \
$(top_builddir)/lib/src/coll/libcoll.la \
$(top_builddir)/ldms/src/sampler/libjobid_helper.la \
$(top_builddir)/lib/src/ovis_util/libovis_util.la \
$(top_builddir)/ldms/src/sampler/libjobid_helper.la \
-ldcgm

libdcgm_sampler_la_LDFLAGS = \
-no-undefined \
-export-symbols-regex 'get_plugin' \
-version-info 1:0:0
-export-symbols-regex 'get_plugin' \
-version-info 1:0:0
libdcgm_sampler_la_CPPFLAGS = \
@OVIS_INCLUDE_ABS@

pkglib_LTLIBRARIES = libdcgm_sampler.la

dist_man7_MANS = Plugin_dcgm_sampler.man

dist_noinst_SCRIPTS = gen-ldms-dcgm-list-fields

ldms-dcgm-list-fields.c: $(srcdir)/gen-ldms-dcgm-list-fields
$(srcdir)/gen-ldms-dcgm-list-fields > ldms-dcgm-list-fields.c

ldms_dcgm_list_fields_SOURCES = ldms-dcgm-list-fields.c
ldms_dcgm_list_fields_CPPFLAGS = @OVIS_INCLUDE_ABS@
ldms_dcgm_list_fields_LDADD = -ldcgm

clean-local::
$(RM) $(builddir)/ldms_dcgm_list_fields.c
12 changes: 6 additions & 6 deletions ldms/src/sampler/dcgm_sampler/Plugin_dcgm_sampler.man
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ Plugin_dcgm_sampler - man page for the LDMS dcgm_sampler plugin
.SH SYNOPSIS
Within ldmsd_controller or a configuration file:
.br
config name=dcgm_sampler [ <attr>=<value> ] [use_base=1]
config name=dcgm_sampler [ <attr>=<value> ] [use_base=<*>]

.SH DESCRIPTION
With LDMS (Lightweight Distributed Metric Service), plugins for the ldmsd (ldms daemon) are configured via ldmsd_controller
Expand All @@ -17,7 +17,7 @@ The schema is named "dcgm" by default.

.TP
.BR config
name=<plugin_name> interval=<interval(us)> [fields=<fields>] [schema=<schema_name>] [job_set=<metric set name>] [use_base=1 [uid=<int>] [gid=<int>] [perm=<octal>] [instance=<name>] [producer=<name>] [job_id=<metric name in job_set set>]]
name=<plugin_name> interval=<interval(us)> [fields=<fields>] [schema=<schema_name>] [job_set=<metric set name>] [use_base=<*> [uid=<int>] [gid=<int>] [perm=<octal>] [instance=<name>] [producer=<name>] [job_id=<metric name in job_set set>]]
.br
configuration line
.RS
Expand All @@ -26,9 +26,9 @@ name=<plugin_name>
.br
This MUST be dcgm_sampler.
.TP
use_base=1
use_base=<*>
.br
This enables the sampler_base configuration option processing (see ldms_sampler_base(7)). If not given, the options not
Any value given enables the sampler_base configuration option processing (see ldms_sampler_base(7)). If not given, the options not
listed below are ignored.
.TP
interval=<interval(us)>
Expand All @@ -42,8 +42,8 @@ fields=<fields>
identifiers that the plugin should watch. By default the plugin will
watch fields 150,155. The field identifier meanings are defined in dcgm_fields.h
and the DCGM Library API Reference Manual and may vary with DCGM release version.
The ldms-dcgm-list-fields command provides a table of fields, subject to hardware
support.
The plugin usage message provides a table of fields, subject to hardware
support; see the output of 'ldms-plugins.sh dcgm_sampler'.
.TP
schema=<schema_name>
.br
Expand Down
134 changes: 90 additions & 44 deletions ldms/src/sampler/dcgm_sampler/dcgm_sampler.c
Original file line number Diff line number Diff line change
Expand Up @@ -19,13 +19,10 @@
#include "config.h"
#include "jobid_helper.h"
#include "sampler_base.h"
#include <pthread.h>
#include "dstring.h"

#define _GNU_SOURCE


static pthread_mutex_t cfg_lock = PTHREAD_MUTEX_INITIALIZER;

#define SAMP "dcgm_sampler"

static unsigned short default_fields[] = {
Expand Down Expand Up @@ -72,9 +69,8 @@ static ldms_schema_t gpu_schema;
/* NOTE: we are assuming here that GPU ids will start at zero and
not exceed the DCGM_MAX_NUM_DEVICES count in value */
static ldms_set_t gpu_sets[DCGM_MAX_NUM_DEVICES];
static int use_base;
static base_data_t base;
static int termed;
static char *field_help;

/* We won't use many of the entries in this array, but DCGM_FI_MAX_FIELDS
is only around 1000. We trade off memory usage to allow quick translation of
Expand Down Expand Up @@ -243,7 +239,7 @@ static ldms_set_t gpu_metric_set_create(int gpu_id)
char instance_name[256];

ovis_log(mylog, OVIS_LDEBUG, "gpu_metric_set_create() (gpu %d)\n", gpu_id);
if (use_base) {
if (base) {
char *tmp = base->instance_name;
size_t len = strlen(tmp);
base->instance_name = malloc( len + 20);
Expand Down Expand Up @@ -294,7 +290,7 @@ static int gpu_schema_create()
int i;

ovis_log(mylog, OVIS_LDEBUG, "gpu_schema_create()\n");
if (!use_base) {
if (!base) {
sch = ldms_schema_new(conf.schema_name);
if (sch == NULL)
goto err1;
Expand Down Expand Up @@ -343,7 +339,7 @@ static int gpu_schema_create()

return 0;
err2:
if (use_base)
if (base)
base_schema_delete(base);
else
ldms_schema_delete(sch);
Expand All @@ -354,7 +350,7 @@ static int gpu_schema_create()

static void gpu_schema_destroy()
{
if (use_base)
if (base)
base_schema_delete(base);
else
ldms_schema_delete(gpu_schema);
Expand Down Expand Up @@ -435,6 +431,62 @@ static int parse_fields_value(const char *fields_str, unsigned short **fields_ou
return -1;
}

/*
 * Translate a DCGM field type code into the type name shown in the
 * field table.
 *
 * ft: one of the DCGM_FT_* codes.
 *
 * Returns a pointer to a string literal (never NULL, never to be freed).
 * Any code other than DOUBLE, INT64, STRING or TIMESTAMP is reported as
 * "unsupported_data_type".
 */
const char *typeString(int ft)
{
	/* Start from the fallback and override it for the recognized codes. */
	const char *type_name = "unsupported_data_type";

	if (ft == DCGM_FT_DOUBLE) {
		type_name = "double";
	} else if (ft == DCGM_FT_INT64) {
		type_name = "int64_t";
	} else if (ft == DCGM_FT_STRING) {
		type_name = "string";
	} else if (ft == DCGM_FT_TIMESTAMP) {
		type_name = "timestamp";
	}

	return type_name;
}

#define NUSAGE 20480
/*
 * Build the extended help text and store it in the file-scope field_help.
 *
 * The text is 'preamble', then a header row, then one row per field id in
 * [0, DCGM_FI_MAX_FIELDS) for which DcgmFieldGetById() returns metadata:
 *     <field id> TAB <tag> TAB <type name> TAB (<units>)
 * Fields whose metadata has no valueFormat are shown with the units
 * "no_format".
 *
 * When dcgm_initialized is unset, dcgmInit() is called first and
 * dcgmShutdown() is called before returning. If that dcgmInit() fails the
 * function returns without touching field_help.
 */
static void init_field_help(char *preamble)
{
	if (!dcgm_initialized) {
		dcgmReturn_t init_rc = dcgmInit();
		if (init_rc != DCGM_ST_OK) {
			return;
		}
	}

	dstring_t help;
	dstr_init2(&help, NUSAGE);

	dstrcat(&help, preamble, DSTRING_ALL);
	dstrcat(&help, "field_id\ttag/metric\t\ttype\t(units)\n", DSTRING_ALL);

	int id;
	for (id = 0; id < DCGM_FI_MAX_FIELDS; id++) {
		dcgm_field_meta_p meta = DcgmFieldGetById(id);
		if (!meta) {
			/* no metadata for this id: nothing to tabulate */
			continue;
		}

		const char *units = "no_format";
		if (meta->valueFormat) {
			units = meta->valueFormat->unit;
		}

		dstrcat_int(&help, (int64_t)meta->fieldId);
		dstrcat(&help, "\t", 1);
		dstrcat(&help, meta->tag, DSTRING_ALL);
		dstrcat(&help, "\t", 1);
		dstrcat(&help, typeString(meta->fieldType), DSTRING_ALL);
		dstrcat(&help, "\t(", 2);
		dstrcat(&help, units, DSTRING_ALL);
		dstrcat(&help, ")\n", 2);
	}

	/* Keep the accumulated text in field_help, then release the builder. */
	field_help = dstr_extract(&help);
	dstr_free(&help);

	/* Same condition as on entry: undo the dcgmInit() made above. */
	if (!dcgm_initialized) {
		dcgmShutdown();
	}
}

/**************************************************************************
* Externally accessed functions
**************************************************************************/
Expand All @@ -446,15 +498,12 @@ static int config(struct ldmsd_plugin *self,
int rc = -1;
int i;

pthread_mutex_lock(&cfg_lock);
if (termed)
termed = 0;
ovis_log(mylog, OVIS_LDEBUG, "config() called\n");
if (dcgm_initialized) {
ovis_log(mylog, OVIS_LERROR, "config() called twice. Stop it first.\n");
pthread_mutex_unlock(&cfg_lock);
return EINVAL;
}
int use_base = 0;
value = av_value(avl, "use_base");
if (value != NULL) {
use_base = 1;
Expand All @@ -463,17 +512,17 @@ static int config(struct ldmsd_plugin *self,
ovis_log(mylog, OVIS_LDEBUG, "Ignoring sampler_base\n");
}

value = av_value(avl, "interval");
if (value == NULL) {
ovis_log(mylog, OVIS_LERROR, "config() \"interval\" option missing\n");
goto err0;
}
errno = 0;
conf.interval = strtol(value, NULL, 10);
if (errno != 0) {
ovis_log(mylog, OVIS_LERROR, "config() \"interval\" value conversion error: %d\n", errno);
goto err0;
}
value = av_value(avl, "interval");
if (value == NULL) {
ovis_log(mylog, OVIS_LERROR, "config() \"interval\" option missing\n");
goto err0;
}
errno = 0;
conf.interval = strtol(value, NULL, 10);
if (errno != 0) {
ovis_log(mylog, OVIS_LERROR, "config() \"interval\" value conversion error: %d\n", errno);
goto err0;
}

if (! use_base) {
int jc = jobid_helper_config(avl);
Expand All @@ -494,7 +543,7 @@ static int config(struct ldmsd_plugin *self,
goto err0;
}
} else {
base_config(avl, SAMP, "dcgm", mylog);
base = base_config(avl, SAMP, "dcgm", mylog);
conf.schema_name = strdup(base->schema_name);
}

Expand Down Expand Up @@ -531,15 +580,14 @@ static int config(struct ldmsd_plugin *self,
gpu_sets[gpu_ids[i]] = gpu_metric_set_create(gpu_ids[i]);
}

pthread_mutex_unlock(&cfg_lock);
return 0;

err4:
for (i = i-1; i >= 0; i--) {
gpu_metric_set_destroy(gpu_sets[gpu_ids[i]]);
}
gpu_schema_destroy();
if (use_base) {
if (base) {
free(base->instance_name);
base->instance_name = NULL;
base_del(base);
Expand All @@ -556,17 +604,13 @@ static int config(struct ldmsd_plugin *self,
free(conf.schema_name);
conf.schema_name = NULL;
err0:
pthread_mutex_unlock(&cfg_lock);
return rc;
}

static int sample(struct ldmsd_sampler *self)
{
ovis_log(mylog, OVIS_LDEBUG, SAMP" sample() called\n");
pthread_mutex_lock(&cfg_lock);
if (!termed)
gpu_sample();
pthread_mutex_unlock(&cfg_lock);
gpu_sample();
return 0;
}

Expand All @@ -575,9 +619,8 @@ static void term(struct ldmsd_plugin *self)
int i;

ovis_log(mylog, OVIS_LDEBUG, "term() called\n");
pthread_mutex_lock(&cfg_lock);
gpu_schema_destroy();
if (use_base) {
if (base) {
free(base->instance_name);
base->instance_name = NULL;
base_del(base);
Expand All @@ -593,13 +636,13 @@ static void term(struct ldmsd_plugin *self)
gpu_metric_set_destroy(gpu_sets[gpu_ids[i]]);
}
dcgm_fini();
termed = 1;
use_base = 0;
free(field_help);
field_help = NULL;
if (mylog) {
ovis_log_destroy(mylog);
mylog = NULL;
}
pthread_mutex_unlock(&cfg_lock);

}

static ldms_set_t get_set(struct ldmsd_sampler *self)
Expand All @@ -610,20 +653,20 @@ static ldms_set_t get_set(struct ldmsd_sampler *self)
static const char *usage(struct ldmsd_plugin *self)
{
ovis_log(mylog, OVIS_LDEBUG, "usage() called\n");
return "config name=" SAMP
char *preamble = "config name=" SAMP
" interval=<interval(us)> [fields=<fields>]\n"
" [schema=<schema_name>] [job_set=<metric set name>]\n"
" [use_base=1\n"
" [use_base=<*>\n"
" [uid=<int>] [gid=<int>] [perm=<octal>] [instance=<name>]\n"
" [producer=<name>] [job_id=<metric name in job_set set>]\n"
" [producer=<name>] [job_id=<metric name in job_set set>]\n"
" ]\n"
" name=<plugin_name>\n"
" interval=<interval(us)> DCGM query interval (microsecond)\n"
" must match dcgm_sampler interval for plugin start\n"
" fields=<fields> list of DCGM field_ids\n"
" schema=<schema_name> default " SAMP "\n"
" job_set=<job metric set name>\n"
" If use_base=1 is given, the additional parameters are applied\n"
" If use_base=<*> is given, the additional parameters are applied\n"
" (see ldms_sampler_base).\n"
" producer A unique name for the host providing the timing data\n"
" (default $HOSTNAME)\n"
Expand All @@ -636,8 +679,11 @@ static const char *usage(struct ldmsd_plugin *self)
" uid The user-id of the set's owner\n"
" gid The group id of the set's owner\n"
" perm The set's access permissions\n"
" See ldms-dcgm-list-fields for input values to fields\n"
;
" The field numbers are tabulated:\n"
" (Not all can be ldms metrics, as indicated by 'unsupported_data_type')\n";
if (!field_help)
init_field_help(preamble);
return field_help ? field_help : preamble;
}

static struct ldmsd_sampler nvidia_dcgm_plugin = {
Expand Down
Loading
Loading