From 08bec370ed0b75e612c6dfe1a3d70fa7a0083b36 Mon Sep 17 00:00:00 2001 From: Ben Allan Date: Mon, 12 Feb 2024 11:14:31 -0700 Subject: [PATCH] cleanup dcgm_sampler dev cruft This brings the top-of-tree dcgm_sampler in line with the ovis-4.4.2 version. Changes: eliminate the code generator; eliminate the file-scope variables use_base and termed, and the pthread usage. --- ldms/src/sampler/dcgm_sampler/Makefile.am | 25 +--- .../dcgm_sampler/Plugin_dcgm_sampler.man | 12 +- ldms/src/sampler/dcgm_sampler/dcgm_sampler.c | 134 ++++++++++++------ .../dcgm_sampler/gen-ldms-dcgm-list-fields | 90 ------------ 4 files changed, 101 insertions(+), 160 deletions(-) delete mode 100755 ldms/src/sampler/dcgm_sampler/gen-ldms-dcgm-list-fields diff --git a/ldms/src/sampler/dcgm_sampler/Makefile.am b/ldms/src/sampler/dcgm_sampler/Makefile.am index af0de9bc9..5c4075a3b 100644 --- a/ldms/src/sampler/dcgm_sampler/Makefile.am +++ b/ldms/src/sampler/dcgm_sampler/Makefile.am @@ -1,34 +1,19 @@ -bin_PROGRAMS = ldms-dcgm-list-fields - libdcgm_sampler_la_SOURCES = \ - dcgm_sampler.c - + dcgm_sampler.c libdcgm_sampler_la_LIBADD = \ $(top_builddir)/ldms/src/sampler/libsampler_base.la \ $(top_builddir)/ldms/src/core/libldms.la \ $(top_builddir)/lib/src/coll/libcoll.la \ - $(top_builddir)/ldms/src/sampler/libjobid_helper.la \ + $(top_builddir)/lib/src/ovis_util/libovis_util.la \ + $(top_builddir)/ldms/src/sampler/libjobid_helper.la \ -ldcgm - libdcgm_sampler_la_LDFLAGS = \ -no-undefined \ - -export-symbols-regex 'get_plugin' \ - -version-info 1:0:0 + -export-symbols-regex 'get_plugin' \ + -version-info 1:0:0 libdcgm_sampler_la_CPPFLAGS = \ @OVIS_INCLUDE_ABS@ pkglib_LTLIBRARIES = libdcgm_sampler.la dist_man7_MANS = Plugin_dcgm_sampler.man - -dist_noinst_SCRIPTS = gen-ldms-dcgm-list-fields - -ldms-dcgm-list-fields.c: $(srcdir)/gen-ldms-dcgm-list-fields - $(srcdir)/gen-ldms-dcgm-list-fields > ldms-dcgm-list-fields.c - -ldms_dcgm_list_fields_SOURCES = ldms-dcgm-list-fields.c -ldms_dcgm_list_fields_CPPFLAGS = 
@OVIS_INCLUDE_ABS@ -ldms_dcgm_list_fields_LDADD = -ldcgm - -clean-local:: - $(RM) $(builddir)/ldms_dcgm_list_fields.c diff --git a/ldms/src/sampler/dcgm_sampler/Plugin_dcgm_sampler.man b/ldms/src/sampler/dcgm_sampler/Plugin_dcgm_sampler.man index 183e56522..24832ee22 100644 --- a/ldms/src/sampler/dcgm_sampler/Plugin_dcgm_sampler.man +++ b/ldms/src/sampler/dcgm_sampler/Plugin_dcgm_sampler.man @@ -6,7 +6,7 @@ Plugin_dcgm_sampler - man page for the LDMS dcgm_sampler plugin .SH SYNOPSIS Within ldmsd_controller or a configuration file: .br -config name=dcgm_sampler [ = ] [use_base=1] +config name=dcgm_sampler [ = ] [use_base=<*>] .SH DESCRIPTION With LDMS (Lightweight Distributed Metric Service), plugins for the ldmsd (ldms daemon) are configured via ldmsd_controller @@ -17,7 +17,7 @@ The schema is named "dcgm" by default. .TP .BR config -name= interval= [fields=] [schema=] [job_set=] [use_base=1 [uid=] [gid=] [perm=] [instance=] [producer=] [job_id=]] +name= interval= [fields=] [schema=] [job_set=] [use_base=<*> [uid=] [gid=] [perm=] [instance=] [producer=] [job_id=]] .br configuration line .RS @@ -26,9 +26,9 @@ name= .br This MUST be dcgm_sampler. .TP -use_base=1 +use_base=<*> .br -This enables the sampler_base configuration option processing (see ldms_sampler_base(7)). If not given, the options not +Any value given enables the sampler_base configuration option processing (see ldms_sampler_base(7)). If not given, the options not listed below are ignored. .TP interval= @@ -42,8 +42,8 @@ fields= identifiers that the plugin should watch. By default the plugin will watch fields 150,155. The field identifier meanings are defined in dcgm_fields.h and the DCGM Library API Reference Manual and may vary with DCGM release version. -The ldms-dcgm-list-fields command provides a table of fields, subject to hardware -support. +The plugin usage message provides a table of fields, subject to hardware +support; see the output of 'ldms-plugins.sh dcgm_sampler'. 
.TP schema= .br diff --git a/ldms/src/sampler/dcgm_sampler/dcgm_sampler.c b/ldms/src/sampler/dcgm_sampler/dcgm_sampler.c index f603852b3..061d5e2c7 100644 --- a/ldms/src/sampler/dcgm_sampler/dcgm_sampler.c +++ b/ldms/src/sampler/dcgm_sampler/dcgm_sampler.c @@ -19,13 +19,10 @@ #include "config.h" #include "jobid_helper.h" #include "sampler_base.h" -#include +#include "dstring.h" #define _GNU_SOURCE - -static pthread_mutex_t cfg_lock = PTHREAD_MUTEX_INITIALIZER; - #define SAMP "dcgm_sampler" static unsigned short default_fields[] = { @@ -72,9 +69,8 @@ static ldms_schema_t gpu_schema; /* NOTE: we are assuming here that GPU ids will start at zero and not exceed the DCGM_MAX_NUM_DEVICES count in value */ static ldms_set_t gpu_sets[DCGM_MAX_NUM_DEVICES]; -static int use_base; static base_data_t base; -static int termed; +static char *field_help; /* We won't use many of the entries in this array, but DCGM_FI_MAX_FIELDS is is only around 1000. We trade off memory usage to allow quick translation of @@ -243,7 +239,7 @@ static ldms_set_t gpu_metric_set_create(int gpu_id) char instance_name[256]; ovis_log(mylog, OVIS_LDEBUG, "gpu_metric_set_create() (gpu %d)\n", gpu_id); - if (use_base) { + if (base) { char *tmp = base->instance_name; size_t len = strlen(tmp); base->instance_name = malloc( len + 20); @@ -294,7 +290,7 @@ static int gpu_schema_create() int i; ovis_log(mylog, OVIS_LDEBUG, "gpu_schema_create()\n"); - if (!use_base) { + if (!base) { sch = ldms_schema_new(conf.schema_name); if (sch == NULL) goto err1; @@ -343,7 +339,7 @@ static int gpu_schema_create() return 0; err2: - if (use_base) + if (base) base_schema_delete(base); else ldms_schema_delete(sch); @@ -354,7 +350,7 @@ static int gpu_schema_create() static void gpu_schema_destroy() { - if (use_base) + if (base) base_schema_delete(base); else ldms_schema_delete(gpu_schema); @@ -435,6 +431,62 @@ static int parse_fields_value(const char *fields_str, unsigned short **fields_ou return -1; } +const char *typeString(int 
ft) +{ + switch (ft) { + case DCGM_FT_DOUBLE: + return "double"; + case DCGM_FT_INT64: + return "int64_t"; + case DCGM_FT_STRING: + return "string"; + case DCGM_FT_TIMESTAMP: + return "timestamp"; + default: + return "unsupported_data_type"; + } +} + +#define NUSAGE 20480 +static void init_field_help(char *preamble) +{ + if (!dcgm_initialized) { + dcgmReturn_t rc = dcgmInit(); + if (rc != DCGM_ST_OK) { + return; + } + } + + dstring_t ds; + dstr_init2(&ds, NUSAGE); + + int i; + dstrcat(&ds, preamble, DSTRING_ALL); + dstrcat(&ds, "field_id\ttag/metric\t\ttype\t(units)\n", DSTRING_ALL); + for (i = 0; i < DCGM_FI_MAX_FIELDS; i++) { + dcgm_field_meta_p field_meta; + field_meta = DcgmFieldGetById(i); + if (field_meta) { + dstrcat_int(&ds, (int64_t)field_meta->fieldId); + dstrcat(&ds, "\t", 1); + dstrcat(&ds, field_meta->tag, DSTRING_ALL); + dstrcat(&ds, "\t", 1); + dstrcat(&ds, typeString(field_meta->fieldType), DSTRING_ALL); + dstrcat(&ds, "\t(", 2); + dstrcat(&ds, (field_meta->valueFormat ? + field_meta->valueFormat->unit : + "no_format"), DSTRING_ALL); + dstrcat(&ds, ")\n", 2); + } + } + field_help = dstr_extract(&ds); + dstr_free(&ds); + + if (!dcgm_initialized) { + dcgmShutdown(); + } +} + /************************************************************************** * Externally accessed functions **************************************************************************/ @@ -446,15 +498,12 @@ static int config(struct ldmsd_plugin *self, int rc = -1; int i; - pthread_mutex_lock(&cfg_lock); - if (termed) - termed = 0; ovis_log(mylog, OVIS_LDEBUG, "config() called\n"); if (dcgm_initialized) { ovis_log(mylog, OVIS_LERROR, "config() called twice. 
Stop it first.\n"); - pthread_mutex_unlock(&cfg_lock); return EINVAL; } + int use_base = 0; value = av_value(avl, "use_base"); if (value != NULL) { use_base = 1; @@ -463,17 +512,17 @@ static int config(struct ldmsd_plugin *self, ovis_log(mylog, OVIS_LDEBUG, "Ignoring sampler_base\n"); } - value = av_value(avl, "interval"); - if (value == NULL) { - ovis_log(mylog, OVIS_LERROR, "config() \"interval\" option missing\n"); - goto err0; - } - errno = 0; - conf.interval = strtol(value, NULL, 10); - if (errno != 0) { - ovis_log(mylog, OVIS_LERROR, "config() \"interval\" value conversion error: %d\n", errno); - goto err0; - } + value = av_value(avl, "interval"); + if (value == NULL) { + ovis_log(mylog, OVIS_LERROR, "config() \"interval\" option missing\n"); + goto err0; + } + errno = 0; + conf.interval = strtol(value, NULL, 10); + if (errno != 0) { + ovis_log(mylog, OVIS_LERROR, "config() \"interval\" value conversion error: %d\n", errno); + goto err0; + } if (! use_base) { int jc = jobid_helper_config(avl); @@ -494,7 +543,7 @@ static int config(struct ldmsd_plugin *self, goto err0; } } else { - base_config(avl, SAMP, "dcgm", mylog); + base = base_config(avl, SAMP, "dcgm", mylog); conf.schema_name = strdup(base->schema_name); } @@ -531,7 +580,6 @@ static int config(struct ldmsd_plugin *self, gpu_sets[gpu_ids[i]] = gpu_metric_set_create(gpu_ids[i]); } - pthread_mutex_unlock(&cfg_lock); return 0; err4: @@ -539,7 +587,7 @@ static int config(struct ldmsd_plugin *self, gpu_metric_set_destroy(gpu_sets[gpu_ids[i]]); } gpu_schema_destroy(); - if (use_base) { + if (base) { free(base->instance_name); base->instance_name = NULL; base_del(base); @@ -556,17 +604,13 @@ static int config(struct ldmsd_plugin *self, free(conf.schema_name); conf.schema_name = NULL; err0: - pthread_mutex_unlock(&cfg_lock); return rc; } static int sample(struct ldmsd_sampler *self) { ovis_log(mylog, OVIS_LDEBUG, SAMP" sample() called\n"); - pthread_mutex_lock(&cfg_lock); - if (!termed) - gpu_sample(); - 
pthread_mutex_unlock(&cfg_lock); + gpu_sample(); return 0; } @@ -575,9 +619,8 @@ static void term(struct ldmsd_plugin *self) int i; ovis_log(mylog, OVIS_LDEBUG, "term() called\n"); - pthread_mutex_lock(&cfg_lock); gpu_schema_destroy(); - if (use_base) { + if (base) { free(base->instance_name); base->instance_name = NULL; base_del(base); @@ -593,13 +636,13 @@ static void term(struct ldmsd_plugin *self) gpu_metric_set_destroy(gpu_sets[gpu_ids[i]]); } dcgm_fini(); - termed = 1; - use_base = 0; + free(field_help); + field_help = NULL; if (mylog) { ovis_log_destroy(mylog); mylog = NULL; } - pthread_mutex_unlock(&cfg_lock); + } static ldms_set_t get_set(struct ldmsd_sampler *self) @@ -610,12 +653,12 @@ static ldms_set_t get_set(struct ldmsd_sampler *self) static const char *usage(struct ldmsd_plugin *self) { ovis_log(mylog, OVIS_LDEBUG, "usage() called\n"); - return "config name=" SAMP + char *preamble = "config name=" SAMP " interval= [fields=]\n" " [schema=] [job_set=]\n" - " [use_base=1\n" + " [use_base=<*>\n" " [uid=] [gid=] [perm=] [instance=]\n" - " [producer=] [job_id=]\n" + " [producer=] [job_id=]\n" " ]\n" " name=\n" " interval= DCGM query interval (microsecond)\n" @@ -623,7 +666,7 @@ static const char *usage(struct ldmsd_plugin *self) " fields= list of DCGM field_ids\n" " schema= default " SAMP "\n" " job_set=\n" - " If use_base=1 is given, the additional parameters are applied\n" + " If use_base=<*> is given, the additional parameters are applied\n" " (see ldms_sampler_base).\n" " producer A unique name for the host providing the timing data\n" " (default $HOSTNAME)\n" @@ -636,8 +679,11 @@ static const char *usage(struct ldmsd_plugin *self) " uid The user-id of the set's owner\n" " gid The group id of the set's owner\n" " perm The set's access permissions\n" - " See ldms-dcgm-list-fields for input values to fields\n" - ; + " The field numbers are tabulated:\n" + " (Not all can be ldms metrics, as indicated by 'unsupported_data_type')\n"; + if (!field_help) + 
init_field_help(preamble); + return field_help ? field_help : preamble; } static struct ldmsd_sampler nvidia_dcgm_plugin = { diff --git a/ldms/src/sampler/dcgm_sampler/gen-ldms-dcgm-list-fields b/ldms/src/sampler/dcgm_sampler/gen-ldms-dcgm-list-fields deleted file mode 100755 index c4473ddd3..000000000 --- a/ldms/src/sampler/dcgm_sampler/gen-ldms-dcgm-list-fields +++ /dev/null @@ -1,90 +0,0 @@ -#! /bin/bash -cat << EOF -/* generated with gen_ldms_dcgm_list_fields */ -#include -#include -#include -#include -struct fielddef { const char *macro; int field_id; char *tag;}; -#define FIELDDEF(m) { #m, m, NULL }, -struct fielddef all_fields[] = { -EOF -grep DCGM_FI_ /usr/include/dcgm_fields.h |grep '^#define' |grep ' [0-9]'|grep -v + | sed -e 's/#define //' -e 's/ .*//g' -e 's/^/FIELDDEF(/' -e 's/$/)/' -cat << EOF - { NULL, 0, NULL } -}; - - -#define ARRAY_SIZE(a) (sizeof(a) / sizeof(*a)) -#define _GNU_SOURCE - - -static int dcgm_init() -{ - dcgmReturn_t rc; - - rc = dcgmInit(); - if (rc != DCGM_ST_OK) { - return -1; - } - - return 0; -} - -static void dcgm_fini() -{ - dcgmShutdown(); -} - -const char *typeString(int ft) -{ - switch (ft) { - case DCGM_FT_DOUBLE: - return "double"; - case DCGM_FT_INT64: - return "int64_t"; - case DCGM_FT_STRING: - return "string"; - case DCGM_FT_TIMESTAMP: - return "timestamp"; - default: - return "unmapped_type"; - } -} - -static void dump_dcgm_tags() -{ - int i; - printf("field_id\ttag\tmacro\ttype\tunits\n"); - for (i = 0; i < ARRAY_SIZE(all_fields) && - all_fields[i].macro != NULL; i++) { - - dcgm_field_meta_p field_meta; - field_meta = DcgmFieldGetById(all_fields[i].field_id); - if (field_meta) { - switch (field_meta->fieldType) { - case DCGM_FT_DOUBLE: - case DCGM_FT_INT64: - case DCGM_FT_STRING: - case DCGM_FT_TIMESTAMP: - all_fields[i].tag = strdup(field_meta->tag); - printf("%d\t%s\t%s\t%s\t\"%s\"\n", - all_fields[i].field_id, field_meta->tag, - all_fields[i].macro, - typeString(field_meta->fieldType), - 
field_meta->valueFormat->unit); - break; - default: - break; - } - } - } -} - -int main() -{ - dcgm_init(); - dump_dcgm_tags(); - dcgm_fini(); -} -EOF