Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Slingshot Switch Samplers Plus Config fixes #1363

Merged
merged 2 commits into from
Feb 18, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 12 additions & 3 deletions configure.ac
Original file line number Diff line number Diff line change
Expand Up @@ -429,6 +429,7 @@ AS_IF([test "x$enable_papi" != xno],[
])
AM_CONDITIONAL([HAVE_LIBPAPI], [test "x$HAVE_LIBPAPI" = xyes])
AM_CONDITIONAL([HAVE_LIBPFM], [test "x$HAVE_LIBPFM" = xyes])
# ENABLE_PAPI is true only when PAPI support was not disabled and both
# libpapi and libpfm were found. Both sides of each comparison carry the
# "x" prefix; comparing "x$enable_papi" against a bare "no" is always true.
AM_CONDITIONAL([ENABLE_PAPI], [test "x$enable_papi" != xno -a "x$HAVE_LIBPAPI" = xyes -a "x$HAVE_LIBPFM" = xyes])
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The first "no" here should be "xno"

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In line 432 you mean?


AC_LIB_HAVE_LINKFLAGS([ibmad], [], [#include <infiniband/mad.h>])
AM_CONDITIONAL([HAVE_LIBIBMAD], [test "x$HAVE_LIBIBMAD" = xyes])
Expand Down Expand Up @@ -466,8 +467,7 @@ AC_ARG_ENABLE([ibnet],
[],
[enable_ibnet="check"])
AM_CONDITIONAL([ENABLE_IBNET], [test "x$enable_ibnet" != xno])
AS_IF([test "$enable_ibnet" = xyes],[
AC_MSG_NOTICE([Disable ibnet module NOT requested])
AS_IF([test "x$enable_ibnet" = xyes],[
AS_IF([test "x$HAVE_LIBIBMAD" = xno],
[AC_MSG_ERROR([ibnet required libibmad or <infiniband/mad.h> not found])])
AS_IF([test "x$HAVE_LIBIBUMAD" = xno],
Expand Down Expand Up @@ -650,7 +650,7 @@ AC_ARG_WITH([slurm],
[with_slurm=check])
have_slurm=no
AC_SUBST([SLURM_CFLAGS])
AS_IF([test "x$with_slurm" != no],[
AS_IF([test "x$with_slurm" != xno],[
save_CFLAGS=$CFLAGS
CFLAGS="$CFLAGS $SLURM_CFLAGS"
AC_CHECK_HEADER([slurm/spank.h], [have_slurm=yes], [have_slurm=no])
Expand All @@ -663,6 +663,8 @@ AS_IF([test "x$with_slurm" != no],[
])
AM_CONDITIONAL([HAVE_SLURM], [test "x$have_slurm" = xyes])

OPTION_DEFAULT_ENABLE([spank_subscriber], [ENABLE_SPANK_SUBSCRIBER], [Builds sampler that subscribes to spank plugin])

AC_CHECK_HEADER([linux/netlink.h], [have_netlink=yes], [have_netlink=no])
AM_CONDITIONAL([HAVE_NETLINK], [test "x$have_netlink" = xyes])

Expand Down Expand Up @@ -925,6 +927,12 @@ AS_IF([test "x$enable_slingshot" = xyes],[
[AC_MSG_ERROR([libcxi or its headers not found])])
])

# --enable-slingshot_switch / --disable-slingshot_switch.
# When the option is not given, enable_slingshot_switch defaults to "check";
# the ENABLE_SLINGSHOT_SWITCH conditional is true for every value except "no".
AC_ARG_ENABLE([slingshot_switch],
[AS_HELP_STRING([--enable-slingshot_switch], [require the slingshot on-switch plugins @<:@default=check@:>@])],
[],
[enable_slingshot_switch="check"])
AM_CONDITIONAL([ENABLE_SLINGSHOT_SWITCH], [test "x$enable_slingshot_switch" != xno])

# define substitutions for configvars and other sed-generated files.
# note carefully the escapes.
OVIS_DO_SUBST([LDMS_SUBST_RULE], ["sed \
Expand Down Expand Up @@ -1097,6 +1105,7 @@ ldms/src/sampler/syspapi/Makefile
ldms/src/sampler/app_sampler/Makefile
ldms/src/sampler/slingshot_metrics/Makefile
ldms/src/sampler/slingshot_info/Makefile
ldms/src/sampler/slingshot_switch/Makefile
ldms/src/contrib/sampler/Makefile
ldms/src/contrib/sampler/daos/Makefile
ldms/src/contrib/sampler/daos/test/Makefile
Expand Down
3 changes: 1 addition & 2 deletions ldms/src/Makefile.am
Original file line number Diff line number Diff line change
Expand Up @@ -4,14 +4,13 @@ SUBDIRS += third-plugins
SUBDIRS += core
SUBDIRS += ldmsd

SUBDIRS += decomp

if ENABLE_OVIS_AUTH
SUBDIRS += auth
endif

if ENABLE_STORE
SUBDIRS += store
SUBDIRS += decomp
endif

if ENABLE_SAMPLER
Expand Down
12 changes: 12 additions & 0 deletions ldms/src/sampler/Makefile.am
Original file line number Diff line number Diff line change
Expand Up @@ -30,10 +30,13 @@ ldmssamplerincludedir = $(includedir)/ldms/sampler
ldmssamplerinclude_HEADERS = sampler_base.h

SUBDIRS += netlink

if ENABLE_LUSTRE
SUBDIRS += lustre_client
SUBDIRS += lustre_mdt
SUBDIRS += lustre_ost
SUBDIRS += lustre_mdc
endif

if HAVE_DCGM
SUBDIRS += dcgm_sampler
Expand Down Expand Up @@ -131,8 +134,10 @@ SUBDIRS += job_info_slurm
SUBDIRS += spank
endif

if ENABLE_SPANK_SUBSCRIBER
# This slurm sampler does not have a slurm build dependency
SUBDIRS += slurm
endif

if ENABLE_LUSTRE
SUBDIRS += lustre
Expand Down Expand Up @@ -200,8 +205,10 @@ if ENABLE_MPI_SAMPLER
SUBDIRS += shm
endif

if ENABLE_PAPI
if HAVE_LIBPAPI
SUBDIRS += papi

if HAVE_LIBPFM
SUBDIRS += syspapi

Expand All @@ -218,6 +225,7 @@ librapl_la_LIBADD = $(COMMON_LIBADD) $(LTLIBPAPI) $(LTLIBPFM) -lm
pkglib_LTLIBRARIES += librapl.la
endif
endif
endif

if ENABLE_PROCDISKSTATS
SUBDIRS += procdiskstats
Expand Down Expand Up @@ -308,3 +316,7 @@ SUBDIRS += slingshot_metrics
SUBDIRS += slingshot_info
endif
endif

if ENABLE_SLINGSHOT_SWITCH
SUBDIRS += slingshot_switch
endif
25 changes: 25 additions & 0 deletions ldms/src/sampler/slingshot_switch/Makefile.am
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
# Build rules for the slingshot_switch sampler plugins.
#
# Two libtool libraries are built from separate source files with identical
# link settings: libslingshot_switch.la and libslingshot_switch_1.la.

# First plugin library: sources, link-time libraries, linker flags, and
# preprocessor flags.
libslingshot_switch_la_SOURCES = \
slingshot_switch.c
libslingshot_switch_la_LIBADD = \
$(top_builddir)/ldms/src/core/libldms.la \
$(top_builddir)/lib/src/coll/libcoll.la \
$(top_builddir)/ldms/src/sampler/libsampler_base.la
# -export-symbols-regex limits the exported symbols to those matching
# 'get_plugin'.
libslingshot_switch_la_LDFLAGS = \
-no-undefined \
-export-symbols-regex 'get_plugin'
libslingshot_switch_la_CPPFLAGS = @OVIS_INCLUDE_ABS@

# Second plugin library: same settings as above, built from
# slingshot_switch_1.c.
libslingshot_switch_1_la_SOURCES = \
slingshot_switch_1.c
libslingshot_switch_1_la_LIBADD = \
$(top_builddir)/ldms/src/core/libldms.la \
$(top_builddir)/lib/src/coll/libcoll.la \
$(top_builddir)/ldms/src/sampler/libsampler_base.la
libslingshot_switch_1_la_LDFLAGS = \
-no-undefined \
-export-symbols-regex 'get_plugin'
libslingshot_switch_1_la_CPPFLAGS = @OVIS_INCLUDE_ABS@

# Both libraries are installed into the package library directory.
pkglib_LTLIBRARIES = libslingshot_switch.la libslingshot_switch_1.la

# Man page, distributed with the tarball and installed in section 7.
dist_man7_MANS = Plugin_slingshot_switch.man
131 changes: 131 additions & 0 deletions ldms/src/sampler/slingshot_switch/Plugin_slingshot_switch.man
Original file line number Diff line number Diff line change
@@ -0,0 +1,131 @@
.\" Manpage for Plugin_slingshot_switch
.\" Contact ovis-help@ca.sandia.gov to correct errors or typos.
.TH man 7 "17 Nov 2023" "v4" "LDMS Plugin slingshot_switch man page"

.SH NAME
Plugin_slingshot_switch - man page for the LDMS slingshot_switch plugin

.SH SYNOPSIS
Within ldmsd_controller or a configuration file:
.br
config name=slingshot_switch [ <attr> = <value> ]

.SH DESCRIPTION
With LDMS (Lightweight Distributed Metric Service), plugins for the ldmsd (ldms
daemon) are configured via ldmsd_controller or a configuration file. The
slingshot_switch plugin uses LDMS_V_LIST and LDMS_V_RECORD to provide slingshot
switch info via the dump_counters command run on the switch.

slingshot_switch and slingshot_switch_1 are the same plugin. There are two copies
to enable sampling at two different rates.

.SH CONFIGURATION ATTRIBUTE SYNTAX
The slingshot_switch plugin uses the sampler_base base class. This man page covers
only the configuration attributes, or those with default values, specific to
this plugin; see ldms_sampler_base.man for the attributes of the base class.

.TP
.BR config
name=<plugin_name>
.br
configuration line
.RS
.TP
name=<plugin_name>
.br
This MUST be slingshot_switch (or slingshot_switch_1).
.TP
conffile=<conffile>
.br
Configuration file. First non-comment line must be "n=XXX" or "p=XXX,YYY,ZZZ".
p does not support ranges. Then variables or groups are listed one
per line. Comment lines are allowed in the file; a comment line is one whose
first character is a '#'.

Arguments are those of dump_counters.
.TP
schema=<schema>
.br
Optional schema name. It is intended that the same sampler on different nodes
with different metrics have a different schema. If not specified, will
default to `slingshot_switch` (or `slingshot_switch_1`).
.RE

.SH BUGS (and future enhancements)
\[bu] This is still under development.

\[bu] Does not yet support ranges for the ports.

\[bu] Does not check for duplicate ports.

\[bu] Could have more robust handling of errors in the config file.

\[bu] MAX Ports is 70.

\[bu] Possibly can reduce unnecessary allocations in schema_metric_list.

\[bu] DEBUG messages are excessive, while this is in development.

\[bu] Need to check for extra whitespace in variable names.

\[bu] Only checking for the expected number of data output lines. Note that the output has at least one extra line.


.SH EXAMPLES
.PP
1) Within ldmsd_controller or a configuration file:
.nf
load name=slingshot_switch
config name=slingshot_switch producer=vm1_1 instance=vm1_1/slingshot_switch conffile=/home/conffile.txt
start name=slingshot_switch interval=1000000 offset=0
.fi
or the above with `slingshot_switch_1`.

conffile.txt can look something like:
.nf
#This can be a leading comment(s)
n=65
# This can be an interspersed comment(s)
cfrx
#This is yet another comment(s)
.fi

.PP
2) For the ldmsd configuration file sampler_ss.conf:
.nf
env SWITCH=$(hostname)
env COMPONENT_ID=1

load name=slingshot_switch
config name=slingshot_switch producer=${SWITCH} component_id=${COMPONENT_ID} instance=${SWITCH}/port_metrics conffile=/rwfs/OVIS_slingshot-4.4.1/etc/ldms/slingshot_ldms_1s.txt
start name=slingshot_switch interval=1000000
.fi

with slingshot_ldms_1s.txt:
.nf
p=0,1,2,3
rfc_3635
.fi

Command line to start ldmsd using the above:
.nf
/rwfs/OVIS_slingshot-4.4.1/etc/ldms# ldmsd -x sock:411 -c /rwfs/OVIS_slingshot-4.4.1/etc/ldms/sampler_ss.conf -m 2M -v QUIET
.fi

Then ldms_ls output:
.nf
x3000c0r42b0/port_metrics1: consistent, last update: Fri Nov 17 17:24:08 2023 +0000 [23292us]
M u64 component_id 1
D u64 job_id 0
D u64 app_id 0
M record_type slingshot_port LDMS_V_RECORD_TYPE
D list<> slingshot_port_list
port (x) IfInDiscards (x) IfInErrors (x) IfInUnknownProtos (x) IfOutDiscards (x) IfOutErrors (x) Dot3HCInPauseFrames (x) Dot3HCOutPauseFrames (x) IfHCInOctets (x) IfHCInUcastPkts (x) IfHCInMulticastPkts (x) IfHCInBroadcastPkts (x) IfHCOutOctets (x) IfHCOutUcastPkts (x) IfHCOutMulticastPkts (x) IfHCOutBroadcastPkts (x)
0 3135102637 0 3135102637 0 0 0 0 147205648261491 2495004354147 2471 0 1536216301 20234005 0 0
1 3135102637 0 3135102637 0 0 0 0 147204949152872 2494992497033 0 0 698442077 10279716 0 0
2 3135102637 0 3135102637 0 0 0 0 147205081815556 2494994737508 0 0 698345785 10272362 0 0
3 3135102637 0 3135102637 0 0 0 0 147205460019681 2495001153446 0 0 698845184 10277326 0 0
.fi

.SH SEE ALSO
ldmsd(8), ldms_quickstart(7), ldmsd_controller(8), ldms_sampler_base(7)
Loading
Loading