Commit
add test demonstrating problems with json sampler:
- the metric set content gets mixed
- the metric set has only the schema of the first message seen
- messages get lost
baallan authored and tom95858 committed Nov 20, 2024
1 parent e90e806 commit 38f97cf
Showing 6 changed files with 186 additions and 1 deletion.
96 changes: 96 additions & 0 deletions ldms/scripts/examples/json_stream_sampler
@@ -0,0 +1,96 @@
export plugname=json_stream_sampler
export dsname=$(ldms_dstat_schema_name mmalloc=1 io=1 fd=1 stat=1 auto-schema=1)
export dstat_schema=$dsname
export LDMSD_LOG_LEVEL=ERROR
export LDMSD_LOG_TIME_SEC=1
export LDMSD_EXTRA="-m 128m"

# Define and use fake_pid to allow testing without root & ldms-notify;
# good enough for sorting out storage pipeline issues.
# Each message gets a new msgno so we can check for missing data.
# fake_pid <daemon number to post to> <pid to fake> <schema name> <event name> <stream name>
# Uses specific message formats for the events task_init_priv and task_exit;
# any other event gets a generic format.
fmsgno=0
function fake_pid {
((fmsgno++))
fdaemon=$1
fpid=$2
fschema=$3
fevent=$4
fstream=$5
case $fevent in
task_init_priv)
fmsg="{\"msgno\":$fmsgno,\"schema\":\"$fschema\",\"event\":\"task_init_priv\",\"timestamp\":1731000790,\"context\":\"*\",\"data\":{\"start\":\"1729013913.013913\",\"job_id\":\"0\",\"serial\":1,\"os_pid\":$fpid,\"uid\":4294967295,\"gid\":4294967295,\"is_thread\":0,\"exe\":\"/usr/lib/systemd/systemd\",\"start_tick\":\"8\",\"task_pid\":1,\"task_global_id\":-1}}"
;;
task_exit)
fmsg="{\"msgno\":$fmsgno,\"schema\":\"$fschema\",\"event\":\"task_exit\",\"timestamp\":1731000792,\"context\":\"*\",\"data\":{\"start\":\"1731000777.000777\",\"job_id\":\"0\",\"serial\":1131,\"os_pid\":$fpid,\"uid\":4294967295,\"gid\":4294967295,\"is_thread\":0,\"exe\":\"/usr/lpp/mmfs/bin/mmksh\",\"duration\":15.161968946456909,\"start_tick\":\"198686390\",\"task_pid\":140962,\"task_global_id\":-1}}"
;;
*)
fmsg="{\"msgno\":$fmsgno,\"schema\":\"$fschema\",\"event\":\"$fevent\",\"timestamp\":1731000792,\"data\":{\"start\":\"1731000777.000777\",\"job_id\":\"0\",\"os_pid\":$fpid,\"uid\":4294967295,\"gid\":4294967295,\"exe\":\"/mypath/myprog\",\"start_tick\":\"198686390\"}}"
;;
esac
if test -n "$PORT"; then
iport=$PORT
else
iport=${ports[$fdaemon]}
fi
echo sending "<$fmsg>" to $iport
echo "$fmsg" | ldmsd_stream_publish -p $iport -a none -s $fstream -t json -x sock -h localhost
}
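# Example (illustrative): the first call below,
#   fake_pid 1 2000 linux_task_data task_init_priv ldms-notify
# publishes to the port of daemon 1 a message of the form
#   {"msgno":1,"schema":"linux_task_data","event":"task_init_priv",
#    "timestamp":1731000790,"context":"*","data":{...,"os_pid":2000,...}}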

portbase=61070
rm -f $LOGDIR/json*.log
rm -rf $STOREDIR
mkdir -p $STOREDIR
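# Valgrind argument presets; note the second VGARGS assignment below
# overrides the first (drd) set, so only the leak-check arguments take effect.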
VGARGS="--tool=drd --suppressions=/scratch1/baallan/ovis/ldms/scripts/examples/linux_proc_sampler.drd.supp"
VGARGS="--leak-check=full --track-origins=yes --trace-children=yes --show-leak-kinds=definite --time-stamp=yes --keep-debuginfo=yes"
DAEMONS $(seq 3)
#vgon
LDMSD 1
vgoff
#vgon
LDMSD 2
vgoff
LDMSD 3
vgoff
SLEEP 1

fake_pid 1 2000 linux_task_data task_init_priv ldms-notify
fake_pid 1 2002 slurm_task_data task_init_priv ldms-notify
fake_pid 1 1999 lsf_task_data task_init_priv ldms-notify
SLEEP 1
fake_pid 1 1999 lsf_task_data task_exit ldms-notify
fake_pid 1 2002 slurm_task_data task_exit ldms-notify
fake_pid 1 2000 linux_task_data task_exit ldms-notify
SLEEP 1
fake_pid 1 2001 myschema myevent ldms-notify
fake_pid 1 2002 myschema myevent ldms-notify

SLEEP 1
for lc in $(seq 3); do
echo "CHECKING STATUS ON $lc:"
ldmsd_controller --auth none --port ${ports[$lc]} --cmd stream_status
done
SLEEP 2
MESSAGE ldms_ls on host 1:
LDMS_LS 1 -v
MESSAGE ldms_ls on host 2:
SLEEP 1
LDMS_LS 2 -v
SLEEP 5
MESSAGE stream_client_dump on daemons 1 and 2:
for lc in $(seq 2); do
echo "CHECKING CLIENT DUMP ON $lc:"
ldmsd_controller --auth none --port ${ports[$lc]} --cmd stream_client_dump
SLEEP 1
done
SLEEP 5
for lc in $(seq 3); do
LDMS_LS $lc -v
SLEEP 2
done
SLEEP 2
KILL_LDMSD 3 2 1
file_created $STOREDIR/node/$dsname
rollover_created $STOREDIR/blobs/ldms-notify.DAT
# TODO: check here for the expected CSV files and their line counts for the stream messages.
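# A possible sketch of that check (hypothetical; assumes store_csv with
# container=pnode, as configured in json_stream_sampler.2, writes one file
# per schema under $STOREDIR/pnode):
# for sch in linux_task_data slurm_task_data lsf_task_data myschema; do
#   echo "$sch rows:"
#   wc -l $STOREDIR/pnode/$sch*
# done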
17 changes: 17 additions & 0 deletions ldms/scripts/examples/json_stream_sampler.1
@@ -0,0 +1,17 @@
load name=json_stream_sampler
config name=json_stream_sampler producer=localhost${i} instance=localhost${i}/jss component_id=${i} stream=ldms-notify perm=0644 heap_sz=64m
start name=json_stream_sampler interval=1000000 offset=0
#log sampler.json_stream DEBUG

load name=dstat
config name=dstat producer=localhost${i} instance=localhost${i}/${dstat_schema} component_id=${i} mmalloc=1 io=1 fd=1 auto-schema=1 stat=1 perm=777
start name=dstat interval=1000000 offset=0

# define updater to push sets to L1 as they appear
##updtr_add name=push_pidset push=onchange interval=1000000
# catch self in .*
##updtr_prdcr_add name=push_pidset regex=.*
# match sets by schema name
##updtr_match_add name=push_pidset regex=linux_task_data match=schema
# begin pushing
##updtr_start name=push_pidset
44 changes: 44 additions & 0 deletions ldms/scripts/examples/json_stream_sampler.2
@@ -0,0 +1,44 @@
load name=dstat
config name=dstat producer=localhost${i} instance=localhost${i}/${dstat_schema} component_id=${i} mmalloc=1 io=1 fd=1 auto-schema=1 stat=1 perm=777
start name=dstat interval=1000000 offset=0

# blobs must be enabled by the writer plugin, and the daemon must subscribe to the stream with prdcr_subscribe
load name=blob_stream_writer plugin=blob_stream_writer
config name=blob_stream_writer path=${STOREDIR} container=blobs stream=ldms-notify types=1

prdcr_add name=localhost1 host=${HOST} type=active xprt=${XPRT} port=${port1} reconnect=2000000
prdcr_subscribe regex=.* stream=ldms-notify
prdcr_start name=localhost1

updtr_add name=allhosts interval=1000000 offset=100000
updtr_prdcr_add name=allhosts regex=.*
updtr_start name=allhosts

load name=store_csv
config name=store_csv path=${STOREDIR} altheader=0

strgp_add name=json_stream_store_linux plugin=store_csv schema=linux_task_data container=pnode
strgp_prdcr_add name=json_stream_store_linux regex=.*
strgp_start name=json_stream_store_linux

strgp_add name=json_stream_store_lsf plugin=store_csv schema=lsf_task_data container=pnode
strgp_prdcr_add name=json_stream_store_lsf regex=.*
strgp_start name=json_stream_store_lsf

strgp_add name=json_stream_store_slurm plugin=store_csv schema=slurm_task_data container=pnode
strgp_prdcr_add name=json_stream_store_slurm regex=.*
strgp_start name=json_stream_store_slurm

strgp_add name=json_stream_store_my plugin=store_csv schema=myschema container=pnode
strgp_prdcr_add name=json_stream_store_my regex=.*
strgp_start name=json_stream_store_my


# define updater to push sets to L1 as they appear
#updtr_add name=push_pidset push=onchange interval=1000000
# catch self in .*
#updtr_prdcr_add name=push_pidset regex=.*
# match sets by schema name
#updtr_match_add name=push_pidset regex=linux_task_data match=schema
# begin pushing
#updtr_start name=push_pidset
17 changes: 17 additions & 0 deletions ldms/scripts/examples/json_stream_sampler.3
@@ -0,0 +1,17 @@
load name=blob_stream_writer plugin=blob_stream_writer
config name=blob_stream_writer path=${STOREDIR} container=blobs_L2 stream=ldms-notify types=1

prdcr_add name=localhost2 host=${HOST} type=active xprt=${XPRT} port=${port2} interval=2000000
prdcr_subscribe regex=.* stream=ldms-notify
prdcr_start name=localhost2

updtr_add name=allhosts interval=1000000 offset=200000
updtr_prdcr_add name=allhosts regex=.*
updtr_start name=allhosts

load name=store_csv
config name=store_csv path=${STOREDIR} altheader=0

strgp_add name=store_dstat plugin=store_csv schema=${dstat_schema} container=node
strgp_prdcr_add name=store_dstat regex=.*
strgp_start name=store_dstat
11 changes: 11 additions & 0 deletions ldms/src/sampler/blob_stream/Plugin_blob_stream_writer.man
@@ -48,6 +48,10 @@ timing=1
.br
Enable writing timestamps to a separate file.
.TP
types=1
.br
Enable writing message types to a separate file.
.TP
spool=1
.br
Move closed files to the directory <path>/<container>/spool/.
@@ -114,6 +118,13 @@ Each timestamp is written to the .TIMING file as a binary pair (tv_sec, tv_usec)
with each value stored as a little-endian 64 bit value which should be
read and then converted with le64toh.
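For example (a sketch; assumes GNU coreutils od, and a file name following
the TYPE pattern below; any leading magic bytes would appear as the first
value), the pairs can be listed as decimal values:
.nf
  od -An -td8 --endian=little $path/$container/$stream.TIMING.$create_time
.fi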

Optionally (if types=1 is given) the additional file
$path/$container/$stream.TYPE.$create_time
is created, containing binary type characters corresponding to the messages.
The TYPE file begins with an 8 byte magic number: blobtyp\\0.
Each type is a single character: j for json, s for string (fixed length,
as determined by the corresponding offset).
There is no separator between message type indicators.
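For example (a sketch; assumes GNU coreutils), the type characters can be
inspected by skipping the 8 byte magic:
.nf
  tail -c +9 $path/$container/$stream.TYPE.$create_time | od -An -c
.fi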

.SH NOTES
.PP
This writer is in development and may be changed at any time.
2 changes: 1 addition & 1 deletion ldms/src/sampler/json/Plugin_json_stream_sampler.man
@@ -18,7 +18,7 @@ Within ldmsd_controller or a configuration file:
.OP uid=\fIUID\fR
.OP gid=\fIGID\fR
.OP perm=\fIPERM\fR
-.OP heap_szperm=\fIBYTES\fR
+.OP heap_sz=\fIBYTES\fR
.YS

.SH DESCRIPTION
