From 329efce4f71f799f9fdb4b3864ef53af65e94164 Mon Sep 17 00:00:00 2001 From: Xavier Bouthillier Date: Fri, 13 Oct 2017 22:54:49 -0400 Subject: [PATCH 1/4] Add verbosity to smart-dispatch It is difficult to debug resuming while important processes are taking place in the PBS script automatically built by SmartDispatch. We add a verbose option to the smart-dispatch script and add debugging prints in the epilog. --- scripts/sd-launch-pbs | 17 ++++++++++++ scripts/smart-dispatch | 60 +++++++++++++++++++++++++++++++++++++----- 2 files changed, 71 insertions(+), 6 deletions(-) diff --git a/scripts/sd-launch-pbs b/scripts/sd-launch-pbs index 0b38733..984025c 100644 --- a/scripts/sd-launch-pbs +++ b/scripts/sd-launch-pbs @@ -7,6 +7,10 @@ import logging from smartdispatch import launch_jobs from smartdispatch import utils + +logger = logging.getLogger() + + LOGS_FOLDERNAME = "SMART_DISPATCH_LOGS" CLUSTER_NAME = utils.detect_cluster() LAUNCHER = utils.get_launcher(CLUSTER_NAME) @@ -27,8 +31,21 @@ def parse_arguments(): parser.add_argument('pbs', type=str, help='PBS filename to launch.') parser.add_argument('path_job', type=str, help='Path to the job folder.') + parser.add_argument( + '-v', '--verbose', action='count', default=0, + help="Print informations about the process.\n" + " -v: INFO\n" + " -vv: DEBUG") + args = parser.parse_args() + if args.verbose == 0: + logging.basicConfig(level=logging.WARNING) + elif args.verbose == 1: + logging.basicConfig(level=logging.INFO) + elif args.verbose >= 2: + logging.basicConfig(level=logging.DEBUG) + return args diff --git a/scripts/smart-dispatch b/scripts/smart-dispatch index 86904fa..b08dc66 100755 --- a/scripts/smart-dispatch +++ b/scripts/smart-dispatch @@ -1,9 +1,10 @@ #!/usr/bin/env python2 # -*- coding: utf-8 -*- +import argparse +import logging import os import sys -import argparse import time as t from os.path import join as pjoin from textwrap import dedent @@ -16,9 +17,12 @@ from smartdispatch import get_available_queues from smartdispatch 
import launch_jobs from smartdispatch import utils -import logging import smartdispatch + +logger = logging.getLogger() + + LOGS_FOLDERNAME = "SMART_DISPATCH_LOGS" CLUSTER_NAME = utils.detect_cluster() AVAILABLE_QUEUES = get_available_queues(CLUSTER_NAME) @@ -29,19 +33,45 @@ TIMEOUT_EXIT_CODE = 124 AUTORESUME_TRIGGER_AFTER = '$(($PBS_WALLTIME - 60))' # By default, 60s before the maximum walltime. AUTORESUME_WORKER_CALL_PREFIX = 'timeout -s TERM {trigger_after} '.format(trigger_after=AUTORESUME_TRIGGER_AFTER) AUTORESUME_WORKER_CALL_SUFFIX = ' WORKER_PIDS+=" $!"' -AUTORESUME_PROLOG = 'WORKER_PIDS=""' +AUTORESUME_PROLOG = """ +WORKER_PIDS="" +VERBOSE={verbose} +""" AUTORESUME_EPILOG = """\ NEED_TO_RESUME=false +if [ $VERBOSE = true ]; then + echo NEED_TO_RESUME=$NEED_TO_RESUME + echo WORKER_PIDS=$WORKER_PIDS +fi for WORKER_PID in $WORKER_PIDS; do + if [ $VERBOSE = true ]; then + echo WORKER_PID=$WORKER_PID + fi wait "$WORKER_PID" RETURN_CODE=$? + if [ $VERBOSE = true ]; then + echo "RETURN_CODE is $RETURN_CODE while " \ + "timeout_exit_code is {timeout_exit_code}" + fi if [ $RETURN_CODE -eq {timeout_exit_code} ]; then NEED_TO_RESUME=true fi + if [ $VERBOSE = true ]; then + echo NEED_TO_RESUME=$NEED_TO_RESUME + fi done +if [ $VERBOSE = true ]; then + echo NEED_TO_RESUME=$NEED_TO_RESUME +fi if [ "$NEED_TO_RESUME" = true ]; then echo "Autoresuming using: {{launcher}} $PBS_FILENAME" - sd-launch-pbs --launcher {{launcher}} $PBS_FILENAME {{path_job}} + if [ $VERBOSE = true]; then + VERBOSE_OPTION="-vv" + else + VERBOSE_OPTION="" + fi + + sd-launch-pbs $VERBOSE_OPTION --launcher {{launcher}} $PBS_FILENAME {{path_job}} fi """.format(timeout_exit_code=TIMEOUT_EXIT_CODE) @@ -163,8 +193,12 @@ def main(): prolog = [] epilog = ['wait'] if args.autoresume: - prolog = [AUTORESUME_PROLOG] - epilog = [AUTORESUME_EPILOG.format(launcher=LAUNCHER if args.launcher is None else args.launcher, path_job=path_job)] + prolog = [ + AUTORESUME_PROLOG.format(verbose=str(args.verbose >= 
2).lower())] + epilog = [ + AUTORESUME_EPILOG.format( + launcher=LAUNCHER if args.launcher is None else args.launcher, + path_job=path_job)] job_generator = job_generator_factory(queue, commands, prolog, epilog, command_params, CLUSTER_NAME, path_job) @@ -187,6 +221,13 @@ def main(): def parse_arguments(): parser = argparse.ArgumentParser() + + parser.add_argument( + '-v', '--verbose', action='count', default=0, + help="Print informations about the process.\n" + " -v: INFO\n" + " -vv: DEBUG") + parser.add_argument('-q', '--queueName', required=True, help='Queue used (ex: qwork@mp2, qfat256@mp2, gpu_1)') parser.add_argument('-n', '--batchName', required=False, help='The name of the batch. Default: The commands launched.') parser.add_argument('-t', '--walltime', required=False, help='Set the estimated running time of your jobs using the DD:HH:MM:SS format. Note that they will be killed when this time limit is reached.') @@ -226,6 +267,13 @@ def parse_arguments(): if args.coresPerCommand < 1: parser.error("coresPerNode must be at least 1") + if args.verbose == 0: + logging.basicConfig(level=logging.WARNING) + elif args.verbose == 1: + logging.basicConfig(level=logging.INFO) + elif args.verbose >= 2: + logging.basicConfig(level=logging.DEBUG) + return args From 502181da6d576160bcae129c04959f9d4860c532 Mon Sep 17 00:00:00 2001 From: Xavier Bouthillier Date: Fri, 20 Oct 2017 15:23:26 -0400 Subject: [PATCH 2/4] Remove forced logging level to INFO The user can now specify which level of logging they want with the -v/-vv options. The default one is WARNING. If it is important for the user to have INFO present in stderr, they can use -v or -vv. 
--- scripts/sd-launch-pbs | 3 --- scripts/smart-dispatch | 3 --- 2 files changed, 6 deletions(-) diff --git a/scripts/sd-launch-pbs b/scripts/sd-launch-pbs index 984025c..872b202 100644 --- a/scripts/sd-launch-pbs +++ b/scripts/sd-launch-pbs @@ -17,9 +17,6 @@ LAUNCHER = utils.get_launcher(CLUSTER_NAME) def main(): - # Necessary if we want 'logging.info' to appear in stderr. - logging.root.setLevel(logging.INFO) - args = parse_arguments() launch_jobs(LAUNCHER if args.launcher is None else args.launcher, [args.pbs], CLUSTER_NAME, args.path_job) diff --git a/scripts/smart-dispatch b/scripts/smart-dispatch index b08dc66..86f7394 100755 --- a/scripts/smart-dispatch +++ b/scripts/smart-dispatch @@ -77,9 +77,6 @@ fi def main(): - # Necessary if we want 'logging.info' to appear in stderr. - logging.root.setLevel(logging.INFO) - args = parse_arguments() path_smartdispatch_logs = pjoin(os.getcwd(), LOGS_FOLDERNAME) From ac41c216ade01426d8c5186f7d967a2550008148 Mon Sep 17 00:00:00 2001 From: Xavier Bouthillier Date: Thu, 2 Nov 2017 20:20:56 -0400 Subject: [PATCH 3/4] Revert wrapping lines longer than 79 chars. I've been told the admins decided long ago not to follow rule E501. Even though I disagree with this decision, I won't argue. :P --- scripts/smart-dispatch | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/scripts/smart-dispatch b/scripts/smart-dispatch index 86f7394..a9bdfe4 100755 --- a/scripts/smart-dispatch +++ b/scripts/smart-dispatch @@ -33,7 +33,7 @@ TIMEOUT_EXIT_CODE = 124 AUTORESUME_TRIGGER_AFTER = '$(($PBS_WALLTIME - 60))' # By default, 60s before the maximum walltime. AUTORESUME_WORKER_CALL_PREFIX = 'timeout -s TERM {trigger_after} '.format(trigger_after=AUTORESUME_TRIGGER_AFTER) AUTORESUME_WORKER_CALL_SUFFIX = ' WORKER_PIDS+=" $!"' -AUTORESUME_PROLOG = """ +AUTORESUME_PROLOG = """\ WORKER_PIDS="" VERBOSE={verbose} """ @@ -50,8 +50,7 @@ for WORKER_PID in $WORKER_PIDS; do wait "$WORKER_PID" RETURN_CODE=$? 
if [ $VERBOSE = true ]; then - echo "RETURN_CODE is $RETURN_CODE while " \ - "timeout_exit_code is {timeout_exit_code}" + echo "DEBUG: RETURN_CODE is $RETURN_CODE while timeout_exit_code is {timeout_exit_code}" fi if [ $RETURN_CODE -eq {timeout_exit_code} ]; then NEED_TO_RESUME=true @@ -190,12 +189,8 @@ def main(): prolog = [] epilog = ['wait'] if args.autoresume: - prolog = [ - AUTORESUME_PROLOG.format(verbose=str(args.verbose >= 2).lower())] - epilog = [ - AUTORESUME_EPILOG.format( - launcher=LAUNCHER if args.launcher is None else args.launcher, - path_job=path_job)] + prolog = [AUTORESUME_PROLOG.format(verbose=str(args.verbose >= 2).lower())] + epilog = [AUTORESUME_EPILOG.format(launcher=LAUNCHER if args.launcher is None else args.launcher, path_job=path_job)] job_generator = job_generator_factory(queue, commands, prolog, epilog, command_params, CLUSTER_NAME, path_job) From 65239f5700abcdeece5f76a15ff877ad211a2f72 Mon Sep 17 00:00:00 2001 From: Xavier Bouthillier Date: Thu, 2 Nov 2017 20:31:49 -0400 Subject: [PATCH 4/4] Reformat VERBOSE to allow passing any level to sd-launch-pbs VERBOSE in the PBS file was simply set to "true". Because of this we could only pass args.verbose=0 or args.verbose=2 to sd-launch-pbs. VERBOSE is now set to args.verbose and the command-line option {"", -v, -vv} is rebuilt in the PBS script based on the value of args.verbose. 
--- scripts/smart-dispatch | 47 ++++++++++++++++++++++++++---------------- 1 file changed, 29 insertions(+), 18 deletions(-) diff --git a/scripts/smart-dispatch b/scripts/smart-dispatch index a9bdfe4..4c9e0bd 100755 --- a/scripts/smart-dispatch +++ b/scripts/smart-dispatch @@ -22,6 +22,9 @@ import smartdispatch logger = logging.getLogger() +VERBOSE_WARNING = 0 +VERBOSE_INFO = 1 +VERBOSE_DEBUG = 2 LOGS_FOLDERNAME = "SMART_DISPATCH_LOGS" CLUSTER_NAME = utils.detect_cluster() @@ -39,40 +42,48 @@ VERBOSE={verbose} """ AUTORESUME_EPILOG = """\ NEED_TO_RESUME=false -if [ $VERBOSE = true ]; then - echo NEED_TO_RESUME=$NEED_TO_RESUME - echo WORKER_PIDS=$WORKER_PIDS +if [ $VERBOSE -ge {debug} ]; then + echo "DEBUG: NEED_TO_RESUME=$NEED_TO_RESUME" + echo "DEBUG: WORKER_PIDS=$WORKER_PIDS" fi for WORKER_PID in $WORKER_PIDS; do - if [ $VERBOSE = true ]; then - echo WORKER_PID=$WORKER_PID + if [ $VERBOSE -ge {debug} ]; then + echo "DEBUG: WORKER_PID=$WORKER_PID" fi wait "$WORKER_PID" RETURN_CODE=$? - if [ $VERBOSE = true ]; then + if [ $VERBOSE -ge {debug} ]; then echo "DEBUG: RETURN_CODE is $RETURN_CODE while timeout_exit_code is {timeout_exit_code}" fi if [ $RETURN_CODE -eq {timeout_exit_code} ]; then NEED_TO_RESUME=true fi - if [ $VERBOSE = true ]; then - echo NEED_TO_RESUME=$NEED_TO_RESUME + if [ $VERBOSE -ge {debug} ]; then + echo "DEBUG: NEED_TO_RESUME=$NEED_TO_RESUME" fi done -if [ $VERBOSE = true ]; then - echo NEED_TO_RESUME=$NEED_TO_RESUME +if [ $VERBOSE -ge {debug} ]; then + echo "DEBUG: NEED_TO_RESUME=$NEED_TO_RESUME" fi if [ "$NEED_TO_RESUME" = true ]; then echo "Autoresuming using: {{launcher}} $PBS_FILENAME" - if [ $VERBOSE = true]; then - VERBOSE_OPTION="-vv" + if [ $VERBOSE -ge 0 ] + then + VERBOSE_OPTION="-" + for ((i=0;i= 2).lower())] + prolog = [AUTORESUME_PROLOG.format(verbose=args.verbose)] epilog = [AUTORESUME_EPILOG.format(launcher=LAUNCHER if args.launcher is None else args.launcher, path_job=path_job)] job_generator = job_generator_factory(queue, 
commands, prolog, epilog, command_params, CLUSTER_NAME, path_job) @@ -259,11 +270,11 @@ def parse_arguments(): if args.coresPerCommand < 1: parser.error("coresPerNode must be at least 1") - if args.verbose == 0: + if args.verbose == VERBOSE_WARNING: logging.basicConfig(level=logging.WARNING) - elif args.verbose == 1: + elif args.verbose == VERBOSE_INFO: logging.basicConfig(level=logging.INFO) - elif args.verbose >= 2: + elif args.verbose >= VERBOSE_DEBUG: logging.basicConfig(level=logging.DEBUG) return args