Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add verbosity to smart-dispatch #175

Open
wants to merge 4 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 17 additions & 3 deletions scripts/sd-launch-pbs
Original file line number Diff line number Diff line change
Expand Up @@ -7,15 +7,16 @@ import logging
from smartdispatch import launch_jobs
from smartdispatch import utils


logger = logging.getLogger()


Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Remove the `logging.root.setLevel(logging.INFO)` call below and make sure the default logging level throughout the system is WARNING.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@mgermain Done

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Even in the rest of smartdispatch, to stay consistent?

LOGS_FOLDERNAME = "SMART_DISPATCH_LOGS"
CLUSTER_NAME = utils.detect_cluster()
LAUNCHER = utils.get_launcher(CLUSTER_NAME)


def main():
# Necessary if we want 'logging.info' to appear in stderr.
logging.root.setLevel(logging.INFO)

args = parse_arguments()

launch_jobs(LAUNCHER if args.launcher is None else args.launcher, [args.pbs], CLUSTER_NAME, args.path_job)
Expand All @@ -27,8 +28,21 @@ def parse_arguments():
parser.add_argument('pbs', type=str, help='PBS filename to launch.')
parser.add_argument('path_job', type=str, help='Path to the job folder.')

parser.add_argument(
'-v', '--verbose', action='count', default=0,
help="Print informations about the process.\n"
" -v: INFO\n"
" -vv: DEBUG")

args = parser.parse_args()

if args.verbose == 0:
logging.basicConfig(level=logging.WARNING)
elif args.verbose == 1:
logging.basicConfig(level=logging.INFO)
elif args.verbose >= 2:
logging.basicConfig(level=logging.DEBUG)

return args


Expand Down
69 changes: 60 additions & 9 deletions scripts/smart-dispatch
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
#!/usr/bin/env python2
# -*- coding: utf-8 -*-

import argparse
import logging
import os
import sys
import argparse
import time as t
from os.path import join as pjoin
from textwrap import dedent
Expand All @@ -16,9 +17,15 @@ from smartdispatch import get_available_queues
from smartdispatch import launch_jobs
from smartdispatch import utils

import logging
import smartdispatch


logger = logging.getLogger()

VERBOSE_WARNING = 0
VERBOSE_INFO = 1
VERBOSE_DEBUG = 2

LOGS_FOLDERNAME = "SMART_DISPATCH_LOGS"
CLUSTER_NAME = utils.detect_cluster()
AVAILABLE_QUEUES = get_available_queues(CLUSTER_NAME)
Expand All @@ -29,27 +36,57 @@ TIMEOUT_EXIT_CODE = 124
AUTORESUME_TRIGGER_AFTER = '$(($PBS_WALLTIME - 60))' # By default, 60s before the maximum walltime.
AUTORESUME_WORKER_CALL_PREFIX = 'timeout -s TERM {trigger_after} '.format(trigger_after=AUTORESUME_TRIGGER_AFTER)
AUTORESUME_WORKER_CALL_SUFFIX = ' WORKER_PIDS+=" $!"'
AUTORESUME_PROLOG = 'WORKER_PIDS=""'
AUTORESUME_PROLOG = """\
WORKER_PIDS=""
VERBOSE={verbose}
"""
AUTORESUME_EPILOG = """\
NEED_TO_RESUME=false
if [ $VERBOSE -ge {debug} ]; then
echo "DEBUG: NEED_TO_RESUME=$NEED_TO_RESUME"
echo "DEBUG: WORKER_PIDS=$WORKER_PIDS"
fi
for WORKER_PID in $WORKER_PIDS; do
if [ $VERBOSE -ge {debug} ]; then
echo "DEBUG: WORKER_PID=$WORKER_PID"
fi
wait "$WORKER_PID"
RETURN_CODE=$?
if [ $VERBOSE -ge {debug} ]; then
echo "DEBUG: RETURN_CODE is $RETURN_CODE while timeout_exit_code is {timeout_exit_code}"
fi
if [ $RETURN_CODE -eq {timeout_exit_code} ]; then
NEED_TO_RESUME=true
fi
if [ $VERBOSE -ge {debug} ]; then
echo "DEBUG: NEED_TO_RESUME=$NEED_TO_RESUME"
fi
done
if [ $VERBOSE -ge {debug} ]; then
echo "DEBUG: NEED_TO_RESUME=$NEED_TO_RESUME"
fi
if [ "$NEED_TO_RESUME" = true ]; then
echo "Autoresuming using: {{launcher}} $PBS_FILENAME"
sd-launch-pbs --launcher {{launcher}} $PBS_FILENAME {{path_job}}
if [ $VERBOSE -ge 0 ]
then
VERBOSE_OPTION="-"
for ((i=0;i<VERBOSE;i++))
do
VERBOSE_OPTION="${{{{VERBOSE_OPTION}}}}v"
done
else
VERBOSE_OPTION=""
fi

if [ $VERBOSE -ge {debug} ]; then
echo "DEBUG: executing sd-launch-pbs ${{{{VERBOSE_OPTION}}}} --launcher {{launcher}} $PBS_FILENAME {{path_job}}"
fi
sd-launch-pbs ${{{{VERBOSE_OPTION}}}} --launcher {{launcher}} $PBS_FILENAME {{path_job}}
fi
""".format(timeout_exit_code=TIMEOUT_EXIT_CODE)
""".format(timeout_exit_code=TIMEOUT_EXIT_CODE, debug=VERBOSE_DEBUG)


def main():
# Necessary if we want 'logging.info' to appear in stderr.
logging.root.setLevel(logging.INFO)

args = parse_arguments()
path_smartdispatch_logs = pjoin(os.getcwd(), LOGS_FOLDERNAME)

Expand Down Expand Up @@ -163,7 +200,7 @@ def main():
prolog = []
epilog = ['wait']
if args.autoresume:
prolog = [AUTORESUME_PROLOG]
prolog = [AUTORESUME_PROLOG.format(verbose=args.verbose)]
epilog = [AUTORESUME_EPILOG.format(launcher=LAUNCHER if args.launcher is None else args.launcher, path_job=path_job)]

job_generator = job_generator_factory(queue, commands, prolog, epilog, command_params, CLUSTER_NAME, path_job)
Expand All @@ -187,6 +224,13 @@ def main():

def parse_arguments():
parser = argparse.ArgumentParser()

parser.add_argument(
'-v', '--verbose', action='count', default=0,
help="Print informations about the process.\n"
" -v: INFO\n"
" -vv: DEBUG")

parser.add_argument('-q', '--queueName', required=True, help='Queue used (ex: qwork@mp2, qfat256@mp2, gpu_1)')
parser.add_argument('-n', '--batchName', required=False, help='The name of the batch. Default: The commands launched.')
parser.add_argument('-t', '--walltime', required=False, help='Set the estimated running time of your jobs using the DD:HH:MM:SS format. Note that they will be killed when this time limit is reached.')
Expand Down Expand Up @@ -226,6 +270,13 @@ def parse_arguments():
if args.coresPerCommand < 1:
parser.error("coresPerNode must be at least 1")

if args.verbose == VERBOSE_WARNING:
logging.basicConfig(level=logging.WARNING)
elif args.verbose == VERBOSE_INFO:
logging.basicConfig(level=logging.INFO)
elif args.verbose >= VERBOSE_DEBUG:
logging.basicConfig(level=logging.DEBUG)

return args


Expand Down