Skip to content

Commit

Permalink
Merge pull request #355 from ReactionMechanismGenerator/ssh_improve
Browse files Browse the repository at this point in the history
A large rewrite of the SSHClient
  • Loading branch information
alongd authored May 10, 2020
2 parents 8956356 + 9b09e41 commit 7ba4d74
Show file tree
Hide file tree
Showing 7 changed files with 585 additions and 406 deletions.
230 changes: 110 additions & 120 deletions arc/job/job.py

Large diffs are not rendered by default.

544 changes: 367 additions & 177 deletions arc/job/ssh.py

Large diffs are not rendered by default.

30 changes: 15 additions & 15 deletions arc/job/submit.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,12 +41,12 @@
cd $WorkDir
. $g09root/g09/bsd/g09.profile
cp $SubmitDir/input.gjf .
cp $SubmitDir/check.chk .
cp "$SubmitDir/input.gjf" .
cp "$SubmitDir/check.chk" .
g09 < input.gjf > input.log
formchk check.chk check.fchk
cp * $SubmitDir/
cp * "$SubmitDir/"
rm -rf $GAUSS_SCRDIR
rm -rf $WorkDir
Expand Down Expand Up @@ -86,10 +86,10 @@
mkdir -p $WorkDir
cd $WorkDir
cp $SubmitDir/input.inp .
cp "$SubmitDir/input.inp" .
${ORCA_DIR}/orca input.inp > input.log
cp * $SubmitDir/
cp * "$SubmitDir/"
rm -rf $WorkDir
Expand Down Expand Up @@ -129,12 +129,12 @@
cd $WorkDir
. $g16root/g16/bsd/g16.profile
cp $SubmitDir/input.gjf .
cp $SubmitDir/check.chk .
cp "$SubmitDir/input.gjf" .
cp "$SubmitDir/check.chk" .
g16 < input.gjf > input.log
formchk check.chk check.fchk
cp * $SubmitDir/
cp * "$SubmitDir/"
rm -rf $GAUSS_SCRDIR
rm -rf $WorkDir
Expand Down Expand Up @@ -165,12 +165,12 @@
mkdir -p $sdir
cd $sdir
cp $SubmitDir/input.in .
cp "$SubmitDir/input.in" .
molpro -n {cpus} -d $sdir input.in
cp input.* $SubmitDir/
cp geometry*.* $SubmitDir/
cp input.* "$SubmitDir/"
cp geometry*.* "$SubmitDir/"
rm -rf $sdir
Expand Down Expand Up @@ -225,10 +225,10 @@
mkdir -p $WorkDir
cd $WorkDir
cp $SubmitDir/input.inp .
cp "$SubmitDir/input.inp" .
${ORCA_DIR}/orca input.inp > input.log
cp * $SubmitDir/
cp * "$SubmitDir/"
rm -rf $WorkDir
Expand Down Expand Up @@ -409,10 +409,10 @@
mkdir -p $WorkDir
cd $WorkDir
cp $SubmitDir/input.in .
cp "$SubmitDir/input.in" .
/opt/orca/orca input.in > input.log
cp * $SubmitDir/
cp * "$SubmitDir/"
rm -rf $WorkDir
Expand Down
88 changes: 47 additions & 41 deletions arc/job/trsh.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,6 @@
from arc.settings import (delete_command,
inconsistency_ab,
inconsistency_az,
list_available_nodes_command,
maximum_barrier,
preserve_param_in_scan_stable,
rotor_scan_resolution,
Expand Down Expand Up @@ -1120,54 +1119,61 @@ def trsh_job_on_server(server: str,
bool: Whether to re-run the job, `True` to rerun.
"""
server_nodes = server_nodes if server_nodes is not None else list()
cluster_soft = servers[server]['cluster_soft']
if job_server_status != 'done':
logger.error(f'Job {job_name} has server status "{job_server_status}" on {server}.')

# delete current server run
command = delete_command[servers[server]['cluster_soft']] + ' ' + str(job_id)
if server == 'local':
execute_command(command)
cmd = delete_command[cluster_soft] + ' ' + str(job_id)
execute_command(cmd)
return None, True
else:
ssh = SSHClient(server)
ssh.send_command_to_server(command)

if servers[server]['cluster_soft'].lower() == 'oge':
logger.error('Troubleshooting by changing node.')
ssh = SSHClient(server)
# find available nodes
stdout = ssh.send_command_to_server(command=list_available_nodes_command[servers[server]['cluster_soft']])[0]
for line in stdout:
node = line.split()[0].split('.')[0].split('node')[1]
if servers[server]['cluster_soft'] == 'OGE' and '0/0/8' in line and node not in server_nodes:
server_nodes.append(node)
break
else:
logger.error(f'Could not find an available node on the server {server}')
# TODO: continue troubleshooting; if all else fails, put the job to sleep,
# and try again searching for a node
return None, False

# modify the submit file
content = ssh.read_remote_file(remote_path=remote_path,
filename=submit_filename[servers[server]['cluster_soft']])
for i, line in enumerate(content):
if '#$ -l h=node' in line:
content[i] = '#$ -l h=node{0}.cluster'.format(node)
break
else:
content.insert(7, '#$ -l h=node{0}.cluster'.format(node))
content = ''.join(content) # convert list into a single string, not to upset paramiko
# resubmit
ssh.upload_file(remote_file_path=os.path.join(remote_path,
submit_filename[servers[server]['cluster_soft']]), file_string=content)
return node, True

elif servers[server]['cluster_soft'].lower() == 'slurm':
# TODO: change node on Slurm
return None, True
with SSHClient(server) as ssh:
ssh.delete_job(job_id)

# find available node
logger.error('Troubleshooting by changing node.')
ssh = SSHClient(server)
nodes = ssh.list_available_nodes()
for node in nodes:
if node not in server_nodes:
server_nodes.append(node)
break
else:
logger.error(f'Could not find an available node on the server {server}')
# TODO: continue troubleshooting; if all else fails, put the job to sleep,
# and try again searching for a node
return None, False

# modify the submit file
remote_submit_file = os.path.join(remote_path, submit_filename[cluster_soft])
with SSHClient(server) as ssh:
content = ssh.read_remote_file(remote_file_path=remote_submit_file)
if cluster_soft.lower() == 'oge':
node_assign = '#$ -l h='
insert_line_num = 7
elif cluster_soft.lower() == 'slurm':
node_assign = '#$BATCH -w, --nodelist='
insert_line_num = 5
else:
# Node reassignment is only handled for OGE and Slurm; give up for any other cluster software.
logger.denug(f'Unknown cluster software {cluster_soft} is encountered when '
f'troubleshooting by changing node.')
return None, False
for i, line in enumerate(content):
if node_assign in line:
content[i] = node_assign + node
break
else:
content.insert(insert_line_num, node_assign + node)
content = ''.join(content) # convert list into a single string, not to upset paramiko

return None, False
# resubmit
with SSHClient(server) as ssh:
ssh.upload_file(remote_file_path=os.path.join(remote_path,
submit_filename[cluster_soft]), file_string=content)
return node, True


def scan_quality_check(label: str,
Expand Down
93 changes: 43 additions & 50 deletions arc/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -825,56 +825,49 @@ def determine_ess_settings(self, diagnostics=False):
continue
if diagnostics:
logger.info('\nTrying {0}'.format(server))
ssh = SSHClient(server)

cmd = '. ~/.bashrc; which g03'
g03 = ssh.send_command_to_server(cmd)[0]
cmd = '. ~/.bashrc; which g09'
g09 = ssh.send_command_to_server(cmd)[0]
cmd = '. ~/.bashrc; which g16'
g16 = ssh.send_command_to_server(cmd)[0]
if g03 or g09 or g16:
if diagnostics:
logger.info(f' Found Gaussian on {server}: g03={g03}, g09={g09}, g16={g16}')
self.ess_settings['gaussian'].append(server)
elif diagnostics:
logger.info(f' Did NOT find Gaussian on {server}')

cmd = '. ~/.bashrc; which qchem'
qchem = ssh.send_command_to_server(cmd)[0]
if qchem:
if diagnostics:
logger.info(f' Found QChem on {server}')
self.ess_settings['qchem'].append(server)
elif diagnostics:
logger.info(f' Did NOT find QChem on {server}')

cmd = '. ~/.bashrc; which orca'
orca = ssh.send_command_to_server(cmd)[0]
if orca:
if diagnostics:
logger.info(f' Found Orca on {server}')
self.ess_settings['orca'].append(server)
elif diagnostics:
logger.info(f' Did NOT find Orca on {server}')

cmd = '. ~/.bashrc; which terachem'
terachem = ssh.send_command_to_server(cmd)[0]
if terachem:
if diagnostics:
logging.info(f' Found TeraChem on {server}')
self.ess_settings['terachem'].append(server)
elif diagnostics:
logging.info(f' Did NOT find TeraChem on {server}')

cmd = '. .bashrc; which molpro'
molpro = ssh.send_command_to_server(cmd)[0]
if molpro:
if diagnostics:
logger.info(f' Found Molpro on {server}')
self.ess_settings['molpro'].append(server)
elif diagnostics:
logger.info(f' Did NOT find Molpro on {server}')
with SSHClient(server) as ssh:

g03 = ssh.find_package('g03')
g09 = ssh.find_package('g09')
g16 = ssh.find_package('g16')
if g03 or g09 or g16:
if diagnostics:
logger.info(f' Found Gaussian on {server}: g03={g03}, g09={g09}, g16={g16}')
self.ess_settings['gaussian'].append(server)
elif diagnostics:
logger.info(f' Did NOT find Gaussian on {server}')

qchem = ssh.find_package('qchem')
if qchem:
if diagnostics:
logger.info(f' Found QChem on {server}')
self.ess_settings['qchem'].append(server)
elif diagnostics:
logger.info(f' Did NOT find QChem on {server}')

orca = ssh.find_package('orca')
if orca:
if diagnostics:
logger.info(f' Found Orca on {server}')
self.ess_settings['orca'].append(server)
elif diagnostics:
logger.info(f' Did NOT find Orca on {server}')

terachem = ssh.find_package('terachem')
if terachem:
if diagnostics:
logging.info(f' Found TeraChem on {server}')
self.ess_settings['terachem'].append(server)
elif diagnostics:
logging.info(f' Did NOT find TeraChem on {server}')

molpro = ssh.find_package('molpro')
if molpro:
if diagnostics:
logger.info(f' Found Molpro on {server}')
self.ess_settings['molpro'].append(server)
elif diagnostics:
logger.info(f' Did NOT find Molpro on {server}')
if diagnostics:
logger.info('\n\n')
if 'gaussian' in self.ess_settings.keys():
Expand Down
4 changes: 2 additions & 2 deletions arc/scheduler.py
Original file line number Diff line number Diff line change
Expand Up @@ -2511,8 +2511,8 @@ def get_servers_jobs_ids(self):
self.servers_jobs_ids = list()
for server in self.servers:
if server != 'local':
ssh = SSHClient(server)
self.servers_jobs_ids.extend(ssh.check_running_jobs_ids())
with SSHClient(server) as ssh:
self.servers_jobs_ids.extend(ssh.check_running_jobs_ids())
else:
self.servers_jobs_ids.extend(check_running_jobs_ids())

Expand Down
2 changes: 1 addition & 1 deletion arc/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -102,7 +102,7 @@
'Slurm': '/usr/bin/scancel'}

list_available_nodes_command = {'OGE': 'export SGE_ROOT=/opt/sge; /opt/sge/bin/lx24-amd64/qstat -f | grep "/8 " | grep "long" | grep -v "8/8"| grep -v "aAu"',
'Slurm': 'sinfo'}
'Slurm': 'sinfo -o "%n %t %O %E"'}

submit_filename = {'OGE': 'submit.sh',
'Slurm': 'submit.sl'}
Expand Down

0 comments on commit 7ba4d74

Please sign in to comment.