Replies: 3 comments 8 replies
-
You can check https://github.com/deepmodeling/dpgen/discussions/785; it seems like you hit a similar problem.
-
Could you check the reason why Slurm fails to run these tasks and provide more details?
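For example, a minimal check (only a sketch; it assumes sacct is available on the cluster, and the job id, remote_root and submission hash are taken from the traceback posted below):

# Ask Slurm how the job ended (state, exit code, node)
sacct -j 416884 --format=JobID,JobName,Partition,State,ExitCode,Elapsed,NodeList

# Inspect what dpdispatcher generated for this submission
cd /share/home/xlzou/WORKSPACE/zjl9/DPGEN/work/04d8a682beb8bfafb345306a91319a282951fae4
ls -l               # the generated *.sub submission script and the task directory 002/ should be here
cat 002/train.log   # stdout/stderr of the failed dp train task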
-
You may use
-
It did not work at all and just stopped at iter.000000 task 01.
Traceback
Traceback (most recent call last):
File "/share/home/xlzou/.local/lib/python3.9/site-packages/dpdispatcher/submission.py", line 241, in handle_unexpected_submission_state
job.handle_unexpected_job_state()
File "/share/home/xlzou/.local/lib/python3.9/site-packages/dpdispatcher/submission.py", line 605, in handle_unexpected_job_state
raise RuntimeError(f"job:{self.job_hash} {self.job_id} failed {self.fail_count} times.job_detail:{self}")
RuntimeError: job:80b0713a4333c3d802c77c72166a0772e9f9b968 416884 failed 3 times.job_detail:{'80b0713a4333c3d802c77c72166a0772e9f9b968': {'job_task_list': [{'command': "/bin/sh -c '{ if [ ! -f model.ckpt.index ]; then dp train input.json; else dp train input.json --restart model.ckpt; fi }'&&dp freeze", 'task_work_path': '002', 'forward_files': ['input.json'], 'backward_files': ['frozen_model.pb', 'lcurve.out', 'train.log', 'model.ckpt.meta', 'model.ckpt.index', 'model.ckpt.data-00000-of-00001', 'checkpoint'], 'outlog': 'train.log', 'errlog': 'train.log'}], 'resources': {'number_node': 1, 'cpu_per_node': 1, 'gpu_per_node': 1, 'queue_name': 'gpu', 'group_size': 1, 'custom_flags': ['#SBATCH -o output.%j', '#SBATCH -e err -o out'], 'strategy': {'if_cuda_multi_devices': False, 'ratio_unfinished': 0.0}, 'para_deg': 1, 'module_purge': False, 'module_unload_list': [], 'module_list': [], 'source_list': ['~/WORKSPACE/zjl9/DPGEN/deepmd.sh'], 'envs': {}, 'wait_time': 0, 'kwargs': {}}, 'job_state': <JobStatus.terminated: 4>, 'job_id': '416884', 'fail_count': 3}}
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
File "/share/home/xlzou/.local/bin/dpgen", line 8, in
sys.exit(main())
File "/share/home/xlzou/.local/lib/python3.9/site-packages/dpgen/main.py", line 185, in main
args.func(args)
File "/share/home/xlzou/.local/lib/python3.9/site-packages/dpgen/generator/run.py", line 3642, in gen_run
run_iter (args.PARAM, args.MACHINE)
File "/share/home/xlzou/.local/lib/python3.9/site-packages/dpgen/generator/run.py", line 3607, in run_iter
run_train (ii, jdata, mdata)
File "/share/home/xlzou/.local/lib/python3.9/site-packages/dpgen/generator/run.py", line 610, in run_train
submission.run_submission()
File "/share/home/xlzou/.local/lib/python3.9/site-packages/dpdispatcher/submission.py", line 208, in run_submission
self.handle_unexpected_submission_state()
File "/share/home/xlzou/.local/lib/python3.9/site-packages/dpdispatcher/submission.py", line 244, in handle_unexpected_submission_state
raise RuntimeError(
RuntimeError: Meet errors will handle unexpected submission state.
Debug information: remote_root==/share/home/xlzou/WORKSPACE/zjl9/DPGEN/work/04d8a682beb8bfafb345306a91319a282951fae4.
Debug information: submission_hash==04d8a682beb8bfafb345306a91319a282951fae4.
This is my first time submitting a dpgen job to a Slurm queue, and here is the machine.json below:
{
"api_version": "1.0",
"train": [
{
"command": "dp",
"machine": {
"context_type": "local",
"batch_type": "Slurm",
"local_root": "./",
"remote_root": "/share/home/xlzou/WORKSPACE/zjl9/DPGEN/work"
},
"resources": {
"number_node": 1,
"cpu_per_node": 1,
"gpu_per_node": 1,
"queue_name": "gpu",
"custom_flags": ["#SBATCH -o output.%j", "#SBATCH -e err -o out"],
"group_size": 1,
"source_list": ["
/WORKSPACE/zjl9/DPGEN/deepmd.sh"],/WORKSPACE/zjl9/DPGEN/deepmd.sh"],"module_list": [],
"time_limit": "23:0:0"
}
}
],
"model_devi": [
{
"command": "lmp",
"machine": {
"context_type": "local",
"batch_type": "Slurm",
"local_root": "./",
"remote_root": "/share/home/xlzou/WORKSPACE/zjl9/DPGEN/work"
},
"resources": {
"number_node": 1,
"cpu_per_node": 1,
"gpu_per_node": 1,
"queue_name": "gpu",
"exclude_list": [],
"group_size": 10,
"source_list": ["
"module_list": [],
"time_limit": "23:0:0"
}
}
],
"fp": [
{
"command": "mpirun /share/apps/vasp/ips2018/u1/5.4.4/vasp_std",
"machine": {
"context_type": "local",
"batch_type": "Slurm",
"local_root": "./",
"remote_root": "/share/home/xlzou/WORKSPACE/zjl9/DPGEN/work"
},
"resources": {
"number_node": 1,
"cpu_per_node": 1,
"queue_name": "gencpu",
"group_size": 3,
"source_list": [],
"module_list": ["vasp/ips2018/u1/5.4.4"],
"time_limit": "120:0:0"
}
}
]
}
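In case it helps with diagnosis, here is a hand-written test job for the same queue and environment (only a sketch; it assumes deepmd.sh activates the DeePMD-kit environment and that GPUs are requested with --gres on this cluster):

#!/bin/bash
#SBATCH -p gpu
#SBATCH -N 1
#SBATCH -n 1
#SBATCH --gres=gpu:1
#SBATCH -o test.%j.out
# same environment script as in source_list above
source ~/WORKSPACE/zjl9/DPGEN/deepmd.sh
# should print the DeePMD-kit help if the environment loads on the compute node
dp -h

If this fails in the same way, the problem is likely in the queue or environment setup rather than in dpgen itself.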
Thanks for any help
nohup.md