From af7e9b5bd9155c51b58fd8ce5f132f94766e99ee Mon Sep 17 00:00:00 2001
From: fbarreir
Date: Tue, 7 May 2019 16:14:43 +0200
Subject: [PATCH 001/181] checkpoint on adding jobtype queues

---
 __init__.py                                 |   0
 pandaharvester/harvesterbody/submitter.py   | 698 +++++++++---------
 .../harvesterbody/worker_adjuster.py        | 352 ++++-----
 pandaharvester/harvestercore/core_utils.py  |   2 +-
 pandaharvester/harvestercore/db_proxy.py    | 154 ++--
 .../harvestercore/panda_queue_spec.py       |   3 +-
 pandaharvester/harvestercore/work_spec.py   |   2 +
 .../dummy_dynamic_worker_maker.py           |   2 +-
 .../multijob_worker_maker.py                |   2 +-
 .../multinode_worker_maker.py               |   2 +-
 .../simple_bf_es_worker_maker.py            |   4 +-
 .../simple_worker_maker.py                  |  39 +-
 12 files changed, 663 insertions(+), 597 deletions(-)
 create mode 100644 __init__.py

diff --git a/__init__.py b/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/pandaharvester/harvesterbody/submitter.py b/pandaharvester/harvesterbody/submitter.py
index 7c2df6b8..a7970cc4 100644
--- a/pandaharvester/harvesterbody/submitter.py
+++ b/pandaharvester/harvesterbody/submitter.py
@@ -26,386 +26,390 @@ class Submitter(AgentBase):
     # constructor
     def __init__(self, queue_config_mapper, single_mode=False):
         AgentBase.__init__(self, single_mode)
-        self.queueConfigMapper = queue_config_mapper
+        self.queue_configMapper = queue_config_mapper
         self.dbProxy = DBProxy()
         self.workerMaker = WorkerMaker()
         self.workerAdjuster = WorkerAdjuster(queue_config_mapper)
         self.pluginFactory = PluginFactory()
         self.monitor_fifo = MonitorFIFO()
-        self.apfmon = Apfmon(self.queueConfigMapper)
+        self.apfmon = Apfmon(self.queue_configMapper)

     # main loop
     def run(self):
-        lockedBy = 'submitter-{0}'.format(self.get_pid())
+        locked_by = 'submitter-{0}'.format(self.get_pid())
         monitor_fifo = self.monitor_fifo
-        queueLockInterval = getattr(harvester_config.submitter, 'queueLockInterval',
-                                    harvester_config.submitter.lockInterval)
+        queue_lock_interval = getattr(harvester_config.submitter, 'queueLockInterval',
+                                      harvester_config.submitter.lockInterval)
         while True:
             sw_main = core_utils.get_stopwatch()
-            mainLog = self.make_logger(_logger, 'id={0}'.format(lockedBy), method_name='run')
-            mainLog.debug('getting queues to submit workers')
+            main_log = self.make_logger(_logger, 'id={0}'.format(locked_by), method_name='run')
+            main_log.debug('getting queues to submit workers')
             # get queues associated to a site to submit workers
-            curWorkers, siteName, resMap = self.dbProxy.get_queues_to_submit(harvester_config.submitter.nQueues,
-                                                                             harvester_config.submitter.lookupTime,
-                                                                             harvester_config.submitter.lockInterval,
-                                                                             lockedBy, queueLockInterval)
+            current_workers, site_name, res_map = self.dbProxy.get_queues_to_submit(harvester_config.submitter.nQueues,
+                                                                                    harvester_config.submitter.lookupTime,
+                                                                                    harvester_config.submitter.lockInterval,
+                                                                                    locked_by, queue_lock_interval)
             submitted = False
-            if siteName is not None:
-                mainLog.debug('got {0} queues for site {1}'.format(len(curWorkers), siteName))
+            if site_name is not None:
+                main_log.debug('got {0} queues for site {1}'.format(len(current_workers), site_name))

-                # get commands
-                comStr = '{0}:{1}'.format(CommandSpec.COM_setNWorkers, siteName)
-                commandSpecs = self.dbProxy.get_commands_for_receiver('submitter', comStr)
-                mainLog.debug('got {0} {1} commands'.format(commandSpecs, comStr))
-                for commandSpec in commandSpecs:
-                    newLimits = self.dbProxy.set_queue_limit(siteName, commandSpec.params)
-                    for tmpResource, tmpNewVal in iteritems(newLimits):
-                        # if available, overwrite new worker value with the command from panda server
-                        if tmpResource in resMap:
-                            tmpQueueName = resMap[tmpResource]
-                            if tmpQueueName in 
curWorkers: - curWorkers[tmpQueueName][tmpResource]['nNewWorkers'] = tmpNewVal + # get commands from panda server + com_str = '{0}:{1}'.format(CommandSpec.COM_setNWorkers, site_name) + command_specs = self.dbProxy.get_commands_for_receiver('submitter', com_str) + main_log.debug('got {0} {1} commands'.format(command_specs, com_str)) + for command_spec in command_specs: + new_limits = self.dbProxy.set_queue_limit(site_name, command_spec.params) + for tmp_job_type, tmp_jt_vals in iteritems(new_limits): + res_map.setdefault(tmp_job_type, {}) + for tmp_resource_type, tmp_new_val in iteritems(tmp_jt_vals): + # if available, overwrite new worker value with the command from panda server + if tmp_resource_type in res_map[tmp_job_type]: + tmp_queue_name = res_map[tmp_job_type][tmp_resource_type] + if tmp_queue_name in current_workers: + current_workers[tmp_queue_name][tmp_job_type][tmp_resource_type]['nNewWorkers'] = tmp_new_val # define number of new workers - if len(curWorkers) == 0: - n_workers_per_queue_and_rt = dict() + if len(current_workers) == 0: + n_workers_per_queue_jt_rt = dict() else: - n_workers_per_queue_and_rt = self.workerAdjuster.define_num_workers(curWorkers, siteName) + n_workers_per_queue_jt_rt = self.workerAdjuster.define_num_workers(current_workers, site_name) - if n_workers_per_queue_and_rt is None: - mainLog.error('WorkerAdjuster failed to define the number of workers') - elif len(n_workers_per_queue_and_rt) == 0: + if n_workers_per_queue_jt_rt is None: + main_log.error('WorkerAdjuster failed to define the number of workers') + elif len(n_workers_per_queue_jt_rt) == 0: pass else: # loop over all queues and resource types - for queueName in n_workers_per_queue_and_rt: - for resource_type, tmpVal in iteritems(n_workers_per_queue_and_rt[queueName]): + for queue_name in n_workers_per_queue_jt_rt: + for job_type, tmp_job_vals in iteritems(n_workers_per_queue_jt_rt[queue_name]): + for resource_type, tmp_val in iteritems(tmp_job_vals): - tmpLog = self.make_logger(_logger, 'id={0} queue={1} rtype={2}'.format(lockedBy, - queueName, - resource_type), - method_name='run') - try: - tmpLog.debug('start') - tmpLog.debug('workers status: %s' % tmpVal) - nWorkers = tmpVal['nNewWorkers'] + tmpVal['nReady'] - nReady = tmpVal['nReady'] + tmp_log = self.make_logger(_logger, 'id={0} queue={1} jtype={2} rtype={3}'.format( + locked_by, queue_name, job_type, resource_type), method_name='run') + try: + tmp_log.debug('start') + tmp_log.debug('workers status: %s' % tmp_val) + nWorkers = tmp_val['nNewWorkers'] + tmp_val['nReady'] + nReady = tmp_val['nReady'] - # check queue - if not self.queueConfigMapper.has_queue(queueName): - tmpLog.error('config not found') - continue + # check queue + if not self.queue_configMapper.has_queue(queue_name): + tmp_log.error('config not found') + continue - # no new workers - if nWorkers == 0: - tmpLog.debug('skipped since no new worker is needed based on current stats') - continue - # get queue - queueConfig = self.queueConfigMapper.get_queue(queueName) - workerMakerCore = self.workerMaker.get_plugin(queueConfig) - # check if resource is ready - if hasattr(workerMakerCore, 'dynamicSizing') and workerMakerCore.dynamicSizing is True: - numReadyResources = self.workerMaker.num_ready_resources(queueConfig, - resource_type, - workerMakerCore) - tmpLog.debug('numReadyResources: %s' % numReadyResources) - if not numReadyResources: - if hasattr(workerMakerCore, 'staticWorkers'): - nQRWorkers = tmpVal['nQueue'] + tmpVal['nRunning'] - tmpLog.debug('staticWorkers: %s, 
nQRWorkers(Queue+Running): %s' % - (workerMakerCore.staticWorkers, nQRWorkers)) - if nQRWorkers >= workerMakerCore.staticWorkers: - tmpLog.debug('No left static workers, skip') - continue + # no new workers + if nWorkers == 0: + tmp_log.debug('skipped since no new worker is needed based on current stats') + continue + # get queue + queue_config = self.queue_configMapper.get_queue(queue_name) + workerMakerCore = self.workerMaker.get_plugin(queue_config) + # check if resource is ready + if hasattr(workerMakerCore, 'dynamicSizing') and workerMakerCore.dynamicSizing is True: + numReadyResources = self.workerMaker.num_ready_resources(queue_config, + job_type, + resource_type, + workerMakerCore) + tmp_log.debug('numReadyResources: %s' % numReadyResources) + if not numReadyResources: + if hasattr(workerMakerCore, 'staticWorkers'): + nQRWorkers = tmp_val['nQueue'] + tmp_val['nRunning'] + tmp_log.debug('staticWorkers: %s, nQRWorkers(Queue+Running): %s' % + (workerMakerCore.staticWorkers, nQRWorkers)) + if nQRWorkers >= workerMakerCore.staticWorkers: + tmp_log.debug('No left static workers, skip') + continue + else: + nWorkers = min(workerMakerCore.staticWorkers - nQRWorkers, nWorkers) + tmp_log.debug('staticWorkers: %s, nWorkers: %s' % + (workerMakerCore.staticWorkers, nWorkers)) else: - nWorkers = min(workerMakerCore.staticWorkers - nQRWorkers, nWorkers) - tmpLog.debug('staticWorkers: %s, nWorkers: %s' % - (workerMakerCore.staticWorkers, nWorkers)) + tmp_log.debug('skip since no resources are ready') + continue else: - tmpLog.debug('skip since no resources are ready') - continue + nWorkers = min(nWorkers, numReadyResources) + # post action of worker maker + if hasattr(workerMakerCore, 'skipOnFail') and workerMakerCore.skipOnFail is True: + skipOnFail = True else: - nWorkers = min(nWorkers, numReadyResources) - # post action of worker maker - if hasattr(workerMakerCore, 'skipOnFail') and workerMakerCore.skipOnFail is True: - skipOnFail = True - else: - skipOnFail = False - # actions based on mapping type - if queueConfig.mapType == WorkSpec.MT_NoJob: - # workers without jobs - jobChunks = [] - for i in range(nWorkers): - jobChunks.append([]) - elif queueConfig.mapType == WorkSpec.MT_OneToOne: - # one worker per one job - jobChunks = self.dbProxy.get_job_chunks_for_workers( - queueName, - nWorkers, nReady, 1, None, - queueConfig.useJobLateBinding, - harvester_config.submitter.checkInterval, - harvester_config.submitter.lockInterval, - lockedBy) - elif queueConfig.mapType == WorkSpec.MT_MultiJobs: - # one worker for multiple jobs - nJobsPerWorker = self.workerMaker.get_num_jobs_per_worker(queueConfig, - nWorkers, - resource_type, - maker=workerMakerCore) - tmpLog.debug('nJobsPerWorker={0}'.format(nJobsPerWorker)) - jobChunks = self.dbProxy.get_job_chunks_for_workers( - queueName, - nWorkers, nReady, nJobsPerWorker, None, - queueConfig.useJobLateBinding, - harvester_config.submitter.checkInterval, - harvester_config.submitter.lockInterval, - lockedBy, - queueConfig.allowJobMixture) - elif queueConfig.mapType == WorkSpec.MT_MultiWorkers: - # multiple workers for one job - nWorkersPerJob = self.workerMaker.get_num_workers_per_job(queueConfig, - nWorkers, - resource_type, - maker=workerMakerCore) - maxWorkersPerJob = self.workerMaker.get_max_workers_per_job_in_total( - queueConfig, resource_type, maker=workerMakerCore) - maxWorkersPerJobPerCycle = self.workerMaker.get_max_workers_per_job_per_cycle( - queueConfig, resource_type, maker=workerMakerCore) - 
tmpLog.debug('nWorkersPerJob={0}'.format(nWorkersPerJob)) - jobChunks = self.dbProxy.get_job_chunks_for_workers( - queueName, - nWorkers, nReady, None, nWorkersPerJob, - queueConfig.useJobLateBinding, - harvester_config.submitter.checkInterval, - harvester_config.submitter.lockInterval, - lockedBy, max_workers_per_job_in_total=maxWorkersPerJob, - max_workers_per_job_per_cycle=maxWorkersPerJobPerCycle) - else: - tmpLog.error('unknown mapType={0}'.format(queueConfig.mapType)) - continue - - tmpLog.debug('got {0} job chunks'.format(len(jobChunks))) - if len(jobChunks) == 0: - continue - # make workers - okChunks, ngChunks = self.workerMaker.make_workers(jobChunks, queueConfig, - nReady, resource_type, - maker=workerMakerCore) - if len(ngChunks) == 0: - tmpLog.debug('successfully made {0} workers'.format(len(okChunks))) - else: - tmpLog.debug('made {0} workers, while {1} workers failed'.format(len(okChunks), - len(ngChunks))) - timeNow = datetime.datetime.utcnow() - timeNow_timestamp = time.time() - pandaIDs = set() - # NG (=not good) - for ngJobs in ngChunks: - for jobSpec in ngJobs: - if skipOnFail: - # release jobs when workers are not made - pandaIDs.add(jobSpec.PandaID) - else: - jobSpec.status = 'failed' - jobSpec.subStatus = 'failed_to_make' - jobSpec.stateChangeTime = timeNow - jobSpec.lockedBy = None - errStr = 'failed to make a worker' - jobSpec.set_pilot_error(PilotErrors.ERR_SETUPFAILURE, errStr) - jobSpec.trigger_propagation() - self.dbProxy.update_job(jobSpec, {'lockedBy': lockedBy, - 'subStatus': 'prepared'}) - # OK - workSpecList = [] - if len(okChunks) > 0: - for workSpec, okJobs in okChunks: - # has job - if (queueConfig.useJobLateBinding and workSpec.workerID is None) \ - or queueConfig.mapType == WorkSpec.MT_NoJob: - workSpec.hasJob = 0 - else: - workSpec.hasJob = 1 - if workSpec.nJobsToReFill in [None, 0]: - workSpec.set_jobspec_list(okJobs) - else: - # refill free slots during the worker is running - workSpec.set_jobspec_list(okJobs[:workSpec.nJobsToReFill]) - workSpec.nJobsToReFill = None - for jobSpec in okJobs[workSpec.nJobsToReFill:]: - pandaIDs.add(jobSpec.PandaID) - workSpec.set_num_jobs_with_list() - # map type - workSpec.mapType = queueConfig.mapType - # queue name - workSpec.computingSite = queueConfig.queueName - # set access point - workSpec.accessPoint = queueConfig.messenger['accessPoint'] - # sync level - workSpec.syncLevel = queueConfig.get_synchronization_level() - # events - if len(okJobs) > 0 and \ - ('eventService' in okJobs[0].jobParams or - 'cloneJob' in okJobs[0].jobParams): - workSpec.eventsRequest = WorkSpec.EV_useEvents - workSpecList.append(workSpec) - if len(workSpecList) > 0: - sw = core_utils.get_stopwatch() - # get plugin for submitter - submitterCore = self.pluginFactory.get_plugin(queueConfig.submitter) - if submitterCore is None: - # not found - tmpLog.error( - 'submitter plugin for {0} not found'.format(jobSpec.computingSite)) + skipOnFail = False + # actions based on mapping type + if queue_config.mapType == WorkSpec.MT_NoJob: + # workers without jobs + jobChunks = [] + for i in range(nWorkers): + jobChunks.append([]) + elif queue_config.mapType == WorkSpec.MT_OneToOne: + # one worker per one job + jobChunks = self.dbProxy.get_job_chunks_for_workers( + queue_name, + nWorkers, nReady, 1, None, + queue_config.useJobLateBinding, + harvester_config.submitter.checkInterval, + harvester_config.submitter.lockInterval, + locked_by) + elif queue_config.mapType == WorkSpec.MT_MultiJobs: + # one worker for multiple jobs + nJobsPerWorker = 
self.workerMaker.get_num_jobs_per_worker(queue_config, + nWorkers, + job_type, + resource_type, + maker=workerMakerCore) + tmp_log.debug('nJobsPerWorker={0}'.format(nJobsPerWorker)) + jobChunks = self.dbProxy.get_job_chunks_for_workers( + queue_name, + nWorkers, nReady, nJobsPerWorker, None, + queue_config.useJobLateBinding, + harvester_config.submitter.checkInterval, + harvester_config.submitter.lockInterval, + locked_by, + queue_config.allowJobMixture) + elif queue_config.mapType == WorkSpec.MT_MultiWorkers: + # multiple workers for one job + nWorkersPerJob = self.workerMaker.get_num_workers_per_job(queue_config, + nWorkers, + job_type, + resource_type, + maker=workerMakerCore) + maxWorkersPerJob = self.workerMaker.get_max_workers_per_job_in_total( + queue_config, job_type, resource_type, maker=workerMakerCore) + maxWorkersPerJobPerCycle = self.workerMaker.get_max_workers_per_job_per_cycle( + queue_config, job_type, resource_type, maker=workerMakerCore) + tmp_log.debug('nWorkersPerJob={0}'.format(nWorkersPerJob)) + jobChunks = self.dbProxy.get_job_chunks_for_workers( + queue_name, + nWorkers, nReady, None, nWorkersPerJob, + queue_config.useJobLateBinding, + harvester_config.submitter.checkInterval, + harvester_config.submitter.lockInterval, + locked_by, max_workers_per_job_in_total=maxWorkersPerJob, + max_workers_per_job_per_cycle=maxWorkersPerJobPerCycle) + else: + tmp_log.error('unknown mapType={0}'.format(queue_config.mapType)) continue - # get plugin for messenger - messenger = self.pluginFactory.get_plugin(queueConfig.messenger) - if messenger is None: - # not found - tmpLog.error( - 'messenger plugin for {0} not found'.format(jobSpec.computingSite)) + + tmp_log.debug('got {0} job chunks'.format(len(jobChunks))) + if len(jobChunks) == 0: continue - # setup access points - messenger.setup_access_points(workSpecList) - # feed jobs - for workSpec in workSpecList: - if workSpec.hasJob == 1: - tmpStat = messenger.feed_jobs(workSpec, workSpec.get_jobspec_list()) - if tmpStat is False: - tmpLog.error( - 'failed to send jobs to workerID={0}'.format(workSpec.workerID)) + # make workers + okChunks, ngChunks = self.workerMaker.make_workers(jobChunks, queue_config, + nReady, job_type, resource_type, + maker=workerMakerCore) + if len(ngChunks) == 0: + tmp_log.debug('successfully made {0} workers'.format(len(okChunks))) + else: + tmp_log.debug('made {0} workers, while {1} workers failed'.format(len(okChunks), + len(ngChunks))) + timeNow = datetime.datetime.utcnow() + timeNow_timestamp = time.time() + pandaIDs = set() + # NG (=not good) + for ngJobs in ngChunks: + for job_spec in ngJobs: + if skipOnFail: + # release jobs when workers are not made + pandaIDs.add(job_spec.PandaID) else: - tmpLog.debug( - 'sent jobs to workerID={0} with {1}'.format(workSpec.workerID, - tmpStat)) - # insert workers - self.dbProxy.insert_workers(workSpecList, lockedBy) - # submit - sw.reset() - tmpLog.info('submitting {0} workers'.format(len(workSpecList))) - workSpecList, tmpRetList, tmpStrList = self.submit_workers(submitterCore, - workSpecList) - tmpLog.debug('done submitting {0} workers'.format(len(workSpecList)) - + sw.get_elapsed_time()) - # collect successful jobs - okPandaIDs = set() - for iWorker, (tmpRet, tmpStr) in enumerate(zip(tmpRetList, tmpStrList)): - if tmpRet: - workSpec, jobList = okChunks[iWorker] - jobList = workSpec.get_jobspec_list() - if jobList is not None: - for jobSpec in jobList: - okPandaIDs.add(jobSpec.PandaID) - # loop over all workers - for iWorker, (tmpRet, tmpStr) in 
enumerate(zip(tmpRetList, tmpStrList)):
-                            workSpec, jobList = okChunks[iWorker]
-                            # set harvesterHost
-                            workSpec.harvesterHost = socket.gethostname()
-                            # use associated job list since it can be truncated for re-filling
-                            jobList = workSpec.get_jobspec_list()
-                            # set status
-                            if not tmpRet:
-                                # failed submission
-                                errStr = 'failed to submit a workerID={0} with {1}'.format(
-                                    workSpec.workerID,
-                                    tmpStr)
-                                tmpLog.error(errStr)
-                                workSpec.set_status(WorkSpec.ST_missed)
-                                workSpec.set_dialog_message(tmpStr)
-                                workSpec.set_pilot_error(PilotErrors.ERR_SETUPFAILURE, errStr)
+                                    job_spec.status = 'failed'
+                                    job_spec.subStatus = 'failed_to_make'
+                                    job_spec.stateChangeTime = timeNow
+                                    job_spec.lockedBy = None
+                                    errStr = 'failed to make a worker'
+                                    job_spec.set_pilot_error(PilotErrors.ERR_SETUPFAILURE, errStr)
+                                    job_spec.trigger_propagation()
+                                    self.dbProxy.update_job(job_spec, {'lockedBy': locked_by,
+                                                                       'subStatus': 'prepared'})
+                            # OK
+                            work_specList = []
+                            if len(okChunks) > 0:
+                                for work_spec, okJobs in okChunks:
+                                    # has job
+                                    if (queue_config.useJobLateBinding and work_spec.workerID is None) \
+                                            or queue_config.mapType == WorkSpec.MT_NoJob:
+                                        work_spec.hasJob = 0
+                                    else:
+                                        work_spec.hasJob = 1
+                                        if work_spec.nJobsToReFill in [None, 0]:
+                                            work_spec.set_jobspec_list(okJobs)
+                                        else:
+                                            # refill free slots during the worker is running
+                                            work_spec.set_jobspec_list(okJobs[:work_spec.nJobsToReFill])
+                                            work_spec.nJobsToReFill = None
+                                            for job_spec in okJobs[work_spec.nJobsToReFill:]:
+                                                pandaIDs.add(job_spec.PandaID)
+                                        work_spec.set_num_jobs_with_list()
+                                    # map type
+                                    work_spec.mapType = queue_config.mapType
+                                    # queue name
+                                    work_spec.computingSite = queue_config.queueName
+                                    # set access point
+                                    work_spec.accessPoint = queue_config.messenger['accessPoint']
+                                    # sync level
+                                    work_spec.syncLevel = queue_config.get_synchronization_level()
+                                    # events
+                                    if len(okJobs) > 0 and \
+                                            ('eventService' in okJobs[0].jobParams or
+                                             'cloneJob' in okJobs[0].jobParams):
+                                        work_spec.eventsRequest = WorkSpec.EV_useEvents
+                                    work_specList.append(work_spec)
+                            if len(work_specList) > 0:
+                                sw = core_utils.get_stopwatch()
+                                # get plugin for submitter
+                                submitterCore = self.pluginFactory.get_plugin(queue_config.submitter)
+                                if submitterCore is None:
+                                    # not found
+                                    tmp_log.error(
+                                        'submitter plugin for {0} not found'.format(job_spec.computingSite))
+                                    continue
+                                # get plugin for messenger
+                                messenger = self.pluginFactory.get_plugin(queue_config.messenger)
+                                if messenger is None:
+                                    # not found
+                                    tmp_log.error(
+                                        'messenger plugin for {0} not found'.format(job_spec.computingSite))
+                                    continue
+                                # setup access points
+                                messenger.setup_access_points(work_specList)
+                                # feed jobs
+                                for work_spec in work_specList:
+                                    if work_spec.hasJob == 1:
+                                        tmpStat = messenger.feed_jobs(work_spec, work_spec.get_jobspec_list())
+                                        if tmpStat is False:
+                                            tmp_log.error(
+                                                'failed to send jobs to workerID={0}'.format(work_spec.workerID))
+                                        else:
+                                            tmp_log.debug(
+                                                'sent jobs to workerID={0} with {1}'.format(work_spec.workerID,
+                                                                                            tmpStat))
+                                # insert workers
+                                self.dbProxy.insert_workers(work_specList, locked_by)
+                                # submit
+                                sw.reset()
+                                tmp_log.info('submitting {0} workers'.format(len(work_specList)))
+                                work_specList, tmpRetList, tmpStrList = self.submit_workers(submitterCore,
+                                                                                            work_specList)
+                                tmp_log.debug('done submitting {0} workers'.format(len(work_specList))
+                                              + sw.get_elapsed_time())
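+                                # the passes below assume that tmpRetList/tmpStrList line up with
+                                # okChunks by index, so each submission result can be mapped back to
+                                # its worker and the jobs attached to it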
+                                # collect successful jobs
+                                okPandaIDs = set()
+                                for iWorker, (tmpRet, tmpStr) in enumerate(zip(tmpRetList, tmpStrList)):
+                                    if tmpRet:
+                                        work_spec, jobList = okChunks[iWorker]
+                                        jobList = work_spec.get_jobspec_list()
+                                        if jobList is not None:
+                                            for job_spec in jobList:
+                                                okPandaIDs.add(job_spec.PandaID)
+                                # loop over all workers
+                                for iWorker, (tmpRet, tmpStr) in enumerate(zip(tmpRetList, tmpStrList)):
+                                    work_spec, jobList = okChunks[iWorker]
+                                    # set harvesterHost
+                                    work_spec.harvesterHost = socket.gethostname()
+                                    # use associated job list since it can be truncated for re-filling
+                                    jobList = work_spec.get_jobspec_list()
+                                    # set status
+                                    if not tmpRet:
+                                        # failed submission
+                                        errStr = 'failed to submit a workerID={0} with {1}'.format(
+                                            work_spec.workerID,
+                                            tmpStr)
+                                        tmp_log.error(errStr)
+                                        work_spec.set_status(WorkSpec.ST_missed)
+                                        work_spec.set_dialog_message(tmpStr)
+                                        work_spec.set_pilot_error(PilotErrors.ERR_SETUPFAILURE, errStr)
+                                        if jobList is not None:
+                                            # increment attempt number
+                                            newJobList = []
+                                            for job_spec in jobList:
+                                                # skip if successful with another worker
+                                                if job_spec.PandaID in okPandaIDs:
+                                                    continue
+                                                if job_spec.submissionAttempts is None:
+                                                    job_spec.submissionAttempts = 0
+                                                job_spec.submissionAttempts += 1
+                                                # max attempt or permanent error
+                                                if tmpRet is False or \
+                                                        job_spec.submissionAttempts >= \
+                                                        queue_config.maxSubmissionAttempts:
+                                                    newJobList.append(job_spec)
+                                                else:
+                                                    self.dbProxy.increment_submission_attempt(
+                                                        job_spec.PandaID,
+                                                        job_spec.submissionAttempts)
+                                            jobList = newJobList
+                                    elif queue_config.useJobLateBinding and work_spec.hasJob == 1:
+                                        # directly go to running after feeding jobs for late binding
+                                        work_spec.set_status(WorkSpec.ST_running)
+                                    else:
+                                        # normal successful submission
+                                        work_spec.set_status(WorkSpec.ST_submitted)
+                                    work_spec.submitTime = timeNow
+                                    work_spec.modificationTime = timeNow
+                                    work_spec.checkTime = timeNow
+                                    if self.monitor_fifo.enabled:
+                                        work_spec.set_work_params({'lastCheckAt': timeNow_timestamp})
+                                    # prefetch events
+                                    if tmpRet and work_spec.hasJob == 1 and \
+                                            work_spec.eventsRequest == WorkSpec.EV_useEvents and \
+                                            queue_config.prefetchEvents:
+                                        work_spec.eventsRequest = WorkSpec.EV_requestEvents
+                                        eventsRequestParams = dict()
+                                        for job_spec in jobList:
+                                            eventsRequestParams[job_spec.PandaID] = \
+                                                {'pandaID': job_spec.PandaID,
+                                                 'taskID': job_spec.taskID,
+                                                 'jobsetID': job_spec.jobParams['jobsetID'],
+                                                 'nRanges': max(int(math.ceil(work_spec.nCore / len(jobList))),
+                                                                job_spec.jobParams['coreCount']),
+                                                 }
+                                        work_spec.eventsRequestParams = eventsRequestParams
+                                    # register worker
+                                    tmpStat = self.dbProxy.register_worker(work_spec, jobList, locked_by)
                                     if jobList is not None:
-                                        # increment attempt number
-                                        newJobList = []
-                                        for jobSpec in jobList:
-                                            # skip if successful with another worker
-                                            if jobSpec.PandaID in okPandaIDs:
-                                                continue
-                                            if jobSpec.submissionAttempts is None:
-                                                jobSpec.submissionAttempts = 0
-                                            jobSpec.submissionAttempts += 1
-                                            # max attempt or permanent error
-                                            if tmpRet is False or \
-                                                    jobSpec.submissionAttempts >= \
-                                                    queueConfig.maxSubmissionAttempts:
-                                                newJobList.append(jobSpec)
+                                        for job_spec in jobList:
+                                            pandaIDs.add(job_spec.PandaID)
+                                        if tmpStat:
+                                            if tmpRet:
+                                                tmpStr = \
+                                                    'submitted a workerID={0} for PandaID={1} with batchID={2}'
+                                                tmp_log.info(tmpStr.format(work_spec.workerID,
+                                                                           job_spec.PandaID,
+                                                                           work_spec.batchID))
+                                            else:
+                                                tmpStr = 'failed to submit a workerID={0} for PandaID={1}'
+                                                tmp_log.error(tmpStr.format(work_spec.workerID,
+                                                                            job_spec.PandaID))
                                         else:
-                                            self.dbProxy.increment_submission_attempt(
-                                                jobSpec.PandaID,
-                                                jobSpec.submissionAttempts)
-                                        jobList = newJobList
-                        elif queueConfig.useJobLateBinding and workSpec.hasJob == 1:
-                            # directly 
go to running after feeding jobs for late biding - workSpec.set_status(WorkSpec.ST_running) - else: - # normal successful submission - workSpec.set_status(WorkSpec.ST_submitted) - workSpec.submitTime = timeNow - workSpec.modificationTime = timeNow - workSpec.checkTime = timeNow - if self.monitor_fifo.enabled: - workSpec.set_work_params({'lastCheckAt': timeNow_timestamp}) - # prefetch events - if tmpRet and workSpec.hasJob == 1 and \ - workSpec.eventsRequest == WorkSpec.EV_useEvents and \ - queueConfig.prefetchEvents: - workSpec.eventsRequest = WorkSpec.EV_requestEvents - eventsRequestParams = dict() - for jobSpec in jobList: - eventsRequestParams[jobSpec.PandaID] = \ - {'pandaID': jobSpec.PandaID, - 'taskID': jobSpec.taskID, - 'jobsetID': jobSpec.jobParams['jobsetID'], - 'nRanges': max(int(math.ceil(workSpec.nCore / len(jobList))), - jobSpec.jobParams['coreCount']), - } - workSpec.eventsRequestParams = eventsRequestParams - # register worker - tmpStat = self.dbProxy.register_worker(workSpec, jobList, lockedBy) - if jobList is not None: - for jobSpec in jobList: - pandaIDs.add(jobSpec.PandaID) - if tmpStat: - if tmpRet: tmpStr = \ - 'submitted a workerID={0} for PandaID={1} with batchID={2}' - tmpLog.info(tmpStr.format(workSpec.workerID, - jobSpec.PandaID, - workSpec.batchID)) - else: - tmpStr = 'failed to submit a workerID={0} for PandaID={1}' - tmpLog.error(tmpStr.format(workSpec.workerID, - jobSpec.PandaID)) - else: - tmpStr = \ - 'failed to register a worker for PandaID={0} with batchID={1}' - tmpLog.error(tmpStr.format(jobSpec.PandaID, workSpec.batchID)) - # enqueue to monitor fifo - if self.monitor_fifo.enabled \ - and queueConfig.mapType != WorkSpec.MT_MultiWorkers: - workSpecsToEnqueue = \ - [[w] for w in workSpecList if w.status - in (WorkSpec.ST_submitted, WorkSpec.ST_running)] - check_delay = min( - getattr(harvester_config.monitor, 'eventBasedCheckInterval', - harvester_config.monitor.checkInterval), - getattr(harvester_config.monitor, 'fifoCheckInterval', - harvester_config.monitor.checkInterval)) - monitor_fifo.put((queueName, workSpecsToEnqueue), time.time() + check_delay) - mainLog.debug('put workers to monitor FIFO') - submitted = True - # release jobs - self.dbProxy.release_jobs(pandaIDs, lockedBy) - tmpLog.info('done') - except Exception: - core_utils.dump_error_message(tmpLog) + 'failed to register a worker for PandaID={0} with batchID={1}' + tmp_log.error(tmpStr.format(job_spec.PandaID, work_spec.batchID)) + # enqueue to monitor fifo + if self.monitor_fifo.enabled \ + and queue_config.mapType != WorkSpec.MT_MultiWorkers: + work_specsToEnqueue = \ + [[w] for w in work_specList if w.status + in (WorkSpec.ST_submitted, WorkSpec.ST_running)] + check_delay = min( + getattr(harvester_config.monitor, 'eventBasedCheckInterval', + harvester_config.monitor.checkInterval), + getattr(harvester_config.monitor, 'fifoCheckInterval', + harvester_config.monitor.checkInterval)) + monitor_fifo.put((queue_name, work_specsToEnqueue), time.time() + check_delay) + main_log.debug('put workers to monitor FIFO') + submitted = True + # release jobs + self.dbProxy.release_jobs(pandaIDs, locked_by) + tmp_log.info('done') + except Exception: + core_utils.dump_error_message(tmp_log) # release the site - self.dbProxy.release_site(siteName, lockedBy) - if sw_main.get_elapsed_time_in_sec() > queueLockInterval: - mainLog.warning('a submitter cycle was longer than queueLockInterval {0} sec'.format(queueLockInterval) + self.dbProxy.release_site(site_name, locked_by) + if sw_main.get_elapsed_time_in_sec() > 
queue_lock_interval: + main_log.warning('a submitter cycle was longer than queue_lock_interval {0} sec'.format(queue_lock_interval) + sw_main.get_elapsed_time()) - mainLog.debug('done') + main_log.debug('done') # define sleep interval - if siteName is None: + if site_name is None: sleepTime = harvester_config.submitter.sleepTime else: sleepTime = 0 @@ -413,13 +417,13 @@ def run(self): interval = harvester_config.submitter.minSubmissionInterval if interval > 0: newTime = datetime.datetime.utcnow() + datetime.timedelta(seconds=interval) - self.dbProxy.update_panda_queue_attribute('submitTime', newTime, site_name=siteName) + self.dbProxy.update_panda_queue_attribute('submitTime', newTime, site_name=site_name) # time the cycle - mainLog.debug('done a submitter cycle' + sw_main.get_elapsed_time()) + main_log.debug('done a submitter cycle' + sw_main.get_elapsed_time()) # check if being terminated if self.terminated(sleepTime): - mainLog.debug('terminated') + main_log.debug('terminated') return # wrapper for submitWorkers to skip ready workers @@ -428,13 +432,13 @@ def submit_workers(self, submitter_core, workspec_list): strList = [] newSpecList = [] workersToSubmit = [] - for workSpec in workspec_list: - if workSpec.status in [WorkSpec.ST_ready, WorkSpec.ST_running]: - newSpecList.append(workSpec) + for work_spec in workspec_list: + if work_spec.status in [WorkSpec.ST_ready, WorkSpec.ST_running]: + newSpecList.append(work_spec) retList.append(True) strList.append('') else: - workersToSubmit.append(workSpec) + workersToSubmit.append(work_spec) tmpRetList = submitter_core.submit_workers(workersToSubmit) # submit the workers to the monitoring diff --git a/pandaharvester/harvesterbody/worker_adjuster.py b/pandaharvester/harvesterbody/worker_adjuster.py index 22fd057e..2737a97d 100644 --- a/pandaharvester/harvesterbody/worker_adjuster.py +++ b/pandaharvester/harvesterbody/worker_adjuster.py @@ -15,11 +15,11 @@ class WorkerAdjuster(object): # constructor def __init__(self, queue_config_mapper): - self.queueConfigMapper = queue_config_mapper + self.queue_configMapper = queue_config_mapper self.pluginFactory = PluginFactory() self.dbProxy = DBProxy() self.throttlerMap = dict() - self.apf_mon = Apfmon(self.queueConfigMapper) + self.apf_mon = Apfmon(self.queue_configMapper) try: self.maxNewWorkers = harvester_config.submitter.maxNewWorkers except AttributeError: @@ -27,17 +27,17 @@ def __init__(self, queue_config_mapper): # define number of workers to submit based on various information def define_num_workers(self, static_num_workers, site_name): - tmpLog = core_utils.make_logger(_logger, 'site={0}'.format(site_name), method_name='define_num_workers') - tmpLog.debug('start') - tmpLog.debug('static_num_workers: {0}'.format(static_num_workers)) + tmp_log = core_utils.make_logger(_logger, 'site={0}'.format(site_name), method_name='define_num_workers') + tmp_log.debug('start') + tmp_log.debug('static_num_workers: {0}'.format(static_num_workers)) dyn_num_workers = copy.deepcopy(static_num_workers) try: # get queue status - queueStat = self.dbProxy.get_cache("panda_queues.json", None) - if queueStat is None: - queueStat = dict() + queue_stat = self.dbProxy.get_cache("panda_queues.json", None) + if queue_stat is None: + queue_stat = dict() else: - queueStat = queueStat.data + queue_stat = queue_stat.data # get job statistics job_stats = self.dbProxy.get_cache("job_statistics.json", None) @@ -47,196 +47,206 @@ def define_num_workers(self, static_num_workers, site_name): job_stats = job_stats.data # define num of 
new workers - for queueName in static_num_workers: + for queue_name in static_num_workers: # get queue - queueConfig = self.queueConfigMapper.get_queue(queueName) - workerLimits_dict = self.dbProxy.get_worker_limits(queueName) - maxWorkers = workerLimits_dict.get('maxWorkers', 0) - nQueueLimit = workerLimits_dict.get('nQueueLimitWorker', 0) - nQueueLimitPerRT = workerLimits_dict['nQueueLimitWorkerPerRT'] - nQueue_total, nReady_total, nRunning_total = 0, 0, 0 + queue_config = self.queue_configMapper.get_queue(queue_name) + worker_limits_dict = self.dbProxy.get_worker_limits(queue_name) + max_workers = worker_limits_dict.get('maxWorkers', 0) + n_queue_limit = worker_limits_dict.get('nQueueLimitWorker', 0) + n_queue_limit_per_rt = worker_limits_dict['nQueueLimitWorkerPerRT'] + n_queue_total, n_ready_total, n_running_total = 0, 0, 0 apf_msg = None apf_data = None - for resource_type, tmpVal in iteritems(static_num_workers[queueName]): - tmpLog.debug('Processing queue {0} resource {1} with static_num_workers {2}'. - format(queueName, resource_type, tmpVal)) - - # set 0 to num of new workers when the queue is disabled - if queueName in queueStat and queueStat[queueName]['status'] in ['offline', 'standby', - 'maintenance']: - dyn_num_workers[queueName][resource_type]['nNewWorkers'] = 0 - retMsg = 'set nNewWorkers=0 since status={0}'.format(queueStat[queueName]['status']) - tmpLog.debug(retMsg) - apf_msg = 'Not submitting workers since queue status = {0}'.format(queueStat[queueName]['status']) - continue - - # protection against not-up-to-date queue config - if queueConfig is None: - dyn_num_workers[queueName][resource_type]['nNewWorkers'] = 0 - retMsg = 'set nNewWorkers=0 due to missing queueConfig' - tmpLog.debug(retMsg) - apf_msg = 'Not submitting workers because of missing queueConfig' - continue - - # get throttler - if queueName not in self.throttlerMap: - if hasattr(queueConfig, 'throttler'): - throttler = self.pluginFactory.get_plugin(queueConfig.throttler) - else: - throttler = None - self.throttlerMap[queueName] = throttler - - # check throttler - throttler = self.throttlerMap[queueName] - if throttler is not None: - toThrottle, tmpMsg = throttler.to_be_throttled(queueConfig) - if toThrottle: - dyn_num_workers[queueName][resource_type]['nNewWorkers'] = 0 - retMsg = 'set nNewWorkers=0 by {0}:{1}'.format(throttler.__class__.__name__, tmpMsg) - tmpLog.debug(retMsg) + for job_type, jt_values in iteritems(static_num_workers[queue_name]): + for resource_type, tmp_val in iteritems(jt_values): + tmp_log.debug('Processing queue {0} job_type {1} resource_type {2} with static_num_workers {3}'. 
+ format(queue_name, job_type, resource_type, tmp_val)) + + # set 0 to num of new workers when the queue is disabled + if queue_name in queue_stat and queue_stat[queue_name]['status'] in ['offline', 'standby', + 'maintenance']: + dyn_num_workers[queue_name][job_type][resource_type]['nNewWorkers'] = 0 + ret_msg = 'set n_new_workers=0 since status={0}'.format(queue_stat[queue_name]['status']) + tmp_log.debug(ret_msg) + apf_msg = 'Not submitting workers since queue status = {0}'.format(queue_stat[queue_name]['status']) continue - # check stats - nQueue = tmpVal['nQueue'] - nReady = tmpVal['nReady'] - nRunning = tmpVal['nRunning'] - if resource_type != 'ANY': - nQueue_total += nQueue - nReady_total += nReady - nRunning_total += nRunning - if queueConfig.runMode == 'slave': - nNewWorkersDef = tmpVal['nNewWorkers'] - if nNewWorkersDef == 0: - dyn_num_workers[queueName][resource_type]['nNewWorkers'] = 0 - retMsg = 'set nNewWorkers=0 by panda in slave mode' - tmpLog.debug(retMsg) + # protection against not-up-to-date queue config + if queue_config is None: + dyn_num_workers[queue_name][job_type][resource_type]['nNewWorkers'] = 0 + ret_msg = 'set n_new_workers=0 due to missing queue_config' + tmp_log.debug(ret_msg) + apf_msg = 'Not submitting workers because of missing queue_config' continue - else: - nNewWorkersDef = None - - # define num of new workers based on static site config - nNewWorkers = 0 - if nQueue >= nQueueLimitPerRT > 0: - # enough queued workers - retMsg = 'No nNewWorkers since nQueue({0})>=nQueueLimitPerRT({1})'.format(nQueue, nQueueLimitPerRT) - tmpLog.debug(retMsg) - pass - elif (nQueue + nReady + nRunning) >= maxWorkers > 0: - # enough workers in the system - retMsg = 'No nNewWorkers since nQueue({0}) + nReady({1}) + nRunning({2}) '.format(nQueue, - nReady, - nRunning) - retMsg += '>= maxWorkers({0})'.format(maxWorkers) - tmpLog.debug(retMsg) - pass - else: - - maxQueuedWorkers = None - - if nQueueLimitPerRT > 0: # there is a limit set for the queue - maxQueuedWorkers = nQueueLimitPerRT - - # Reset the maxQueueWorkers according to particular - if nNewWorkersDef is not None: # don't surpass limits given centrally - maxQueuedWorkers_slave = nNewWorkersDef + nQueue - if maxQueuedWorkers is not None: - maxQueuedWorkers = min(maxQueuedWorkers_slave, maxQueuedWorkers) + + # get throttler + if queue_name not in self.throttlerMap: + if hasattr(queue_config, 'throttler'): + throttler = self.pluginFactory.get_plugin(queue_config.throttler) else: - maxQueuedWorkers = maxQueuedWorkers_slave - - elif queueConfig.mapType == 'NoJob': # for pull mode, limit to activated jobs - # limit the queue to the number of activated jobs to avoid empty pilots - try: - n_activated = max(job_stats[queueName]['activated'], 1) # avoid no activity queues - queue_limit = maxQueuedWorkers - maxQueuedWorkers = min(n_activated, maxQueuedWorkers) - tmpLog.debug('limiting maxQueuedWorkers to min(n_activated={0}, queue_limit={1})'. 
- format(n_activated, queue_limit)) - except KeyError: - tmpLog.warning('n_activated not defined, defaulting to configured queue limits') - pass - - if maxQueuedWorkers is None: # no value found, use default value - maxQueuedWorkers = 1 - - # new workers - nNewWorkers = max(maxQueuedWorkers - nQueue, 0) - tmpLog.debug('setting nNewWorkers to {0} in maxQueuedWorkers calculation' - .format(nNewWorkers)) - if maxWorkers > 0: - nNewWorkers = min(nNewWorkers, max(maxWorkers - nQueue - nReady - nRunning, 0)) - tmpLog.debug('setting nNewWorkers to {0} to respect maxWorkers' - .format(nNewWorkers)) - if queueConfig.maxNewWorkersPerCycle > 0: - nNewWorkers = min(nNewWorkers, queueConfig.maxNewWorkersPerCycle) - tmpLog.debug('setting nNewWorkers to {0} in order to respect maxNewWorkersPerCycle' - .format(nNewWorkers)) - if self.maxNewWorkers is not None and self.maxNewWorkers > 0: - nNewWorkers = min(nNewWorkers, self.maxNewWorkers) - tmpLog.debug('setting nNewWorkers to {0} in order to respect universal maxNewWorkers' - .format(nNewWorkers)) - dyn_num_workers[queueName][resource_type]['nNewWorkers'] = nNewWorkers - - # adjust nNewWorkers for UCORE to let aggregations over RT respect nQueueLimitWorker and maxWorkers - if queueConfig is None: - maxNewWorkersPerCycle = 0 - retMsg = 'set maxNewWorkersPerCycle=0 in UCORE aggregation due to missing queueConfig' - tmpLog.debug(retMsg) + throttler = None + self.throttlerMap[queue_name] = throttler + + # check throttler + throttler = self.throttlerMap[queue_name] + if throttler is not None: + to_throttle, tmp_msg = throttler.to_be_throttled(queue_config) + if to_throttle: + dyn_num_workers[queue_name][job_type][resource_type]['nNewWorkers'] = 0 + ret_msg = 'set n_new_workers=0 by {0}:{1}'.format(throttler.__class__.__name__, tmp_msg) + tmp_log.debug(ret_msg) + continue + + # check stats + n_queue = tmp_val['nQueue'] + n_ready = tmp_val['nReady'] + n_running = tmp_val['nRunning'] + if resource_type != 'ANY' and job_type != 'ANY' and job_type is not None: + n_queue_total += n_queue + n_ready_total += n_ready + n_running_total += n_running + if queue_config.runMode == 'slave': + n_new_workers_def = tmp_val['nNewWorkers'] + if n_new_workers_def == 0: + dyn_num_workers[queue_name][job_type][resource_type]['nNewWorkers'] = 0 + ret_msg = 'set n_new_workers=0 by panda in slave mode' + tmp_log.debug(ret_msg) + continue + else: + n_new_workers_def = None + + # define num of new workers based on static site config + n_new_workers = 0 + if n_queue >= n_queue_limit_per_rt > 0: + # enough queued workers + ret_msg = 'No n_new_workers since n_queue({0})>=n_queue_limit_per_rt({1})'.format(n_queue, + n_queue_limit_per_rt) + tmp_log.debug(ret_msg) + pass + elif (n_queue + n_ready + n_running) >= max_workers > 0: + # enough workers in the system + ret_msg = 'No n_new_workers since n_queue({0}) + n_ready({1}) + n_running({2}) '.format(n_queue, + n_ready, + n_running) + ret_msg += '>= max_workers({0})'.format(max_workers) + tmp_log.debug(ret_msg) + pass + else: + + max_queued_workers = None + + if n_queue_limit_per_rt > 0: # there is a limit set for the queue + max_queued_workers = n_queue_limit_per_rt + + # Reset the maxQueueWorkers according to particular + if n_new_workers_def is not None: # don't surpass limits given centrally + maxQueuedWorkers_slave = n_new_workers_def + n_queue + if max_queued_workers is not None: + max_queued_workers = min(maxQueuedWorkers_slave, max_queued_workers) + else: + max_queued_workers = maxQueuedWorkers_slave + + elif queue_config.mapType == 
'NoJob':  # for pull mode, limit to activated jobs
+                                # limit the queue to the number of activated jobs to avoid empty pilots
+                                try:
+                                    n_activated = max(job_stats[queue_name]['activated'], 1)  # avoid no activity queues
+                                    queue_limit = max_queued_workers
+                                    max_queued_workers = min(n_activated, max_queued_workers)
+                                    tmp_log.debug('limiting max_queued_workers to min(n_activated={0}, queue_limit={1})'.
+                                                  format(n_activated, queue_limit))
+                                except KeyError:
+                                    tmp_log.warning('n_activated not defined, defaulting to configured queue limits')
+                                    pass
+
+                            if max_queued_workers is None:  # no value found, use default value
+                                max_queued_workers = 1
+
+                            # new workers
+                            n_new_workers = max(max_queued_workers - n_queue, 0)
+                            tmp_log.debug('setting n_new_workers to {0} in max_queued_workers calculation'
+                                          .format(n_new_workers))
+                            if max_workers > 0:
+                                n_new_workers = min(n_new_workers, max(max_workers - n_queue - n_ready - n_running, 0))
+                                tmp_log.debug('setting n_new_workers to {0} to respect max_workers'
+                                              .format(n_new_workers))
+                            if queue_config.maxNewWorkersPerCycle > 0:
+                                n_new_workers = min(n_new_workers, queue_config.maxNewWorkersPerCycle)
+                                tmp_log.debug('setting n_new_workers to {0} in order to respect maxNewWorkersPerCycle'
+                                              .format(n_new_workers))
+                            if self.maxNewWorkers is not None and self.maxNewWorkers > 0:
+                                n_new_workers = min(n_new_workers, self.maxNewWorkers)
+                                tmp_log.debug('setting n_new_workers to {0} in order to respect universal maxNewWorkers'
+                                              .format(n_new_workers))
+                        dyn_num_workers[queue_name][job_type][resource_type]['nNewWorkers'] = n_new_workers
+
+                # adjust nNewWorkers for UCORE to let aggregations over RT respect nQueueLimitWorker and maxWorkers
+                if queue_config is None:
+                    max_new_workers_per_cycle = 0
+                    ret_msg = 'set max_new_workers_per_cycle=0 in UCORE aggregation due to missing queue_config'
+                    tmp_log.debug(ret_msg)
                 else:
-                    maxNewWorkersPerCycle = queueConfig.maxNewWorkersPerCycle
+                    max_new_workers_per_cycle = queue_config.maxNewWorkersPerCycle
-                if len(dyn_num_workers[queueName]) > 1:
-                    total_new_workers_rts = sum( dyn_num_workers[queueName][_rt]['nNewWorkers']
-                                                 if _rt != 'ANY' else 0
-                                                 for _rt in dyn_num_workers[queueName] )
-                    nNewWorkers_max_agg = min(
-                        max(nQueueLimit - nQueue_total, 0),
-                        max(maxWorkers - nQueue_total - nReady_total - nRunning_total, 0),
-                        )
-                    if maxNewWorkersPerCycle >= 0:
-                        nNewWorkers_max_agg = min(nNewWorkers_max_agg, maxNewWorkersPerCycle)
+                if len(dyn_num_workers[queue_name]) > 1:
+                    total_new_workers_rts = sum(dyn_num_workers[queue_name][_jt][_rt].get('nNewWorkers', 0)
+                                                for _jt in dyn_num_workers[queue_name]
+                                                for _rt in dyn_num_workers[queue_name][_jt]
+                                                if _jt != 'ANY' and _rt != 'ANY')
+                    n_new_workers_max_agg = min(max(n_queue_limit - n_queue_total, 0),
+                                                max(max_workers - n_queue_total - n_ready_total - n_running_total, 0))
+                    if max_new_workers_per_cycle >= 0:
+                        n_new_workers_max_agg = min(n_new_workers_max_agg, max_new_workers_per_cycle)
                     if self.maxNewWorkers is not None and self.maxNewWorkers > 0:
-                        nNewWorkers_max_agg = min(nNewWorkers_max_agg, self.maxNewWorkers)
+                        n_new_workers_max_agg = min(n_new_workers_max_agg, self.maxNewWorkers)
+
                     # exceeded max, to adjust
-                    if total_new_workers_rts > nNewWorkers_max_agg:
-                        if nNewWorkers_max_agg == 0:
-                            for resource_type in dyn_num_workers[queueName]:
-                                dyn_num_workers[queueName][resource_type]['nNewWorkers'] = 0
-                            tmpLog.debug('No nNewWorkers since nNewWorkers_max_agg=0 for UCORE')
+                    if total_new_workers_rts > n_new_workers_max_agg:
+                        if n_new_workers_max_agg == 0:
+                            for job_type in dyn_num_workers[queue_name]:
+                                for resource_type in dyn_num_workers[queue_name][job_type]:
+                                    dyn_num_workers[queue_name][job_type][resource_type]['nNewWorkers'] = 0
+                            tmp_log.debug('No n_new_workers since n_new_workers_max_agg=0 for UCORE')
                         else:
-                            tmpLog.debug('nNewWorkers_max_agg={0} for UCORE'.format(nNewWorkers_max_agg))
-                            _d = dyn_num_workers[queueName].copy()
+                            tmp_log.debug('n_new_workers_max_agg={0} for UCORE'.format(n_new_workers_max_agg))
+                            _d = dyn_num_workers[queue_name].copy()
                             del _d['ANY']
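+                            # largest-remainder apportionment: each (job type, resource type) pair
+                            # first gets the floor of its proportional share of n_new_workers_max_agg,
+                            # then the leftover slots go to the pairs with the largest remainders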
-                            simple_rt_nw_list = [ [_rt, _d[_rt].get('nNewWorkers', 0), 0] for _rt in _d ]
-                            _countdown = nNewWorkers_max_agg
+                            simple_rt_nw_list = []
+                            for _jt in _d:  # jt: job type
+                                for _rt in _d[_jt]:  # rt: resource type
+                                    simple_rt_nw_list.append([(_jt, _rt), _d[_jt][_rt].get('nNewWorkers', 0), 0])
+
+                            _countdown = n_new_workers_max_agg
                             for _rt_list in simple_rt_nw_list:
-                                resource_type, nNewWorkers_orig, _r = _rt_list
-                                nNewWorkers, remainder = divmod(nNewWorkers_orig*nNewWorkers_max_agg, total_new_workers_rts)
-                                dyn_num_workers[queueName][resource_type]['nNewWorkers'] = nNewWorkers
+                                (_jt, _rt), n_new_workers_orig, _r = _rt_list
+                                n_new_workers, remainder = divmod(n_new_workers_orig * n_new_workers_max_agg,
+                                                                  total_new_workers_rts)
+                                dyn_num_workers[queue_name][_jt][_rt]['nNewWorkers'] = n_new_workers
                                 _rt_list[2] = remainder
-                                _countdown -= nNewWorkers
+                                _countdown -= n_new_workers
                             _s_list = sorted(simple_rt_nw_list, key=(lambda x: x[1]))
                             sorted_rt_nw_list = sorted(_s_list, key=(lambda x: x[2]), reverse=True)
-                            for resource_type, nNewWorkers_orig, remainder in sorted_rt_nw_list:
+                            for (_jt, _rt), n_new_workers_orig, remainder in sorted_rt_nw_list:
                                 if _countdown <= 0:
                                     break
-                                dyn_num_workers[queueName][resource_type]['nNewWorkers'] += 1
+                                dyn_num_workers[queue_name][_jt][_rt]['nNewWorkers'] += 1
                                 _countdown -= 1
-                    for resource_type in dyn_num_workers[queueName]:
-                        if resource_type == 'ANY':
-                            continue
-                        nNewWorkers = dyn_num_workers[queueName][resource_type]['nNewWorkers']
-                        tmpLog.debug('setting nNewWorkers to {0} of type {1} in order to respect RT aggregations for UCORE'
-                                     .format(nNewWorkers, resource_type))
+
+                    for job_type in dyn_num_workers[queue_name]:
+                        for resource_type in dyn_num_workers[queue_name][job_type]:
+                            if job_type == 'ANY' or resource_type == 'ANY':
+                                continue
+                            n_new_workers = dyn_num_workers[queue_name][job_type][resource_type]['nNewWorkers']
+                            tmp_log.debug('setting n_new_workers to {0} of job_type {1} resource_type {2} '
+                                          'in order to respect RT aggregations for UCORE'
+                                          .format(n_new_workers, job_type, resource_type))

                 if not apf_msg:
-                    apf_data = copy.deepcopy(dyn_num_workers[queueName])
+                    apf_data = copy.deepcopy(dyn_num_workers[queue_name])

-                self.apf_mon.update_label(queueName, apf_msg, apf_data)
+                self.apf_mon.update_label(queue_name, apf_msg, apf_data)

             # dump
-            tmpLog.debug('defined {0}'.format(str(dyn_num_workers)))
+            tmp_log.debug('defined {0}'.format(str(dyn_num_workers)))
             return dyn_num_workers
         except Exception:
             # dump error
-            errMsg = core_utils.dump_error_message(tmpLog)
+            err_msg = core_utils.dump_error_message(tmp_log)
             return None
diff --git a/pandaharvester/harvestercore/core_utils.py b/pandaharvester/harvestercore/core_utils.py
index 5358d92a..5a478446 100644
--- a/pandaharvester/harvestercore/core_utils.py
+++ b/pandaharvester/harvestercore/core_utils.py
@@ -587,7 +587,7 @@ def get_queues_config_url():

 # get unique queue name
-def get_unique_queue_name(queue_name, resource_type):
-    return '{0}:{1}'.format(queue_name, resource_type)
+def get_unique_queue_name(queue_name, resource_type, job_type):
+    return '{0}:{1}:{2}'.format(queue_name, resource_type, job_type)


 # capability to dynamically change plugins
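Note: the core data-structure change in this patch is that worker statistics are keyed by
queue, then job type, then resource type. The sketch below (plain Python, not part of the
patch; queue, job type, resource type names and counts are all made up) shows the nested
mapping that get_queues_to_submit() returns and that the submitter and worker adjuster
iterate over:

    # illustration only -- all names and counts are hypothetical
    current_workers = {
        'CERN-EXAMPLE_UCORE': {                      # queue name
            'managed': {                             # job type (prodSourceLabel)
                'SCORE': {'nReady': 1, 'nRunning': 10, 'nQueue': 5, 'nNewWorkers': 0},
                'MCORE': {'nReady': 0, 'nRunning': 3, 'nQueue': 1, 'nNewWorkers': 0},
            },
            'user': {
                'SCORE': {'nReady': 0, 'nRunning': 2, 'nQueue': 0, 'nNewWorkers': 0},
            },
        },
    }

    # iteration pattern used throughout the patch
    for queue_name, jt_map in current_workers.items():
        for job_type, rt_map in jt_map.items():
            for resource_type, counts in rt_map.items():
                print(queue_name, job_type, resource_type, counts['nQueue'])
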
diff --git a/pandaharvester/harvestercore/db_proxy.py b/pandaharvester/harvestercore/db_proxy.py
index 4b58f4b4..2f9f7b02 100644
--- a/pandaharvester/harvestercore/db_proxy.py
+++ b/pandaharvester/harvestercore/db_proxy.py
@@ -1398,7 +1398,7 @@ def get_queues_to_submit(self, n_queues, lookup_interval, lock_interval, locked_
         sqlS += "OR (submitTime<:lookupTimeLimit AND lockedBy IS NULL) "
         sqlS += "ORDER BY submitTime "
         # sql to get queues
-        sqlQ = "SELECT queueName,resourceType,nNewWorkers FROM {0} ".format(pandaQueueTableName)
+        sqlQ = "SELECT queueName, jobType, resourceType, nNewWorkers FROM {0} ".format(pandaQueueTableName)
         sqlQ += "WHERE siteName=:siteName "
         # sql to get orphaned workers
         sqlO = "SELECT workerID FROM {0} ".format(workTableName)
@@ -1408,7 +1408,7 @@ def get_queues_to_submit(self, n_queues, lookup_interval, lock_interval, locked_
         sqlD = "DELETE FROM {0} ".format(workTableName)
         sqlD += "WHERE workerID=:workerID "
         # sql to count nQueue
-        sqlN = "SELECT status,COUNT(*) cnt FROM {0} ".format(workTableName)
+        sqlN = "SELECT status, COUNT(*) cnt FROM {0} ".format(workTableName)
         sqlN += "WHERE computingSite=:computingSite "
         # sql to count re-fillers
         sqlR = "SELECT COUNT(*) cnt FROM {0} ".format(workTableName)
@@ -1444,13 +1444,17 @@ def get_queues_to_submit(self, n_queues, lookup_interval, lock_interval, locked_
             varMap[':siteName'] = siteName
             self.execute(sqlQ, varMap)
             resQ = self.cur.fetchall()
-            for queueName, resourceType, nNewWorkers in resQ:
+            for queueName, jobType, resourceType, nNewWorkers in resQ:
+
+                # delete orphaned workers
                 varMap = dict()
                 varMap[':computingSite'] = queueName
                 varMap[':status'] = WorkSpec.ST_pending
                 varMap[':timeLimit'] = timeNow - datetime.timedelta(seconds=lock_interval)
                 sqlO_tmp = sqlO
+                if jobType != 'ANY':
+                    varMap[':jobType'] = jobType
+                    sqlO_tmp += "AND jobType=:jobType "
                 if resourceType != 'ANY':
                     varMap[':resourceType'] = resourceType
                     sqlO_tmp += "AND resourceType=:resourceType "
@@ -1462,11 +1466,15 @@ def get_queues_to_submit(self, n_queues, lookup_interval, lock_interval, locked_
                 self.execute(sqlD, varMap)
                 # commit
                 self.commit()
+
                 # count nQueue
                 varMap = dict()
                 varMap[':computingSite'] = queueName
                 varMap[':resourceType'] = resourceType
                 sqlN_tmp = sqlN
+                if jobType != 'ANY':
+                    varMap[':jobType'] = jobType
+                    sqlN_tmp += "AND jobType=:jobType "
                 if resourceType != 'ANY':
                     varMap[':resourceType'] = resourceType
                     sqlN_tmp += "AND resourceType=:resourceType "
@@ -1482,11 +1490,15 @@ def get_queues_to_submit(self, n_queues, lookup_interval, lock_interval, locked_
                     nReady += tmpNum
                 elif workerStatus in [WorkSpec.ST_running]:
                     nRunning += tmpNum
+
                 # count nFillers
                 varMap = dict()
                 varMap[':computingSite'] = queueName
                 varMap[':status'] = WorkSpec.ST_running
                 sqlR_tmp = sqlR
+                if jobType != 'ANY':
+                    varMap[':jobType'] = jobType
+                    sqlR_tmp += "AND jobType=:jobType "
                 if resourceType != 'ANY':
                     varMap[':resourceType'] = resourceType
                     sqlR_tmp += "AND resourceType=:resourceType "
@@ -1494,12 +1506,13 @@ def get_queues_to_submit(self, n_queues, lookup_interval, lock_interval, locked_
                 nReFill, = self.cur.fetchone()
                 nReady += nReFill
                 # add
-                retMap.setdefault(queueName, {})
-                retMap[queueName][resourceType] = {'nReady': nReady,
-                                                   'nRunning': nRunning,
-                                                   'nQueue': nQueue,
-                                                   'nNewWorkers': nNewWorkers}
-                resourceMap[resourceType] = queueName
+                retMap.setdefault(queueName, {})
+                retMap[queueName].setdefault(jobType, {})
+                retMap[queueName][jobType][resourceType] = {'nReady': nReady,
+                                                            'nRunning': nRunning,
+                                                            'nQueue': nQueue,
+                                                            'nNewWorkers': nNewWorkers}
+                resourceMap.setdefault(jobType, {})
+                resourceMap[jobType][resourceType] = queueName
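+                # retMap mirrors the nested {queue: {jobType: {resourceType: counts}}} layout
+                # that the submitter consumes; resourceMap is likewise keyed by jobType first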
                # enough queues
                if len(retMap) >= 0:
                    break
@@ -3651,11 +3664,11 @@ def release_jobs(self, panda_ids, locked_by):
             return False

     # clone queue
-    def clone_queue_with_new_resource_type(self, site_name, queue_name, resource_type, new_workers):
+    def clone_queue_with_new_job_and_resource_type(self, site_name, queue_name, job_type, resource_type, new_workers):
         try:
             # get logger
             tmpLog = core_utils.make_logger(_logger, 'site_name={0} queue_name={1}'.format(site_name, queue_name),
-                                            method_name='clone_queue_with_new_resource_type')
+                                            method_name='clone_queue_with_new_job_and_resource_type')
             tmpLog.debug('start')

             # get the values from one of the existing queues
@@ -3674,6 +3687,8 @@ def clone_queue_with_new_resource_type(self, site_name, queue_name, resource_typ
                 attr_binding = ':{0}'.format(attribute)
                 if attribute == 'resourceType':
                     var_map[attr_binding] = resource_type
+                elif attribute == 'jobType':
+                    var_map[attr_binding] = job_type
                 elif attribute == 'nNewWorkers':
                     var_map[attr_binding] = new_workers
                 elif attribute == 'uniqueName':
@@ -3707,85 +3722,87 @@ def set_queue_limit(self, site_name, params):
             sql_reset += "SET nNewWorkers=:zero WHERE siteName=:siteName "

-            # sql to get resource types
-            sql_get_resource = "SELECT resourceType FROM {0} ".format(pandaQueueTableName)
-            sql_get_resource += "WHERE siteName=:siteName "
-            sql_get_resource += "FOR UPDATE "
+            # sql to get job & resource types
+            sql_get_job_resource = "SELECT jobType, resourceType FROM {0} ".format(pandaQueueTableName)
+            sql_get_job_resource += "WHERE siteName=:siteName "
+            sql_get_job_resource += "FOR UPDATE "

             # sql to update nQueueLimit
             sql_update_queue = "UPDATE {0} ".format(pandaQueueTableName)
-            sql_update_queue += "SET nNewWorkers=:nQueue WHERE siteName=:siteName AND resourceType=:resourceType "
+            sql_update_queue += "SET nNewWorkers=:nQueue "
+            sql_update_queue += "WHERE siteName=:siteName AND jobType=:jobType AND resourceType=:resourceType "

             # sql to get num of submitted workers
             sql_count_workers = "SELECT COUNT(*) cnt "
             sql_count_workers += "FROM {0} wt, {1} pq ".format(workTableName, pandaQueueTableName)
-            sql_count_workers += "WHERE pq.siteName=:siteName AND wt.computingSite=pq.queueName AND wt.status=:status "
-            sql_count_workers += "ANd pq.resourceType=:resourceType "
+            sql_count_workers += "WHERE pq.siteName=:siteName AND wt.computingSite=pq.queueName AND wt.status=:status "
+            sql_count_workers += "AND pq.jobType=:jobType AND pq.resourceType=:resourceType "

-            # reset nqueued for all resource types
+            # reset nqueued for all job & resource types
             varMap = dict()
             varMap[':zero'] = 0
             varMap[':siteName'] = site_name
             self.execute(sql_reset, varMap)

-            # get resource types
+            # get job & resource types
             varMap = dict()
             varMap[':siteName'] = site_name
-            self.execute(sql_get_resource, varMap)
-            resRes = self.cur.fetchall()
-            resource_type_list = set()
-            for tmpRes, in resRes:
-                resource_type_list.add(tmpRes)
+            self.execute(sql_get_job_resource, varMap)
+            res_res = self.cur.fetchall()
+            job_resource_type_list = set()
+            for tmp_job_type, tmp_resource_type in res_res:
+                job_resource_type_list.add((tmp_job_type, tmp_resource_type))
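+            # (job type, resource type) tuples let the membership test below recognize
+            # queue rows that already exist for a given combination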

             # set all queues
             nUp = 0
-            retMap = dict()
+            ret_map = dict()
             queue_name = site_name

-            for resource_type, value in iteritems(params):
-                tmpLog.debug('Processing rt {0} -> {1}'.format(resource_type, value))
-
-                # get num of submitted workers
-                varMap = dict()
-                varMap[':siteName'] = site_name
-                varMap[':resourceType'] = resource_type
-                varMap[':status'] = 'submitted'
-                self.execute(sql_count_workers, varMap)
-                res = self.cur.fetchone()
-                tmpLog.debug('{0} has {1} submitted workers'.format(resource_type, res))
-                nSubmittedWorkers = 0
-                if res is not None:
-                    nSubmittedWorkers, = res
-
-                # set new value
-                # value = max(value - nSubmittedWorkers, 0)
-                if value is None:
-                    value = 0
-                varMap = dict()
-                varMap[':nQueue'] = value
-                varMap[':siteName'] = site_name
-                varMap[':resourceType'] = resource_type
-                self.execute(sql_update_queue, varMap)
-                iUp = self.cur.rowcount
-
-                # iUp is 0 when nQueue is not changed
-                if iUp > 0 or resource_type in resource_type_list:
-                    # a queue was updated, add the values to the map
-                    retMap[resource_type] = value
-                else:
-                    # no queue was updated, we need to create a new one for the resource type
-                    cloned = self.clone_queue_with_new_resource_type(site_name, queue_name, resource_type, value)
-                    if cloned:
-                        retMap[resource_type] = value
-                        iUp = 1
+            for job_type, job_values in iteritems(params):
+                ret_map.setdefault(job_type, {})
+                for resource_type, value in iteritems(job_values):
+                    tmpLog.debug('Processing rt {0} -> {1}'.format(resource_type, value))
+
+                    # get num of submitted workers
+                    varMap = dict()
+                    varMap[':siteName'] = site_name
+                    varMap[':jobType'] = job_type
+                    varMap[':resourceType'] = resource_type
+                    varMap[':status'] = 'submitted'
+                    self.execute(sql_count_workers, varMap)
+                    res = self.cur.fetchone()
+                    tmpLog.debug('{0} has {1} submitted workers'.format(resource_type, res))
+
+                    if value is None:
+                        value = 0
+                    varMap = dict()
+                    varMap[':nQueue'] = value
+                    varMap[':siteName'] = site_name
+                    varMap[':jobType'] = job_type
+                    varMap[':resourceType'] = resource_type
+                    self.execute(sql_update_queue, varMap)
+                    iUp = self.cur.rowcount
+
+                    # iUp is 0 when nQueue is not changed
+                    if iUp > 0 or (job_type, resource_type) in job_resource_type_list:
+                        # a queue was updated, add the values to the map
+                        ret_map[job_type][resource_type] = value
+                    else:
+                        # no queue was updated, we need to create a new one for the job & resource type
+                        cloned = self.clone_queue_with_new_job_and_resource_type(site_name, queue_name, job_type,
+                                                                                 resource_type, value)
+                        if cloned:
+                            ret_map[job_type][resource_type] = value
+                            iUp = 1

-                nUp += iUp
-                tmpLog.debug('set nNewWorkers={0} to {1}:{2} with {3}'.format(value, queue_name, resource_type, iUp))
+                    nUp += iUp
+                    tmpLog.debug('set nNewWorkers={0} to {1}:{2}:{3} with {4}'.format(value, queue_name, job_type,
+                                                                                      resource_type, iUp))

             # commit
             self.commit()
             tmpLog.debug('updated {0} queues'.format(nUp))
-            return retMap
+            return ret_map
         except Exception:
             # roll back
             self.rollback()
@@ -4338,16 +4355,20 @@ def get_worker_limits(self, site_name):
             # get logger
             tmpLog = core_utils.make_logger(_logger, method_name='get_worker_limits')
             tmpLog.debug('start')
-            # sql to get
-            sqlQ = "SELECT maxWorkers,nQueueLimitWorker,nQueueLimitWorkerRatio,"
+
+            # sql to get queue limits
+            sqlQ = "SELECT maxWorkers, nQueueLimitWorker, nQueueLimitWorkerRatio,"
             sqlQ += "nQueueLimitWorkerMax,nQueueLimitWorkerMin FROM {0} ".format(pandaQueueTableName)
-            sqlQ += "WHERE siteName=:siteName AND resourceType='ANY'"
+            sqlQ += "WHERE siteName=:siteName AND jobType='ANY' AND resourceType='ANY'"
+
             # sql to count resource types
             sqlNT = "SELECT COUNT(*) cnt FROM {0} ".format(pandaQueueTableName)
             sqlNT += "WHERE siteName=:siteName AND resourceType!='ANY'"
+
             # sql to count running workers
             sqlNR = "SELECT COUNT(*) cnt FROM {0} ".format(workTableName)
             sqlNR += "WHERE computingSite=:computingSite AND status IN (:status1)"
+
             # get
             varMap = dict()
             varMap[':siteName'] = site_name
@@ -4365,6 +4386,7 @@ def get_worker_limits(self, site_name):
             varMap[':status1'] = 
'running' self.execute(sqlNR, varMap) resNR = self.cur.fetchall() + # dynamic nQueueLimitWorker retMap = dict() nRunning = 0 diff --git a/pandaharvester/harvestercore/panda_queue_spec.py b/pandaharvester/harvestercore/panda_queue_spec.py index 1ba06b5d..210e29d7 100644 --- a/pandaharvester/harvestercore/panda_queue_spec.py +++ b/pandaharvester/harvestercore/panda_queue_spec.py @@ -16,6 +16,7 @@ class PandaQueueSpec(SpecBase): 'submitTime:timestamp / index', 'lockedBy:text', 'siteName:text / index', + 'jobType:text', 'resourceType:text', 'nNewWorkers:integer', 'uniqueName:text / unique', @@ -29,7 +30,7 @@ class PandaQueueSpec(SpecBase): # catchall resource type RT_catchall = 'ANY' - + JT_catchall = 'ANY' # constructor def __init__(self): SpecBase.__init__(self) diff --git a/pandaharvester/harvestercore/work_spec.py b/pandaharvester/harvestercore/work_spec.py index e049ad93..63e1869f 100644 --- a/pandaharvester/harvestercore/work_spec.py +++ b/pandaharvester/harvestercore/work_spec.py @@ -79,6 +79,7 @@ class WorkSpec(SpecBase): 'computingElement:text', 'nJobsToReFill:integer / index', 'logFilesToUpload:blob', + 'jobType:text', 'resourceType:text', 'nativeExitCode:integer', 'nativeStatus:text', @@ -236,6 +237,7 @@ def convert_to_propagate(self): 'submitTime', 'startTime', 'endTime', + 'jobType', 'resourceType', 'nativeExitCode', 'nativeStatus', diff --git a/pandaharvester/harvesterworkermaker/dummy_dynamic_worker_maker.py b/pandaharvester/harvesterworkermaker/dummy_dynamic_worker_maker.py index f202eece..beb29294 100644 --- a/pandaharvester/harvesterworkermaker/dummy_dynamic_worker_maker.py +++ b/pandaharvester/harvesterworkermaker/dummy_dynamic_worker_maker.py @@ -11,7 +11,7 @@ def __init__(self, **kwarg): BaseWorkerMaker.__init__(self, **kwarg) # make a worker from jobs - def make_worker(self, jobspec_list, queue_config, resource_type): + def make_worker(self, jobspec_list, queue_config, job_type, resource_type): workSpec = WorkSpec() workSpec.resourceType = resource_type if len(jobspec_list) > 0: diff --git a/pandaharvester/harvesterworkermaker/multijob_worker_maker.py b/pandaharvester/harvesterworkermaker/multijob_worker_maker.py index 7b56c235..7de1f4f5 100644 --- a/pandaharvester/harvesterworkermaker/multijob_worker_maker.py +++ b/pandaharvester/harvesterworkermaker/multijob_worker_maker.py @@ -45,7 +45,7 @@ def _get_executable(self, queue_config): return exe_str # make a worker from a job with a disk access point - def make_worker(self, jobspec_list, queue_config, resource_type): + def make_worker(self, jobspec_list, queue_config, job_type, resource_type): tmpLog = self.make_logger(baseLogger, method_name='make_worker') workSpec = WorkSpec() self.nJobsPerWorker = len(jobspec_list) diff --git a/pandaharvester/harvesterworkermaker/multinode_worker_maker.py b/pandaharvester/harvesterworkermaker/multinode_worker_maker.py index c59b81a5..c03c44cf 100644 --- a/pandaharvester/harvesterworkermaker/multinode_worker_maker.py +++ b/pandaharvester/harvesterworkermaker/multinode_worker_maker.py @@ -58,7 +58,7 @@ def _get_executable(self): return exe_str # make a worker from jobs - def make_worker(self, jobspec_list, queue_config, resource_type): + def make_worker(self, jobspec_list, queue_config, job_type, resource_type): tmpLog = core_utils.make_logger(baseLogger, 'queue={0}'.format(queue_config.queueName), method_name='make_worker') diff --git a/pandaharvester/harvesterworkermaker/simple_bf_es_worker_maker.py b/pandaharvester/harvesterworkermaker/simple_bf_es_worker_maker.py index 
2ac53375..bc2d7dad 100644
--- a/pandaharvester/harvesterworkermaker/simple_bf_es_worker_maker.py
+++ b/pandaharvester/harvesterworkermaker/simple_bf_es_worker_maker.py
@@ -28,7 +28,7 @@ def __init__(self, **kwarg):
         self.dyn_resources = None
 
     # make a worker from jobs
-    def make_worker(self, jobspec_list, queue_config, resource_type):
+    def make_worker(self, jobspec_list, queue_config, job_type, resource_type):
         tmpLog = self.make_logger(_logger, 'queue={0}'.format(queue_config.queueName),
                                   method_name='make_worker')
 
@@ -201,7 +201,7 @@ def adjust_resources(self, resources):
         tmpLog.info("Available backfill resources after adjusting: %s" % ret_resources)
         return ret_resources
 
-    def get_dynamic_resource(self, queue_name, resource_type):
+    def get_dynamic_resource(self, queue_name, job_type, resource_type):
         resources = self.get_bf_resources()
         if resources:
             resources = self.adjust_resources(resources)
diff --git a/pandaharvester/harvesterworkermaker/simple_worker_maker.py b/pandaharvester/harvesterworkermaker/simple_worker_maker.py
index b925028d..fb7a8454 100644
--- a/pandaharvester/harvesterworkermaker/simple_worker_maker.py
+++ b/pandaharvester/harvesterworkermaker/simple_worker_maker.py
@@ -56,8 +56,29 @@ def get_job_core_and_memory(self, queue_dict, job_spec):
 
         return job_corecount, job_memory
 
+    def get_job_type(self, job_spec, job_type, queue_dict):
+
+        # 1. get prodSourceLabel from job (push)
+        if job_spec and 'prodSourceLabel' in job_spec.jobParams:
+            job_type_final = job_spec.jobParams['prodSourceLabel']
+        # 2. get prodSourceLabel from the specified job_type (pull UPS)
+        elif job_type:
+            job_type_final = job_type
+        # 3. otherwise derive the prodSourceLabel from the queue definition (pull), or leave it empty
+        else:
+            queue_type = queue_dict.get('type', None)
+            # map AGIS types to PanDA types
+            if queue_type == 'analysis':
+                job_type_final = 'user'
+            elif queue_type == 'production':
+                job_type_final = 'managed'
+            else:
+                job_type_final = None
+
+        return job_type_final
+
     # make a worker from jobs
-    def make_worker(self, jobspec_list, queue_config, resource_type):
+    def make_worker(self, jobspec_list, queue_config, job_type, resource_type):
         tmpLog = self.make_logger(_logger, 'queue={0}'.format(queue_config.queueName),
                                   method_name='make_worker')
 
@@ -76,7 +97,7 @@ def make_worker(self, jobspec_list, queue_config, resource_type):
             workSpec.nCore = queue_dict.get('corecount', 1) or 1
             workSpec.minRamCount = queue_dict.get('maxrss', 1) or 1
 
-        # case of unified queue: look at the resource type and queue configuration
+        # case of unified queue: look at the job & resource type and queue configuration
         else:
             catchall = queue_dict.get('catchall', '')
             if 'useMaxRam' in catchall or queue_config.queueName in ('Taiwan-LCG2-HPC2_Unified',
@@ -128,11 +149,9 @@ def make_worker(self, jobspec_list, queue_config, resource_type):
                 except Exception:
                     pass
 
-            if (nCore > 0 and 'nCore' in self.jobAttributesToUse) \
-                    or unified_queue:
+            if (nCore > 0 and 'nCore' in self.jobAttributesToUse) or unified_queue:
                 workSpec.nCore = nCore
-            if (minRamCount > 0 and 'minRamCount' in self.jobAttributesToUse) \
-                    or unified_queue:
+            if (minRamCount > 0 and 'minRamCount' in self.jobAttributesToUse) or unified_queue:
                 workSpec.minRamCount = minRamCount
             if maxDiskCount > 0 and 'maxDiskCount' in self.jobAttributesToUse:
                 workSpec.maxDiskCount = maxDiskCount
@@ -140,13 +159,19 @@ def make_worker(self, jobspec_list, queue_config, resource_type):
                 workSpec.maxWalltime = maxWalltime
             if ioIntensity > 0
and 'ioIntensity' in self.jobAttributesToUse:
                 workSpec.ioIntensity = ioIntensity
 
+            workSpec.pilotType = jobspec_list[0].get_pilot_type()
+            workSpec.jobType = self.get_job_type(jobspec_list[0], job_type, queue_dict)
+
         else:  # when no job
             # randomize pilot type with weighting
             workSpec.pilotType = random.choice(self.pilotTypeRandomList)
             if workSpec.pilotType in ['RC', 'ALRB', 'PT']:
                 tmpLog.info('a worker has pilotType={0}'.format(workSpec.pilotType))
+
+            workSpec.jobType = self.get_job_type(None, job_type, queue_dict)
+
         # TODO: this needs to be improved with real resource types
         if resource_type and resource_type != 'ANY':
             workSpec.resourceType = resource_type
@@ -155,4 +180,6 @@
         else:
             workSpec.resourceType = 'MCORE'
 
+
+
         return workSpec
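The new get_job_type() gives the job's own prodSourceLabel precedence (push), then an explicitly requested job_type (pull UPS), and only then falls back to the AGIS queue type. A rough sketch of the three lookup paths (illustrative values; assumes a configured harvester environment):

    from pandaharvester.harvestercore.job_spec import JobSpec
    from pandaharvester.harvesterworkermaker.simple_worker_maker import SimpleWorkerMaker

    maker = SimpleWorkerMaker()
    queue_dict = {'type': 'analysis'}                    # AGIS queue definition (pull case)

    job_spec = JobSpec()
    job_spec.jobParams = {'prodSourceLabel': 'managed'}

    maker.get_job_type(job_spec, None, queue_dict)       # 'managed' (1. from the job, push)
    maker.get_job_type(None, 'user', queue_dict)         # 'user'    (2. explicit job_type, pull UPS)
    maker.get_job_type(None, None, queue_dict)           # 'user'    (3. queue type 'analysis' -> 'user')

For a grandly unified queue this is what later lets the submitter pick the analysis proxy for 'user'/'analysis' workers.

From 76bdf64283d396a9f72a30d0dedd05c1c3fbedc5 Mon Sep 17 00:00:00 2001
From: fbarreir
Date: Tue, 7 May 2019 17:16:13 +0200
Subject: [PATCH 002/181] checkpoint

---
 pandaharvester/harvestercore/db_proxy.py | 60 +++++++++++++-----------
 1 file changed, 33 insertions(+), 27 deletions(-)

diff --git a/pandaharvester/harvestercore/db_proxy.py b/pandaharvester/harvestercore/db_proxy.py
index 2f9f7b02..1cbb27b7 100644
--- a/pandaharvester/harvestercore/db_proxy.py
+++ b/pandaharvester/harvestercore/db_proxy.py
@@ -3264,7 +3264,7 @@ def get_worker_stats(self, site_name):
             tmpLog = core_utils.make_logger(_logger, method_name='get_worker_stats')
             tmpLog.debug('start')
             # sql to get nQueueLimit
-            sqlQ = "SELECT queueName,resourceType,nNewWorkers FROM {0} ".format(pandaQueueTableName)
+            sqlQ = "SELECT queueName, jobType, resourceType, nNewWorkers FROM {0} ".format(pandaQueueTableName)
             sqlQ += "WHERE siteName=:siteName "
             # get nQueueLimit
             varMap = dict()
@@ -3272,18 +3272,18 @@ def get_worker_stats(self, site_name):
             self.execute(sqlQ, varMap)
             resQ = self.cur.fetchall()
             retMap = dict()
-            for computingSite, resourceType, nNewWorkers in resQ:
-                if resourceType not in retMap:
-                    retMap[resourceType] = {
-                        'running': 0,
-                        'submitted': 0,
-                        'to_submit': nNewWorkers
-                    }
+            for computingSite, jobType, resourceType, nNewWorkers in resQ:
+                retMap.setdefault(jobType, {})
+                if resourceType not in retMap[jobType]:
+                    retMap[jobType][resourceType] = {'running': 0,
+                                                     'submitted': 0,
+                                                     'to_submit': nNewWorkers}
+
             # get worker stats
-            sqlW = "SELECT wt.status, wt.computingSite, pq.resourceType, COUNT(*) cnt "
+            sqlW = "SELECT wt.status, wt.computingSite, pq.jobType, pq.resourceType, COUNT(*) cnt "
             sqlW += "FROM {0} wt, {1} pq ".format(workTableName, pandaQueueTableName)
             sqlW += "WHERE pq.siteName=:siteName AND wt.computingSite=pq.queueName AND wt.status IN (:st1,:st2) "
-            sqlW += "GROUP BY wt.status, wt.computingSite, pq.resourceType "
+            sqlW += "GROUP BY wt.status, wt.computingSite, pq.jobType, pq.resourceType "
             # get worker stats
             varMap = dict()
             varMap[':siteName'] = site_name
@@ -3291,14 +3291,14 @@ def get_worker_stats(self, site_name):
             varMap[':st2'] = 'submitted'
             self.execute(sqlW, varMap)
             resW = self.cur.fetchall()
-            for workerStatus, computingSite, resourceType, cnt in resW:
+            for workerStatus, computingSite, jobType, resourceType, cnt in resW:
+                retMap.setdefault(jobType, {})
                 if resourceType not in retMap:
-                    retMap[resourceType] = {
-                        'running': 0,
-                        'submitted': 0,
-                        'to_submit': 0
-                    }
-                retMap[resourceType][workerStatus] = cnt
+                    retMap[jobType][resourceType] = {'running': 0,
+                                                     'submitted': 0,
+                                                     'to_submit': 0
+                                                     }
+                retMap[jobType][resourceType][workerStatus] = cnt
             # commit
             self.commit()
             tmpLog.debug('got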
{0}'.format(str(retMap))) @@ -3318,40 +3318,46 @@ def get_worker_stats_bulk(self, active_ups_queues): tmpLog = core_utils.make_logger(_logger, method_name='get_worker_stats_bulk') tmpLog.debug('start') # sql to get nQueueLimit - sqlQ = "SELECT queueName, resourceType, nNewWorkers FROM {0} ".format(pandaQueueTableName) + sqlQ = "SELECT queueName, jobType, resourceType, nNewWorkers FROM {0} ".format(pandaQueueTableName) # get nQueueLimit self.execute(sqlQ) resQ = self.cur.fetchall() retMap = dict() - for computingSite, resourceType, nNewWorkers in resQ: + for computingSite, jobType, resourceType, nNewWorkers in resQ: retMap.setdefault(computingSite, {}) - if resourceType and resourceType != 'ANY' and resourceType not in retMap[computingSite]: - retMap[computingSite][resourceType] = {'running': 0, 'submitted': 0, 'to_submit': nNewWorkers} + retMap[computingSite].setdefault(jobType, {}) + if resourceType and resourceType != 'ANY' and resourceType not in retMap[computingSite][jobType]: + retMap[computingSite][jobType][resourceType] = {'running': 0, + 'submitted': 0, + 'to_submit': nNewWorkers} # get worker stats - sqlW = "SELECT wt.status, wt.computingSite, wt.resourceType, COUNT(*) cnt " + sqlW = "SELECT wt.status, wt.computingSite, wt.jobType, wt.resourceType, COUNT(*) cnt " sqlW += "FROM {0} wt ".format(workTableName) sqlW += "WHERE wt.status IN (:st1,:st2) " - sqlW += "GROUP BY wt.status,wt.computingSite, wt.resourceType " + sqlW += "GROUP BY wt.status,wt.computingSite, wt.jobType, wt.resourceType " # get worker stats varMap = dict() varMap[':st1'] = 'running' varMap[':st2'] = 'submitted' self.execute(sqlW, varMap) resW = self.cur.fetchall() - for workerStatus, computingSite, resourceType, cnt in resW: + for workerStatus, computingSite, jobType, resourceType, cnt in resW: if resourceType and resourceType != 'ANY': retMap.setdefault(computingSite, {}) - retMap[computingSite].setdefault(resourceType, {'running': 0, 'submitted': 0, 'to_submit': 0}) - retMap[computingSite][resourceType][workerStatus] = cnt + retMap[computingSite].setdefault(jobType, {}) + retMap[computingSite][jobType].setdefault(resourceType, {'running': 0, + 'submitted': 0, + 'to_submit': 0}) + retMap[computingSite][jobType][resourceType][workerStatus] = cnt # if there are no jobs for an active UPS queue, it needs to be initialized so that the pilot streaming # on panda server starts processing the queue if active_ups_queues: for ups_queue in active_ups_queues: if ups_queue not in retMap or not retMap[ups_queue]: - retMap[ups_queue] = {'SCORE': {'running': 0, 'submitted': 0, 'to_submit': 0}} + retMap[ups_queue] = {'managed': {'SCORE': {'running': 0, 'submitted': 0, 'to_submit': 0}}} # commit self.commit() From 3fcfec664506b0220d8c3711b5612ce9ccbdaf22 Mon Sep 17 00:00:00 2001 From: fbarreir Date: Wed, 8 May 2019 10:43:50 +0200 Subject: [PATCH 003/181] typo --- pandaharvester/harvesterbody/submitter.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandaharvester/harvesterbody/submitter.py b/pandaharvester/harvesterbody/submitter.py index a7970cc4..53c3b0f1 100644 --- a/pandaharvester/harvesterbody/submitter.py +++ b/pandaharvester/harvesterbody/submitter.py @@ -38,7 +38,7 @@ def __init__(self, queue_config_mapper, single_mode=False): def run(self): locked_by = 'submitter-{0}'.format(self.get_pid()) monitor_fifo = self.monitor_fifo - queue_lock_interval = getattr(harvester_config.submitter, 'queue_lock_interval', + queue_lock_interval = getattr(harvester_config.submitter, 'queueLockInterval', 
harvester_config.submitter.lockInterval)
         while True:
             sw_main = core_utils.get_stopwatch()

From 98875e5ae9f5c8737e145bf714033ee3ce6b35fb Mon Sep 17 00:00:00 2001
From: fbarreir
Date: Fri, 30 Aug 2019 17:28:23 +0200
Subject: [PATCH 004/181] Prodanaly: correction on unique queue name

---
 pandaharvester/harvestercore/core_utils.py          | 4 ++--
 pandaharvester/harvestercore/db_proxy.py            | 2 +-
 pandaharvester/harvestercore/queue_config_mapper.py | 2 +-
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/pandaharvester/harvestercore/core_utils.py b/pandaharvester/harvestercore/core_utils.py
index 5a478446..560fd77a 100644
--- a/pandaharvester/harvestercore/core_utils.py
+++ b/pandaharvester/harvestercore/core_utils.py
@@ -586,8 +586,8 @@ def get_queues_config_url():
 
 
 # get unique queue name
-def get_unique_queue_name(queue_name, resource_type):
-    return '{0}:{1}:{2}'.format(queue_name, resource_type)
+def get_unique_queue_name(queue_name, resource_type, job_type):
+    return '{0}:{1}:{2}'.format(queue_name, resource_type, job_type)
 
 
 # capability to dynamically change plugins
diff --git a/pandaharvester/harvestercore/db_proxy.py b/pandaharvester/harvestercore/db_proxy.py
index 1cbb27b7..c0344be6 100644
--- a/pandaharvester/harvestercore/db_proxy.py
+++ b/pandaharvester/harvestercore/db_proxy.py
@@ -3698,7 +3698,7 @@ def clone_queue_with_new_job_and_resource_type(self, site_name, queue_name, job_
                     elif attribute == 'nNewWorkers':
                         var_map[attr_binding] = new_workers
                     elif attribute == 'uniqueName':
-                        var_map[attr_binding] = core_utils.get_unique_queue_name(queue_name, resource_type)
+                        var_map[attr_binding] = core_utils.get_unique_queue_name(queue_name, resource_type, job_type)
                     else:
                         var_map[attr_binding] = value
                     attribute_list.append(attribute)
diff --git a/pandaharvester/harvestercore/queue_config_mapper.py b/pandaharvester/harvestercore/queue_config_mapper.py
index 0471c898..4e6f9b48 100644
--- a/pandaharvester/harvestercore/queue_config_mapper.py
+++ b/pandaharvester/harvestercore/queue_config_mapper.py
@@ -82,7 +82,7 @@ def get_source_label(self):
 
     # set unique name
     def set_unique_name(self):
-        self.uniqueName = core_utils.get_unique_queue_name(self.queueName, self.resourceType)
+        self.uniqueName = core_utils.get_unique_queue_name(self.queueName, self.resourceType, self.prodSourceLabel)
 
     # update attributes
     def update_attributes(self, data):

From 7aa3ce3cfa9b679efc83a8e150444798702509e3 Mon Sep 17 00:00:00 2001
From: fbarreir
Date: Mon, 2 Sep 2019 14:10:19 +0200
Subject: [PATCH 005/181] Prodanaly: initialize jobtype in panda queue

---
 pandaharvester/harvestercore/queue_config_mapper.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/pandaharvester/harvestercore/queue_config_mapper.py b/pandaharvester/harvestercore/queue_config_mapper.py
index 4e6f9b48..c7bb7dd7 100644
--- a/pandaharvester/harvestercore/queue_config_mapper.py
+++ b/pandaharvester/harvestercore/queue_config_mapper.py
@@ -56,6 +56,7 @@ def __init__(self, queue_name):
         self.noHeartbeat = ''
         self.runMode = 'self'
         self.resourceType = PandaQueueSpec.RT_catchall
+        self.jobType = PandaQueueSpec.JT_catchall
         self.getJobCriteria = None
         self.ddmEndpointIn = None
         self.allowJobMixture = False

From 59127ee44236fb1ed03ab8e633a96552af27844e Mon Sep 17 00:00:00 2001
From: fbarreir
Date: Tue, 3 Sep 2019 14:19:35 +0200
Subject: [PATCH 006/181] Prodanaly: PEP8 convention issues

---
 pandaharvester/harvesterbody/worker_adjuster.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)
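With the job type added in the patches above, a unified PanDA queue is now identified by a three-part unique name, and queues default to the JT_catchall value 'ANY' until a concrete job type is known. For illustration (queue name and label are examples only):

    from pandaharvester.harvestercore import core_utils

    core_utils.get_unique_queue_name('CERN-PROD_UCORE', 'SCORE', 'managed')
    # -> 'CERN-PROD_UCORE:SCORE:managed'

diff --git a/pandaharvester/harvesterbody/worker_adjuster.py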
b/pandaharvester/harvesterbody/worker_adjuster.py
index 2737a97d..32e13ac7 100644
--- a/pandaharvester/harvesterbody/worker_adjuster.py
+++ b/pandaharvester/harvesterbody/worker_adjuster.py
@@ -169,9 +169,9 @@ def define_num_workers(self, static_num_workers, site_name):
                             n_new_workers = min(n_new_workers, max(max_workers - n_queue - n_ready - n_running, 0))
                             tmp_log.debug('setting n_new_workers to {0} to respect max_workers'
                                           .format(n_new_workers))
-                        if queue_config.max_new_workers_per_cycle > 0:
-                            n_new_workers = min(n_new_workers, queue_config.max_new_workers_per_cycle)
-                            tmp_log.debug('setting n_new_workers to {0} in order to respect max_new_workers_per_cycle'
+                        if queue_config.maxNewWorkersPerCycle > 0:
+                            n_new_workers = min(n_new_workers, queue_config.maxNewWorkersPerCycle)
+                            tmp_log.debug('setting n_new_workers to {0} in order to respect maxNewWorkersPerCycle'
                                           .format(n_new_workers))
                         if self.maxNewWorkers is not None and self.maxNewWorkers > 0:
                             n_new_workers = min(n_new_workers, self.maxNewWorkers)
@@ -185,7 +185,7 @@ def define_num_workers(self, static_num_workers, site_name):
                     ret_msg = 'set max_new_workers_per_cycle=0 in UCORE aggregation due to missing queue_config'
                     tmp_log.debug(ret_msg)
                 else:
-                    max_new_workers_per_cycle = queue_config.max_new_workers_per_cycle
+                    max_new_workers_per_cycle = queue_config.maxNewWorkersPerCycle
                 if len(dyn_num_workers[queue_name]) > 1:
                     total_new_workers_rts = sum(dyn_num_workers[queue_name][_rt]['n_new_workers']
                                                 if _rt != 'ANY' else 0

From 21029d8a505464aa2bde5a42624bb5f7e09dd774 Mon Sep 17 00:00:00 2001
From: fbarreir
Date: Tue, 3 Sep 2019 15:10:08 +0200
Subject: [PATCH 007/181] Prodanaly: PEP8 conventions related issues

---
 pandaharvester/harvesterbody/worker_adjuster.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/pandaharvester/harvesterbody/worker_adjuster.py b/pandaharvester/harvesterbody/worker_adjuster.py
index 32e13ac7..ff0127d5 100644
--- a/pandaharvester/harvesterbody/worker_adjuster.py
+++ b/pandaharvester/harvesterbody/worker_adjuster.py
@@ -177,7 +177,7 @@ def define_num_workers(self, static_num_workers, site_name):
                             n_new_workers = min(n_new_workers, self.maxNewWorkers)
                             tmp_log.debug('setting n_new_workers to {0} in order to respect universal maxNewWorkers'
                                           .format(n_new_workers))
-                        dyn_num_workers[queue_name][job_type][resource_type]['n_new_workers'] = n_new_workers
+                        dyn_num_workers[queue_name][job_type][resource_type]['nNewWorkers'] = n_new_workers
 
             # adjust n_new_workers for UCORE to let aggregations over RT respect nQueueLimitWorker and max_workers
             if queue_config is None:
@@ -187,7 +187,7 @@ def define_num_workers(self, static_num_workers, site_name):
                 else:
                     max_new_workers_per_cycle = queue_config.maxNewWorkersPerCycle
                 if len(dyn_num_workers[queue_name]) > 1:
-                    total_new_workers_rts = sum(dyn_num_workers[queue_name][_rt]['n_new_workers']
+                    total_new_workers_rts = sum(dyn_num_workers[queue_name][_rt]['nNewWorkers']
                                                 if _rt != 'ANY' else 0
                                                 for _rt in dyn_num_workers[queue_name])
                     n_new_workers_max_agg = min(max(n_queue_limit - n_queue_total, 0),
@@ -213,7 +213,7 @@ def define_num_workers(self, static_num_workers, site_name):
                     simple_rt_nw_list = []
                     for _jt in _d:  # jt: job type
                         for _rt in _d[_jt]:  # rt: resource type
-                            simple_rt_nw_list.append = [_d[_jt][_rt].get('n_new_workers', 0), 0]
+                            simple_rt_nw_list.append = [_d[_jt][_rt].get('nNewWorkers', 0), 0]
 
                     _countdown = n_new_workers_max_agg
                     for _rt_list in simple_rt_nw_list:
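For orientation, the per-site stats that define_num_workers() receives and fills are now nested one level deeper: keyed by queue name, then job type, then resource type. A hypothetical snapshot (queue name and counts are invented):

    dyn_num_workers = {
        'CERN-PROD_UCORE': {
            'managed': {
                'SCORE': {'nReady': 0, 'nRunning': 20, 'nQueue': 5, 'nNewWorkers': 3},
                'MCORE': {'nReady': 1, 'nRunning': 10, 'nQueue': 2, 'nNewWorkers': 1},
            },
            'user': {
                'SCORE': {'nReady': 0, 'nRunning': 4, 'nQueue': 1, 'nNewWorkers': 0},
            },
        },
    }

    # the UCORE aggregation sums nNewWorkers over all concrete
    # (job type, resource type) pairs, skipping the 'ANY' catch-alls
    total = sum(v['nNewWorkers']
                for jt, rts in dyn_num_workers['CERN-PROD_UCORE'].items() if jt != 'ANY'
                for rt, v in rts.items() if rt != 'ANY')

From 8144f70b338ae969e0ea257c1ae3f57acbcf6cc3 Mon Sep 17 00:00:00 2001
From: fbarreir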
Date: Tue, 3 Sep 2019 15:31:11 +0200 Subject: [PATCH 008/181] Prodanaly: make_workers missing job_type argument --- pandaharvester/harvesterbody/worker_maker.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandaharvester/harvesterbody/worker_maker.py b/pandaharvester/harvesterbody/worker_maker.py index 7557ea5f..3ca7d6a8 100644 --- a/pandaharvester/harvesterbody/worker_maker.py +++ b/pandaharvester/harvesterbody/worker_maker.py @@ -18,7 +18,7 @@ def get_plugin(self, queue_config): return self.pluginFactory.get_plugin(queue_config.workerMaker) # make workers - def make_workers(self, jobchunk_list, queue_config, n_ready, resource_type, maker=None): + def make_workers(self, jobchunk_list, queue_config, n_ready, job_type, resource_type, maker=None): tmpLog = core_utils.make_logger(_logger, 'queue={0} rtype={1}'.format(queue_config.queueName, resource_type), method_name='make_workers') tmpLog.debug('start') @@ -38,7 +38,7 @@ def make_workers(self, jobchunk_list, queue_config, n_ready, resource_type, make for iChunk, jobChunk in enumerate(jobchunk_list): # make a worker if iChunk >= n_ready: - workSpec = maker.make_worker(jobChunk, queue_config, resource_type) + workSpec = maker.make_worker(jobChunk, queue_config, job_type, resource_type) else: # use ready worker if iChunk < len(readyWorkers): From ba8d3a87d0f361ae55feb1e86378a369c02d6535 Mon Sep 17 00:00:00 2001 From: fbarreir Date: Tue, 3 Sep 2019 15:36:12 +0200 Subject: [PATCH 009/181] Prodanaly: PEP8 related typo --- pandaharvester/harvesterbody/submitter.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandaharvester/harvesterbody/submitter.py b/pandaharvester/harvesterbody/submitter.py index 53c3b0f1..ff7fbee1 100644 --- a/pandaharvester/harvesterbody/submitter.py +++ b/pandaharvester/harvesterbody/submitter.py @@ -242,7 +242,7 @@ def run(self): # map type work_spec.mapType = queue_config.mapType # queue name - work_spec.computingSite = queue_config.queue_name + work_spec.computingSite = queue_config.queueName # set access point work_spec.accessPoint = queue_config.messenger['accessPoint'] # sync level From d11c767db8dde6ae7e4a78c4737c26c0c704724f Mon Sep 17 00:00:00 2001 From: fbarreir Date: Wed, 4 Sep 2019 16:28:47 +0200 Subject: [PATCH 010/181] Prodanaly: small cleanup of unused import, empty lines, unused variables --- pandaharvester/harvestersubmitter/htcondor_submitter.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/pandaharvester/harvestersubmitter/htcondor_submitter.py b/pandaharvester/harvestersubmitter/htcondor_submitter.py index b2468de6..366b9cdf 100644 --- a/pandaharvester/harvestersubmitter/htcondor_submitter.py +++ b/pandaharvester/harvestersubmitter/htcondor_submitter.py @@ -7,7 +7,7 @@ from concurrent.futures import ThreadPoolExecutor import re -from math import sqrt, log1p +from math import log1p from pandaharvester.harvesterconfig import harvester_config from pandaharvester.harvestercore.queue_config_mapper import QueueConfigMapper @@ -227,8 +227,6 @@ def submit_bag_of_workers(data_list): worker_retval_map[workerID] = (tmpRetVal, workspec.get_changed_attributes()) # attributes try: - ce_info_dict = data['ce_info_dict'] - batch_log_dict = data['batch_log_dict'] use_spool = data['use_spool'] except KeyError: errStr = '{0} not submitted due to incomplete data of the worker'.format(workerID) @@ -592,8 +590,6 @@ def submit_workers(self, workspec_list): tmpLog.error('No valid CE endpoint found') to_submit_any = False - - def _handle_one_worker(workspec, 
to_submit=to_submit_any): # make logger tmpLog = core_utils.make_logger(baseLogger, 'workerID={0}'.format(workspec.workerID), From 7688a0232b9c533647d2689ad3bb3a6c01af68f7 Mon Sep 17 00:00:00 2001 From: fbarreir Date: Fri, 6 Sep 2019 18:21:42 +0200 Subject: [PATCH 011/181] Prodanaly: choose proxy depending on the job_type for the worker --- .../harvestersubmitter/htcondor_submitter.py | 36 ++++++++++++++++--- 1 file changed, 31 insertions(+), 5 deletions(-) diff --git a/pandaharvester/harvestersubmitter/htcondor_submitter.py b/pandaharvester/harvestersubmitter/htcondor_submitter.py index fbd43bc1..0ef4d81b 100644 --- a/pandaharvester/harvestersubmitter/htcondor_submitter.py +++ b/pandaharvester/harvestersubmitter/htcondor_submitter.py @@ -43,7 +43,8 @@ def _get_ce_weighting(ce_endpoint_list=[], worker_ce_all_tuple=None): for _ce in worker_ce_backend_throughput_dict)) thruput_avg = (log1p(Q_good_init) - log1p(Q_good_fin)) n_new_workers = float(n_new_workers) - def _get_thruput(_ce_endpoint): + + def _get_thruput(_ce_endpoint): # inner function if _ce_endpoint not in worker_ce_backend_throughput_dict: q_good_init = 0. q_good_fin = 0. @@ -54,7 +55,8 @@ def _get_thruput(_ce_endpoint): for _st in ('submitted',))) thruput = (log1p(q_good_init) - log1p(q_good_fin)) return thruput - def _get_thruput_adj_ratio(thruput): + + def _get_thruput_adj_ratio(thruput): # inner function try: thruput_adj_ratio = thruput/thruput_avg + 1/N except ZeroDivisionError: @@ -65,7 +67,8 @@ def _get_thruput_adj_ratio(thruput): return thruput_adj_ratio ce_base_weight_sum = sum((_get_thruput_adj_ratio(_get_thruput(_ce)) for _ce in ce_endpoint_list)) - def _get_init_weight(_ce_endpoint): + + def _get_init_weight(_ce_endpoint): # inner function if _ce_endpoint not in worker_ce_stats_dict: q = 0. r = 0. 
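The inner functions above weight CEs by log1p-compressed throughput, so a CE that finished ten times more work only gets a modestly larger share, and the 1/N term keeps idle CEs from starving. A simplified, self-contained illustration (CE names and counts are invented; the real code also folds in queue depths and failure rates):

    from math import log1p

    # good workers per CE at the start/end of the look-back window (hypothetical)
    q_good_init = {'ce1.example.org': 100.0, 'ce2.example.org': 10.0}
    q_good_fin = {'ce1.example.org': 20.0, 'ce2.example.org': 5.0}

    n = len(q_good_init)
    thruput = {ce: log1p(q_good_init[ce]) - log1p(q_good_fin[ce]) for ce in q_good_init}
    thruput_avg = sum(thruput.values()) / n

    # throughput share plus a 1/N floor, mirroring _get_thruput_adj_ratio
    adj_ratio = {ce: t / thruput_avg + 1.0 / n for ce, t in thruput.items()}
    total = sum(adj_ratio.values())
    weights = {ce: r / total for ce, r in adj_ratio.items()}
    print(weights)  # ce1 dominates (~0.65), but ce2 keeps a non-zero share (~0.35)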
@@ -454,11 +457,16 @@ def __init__(self, **kwarg): self.logDir except AttributeError: self.logDir = os.getenv('TMPDIR') or '/tmp' - # x509 proxy + # Default x509 proxy for a queue try: self.x509UserProxy except AttributeError: self.x509UserProxy = os.getenv('X509_USER_PROXY') + # x509 proxy for analysis jobs in grandly unified queues + try: + self.x509UserProxyAnalysis + except AttributeError: + self.x509UserProxyAnalysis = os.getenv('X509_USER_PROXY_ANAL') # ATLAS AGIS try: self.useAtlasAGIS = bool(self.useAtlasAGIS) @@ -732,6 +740,10 @@ def _handle_one_worker(workspec, to_submit=to_submit_any): batch_log_dict['gtag'] = workspec.workAttributes['stdOut'] tmpLog.debug('Done set_log_file before submission') tmpLog.debug('Done jobspec attribute setting') + + # choose the x509 certificate based on the type of job (analysis or production) + proxy = _choose_proxy(workspec) + # set data dict data.update({ 'workspec': workspec, @@ -742,7 +754,7 @@ def _handle_one_worker(workspec, to_submit=to_submit_any): 'log_subdir': log_subdir, 'n_core_per_node': n_core_per_node, 'panda_queue_name': panda_queue_name, - 'x509_user_proxy': self.x509UserProxy, + 'x509_user_proxy': proxy, 'ce_info_dict': ce_info_dict, 'batch_log_dict': batch_log_dict, 'special_par': special_par, @@ -755,6 +767,20 @@ def _handle_one_worker(workspec, to_submit=to_submit_any): }) return data + def _choose_proxy(workspec): + """ + Choose the proxy based on the job type + """ + job_type = workspec.job_type + proxy = self.x509UserProxy + if (job_type == 'user' or job_type == 'analysis') and self.x509UserProxyAnalysis: + tmpLog.debug('Taking analysis proxy') + proxy = self.x509UserProxyAnalysis + else: + tmpLog.debug('Taking default proxy') + + return proxy + def _propagate_attributes(workspec, tmpVal): # make logger tmpLog = core_utils.make_logger(baseLogger, 'workerID={0}'.format(workspec.workerID), From a12c51db40af7b0e17979902ecce55d2e5dfe8c0 Mon Sep 17 00:00:00 2001 From: fbarreir Date: Mon, 9 Sep 2019 14:20:26 +0200 Subject: [PATCH 012/181] Prodanaly: choose prod vs pilot proxy and small details --- pandaharvester/harvesterbody/worker_maker.py | 2 +- pandaharvester/harvestermisc/htcondor_utils.py | 2 -- pandaharvester/harvestermisc/info_utils.py | 16 +++++++++++++++- .../harvestersubmitter/htcondor_submitter.py | 4 +++- 4 files changed, 19 insertions(+), 5 deletions(-) diff --git a/pandaharvester/harvesterbody/worker_maker.py b/pandaharvester/harvesterbody/worker_maker.py index 3ca7d6a8..076367f1 100644 --- a/pandaharvester/harvesterbody/worker_maker.py +++ b/pandaharvester/harvesterbody/worker_maker.py @@ -19,7 +19,7 @@ def get_plugin(self, queue_config): # make workers def make_workers(self, jobchunk_list, queue_config, n_ready, job_type, resource_type, maker=None): - tmpLog = core_utils.make_logger(_logger, 'queue={0} rtype={1}'.format(queue_config.queueName, resource_type), + tmpLog = core_utils.make_logger(_logger, 'queue={0} rtype={1} jtype={2}'.format(queue_config.queueName, resource_type, job_type), method_name='make_workers') tmpLog.debug('start') try: diff --git a/pandaharvester/harvestermisc/htcondor_utils.py b/pandaharvester/harvestermisc/htcondor_utils.py index ed37ca03..15ad7aed 100644 --- a/pandaharvester/harvestermisc/htcondor_utils.py +++ b/pandaharvester/harvestermisc/htcondor_utils.py @@ -3,7 +3,6 @@ import re import time -import datetime import threading import random import multiprocessing @@ -27,7 +26,6 @@ from pandaharvester.harvestercore import core_utils from pandaharvester.harvesterconfig import 
harvester_config from pandaharvester.harvestercore.core_utils import SingletonWithID -from pandaharvester.harvestercore.work_spec import WorkSpec from pandaharvester.harvestercore.fifos import SpecialFIFOBase # condor python or command api diff --git a/pandaharvester/harvestermisc/info_utils.py b/pandaharvester/harvestermisc/info_utils.py index cb22af4c..289a9a9c 100644 --- a/pandaharvester/harvestermisc/info_utils.py +++ b/pandaharvester/harvestermisc/info_utils.py @@ -69,7 +69,7 @@ def get_all_queue_names(self): names = set() for queue_name, queue_dict in iteritems(self): if queue_dict.get('pilot_manager') in ['Harvester'] \ - and queue_dict.get('harvester') == harvesterID: + and queue_dict.get('harvester') == harvesterID: names.add(queue_name) return names @@ -83,6 +83,20 @@ def is_ups_queue(self, panda_resource): return True return False + # is grandly unified queue, i.e. runs analysis and production + def is_grandly_unified_queue(self, panda_resource): + panda_queue_dict = self.get(panda_resource) + if panda_queue_dict is None: + return False + + # initial, temporary nomenclature + if 'grandly_unified' in panda_queue_dict.get('catchall'): + return True + + # TODO: implement the final nomenclature + + return False + # get harvester params def get_harvester_params(self, panda_resource): panda_queue_dict = self.get(panda_resource) diff --git a/pandaharvester/harvestersubmitter/htcondor_submitter.py b/pandaharvester/harvestersubmitter/htcondor_submitter.py index 0ef4d81b..81f6ea17 100644 --- a/pandaharvester/harvestersubmitter/htcondor_submitter.py +++ b/pandaharvester/harvestersubmitter/htcondor_submitter.py @@ -547,11 +547,13 @@ def submit_workers(self, workspec_list): _queueConfigMapper = QueueConfigMapper() harvester_queue_config = _queueConfigMapper.get_queue(self.queueName) + is_grandly_unified_queue = False # get queue info from AGIS by cacher in db if self.useAtlasAGIS: panda_queues_dict = PandaQueuesDict() panda_queue_name = panda_queues_dict.get_panda_queue_name(self.queueName) this_panda_queue_dict = panda_queues_dict.get(self.queueName, dict()) + is_grandly_unified_queue = panda_queues_dict.is_grandly_unified_queue(self.queueName) # tmpLog.debug('panda_queues_name and queue_info: {0}, {1}'.format(self.queueName, panda_queues_dict[self.queueName])) else: panda_queues_dict = dict() @@ -773,7 +775,7 @@ def _choose_proxy(workspec): """ job_type = workspec.job_type proxy = self.x509UserProxy - if (job_type == 'user' or job_type == 'analysis') and self.x509UserProxyAnalysis: + if is_grandly_unified_queue and (job_type == 'user' or job_type == 'analysis') and self.x509UserProxyAnalysis: tmpLog.debug('Taking analysis proxy') proxy = self.x509UserProxyAnalysis else: From 6eda97ad68bfc85bbc2eb505262d2c189ec76448 Mon Sep 17 00:00:00 2001 From: fbarreir Date: Tue, 10 Sep 2019 10:44:34 +0200 Subject: [PATCH 013/181] typo --- pandaharvester/harvestersubmitter/htcondor_submitter.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandaharvester/harvestersubmitter/htcondor_submitter.py b/pandaharvester/harvestersubmitter/htcondor_submitter.py index 81f6ea17..b5a8a57f 100644 --- a/pandaharvester/harvestersubmitter/htcondor_submitter.py +++ b/pandaharvester/harvestersubmitter/htcondor_submitter.py @@ -773,7 +773,7 @@ def _choose_proxy(workspec): """ Choose the proxy based on the job type """ - job_type = workspec.job_type + job_type = workspec.jobType proxy = self.x509UserProxy if is_grandly_unified_queue and (job_type == 'user' or job_type == 'analysis') and 
self.x509UserProxyAnalysis:
             tmpLog.debug('Taking analysis proxy')

From 92f37b84bcf9e41e0e07206215df688391cf527f Mon Sep 17 00:00:00 2001
From: fbarreir
Date: Tue, 10 Sep 2019 11:19:07 +0200
Subject: [PATCH 014/181] typos

---
 pandaharvester/harvestercore/core_utils.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/pandaharvester/harvestercore/core_utils.py b/pandaharvester/harvestercore/core_utils.py
index e53efe15..caa216b3 100644
--- a/pandaharvester/harvestercore/core_utils.py
+++ b/pandaharvester/harvestercore/core_utils.py
@@ -613,10 +613,10 @@ def _asdict(self):
         return dict(zip(self.attributes, self))
 
 
-# Make a list of choice candidates accroding to permille weight
+# Make a list of choice candidates according to permille weight
 def make_choice_list(pdpm={}, default=None):
     weight_sum = sum(pdpm.values())
-    weight_defualt = 1000
+    weight_default = 1000
     ret_list = []
     for candidate, weight in iteritems(pdpm):
         if weight_sum > 1000:
@@ -624,8 +624,8 @@ def make_choice_list(pdpm={}, default=None):
         else:
             real_weight = int(weight)
         ret_list.extend([candidate]*real_weight)
-        weight_defualt -= real_weight
-    ret_list.extend([default]*weight_defualt)
+        weight_default -= real_weight
+    ret_list.extend([default]*weight_default)
     return ret_list

From e470afc4f30e511da73dc61bad6ab94a6104e6a2 Mon Sep 17 00:00:00 2001
From: fbarreir
Date: Thu, 12 Sep 2019 11:29:00 +0200
Subject: [PATCH 015/181] prodanaly: set_queue_limit correction

---
 pandaharvester/harvestercore/db_proxy.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/pandaharvester/harvestercore/db_proxy.py b/pandaharvester/harvestercore/db_proxy.py
index 51a10c05..0076822e 100644
--- a/pandaharvester/harvestercore/db_proxy.py
+++ b/pandaharvester/harvestercore/db_proxy.py
@@ -3800,10 +3800,10 @@ def set_queue_limit(self, site_name, params):
             varMap = dict()
             varMap[':siteName'] = site_name
             self.execute(sql_get_job_resource, varMap)
-            res_res = self.cur.fetchall()
+            results = self.cur.fetchall()
             job_resource_type_list = set()
-            for tmp_res, in res_res:
-                job_resource_type_list.add(tmp_res)
+            for tmp_job_type, tmp_resource_type in results:
+                job_resource_type_list.add(tmp_job_type, tmp_resource_type)
 
             # set all queues
             nUp = 0

From 0f35dfd52af51b6d00ea7e90991745790dfc41aa Mon Sep 17 00:00:00 2001
From: fbarreir
Date: Thu, 12 Sep 2019 13:37:02 +0200
Subject: [PATCH 016/181] prodanaly: set_queue_limit correction

---
 pandaharvester/harvestercore/db_proxy.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
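The one-line fix below is needed because set.add() takes exactly one element; to key the set by the (job type, resource type) pair, the two values have to be wrapped in a single tuple:

    job_resource_type_list = set()
    job_resource_type_list.add(('managed', 'SCORE'))    # one tuple element: OK
    ('managed', 'SCORE') in job_resource_type_list      # True
    # job_resource_type_list.add('managed', 'SCORE')   # TypeError: add() takes exactly one argument (2 given)

diff --git a/pandaharvester/harvestercore/db_proxy.py b/pandaharvester/harvestercore/db_proxy.py
index 79ac5187..1b21ece9 100644
--- a/pandaharvester/harvestercore/db_proxy.py
+++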
b/pandaharvester/harvestercore/db_proxy.py @@ -3787,7 +3787,7 @@ def set_queue_limit(self, site_name, params): # sql to get num of submitted workers sql_count_workers = "SELECT COUNT(*) cnt " sql_count_workers += "FROM {0} wt, {1} pq ".format(workTableName, pandaQueueTableName) - sql_count_workers += "WHERE pq.siteName=:siteName AND wt.computingSite=pq.queueName AND wt.status=:status" + sql_count_workers += "WHERE pq.siteName=:siteName AND wt.computingSite=pq.queueName AND wt.status=:status " sql_count_workers += "AND pq.jobType=:jobType AND pq.resourceType=:resourceType " # reset nqueued for all job & resource types From e52ea97900e7137c3affb9b617b0047f15448996 Mon Sep 17 00:00:00 2001 From: fbarreir Date: Thu, 12 Sep 2019 14:18:36 +0200 Subject: [PATCH 018/181] prodanaly: set_queue_limit correction --- pandaharvester/harvestercore/db_proxy.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandaharvester/harvestercore/db_proxy.py b/pandaharvester/harvestercore/db_proxy.py index 1b21ece9..9fd8938d 100644 --- a/pandaharvester/harvestercore/db_proxy.py +++ b/pandaharvester/harvestercore/db_proxy.py @@ -3841,8 +3841,8 @@ def set_queue_limit(self, site_name, params): ret_map[job_type][resource_type] = value else: # no queue was updated, we need to create a new one for the resource type - cloned = self.clone_queue_with_new_resource_type(site_name, queue_name, job_type, - resource_type, value) + cloned = self.clone_queue_with_new_job_and_resource_type(site_name, queue_name, job_type, + resource_type, value) if cloned: ret_map[job_type][resource_type] = value iUp = 1 From c95a59dc908178f4f8ac74834addb9df47ef7811 Mon Sep 17 00:00:00 2001 From: fbarreir Date: Thu, 12 Sep 2019 15:13:56 +0200 Subject: [PATCH 019/181] prodanaly: get_queues_to_submit correction --- pandaharvester/harvestercore/db_proxy.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandaharvester/harvestercore/db_proxy.py b/pandaharvester/harvestercore/db_proxy.py index 9fd8938d..49e587ba 100644 --- a/pandaharvester/harvestercore/db_proxy.py +++ b/pandaharvester/harvestercore/db_proxy.py @@ -1524,7 +1524,8 @@ def get_queues_to_submit(self, n_queues, lookup_interval, lock_interval, locked_ nReFill, = self.cur.fetchone() nReady += nReFill # add - retMap.setdefault(queueName, {jobType: {}}) + retMap.setdefault(queueName, {}) + retMap[queueName].setdefault(jobType, {}) retMap[queueName][jobType][resourceType] = {'nReady': nReady, 'nRunning': nRunning, 'nQueue': nQueue, From 71c25be6c34e4c2e59a133479de7f157a5ae7da6 Mon Sep 17 00:00:00 2001 From: fbarreir Date: Thu, 12 Sep 2019 17:40:57 +0200 Subject: [PATCH 020/181] prodanaly: worker_adjuster correction --- pandaharvester/harvesterbody/worker_adjuster.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/pandaharvester/harvesterbody/worker_adjuster.py b/pandaharvester/harvesterbody/worker_adjuster.py index ff0127d5..82cc0869 100644 --- a/pandaharvester/harvesterbody/worker_adjuster.py +++ b/pandaharvester/harvesterbody/worker_adjuster.py @@ -187,9 +187,11 @@ def define_num_workers(self, static_num_workers, site_name): else: max_new_workers_per_cycle = queue_config.maxNewWorkersPerCycle if len(dyn_num_workers[queue_name]) > 1: - total_new_workers_rts = sum(dyn_num_workers[queue_name][_rt]['nNewWorkers'] - if _rt != 'ANY' else 0 - for _rt in dyn_num_workers[queue_name]) + total_new_workers_rts = 0 + for _jt in dyn_num_workers[queue_name]; + for _rt in dyn_num_workers[queue_name][_jt]: + if _jt != 'ANY' and _rt != 'ANY': + 
total_new_workers_rts = total_new_workers_rts + dyn_num_workers[queue_name][_jt][_rt]['nNewWorkers'] n_new_workers_max_agg = min(max(n_queue_limit - n_queue_total, 0), max(max_workers - n_queue_total - n_ready_total - n_running_total, 0)) if max_new_workers_per_cycle >= 0: From e4ed699019f023da4ebf946ff891a45b94b3fd8c Mon Sep 17 00:00:00 2001 From: fbarreir Date: Thu, 12 Sep 2019 17:49:45 +0200 Subject: [PATCH 021/181] typo --- pandaharvester/harvesterbody/worker_adjuster.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandaharvester/harvesterbody/worker_adjuster.py b/pandaharvester/harvesterbody/worker_adjuster.py index 82cc0869..470b6a87 100644 --- a/pandaharvester/harvesterbody/worker_adjuster.py +++ b/pandaharvester/harvesterbody/worker_adjuster.py @@ -188,7 +188,7 @@ def define_num_workers(self, static_num_workers, site_name): max_new_workers_per_cycle = queue_config.maxNewWorkersPerCycle if len(dyn_num_workers[queue_name]) > 1: total_new_workers_rts = 0 - for _jt in dyn_num_workers[queue_name]; + for _jt in dyn_num_workers[queue_name]: for _rt in dyn_num_workers[queue_name][_jt]: if _jt != 'ANY' and _rt != 'ANY': total_new_workers_rts = total_new_workers_rts + dyn_num_workers[queue_name][_jt][_rt]['nNewWorkers'] From a13c53209dbb1961ecce464eff1608cae0d48678 Mon Sep 17 00:00:00 2001 From: fbarreir Date: Thu, 12 Sep 2019 17:53:53 +0200 Subject: [PATCH 022/181] prodanaly: worker_adjuster correction --- pandaharvester/harvesterbody/worker_adjuster.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandaharvester/harvesterbody/worker_adjuster.py b/pandaharvester/harvesterbody/worker_adjuster.py index 470b6a87..e8ff97ff 100644 --- a/pandaharvester/harvesterbody/worker_adjuster.py +++ b/pandaharvester/harvesterbody/worker_adjuster.py @@ -215,7 +215,7 @@ def define_num_workers(self, static_num_workers, site_name): simple_rt_nw_list = [] for _jt in _d: # jt: job type for _rt in _d[_jt]: # rt: resource type - simple_rt_nw_list.append = [_d[_jt][_rt].get('nNewWorkers', 0), 0] + simple_rt_nw_list.append([_d[_jt][_rt].get('nNewWorkers', 0), 0]) _countdown = n_new_workers_max_agg for _rt_list in simple_rt_nw_list: From da46134eaf110edbaea907a9abaf5955074d2b8f Mon Sep 17 00:00:00 2001 From: fbarreir Date: Thu, 12 Sep 2019 17:57:50 +0200 Subject: [PATCH 023/181] prodanaly: typo --- pandaharvester/harvesterbody/worker_adjuster.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandaharvester/harvesterbody/worker_adjuster.py b/pandaharvester/harvesterbody/worker_adjuster.py index e8ff97ff..56a4dc5f 100644 --- a/pandaharvester/harvesterbody/worker_adjuster.py +++ b/pandaharvester/harvesterbody/worker_adjuster.py @@ -215,7 +215,7 @@ def define_num_workers(self, static_num_workers, site_name): simple_rt_nw_list = [] for _jt in _d: # jt: job type for _rt in _d[_jt]: # rt: resource type - simple_rt_nw_list.append([_d[_jt][_rt].get('nNewWorkers', 0), 0]) + simple_rt_nw_list.append([_rt, _d[_jt][_rt].get('nNewWorkers', 0), 0]) _countdown = n_new_workers_max_agg for _rt_list in simple_rt_nw_list: From 4a6e2dded603b8a49ebdf002c9135a94f3b6b718 Mon Sep 17 00:00:00 2001 From: fbarreir Date: Thu, 12 Sep 2019 18:48:35 +0200 Subject: [PATCH 024/181] prodanaly: corrected query column order in get_queues_to_submit --- pandaharvester/harvestercore/db_proxy.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandaharvester/harvestercore/db_proxy.py b/pandaharvester/harvestercore/db_proxy.py index 49e587ba..73c50c67 100644 --- 
a/pandaharvester/harvestercore/db_proxy.py +++ b/pandaharvester/harvestercore/db_proxy.py @@ -1416,7 +1416,7 @@ def get_queues_to_submit(self, n_queues, lookup_interval, lock_interval, locked_ sqlS += "OR (submitTime<:lookupTimeLimit AND lockedBy IS NULL) " sqlS += "ORDER BY submitTime " # sql to get queues - sqlQ = "SELECT queueName, resourceType, jobType, nNewWorkers FROM {0} ".format(pandaQueueTableName) + sqlQ = "SELECT queueName, jobType, resourceType, nNewWorkers FROM {0} ".format(pandaQueueTableName) sqlQ += "WHERE siteName=:siteName " # sql to get orphaned workers sqlO = "SELECT workerID FROM {0} ".format(workTableName) From 32a226146746a9b7df58f2b30a1bc0dd65e9cdba Mon Sep 17 00:00:00 2001 From: fbarreir Date: Thu, 12 Sep 2019 19:06:28 +0200 Subject: [PATCH 025/181] prodanaly: worker_adjuster correction --- .../harvesterbody/worker_adjuster.py | 20 +++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/pandaharvester/harvesterbody/worker_adjuster.py b/pandaharvester/harvesterbody/worker_adjuster.py index 56a4dc5f..b5cd1576 100644 --- a/pandaharvester/harvesterbody/worker_adjuster.py +++ b/pandaharvester/harvesterbody/worker_adjuster.py @@ -213,9 +213,9 @@ def define_num_workers(self, static_num_workers, site_name): # TODO: needs to be recalculated simple_rt_nw_list = [] - for _jt in _d: # jt: job type - for _rt in _d[_jt]: # rt: resource type - simple_rt_nw_list.append([_rt, _d[_jt][_rt].get('nNewWorkers', 0), 0]) + for job_type in _d: # jt: job type + for resource_type in _d[job_type]: # rt: resource type + simple_rt_nw_list.append([resource_type, _d[job_type][resource_type].get('nNewWorkers', 0), 0]) _countdown = n_new_workers_max_agg for _rt_list in simple_rt_nw_list: @@ -232,13 +232,13 @@ def define_num_workers(self, static_num_workers, site_name): break dyn_num_workers[queue_name][job_type][resource_type]['nNewWorkers'] += 1 _countdown -= 1 - - for resource_type in dyn_num_workers[queue_name]: - if resource_type == 'ANY': - continue - n_new_workers = dyn_num_workers[queue_name][job_type][resource_type]['nNewWorkers'] - tmp_log.debug('setting n_new_workers to {0} of job_type {1} resource_type {2} in order to respect RT aggregations for UCORE' - .format(n_new_workers, job_type, resource_type)) + for job_type in dyn_num_workers[queue_name]: + for resource_type in dyn_num_workers[queue_name][job_type]: + if job_type == 'ANY' or resource_type == 'ANY': + continue + n_new_workers = dyn_num_workers[queue_name][job_type][resource_type]['nNewWorkers'] + tmp_log.debug('setting n_new_workers to {0} of job_type {1} resource_type {2} in order to respect RT aggregations for UCORE' + .format(n_new_workers, job_type, resource_type)) if not apf_msg: apf_data = copy.deepcopy(dyn_num_workers[queue_name]) From 85629e6647d534180fe5163394694123cae3fbcc Mon Sep 17 00:00:00 2001 From: fbarreir Date: Fri, 13 Sep 2019 16:26:29 +0200 Subject: [PATCH 026/181] prodanaly: various fixes --- pandaharvester/harvesterbody/submitter.py | 7 ++++--- pandaharvester/harvesterbody/worker_maker.py | 12 ++++++------ .../harvesterworkermaker/simple_worker_maker.py | 6 ++---- 3 files changed, 12 insertions(+), 13 deletions(-) diff --git a/pandaharvester/harvesterbody/submitter.py b/pandaharvester/harvesterbody/submitter.py index ff7fbee1..e2f15928 100644 --- a/pandaharvester/harvesterbody/submitter.py +++ b/pandaharvester/harvesterbody/submitter.py @@ -82,9 +82,9 @@ def run(self): else: # loop over all queues and resource types for queue_name in n_workers_per_queue_jt_rt: - for job_type, 
tmp_job_vals in iteritems(n_workers_per_queue_jt_rt[queue_name]):
-                    for resource_type, tmp_val in iteritems(tmp_job_vals):
-
+                for job_type in n_workers_per_queue_jt_rt[queue_name]:
+                    for resource_type in n_workers_per_queue_jt_rt[queue_name][job_type]:
+                        tmp_val = n_workers_per_queue_jt_rt[queue_name][job_type][resource_type]
                         tmp_log = self.make_logger(_logger, 'id={0} queue={1} jtype={2} rtype={3}'.format(
                             locked_by, queue_name, job_type, resource_type), method_name='run')
                         try:
@@ -196,6 +196,7 @@ def run(self):
                                 okChunks, ngChunks = self.workerMaker.make_workers(jobChunks, queue_config,
                                                                                    nReady, job_type, resource_type,
                                                                                    maker=workerMakerCore)
+
                                 if len(ngChunks) == 0:
                                     tmp_log.debug('successfully made {0} workers'.format(len(okChunks)))
                                 else:
diff --git a/pandaharvester/harvesterbody/worker_maker.py b/pandaharvester/harvesterbody/worker_maker.py
index 076367f1..8aae3440 100644
--- a/pandaharvester/harvesterbody/worker_maker.py
+++ b/pandaharvester/harvesterbody/worker_maker.py
@@ -19,7 +19,7 @@ def get_plugin(self, queue_config):
 
     # make workers
     def make_workers(self, jobchunk_list, queue_config, n_ready, job_type, resource_type, maker=None):
-        tmpLog = core_utils.make_logger(_logger, 'queue={0} rtype={1} jtype={2}'.format(queue_config.queueName, resource_type, job_type),
+        tmpLog = core_utils.make_logger(_logger, 'queue={0} jtype={1} rtype={2}'.format(queue_config.queueName, job_type, resource_type),
                                         method_name='make_workers')
         tmpLog.debug('start')
         try:
@@ -65,35 +65,35 @@ def make_workers(self, jobchunk_list, queue_config, n_ready, job_type, resource_
         return [], jobchunk_list
 
     # get number of jobs per worker
-    def get_num_jobs_per_worker(self, queue_config, n_workers, resource_type, maker=None):
+    def get_num_jobs_per_worker(self, queue_config, n_workers, job_type, resource_type, maker=None):
         # get plugin
         if maker is None:
             maker = self.pluginFactory.get_plugin(queue_config.workerMaker)
         return maker.get_num_jobs_per_worker(n_workers)
 
     # get number of workers per job
-    def get_num_workers_per_job(self, queue_config, n_workers, resource_type, maker=None):
+    def get_num_workers_per_job(self, queue_config, n_workers, job_type, resource_type, maker=None):
         # get plugin
         if maker is None:
             maker = self.pluginFactory.get_plugin(queue_config.workerMaker)
         return maker.get_num_workers_per_job(n_workers)
 
     # check number of ready resources
-    def num_ready_resources(self, queue_config, resource_type, maker=None):
+    def num_ready_resources(self, queue_config, job_type, resource_type, maker=None):
         # get plugin
         if maker is None:
             maker = self.pluginFactory.get_plugin(queue_config.workerMaker)
         return maker.num_ready_resources()
 
     # get upper limit on the cumulative total of workers per job
-    def get_max_workers_per_job_in_total(self, queue_config, resource_type, maker=None):
+    def get_max_workers_per_job_in_total(self, queue_config, job_type, resource_type, maker=None):
         # get plugin
         if maker is None:
             maker = self.pluginFactory.get_plugin(queue_config.workerMaker)
         return maker.get_max_workers_per_job_in_total()
 
     # get upper limit on the number of new workers per job in a cycle
-    def get_max_workers_per_job_per_cycle(self, queue_config, resource_type, maker=None):
+    def get_max_workers_per_job_per_cycle(self, queue_config, job_type, resource_type, maker=None):
         # get plugin
         if maker is None:
             maker = self.pluginFactory.get_plugin(queue_config.workerMaker)
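Taken together, these signature changes mean every worker-maker plugin now receives the job type alongside the resource type. A conforming plugin can be as small as the following sketch (illustrative only: class name, core counts and sizing logic are invented; a real maker would size the worker from the job chunk):

    from pandaharvester.harvestercore.work_spec import WorkSpec
    from pandaharvester.harvesterworkermaker.base_worker_maker import BaseWorkerMaker

    class MinimalWorkerMaker(BaseWorkerMaker):
        # make a worker from jobs, honoring the new (job_type, resource_type) pair
        def make_worker(self, jobspec_list, queue_config, job_type, resource_type):
            work_spec = WorkSpec()
            work_spec.jobType = job_type          # propagated to the submitter, e.g. for proxy choice
            work_spec.resourceType = resource_type
            work_spec.nCore = 1 if resource_type == 'SCORE' else 8
            return work_spec

diff --git a/pandaharvester/harvesterworkermaker/simple_worker_maker.py b/pandaharvester/harvesterworkermaker/simple_worker_maker.py
index fcab5b62..2433dd2c 100644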
--- a/pandaharvester/harvesterworkermaker/simple_worker_maker.py +++ b/pandaharvester/harvesterworkermaker/simple_worker_maker.py @@ -61,7 +61,7 @@ def get_job_type(self, job_spec, job_type, queue_dict): # make a worker from jobs def make_worker(self, jobspec_list, queue_config, job_type, resource_type): - tmpLog = self.make_logger(_logger, 'queue={0}'.format(queue_config.queueName), + tmpLog = self.make_logger(_logger, 'queue={0}:{1}:{2}'.format(queue_config.queueName, job_type, resource_type), method_name='make_worker') tmpLog.debug('jobspec_list: {0}'.format(jobspec_list)) @@ -160,8 +160,8 @@ def make_worker(self, jobspec_list, queue_config, job_type, resource_type): tmpLog.info('a worker has pilotType={0}'.format(workSpec.pilotType)) workSpec.jobType = self.get_job_type(None, job_type, queue_dict) + tmpLog.debug('get_job_type decided for job_type: {0}'.format(workSpec.jobType)) - # TODO: this needs to be improved with real resource types if resource_type and resource_type != 'ANY': workSpec.resourceType = resource_type elif workSpec.nCore == 1: @@ -169,6 +169,4 @@ def make_worker(self, jobspec_list, queue_config, job_type, resource_type): else: workSpec.resourceType = 'MCORE' - - return workSpec From 8065256cb74eff39409003561b5ed55cca67464d Mon Sep 17 00:00:00 2001 From: fbarreir Date: Thu, 10 Oct 2019 09:52:28 +0200 Subject: [PATCH 027/181] prodanaly: added sitename to get_worker_limits log messages --- pandaharvester/harvestercore/db_proxy.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandaharvester/harvestercore/db_proxy.py b/pandaharvester/harvestercore/db_proxy.py index 73c50c67..0f1df9c4 100644 --- a/pandaharvester/harvestercore/db_proxy.py +++ b/pandaharvester/harvestercore/db_proxy.py @@ -4407,7 +4407,7 @@ def increment_submission_attempt(self, panda_id, new_number): def get_worker_limits(self, site_name): try: # get logger - tmpLog = core_utils.make_logger(_logger, method_name='get_worker_limits') + tmpLog = core_utils.make_logger(_logger, token='site_name={0}'.format(site_name), method_name='get_worker_limits') tmpLog.debug('start') # sql to get queue limits From deee7bbecea4c9b60e542d4c4ca2dfa820976cf3 Mon Sep 17 00:00:00 2001 From: fbarreir Date: Thu, 10 Oct 2019 10:58:06 +0200 Subject: [PATCH 028/181] prodanaly: protection against uninitialized jobType in getWorkerLimits --- pandaharvester/harvestercore/db_proxy.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandaharvester/harvestercore/db_proxy.py b/pandaharvester/harvestercore/db_proxy.py index 187c79bb..d6fa87ac 100644 --- a/pandaharvester/harvestercore/db_proxy.py +++ b/pandaharvester/harvestercore/db_proxy.py @@ -4413,7 +4413,7 @@ def get_worker_limits(self, site_name): # sql to get queue limits sqlQ = "SELECT maxWorkers, nQueueLimitWorker, nQueueLimitWorkerRatio," sqlQ += "nQueueLimitWorkerMax,nQueueLimitWorkerMin FROM {0} ".format(pandaQueueTableName) - sqlQ += "WHERE siteName=:siteName AND jobType='ANY' AND resourceType='ANY'" + sqlQ += "WHERE siteName=:siteName AND resourceType='ANY' AND (jobType='ANY' OR jobType IS NULL) " # sql to count resource types sqlNT = "SELECT COUNT(*) cnt FROM {0} ".format(pandaQueueTableName) From ddd2eff37369951b0e44002c9a0e330f2d376e1e Mon Sep 17 00:00:00 2001 From: fbarreir Date: Fri, 18 Oct 2019 11:00:48 +0200 Subject: [PATCH 029/181] prodanaly: backwards compatibility for reporting worker stats --- pandaharvester/harvestercommunicator/panda_communicator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/pandaharvester/harvestercommunicator/panda_communicator.py b/pandaharvester/harvestercommunicator/panda_communicator.py
index c1402971..5385cff3 100644
--- a/pandaharvester/harvestercommunicator/panda_communicator.py
+++ b/pandaharvester/harvestercommunicator/panda_communicator.py
@@ -530,7 +530,7 @@ def update_worker_stats(self, site_name, stats):
             data['siteName'] = site_name
             data['paramsList'] = json.dumps(stats)
             tmpLog.debug('update stats for {0}, stats: {1}'.format(site_name, stats))
-            tmpStat, tmpRes = self.post_ssl('reportWorkerStats', data)
+            tmpStat, tmpRes = self.post_ssl('reportWorkerStats_jobtype', data)
             errStr = 'OK'
             if tmpStat is False:
                 errStr = core_utils.dump_error_message(tmpLog, tmpRes)

From b61359846e8f3423ee6a8634e711cbc0889e0247 Mon Sep 17 00:00:00 2001
From: fbarreir
Date: Fri, 18 Oct 2019 11:37:28 +0200
Subject: [PATCH 030/181] prodanaly: UPS command name change for migration

---
 pandaharvester/harvestercore/command_spec.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pandaharvester/harvestercore/command_spec.py b/pandaharvester/harvestercore/command_spec.py
index a1e674f5..ac18f30a 100644
--- a/pandaharvester/harvestercore/command_spec.py
+++ b/pandaharvester/harvestercore/command_spec.py
@@ -16,7 +16,7 @@ class CommandSpec(SpecBase):
                            )
     # commands
     COM_reportWorkerStats = 'REPORT_WORKER_STATS'
-    COM_setNWorkers = 'SET_N_WORKERS'
+    COM_setNWorkers = 'SET_N_WORKERS_JOBTYPE'
     COM_killWorkers = 'KILL_WORKERS'
     # mapping between command and receiver
     receiver_map = {

From a6912cbf267bad8eeb074ecabdf2b0833a46c9b5 Mon Sep 17 00:00:00 2001
From: fbarreir
Date: Wed, 13 Nov 2019 14:11:05 +0100
Subject: [PATCH 031/181] prodanaly: initialize jobType

---
 pandaharvester/harvestercore/db_proxy.py  | 24 +++++++++++++++++++++++
 pandaharvester/harvestermisc/k8s_utils.py |  6 ++----
 2 files changed, 26 insertions(+), 4 deletions(-)

diff --git a/pandaharvester/harvestercore/db_proxy.py b/pandaharvester/harvestercore/db_proxy.py
index d6fa87ac..eca8868f 100644
--- a/pandaharvester/harvestercore/db_proxy.py
+++ b/pandaharvester/harvestercore/db_proxy.py
@@ -361,6 +361,20 @@ def need_index(self, attr):
                 isUnique = True
         return isIndex, isUnique
 
+    def initialize_jobType(self, table_name):
+        # initialize old NULL entries to ANY in pq_table and work_table
+        # get logger
+        tmp_log = core_utils.make_logger(_logger, method_name='initialize_jobType')
+
+        sql_update = "UPDATE {0} SET jobType = 'ANY' WHERE jobType is NULL ".format(table_name)
+        try:
+            self.execute(sql_update)
+            # commit
+            self.commit()
+            tmp_log.debug('initialized entries in {0}'.format(table_name))
+        except Exception:
+            core_utils.dump_error_message(tmp_log)
+
     # make table
     def make_table(self, cls, table_name):
         try:
@@ -430,6 +444,12 @@ def make_table(self, cls, table_name):
                                 tmpLog.debug('added {0} to {1}'.format(attr, table_name))
                             except Exception:
                                 core_utils.dump_error_message(tmpLog)
+
+                            # if we just added the jobType, old entries need to be initialized
+                            if (table_name == pandaQueueTableName and attrName == 'jobType') \
+                                    or (table_name == workTableName and attrName == 'jobType'):
+                                self.initialize_jobType(table_name)
+
             # make indexes
             for index in indexes:
                 indexName = 'idx_{0}_{1}'.format(index, table_name)
@@ -479,6 +499,10 @@ def make_tables(self, queue_config_mapper):
                 for outStr in outStrs:
                     print (outStr)
                 sys.exit(1)
+
+            # initialize the job types to ANY when NULL
+            self.initialize_jobType()
+
             # add sequential numbers
             self.add_seq_number('SEQ_workerID', 1)
             self.add_seq_number('SEQ_configID', 1)
diff
diff --git a/pandaharvester/harvestermisc/k8s_utils.py b/pandaharvester/harvestermisc/k8s_utils.py index b197f5d9..be2e6cc4 100644 --- a/pandaharvester/harvestermisc/k8s_utils.py +++ b/pandaharvester/harvestermisc/k8s_utils.py @@ -48,10 +48,8 @@ def create_job_from_yaml(self, yaml_content, work_spec, cert, cert_in_secret=Tru # note that predefined values in the yaml template will NOT be overwritten if work_spec.nCore > 0: - container_env['resources'].setdefault('limits', { - 'cpu': str(work_spec.nCore)}) - container_env['resources'].setdefault('requests', { - 'cpu': str(work_spec.nCore*cpuadjustratio/100.0)}) + container_env['resources'].setdefault('limits', {'cpu': str(work_spec.nCore)}) + container_env['resources'].setdefault('requests', {'cpu': str(work_spec.nCore * cpuadjustratio / 100.0)}) if work_spec.minRamCount > 4: # K8S minimum memory limit = 4 MB From 1b1cbe0b29ca08ce1cb9b5286681e4369752f7e9 Mon Sep 17 00:00:00 2001 From: fbarreir Date: Wed, 13 Nov 2019 14:18:52 +0100 Subject: [PATCH 032/181] prodanaly: removed unnecessary call to initialize_jobType --- pandaharvester/harvestercore/db_proxy.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/pandaharvester/harvestercore/db_proxy.py b/pandaharvester/harvestercore/db_proxy.py index eca8868f..69a8d6b6 100644 --- a/pandaharvester/harvestercore/db_proxy.py +++ b/pandaharvester/harvestercore/db_proxy.py @@ -500,9 +500,6 @@ def make_tables(self, queue_config_mapper): print (outStr) sys.exit(1) - - # initialize the job types to ANY when NULL - self.initialize_jobType() - # add sequential numbers self.add_seq_number('SEQ_workerID', 1) self.add_seq_number('SEQ_configID', 1) From c97061d54425bf77b60ab2fe7cd898bac141b99e Mon Sep 17 00:00:00 2001 From: fbarreir Date: Thu, 14 Nov 2019 13:09:47 +0100 Subject: [PATCH 033/181] update version --- pandaharvester/panda_pkg_info.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandaharvester/panda_pkg_info.py b/pandaharvester/panda_pkg_info.py index 8b3a7c96..a4389a0c 100644 --- a/pandaharvester/panda_pkg_info.py +++ b/pandaharvester/panda_pkg_info.py @@ -1 +1 @@ -release_version = "0.1.5-rc" +release_version = "0.2.0-rc" From 55b527ed73154808d4c4d373774da9e105255ae5 Mon Sep 17 00:00:00 2001 From: fbarreir Date: Mon, 18 Nov 2019 11:18:41 +0100 Subject: [PATCH 034/181] prodanaly: missing dict initialization --- pandaharvester/harvesterbody/worker_adjuster.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandaharvester/harvesterbody/worker_adjuster.py b/pandaharvester/harvesterbody/worker_adjuster.py index b5cd1576..0f62b656 100644 --- a/pandaharvester/harvesterbody/worker_adjuster.py +++ b/pandaharvester/harvesterbody/worker_adjuster.py @@ -222,6 +222,7 @@ def define_num_workers(self, static_num_workers, site_name): resource_type, n_new_workers_orig, _r = _rt_list n_new_workers, remainder = divmod(n_new_workers_orig * n_new_workers_max_agg, total_new_workers_rts) + dyn_num_workers[queue_name][job_type].setdefault(resource_type, {}) dyn_num_workers[queue_name][job_type][resource_type]['nNewWorkers'] = n_new_workers _rt_list[2] = remainder _countdown -= n_new_workers From 1a7cd927fa4e66216ef332700d2da3f795258bf1 Mon Sep 17 00:00:00 2001 From: fbarreir Date: Mon, 18 Nov 2019 11:29:18 +0100 Subject: [PATCH 035/181] prodanaly: fix --- pandaharvester/harvesterbody/worker_adjuster.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pandaharvester/harvesterbody/worker_adjuster.py b/pandaharvester/harvesterbody/worker_adjuster.py index 0f62b656..93e0fd03 100644 --- 
a/pandaharvester/harvesterbody/worker_adjuster.py +++ b/pandaharvester/harvesterbody/worker_adjuster.py @@ -222,7 +222,9 @@ def define_num_workers(self, static_num_workers, site_name): resource_type, n_new_workers_orig, _r = _rt_list n_new_workers, remainder = divmod(n_new_workers_orig * n_new_workers_max_agg, total_new_workers_rts) - dyn_num_workers[queue_name][job_type].setdefault(resource_type, {}) + dyn_num_workers[queue_name][job_type].setdefault(resource_type, + {'nReady': 0, 'nRunning': 0, + 'nQueue': 0, 'nNewWorkers': 0}) dyn_num_workers[queue_name][job_type][resource_type]['nNewWorkers'] = n_new_workers _rt_list[2] = remainder _countdown -= n_new_workers From 5d9329db99b78cb39cf079a638c6444a43421fd5 Mon Sep 17 00:00:00 2001 From: David Cameron Date: Wed, 20 Nov 2019 11:18:32 +0100 Subject: [PATCH 036/181] take prodSourceLabel from job to support unified prod/analy queues --- pandaharvester/harvestersubmitter/act_submitter.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pandaharvester/harvestersubmitter/act_submitter.py b/pandaharvester/harvestersubmitter/act_submitter.py index 177372f3..a42624d5 100644 --- a/pandaharvester/harvestersubmitter/act_submitter.py +++ b/pandaharvester/harvestersubmitter/act_submitter.py @@ -69,6 +69,8 @@ def submit_workers(self, workspec_list): if jobSpec: jobSpec = jobSpec[0] tmpLog.debug("JobSpec: {0}".format(jobSpec.values_map())) + # Unified queues: take prodsourcelabel from job + prodSourceLabel = jobSpec.jobParams.get('prodSourceLabel', prodSourceLabel) desc = {} # If we need to prefetch events, set aCT status waiting. From 73748937f07c2e5a2b5b25c00845101dfce19d58 Mon Sep 17 00:00:00 2001 From: fbarreir Date: Wed, 20 Nov 2019 16:44:00 +0100 Subject: [PATCH 037/181] k8s: replaced requests call with httplib code in k8s_startup_script --- .../harvestercloud/k8s_startup_script.py | 139 ++++++++---------- 1 file changed, 65 insertions(+), 74 deletions(-) diff --git a/pandaharvester/harvestercloud/k8s_startup_script.py b/pandaharvester/harvestercloud/k8s_startup_script.py index c9c14487..52fbeae3 100644 --- a/pandaharvester/harvestercloud/k8s_startup_script.py +++ b/pandaharvester/harvestercloud/k8s_startup_script.py @@ -1,13 +1,14 @@ #!/usr/bin/env python """ -This script will be executed at the VM startup time. -- It will download the proxy and panda queue from Google instance metadata +This script will be executed at container startup +- It will retrieve the proxy and panda queue from the environment - It will download the pilot wrapper from github and execute it -- It will upload the pilot logs to panda cache +- It will upload the pilot logs to panda cache at the end + +post-multipart code was taken from: https://github.com/haiwen/webapi-examples/blob/master/python/upload-file.py """ -import requests try: import subprocess32 as subprocess except Exception: @@ -15,30 +16,73 @@ import os import sys import logging -import time import traceback -from threading import Thread +import httplib +import mimetypes +import ssl +import urlparse logging.basicConfig(level=logging.DEBUG, format='%(asctime)s %(levelname)s %(message)s', filename='/tmp/vm_script.log', filemode='w') -global loop -loop = True +def post_multipart(host, port, selector, files, proxy_cert): + """ + Post files to an http host as multipart/form-data. + files is a sequence of (name, filename, value) elements for data to be uploaded as files + Return the server's response page. 
+ """ + content_type, body = encode_multipart_formdata(files) + + context = ssl.create_default_context(ssl.Purpose.CLIENT_AUTH) + context.load_cert_chain(certfile=proxy_cert, keyfile=proxy_cert) + + h = httplib.HTTPSConnection(host, port, context=context, timeout=180) -def upload_logs(url, log_file_name, destination_name, proxy_path): - try: - # open and compress the content of the file - with open(log_file_name, 'rb') as log_file_object: - files = {'file': (destination_name, log_file_object.read())} + h.putrequest('POST', selector) + h.putheader('content-type', content_type) + h.putheader('content-length', str(len(body))) + h.endheaders() + h.send(body) + response = h.getresponse() + return response.status, response.reason - cert = [proxy_path, proxy_path] - # verify = '/etc/grid-security/certificates' # not supported in CernVM - requests.exceptions.SSLError: [Errno 21] Is a directory + +def encode_multipart_formdata(files): + """ + files is a sequence of (name, filename, value) elements for data to be uploaded as files + Return (content_type, body) ready for httplib.HTTP instance + """ + BOUNDARY = '----------ThIs_Is_tHe_bouNdaRY_$' + CRLF = '\r\n' + L = [] + for (key, filename, value) in files: + L.append('--' + BOUNDARY) + L.append('Content-Disposition: form-data; name="%s"; filename="%s"' % (key, filename)) + L.append('Content-Type: %s' % get_content_type(filename)) + L.append('') + L.append(value) + L.append('--' + BOUNDARY + '--') + L.append('') + body = CRLF.join(L) + content_type = 'multipart/form-data; boundary=%s' % BOUNDARY + return content_type, body + + +def get_content_type(filename): + return mimetypes.guess_type(filename)[0] or 'application/octet-stream' + + +def upload_logs(url, log_file_name, destination_name, proxy_cert): + try: + full_url = url + '/putFile' + urlparts = urlparse.urlsplit(full_url) logging.debug('[upload_logs] start') - res = requests.post(url + '/putFile', files=files, timeout=180, verify=False, cert=cert) - logging.debug('[upload_logs] finished with code={0} msg={1}'.format(res.status_code, res.text)) - if res.status_code == 200: + files = [('file', destination_name, open(log_file_name).read())] + status, reason = post_multipart(urlparts.hostname, urlparts.port, urlparts.path, files, proxy_cert) + logging.debug('[upload_logs] finished with code={0} msg={1}'.format(status, reason)) + if status == 200: return True except Exception: err_type, err_value = sys.exc_info()[:2] @@ -49,44 +93,10 @@ def upload_logs(url, log_file_name, destination_name, proxy_path): return False -def contact_harvester(harvester_frontend, data, auth_token, proxy_path): - try: - headers = {'Content-Type': 'application/json', - 'Authorization': 'Bearer {0}'.format(auth_token)} - cert = [proxy_path, proxy_path] - #verify = '/etc/grid-security/certificates' # not supported in CernVM - requests.exceptions.SSLError: [Errno 21] Is a directory - verify = False - resp = requests.post(harvester_frontend, json=data, headers=headers, cert=cert, verify=verify) - logging.debug('[contact_harvester] harvester returned: {0}'.format(resp.text)) - except Exception as e: - # message could not be sent - logging.debug('[contact_harvester] failed to send message to harvester: {0}'.format(e)) - pass - - -def heartbeat(harvester_frontend, worker_id, auth_token, proxy_path): - data = {'methodName': 'heartbeat', 'workerID': worker_id, 'data': None} - logging.debug('[heartbeat] sending heartbeat to harvester: {0}'.format(data)) - return contact_harvester(harvester_frontend, data, auth_token, proxy_path) - - 
-def suicide(harvester_frontend, worker_id, auth_token, proxy_path): - data = {'methodName': 'killWorker', 'workerID': worker_id, 'data': None} - logging.debug('[suicide] sending suicide message to harvester: {0}'.format(data)) - return contact_harvester(harvester_frontend, data, auth_token, proxy_path) - - -def heartbeat_loop(harvester_frontend, worker_id, auth_token, proxy_path): - while loop: - heartbeat(harvester_frontend, worker_id, auth_token, proxy_path) - time.sleep(300) - - def get_url(url, headers=None): """ get content from specified URL """ - reply = requests.get(url, headers=headers) if reply.status_code != 200: logging.debug('[get_attribute] Failed to open {0}'.format(url)) @@ -126,20 +136,10 @@ def get_configuration(): resource_type = os.environ.get('resourceType') logging.debug('[main] got resource type: {0}'.format(resource_type)) - # get the harvester frontend URL, where we'll send heartbeats - # harvester_frontend_url = METADATA_URL.format("harvester_frontend") - harvester_frontend = None - # logging.debug('[main] got harvester frontend: {0}'.format(harvester_frontend)) - # get the worker id worker_id = os.environ.get('workerID') logging.debug('[main] got worker id: {0}'.format(worker_id)) - # get the authentication token - # auth_token_url = METADATA_URL.format("auth_token") - auth_token = None - # logging.debug('[main] got authentication token') - # get the URL (e.g. panda cache) to upload logs logs_frontend_w = os.environ.get('logs_frontend_w') logging.debug('[main] got url to upload logs') @@ -148,17 +148,13 @@ def get_configuration(): logs_frontend_r = os.environ.get('logs_frontend_r') logging.debug('[main] got url to download logs') - return proxy_path, panda_site, panda_queue, resource_type, harvester_frontend, worker_id, auth_token, logs_frontend_w, logs_frontend_r + return proxy_path, panda_site, panda_queue, resource_type, worker_id, logs_frontend_w, logs_frontend_r if __name__ == "__main__": # get all the configuration from the GCE metadata server - proxy_path, panda_site, panda_queue, resource_type, harvester_frontend, worker_id, auth_token, logs_frontend_w, logs_frontend_r = get_configuration() - - # start a separate thread that will send a heartbeat to harvester every 5 minutes - # heartbeat_thread = Thread(target=heartbeat_loop, args=(harvester_frontend, worker_id, auth_token, proxy_path)) - # heartbeat_thread.start() + proxy_path, panda_site, panda_queue, resource_type, worker_id, logs_frontend_w, logs_frontend_r = get_configuration() # the pilot should propagate the download link via the pilotId field in the job table destination_name = '{0}.out'.format(worker_id) @@ -190,9 +186,4 @@ def get_configuration(): logging.debug('[main] pilot wrapper done...') # upload logs to e.g. 
panda cache or similar - upload_logs(logs_frontend_w, '/tmp/wrapper-wid.log', destination_name, proxy_path) - - # ask harvester to kill the VM and stop the heartbeat - # suicide(harvester_frontend, worker_id, auth_token, proxy_path) - loop = False - # heartbeat_thread.join() + upload_logs(logs_frontend_w, '/tmp/wrapper-wid.log', destination_name, proxy_path) \ No newline at end of file From c1cf17d278424109501f64fbe04f97c1eeef2cb9 Mon Sep 17 00:00:00 2001 From: fbarreir Date: Thu, 21 Nov 2019 11:18:01 +0100 Subject: [PATCH 038/181] k8s: replaced leftover requests call --- pandaharvester/harvestercloud/k8s_startup_script.py | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/pandaharvester/harvestercloud/k8s_startup_script.py b/pandaharvester/harvestercloud/k8s_startup_script.py index 52fbeae3..e96546ff 100644 --- a/pandaharvester/harvestercloud/k8s_startup_script.py +++ b/pandaharvester/harvestercloud/k8s_startup_script.py @@ -21,6 +21,7 @@ import mimetypes import ssl import urlparse +import urllib2 logging.basicConfig(level=logging.DEBUG, format='%(asctime)s %(levelname)s %(message)s', filename='/tmp/vm_script.log', filemode='w') @@ -96,14 +97,11 @@ def upload_logs(url, log_file_name, destination_name, proxy_cert): def get_url(url, headers=None): """ get content from specified URL + TODO: error handling """ - reply = requests.get(url, headers=headers) - if reply.status_code != 200: - logging.debug('[get_attribute] Failed to open {0}'.format(url)) - return None - else: - return reply.content - + response = urllib2.urlopen(url) + content = response.read() + return content def get_configuration():
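With patches 037 and 038 the startup script talks to panda cache through raw httplib/urllib2 instead of requests. For reference, a hedged sketch of how upload_logs drives the post_multipart helper defined above; the host, port, selector, file name and proxy path are placeholders, and a valid proxy certificate file is needed for the call to actually succeed:

# illustrative call only -- placeholder endpoint and credentials
files = [('file', 'worker-1234.out', 'tail of the pilot log\n')]
status, reason = post_multipart('pandacache.example.org', 25443,
                                '/server/panda/putFile', files,
                                '/tmp/x509up_u12345')
logging.debug('upload finished with code={0} msg={1}'.format(status, reason))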
From 58c29588af9e6c66d48d166db513617b8e5e7119 Mon Sep 17 00:00:00 2001 From: a6350202 Date: Tue, 26 Nov 2019 17:16:41 +0800 Subject: [PATCH 039/181] k8s: Change to pilot2 --- .../harvestercloud/k8s_startup_script.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/pandaharvester/harvestercloud/k8s_startup_script.py b/pandaharvester/harvestercloud/k8s_startup_script.py index e96546ff..5272d29a 100644 --- a/pandaharvester/harvestercloud/k8s_startup_script.py +++ b/pandaharvester/harvestercloud/k8s_startup_script.py @@ -160,8 +160,8 @@ def get_configuration(): os.environ['GTAG'] = log_download_url # GTAG env variable is read by pilot # get the pilot wrapper - wrapper_path = "/tmp/runpilot3-wrapper.sh" - wrapper_url = "https://raw.githubusercontent.com/fbarreir/adc/master/runpilot3-wrapper.sh" + wrapper_path = "/tmp/runpilot2-wrapper.sh" + wrapper_url = "https://raw.githubusercontent.com/ptrlv/adc/master/runpilot2-wrapper.sh" wrapper_string = get_url(wrapper_url) with open(wrapper_path, "w") as wrapper_file: wrapper_file.write(wrapper_string) @@ -172,16 +172,16 @@ def get_configuration(): logging.debug('[main] starting pilot wrapper...') resource_type_option = '' if resource_type: - resource_type_option = '-R {0}'.format(resource_type) - wrapper_params = '-s {0} -h {1} {2}'.format(panda_site, panda_queue, resource_type_option) + resource_type_option = '--resource-type {0}'.format(resource_type) + wrapper_params = '-s {0} -r {1} -q {2} {3}'.format(panda_site, panda_queue, panda_queue, resource_type_option) if 'ANALY' in panda_queue: - wrapper_params = '{0} -u user'.format(wrapper_params) + wrapper_params = '{0} -j user'.format(wrapper_params) else: - wrapper_params = '{0} -u managed'.format(wrapper_params) + wrapper_params = '{0} -j managed'.format(wrapper_params) + command = "/tmp/runpilot2-wrapper.sh {0} -i PR -w generic --pilot-user=ATLAS --url=https://pandaserver.cern.ch -d --harvester-submit-mode=PULL --allow-same-user=False >& /tmp/wrapper-wid.log".\ format(wrapper_params, worker_id) subprocess.call(command, shell=True) logging.debug('[main] pilot wrapper done...') # upload logs to e.g. panda cache or similar - upload_logs(logs_frontend_w, '/tmp/wrapper-wid.log', destination_name, proxy_path) \ No newline at end of file + upload_logs(logs_frontend_w, '/tmp/wrapper-wid.log', destination_name, proxy_path) From 4502c21e7201832b8ba3a9a15b560a46e1106169 Mon Sep 17 00:00:00 2001 From: fbarreir Date: Wed, 27 Nov 2019 17:42:55 +0100 Subject: [PATCH 040/181] prodanaly: issue with UPS queue initialization from scratch --- pandaharvester/harvestercore/db_proxy.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandaharvester/harvestercore/db_proxy.py b/pandaharvester/harvestercore/db_proxy.py index 69a8d6b6..c2635af6 100644 --- a/pandaharvester/harvestercore/db_proxy.py +++ b/pandaharvester/harvestercore/db_proxy.py @@ -3424,7 +3424,7 @@ def get_worker_stats_bulk(self, active_ups_queues): # on panda server starts processing the queue if active_ups_queues: for ups_queue in active_ups_queues: - if ups_queue not in retMap or not retMap[ups_queue]: + if ups_queue not in retMap or not retMap[ups_queue] or retMap[ups_queue] == {'ANY': {}}: retMap[ups_queue] = {'managed': {'SCORE': {'running': 0, 'submitted': 0, 'to_submit': 0}}} # commit From 15942a7d2cf488bea9cb9e82065d8e2d7a9e88f4 Mon Sep 17 00:00:00 2001 From: fbarreir Date: Thu, 28 Nov 2019 10:21:42 +0100 Subject: [PATCH 041/181] k8s: CVMFS CSI ATLAS configuration for different kubernetes versions --- examples/k8s/{k8s_cvmfs.yaml => k8s_cvmfs_1.13.yaml} | 0 examples/k8s/k8s_cvmfs_1.15.yaml | 101 ++++++++++++++++++ 2 files changed, 101 insertions(+) rename examples/k8s/{k8s_cvmfs.yaml => k8s_cvmfs_1.13.yaml} (100%) create mode 100644 examples/k8s/k8s_cvmfs_1.15.yaml diff --git a/examples/k8s/k8s_cvmfs.yaml b/examples/k8s/k8s_cvmfs_1.13.yaml similarity index 100% rename from examples/k8s/k8s_cvmfs.yaml rename to examples/k8s/k8s_cvmfs_1.13.yaml diff --git a/examples/k8s/k8s_cvmfs_1.15.yaml b/examples/k8s/k8s_cvmfs_1.15.yaml new file mode 100644 index 00000000..64fe7690 --- /dev/null +++ b/examples/k8s/k8s_cvmfs_1.15.yaml @@ -0,0 +1,101 @@ +apiVersion: storage.k8s.io/v1 +kind: StorageClass +metadata: + name: csi-cvmfs-atlas +provisioner: cvmfs.csi.cern.ch +parameters: + repository: atlas.cern.ch +--- +apiVersion: storage.k8s.io/v1 +kind: StorageClass +metadata: + name: csi-cvmfs-sft +provisioner: cvmfs.csi.cern.ch +parameters: + repository: sft.cern.ch +--- +apiVersion: storage.k8s.io/v1 +kind: StorageClass +metadata: + name: csi-cvmfs-grid +provisioner: cvmfs.csi.cern.ch +parameters: + repository: grid.cern.ch +--- +apiVersion: storage.k8s.io/v1 +kind: StorageClass +metadata: + name: csi-cvmfs-atlas-condb +provisioner: cvmfs.csi.cern.ch +parameters: + repository: atlas-condb.cern.ch +--- +apiVersion: storage.k8s.io/v1 +kind: StorageClass +metadata: + name: csi-cvmfs-atlas-nightlies +provisioner: cvmfs.csi.cern.ch +parameters: + repository: atlas-nightlies.cern.ch +--- +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: csi-cvmfs-atlas-pvc +spec: + accessModes: + - ReadOnlyMany + resources: + requests: + storage: 1Gi + storageClassName: csi-cvmfs-atlas +--- +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: csi-cvmfs-sft-pvc +spec: 
+ accessModes: + - ReadOnlyMany + resources: + requests: + storage: 1Gi + storageClassName: csi-cvmfs-sft +--- +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: csi-cvmfs-grid-pvc +spec: + accessModes: + - ReadOnlyMany + resources: + requests: + storage: 1Gi + storageClassName: csi-cvmfs-grid + +--- +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: csi-cvmfs-atlas-condb-pvc +spec: + accessModes: + - ReadOnlyMany + resources: + requests: + storage: 1Gi + storageClassName: csi-cvmfs-atlas-condb +--- +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: csi-cvmfs-atlas-nightlies-pvc +spec: + accessModes: + - ReadOnlyMany + resources: + requests: + storage: 1Gi + storageClassName: csi-cvmfs-atlas-nightlies \ No newline at end of file From 054179cc5e051490d7d7670a6855f0b141e3cf82 Mon Sep 17 00:00:00 2001 From: fbarreir Date: Mon, 2 Dec 2019 21:42:28 +0100 Subject: [PATCH 042/181] prodanaly: use production templates for unified queues --- pandaharvester/harvestermisc/info_utils.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pandaharvester/harvestermisc/info_utils.py b/pandaharvester/harvestermisc/info_utils.py index 289a9a9c..d1452e47 100644 --- a/pandaharvester/harvestermisc/info_utils.py +++ b/pandaharvester/harvestermisc/info_utils.py @@ -121,5 +121,7 @@ def get_type_workflow(self, panda_resource): workflow = None else: pq_type = panda_queue_dict.get('type') + if pq_type == 'unified': # use production templates + pq_type = 'production' workflow = panda_queue_dict.get('workflow') return pq_type, workflow From fae333b4a92259dc17041b3e1f6a08e5f864b57c Mon Sep 17 00:00:00 2001 From: fbarreir Date: Tue, 3 Dec 2019 14:15:44 +0100 Subject: [PATCH 043/181] prodanaly: job prodsourcelabel precedence over queue prodsourcelabel --- pandaharvester/harvestercore/queue_config_mapper.py | 8 +++++++- pandaharvester/harvestersubmitter/htcondor_submitter.py | 2 +- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/pandaharvester/harvestercore/queue_config_mapper.py b/pandaharvester/harvestercore/queue_config_mapper.py index dca5b9ab..36fd1033 100644 --- a/pandaharvester/harvestercore/queue_config_mapper.py +++ b/pandaharvester/harvestercore/queue_config_mapper.py @@ -77,9 +77,15 @@ def is_no_heartbeat_status(self, status): return status in self.get_no_heartbeat_status() # get prodSourceLabel - def get_source_label(self): + def get_source_label(self, job_type=None): + # if queue is in test status, only submit workers for HC jobs if self.queueStatus == 'test': return 'test' + + # grandly unified queues: prodsourcelabel in job has precedence over queue prodsourcelabel + if job_type == 'user': + return job_type + return self.prodSourceLabel # set unique name diff --git a/pandaharvester/harvestersubmitter/htcondor_submitter.py b/pandaharvester/harvestersubmitter/htcondor_submitter.py index 41ffe03a..fb9baa63 100644 --- a/pandaharvester/harvestersubmitter/htcondor_submitter.py +++ b/pandaharvester/harvestersubmitter/htcondor_submitter.py @@ -365,7 +365,7 @@ def make_a_jdl(workspec, template, n_core_per_node, log_dir, panda_queue_name, e # decide prodSourceLabel pilot_opt_tuple = _get_prodsourcelabel_pilotypeopt_piloturlstr(workspec.pilotType, pilot_version) if pilot_opt_tuple is None: - prod_source_label = harvester_queue_config.get_source_label() + prod_source_label = harvester_queue_config.get_source_label(workspec.jobType) pilot_type_opt = workspec.pilotType pilot_url_str = '' else: From f128e23416b34d5021dae234e11c7a22d99ec592 Mon Sep 17 00:00:00 2001 From: 
fbarreir Date: Tue, 3 Dec 2019 16:30:52 +0100 Subject: [PATCH 044/181] prodanaly: treat prodsourcelabel=panda as analysis jobs --- pandaharvester/harvestercore/queue_config_mapper.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandaharvester/harvestercore/queue_config_mapper.py b/pandaharvester/harvestercore/queue_config_mapper.py index 36fd1033..ee83827a 100644 --- a/pandaharvester/harvestercore/queue_config_mapper.py +++ b/pandaharvester/harvestercore/queue_config_mapper.py @@ -83,8 +83,8 @@ def get_source_label(self, job_type=None): return 'test' # grandly unified queues: prodsourcelabel in job has precedence over queue prodsourcelabel - if job_type == 'user': - return job_type + if job_type in ('user', 'panda'): + return 'user' return self.prodSourceLabel From a6a97592db822bae7e774a5d04c0713550837082 Mon Sep 17 00:00:00 2001 From: fbarreir Date: Tue, 3 Dec 2019 16:50:13 +0100 Subject: [PATCH 045/181] prodanaly: more prodsourcelabel=panda cases --- pandaharvester/harvestermisc/info_utils.py | 5 ++--- pandaharvester/harvestersubmitter/htcondor_submitter.py | 2 +- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/pandaharvester/harvestermisc/info_utils.py b/pandaharvester/harvestermisc/info_utils.py index d1452e47..dca3615b 100644 --- a/pandaharvester/harvestermisc/info_utils.py +++ b/pandaharvester/harvestermisc/info_utils.py @@ -90,11 +90,10 @@ def is_grandly_unified_queue(self, panda_resource): return False # initial, temporary nomenclature - if 'grandly_unified' in panda_queue_dict.get('catchall'): + if 'grandly_unified' in panda_queue_dict.get('catchall') \ + or panda_queue_dict.get('type') == 'unified': return True - # TODO: implement the final nomenclature - return False # get harvester params diff --git a/pandaharvester/harvestersubmitter/htcondor_submitter.py b/pandaharvester/harvestersubmitter/htcondor_submitter.py index fb9baa63..feb30ab5 100644 --- a/pandaharvester/harvestersubmitter/htcondor_submitter.py +++ b/pandaharvester/harvestersubmitter/htcondor_submitter.py @@ -775,7 +775,7 @@ def _choose_proxy(workspec): """ job_type = workspec.jobType proxy = self.x509UserProxy - if is_grandly_unified_queue and (job_type == 'user' or job_type == 'analysis') and self.x509UserProxyAnalysis: + if is_grandly_unified_queue and job_type in ('user', 'panda', 'analysis') and self.x509UserProxyAnalysis: tmpLog.debug('Taking analysis proxy') proxy = self.x509UserProxyAnalysis else:
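Taken together, patches 043-045 reduce the prodSourceLabel decision to a small precedence rule: a test queue always wins, then the job-level label (with 'panda' counting as analysis), then the queue default. A standalone summary sketch, illustrative rather than the actual QueueConfig class:

# precedence: test queue > job-level label ('panda' maps to 'user') > queue default
def source_label(queue_status, queue_label, job_type=None):
    if queue_status == 'test':
        return 'test'
    if job_type in ('user', 'panda'):
        return 'user'
    return queue_label

assert source_label('online', 'managed', job_type='panda') == 'user'
assert source_label('online', 'managed') == 'managed'
assert source_label('test', 'managed', job_type='user') == 'test'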
"preparator": { + "name": "XrdcpPreparator", + "module": "pandaharvester.harvesterpreparator.xrdcp_preparator", + # base path for source xrdcp server + "srcBasePath": " root://dcgftp.usatlas.bnl.gov:1096//pnfs/usatlas.bnl.gov/BNLT0D1/rucio", + # base path for local access to the copied files + "localBasePath": "/hpcgpfs01/scratch/benjamin/harvester/rucio-data-area", + # max number of attempts + "maxAttempts": 3, + # check paths under localBasePath. + "checkLocalPath": true, + # options for xrdcp + "xrdcpOpts": "--retry 3 --cksum adler32 --debug 1" + } +""" +class XrdcpPreparator(PluginBase): + # constructor + def __init__(self, **kwarg): + self.xrdcpOpts = None + self.maxAttempts = 3 + self.timeout = None + self.checkLocalPath = True + PluginBase.__init__(self, **kwarg) + + # trigger preparation + def trigger_preparation(self, jobspec): + # make logger + tmpLog = self.make_logger(baseLogger, 'PandaID={0}'.format(jobspec.PandaID), + method_name='trigger_preparation') + tmpLog.debug('start') + # get the environment + harvester_env = os.environ.copy() + #tmpLog.debug('Harvester environment : {}'.format(harvester_env)) + # loop over all inputs + inFileInfo = jobspec.get_input_file_attributes() + xrdcpInput = None + allfiles_transfered = True + overall_errMsg = "" + for tmpFileSpec in jobspec.inFiles: + # construct source and destination paths + srcPath = mover_utils.construct_file_path(self.srcBasePath, inFileInfo[tmpFileSpec.lfn]['scope'], + tmpFileSpec.lfn) + # local path + localPath = mover_utils.construct_file_path(self.localBasePath, inFileInfo[tmpFileSpec.lfn]['scope'], + tmpFileSpec.lfn) + if self.checkLocalPath: + # check if already exits + if os.path.exists(localPath): + # calculate checksum + checksum = core_utils.calc_adler32(localPath) + checksum = 'ad:{0}'.format(checksum) + if checksum == inFileInfo[tmpFileSpec.lfn]['checksum']: + continue + # make directories if needed + if not os.path.isdir(os.path.dirname(localPath)): + os.makedirs(os.path.dirname(localPath)) + tmpLog.debug('Make directory - {0}'.format(os.path.dirname(localPath))) + # collect list of input files + if xrdcpInput is None: + xrdcpInput = [srcPath] + else: + xrdcpInput.append[srcPath] + # transfer using xrdcp one file at a time + tmpLog.debug('execute xrdcp') + args = ['xrdcp', '--nopbar', '--force'] + args_files = [srcPath,localPath] + if self.xrdcpOpts is not None: + args += self.xrdcpOpts.split() + args += args_files + tmpFileSpec.attemptNr += 1 + try: + xrdcp_cmd = ' '.join(args) + tmpLog.debug('execute: {0}'.format(xrdcp_cmd)) + p = subprocess.Popen(xrdcp_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, env=harvester_env, shell=True) + try: + stdout, stderr = p.communicate(timeout=self.timeout) + except subprocess.TimeoutExpired: + p.kill() + stdout, stderr = p.communicate() + tmpLog.warning('command timeout') + return_code = p.returncode + if stdout is not None: + if not isinstance(stdout, str): + stdout = stdout.decode() + stdout = stdout.replace('\n', ' ') + if stderr is not None: + if not isinstance(stderr, str): + stderr = stderr.decode() + stderr = stderr.replace('\n', ' ') + tmpLog.debug("stdout: %s" % stdout) + tmpLog.debug("stderr: %s" % stderr) + except Exception: + core_utils.dump_error_message(tmpLog) + return_code = 1 + if return_code != 0: + overall_errMsg += "file - {0} did not transfer error code {1} ".format(localPath,return_code) + allfiles_transfered = False + errMsg = 'failed with {0}'.format(return_code) + tmpLog.error(errMsg) + # check attemptNr + if tmpFileSpec.attemptNr >= 
self.maxAttempts: + errMsg = 'gave up due to max attempts' + tmpLog.error(errMsg) + return (False, errMsg) + # end loop over input files + # nothing to transfer + if xrdcpInput is None: + tmpLog.debug('done with no transfers') + return True, '' + # check if all files were transfered + if allfiles_transfered: + return True, '' + else: + return None, overall_errMsg + + + # check status + def check_stage_in_status(self, jobspec): + return True, '' + + # resolve input file paths + def resolve_input_paths(self, jobspec): + # input files + inFileInfo = jobspec.get_input_file_attributes() + pathInfo = dict() + for tmpFileSpec in jobspec.inFiles: + localPath = mover_utils.construct_file_path(self.localBasePath, inFileInfo[tmpFileSpec.lfn]['scope'], + tmpFileSpec.lfn) + pathInfo[tmpFileSpec.lfn] = {'path': localPath} + jobspec.set_input_file_paths(pathInfo) + return True, ''
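The checkLocalPath branch above skips a transfer when the locally computed adler32 matches the 'ad:'-prefixed catalog checksum, delegating the computation to core_utils.calc_adler32. A self-contained sketch of that check, assuming the zero-padded hex convention of 'ad:' checksums (the file name is illustrative):

import zlib

def adler32_of(file_name):
    val = 1  # adler32 starting value
    with open(file_name, 'rb') as f:
        for chunk in iter(lambda: f.read(1024 * 1024), b''):
            val = zlib.adler32(chunk, val)
    return '{0:08x}'.format(val & 0xffffffff)

# the preparator skips the copy when 'ad:' + adler32_of(localPath) equals the catalog value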
self.useAtlasGridCE - # sdf template directories of CEs + # sdf template directories of CEs; ignored if templateFile is set try: self.CEtemplateDir except AttributeError: @@ -639,8 +640,8 @@ def _handle_one_worker(workspec, to_submit=to_submit_any): except KeyError: tmpLog.info('Problem choosing CE with weighting. Choose an arbitrary CE endpoint') ce_info_dict = random.choice(list(ce_auxilary_dict.values())).copy() - # go on info of the CE - ce_endpoint_from_queue = ce_info_dict.get('ce_endpoint', '') + # go on info of the CE; ignore protocol prefix in ce_endpoint + ce_endpoint_from_queue = re.sub('^\w+://', '', ce_info_dict.get('ce_endpoint', '')) ce_flavour_str = str(ce_info_dict.get('ce_flavour', '')).lower() ce_version_str = str(ce_info_dict.get('ce_version', '')).lower() ce_info_dict['ce_hostname'] = re.sub(':\w*', '', ce_endpoint_from_queue) @@ -656,7 +657,7 @@ def _handle_one_worker(workspec, to_submit=to_submit_any): ce_info_dict['ce_endpoint'] = '{0}:{1}'.format(ce_endpoint_from_queue, default_port) tmpLog.debug('For site {0} got pilot version: "{1}"; CE endpoint: "{2}", flavour: "{3}"'.format( self.queueName, pilot_version_orig, ce_endpoint_from_queue, ce_flavour_str)) - if os.path.isdir(self.CEtemplateDir) and ce_flavour_str: + if not self.templateFile and os.path.isdir(self.CEtemplateDir) and ce_flavour_str: sdf_template_filename = '{ce_flavour_str}{pilot_version_suffix_str}.sdf'.format( ce_flavour_str=ce_flavour_str, pilot_version_suffix_str=pilot_version_suffix_str) self.templateFile = os.path.join(self.CEtemplateDir, sdf_template_filename) From a09db3939a88bc0305b807f3c128166f6df41fa4 Mon Sep 17 00:00:00 2001 From: fahui Date: Wed, 18 Dec 2019 08:14:08 +0800 Subject: [PATCH 049/181] change panda-common package --- pandaharvester/commit_timestamp.py | 2 +- setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pandaharvester/commit_timestamp.py b/pandaharvester/commit_timestamp.py index c68977dc..0d47544a 100644 --- a/pandaharvester/commit_timestamp.py +++ b/pandaharvester/commit_timestamp.py @@ -1 +1 @@ -timestamp = "13-12-2019 08:51:16 on contrib_cern (by fahui)" +timestamp = "18-12-2019 00:14:08 on contrib_cern (by fahui)" diff --git a/setup.py b/setup.py index 6281fa4f..7dbbdafb 100644 --- a/setup.py +++ b/setup.py @@ -30,7 +30,7 @@ 'future', 'futures; python_version == "2.*"', 'pycryptodomex', - 'panda-common-s >= 0.0.11', + 'panda-common', 'pyjwt', 'subprocess32; python_version == "2.*"', 'rpyc', From a5cba615623bcc861302fc038a2a321b33f0e202 Mon Sep 17 00:00:00 2001 From: tmaeno Date: Wed, 18 Dec 2019 08:10:32 +0100 Subject: [PATCH 050/181] set pilot_closed when workers are killed while waiting in the queue --- pandaharvester/harvesterbody/monitor.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/pandaharvester/harvesterbody/monitor.py b/pandaharvester/harvesterbody/monitor.py index 1028bc2f..fbdafed9 100644 --- a/pandaharvester/harvesterbody/monitor.py +++ b/pandaharvester/harvesterbody/monitor.py @@ -644,8 +644,12 @@ def check_workers(self, mon_core, messenger, all_workers, queue_config, tmp_log, tmp_log.debug('kill workerID={0} due to queuing longer than {1} seconds'.format( workerID, workerQueueTimeLimit)) self.dbProxy.kill_worker(workSpec.workerID) - diagMessage = 'Killed by Harvester due to worker queuing too long' + diagMessage + diagMessage = 'Killed by Harvester due to worker queuing too long. 
' + diagMessage workSpec.set_pilot_error(PilotErrors.ERR_FAILEDBYSERVER, diagMessage) + # set closed and reset start/endTime for accounting + workSpec.set_pilot_closed() + workSpec.set_start_time(True) + workSpec.set_end_time(True) # expired heartbeat - only when requested in the configuration try: # check if the queue configuration requires checking for worker heartbeat From dc7a69130917728f9cb2d5a94e67c53cce6c2b9b Mon Sep 17 00:00:00 2001 From: tmaeno Date: Wed, 18 Dec 2019 08:27:39 +0100 Subject: [PATCH 051/181] fix --- pandaharvester/harvesterbody/monitor.py | 4 +--- pandaharvester/harvestercore/core_utils.py | 2 ++ pandaharvester/harvestercore/job_spec.py | 5 +++++ 3 files changed, 8 insertions(+), 3 deletions(-) diff --git a/pandaharvester/harvesterbody/monitor.py b/pandaharvester/harvesterbody/monitor.py index fbdafed9..c1762f39 100644 --- a/pandaharvester/harvesterbody/monitor.py +++ b/pandaharvester/harvesterbody/monitor.py @@ -646,10 +646,8 @@ def check_workers(self, mon_core, messenger, all_workers, queue_config, tmp_log, self.dbProxy.kill_worker(workSpec.workerID) diagMessage = 'Killed by Harvester due to worker queuing too long. ' + diagMessage workSpec.set_pilot_error(PilotErrors.ERR_FAILEDBYSERVER, diagMessage) - # set closed and reset start/endTime for accounting + # set closed workSpec.set_pilot_closed() - workSpec.set_start_time(True) - workSpec.set_end_time(True) # expired heartbeat - only when requested in the configuration try: # check if the queue configuration requires checking for worker heartbeat diff --git a/pandaharvester/harvestercore/core_utils.py b/pandaharvester/harvestercore/core_utils.py index 4befffa1..fcaade8d 100644 --- a/pandaharvester/harvestercore/core_utils.py +++ b/pandaharvester/harvestercore/core_utils.py @@ -322,6 +322,8 @@ def update_job_attributes_with_workers(map_type, jobspec_list, workspec_list, fi # set start and end times if workSpec.status in [WorkSpec.ST_running]: jobSpec.set_start_time() + elif workSpec.pilot_closed: + jobSpec.reset_start_end_time() elif workSpec.is_final_status(): jobSpec.set_end_time() # core count diff --git a/pandaharvester/harvestercore/job_spec.py b/pandaharvester/harvestercore/job_spec.py index b212d718..42c2b615 100644 --- a/pandaharvester/harvestercore/job_spec.py +++ b/pandaharvester/harvestercore/job_spec.py @@ -388,6 +388,11 @@ def set_end_time(self, force=False): if self.endTime is None or force is True: self.endTime = datetime.datetime.utcnow() + # reset start and end time + def reset_start_end_time(self): + self.startTime = datetime.datetime.utcnow() + self.endTime = self.startTime + # add work spec list def add_workspec_list(self, workspec_list): self.workspec_list = workspec_list From 6d3dca6e5348d704c5eed4f8852a466568be5186 Mon Sep 17 00:00:00 2001 From: fahui Date: Thu, 19 Dec 2019 09:51:11 +0800 Subject: [PATCH 052/181] htcondor_submitter add --- pandaharvester/commit_timestamp.py | 2 +- pandaharvester/harvestersubmitter/htcondor_submitter.py | 6 ++++++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/pandaharvester/commit_timestamp.py b/pandaharvester/commit_timestamp.py index 0d47544a..85729cbe 100644 --- a/pandaharvester/commit_timestamp.py +++ b/pandaharvester/commit_timestamp.py @@ -1 +1 @@ -timestamp = "18-12-2019 00:14:08 on contrib_cern (by fahui)" +timestamp = "19-12-2019 01:51:11 on contrib_cern (by fahui)" diff --git a/pandaharvester/harvestersubmitter/htcondor_submitter.py b/pandaharvester/harvestersubmitter/htcondor_submitter.py index 8ec1c3bd..0330270b 100644 --- 
a/pandaharvester/harvestersubmitter/htcondor_submitter.py +++ b/pandaharvester/harvestersubmitter/htcondor_submitter.py @@ -675,6 +675,12 @@ def _handle_one_worker(workspec, to_submit=to_submit_any): ce_info_dict['ce_endpoint'] = self.ceEndpoint except AttributeError: pass + try: + # Manually define ceQueueName + if self.ceQueueName: + ce_info_dict['ce_queue_name'] = self.ceQueueName + except AttributeError: + pass # template for batch script try: tmpFile = open(self.templateFile) From 0cb344def1f3dd1fe0c2dac320d843d253959832 Mon Sep 17 00:00:00 2001 From: tmaeno Date: Thu, 19 Dec 2019 11:38:34 +0100 Subject: [PATCH 053/181] using pilot_closed for submission failures --- pandaharvester/harvesterbody/submitter.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandaharvester/harvesterbody/submitter.py b/pandaharvester/harvesterbody/submitter.py index 5f9134db..53bc4719 100644 --- a/pandaharvester/harvesterbody/submitter.py +++ b/pandaharvester/harvesterbody/submitter.py @@ -318,6 +318,7 @@ def run(self): work_spec.set_status(WorkSpec.ST_missed) work_spec.set_dialog_message(tmpStr) work_spec.set_pilot_error(PilotErrors.ERR_SETUPFAILURE, errStr) + work_spec.set_pilot_closed() if jobList is not None: # increment attempt number newJobList = [] From 42a0627ffad3b0adf7f6980383cdcd1d47b1af52 Mon Sep 17 00:00:00 2001 From: tmaeno Date: Fri, 20 Dec 2019 23:41:03 +0100 Subject: [PATCH 054/181] using pilot_closed for submission failures --- pandaharvester/commit_timestamp.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandaharvester/commit_timestamp.py b/pandaharvester/commit_timestamp.py index 85729cbe..4e34d57b 100644 --- a/pandaharvester/commit_timestamp.py +++ b/pandaharvester/commit_timestamp.py @@ -1 +1 @@ -timestamp = "19-12-2019 01:51:11 on contrib_cern (by fahui)" +timestamp = "20-12-2019 22:39:30 on master (by tmaeno)" From 23469b8430d2a066f02fcb4e0000d7819427b618 Mon Sep 17 00:00:00 2001 From: tmaeno Date: Fri, 20 Dec 2019 23:49:11 +0100 Subject: [PATCH 055/181] using pilot_closed for submission failures --- pandaharvester/README.md | 1 - 1 file changed, 1 deletion(-) diff --git a/pandaharvester/README.md b/pandaharvester/README.md index 46194ca1..1ef2b4d4 100644 --- a/pandaharvester/README.md +++ b/pandaharvester/README.md @@ -13,4 +13,3 @@ * **Submitter**: Classes to submit jobs to the batch system * **Test**: Test scripts * **Worker Maker**: Makes workers - From f9469f25834ac44d1f58f8f119e9b7af0134db63 Mon Sep 17 00:00:00 2001 From: tmaeno Date: Fri, 20 Dec 2019 23:59:14 +0100 Subject: [PATCH 056/181] pre-commit test --- pandaharvester/commit_timestamp.py | 2 +- setup.py | 2 -- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/pandaharvester/commit_timestamp.py b/pandaharvester/commit_timestamp.py index 4e34d57b..1b03f285 100644 --- a/pandaharvester/commit_timestamp.py +++ b/pandaharvester/commit_timestamp.py @@ -1 +1 @@ -timestamp = "20-12-2019 22:39:30 on master (by tmaeno)" +timestamp = "20-12-2019 22:59:14 on master (by tmaeno)" diff --git a/setup.py b/setup.py index 7dbbdafb..957f1a32 100644 --- a/setup.py +++ b/setup.py @@ -4,9 +4,7 @@ # # import sys - from setuptools import setup, find_packages - from pandaharvester import panda_pkg_info sys.path.insert(0, '.') From 693086845b2c5d10729eafe0b9aa46d02e16d5a7 Mon Sep 17 00:00:00 2001 From: tmaeno Date: Tue, 4 Feb 2020 16:40:49 +0100 Subject: [PATCH 057/181] randomized DNS resolution --- pandaharvester/commit_timestamp.py | 2 +- .../panda_communicator.py | 39 ++++++++++--------- 2 files changed, 22 
insertions(+), 19 deletions(-) diff --git a/pandaharvester/commit_timestamp.py b/pandaharvester/commit_timestamp.py index 1b03f285..3bda1cf9 100644 --- a/pandaharvester/commit_timestamp.py +++ b/pandaharvester/commit_timestamp.py @@ -1 +1 @@ -timestamp = "20-12-2019 22:59:14 on master (by tmaeno)" +timestamp = "04-02-2020 15:40:49 on master (by tmaeno)" diff --git a/pandaharvester/harvestercommunicator/panda_communicator.py b/pandaharvester/harvestercommunicator/panda_communicator.py index 5385cff3..6b56000d 100644 --- a/pandaharvester/harvestercommunicator/panda_communicator.py +++ b/pandaharvester/harvestercommunicator/panda_communicator.py @@ -15,7 +15,6 @@ import uuid import inspect import datetime -import requests import traceback from future.utils import iteritems # TO BE REMOVED for python2.7 @@ -26,6 +25,7 @@ pass from pandaharvester.harvestercore import core_utils from pandaharvester.harvesterconfig import harvester_config +from pandacommon.pandautils.net_utils import get_http_adapter_with_random_dns_resolution from .base_communicator import BaseCommunicator @@ -56,11 +56,12 @@ def post(self, path, data): url = '{0}/{1}'.format(harvester_config.pandacon.pandaURL, path) if self.verbose: tmpLog.debug('exec={0} URL={1} data={2}'.format(tmpExec, url, str(data))) - res = requests.post(url, - data=data, - headers={"Accept": "application/json", - "Connection": "close"}, - timeout=harvester_config.pandacon.timeout) + session = get_http_adapter_with_random_dns_resolution() + res = session.post(url, + data=data, + headers={"Accept": "application/json", + "Connection": "close"}, + timeout=harvester_config.pandacon.timeout) if self.verbose: tmpLog.debug('exec={0} code={1} return={2}'.format(tmpExec, res.status_code, res.text)) if res.status_code == 200: @@ -90,14 +91,15 @@ def post_ssl(self, path, data, cert=None): if cert is None: cert = (harvester_config.pandacon.cert_file, harvester_config.pandacon.key_file) + session = get_http_adapter_with_random_dns_resolution() sw = core_utils.get_stopwatch() - res = requests.post(url, - data=data, - headers={"Accept": "application/json", - "Connection": "close"}, - timeout=harvester_config.pandacon.timeout, - verify=harvester_config.pandacon.ca_cert, - cert=cert) + res = session.post(url, + data=data, + headers={"Accept": "application/json", + "Connection": "close"}, + timeout=harvester_config.pandacon.timeout, + verify=harvester_config.pandacon.ca_cert, + cert=cert) if self.verbose: tmpLog.debug('exec={0} code={1} {3}. 
return={2}'.format(tmpExec, res.status_code, res.text, sw.get_elapsed_time())) @@ -129,11 +131,12 @@ def put_ssl(self, path, files, cert=None): if cert is None: cert = (harvester_config.pandacon.cert_file, harvester_config.pandacon.key_file) - res = requests.post(url, - files=files, - timeout=harvester_config.pandacon.timeout, - verify=harvester_config.pandacon.ca_cert, - cert=cert) + session = get_http_adapter_with_random_dns_resolution() + res = session.post(url, + files=files, + timeout=harvester_config.pandacon.timeout, + verify=harvester_config.pandacon.ca_cert, + cert=cert) if self.verbose: tmpLog.debug('exec={0} code={1} return={2}'.format(tmpExec, res.status_code, res.text)) if res.status_code == 200:
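The point of patch 057 is that a plain requests call always connects to whatever IP the resolver returns first, while the panda-common helper builds a session that spreads requests across the aliases behind the PanDA server DNS name. The real implementation lives in panda-common; conceptually it boils down to something like this sketch (names and approach illustrative, not the panda-common code):

# conceptual sketch only: pick one of the IPs behind a DNS alias at random
import random
import socket

def pick_random_endpoint(hostname, port=443):
    infos = socket.getaddrinfo(hostname, port, socket.AF_INET, socket.SOCK_STREAM)
    return random.choice([info[4][0] for info in infos])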
From ad455ed76da7119d81cacd757fca1abb25c5602c Mon Sep 17 00:00:00 2001 From: fbarreir Date: Tue, 4 Feb 2020 20:18:44 +0100 Subject: [PATCH 058/181] prodanaly: new branch because I polluted gu_push branch accidentally --- pandaharvester/harvesterbody/job_fetcher.py | 13 +++++++++++-- pandaharvester/harvestercore/queue_config_mapper.py | 6 +++++- 2 files changed, 16 insertions(+), 3 deletions(-) diff --git a/pandaharvester/harvesterbody/job_fetcher.py b/pandaharvester/harvesterbody/job_fetcher.py index b0d7823b..b3418877 100644 --- a/pandaharvester/harvesterbody/job_fetcher.py +++ b/pandaharvester/harvesterbody/job_fetcher.py @@ -10,6 +10,7 @@ from pandaharvester.harvestercore.db_proxy_pool import DBProxyPool as DBProxy from pandaharvester.harvesterbody.agent_base import AgentBase from pandaharvester.harvestercore.plugin_factory import PluginFactory +from pandaharvester.harvestermisc.info_utils import PandaQueuesDict # logger _logger = core_utils.setup_logger('job_fetcher') @@ -24,6 +25,7 @@ def __init__(self, communicator, queue_config_mapper, single_mode=False): self.communicator = communicator self.nodeName = socket.gethostname() self.queueConfigMapper = queue_config_mapper + self.pandaQueueDict = PandaQueuesDict() self.pluginFactory = PluginFactory() # main loop @@ -44,17 +46,24 @@ def run(self): method_name='run') # get queue queueConfig = self.queueConfigMapper.get_queue(queueName) + siteName = queueConfig.siteName # upper limit if nJobs > harvester_config.jobfetcher.maxJobs: nJobs = harvester_config.jobfetcher.maxJobs + # get jobs - default_prodSourceLabel = queueConfig.get_source_label() + try: + is_grandly_unified_queue = self.pandaQueueDict.is_grandly_unified_queue(siteName) + except Exception: + is_grandly_unified_queue = False + + default_prodSourceLabel = queueConfig.get_source_label(is_gu=is_grandly_unified_queue) + pdpm = getattr(queueConfig, 'prodSourceLabelRandomWeightsPermille', {}) choice_list = core_utils.make_choice_list(pdpm=pdpm, default=default_prodSourceLabel) prodSourceLabel = random.choice(choice_list) tmpLog.debug('getting {0} jobs for prodSourceLabel {1}'.format(nJobs, prodSourceLabel)) sw = core_utils.get_stopwatch() - siteName = queueConfig.siteName jobs, errStr = self.communicator.get_jobs(siteName, self.nodeName, prodSourceLabel, self.nodeName, nJobs, diff --git a/pandaharvester/harvestercore/queue_config_mapper.py b/pandaharvester/harvestercore/queue_config_mapper.py index ee83827a..4b912c28 100644 --- a/pandaharvester/harvestercore/queue_config_mapper.py +++ b/pandaharvester/harvestercore/queue_config_mapper.py @@ -77,7 +77,7 @@ def is_no_heartbeat_status(self, status): return status in self.get_no_heartbeat_status() # get prodSourceLabel - def get_source_label(self, job_type=None): + def get_source_label(self, job_type=None, is_gu=None): # if queue is in test status, only submit workers for HC jobs if self.queueStatus == 'test': return 'test' @@ -86,6 +86,10 @@ def get_source_label(self, job_type=None): if job_type in ('user', 'panda'): return 'user' + # grandly unified queues: call to getJobs should not request for a particular prodSourceLabel + if is_gu: + return 'unified' + return self.prodSourceLabel From a2da512b70f768a6690ba444a84e08d546b15123 Mon Sep 17 00:00:00 2001 From: Doug Benjamin Date: Mon, 10 Feb 2020 06:48:18 -0600 Subject: [PATCH 059/181] create monitor for LSF --- .../harvestermonitor/lsf_monitor.py | 69 +++++++++++++++++++ 1 file changed, 69 insertions(+) create mode 100644 pandaharvester/harvestermonitor/lsf_monitor.py diff --git a/pandaharvester/harvestermonitor/lsf_monitor.py b/pandaharvester/harvestermonitor/lsf_monitor.py new file mode 100644 index 00000000..7218b34e --- /dev/null +++ b/pandaharvester/harvestermonitor/lsf_monitor.py @@ -0,0 +1,69 @@ +import re +try: + import subprocess32 as subprocess +except: + import subprocess + +from pandaharvester.harvestercore import core_utils +from pandaharvester.harvestercore.work_spec import WorkSpec +from pandaharvester.harvestercore.plugin_base import PluginBase + +# logger +baseLogger = core_utils.setup_logger('pbs_monitor') + + +# monitor for LSF batch system +class LSFMonitor(PluginBase): + # constructor + def __init__(self, **kwarg): + PluginBase.__init__(self, **kwarg) + + # check workers + def check_workers(self, workspec_list): + retList = [] + for workSpec in workspec_list: + # make logger + tmpLog = self.make_logger(baseLogger, 'workerID={0}'.format(workSpec.workerID), + method_name='check_workers') + # command arguments; the -o format string must stay one argument, so do not split a quoted string + args = ['bjobs', '-noheader', '-o', 'jobid:10 stat:10', str(workSpec.batchID)] + # check + tmpLog.debug('check with {0}'.format(' '.join(args))) + p = subprocess.Popen(args, + shell=False, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE) + newStatus = workSpec.status + # check return code + stdOut, stdErr = p.communicate() + retCode = p.returncode + tmpLog.debug('retCode={0}'.format(retCode)) + errStr = '' + if retCode == 0: + # parse + for tmpLine in stdOut.split('\n'): + tmpMatch = re.search('{0} '.format(workSpec.batchID), tmpLine) + if tmpMatch is not None: + errStr = tmpLine + batchStatus = tmpLine.split()[-1] + if batchStatus in ['RUN']: + newStatus = WorkSpec.ST_running + elif batchStatus in ['DONE']: + newStatus = WorkSpec.ST_finished + elif batchStatus in ['PEND', 'PROV', 'WAIT']: + newStatus = WorkSpec.ST_submitted + else: + newStatus = WorkSpec.ST_failed + tmpLog.debug('batchStatus {0} -> workerStatus {1}'.format(batchStatus, + newStatus)) + break + retList.append((newStatus, errStr)) + else: + # failed + errStr = stdOut + ' ' + stdErr + tmpLog.error(errStr) + if 'Unknown Job Id Error' in errStr: + tmpLog.info("Mark job as finished.") + newStatus = WorkSpec.ST_finished + retList.append((newStatus, errStr)) + return True, retList
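The bjobs parsing above boils down to a one-line status lookup; the sample line below mimics what -noheader -o 'jobid:10 stat:10' prints. A standalone sketch of the mapping (illustrative, not the plugin itself):

def map_lsf_status(line):
    batch_status = line.split()[-1]
    if batch_status == 'RUN':
        return 'running'
    if batch_status == 'DONE':
        return 'finished'
    if batch_status in ('PEND', 'PROV', 'WAIT'):
        return 'submitted'
    return 'failed'  # EXIT, UNKWN, ZOMBI, suspended states, ...

print(map_lsf_status('12345      RUN       '))  # running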
From c5380b269f1e192f3c50aaa4190fc54ad0dcfcca Mon Sep 17 00:00:00 2001 From: Doug Benjamin Date: Mon, 10 Feb 2020 06:59:05 -0600 Subject: [PATCH 060/181] lsf --- .../harvestersubmitter/lsf_submitter.py | 107 ++++++++++++++++++ 1 file changed, 107 insertions(+) create mode 100644 pandaharvester/harvestersubmitter/lsf_submitter.py diff --git a/pandaharvester/harvestersubmitter/lsf_submitter.py b/pandaharvester/harvestersubmitter/lsf_submitter.py new file mode 100644 index 00000000..438182d6 --- /dev/null +++ b/pandaharvester/harvestersubmitter/lsf_submitter.py @@ -0,0 +1,107 @@ +import datetime +import tempfile +try: + import subprocess32 as subprocess +except: + import subprocess + +from pandaharvester.harvestercore import core_utils +from pandaharvester.harvestercore.plugin_base import PluginBase + +# logger +baseLogger = core_utils.setup_logger('lsf_submitter') + + +# submitter for LSF batch system +class LSFSubmitter(PluginBase): + # constructor + def __init__(self, **kwarg): + self.uploadLog = False + self.logBaseURL = None + PluginBase.__init__(self, **kwarg) + # template for batch script + tmpFile = open(self.templateFile) + self.template = tmpFile.read() + tmpFile.close() + + # submit workers + def submit_workers(self, workspec_list): + retList = [] + for workSpec in workspec_list: + # make logger + tmpLog = self.make_logger(baseLogger, 'workerID={0}'.format(workSpec.workerID), + method_name='submit_workers') + # make batch script + batchFile = self.make_batch_script(workSpec) + # command; LSF reads the #BSUB directives when the script is fed through stdin + comStr = "bsub < {0}".format(batchFile) + # submit + tmpLog.debug('submit with {0}'.format(comStr)) + p = subprocess.Popen(comStr, + shell=True, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE) + # check return code + stdOut, stdErr = p.communicate() + retCode = p.returncode + tmpLog.debug('retCode={0}'.format(retCode)) + if retCode == 0: + # extract batchID from output like: Job <12345> is submitted to queue <queue> + workSpec.batchID = stdOut.split()[1].strip('<>') + tmpLog.debug('batchID={0}'.format(workSpec.batchID)) + # set log files + if self.uploadLog: + if self.logBaseURL is None: + baseDir = workSpec.get_access_point() + else: + baseDir = self.logBaseURL + stdOut, stdErr = self.get_log_file_names(batchFile, workSpec.batchID) + if stdOut is not None: + workSpec.set_log_file('stdout', '{0}/{1}'.format(baseDir, stdOut)) + if stdErr is not None: + workSpec.set_log_file('stderr', '{0}/{1}'.format(baseDir, stdErr)) + tmpRetVal = (True, '') + else: + # failed + errStr = stdOut + ' ' + stdErr + tmpLog.error(errStr) + tmpRetVal = (False, errStr) + retList.append(tmpRetVal) + return retList + + # make batch script + def make_batch_script(self, workspec): + if hasattr(self, 'dynamicSizing') and self.dynamicSizing is True: + maxWalltime = str(datetime.timedelta(seconds=workspec.maxWalltime)) + yodaWallClockLimit = workspec.maxWalltime / 60 + else: + workspec.nCore = self.nCore + maxWalltime = str(datetime.timedelta(seconds=self.maxWalltime)) + yodaWallClockLimit = self.maxWalltime / 60 + tmpFile = tempfile.NamedTemporaryFile(delete=False, suffix='_submit.sh', dir=workspec.get_access_point()) + tmpFile.write(self.template.format(nCorePerNode=self.nCorePerNode, + localQueue=self.localQueue, + projectName=self.projectName, + nNode=workspec.nCore / self.nCorePerNode, + accessPoint=workspec.accessPoint, + walltime=maxWalltime, + yodaWallClockLimit=yodaWallClockLimit, + workerID=workspec.workerID) + ) + tmpFile.close() + return tmpFile.name + + # get log file names + def get_log_file_names(self, batch_script, batch_id): + stdOut = None + stdErr = None + with open(batch_script) as f: + for line in f: + if not line.startswith('#BSUB'): + continue + items = line.split() + if '-o' in items: + stdOut = items[-1].replace('$LSB_BATCH_JID', batch_id) + elif '-e' in items: + stdErr = items[-1].replace('$LSB_BATCH_JID', batch_id) + return stdOut, stdErr From 272b271edd7caf1301ea6914b49c6c3650cd2c8d Mon Sep 17 00:00:00 2001 From: Doug Benjamin Date: Tue, 11 Feb 2020 02:14:13 -0600 Subject: [PATCH 061/181] fix logger name --- pandaharvester/harvestermonitor/lsf_monitor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/pandaharvester/harvestermonitor/lsf_monitor.py b/pandaharvester/harvestermonitor/lsf_monitor.py
index 7218b34e..a5f66918 100644
--- a/pandaharvester/harvestermonitor/lsf_monitor.py
+++ b/pandaharvester/harvestermonitor/lsf_monitor.py
@@ -9,7 +9,7 @@
 from pandaharvester.harvestercore.plugin_base import PluginBase
 
 # logger
-baseLogger = core_utils.setup_logger('pbs_monitor')
+baseLogger = core_utils.setup_logger('lsf_monitor')
 
 
 # monitor for LSF batch system

From 3610178d1fbb250ee07a6c43e7816aa6480d0fe5 Mon Sep 17 00:00:00 2001
From: Doug Benjamin
Date: Tue, 11 Feb 2020 02:40:45 -0600
Subject: [PATCH 062/181] add lsf sweeper module

---
 .../harvestersweeper/lsf_sweeper.py | 68 +++++++++++++++++++
 1 file changed, 68 insertions(+)
 create mode 100644 pandaharvester/harvestersweeper/lsf_sweeper.py

diff --git a/pandaharvester/harvestersweeper/lsf_sweeper.py b/pandaharvester/harvestersweeper/lsf_sweeper.py
new file mode 100644
index 00000000..21c63e04
--- /dev/null
+++ b/pandaharvester/harvestersweeper/lsf_sweeper.py
@@ -0,0 +1,68 @@
+import os
+import shutil
+try:
+    import subprocess32 as subprocess
+except:
+    import subprocess
+
+from pandaharvester.harvestercore import core_utils
+from pandaharvester.harvestercore.plugin_base import PluginBase
+
+# logger
+baseLogger = core_utils.setup_logger('lsf_sweeper')
+
+
+# plugin for sweeper with LSF
+class LFSSweeper(PluginBase):
+    # constructor
+    def __init__(self, **kwarg):
+        PluginBase.__init__(self, **kwarg)
+
+    # kill a worker
+    def kill_worker(self, workspec):
+        """Kill a worker in a scheduling system like batch systems and computing elements.
+
+        :param workspec: worker specification
+        :type workspec: WorkSpec
+        :return: A tuple of return code (True for success, False otherwise) and error dialog
+        :rtype: (bool, string)
+        """
+        # make logger
+        tmpLog = self.make_logger(baseLogger, 'workerID={0}'.format(workspec.workerID),
+                                  method_name='kill_worker')
+        # kill command
+        comStr = 'bkill {0}'.format(workspec.batchID)
+        # execute
+        p = subprocess.Popen(comStr.split(), shell=False, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+        stdOut, stdErr = p.communicate()
+        retCode = p.returncode
+        if retCode != 0:
+            # failed
+            errStr = 'command "{0}" failed, retCode={1}, error: {2} {3}'.format(comStr, retCode, stdOut, stdErr)
+            tmpLog.error(errStr)
+            return False, errStr
+        else:
+            tmpLog.info('Succeeded to kill workerID={0} batchID={1}'.format(workspec.workerID, workspec.batchID))
+        # return
+        return True, ''
+
+    # cleanup for a worker
+    def sweep_worker(self, workspec):
+        """Perform cleanup procedures for a worker, such as deletion of work directory.
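+        An already-removed access point is tolerated below, so sweeping the
+        same worker more than once is safe.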
+ + :param workspec: worker specification + :type workspec: WorkSpec + :return: A tuple of return code (True for success, False otherwise) and error dialog + :rtype: (bool, string) + """ + # make logger + tmpLog = self.make_logger(baseLogger, 'workerID={0}'.format(workspec.workerID), + method_name='sweep_worker') + # clean up worker directory + if os.path.exists(workspec.accessPoint): + shutil.rmtree(workspec.accessPoint) + tmpLog.info('removed {0}'.format(workspec.accessPoint)) + else: + tmpLog.info('access point already removed.') + # return + return True, '' From 818e03959c604a5cf23120c5da847ebea639c274 Mon Sep 17 00:00:00 2001 From: David Cameron Date: Thu, 13 Feb 2020 12:18:12 +0100 Subject: [PATCH 063/181] set pilot proxy for panda prodsourcelabel --- pandaharvester/harvestersubmitter/act_submitter.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandaharvester/harvestersubmitter/act_submitter.py b/pandaharvester/harvestersubmitter/act_submitter.py index a42624d5..c0b02298 100644 --- a/pandaharvester/harvestersubmitter/act_submitter.py +++ b/pandaharvester/harvestersubmitter/act_submitter.py @@ -40,7 +40,6 @@ def __init__(self, **kwarg): uc.ProxyPath(str(proxy)) cred = arc.Credential(uc) dn = cred.GetIdentityName() - self.log.info("Proxy {0} with DN {1} and role {2}".format(proxy, dn, role)) actp = aCTProxy(self.log) attr = '/atlas/Role='+role @@ -83,7 +82,8 @@ def submit_workers(self, workspec_list): desc['pandastatus'] = 'sent' desc['actpandastatus'] = 'sent' desc['siteName'] = workSpec.computingSite - desc['proxyid'] = self.proxymap['pilot' if prodSourceLabel == 'user' else 'production'] + desc['proxyid'] = self.proxymap['pilot' if prodSourceLabel in ['user', 'panda'] else 'production'] + desc['prodSourceLabel'] = prodSourceLabel desc['sendhb'] = 0 metadata = {'harvesteraccesspoint': workSpec.get_access_point(), 'schedulerid': 'harvester-{}'.format(harvester_config.master.harvester_id)} From d5614b1c35c50d34441cb5856281cd4506ddfefa Mon Sep 17 00:00:00 2001 From: Doug Benjamin Date: Thu, 13 Feb 2020 15:03:56 -0600 Subject: [PATCH 064/181] comment out maxwalltime --- .../harvestersubmitter/lsf_submitter.py | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/pandaharvester/harvestersubmitter/lsf_submitter.py b/pandaharvester/harvestersubmitter/lsf_submitter.py index 438182d6..cd6e7780 100644 --- a/pandaharvester/harvestersubmitter/lsf_submitter.py +++ b/pandaharvester/harvestersubmitter/lsf_submitter.py @@ -71,21 +71,20 @@ def submit_workers(self, workspec_list): # make batch script def make_batch_script(self, workspec): - if hasattr(self, 'dynamicSizing') and self.dynamicSizing is True: - maxWalltime = str(datetime.timedelta(seconds=workspec.maxWalltime)) - yodaWallClockLimit = workspec.maxWalltime / 60 - else: - workspec.nCore = self.nCore - maxWalltime = str(datetime.timedelta(seconds=self.maxWalltime)) - yodaWallClockLimit = self.maxWalltime / 60 + #if hasattr(self, 'dynamicSizing') and self.dynamicSizing is True: + # maxWalltime = str(datetime.timedelta(seconds=workspec.maxWalltime)) + # yodaWallClockLimit = workspec.maxWalltime / 60 + #else: + # workspec.nCore = self.nCore + # maxWalltime = str(datetime.timedelta(seconds=self.maxWalltime)) + # yodaWallClockLimit = self.maxWalltime / 60 tmpFile = tempfile.NamedTemporaryFile(delete=False, suffix='_submit.sh', dir=workspec.get_access_point()) tmpFile.write(self.template.format(nCorePerNode=self.nCorePerNode, localQueue=self.localQueue, projectName=self.projectName, 
nNode=workspec.nCore / self.nCorePerNode,
                                            accessPoint=workspec.accessPoint,
-                                           walltime=maxWalltime,
-                                           yodaWallClockLimit=yodaWallClockLimit,
+                                           #walltime=maxWalltime,
                                            workerID=workspec.workerID)
                        )
         tmpFile.close()
         return tmpFile.name

From de158ba6a965cd4c32f16e71bb1f792dbfe59db6 Mon Sep 17 00:00:00 2001
From: Doug Benjamin
Date: Thu, 13 Feb 2020 15:27:39 -0600
Subject: [PATCH 065/181] comment out local queue

---
 pandaharvester/harvestersubmitter/lsf_submitter.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pandaharvester/harvestersubmitter/lsf_submitter.py b/pandaharvester/harvestersubmitter/lsf_submitter.py
index cd6e7780..a3923c77 100644
--- a/pandaharvester/harvestersubmitter/lsf_submitter.py
+++ b/pandaharvester/harvestersubmitter/lsf_submitter.py
@@ -34,7 +34,7 @@ def submit_workers(self, workspec_list):
             # make batch script
             batchFile = self.make_batch_script(workSpec)
             # command
-            comStr = "qsub {0}".format(batchFile)
+            comStr = "bsub {0}".format(batchFile)
             # submit
             tmpLog.debug('submit with {0}'.format(comStr))
             p = subprocess.Popen(comStr.split(),
@@ -80,7 +80,7 @@ def make_batch_script(self, workspec):
         #    yodaWallClockLimit = self.maxWalltime / 60
         tmpFile = tempfile.NamedTemporaryFile(delete=False, suffix='_submit.sh', dir=workspec.get_access_point())
         tmpFile.write(self.template.format(nCorePerNode=self.nCorePerNode,
-                                           localQueue=self.localQueue,
+                                           #localQueue=self.localQueue,
                                            projectName=self.projectName,
                                            nNode=workspec.nCore / self.nCorePerNode,
                                            accessPoint=workspec.accessPoint,

From 094253a1c57bbd5c289e55ad29e253c197f11c45 Mon Sep 17 00:00:00 2001
From: Doug Benjamin
Date: Thu, 13 Feb 2020 21:40:48 -0600
Subject: [PATCH 066/181] comment out projectname

---
 pandaharvester/harvestersubmitter/lsf_submitter.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pandaharvester/harvestersubmitter/lsf_submitter.py b/pandaharvester/harvestersubmitter/lsf_submitter.py
index a3923c77..f6b94c82 100644
--- a/pandaharvester/harvestersubmitter/lsf_submitter.py
+++ b/pandaharvester/harvestersubmitter/lsf_submitter.py
@@ -81,7 +81,7 @@ def make_batch_script(self, workspec):
         tmpFile = tempfile.NamedTemporaryFile(delete=False, suffix='_submit.sh', dir=workspec.get_access_point())
         tmpFile.write(self.template.format(nCorePerNode=self.nCorePerNode,
                                            #localQueue=self.localQueue,
-                                           projectName=self.projectName,
+                                           #projectName=self.projectName,
                                            nNode=workspec.nCore / self.nCorePerNode,
                                            accessPoint=workspec.accessPoint,
                                            #walltime=maxWalltime,

From 52ed07d02d9e60cb852d7dfa8c27b19191923cdb Mon Sep 17 00:00:00 2001
From: Doug Benjamin
Date: Fri, 14 Feb 2020 00:58:05 -0600
Subject: [PATCH 067/181] explicitly set mode for batch file creation

---
 pandaharvester/harvestersubmitter/lsf_submitter.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pandaharvester/harvestersubmitter/lsf_submitter.py b/pandaharvester/harvestersubmitter/lsf_submitter.py
index f6b94c82..924a13a0 100644
--- a/pandaharvester/harvestersubmitter/lsf_submitter.py
+++ b/pandaharvester/harvestersubmitter/lsf_submitter.py
@@ -78,7 +78,7 @@ def make_batch_script(self, workspec):
         #    workspec.nCore = self.nCore
         #    maxWalltime = str(datetime.timedelta(seconds=self.maxWalltime))
         #    yodaWallClockLimit = self.maxWalltime / 60
-        tmpFile = tempfile.NamedTemporaryFile(delete=False, suffix='_submit.sh', dir=workspec.get_access_point())
+        tmpFile = tempfile.NamedTemporaryFile(mode='w', delete=False, suffix='_submit.sh', dir=workspec.get_access_point())
         tmpFile.write(self.template.format(nCorePerNode=self.nCorePerNode,
#localQueue=self.localQueue, #projectName=self.projectName, From 2aee16f5b79f7f36c2026780d9a27b914addf1da Mon Sep 17 00:00:00 2001 From: fahui Date: Fri, 14 Feb 2020 16:50:45 +0800 Subject: [PATCH 068/181] support utf8 characters in DB --- pandaharvester/commit_timestamp.py | 2 +- pandaharvester/harvestercore/core_utils.py | 7 +++++++ pandaharvester/harvestercore/db_proxy.py | 2 +- pandaharvester/harvesterfifo/mysql_fifo.py | 4 ++-- 4 files changed, 11 insertions(+), 4 deletions(-) diff --git a/pandaharvester/commit_timestamp.py b/pandaharvester/commit_timestamp.py index 3bda1cf9..62fbed64 100644 --- a/pandaharvester/commit_timestamp.py +++ b/pandaharvester/commit_timestamp.py @@ -1 +1 @@ -timestamp = "04-02-2020 15:40:49 on master (by tmaeno)" +timestamp = "14-02-2020 08:50:45 on contrib_cern (by fahui)" diff --git a/pandaharvester/harvestercore/core_utils.py b/pandaharvester/harvestercore/core_utils.py index fcaade8d..d488a467 100644 --- a/pandaharvester/harvestercore/core_utils.py +++ b/pandaharvester/harvestercore/core_utils.py @@ -4,6 +4,7 @@ """ import os +import re import sys import time import zlib @@ -639,3 +640,9 @@ def pickle_to_text(data): # unpickle from text def unpickle_from_text(text): return pickle.loads(codecs.decode(text.encode(), 'base64')) + + +# remove non-latin characters and return string +def remove_non_latins(text): + ret = re.sub(r'[^\x00-\x7F\x80-\xFF\u0100-\u017F\u0180-\u024F\u1E00-\u1EFF]', '', text) + return ret diff --git a/pandaharvester/harvestercore/db_proxy.py b/pandaharvester/harvestercore/db_proxy.py index c2635af6..131e3470 100644 --- a/pandaharvester/harvestercore/db_proxy.py +++ b/pandaharvester/harvestercore/db_proxy.py @@ -104,7 +104,7 @@ def fetchall(self): self.con = MySQLdb.connect(user=harvester_config.db.user, passwd=harvester_config.db.password, db=harvester_config.db.schema, host=host, port=port, - cursorclass=MyCursor) + cursorclass=MyCursor, charset='utf8') self.cur = self.con.cursor() else: import mysql.connector diff --git a/pandaharvester/harvesterfifo/mysql_fifo.py b/pandaharvester/harvesterfifo/mysql_fifo.py index 7405d081..684ab180 100644 --- a/pandaharvester/harvesterfifo/mysql_fifo.py +++ b/pandaharvester/harvesterfifo/mysql_fifo.py @@ -52,8 +52,8 @@ def __init__(self, **kwarg): except ImportError: raise Exception('No available MySQL DB API installed. 
Please pip install mysqlclient or mysql-connection-python')
         else:
-            self.con = mysql.connector.connect(user=db_user, passwd=db_password,
-                                               db=db_schema, host=db_host, port=db_port)
+            self.con = mysql.connector.connect(user=db_user, passwd=db_password, db=db_schema,
+                                               host=db_host, port=db_port, charset='utf8')
             self.cur = self.con.cursor(buffered=True)
             self.OperationalError = mysql.connector.errors.OperationalError
         else:

From eac969ca0c523e0f2a632fd6e8e5fbd8271b86f4 Mon Sep 17 00:00:00 2001
From: fahui
Date: Fri, 14 Feb 2020 16:57:52 +0800
Subject: [PATCH 069/181] v0.2.1-rc

---
 pandaharvester/commit_timestamp.py | 2 +-
 pandaharvester/panda_pkg_info.py   | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/pandaharvester/commit_timestamp.py b/pandaharvester/commit_timestamp.py
index 62fbed64..af253ed4 100644
--- a/pandaharvester/commit_timestamp.py
+++ b/pandaharvester/commit_timestamp.py
@@ -1 +1 @@
-timestamp = "14-02-2020 08:50:45 on contrib_cern (by fahui)"
+timestamp = "14-02-2020 08:57:52 on contrib_cern (by fahui)"
diff --git a/pandaharvester/panda_pkg_info.py b/pandaharvester/panda_pkg_info.py
index a4389a0c..3dbb6482 100644
--- a/pandaharvester/panda_pkg_info.py
+++ b/pandaharvester/panda_pkg_info.py
@@ -1 +1 @@
-release_version = "0.2.0-rc"
+release_version = "0.2.1-rc"

From 11dfaf19ad65aa910cb8c12a3fb076ff4d1d42f2 Mon Sep 17 00:00:00 2001
From: Doug Benjamin
Date: Fri, 14 Feb 2020 14:20:42 -0600
Subject: [PATCH 070/181] add nGpuPerNode to lsf submitter code

---
 .../harvestersubmitter/lsf_submitter.py | 16 +++++++++++++++-
 1 file changed, 15 insertions(+), 1 deletion(-)

diff --git a/pandaharvester/harvestersubmitter/lsf_submitter.py b/pandaharvester/harvestersubmitter/lsf_submitter.py
index 924a13a0..5d752afa 100644
--- a/pandaharvester/harvestersubmitter/lsf_submitter.py
+++ b/pandaharvester/harvestersubmitter/lsf_submitter.py
@@ -78,13 +78,27 @@ def make_batch_script(self, workspec):
         #    workspec.nCore = self.nCore
         #    maxWalltime = str(datetime.timedelta(seconds=self.maxWalltime))
         #    yodaWallClockLimit = self.maxWalltime / 60
+
+        # set number of nodes - note: ultimately this will need something more sophisticated
+        if hasattr(self,'nGpuPerNode'):
+            if int(self.nGpuPerNode) > 0:
+                numnodes = int(workspec.nJobs/self.nGpuPerNode)
+                if numnodes <= 0:
+                    numnodes = 1
+                else:
+                    if (workspec.nJobs % self.nGpuPerNode) != 0 :
+                        numnodes += 1
+        else:
+            numnodes=workspec.nCore / self.nCorePerNode
+
         tmpFile = tempfile.NamedTemporaryFile(mode='w', delete=False, suffix='_submit.sh', dir=workspec.get_access_point())
         tmpFile.write(self.template.format(nCorePerNode=self.nCorePerNode,
                                            #localQueue=self.localQueue,
                                            #projectName=self.projectName,
-                                           nNode=workspec.nCore / self.nCorePerNode,
+                                           nNode=numnodes,
                                            accessPoint=workspec.accessPoint,
                                            #walltime=maxWalltime,
+                                           #yodaWallClockLimit=yodaWallClockLimit,
                                            workerID=workspec.workerID)
                        )
         tmpFile.close()

From 9f6694a0a322430a407f2cf6fc0cb139806f897e Mon Sep 17 00:00:00 2001
From: Doug Benjamin
Date: Sat, 15 Feb 2020 10:06:56 -0600
Subject: [PATCH 071/181] fixed bsub command

---
 pandaharvester/harvestersubmitter/lsf_submitter.py | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/pandaharvester/harvestersubmitter/lsf_submitter.py b/pandaharvester/harvestersubmitter/lsf_submitter.py
index 5d752afa..1210a80c 100644
--- a/pandaharvester/harvestersubmitter/lsf_submitter.py
+++ b/pandaharvester/harvestersubmitter/lsf_submitter.py
@@ -34,13 +34,14 @@ def submit_workers(self, workspec_list):
             # make batch script
             batchFile =
self.make_batch_script(workSpec) # command - comStr = "bsub {0}".format(batchFile) + comStr = "bsub -L /bin/sh" # submit - tmpLog.debug('submit with {0}'.format(comStr)) + tmpLog.debug('submit with {0} and LSF options file {1}'.format(comStr,batchFile)) p = subprocess.Popen(comStr.split(), shell=False, stdout=subprocess.PIPE, - stderr=subprocess.PIPE) + stderr=subprocess.PIPE, + stdin=open(batchFile,'r')) # check return code stdOut, stdErr = p.communicate() retCode = p.returncode @@ -114,7 +115,9 @@ def get_log_file_names(self, batch_script, batch_id): continue items = line.split() if '-o' in items: - stdOut = items[-1].replace('$LSB_BATCH_JID', batch_id) + #stdOut = items[-1].replace('$LSB_BATCH_JID', batch_id) + stdOut = items[-1].replace('%J', batch_id) elif '-e' in items: - stdErr = items[-1].replace('$LSB_BATCH_JID', batch_id) + #stdErr = items[-1].replace('$LSB_BATCH_JID', batch_id) + stdErr = items[-1].replace('%J', batch_id) return stdOut, stdErr From 8ff98dc1ecb34936b44572bbfd91fe22a583b037 Mon Sep 17 00:00:00 2001 From: fahui Date: Mon, 17 Feb 2020 17:02:37 +0800 Subject: [PATCH 072/181] fix --- pandaharvester/commit_timestamp.py | 2 +- pandaharvester/harvestercore/core_utils.py | 7 ------- 2 files changed, 1 insertion(+), 8 deletions(-) diff --git a/pandaharvester/commit_timestamp.py b/pandaharvester/commit_timestamp.py index af253ed4..d5b1650b 100644 --- a/pandaharvester/commit_timestamp.py +++ b/pandaharvester/commit_timestamp.py @@ -1 +1 @@ -timestamp = "14-02-2020 08:57:52 on contrib_cern (by fahui)" +timestamp = "17-02-2020 09:02:37 on contrib_cern (by fahui)" diff --git a/pandaharvester/harvestercore/core_utils.py b/pandaharvester/harvestercore/core_utils.py index d488a467..fcaade8d 100644 --- a/pandaharvester/harvestercore/core_utils.py +++ b/pandaharvester/harvestercore/core_utils.py @@ -4,7 +4,6 @@ """ import os -import re import sys import time import zlib @@ -640,9 +639,3 @@ def pickle_to_text(data): # unpickle from text def unpickle_from_text(text): return pickle.loads(codecs.decode(text.encode(), 'base64')) - - -# remove non-latin characters and return string -def remove_non_latins(text): - ret = re.sub(r'[^\x00-\x7F\x80-\xFF\u0100-\u017F\u0180-\u024F\u1E00-\u1EFF]', '', text) - return ret From 80c78eb6c815d3f74ed2defd3fbafc6c21e988e5 Mon Sep 17 00:00:00 2001 From: tmaeno Date: Wed, 19 Feb 2020 14:52:40 +0100 Subject: [PATCH 073/181] fixed for aux input --- pandaharvester/commit_timestamp.py | 2 +- pandaharvester/harvesterbody/preparator.py | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/pandaharvester/commit_timestamp.py b/pandaharvester/commit_timestamp.py index d5b1650b..75fa0930 100644 --- a/pandaharvester/commit_timestamp.py +++ b/pandaharvester/commit_timestamp.py @@ -1 +1 @@ -timestamp = "17-02-2020 09:02:37 on contrib_cern (by fahui)" +timestamp = "19-02-2020 13:52:40 on master (by tmaeno)" diff --git a/pandaharvester/harvesterbody/preparator.py b/pandaharvester/harvesterbody/preparator.py index 3cc94956..618d0f5c 100644 --- a/pandaharvester/harvesterbody/preparator.py +++ b/pandaharvester/harvesterbody/preparator.py @@ -43,7 +43,8 @@ def run(self): harvester_config.preparator.checkInterval, harvester_config.preparator.lockInterval, lockedBy, - max_files_per_job=maxFilesPerJob) + max_files_per_job=maxFilesPerJob, + ng_file_status_list=['ready']) mainLog.debug('got {0} jobs to check'.format(len(jobsToCheck))) # loop over all jobs for jobSpec in jobsToCheck: From 5eadfb58b5a9a4907030f98f86f34737784dd423 Mon Sep 17 00:00:00 2001 From: Doug 
Benjamin
Date: Wed, 19 Feb 2020 09:38:35 -0600
Subject: [PATCH 074/181] fix batchID mistake

---
 pandaharvester/harvestersubmitter/lsf_submitter.py | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/pandaharvester/harvestersubmitter/lsf_submitter.py b/pandaharvester/harvestersubmitter/lsf_submitter.py
index 1210a80c..479d150c 100644
--- a/pandaharvester/harvestersubmitter/lsf_submitter.py
+++ b/pandaharvester/harvestersubmitter/lsf_submitter.py
@@ -1,5 +1,6 @@
 import datetime
 import tempfile
+import re
 try:
     import subprocess32 as subprocess
 except:
@@ -46,9 +47,14 @@ def submit_workers(self, workspec_list):
             stdOut, stdErr = p.communicate()
             retCode = p.returncode
             tmpLog.debug('retCode={0}'.format(retCode))
+            tmpLog.debug('stdOut={0}'.format(stdOut))
+            tmpLog.debug('stdErr={0}'.format(stdErr))
             if retCode == 0:
                 # extract batchID
-                workSpec.batchID = stdOut.split()[-1]
+                batchID = str(stdOut.split()[1],'utf-8')
+                result = re.sub('[^0-9]','', batchID)
+                tmpLog.debug('strip out non-numeric characters from {0} - result {1}'.format(batchID,result))
+                workSpec.batchID = result
                 tmpLog.debug('batchID={0}'.format(workSpec.batchID))
                 # set log files
                 if self.uploadLog:

From 7de15c547e5566a7024047909e62daf9898f7ecd Mon Sep 17 00:00:00 2001
From: Doug Benjamin
Date: Wed, 19 Feb 2020 09:39:55 -0600
Subject: [PATCH 075/181] fix module name typo

---
 pandaharvester/harvestersweeper/lsf_sweeper.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pandaharvester/harvestersweeper/lsf_sweeper.py b/pandaharvester/harvestersweeper/lsf_sweeper.py
index 21c63e04..7c0c29d4 100644
--- a/pandaharvester/harvestersweeper/lsf_sweeper.py
+++ b/pandaharvester/harvestersweeper/lsf_sweeper.py
@@ -13,7 +13,7 @@
 
 
 # plugin for sweeper with LSF
-class LFSSweeper(PluginBase):
+class LSFSweeper(PluginBase):
     # constructor
     def __init__(self, **kwarg):
         PluginBase.__init__(self, **kwarg)

From f28958168363847e19e7837366b548d67c6a24b3 Mon Sep 17 00:00:00 2001
From: fahui
Date: Thu, 20 Feb 2020 21:52:03 +0800
Subject: [PATCH 076/181] k8s logs

---
 pandaharvester/commit_timestamp.py            |  2 +-
 .../harvestermessenger/k8s_messenger.py       | 44 +++++++++++++++++++
 pandaharvester/harvestermisc/k8s_utils.py     |  9 ++++
 3 files changed, 54 insertions(+), 1 deletion(-)
 create mode 100644 pandaharvester/harvestermessenger/k8s_messenger.py

diff --git a/pandaharvester/commit_timestamp.py b/pandaharvester/commit_timestamp.py
index 75fa0930..b0464fc9 100644
--- a/pandaharvester/commit_timestamp.py
+++ b/pandaharvester/commit_timestamp.py
@@ -1 +1 @@
-timestamp = "19-02-2020 13:52:40 on master (by tmaeno)"
+timestamp = "20-02-2020 13:52:03 on contrib_cern (by fahui)"
diff --git a/pandaharvester/harvestermessenger/k8s_messenger.py b/pandaharvester/harvestermessenger/k8s_messenger.py
new file mode 100644
index 00000000..4895e942
--- /dev/null
+++ b/pandaharvester/harvestermessenger/k8s_messenger.py
@@ -0,0 +1,44 @@
+
+
+from pandaharvester.harvestercore import core_utils
+from .base_messenger import BaseMessenger
+from pandaharvester.harvesterconfig import harvester_config
+from pandaharvester.harvestermisc.k8s_utils import k8s_Client
+# from pandaharvester.harvestercore.queue_config_mapper import QueueConfigMapper
+# from pandaharvester.harvestercore.work_spec import WorkSpec
+
+
+# Messenger for generic Kubernetes clusters
+class K8sMessenger(BaseMessenger):
+
+    def __init__(self, **kwargs):
+        BaseMessenger.__init__(self, **kwarg)
+        try:
+            self.logDir
+        except AttributeError:
+            print('K8sMessenger: Missing attribute logDir')
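+            # logDir has to be provided in this plugin's configuration;
+            # fail fast here because post_processing below writes the
+            # fetched pod logs under this directory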
+            raise
+        self.k8s_client = k8s_Client(namespace=self.k8s_namespace, config_file=self.k8s_config_file)
+        self._all_pods_list = self.k8s_client.get_pods_info()
+
+    def post_processing(self, workspec, jobspec_list, map_type):
+        """
+        Do the following in post_processing, i.e. when workers terminate (finished/failed/cancelled)
+        - Fetch logs of the pod from k8s
+        - Store or upload logs
+        """
+        # fetch and store logs
+        job_id = workspec.batchID
+        pods_list = self.k8s_client.filter_pods_info(self._all_pods_list, job_name=job_id)
+        pod_name_list = [ pods_info['name'] for pods_info in pods_list ]
+        outlog_filename = os.path(self.logDir, 'gridK8S.{0}.{1}.out'.format(workspec.workerID, workspec.batchID))
+        with open(outlog_filename, 'w') as f:
+            for pod_name in pod_name_list:
+                current_log_str = self.k8s_client.get_pod_logs(pod_name)
+                previous_log_str = self.k8s_client.get_pod_logs(pod_name, previous=True)
+                f.write(previous_log_str)
+                f.write('\n\n')
+                f.write(current_log_str)
+                f.write('\n\n\n')
+        # upload logs
+        pass
diff --git a/pandaharvester/harvestermisc/k8s_utils.py b/pandaharvester/harvestermisc/k8s_utils.py
index be2e6cc4..33f6f180 100644
--- a/pandaharvester/harvestermisc/k8s_utils.py
+++ b/pandaharvester/harvestermisc/k8s_utils.py
@@ -183,3 +183,12 @@ def create_or_patch_secret(self, file_list, secret_name):
                 print('Exception when patch secret: {0} . Try to create secret instead...'.format(e))
                 rsp = self.corev1.create_namespaced_secret(body=body, namespace=self.namespace)
         return rsp
+
+    def get_pod_logs(self, pod_name, previous=False):
+        try:
+            rsp = self.corev1.read_namespaced_pod_log(name=pod_name, namespace=self.namespace, previous=previous)
+        except ApiException as e:
+            print('Exception when getting logs from pod {0} : {1} . Skipped'.format(pod_name, e))
+            raise
+        else:
+            return rsp

From a57629012360af23103e87ae754c685640824ea5 Mon Sep 17 00:00:00 2001
From: Doug Benjamin
Date: Sun, 23 Feb 2020 20:30:07 -0600
Subject: [PATCH 077/181] handle monitoring

---
 .../harvestermonitor/lsf_monitor.py | 49 +++++++++++++------
 1 file changed, 35 insertions(+), 14 deletions(-)

diff --git a/pandaharvester/harvestermonitor/lsf_monitor.py b/pandaharvester/harvestermonitor/lsf_monitor.py
index a5f66918..ea226f60 100644
--- a/pandaharvester/harvestermonitor/lsf_monitor.py
+++ b/pandaharvester/harvestermonitor/lsf_monitor.py
@@ -1,4 +1,7 @@
 import re
+from shlex import quote
+from shlex import split
+
 try:
     import subprocess32 as subprocess
 except:
@@ -26,10 +29,10 @@ def check_workers(self, workspec_list):
             tmpLog = self.make_logger(baseLogger, 'workerID={0}'.format(workSpec.workerID),
                                       method_name='check_workers')
             # command
-            comStr = "bjobs -noheader -o 'jobid:10 stat:10' {0}".format(workSpec.batchID)
+            comStr = 'bjobs -a -noheader -o {0} {1} '.format(quote("jobid:10 stat:10"),workSpec.batchID)
+            comStr_split = split(comStr)
             # check
-            tmpLog.debug('check with {0}'.format(comStr))
-            p = subprocess.Popen(comStr.split(),
+            p = subprocess.Popen(comStr_split,
                                  shell=False,
                                  stdout=subprocess.PIPE,
                                  stderr=subprocess.PIPE)
@@ -37,24 +40,42 @@ def check_workers(self, workspec_list):
             # check return code
             stdOut, stdErr = p.communicate()
             retCode = p.returncode
+            tmpLog.debug('len(stdOut) = {0} stdOut={1}'.format(len(str(stdOut)),stdOut))
+            tmpLog.debug('len(stdErr) = {0} stdErr={1}'.format(len(str(stdErr)),stdErr))
             tmpLog.debug('retCode={0}'.format(retCode))
             errStr = ''
             if retCode == 0:
+                # check if any came back on stdOut otherwise check stdErr
+                tempresponse = ""
+                if len(str(stdOut)) >= len(str(stdErr)):
+                    tempresponse = str(stdOut)
+                else:
+                    tempresponse = str(stdErr)
+                #tmpLog.debug('tempresponse = {0}'.format(tempresponse))
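+                # the longer of the two streams is taken because bjobs can
+                # report 'Job <id> is not found' on stderr with retCode 0
+                # (an assumption; see the 'is not found' handling below)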
                 # parse
-                for tmpLine in stdOut.split('\n'):
-                    tmpMatch = re.search('{0} '.format(workSpec.batchID), tmpLine)
+                for tmpLine in tempresponse.split('\n'):
+                    tmpMatch = re.search('{0}'.format(workSpec.batchID), tmpLine)
+                    tmpLog.debug('tmpLine = {0} tmpMatch = {1}'.format(tmpLine,tmpMatch))
                     if tmpMatch is not None:
                         errStr = tmpLine
+                        # search for phrase is not found
+                        tmpMatch = re.search('is not found', tmpLine)
+                        if tmpMatch is not None:
+                            batchStatus = 'Job {0} is not found'.format(workSpec.batchID)
                             newStatus = WorkSpec.ST_failed
-                        tmpLog.debug('batchStatus {0} -> workerStatus {1}'.format(batchStatus,
+                            tmpLog.debug('batchStatus {0} -> workerStatus {1}'.format(batchStatus,
+                                                                                      newStatus))
+                        else:
+                            batchStatus = tmpLine.split()[-2]
+                            if batchStatus in ['RUN']:
+                                newStatus = WorkSpec.ST_running
+                            elif batchStatus in ['DONE']:
+                                newStatus = WorkSpec.ST_finished
+                            elif batchStatus in ['PEND', 'PROV','WAIT']:
+                                newStatus = WorkSpec.ST_submitted
+                            else:
+                                newStatus = WorkSpec.ST_failed
+                            tmpLog.debug('batchStatus {0} -> workerStatus {1}'.format(batchStatus,
                                                                                       newStatus))
                         break
             retList.append((newStatus, errStr))

From 6e80cc15f152c63bf6e1d8bbcc2979c019ecb4c5 Mon Sep 17 00:00:00 2001
From: Doug Benjamin
Date: Sun, 23 Feb 2020 20:30:49 -0600
Subject: [PATCH 078/181] handle gzip tar files from requests header

---
 .../harvesterpreparator/analysis_aux_preparator.py | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/pandaharvester/harvesterpreparator/analysis_aux_preparator.py b/pandaharvester/harvesterpreparator/analysis_aux_preparator.py
index d791fefc..26f0f6be 100644
--- a/pandaharvester/harvesterpreparator/analysis_aux_preparator.py
+++ b/pandaharvester/harvesterpreparator/analysis_aux_preparator.py
@@ -50,8 +50,14 @@ def trigger_preparation(self, jobspec):
                     tmpLog.debug('getting via http from {0} to {1}'.format(url, accPath))
                     res = requests.get(url, timeout=180, verify=False)
                     if res.status_code == 200:
-                        with open(accPath, 'w') as f:
-                            f.write(res.content)
+                        tmpLog.debug('res.headers = {0}'.format(res.headers))
+                        # Should we check the returned size if too small through error?
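+                        # res.content is bytes, so a gzipped tarball has to be
+                        # written in binary mode ('wb'); writing it as text
+                        # would corrupt the payload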
+ if res.headers['content-type'] == 'application/gzip' : + with open(accPath, 'wb') as f: + f.write(res.content) + else: + with open(accPath, 'w') as f: + f.write(res.content) return_code = 0 else: errMsg = 'failed to get {0} with StatusCode={1} {2}'.format(url, res.status_code, res.text) From 7d5aa29467c6875bb727194d51fb1c098ef69ba5 Mon Sep 17 00:00:00 2001 From: Doug Benjamin Date: Mon, 24 Feb 2020 06:13:25 -0600 Subject: [PATCH 079/181] better error handling for 0 length files --- .../analysis_aux_preparator.py | 62 +++++++++++++------ 1 file changed, 42 insertions(+), 20 deletions(-) diff --git a/pandaharvester/harvesterpreparator/analysis_aux_preparator.py b/pandaharvester/harvesterpreparator/analysis_aux_preparator.py index 26f0f6be..040e2631 100644 --- a/pandaharvester/harvesterpreparator/analysis_aux_preparator.py +++ b/pandaharvester/harvesterpreparator/analysis_aux_preparator.py @@ -37,8 +37,15 @@ def trigger_preparation(self, jobspec): # local access path url = tmpFileSpec.url accPath = self.make_local_access_path(tmpFileSpec.scope, tmpFileSpec.lfn) + tmpLog.debug('url : {0} accPath : {1}'.format(url,accPath)) # check if already exits if os.path.exists(accPath): + # test its size if 0 size remove file + tmpLog.debug('accPath - {0} file size - {1}'.format(accPath,os.path.getsize(accPath))) + if os.path.getsize(accPath) == 0 : + tmpLog.debug('Remove file - {0}'.format(accPath)) + os.remove(accPath) + else: continue # make directories if needed if not os.path.isdir(os.path.dirname(accPath)): @@ -52,35 +59,50 @@ def trigger_preparation(self, jobspec): if res.status_code == 200: tmpLog.debug('res.headers = {0}'.format(res.headers)) # Should we check the returned size if too small through error? - if res.headers['content-type'] == 'application/gzip' : - with open(accPath, 'wb') as f: - f.write(res.content) + with open(accPath, 'wb') as f: + f.write(res.content) + # if accPath file exists and zero size remove + if os.path.exists(accPath) and os.path.getsize(accPath) == 0 : + tmpLog.debug('Remove file - {0} size - {1}'.format(accPath,os.path.getsize(accPath))) + os.remove(accPath) else: - with open(accPath, 'w') as f: - f.write(res.content) - return_code = 0 + tmpLog.debug('Successfully fetched file - {0}'.format(accPath)) + return_code = 0 else: errMsg = 'failed to get {0} with StatusCode={1} {2}'.format(url, res.status_code, res.text) tmpLog.error(errMsg) except requests.exceptions.ReadTimeout: tmpLog.error('read timeout when getting data from {0}'.format(url)) except Exception: + # if accPath file exists and zero size remove + if os.path.exists(accPath) and os.path.getsize(accPath) == 0 : + tmpLog.debug('Remove file - {0} size - {1}'.format(accPath,os.path.getsize(accPath))) + os.remove(accPath) core_utils.dump_error_message(tmpLog) elif url.startswith('docker'): - args = ['docker', 'save', '-o', accPath, url.split('://')[-1]] - try: - tmpLog.debug('executing ' + ' '.join(args)) - p = subprocess.Popen(args, stdout=subprocess.PIPE, stderr=subprocess.PIPE) - stdout, stderr = p.communicate() - return_code = p.returncode - if stdout is not None: - stdout = stdout.replace('\n', ' ') - if stderr is not None: - stderr = stderr.replace('\n', ' ') - tmpLog.debug("stdout: %s" % stdout) - tmpLog.debug("stderr: %s" % stderr) - except Exception: - core_utils.dump_error_message(tmpLog) + # test if docker command exists + # else test if singularity command exists + args = None + if shutil.which('docker') is not None: + args = ['docker', 'save', '-o', accPath, url.split('://')[-1]] + elif 
shutil.which('singularity') is not None:
+                    # extract image name
+                    args = ['singularity', 'build', '--sandbox', accPath, url ]
+                # test if we are at Summit - container needs to be built elsewhere
+                if args is not None:
+                    try:
+                        tmpLog.debug('executing ' + ' '.join(args))
+                        p = subprocess.Popen(args, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+                        stdout, stderr = p.communicate()
+                        return_code = p.returncode
+                        if stdout is not None:
+                            stdout = stdout.replace('\n', ' ')
+                        if stderr is not None:
+                            stderr = stderr.replace('\n', ' ')
+                        tmpLog.debug("stdout: {0}".format(stdout))
+                        tmpLog.debug("stderr: {0}".format(stderr))
+                    except Exception:
+                        core_utils.dump_error_message(tmpLog)
                 elif url.startswith('/'):
                     try:
                         shutil.copyfile(url, accPath)
                         return_code = 0
                     except Exception:
                         core_utils.dump_error_message(tmpLog)

From 6b7e7f78f1152b14c4028aacce09377a92b0d176 Mon Sep 17 00:00:00 2001
From: fahui
Date: Mon, 24 Feb 2020 20:26:58 +0800
Subject: [PATCH 080/181] Clear error message

---
 pandaharvester/commit_timestamp.py        | 2 +-
 pandaharvester/harvestermisc/k8s_utils.py | 2 ++
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/pandaharvester/commit_timestamp.py b/pandaharvester/commit_timestamp.py
index b0464fc9..bdae7116 100644
--- a/pandaharvester/commit_timestamp.py
+++ b/pandaharvester/commit_timestamp.py
@@ -1 +1 @@
-timestamp = "20-02-2020 13:52:03 on contrib_cern (by fahui)"
+timestamp = "24-02-2020 12:26:58 on contrib_cern (by fahui)"
diff --git a/pandaharvester/harvestermisc/k8s_utils.py b/pandaharvester/harvestermisc/k8s_utils.py
index 33f6f180..09357e2a 100644
--- a/pandaharvester/harvestermisc/k8s_utils.py
+++ b/pandaharvester/harvestermisc/k8s_utils.py
@@ -17,6 +17,8 @@ class k8s_Client(object):
 
     def __init__(self, namespace, config_file=None):
+        if not os.path.isfile(config_file):
+            raise RuntimeError('Cannot find k8s config file: {0}'.format(config_file))
         config.load_kube_config(config_file=config_file)
         self.namespace = namespace if namespace else 'default'
         self.corev1 = client.CoreV1Api()

From f84db66d20e4ac86b3a7b99f4c7d442fc1799149 Mon Sep 17 00:00:00 2001
From: tmaeno
Date: Mon, 24 Feb 2020 16:29:38 +0100
Subject: [PATCH 081/181] changed analysis_aux_preparator following suggestions

---
 pandaharvester/commit_timestamp.py |  2 +-
 .../analysis_aux_preparator.py     | 92 +++++++++----------
 2 files changed, 46 insertions(+), 48 deletions(-)

diff --git a/pandaharvester/commit_timestamp.py b/pandaharvester/commit_timestamp.py
index 75fa0930..9b9d99a3 100644
--- a/pandaharvester/commit_timestamp.py
+++ b/pandaharvester/commit_timestamp.py
@@ -1 +1 @@
-timestamp = "19-02-2020 13:52:40 on master (by tmaeno)"
+timestamp = "24-02-2020 15:29:39 on pull/37 (by tmaeno)"
diff --git a/pandaharvester/harvesterpreparator/analysis_aux_preparator.py b/pandaharvester/harvesterpreparator/analysis_aux_preparator.py
index 040e2631..f6d4cd0c 100644
--- a/pandaharvester/harvesterpreparator/analysis_aux_preparator.py
+++ b/pandaharvester/harvesterpreparator/analysis_aux_preparator.py
@@ -11,7 +11,6 @@
 from pandaharvester.harvestercore.plugin_base import PluginBase
 from pandaharvester.harvestercore import core_utils
 from pandaharvester.harvestermover import mover_utils
-from pandaharvester.harvesterconfig import harvester_config
 
 # logger
 baseLogger = core_utils.setup_logger('analysis_aux_preparator')
@@ -21,7 +20,7 @@ class AnalysisAuxPreparator(PluginBase):
 
     # constructor
     def __init__(self, **kwarg):
-        self.gulOpts = None
+        self.containerRuntime = None
         self.maxAttempts = 3
         PluginBase.__init__(self, **kwarg)
 
@@ -37,16 +36,11 @@ def trigger_preparation(self, jobspec):
             # local
access path url = tmpFileSpec.url accPath = self.make_local_access_path(tmpFileSpec.scope, tmpFileSpec.lfn) + accPathTmp = accPath + '.tmp' tmpLog.debug('url : {0} accPath : {1}'.format(url,accPath)) # check if already exits if os.path.exists(accPath): - # test its size if 0 size remove file - tmpLog.debug('accPath - {0} file size - {1}'.format(accPath,os.path.getsize(accPath))) - if os.path.getsize(accPath) == 0 : - tmpLog.debug('Remove file - {0}'.format(accPath)) - os.remove(accPath) - else: - continue + continue # make directories if needed if not os.path.isdir(os.path.dirname(accPath)): os.makedirs(os.path.dirname(accPath)) @@ -54,63 +48,67 @@ def trigger_preparation(self, jobspec): return_code = 1 if url.startswith('http'): try: - tmpLog.debug('getting via http from {0} to {1}'.format(url, accPath)) + tmpLog.debug('getting via http from {0} to {1}'.format(url, accPathTmp)) res = requests.get(url, timeout=180, verify=False) if res.status_code == 200: - tmpLog.debug('res.headers = {0}'.format(res.headers)) - # Should we check the returned size if too small through error? with open(accPath, 'wb') as f: f.write(res.content) - # if accPath file exists and zero size remove - if os.path.exists(accPath) and os.path.getsize(accPath) == 0 : - tmpLog.debug('Remove file - {0} size - {1}'.format(accPath,os.path.getsize(accPath))) - os.remove(accPath) - else: - tmpLog.debug('Successfully fetched file - {0}'.format(accPath)) - return_code = 0 + tmpLog.debug('Successfully fetched file - {0}'.format(accPathTmp)) + return_code = 0 else: errMsg = 'failed to get {0} with StatusCode={1} {2}'.format(url, res.status_code, res.text) tmpLog.error(errMsg) except requests.exceptions.ReadTimeout: tmpLog.error('read timeout when getting data from {0}'.format(url)) except Exception: - # if accPath file exists and zero size remove - if os.path.exists(accPath) and os.path.getsize(accPath) == 0 : - tmpLog.debug('Remove file - {0} size - {1}'.format(accPath,os.path.getsize(accPath))) - os.remove(accPath) core_utils.dump_error_message(tmpLog) elif url.startswith('docker'): - # test if docker command exists - # else test if singularity command exists - args = None - if shutil.which('docker') is not None: - args = ['docker', 'save', '-o', accPath, url.split('://')[-1]] - elif shutil.which('singularity') is not None: - # extract image name - args = ['singularity', 'build', '--sandbox', accPath, url ] - # test if we are at Summit - container needs to be built elsewhere - if args is not None: - try: - tmpLog.debug('executing ' + ' '.join(args)) - p = subprocess.Popen(args, stdout=subprocess.PIPE, stderr=subprocess.PIPE) - stdout, stderr = p.communicate() - return_code = p.returncode - if stdout is not None: - stdout = stdout.replace('\n', ' ') - if stderr is not None: - stderr = stderr.replace('\n', ' ') - tmpLog.debug("stdout: {0}".format(stdout)) - tmpLog.debug("stderr: [0}".format(stderr)) - except Exception: - core_utils.dump_error_message(tmpLog) + if self.containerRuntime is None: + tmpLog.debug('container downloading is disabled') + continue + if self.containerRuntime == 'docker': + args = ['docker', 'save', '-o', accPathTmp, url.split('://')[-1]] + elif self.containerRuntime == 'singularity': + args = ['singularity', 'build', '--sandbox', accPathTmp, url ] + else: + tmpLog.error('unsupported container runtime : {0}'.format(self.containerRuntime)) + # + try: + tmpLog.debug('executing ' + ' '.join(args)) + p = subprocess.Popen(args, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + stdout, stderr = p.communicate() + 
return_code = p.returncode
+                    if stdout is not None:
+                        stdout = stdout.replace('\n', ' ')
+                    if stderr is not None:
+                        stderr = stderr.replace('\n', ' ')
+                    tmpLog.debug("stdout: {0}".format(stdout))
+                    tmpLog.debug("stderr: {0}".format(stderr))
+                except Exception:
+                    core_utils.dump_error_message(tmpLog)
                 elif url.startswith('/'):
                     try:
-                        shutil.copyfile(url, accPath)
+                        shutil.copyfile(url, accPathTmp)
                         return_code = 0
                     except Exception:
                         core_utils.dump_error_message(tmpLog)
                 else:
                     tmpLog.error('unsupported protocol in {0}'.format(url))
+                # remove empty files
+                if os.path.exists(accPathTmp) and os.path.getsize(accPathTmp) == 0:
+                    return_code = 1
+                    tmpLog.debug('remove empty file - {0}'.format(accPathTmp))
+                    try:
+                        os.remove(accPathTmp)
+                    except Exception:
+                        core_utils.dump_error_message(tmpLog)
+                # rename
+                if return_code == 0:
+                    try:
+                        os.rename(accPathTmp, accPath)
+                    except Exception:
+                        return_code = 1
+                        core_utils.dump_error_message(tmpLog)
                 if return_code != 0:
                     allDone = False
         if allDone:

From ea5d257dab17b9a1a042d5fed2dc629de8ce5471 Mon Sep 17 00:00:00 2001
From: fahui
Date: Tue, 25 Feb 2020 01:23:21 +0800
Subject: [PATCH 082/181] fix k8s logs

---
 pandaharvester/commit_timestamp.py            |  2 +-
 .../harvestermessenger/k8s_messenger.py       | 45 ++++++++++++-------
 2 files changed, 29 insertions(+), 18 deletions(-)

diff --git a/pandaharvester/commit_timestamp.py b/pandaharvester/commit_timestamp.py
index bdae7116..c13fc729 100644
--- a/pandaharvester/commit_timestamp.py
+++ b/pandaharvester/commit_timestamp.py
@@ -1 +1 @@
-timestamp = "24-02-2020 12:26:58 on contrib_cern (by fahui)"
+timestamp = "24-02-2020 17:23:21 on contrib_cern (by fahui)"
diff --git a/pandaharvester/harvestermessenger/k8s_messenger.py b/pandaharvester/harvestermessenger/k8s_messenger.py
index 4895e942..d28702bb 100644
--- a/pandaharvester/harvestermessenger/k8s_messenger.py
+++ b/pandaharvester/harvestermessenger/k8s_messenger.py
@@ -1,4 +1,4 @@
-
+import os
 
 from pandaharvester.harvestercore import core_utils
 from .base_messenger import BaseMessenger
@@ -8,11 +8,15 @@
 # from pandaharvester.harvestercore.work_spec import WorkSpec
 
 
+# logger
+_logger = core_utils.setup_logger('k8s_messenger')
+
+
 # Messenger for generic Kubernetes clusters
 class K8sMessenger(BaseMessenger):
 
     def __init__(self, **kwargs):
-        BaseMessenger.__init__(self, **kwarg)
+        BaseMessenger.__init__(self, **kwargs)
         try:
             self.logDir
         except AttributeError:
             print('K8sMessenger: Missing attribute logDir')
@@ -27,18 +31,25 @@ def post_processing(self, workspec, jobspec_list, map_type):
         - Fetch logs of the pod from k8s
         - Store or upload logs
         """
+        # get logger
+        tmpLog = core_utils.make_logger(_logger, 'workerID={0}'.format(workspec.workerID),
+                                        method_name='post_processing')
+        tmpLog.debug('start')
+        try:
+            # fetch and store logs
+            job_id = workspec.batchID
+            pods_list = self.k8s_client.filter_pods_info(self._all_pods_list, job_name=job_id)
+            pod_name_list = [ pods_info['name'] for pods_info in
pods_list ] + outlog_filename = os.path.join(self.logDir, 'gridK8S.{0}.{1}.out'.format(workspec.workerID, workspec.batchID)) + with open(outlog_filename, 'w') as f: + for pod_name in pod_name_list: + current_log_str = self.k8s_client.get_pod_logs(pod_name) + f.write(current_log_str) + # upload logs + pass + # return + tmpLog.debug('done') + return True + except Exception: + core_utils.dump_error_message(tmpLog) + return None From ad39753ddc5e3b305287ec81251f04fc94d9a609 Mon Sep 17 00:00:00 2001 From: fahui Date: Tue, 25 Feb 2020 01:23:43 +0800 Subject: [PATCH 083/181] support force post-processing --- pandaharvester/commit_timestamp.py | 2 +- pandaharvester/harvesterbody/monitor.py | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/pandaharvester/commit_timestamp.py b/pandaharvester/commit_timestamp.py index c13fc729..3a1ff71f 100644 --- a/pandaharvester/commit_timestamp.py +++ b/pandaharvester/commit_timestamp.py @@ -1 +1 @@ -timestamp = "24-02-2020 17:23:21 on contrib_cern (by fahui)" +timestamp = "24-02-2020 17:23:43 on contrib_cern (by fahui)" diff --git a/pandaharvester/harvesterbody/monitor.py b/pandaharvester/harvesterbody/monitor.py index c1762f39..547a42f1 100644 --- a/pandaharvester/harvesterbody/monitor.py +++ b/pandaharvester/harvesterbody/monitor.py @@ -696,7 +696,8 @@ def check_workers(self, mon_core, messenger, all_workers, queue_config, tmp_log, else: newStatus = WorkSpec.ST_idle elif not workSpec.is_post_processed(): - if not queue_config.is_no_heartbeat_status(newStatus) and not queue_config.truePilot: + if (not queue_config.is_no_heartbeat_status(newStatus) and not queue_config.truePilot) \ + or (hasattr(messenger, 'forcePostProcessing') and messenger.forcePostProcessing): # post processing unless heartbeat is suppressed jobSpecs = self.dbProxy.get_jobs_with_worker_id(workSpec.workerID, None, True, From f4828ca79ac5c41a892db8c2e8201cdfdd4b604d Mon Sep 17 00:00:00 2001 From: fbarreir Date: Wed, 26 Feb 2020 13:30:34 +0100 Subject: [PATCH 084/181] k8s: set memory limits --- pandaharvester/harvestermisc/k8s_utils.py | 30 ++++++++++++++++------- 1 file changed, 21 insertions(+), 9 deletions(-) diff --git a/pandaharvester/harvestermisc/k8s_utils.py b/pandaharvester/harvestermisc/k8s_utils.py index 09357e2a..5f7d0375 100644 --- a/pandaharvester/harvestermisc/k8s_utils.py +++ b/pandaharvester/harvestermisc/k8s_utils.py @@ -48,17 +48,29 @@ def create_job_from_yaml(self, yaml_content, work_spec, cert, cert_in_secret=Tru container_env.setdefault('resources', {}) + # set the resources (CPU and memory) we need for the container # note that predefined values in the yaml template will NOT be overwritten + container_env.setdefault('resources', {}) if work_spec.nCore > 0: - container_env['resources'].setdefault('limits', {'cpu': str(work_spec.nCore)}) - container_env['resources'].setdefault('requests', {'cpu': str(work_spec.nCore * cpuadjustratio / 100.0)}) - - if work_spec.minRamCount > 4: - # K8S minimum memory limit = 4 MB - container_env['resources'].setdefault('limits', { - 'memory': str(work_spec.minRamCount) + 'M'}) - container_env['resources'].setdefault('requests', { - 'memory': str(work_spec.minRamCount*memoryadjustratio/100.0) + 'M'}) + + # CPU limits + container_env['resources'].setdefault('limits', {}) + if 'cpu' not in container_env['resources']['limits']: + container_env['resources']['limits']['cpu'] = str(work_spec.nCore) + # CPU requests + container_env['resources'].setdefault('requests', {}) + if 'cpu' not in container_env['resources']['requests']: + 
container_env['resources']['requests']['cpu'] = str(work_spec.nCore * cpuadjustratio / 100.0) + + if work_spec.minRamCount > 4: # K8S minimum memory limit = 4 MB + # memory limits + container_env['resources'].setdefault('limits', {}) + if 'memory' not in container_env['resources']['limits']: + container_env['resources']['limits']['memory'] = str(work_spec.minRamCount) + 'M' + # memory requests + container_env['resources'].setdefault('requests', {}) + if 'memory' not in container_env['resources']['requests']: + container_env['resources']['requests']['memory'] = str(work_spec.minRamCount * memoryadjustratio / 100.0) + 'M' container_env.setdefault('env', []) From f4971eec8931156b13b06b3e9a6426f9e7153b6b Mon Sep 17 00:00:00 2001 From: Doug Benjamin Date: Sat, 29 Feb 2020 06:13:41 -0600 Subject: [PATCH 085/181] add changes for creating singularity images on Summit --- .../analysis_aux_preparator.py | 71 +++++++++++++++---- 1 file changed, 56 insertions(+), 15 deletions(-) diff --git a/pandaharvester/harvesterpreparator/analysis_aux_preparator.py b/pandaharvester/harvesterpreparator/analysis_aux_preparator.py index f6d4cd0c..c49b38f3 100644 --- a/pandaharvester/harvesterpreparator/analysis_aux_preparator.py +++ b/pandaharvester/harvesterpreparator/analysis_aux_preparator.py @@ -1,4 +1,5 @@ import os +import stat import shutil try: import subprocess32 as subprocess @@ -51,7 +52,7 @@ def trigger_preparation(self, jobspec): tmpLog.debug('getting via http from {0} to {1}'.format(url, accPathTmp)) res = requests.get(url, timeout=180, verify=False) if res.status_code == 200: - with open(accPath, 'wb') as f: + with open(accPathTmp, 'wb') as f: f.write(res.content) tmpLog.debug('Successfully fetched file - {0}'.format(accPathTmp)) return_code = 0 @@ -68,25 +69,16 @@ def trigger_preparation(self, jobspec): continue if self.containerRuntime == 'docker': args = ['docker', 'save', '-o', accPathTmp, url.split('://')[-1]] + return_code = make_container(tmpLog,args) elif self.containerRuntime == 'singularity': args = ['singularity', 'build', '--sandbox', accPathTmp, url ] + return_code = make_container(tmpLog,args) + elif self.containerRuntime == 'Summit_singularity': + return_code = make_container_script(tmpLog, accPathTmp, url) else: tmpLog.error('unsupported container runtime : {0}'.format(self.containerRuntime)) # - try: - tmpLog.debug('executing ' + ' '.join(args)) - p = subprocess.Popen(args, stdout=subprocess.PIPE, stderr=subprocess.PIPE) - stdout, stderr = p.communicate() - return_code = p.returncode - if stdout is not None: - stdout = stdout.replace('\n', ' ') - if stderr is not None: - stderr = stderr.replace('\n', ' ') - tmpLog.debug("stdout: {0}".format(stdout)) - tmpLog.debug("stderr: [0}".format(stderr)) - except Exception: - core_utils.dump_error_message(tmpLog) - elif url.startswith('/'): + elif url.startswith('/'): try: shutil.copyfile(url, accPathTmp) return_code = 0 @@ -142,3 +134,52 @@ def resolve_input_paths(self, jobspec): # make local access path def make_local_access_path(self, scope, lfn): return mover_utils.construct_file_path(self.localBasePath, scope, lfn) + + # execute commands to make container in subprocess + def make_container(self, tmpLog, args): + return_code = 1 + try: + tmpLog.debug('executing ' + ' '.join(args)) + p = subprocess.Popen(args, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + stdout, stderr = p.communicate() + return_code = p.returncode + if stdout is not None: + stdout = stdout.replace('\n', ' ') + if stderr is not None: + stderr = stderr.replace('\n', ' ') + 
tmpLog.debug("stdout: {0}".format(stdout)) + tmpLog.debug("stderr: [0}".format(stderr)) + except Exception: + core_utils.dump_error_message(tmpLog) + return return_code + + # create file to be used to create container + def make_container_script(self, tmpLog, accPathTmp, url): + return_code = 1 + # extract container name from url + container_name = url.rsplit('/',1)[1] + # construct path to container + containerPath = "{basePath}/{name}".format(basePath=self.localContainerPath, name=container_name) + # check if container already exits + if os.path.exists(containerPath): + return_code = 0 + else: + try: + # create the directory + os.makedirs(containerPath) + # now create the command file for creating Singularity sandbox container + with open(accPathTmp, 'w') as f: + f.write("#!/bin/sh\n") + f.write("\n") + f.write("# this file creates the Singularity sandbox container {0}\n".format(containerPath)) + f.write("\n") + f.write("singularity build --sandbox {path} {url}\n".format(path=containerPath,url=url)) + f.write("\n") + # change permissions on script to executable + st = os.stat(accPathTmp) + os.chmod(accPathTmp, st.st_mode | stat.S_IXUSR | stat.S_IXGRP | stat.S_IXOTH ) + tmpLog.debug('Successfully fetched file - {0}'.format(accPathTmp)) + return_code = 0 + except Exception: + core_utils.dump_error_message(tmpLog) + return return_code From 5dc82aeba85cca8966f0a39ae1f84c4663ccadf2 Mon Sep 17 00:00:00 2001 From: Doug Benjamin Date: Sat, 29 Feb 2020 06:30:45 -0600 Subject: [PATCH 086/181] fix indent --- pandaharvester/harvesterpreparator/analysis_aux_preparator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandaharvester/harvesterpreparator/analysis_aux_preparator.py b/pandaharvester/harvesterpreparator/analysis_aux_preparator.py index c49b38f3..09f6bb9b 100644 --- a/pandaharvester/harvesterpreparator/analysis_aux_preparator.py +++ b/pandaharvester/harvesterpreparator/analysis_aux_preparator.py @@ -78,7 +78,7 @@ def trigger_preparation(self, jobspec): else: tmpLog.error('unsupported container runtime : {0}'.format(self.containerRuntime)) # - elif url.startswith('/'): + elif url.startswith('/'): try: shutil.copyfile(url, accPathTmp) return_code = 0 From 9c72a64a2238abadf0f34dc9e7566a346b9acf74 Mon Sep 17 00:00:00 2001 From: Doug Benjamin Date: Sat, 29 Feb 2020 07:19:30 -0600 Subject: [PATCH 087/181] fix function reference --- .../harvesterpreparator/analysis_aux_preparator.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/pandaharvester/harvesterpreparator/analysis_aux_preparator.py b/pandaharvester/harvesterpreparator/analysis_aux_preparator.py index 09f6bb9b..b0b66c44 100644 --- a/pandaharvester/harvesterpreparator/analysis_aux_preparator.py +++ b/pandaharvester/harvesterpreparator/analysis_aux_preparator.py @@ -31,6 +31,8 @@ def trigger_preparation(self, jobspec): tmpLog = self.make_logger(baseLogger, 'PandaID={0}'.format(jobspec.PandaID), method_name='trigger_preparation') tmpLog.debug('start') + tmpLog.debug("from queueconfig file - containerRuntime : {0}".format(self.containerRuntime)) + tmpLog.debug("from queueconfig file - localContainerPath: {0}".format(self.localContainerPath)) # loop over all inputs allDone = True for tmpFileSpec in jobspec.inFiles: @@ -69,12 +71,12 @@ def trigger_preparation(self, jobspec): continue if self.containerRuntime == 'docker': args = ['docker', 'save', '-o', accPathTmp, url.split('://')[-1]] - return_code = make_container(tmpLog,args) + return_code = self.make_container(tmpLog,args) elif 
self.containerRuntime == 'singularity': args = ['singularity', 'build', '--sandbox', accPathTmp, url ] - return_code = make_container(tmpLog,args) + return_code = self.make_container(tmpLog,args) elif self.containerRuntime == 'Summit_singularity': - return_code = make_container_script(tmpLog, accPathTmp, url) + return_code = self.make_container_script(tmpLog, accPathTmp, url) else: tmpLog.error('unsupported container runtime : {0}'.format(self.containerRuntime)) # @@ -160,6 +162,7 @@ def make_container_script(self, tmpLog, accPathTmp, url): container_name = url.rsplit('/',1)[1] # construct path to container containerPath = "{basePath}/{name}".format(basePath=self.localContainerPath, name=container_name) + tmpLog.debug("accPathTmp : {0} url : {1} containerPath : {2}".format(accPathTmp,url,containerPath)) # check if container already exits if os.path.exists(containerPath): return_code = 0 From 14bcfc5f133319093f88a2b6f91982dc5e3224d6 Mon Sep 17 00:00:00 2001 From: Doug Benjamin Date: Sat, 29 Feb 2020 21:21:44 -0600 Subject: [PATCH 088/181] changes to create containers automatically at Summit --- .../harvesterpreparator/analysis_aux_preparator.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/pandaharvester/harvesterpreparator/analysis_aux_preparator.py b/pandaharvester/harvesterpreparator/analysis_aux_preparator.py index b0b66c44..4f649556 100644 --- a/pandaharvester/harvesterpreparator/analysis_aux_preparator.py +++ b/pandaharvester/harvesterpreparator/analysis_aux_preparator.py @@ -168,16 +168,18 @@ def make_container_script(self, tmpLog, accPathTmp, url): return_code = 0 else: try: - # create the directory - os.makedirs(containerPath) + # make directories if needed + if not os.path.isdir(containerPath): + os.makedirs(containerPath) + if not os.path.isdir(os.path.dirname(accPathTmp)): + os.makedirs(os.path.dirname(accPathTmp)) # now create the command file for creating Singularity sandbox container with open(accPathTmp, 'w') as f: f.write("#!/bin/sh\n") - f.write("\n") f.write("# this file creates the Singularity sandbox container {0}\n".format(containerPath)) - f.write("\n") - f.write("singularity build --sandbox {path} {url}\n".format(path=containerPath,url=url)) - f.write("\n") + f.write("set -x \n") + f.write("singularity build --force --sandbox {path} {url}\n".format(path=containerPath,url=url)) + f.write("set +x \n") # change permissions on script to executable st = os.stat(accPathTmp) os.chmod(accPathTmp, st.st_mode | stat.S_IXUSR | stat.S_IXGRP | stat.S_IXOTH ) From ea5d257dab17b9a1a042d5fed2dc629de8ce5471 Mon Sep 17 00:00:00 2001 From: Doug Benjamin Date: Sat, 29 Feb 2020 21:21:53 -0600 Subject: [PATCH 089/181] changes to create containers automatically at Summit --- .../harvestersubmitter/lsf_submitter.py | 24 +++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/pandaharvester/harvestersubmitter/lsf_submitter.py b/pandaharvester/harvestersubmitter/lsf_submitter.py index 479d150c..f05ee092 100644 --- a/pandaharvester/harvestersubmitter/lsf_submitter.py +++ b/pandaharvester/harvestersubmitter/lsf_submitter.py @@ -8,6 +8,7 @@ from pandaharvester.harvestercore import core_utils from pandaharvester.harvestercore.plugin_base import PluginBase +from pandaharvester.harvestermover import mover_utils # logger baseLogger = core_utils.setup_logger('lsf_submitter') @@ -98,12 +99,35 @@ def make_batch_script(self, workspec): else: numnodes=workspec.nCore / self.nCorePerNode + # create the string of files to execute to create the containers for the 
jobs + containerscripts = "" + scope = "aux_input" + container_command_files = [] + # loop over the jobspec's associated with the workerspec + jobspec_list = workspec.get_jobspec_list() + for jobSpec in jobspec_list: + jobPars = jobSpec.jobParams['jobPars'] + trf = jobSpec.jobParams['transformation'] + container = None + tmpM = re.search(' --containerImage\s+([^\s]+)', jobPars) + if tmpM is not None: + container = tmpM.group(1) + container_name = container.rsplit('/',1)[1] + path = mover_utils.construct_file_path(self.localBasePath, scope, container_name) + container_command_files.append("/bin/sh {0}\n".format(path)) + + if len(container_command_files) > 0: + # remove duplicates + container_command_files = list( dict.fromkeys(container_command_files) ) + containerscripts = ' '.join(container_command_files) + tmpFile = tempfile.NamedTemporaryFile(mode='w', delete=False, suffix='_submit.sh', dir=workspec.get_access_point()) tmpFile.write(self.template.format(nCorePerNode=self.nCorePerNode, #localQueue=self.localQueue, #projectName=self.projectName, nNode=numnodes, accessPoint=workspec.accessPoint, + containerscripts=containerscripts, #walltime=maxWalltime, #yodaWallClockLimit=yodaWallClockLimit, workerID=workspec.workerID) From 666b93a090b2ae96d5fe5c3b99c5516e7d14ac19 Mon Sep 17 00:00:00 2001 From: fahui Date: Mon, 2 Mar 2020 22:33:40 +0800 Subject: [PATCH 090/181] k8s: derive worker status with container state, e.g. OOMKilled --- pandaharvester/commit_timestamp.py | 2 +- pandaharvester/harvestermisc/k8s_utils.py | 8 +++- .../harvestermonitor/k8s_monitor.py | 37 +++++++++++++++++-- 3 files changed, 40 insertions(+), 7 deletions(-) diff --git a/pandaharvester/commit_timestamp.py b/pandaharvester/commit_timestamp.py index 3a1ff71f..af9f1f1c 100644 --- a/pandaharvester/commit_timestamp.py +++ b/pandaharvester/commit_timestamp.py @@ -1 +1 @@ -timestamp = "24-02-2020 17:23:43 on contrib_cern (by fahui)" +timestamp = "02-03-2020 14:33:40 on contrib_cern (by fahui)" diff --git a/pandaharvester/harvestermisc/k8s_utils.py b/pandaharvester/harvestermisc/k8s_utils.py index 5f7d0375..06e70e3a 100644 --- a/pandaharvester/harvestermisc/k8s_utils.py +++ b/pandaharvester/harvestermisc/k8s_utils.py @@ -104,9 +104,13 @@ def get_pods_info(self): pod_info['name'] = i.metadata.name pod_info['start_time'] = i.status.start_time.replace(tzinfo=None) if i.status.start_time else i.status.start_time pod_info['status'] = i.status.phase - pod_info['status_reason'] = i.status.conditions[0].reason if i.status.conditions else None - pod_info['status_message'] = i.status.conditions[0].message if i.status.conditions else None + pod_info['status_conditions'] = i.status.conditions pod_info['job_name'] = i.metadata.labels['job-name'] if i.metadata.labels and 'job-name' in i.metadata.labels else None + pod_info['containers_state'] = [] + if i.status.container_statuses: + for cs in i.status.container_statuses: + if cs.state: + pod_info['containers_state'].append(cs.state) pods_list.append(pod_info) return pods_list diff --git a/pandaharvester/harvestermonitor/k8s_monitor.py b/pandaharvester/harvestermonitor/k8s_monitor.py index f79224c5..edb6f3d9 100644 --- a/pandaharvester/harvestermonitor/k8s_monitor.py +++ b/pandaharvester/harvestermonitor/k8s_monitor.py @@ -7,6 +7,7 @@ from pandaharvester.harvestercore import core_utils from pandaharvester.harvestercore.work_spec import WorkSpec +from pandaharvester.harvestercore.worker_errors import WorkerErrors from pandaharvester.harvestercore.plugin_base import PluginBase from 
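The --containerImage extraction the LSF submitter performs above can be exercised on its own; a short sketch, where the jobPars string and image name are made up for illustration:

    import re

    jobPars = 'runGen ... --containerImage docker://somerepo/someimage:1.0 --otherOpt x'
    tmpM = re.search(r' --containerImage\s+([^\s]+)', jobPars)
    if tmpM is not None:
        container = tmpM.group(1)                     # 'docker://somerepo/someimage:1.0'
        container_name = container.rsplit('/', 1)[1]  # 'someimage:1.0'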
pandaharvester.harvestermisc.k8s_utils import k8s_Client @@ -40,8 +41,9 @@ def __init__(self, **kwarg): self._all_pods_list = [] - def check_pods_status(self, pods_status_list): + def check_pods_status(self, pods_status_list, containers_state_list): newStatus = '' + subMesg = '' if 'Unknown' in pods_status_list: if all(item == 'Unknown' for item in pods_status_list): @@ -56,7 +58,24 @@ # elif all(item == 'Succeeded' for item in pods_status_list): # newStatus = WorkSpec.ST_finished elif 'Succeeded' in pods_status_list: - newStatus = WorkSpec.ST_finished + if all((item.terminated is not None and item.terminated.reason == 'Completed') for item in containers_state_list): + newStatus = WorkSpec.ST_finished + else: + sub_mesg_list = [] + for item in containers_state_list: + mesg_str = '' + if item.terminated is None: + state = 'UNKNOWN' + if item.running is not None: + state = 'running' + elif item.waiting is not None: + state = 'waiting' + mesg_str = 'container not terminated yet ({0}) while pod Succeeded'.format(state) + elif item.terminated.reason != 'Completed': + mesg_str = 'container terminated by k8s for reason {0}'.format(item.terminated.reason) + sub_mesg_list.append(mesg_str) + subMesg = ';'.join(sub_mesg_list) + newStatus = WorkSpec.ST_cancelled elif 'Running' in pods_status_list: newStatus = WorkSpec.ST_running elif 'Failed' in pods_status_list: @@ -64,7 +83,7 @@ else: newStatus = WorkSpec.ST_idle - return newStatus + return newStatus, subMesg def check_a_job(self, workspec): # set logger @@ -80,13 +99,17 @@ def check_a_job(self, workspec): pods_list = self.k8s_client.filter_pods_info(self._all_pods_list, job_name=job_id) timeNow = datetime.datetime.utcnow() pods_status_list = [] + containers_state_list = [] pods_name_to_delete_list = [] + pods_sup_diag_list = [] for pods_info in pods_list: if pods_info['status'] in ['Pending', 'Unknown'] and pods_info['start_time'] \ and timeNow - pods_info['start_time'] > datetime.timedelta(seconds=self.podQueueTimeLimit): # fetch queuing too long pods pods_name_to_delete_list.append(pods_info['name']) pods_status_list.append(pods_info['status']) + containers_state_list.extend(pods_info['containers_state']) + pods_sup_diag_list.append(pods_info['name']) except Exception as _e: errStr = 'Failed to get POD status of JOB id={0} ; {1}'.format(job_id, _e) tmpLog.error(errStr) @@ -99,7 +122,9 @@ def check_a_job(self, workspec): newStatus = WorkSpec.ST_cancelled else: tmpLog.debug('pods_status_list={0}'.format(pods_status_list)) - newStatus = self.check_pods_status(pods_status_list) + newStatus, subMesg = self.check_pods_status(pods_status_list, containers_state_list) + if subMesg: + errStr += subMesg tmpLog.debug('new_status={0}'.format(newStatus)) # delete queuing too long pods if pods_name_to_delete_list: @@ -111,6 +136,10 @@ def check_a_job(self, workspec): deleted_pods_list.append(item['name']) tmpLog.debug('Deleted pods queuing too long: {0}'.format( ','.join(deleted_pods_list))) + # supplemental diag messages + sup_error_code = WorkerErrors.error_codes.get('GENERAL_ERROR') if errStr else WorkerErrors.error_codes.get('SUCCEEDED') + sup_error_diag = 'PODs=' + ','.join(pods_sup_diag_list) + ' ; ' + errStr + workspec.set_supplemental_error(error_code=sup_error_code, error_diag=sup_error_diag) return (newStatus, errStr) From 7262e722e8eb867b3c160ae895c0a6f614ad6ef8 Mon Sep 17 00:00:00 2001 From: tmaeno Date: Mon, 2 Mar 2020 22:58:22 +0100 Subject: [PATCH
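The refined pod-to-worker mapping in check_pods_status() follows one decision rule: a 'Succeeded' pod only yields a finished worker when every container terminated with reason 'Completed'. A simplified, self-contained sketch (plain strings stand in for the WorkSpec constants, and bare reason strings for the kubernetes container-state objects):

    def derive_status(pods_status_list, terminated_reasons):
        if 'Succeeded' in pods_status_list:
            # e.g. an OOMKilled container downgrades the worker to cancelled
            if all(r == 'Completed' for r in terminated_reasons):
                return 'finished'
            return 'cancelled'
        if 'Running' in pods_status_list:
            return 'running'
        if 'Failed' in pods_status_list:
            return 'cancelled'
        return 'idle'

    assert derive_status(['Succeeded'], ['Completed']) == 'finished'
    assert derive_status(['Succeeded'], ['OOMKilled']) == 'cancelled'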
091/181] added externalCommand to anal_aux_prep --- pandaharvester/commit_timestamp.py | 2 +- .../analysis_aux_preparator.py | 102 +++++++++++++++++- 2 files changed, 99 insertions(+), 5 deletions(-) diff --git a/pandaharvester/commit_timestamp.py b/pandaharvester/commit_timestamp.py index af9f1f1c..4d750484 100644 --- a/pandaharvester/commit_timestamp.py +++ b/pandaharvester/commit_timestamp.py @@ -1 +1 @@ -timestamp = "02-03-2020 14:33:40 on contrib_cern (by fahui)" +timestamp = "02-03-2020 21:58:22 on master (by tmaeno)" diff --git a/pandaharvester/harvesterpreparator/analysis_aux_preparator.py b/pandaharvester/harvesterpreparator/analysis_aux_preparator.py index f6d4cd0c..fe449a0e 100644 --- a/pandaharvester/harvesterpreparator/analysis_aux_preparator.py +++ b/pandaharvester/harvesterpreparator/analysis_aux_preparator.py @@ -21,6 +21,7 @@ class AnalysisAuxPreparator(PluginBase): # constructor def __init__(self, **kwarg): self.containerRuntime = None + self.externalCommand = {} self.maxAttempts = 3 PluginBase.__init__(self, **kwarg) @@ -32,6 +33,7 @@ def trigger_preparation(self, jobspec): tmpLog.debug('start') # loop over all inputs allDone = True + bulkExtCommand = {} for tmpFileSpec in jobspec.inFiles: # local access path url = tmpFileSpec.url @@ -44,14 +46,28 @@ def trigger_preparation(self, jobspec): # make directories if needed if not os.path.isdir(os.path.dirname(accPath)): os.makedirs(os.path.dirname(accPath)) - # get + # check if use an external command + extCommand = None + for protocol in self.externalCommand: + if url.startswith(protocol): + extCommand = self.externalCommand[protocol] + # collect file info to execute the command later + bulkExtCommand.setdefault(protocol, {'command': extCommand, 'url': [], 'dst': [], 'lfn': []}) + bulkExtCommand[protocol]['url'].append(url) + bulkExtCommand[protocol]['dst'].append(accPath) + bulkExtCommand[protocol]['lfn'].append(tmpFileSpec.lfn) + break + # execute the command later + if extCommand is not None: + continue + # execute return_code = 1 if url.startswith('http'): try: tmpLog.debug('getting via http from {0} to {1}'.format(url, accPathTmp)) res = requests.get(url, timeout=180, verify=False) if res.status_code == 200: - with open(accPath, 'wb') as f: + with open(accPathTmp, 'wb') as f: f.write(res.content) tmpLog.debug('Successfully fetched file - {0}'.format(accPathTmp)) return_code = 0 @@ -72,7 +88,6 @@ def trigger_preparation(self, jobspec): args = ['singularity', 'build', '--sandbox', accPathTmp, url ] else: tmpLog.error('unsupported container runtime : {0}'.format(self.containerRuntime)) - # try: tmpLog.debug('executing ' + ' '.join(args)) p = subprocess.Popen(args, stdout=subprocess.PIPE, stderr=subprocess.PIPE) @@ -83,7 +98,7 @@ def trigger_preparation(self, jobspec): if stderr is not None: stderr = stderr.replace('\n', ' ') tmpLog.debug("stdout: {0}".format(stdout)) - tmpLog.debug("stderr: [0}".format(stderr)) + tmpLog.debug("stderr: {0}".format(stderr)) except Exception: core_utils.dump_error_message(tmpLog) elif url.startswith('/'): @@ -111,6 +126,45 @@ def trigger_preparation(self, jobspec): core_utils.dump_error_message(tmpLog) if return_code != 0: allDone = False + # execute external command + execIdMap = {} + for protocol in bulkExtCommand: + args = [] + for arg in bulkExtCommand[protocol]['command']['trigger']['args']: + if arg == '{src}': + arg = ','.join(bulkExtCommand[protocol]['url']) + elif arg == '{dst}': + arg = ','.join(bulkExtCommand[protocol]['dst']) + args.append(arg) + # execute + try: + 
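The externalCommand parameter driving this block is a plugin config dict keyed by URL protocol; the '{src}'/'{dst}' placeholders in its trigger arguments expand to comma-separated lists. A sketch of the expected shape (the protocol and tool names here are invented examples, not part of the patch):

    externalCommand = {
        'gsiftp': {
            'trigger': {'args': ['my-copy-tool', 'submit', '{src}', '{dst}']},
            'check': {'args': ['my-copy-tool', 'status', '{id}']},
        }
    }

    def fill_trigger_args(template_args, urls, dsts):
        # same substitution as the loop above
        args = []
        for arg in template_args:
            if arg == '{src}':
                arg = ','.join(urls)
            elif arg == '{dst}':
                arg = ','.join(dsts)
            args.append(arg)
        return args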
tmpLog.debug('executing external command: ' + ' '.join(args)) + p = subprocess.Popen(args, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + stdout, stderr = p.communicate() + return_code = p.returncode + if stdout is None: + stdout = '' + if stderr is None: + stderr = '' + # get ID of command execution such as transfer ID and batch job ID + executionID = None + if return_code == 0 and 'check' in bulkExtCommand[protocol]['command']: + executionID = [s for s in stdout.split('\n') if s][-1] + executionID = '{0}:{1}'.format(protocol, executionID) + execIdMap[executionID] = {'lfns': bulkExtCommand[protocol]['lfn'], 'groupStatus': 'active'} + stdout = stdout.replace('\n', ' ') + stderr = stderr.replace('\n', ' ') + tmpLog.debug("stdout: {0}".format(stdout)) + tmpLog.debug("stderr: {0}".format(stderr)) + if executionID is not None: + tmpLog.debug("execution ID: {0}".format(executionID)) + except Exception: + core_utils.dump_error_message(tmpLog) + allDone = False + # keep execution ID to check later + if execIdMap: + jobspec.set_groups_to_files(execIdMap) + # done if allDone: tmpLog.debug('succeeded') return True, '' @@ -127,6 +181,46 @@ def trigger_preparation(self, jobspec): # check status def check_stage_in_status(self, jobspec): + # make logger + tmpLog = self.make_logger(baseLogger, 'PandaID={0}'.format(jobspec.PandaID), + method_name='check_stage_in_status') + tmpLog.debug('start') + allDone = True + errMsg = '' + transferGroups = jobspec.get_groups_of_input_files(skip_ready=True) + for tmpGroupID in transferGroups: + if tmpGroupID is None: + continue + protocol, executionID = tmpGroupID.split(':') + args = [] + for arg in self.externalCommand[protocol]['check']['args']: + if arg == '{id}': + arg = executionID + args.append(arg) + # execute + try: + tmpLog.debug('executing external command: ' + ' '.join(args)) + p = subprocess.Popen(args, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + stdout, stderr = p.communicate() + return_code = p.returncode + if stdout is None: + stdout = '' + if stderr is None: + stderr = '' + stdout = stdout.replace('\n', ' ') + stderr = stderr.replace('\n', ' ') + tmpLog.debug("stdout: {0}".format(stdout)) + tmpLog.debug("stderr: {0}".format(stderr)) + if return_code != 0: + errMsg = '{0} is not ready'.format(tmpGroupID) + allDone = False + break + except Exception: + errMsg = core_utils.dump_error_message(tmpLog) + allDone = False + break + if not allDone: + return None, errMsg return True, '' # resolve input file paths From 2fb7041e1d6a30551fd543cd9a56adceba9337de Mon Sep 17 00:00:00 2001 From: David Cameron Date: Tue, 3 Mar 2020 10:40:32 +0100 Subject: [PATCH 092/181] adjust urllib to python3 --- pandaharvester/harvestersubmitter/act_submitter.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandaharvester/harvestersubmitter/act_submitter.py b/pandaharvester/harvestersubmitter/act_submitter.py index c0b02298..981997a0 100644 --- a/pandaharvester/harvestersubmitter/act_submitter.py +++ b/pandaharvester/harvestersubmitter/act_submitter.py @@ -2,7 +2,7 @@ import json import socket import time -import urllib +import urllib.parse from pandaharvester.harvestercore import core_utils from pandaharvester.harvestercore.plugin_base import PluginBase @@ -92,7 +92,7 @@ def submit_workers(self, workspec_list): if jobSpec: # push mode: aCT takes the url-encoded job description (like it gets from panda server) pandaid = jobSpec.PandaID - actjobdesc = urllib.urlencode(jobSpec.jobParams) + actjobdesc = urllib.parse.urlencode(jobSpec.jobParams) else: # 
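urlencode moved to urllib.parse in Python 3, which is all the act_submitter patch above adjusts. If the module ever had to run on both interpreters, a guarded import would be the usual alternative (a sketch; the parameter values are made up):

    try:
        from urllib.parse import urlencode   # Python 3
    except ImportError:
        from urllib import urlencode         # Python 2

    actjobdesc = urlencode({'PandaID': 1234567, 'prodSourceLabel': 'user'})
    # -> 'PandaID=1234567&prodSourceLabel=user'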
pull mode: just set pandaid (to workerid) and prodsourcelabel pandaid = workSpec.workerID From 65710d76b841fb286be1d06ca9f29bc62ae731ae Mon Sep 17 00:00:00 2001 From: David Cameron Date: Tue, 3 Mar 2020 12:32:38 +0100 Subject: [PATCH 093/181] use subprocess.run --- .../arcproxy_cred_manager.py | 31 +++++++++---------- 1 file changed, 15 insertions(+), 16 deletions(-) diff --git a/pandaharvester/harvestercredmanager/arcproxy_cred_manager.py b/pandaharvester/harvestercredmanager/arcproxy_cred_manager.py index e93c9dda..99467926 100644 --- a/pandaharvester/harvestercredmanager/arcproxy_cred_manager.py +++ b/pandaharvester/harvestercredmanager/arcproxy_cred_manager.py @@ -1,8 +1,5 @@ import re -try: - import subprocess32 as subprocess -except: - import subprocess +import subprocess from pandaharvester.harvestercore.plugin_base import PluginBase from pandaharvester.harvestercore import core_utils @@ -25,21 +22,22 @@ def check_credential(self): comStr = "arcproxy -i vomsACvalidityLeft -P {0}".format(self.outCertFile) mainLog.debug(comStr) try: - p = subprocess.Popen(comStr.split(), - shell=False, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE) - stdOut, stdErr = p.communicate() + p = subprocess.run(comStr.split(), + encoding='utf-8', + stdout=subprocess.PIPE, + stderr=subprocess.PIPE) + stdOut = p.stdout.strip() + stdErr = p.stderr retCode = p.returncode except: core_utils.dump_error_message(mainLog) return False mainLog.debug('retCode={0} stdOut={1} stdErr={2}'.format(retCode, stdOut, stdErr)) - if retCode != 0 or not re.match(r'\d+', stdOut.strip()): + if retCode != 0 or not re.match(r'\d+', stdOut): mainLog.error('Unexpected output from arcproxy: {0}'.format(stdOut)) return False # return whether lifetime is greater than three days - return int(stdOut.strip()) > 3600 * 72 + return int(stdOut) > 3600 * 72 # renew proxy def renew_credential(self): @@ -50,11 +48,12 @@ def renew_credential(self): self.inCertFile) mainLog.debug(comStr) try: - p = subprocess.Popen(comStr.split(), - shell=False, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE) - stdOut, stdErr = p.communicate() + p = subprocess.run(comStr.split(), + encoding='utf-8', + stdout=subprocess.PIPE, + stderr=subprocess.PIPE) + stdOut = p.stdout + stdErr = p.stderr retCode = p.returncode mainLog.debug('retCode={0} stdOut={1} stdErr={2}'.format(retCode, stdOut, stdErr)) except: From 88667908da4ed5828374df7a31c372da887678d0 Mon Sep 17 00:00:00 2001 From: Doug Benjamin Date: Tue, 3 Mar 2020 10:29:49 -0600 Subject: [PATCH 094/181] add changes to submit batch file to create image on Summit --- .../analysis_aux_preparator.py | 110 +++++++++++++++--- 1 file changed, 97 insertions(+), 13 deletions(-) diff --git a/pandaharvester/harvesterpreparator/analysis_aux_preparator.py b/pandaharvester/harvesterpreparator/analysis_aux_preparator.py index fe449a0e..d377e3d7 100644 --- a/pandaharvester/harvesterpreparator/analysis_aux_preparator.py +++ b/pandaharvester/harvesterpreparator/analysis_aux_preparator.py @@ -1,4 +1,5 @@ import os +import stat import shutil try: import subprocess32 as subprocess @@ -84,23 +85,15 @@ def trigger_preparation(self, jobspec): continue if self.containerRuntime == 'docker': args = ['docker', 'save', '-o', accPathTmp, url.split('://')[-1]] + return_code = self.make_image(jobspec,args) elif self.containerRuntime == 'singularity': args = ['singularity', 'build', '--sandbox', accPathTmp, url ] + return_code = self.make_image(jobspec,args) + elif self.containerRuntime == 'Summit_singularity': + retCode = 
self.make_image_Summit(jobspec, accPath, url) + tmpLog.debug('self.make_image_Summit(tmpLog, accPath, url) return value : {0}'.format(retCode)) else: tmpLog.error('unsupported container runtime : {0}'.format(self.containerRuntime)) - try: - tmpLog.debug('executing ' + ' '.join(args)) - p = subprocess.Popen(args, stdout=subprocess.PIPE, stderr=subprocess.PIPE) - stdout, stderr = p.communicate() - return_code = p.returncode - if stdout is not None: - stdout = stdout.replace('\n', ' ') - if stderr is not None: - stderr = stderr.replace('\n', ' ') - tmpLog.debug("stdout: {0}".format(stdout)) - tmpLog.debug("stderr: {0}".format(stderr)) - except Exception: - core_utils.dump_error_message(tmpLog) elif url.startswith('/'): try: shutil.copyfile(url, accPathTmp) @@ -236,3 +229,94 @@ def resolve_input_paths(self, jobspec): # make local access path def make_local_access_path(self, scope, lfn): return mover_utils.construct_file_path(self.localBasePath, scope, lfn) + + # run the command to create the image + def make_image(self, jobspec, args): + # make logger + tmpLog = self.make_logger(baseLogger, 'PandaID={0}'.format(jobspec.PandaID), + method_name='make_image') + tmpLog.debug('start') + return_code = 1 + try: + tmpLog.debug('executing ' + ' '.join(args)) + p = subprocess.Popen(args, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + stdout, stderr = p.communicate() + return_code = p.returncode + if stdout is not None: + stdout = stdout.replace('\n', ' ') + if stderr is not None: + stderr = stderr.replace('\n', ' ') + tmpLog.debug("stdout: {0}".format(stdout)) + tmpLog.debug("stderr: {0}".format(stderr)) + except Exception: + core_utils.dump_error_message(tmpLog) + tmpLog.debug('end with return code {0}'.format(return_code)) + return return_code + + # create and submit the job make the image runs on the Summit Launch nodes + def make_image_Summit(self, jobspec, accPath, url): + # make logger + tmpLog = self.make_logger(baseLogger, 'PandaID={0}'.format(jobspec.PandaID), + method_name='make_image') + tmpLog.debug('start') + return_code = 1 + tmpLog.debug('make_container_Summit container: {0} url : {1}'.format(accPath,url)) + # extract container name from url + container_name = url.rsplit('/',1)[1] + # check if batch script for creating container exists + batchscriptname = 'build_singularity_image_{0}.bsub'.format(container_name) + batchscriptPath = os.join.path(self.localContainerPath,batchscriptname) + if not os.path.exists(batchscriptPath): + try: + # Open template for batch script + tmpFile = open(self.containertemplateFile) + self.template = tmpFile.read() + tmpFile.close() + # fill in the new values to the template and write the template to the batchscript file + # now create the command file for creating Singularity sandbox container + with open(batchscriptPath, 'w') as f: + f.write(self.template.format(container = accPath, + container_name = container_name, + source_url = url)) + # change permissions on script to executable + st = os.stat(batchscriptPath) + os.chmod(batchscriptPath, st.st_mode | stat.S_IXUSR | stat.S_IXGRP | stat.S_IXOTH ) + tmpLog.debug('Successfully created batch file to create singularity container - {0}'.format(batchscriptPath)) + except Exception: + core_utils.dump_error_message(tmpLog) + + # check if container exists + if os.path.exists(accPath): + return_code = 0 + else: + try: + # make directories if needed + if not os.path.isdir(accPath): + os.makedirs(accPath) + # submit batch script to create container + comStr = "bsub -L /bin/sh" + # submit + tmpLog.debug('submit
with {0} and LSF options file {1}'.format(comStr,batchscriptPath)) + p = subprocess.Popen(comStr.split(), + shell=False, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + stdin=open(batchscriptPath,'r')) + # check return code + stdOut, stdErr = p.communicate() + return_code = p.returncode + tmpLog.debug('retCode={0}'.format(return_code)) + tmpLog.debug('stdOut={0}'.format(stdOut)) + if return_code == 0: + # extract batchID + batchID = str(stdOut.split()[1],'utf-8') + result = re.sub('[^0-9]','', batchID) + tmpLog.debug('strip out non-numberic charactors from {0} - result {1}'.format(batchID,result)) + else: + # failed + errStr = stdOut + ' ' + stdErr + tmpLog.error(errStr) + except Exception: + core_utils.dump_error_message(tmpLog) + tmpLog.debug('end with return code {0}'.format(return_code)) + return return_code From dffb400e4df997ccd05bb1f00bd75bc75275be73 Mon Sep 17 00:00:00 2001 From: Doug Benjamin Date: Tue, 3 Mar 2020 10:39:58 -0600 Subject: [PATCH 095/181] remove code for creating image --- .../harvestersubmitter/lsf_submitter.py | 24 ------------------- 1 file changed, 24 deletions(-) diff --git a/pandaharvester/harvestersubmitter/lsf_submitter.py b/pandaharvester/harvestersubmitter/lsf_submitter.py index f05ee092..479d150c 100644 --- a/pandaharvester/harvestersubmitter/lsf_submitter.py +++ b/pandaharvester/harvestersubmitter/lsf_submitter.py @@ -8,7 +8,6 @@ from pandaharvester.harvestercore import core_utils from pandaharvester.harvestercore.plugin_base import PluginBase -from pandaharvester.harvestermover import mover_utils # logger baseLogger = core_utils.setup_logger('lsf_submitter') @@ -99,35 +98,12 @@ def make_batch_script(self, workspec): else: numnodes=workspec.nCore / self.nCorePerNode - # create the string of files to execute to create the containers for the jobs - containerscripts = "" - scope = "aux_input" - container_command_files = [] - # loop over the jobspec's associated with the workerspec - jobspec_list = workspec.get_jobspec_list() - for jobSpec in jobspec_list: - jobPars = jobSpec.jobParams['jobPars'] - trf = jobSpec.jobParams['transformation'] - container = None - tmpM = re.search(' --containerImage\s+([^\s]+)', jobPars) - if tmpM is not None: - container = tmpM.group(1) - container_name = container.rsplit('/',1)[1] - path = mover_utils.construct_file_path(self.localBasePath, scope, container_name) - container_command_files.append("/bin/sh {0}\n".format(path)) - - if len(container_command_files) > 0: - # remove duplicates - container_command_files = list( dict.fromkeys(container_command_files) ) - containerscripts = ' '.join(container_command_files) - tmpFile = tempfile.NamedTemporaryFile(mode='w', delete=False, suffix='_submit.sh', dir=workspec.get_access_point()) tmpFile.write(self.template.format(nCorePerNode=self.nCorePerNode, #localQueue=self.localQueue, #projectName=self.projectName, nNode=numnodes, accessPoint=workspec.accessPoint, - containerscripts=containerscripts, #walltime=maxWalltime, #yodaWallClockLimit=yodaWallClockLimit, workerID=workspec.workerID) From ce9143ae36b1b5d378c02a27995db0ca19446e75 Mon Sep 17 00:00:00 2001 From: Doug Benjamin Date: Tue, 3 Mar 2020 12:39:01 -0600 Subject: [PATCH 096/181] better protection for multiple image creation submissions --- .../analysis_aux_preparator.py | 27 ++++++++----------- 1 file changed, 11 insertions(+), 16 deletions(-) diff --git a/pandaharvester/harvesterpreparator/analysis_aux_preparator.py b/pandaharvester/harvesterpreparator/analysis_aux_preparator.py index d377e3d7..aa807ae5 100644 --- 
a/pandaharvester/harvesterpreparator/analysis_aux_preparator.py +++ b/pandaharvester/harvesterpreparator/analysis_aux_preparator.py @@ -1,4 +1,5 @@ import os +import re import stat import shutil try: @@ -257,7 +258,7 @@ def make_image(self, jobspec, args): def make_image_Summit(self, jobspec, accPath, url): # make logger tmpLog = self.make_logger(baseLogger, 'PandaID={0}'.format(jobspec.PandaID), - method_name='make_image') + method_name='make_image_Summit') tmpLog.debug('start') return_code = 1 tmpLog.debug('make_container_Summit container: {0} url : {1}'.format(accPath,url)) @@ -265,8 +266,12 @@ def make_image_Summit(self, jobspec, accPath, url): container_name = url.rsplit('/',1)[1] # check if batch script for creating container exists batchscriptname = 'build_singularity_image_{0}.bsub'.format(container_name) - batchscriptPath = os.join.path(self.localContainerPath,batchscriptname) - if not os.path.exists(batchscriptPath): + batchscriptPath = os.path.join(self.localContainerPath,batchscriptname) + # use batch script file as a lock to avoid double submissions + if os.path.exists(batchscriptPath): + tmpLog.debug('batch script to create image exists- {0} Skipping submission'.format(batchscriptPath)) + else: + tmpLog.debug('Create batch script to create image - {0}'.format(batchscriptPath)) try: # Open template for batch script tmpFile = open(self.containertemplateFile) @@ -282,18 +287,6 @@ def make_image_Summit(self, jobspec, accPath, url): st = os.stat(batchscriptPath) os.chmod(batchscriptPath, st.st_mode | stat.S_IXUSR | stat.S_IXGRP | stat.S_IXOTH ) tmpLog.debug('Successfully created batch file to create singularity container - {0}'.format(batchscriptPath)) - except Exception: - core_utils.dump_error_message(tmpLog) - - # check if container exists - if os.path.exists(accPath): - return_code = 0 - else: - try: - # make directories if needed - if not os.path.isdir(accPath): - os.makedirs(accPath) - # submit batch script to create container comStr = "bsub -L /bin/sh" # submit tmpLog.debug('submit with {0} and LSF options file {1}'.format(comStr,batchscriptPath)) @@ -311,11 +304,13 @@ def make_image_Summit(self, jobspec, accPath, url): # extract batchID batchID = str(stdOut.split()[1],'utf-8') result = re.sub('[^0-9]','', batchID) - tmpLog.debug('strip out non-numberic charactors from {0} - result {1}'.format(batchID,result)) + tmpLog.debug('LSF job id : {0} - result {1}'.format(batchID,result)) else: # failed errStr = stdOut + ' ' + stdErr tmpLog.error(errStr) + if os.path.exists(batchscriptPath): + os.remove(batchscriptPath) except Exception: core_utils.dump_error_message(tmpLog) tmpLog.debug('end with return code {0}'.format(return_code)) From 959899df606f8ef219be76979fc540463feb003e Mon Sep 17 00:00:00 2001 From: Doug Benjamin Date: Wed, 4 Mar 2020 05:14:52 -0600 Subject: [PATCH 097/181] remove Summit specific pieces --- .../analysis_aux_preparator.py | 64 ------------------- 1 file changed, 64 deletions(-) diff --git a/pandaharvester/harvesterpreparator/analysis_aux_preparator.py b/pandaharvester/harvesterpreparator/analysis_aux_preparator.py index aa807ae5..916cd385 100644 --- a/pandaharvester/harvesterpreparator/analysis_aux_preparator.py +++ b/pandaharvester/harvesterpreparator/analysis_aux_preparator.py @@ -90,9 +90,6 @@ def trigger_preparation(self, jobspec): elif self.containerRuntime == 'singularity': args = ['singularity', 'build', '--sandbox', accPathTmp, url ] return_code = self.make_image(jobspec,args) - elif self.containerRuntime == 'Summit_singularity': - retCode = 
self.make_image_Summit(jobspec, accPath, url) - tmpLog.debug('self.make_image_Summit(tmpLog, accPath, url) return value : {0}'.format(retCode)) else: tmpLog.error('unsupported container runtime : {0}'.format(self.containerRuntime)) elif url.startswith('/'): @@ -254,64 +251,3 @@ def make_image(self, jobspec, args): tmpLog.debug('end with return code {0}'.format(return_code)) return return_code - # create and submit the job make the image runs on the Summit Launch nodes - def make_image_Summit(self, jobspec, accPath, url): - # make logger - tmpLog = self.make_logger(baseLogger, 'PandaID={0}'.format(jobspec.PandaID), - method_name='make_image_Summit') - tmpLog.debug('start') - return_code = 1 - tmpLog.debug('make_container_Summit container: {0} url : {1}'.format(accPath,url)) - # extract container name from url - container_name = url.rsplit('/',1)[1] - # check if batch script for creating container exists - batchscriptname = 'build_singularity_image_{0}.bsub'.format(container_name) - batchscriptPath = os.path.join(self.localContainerPath,batchscriptname) - # use batch script file as a lock to avoid double submissions - if os.path.exists(batchscriptPath): - tmpLog.debug('batch script to create image exists- {0} Skipping submission'.format(batchscriptPath)) - else: - tmpLog.debug('Create batch script to create image - {0}'.format(batchscriptPath)) - try: - # Open template for batch script - tmpFile = open(self.containertemplateFile) - self.template = tmpFile.read() - tmpFile.close() - # fill in the new values to the template and write the template to the batchscript file - # now create the command file for creating Singularity sandbox container - with open(batchscriptPath, 'w') as f: - f.write(self.template.format(container = accPath, - container_name = container_name, - source_url = url)) - # change permissions on script to executable - st = os.stat(batchscriptPath) - os.chmod(batchscriptPath, st.st_mode | stat.S_IXUSR | stat.S_IXGRP | stat.S_IXOTH ) - tmpLog.debug('Successfully created batch file to create singularity container - {0}'.format(batchscriptPath)) - comStr = "bsub -L /bin/sh" - # submit - tmpLog.debug('submit with {0} and LSF options file {1}'.format(comStr,batchscriptPath)) - p = subprocess.Popen(comStr.split(), - shell=False, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - stdin=open(batchscriptPath,'r')) - # check return code - stdOut, stdErr = p.communicate() - return_code = p.returncode - tmpLog.debug('retCode={0}'.format(return_code)) - tmpLog.debug('stdOut={0}'.format(stdOut)) - if return_code == 0: - # extract batchID - batchID = str(stdOut.split()[1],'utf-8') - result = re.sub('[^0-9]','', batchID) - tmpLog.debug('LSF job id : {0} - result {1}'.format(batchID,result)) - else: - # failed - errStr = stdOut + ' ' + stdErr - tmpLog.error(errStr) - if os.path.exists(batchscriptPath): - os.remove(batchscriptPath) - except Exception: - core_utils.dump_error_message(tmpLog) - tmpLog.debug('end with return code {0}'.format(return_code)) - return return_code From 5aa92e6b539246af271c8ba0c998e527966d04b8 Mon Sep 17 00:00:00 2001 From: tmaeno Date: Thu, 5 Mar 2020 14:09:50 +0100 Subject: [PATCH 098/181] to avoid bytes from Popen.communicate --- pandaharvester/commit_timestamp.py | 2 +- .../harvesterpreparator/analysis_aux_preparator.py | 8 +++----- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/pandaharvester/commit_timestamp.py b/pandaharvester/commit_timestamp.py index 4d750484..d7669d41 100644 --- a/pandaharvester/commit_timestamp.py +++ 
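The hunks below add universal_newlines=True so that communicate() hands back str instead of bytes under Python 3; subprocess.run() with encoding='utf-8', as used in patch 093 above, achieves the same thing. A minimal demonstration (the echoed text is arbitrary):

    import subprocess

    p = subprocess.Popen(['echo', 'line1\nline2'],
                         stdout=subprocess.PIPE, stderr=subprocess.PIPE,
                         universal_newlines=True)
    stdout, stderr = p.communicate()
    assert isinstance(stdout, str)       # would be bytes without universal_newlines
    print(stdout.replace('\n', ' '))     # str.replace works; bytes.replace('\n', ' ') would raise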
b/pandaharvester/commit_timestamp.py @@ -1 +1 @@ -timestamp = "02-03-2020 21:58:22 on master (by tmaeno)" +timestamp = "05-03-2020 13:09:50 on master (by tmaeno)" diff --git a/pandaharvester/harvesterpreparator/analysis_aux_preparator.py b/pandaharvester/harvesterpreparator/analysis_aux_preparator.py index 916cd385..2855865c 100644 --- a/pandaharvester/harvesterpreparator/analysis_aux_preparator.py +++ b/pandaharvester/harvesterpreparator/analysis_aux_preparator.py @@ -1,6 +1,4 @@ import os -import re -import stat import shutil try: import subprocess32 as subprocess @@ -130,7 +128,7 @@ def trigger_preparation(self, jobspec): # execute try: tmpLog.debug('executing external command: ' + ' '.join(args)) - p = subprocess.Popen(args, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + p = subprocess.Popen(args, stdout=subprocess.PIPE, stderr=subprocess.PIPE, universal_newlines=True) stdout, stderr = p.communicate() return_code = p.returncode if stdout is None: @@ -191,7 +189,7 @@ def check_stage_in_status(self, jobspec): # execute try: tmpLog.debug('executing external command: ' + ' '.join(args)) - p = subprocess.Popen(args, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + p = subprocess.Popen(args, stdout=subprocess.PIPE, stderr=subprocess.PIPE, universal_newlines=True) stdout, stderr = p.communicate() return_code = p.returncode if stdout is None: @@ -237,7 +235,7 @@ def make_image(self, jobspec, args): return_code = 1 try: tmpLog.debug('executing ' + ' '.join(args)) - p = subprocess.Popen(args, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + p = subprocess.Popen(args, stdout=subprocess.PIPE, stderr=subprocess.PIPE, universal_newlines=True) stdout, stderr = p.communicate() return_code = p.returncode if stdout is not None: From fa5d9723713bf1a469e0b81adc68a04ec7e54e21 Mon Sep 17 00:00:00 2001 From: tmaeno Date: Fri, 6 Mar 2020 15:48:09 +0100 Subject: [PATCH 099/181] check file status in job_fetcher and preparator based on input type --- pandaharvester/commit_timestamp.py | 2 +- pandaharvester/harvesterbody/job_fetcher.py | 24 ++-- pandaharvester/harvesterbody/preparator.py | 7 +- pandaharvester/harvestermisc/rucio_utils.py | 115 ++++++++------------ 4 files changed, 64 insertions(+), 84 deletions(-) diff --git a/pandaharvester/commit_timestamp.py b/pandaharvester/commit_timestamp.py index d7669d41..f4e34224 100644 --- a/pandaharvester/commit_timestamp.py +++ b/pandaharvester/commit_timestamp.py @@ -1 +1 @@ -timestamp = "05-03-2020 13:09:50 on master (by tmaeno)" +timestamp = "06-03-2020 14:48:09 on master (by tmaeno)" diff --git a/pandaharvester/harvesterbody/job_fetcher.py b/pandaharvester/harvesterbody/job_fetcher.py index b3418877..36dc5987 100644 --- a/pandaharvester/harvesterbody/job_fetcher.py +++ b/pandaharvester/harvesterbody/job_fetcher.py @@ -98,11 +98,6 @@ def run(self): fileGroupDictList.append(extractorCore.get_aux_inputs(jobSpec)) for fileGroupDict in fileGroupDictList: for tmpLFN, fileAttrs in iteritems(fileGroupDict): - # check file status - if tmpLFN not in fileStatMap: - fileStatMap[tmpLFN] = self.dbProxy.get_file_status(tmpLFN, 'input', - queueConfig.ddmEndpointIn, - 'starting') # make file spec fileSpec = FileSpec() fileSpec.PandaID = jobSpec.PandaID @@ -110,20 +105,25 @@ def run(self): fileSpec.lfn = tmpLFN fileSpec.endpoint = queueConfig.ddmEndpointIn fileSpec.scope = fileAttrs['scope'] + if 'INTERNAL_FileType' in fileAttrs: + fileSpec.fileType = fileAttrs['INTERNAL_FileType'] + jobSpec.auxInput = JobSpec.AUX_hasAuxInput + else: + fileSpec.fileType = 'input' + # check 
file status + if tmpLFN not in fileStatMap: + fileStatMap[tmpLFN] = self.dbProxy.get_file_status(tmpLFN, fileSpec.fileType, + queueConfig.ddmEndpointIn, + 'starting') # set preparing to skip stage-in if the file is (being) taken care of by another job - if 'ready' in fileStatMap[tmpLFN] or 'preparing' in fileStatMap[tmpLFN] \ - or 'to_prepare' in fileStatMap[tmpLFN]: + if [x for x in ['ready', 'preparing', 'to_prepare', 'triggered'] + if x in fileStatMap[tmpLFN]]: fileSpec.status = 'preparing' else: fileSpec.status = 'to_prepare' if fileSpec.status not in fileStatMap[tmpLFN]: fileStatMap[tmpLFN][fileSpec.status] = 0 fileStatMap[tmpLFN][fileSpec.status] += 1 - if 'INTERNAL_FileType' in fileAttrs: - fileSpec.fileType = fileAttrs['INTERNAL_FileType'] - jobSpec.auxInput = JobSpec.AUX_hasAuxInput - else: - fileSpec.fileType = 'input' if 'INTERNAL_URL' in fileAttrs: fileSpec.url = fileAttrs['INTERNAL_URL'] jobSpec.add_in_file(fileSpec) diff --git a/pandaharvester/harvesterbody/preparator.py b/pandaharvester/harvesterbody/preparator.py index 618d0f5c..b4b78470 100644 --- a/pandaharvester/harvesterbody/preparator.py +++ b/pandaharvester/harvesterbody/preparator.py @@ -7,6 +7,7 @@ from pandaharvester.harvesterbody.agent_base import AgentBase from pandaharvester.harvestercore.pilot_errors import PilotErrors from pandaharvester.harvestercore.job_spec import JobSpec +from pandaharvester.harvestercore.file_spec import FileSpec # logger _logger = core_utils.setup_logger('preparator') @@ -176,8 +177,10 @@ def run(self): # get plugin if jobSpec.auxInput in [None, JobSpec.AUX_hasAuxInput]: preparatorCore = self.pluginFactory.get_plugin(queueConfig.preparator) + fileType = 'input' else: preparatorCore = self.pluginFactory.get_plugin(queueConfig.aux_preparator) + fileType = FileSpec.AUX_INPUT if preparatorCore is None: # not found tmpLog.error('plugin for {0} not found'.format(jobSpec.computingSite)) @@ -201,13 +204,13 @@ def run(self): updateStatus = False if fileSpec.lfn not in fileStatMap[queueConfig.ddmEndpointIn]: fileStatMap[queueConfig.ddmEndpointIn][fileSpec.lfn] \ - = self.dbProxy.get_file_status(fileSpec.lfn, 'input', queueConfig.ddmEndpointIn, + = self.dbProxy.get_file_status(fileSpec.lfn, fileType, queueConfig.ddmEndpointIn, 'starting') if 'ready' in fileStatMap[queueConfig.ddmEndpointIn][fileSpec.lfn]: # the file is ready fileSpec.status = 'ready' # set group info if any - groupInfo = self.dbProxy.get_group_for_file(fileSpec.lfn, 'input', + groupInfo = self.dbProxy.get_group_for_file(fileSpec.lfn, fileType, queueConfig.ddmEndpointIn) if groupInfo is not None: fileSpec.groupID = groupInfo['groupID'] diff --git a/pandaharvester/harvestermisc/rucio_utils.py b/pandaharvester/harvestermisc/rucio_utils.py index 7dc45d68..92c026ad 100644 --- a/pandaharvester/harvestermisc/rucio_utils.py +++ b/pandaharvester/harvestermisc/rucio_utils.py @@ -2,30 +2,12 @@ utilities routines associated with Rucio CLI access """ -from future.utils import iteritems - try: import subprocess32 as subprocess except: import subprocess from pandaharvester.harvestercore import core_utils -from pandalogger.PandaLogger import PandaLogger -from pandalogger.LogWrapper import LogWrapper - -import time -import datetime -import uuid -import os -import sys -import stat -import os.path -import threading -import tarfile -import hashlib -import string -import shutil -import errno def rucio_create_dataset(tmpLog,datasetScope,datasetName): @@ -35,58 +17,53 @@ def rucio_create_dataset(tmpLog,datasetScope,datasetName): lifetime = 7*24*60*60 
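# note: for a hypothetical dataset, the command assembled below is
# equivalent to running on the shell:
#   rucio add-dataset --lifetime 604800 user.jdoe:user.jdoe.test_ds
# where 604800 = 7*24*60*60, i.e. the 7-day lifetime computed above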
tmpLog.debug('register {0}:{1} lifetime = {2}' .format(datasetScope, datasetName,lifetime)) - try: - executable = ['/usr/bin/env', - 'rucio', 'add-dataset'] - executable += [ '--lifetime',('%d' %lifetime)] - executable += [datasetName] - - #print executable - - tmpLog.debug('rucio add-dataset command: {0} '.format(executable)) - tmpLog.debug('rucio add-dataset command (for human): %s ' % ' '.join(executable)) - - process = subprocess.Popen(executable, - stdout=subprocess.PIPE, - stderr=subprocess.STDOUT) - - stdout,stderr = process.communicate() - - if process.returncode == 0: - tmpLog.debug(stdout) - return True,'' + executable = ['/usr/bin/env', + 'rucio', 'add-dataset'] + executable += [ '--lifetime',('%d' %lifetime)] + executable += [datasetName] + tmpLog.debug('rucio add-dataset command: {0} '.format(executable)) + tmpLog.debug('rucio add-dataset command (for human): %s ' % ' '.join(executable)) + process = subprocess.Popen(executable, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + universal_newlines=True) + stdout,stderr = process.communicate() + if process.returncode == 0: + tmpLog.debug(stdout) + return True,'' + else: + # check what failed + dataset_exists = False + rucio_sessions_limit_error = False + for line in stdout.split('\n'): + if 'Data Identifier Already Exists' in line: + dataset_exists = True + break + elif 'exceeded simultaneous SESSIONS_PER_USER limit' in line: + rucio_sessions_limit_error = True + break + if dataset_exists: + errMsg = 'dataset {0}:{1} already exists'.format(datasetScope, + datasetName) + tmpLog.debug(errMsg) + return True,errMsg + elif rucio_sessions_limit_error: + # do nothing + errStr = 'Rucio returned error, will retry: stdout: {0}'.format(stdout) + tmpLog.warning(errStr) + return None,errStr else: - # check what failed - dataset_exists = False - rucio_sessions_limit_error = False - for line in stdout.split('\n'): - if 'Data Identifier Already Exists' in line: - dataset_exists = True - break - elif 'exceeded simultaneous SESSIONS_PER_USER limit' in line: - rucio_sessions_limit_error = True - break - if dataset_exists: - errMsg = 'dataset {0}:{1} already exists'.format(datasetScope, - datasetName) - tmpLog.debug(errMsg) - return True,errMsg - elif rucio_sessions_limit_error: - # do nothing - errStr = 'Rucio returned error, will retry: stdout: {0}'.format(stdout) - tmpLog.warning(errStr) - return None,errStr - else: - # some other Rucio error - errStr = 'Rucio returned error : stdout: {0}'.format(stdout) - tmpLog.error(errStr) - return False,errStr - except Exception: - errMsg = 'Could not create dataset {0}:{1}'.format(datasetScope, - datasetName) - core_utils.dump_error_message(tmpLog) - tmpLog.error(errMsg) - return False,errMsg + # some other Rucio error + errStr = 'Rucio returned error : stdout: {0}'.format(stdout) + tmpLog.error(errStr) + return False,errStr + except Exception as e: + errMsg = 'Could not create dataset {0}:{1} with {2}'.format(datasetScope, + datasetName, + str(e)) + core_utils.dump_error_message(tmpLog) + tmpLog.error(errMsg) + return False,errMsg def rucio_add_files_to_dataset(tmpLog,datasetScope,datasetName,fileList): # add files to dataset From a32319596ddf4524163c034033b45b663b07d1f0 Mon Sep 17 00:00:00 2001 From: tmaeno Date: Mon, 9 Mar 2020 09:10:07 +0100 Subject: [PATCH 100/181] to ignore ready files in aux_input_preparation --- pandaharvester/commit_timestamp.py | 2 +- pandaharvester/harvesterbody/preparator.py | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git 
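rucio_create_dataset() above deliberately returns a tri-state result: True when the dataset was created (or already existed), None for a transient SESSIONS_PER_USER error that should be retried, and False for a hard failure. A sketch of how a caller would branch on it (the caller itself is hypothetical):

    def create_or_retry(tmp_log, scope, name):
        ok, msg = rucio_create_dataset(tmp_log, scope, name)
        if ok is True:
            return 'done'
        elif ok is None:
            return 'retry'    # transient; try again on the next cycle
        else:
            return 'failed'   # permanent; propagate msg as the error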
a/pandaharvester/commit_timestamp.py b/pandaharvester/commit_timestamp.py index f4e34224..5f99ceba 100644 --- a/pandaharvester/commit_timestamp.py +++ b/pandaharvester/commit_timestamp.py @@ -1 +1 @@ -timestamp = "06-03-2020 14:48:09 on master (by tmaeno)" +timestamp = "09-03-2020 08:10:07 on master (by tmaeno)" diff --git a/pandaharvester/harvesterbody/preparator.py b/pandaharvester/harvesterbody/preparator.py index b4b78470..48061049 100644 --- a/pandaharvester/harvesterbody/preparator.py +++ b/pandaharvester/harvesterbody/preparator.py @@ -154,7 +154,8 @@ def run(self): lockedBy, 'preparing', max_files_per_job=maxFilesPerJob, - ng_file_status_list=['triggered']) + ng_file_status_list=['triggered', + 'ready']) mainLog.debug('got {0} jobs to prepare'.format(len(jobsToTrigger))) # loop over all jobs fileStatMap = dict() From 9c7b678e2c4529286447ef2fa2574655a260b8e2 Mon Sep 17 00:00:00 2001 From: tmaeno Date: Tue, 10 Mar 2020 12:48:15 +0100 Subject: [PATCH 101/181] to set path for ready files in preparator.trigger_preparation --- pandaharvester/commit_timestamp.py | 2 +- pandaharvester/harvesterbody/job_fetcher.py | 4 +--- pandaharvester/harvesterbody/preparator.py | 7 ++++--- pandaharvester/harvestercore/db_proxy.py | 10 ++++++---- 4 files changed, 12 insertions(+), 11 deletions(-) diff --git a/pandaharvester/commit_timestamp.py b/pandaharvester/commit_timestamp.py index 5f99ceba..72bc14b8 100644 --- a/pandaharvester/commit_timestamp.py +++ b/pandaharvester/commit_timestamp.py @@ -1 +1 @@ -timestamp = "09-03-2020 08:10:07 on master (by tmaeno)" +timestamp = "10-03-2020 11:48:15 on master (by tmaeno)" diff --git a/pandaharvester/harvesterbody/job_fetcher.py b/pandaharvester/harvesterbody/job_fetcher.py index 36dc5987..78d7fb15 100644 --- a/pandaharvester/harvesterbody/job_fetcher.py +++ b/pandaharvester/harvesterbody/job_fetcher.py @@ -121,9 +121,7 @@ def run(self): fileSpec.status = 'preparing' else: fileSpec.status = 'to_prepare' - if fileSpec.status not in fileStatMap[tmpLFN]: - fileStatMap[tmpLFN][fileSpec.status] = 0 - fileStatMap[tmpLFN][fileSpec.status] += 1 + fileStatMap[tmpLFN].setdefault(fileSpec.status, None) if 'INTERNAL_URL' in fileAttrs: fileSpec.url = fileAttrs['INTERNAL_URL'] jobSpec.add_in_file(fileSpec) diff --git a/pandaharvester/harvesterbody/preparator.py b/pandaharvester/harvesterbody/preparator.py index 48061049..7a5ac288 100644 --- a/pandaharvester/harvesterbody/preparator.py +++ b/pandaharvester/harvesterbody/preparator.py @@ -210,6 +210,9 @@ def run(self): if 'ready' in fileStatMap[queueConfig.ddmEndpointIn][fileSpec.lfn]: # the file is ready fileSpec.status = 'ready' + if fileStatMap[queueConfig.ddmEndpointIn][fileSpec.lfn]['path']: + fileSpec.path = list( + fileStatMap[queueConfig.ddmEndpointIn][fileSpec.lfn]['path'])[0] # set group info if any groupInfo = self.dbProxy.get_group_for_file(fileSpec.lfn, fileType, queueConfig.ddmEndpointIn) @@ -229,9 +232,7 @@ def run(self): # set new status if updateStatus: newFileStatusData.append((fileSpec.fileID, fileSpec.lfn, fileSpec.status)) - if fileSpec.status not in fileStatMap[queueConfig.ddmEndpointIn][fileSpec.lfn]: - fileStatMap[queueConfig.ddmEndpointIn][fileSpec.lfn][fileSpec.status] = 0 - fileStatMap[queueConfig.ddmEndpointIn][fileSpec.lfn][fileSpec.status] += 1 + fileStatMap[queueConfig.ddmEndpointIn][fileSpec.lfn].setdefault(fileSpec.status, None) if len(newFileStatusData) > 0: self.dbProxy.change_file_status(jobSpec.PandaID, newFileStatusData, lockedBy) # wait since files are being prepared by another diff --git 
a/pandaharvester/harvestercore/db_proxy.py b/pandaharvester/harvestercore/db_proxy.py index 131e3470..93120465 100644 --- a/pandaharvester/harvestercore/db_proxy.py +++ b/pandaharvester/harvestercore/db_proxy.py @@ -4087,12 +4087,12 @@ def get_file_status(self, lfn, file_type, endpoint, job_status): method_name='get_file_status') tmpLog.debug('start') # sql to get files - sqlF = "SELECT f.status, COUNT(*) cnt FROM {0} f, {1} j ".format(fileTableName, jobTableName) + sqlF = "SELECT f.status, f.path, COUNT(*) cnt FROM {0} f, {1} j ".format(fileTableName, jobTableName) sqlF += "WHERE j.PandaID=f.PandaID AND j.status=:jobStatus " sqlF += "AND f.lfn=:lfn AND f.fileType=:type " if endpoint is not None: sqlF += "AND f.endpoint=:endpoint " - sqlF += "GROUP BY f.status " + sqlF += "GROUP BY f.status, f.path " # get files varMap = dict() varMap[':lfn'] = lfn @@ -4102,8 +4102,10 @@ def get_file_status(self, lfn, file_type, endpoint, job_status): varMap[':endpoint'] = endpoint self.execute(sqlF, varMap) retMap = dict() - for status, cnt in self.cur.fetchall(): - retMap[status] = cnt + for status, path, cnt in self.cur.fetchall(): + retMap.setdefault(status, {'cnt': 0, 'path': set()}) + retMap[status]['cnt'] += cnt + retMap[status]['path'].add(path) # commit self.commit() tmpLog.debug('got {0}'.format(str(retMap))) From d344f75a0e470f47783bfe8a3cc8914ea6d2e036 Mon Sep 17 00:00:00 2001 From: Doug Benjamin Date: Wed, 11 Mar 2020 06:48:28 -0500 Subject: [PATCH 102/181] fix Type error problem --- pandaharvester/harvesterbody/preparator.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandaharvester/harvesterbody/preparator.py b/pandaharvester/harvesterbody/preparator.py index 7a5ac288..10c1a20a 100644 --- a/pandaharvester/harvesterbody/preparator.py +++ b/pandaharvester/harvesterbody/preparator.py @@ -210,9 +210,9 @@ def run(self): if 'ready' in fileStatMap[queueConfig.ddmEndpointIn][fileSpec.lfn]: # the file is ready fileSpec.status = 'ready' - if fileStatMap[queueConfig.ddmEndpointIn][fileSpec.lfn]['path']: + if fileStatMap[queueConfig.ddmEndpointIn][fileSpec.lfn]['ready']['path']: fileSpec.path = list( - fileStatMap[queueConfig.ddmEndpointIn][fileSpec.lfn]['path'])[0] + fileStatMap[queueConfig.ddmEndpointIn][fileSpec.lfn]['ready']['path'])[0] # set group info if any groupInfo = self.dbProxy.get_group_for_file(fileSpec.lfn, fileType, queueConfig.ddmEndpointIn) From 1c6eff6e7acf2d82235a5febcaef4d7b174f71a5 Mon Sep 17 00:00:00 2001 From: Doug Benjamin Date: Wed, 11 Mar 2020 08:06:31 -0500 Subject: [PATCH 103/181] add file path to transfer id --- .../harvesterpreparator/analysis_aux_preparator.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/pandaharvester/harvesterpreparator/analysis_aux_preparator.py b/pandaharvester/harvesterpreparator/analysis_aux_preparator.py index 2855865c..bf1ab492 100644 --- a/pandaharvester/harvesterpreparator/analysis_aux_preparator.py +++ b/pandaharvester/harvesterpreparator/analysis_aux_preparator.py @@ -34,6 +34,7 @@ def trigger_preparation(self, jobspec): # loop over all inputs allDone = True bulkExtCommand = {} + tmpLog.debug('number of inFiles : {0}'.format(len(jobspec.inFiles))) for tmpFileSpec in jobspec.inFiles: # local access path url = tmpFileSpec.url @@ -117,6 +118,7 @@ def trigger_preparation(self, jobspec): allDone = False # execute external command execIdMap = {} + tmpLog.debug('bulkExtCommand : {0}'.format(bulkExtCommand)) for protocol in bulkExtCommand: args = [] for arg in 
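With the GROUP BY f.status, f.path change above, get_file_status() now reports, per file status, a count plus the set of paths seen, and the preparator picks a path off the 'ready' entry. The shape of the returned map, with invented values:

    retMap = {
        'ready': {'cnt': 2, 'path': {'/cache/aux_input/cont.sif'}},
        'preparing': {'cnt': 1, 'path': {None}},
    }
    if 'ready' in retMap and retMap['ready']['path']:
        # same lookup the preparator performs after PATCH 102's fix
        path = list(retMap['ready']['path'])[0]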
bulkExtCommand[protocol]['command']['trigger']['args']: @@ -139,7 +141,9 @@ def trigger_preparation(self, jobspec): executionID = None if return_code == 0 and 'check' in bulkExtCommand[protocol]['command']: executionID = [s for s in stdout.split('\n') if s][-1] - executionID = '{0}:{1}'.format(protocol, executionID) + dst = bulkExtCommand[protocol]['dst'][0] + executionID = '{0}:{1}:{2}'.format(protocol, executionID, dst) + tmpLog.debug('executionID - {0}'.format(executionID)) execIdMap[executionID] = {'lfns': bulkExtCommand[protocol]['lfn'], 'groupStatus': 'active'} stdout = stdout.replace('\n', ' ') stderr = stderr.replace('\n', ' ') @@ -151,6 +155,7 @@ def trigger_preparation(self, jobspec): core_utils.dump_error_message(tmpLog) allDone = False # keep execution ID to check later + tmpLog.debug('execIdMap : {0}'.format(execIdMap)) if execIdMap: jobspec.set_groups_to_files(execIdMap) # done @@ -180,11 +185,13 @@ def check_stage_in_status(self, jobspec): for tmpGroupID in transferGroups: if tmpGroupID is None: continue - protocol, executionID = tmpGroupID.split(':') + protocol, executionID, dst = tmpGroupID.split(':') args = [] for arg in self.externalCommand[protocol]['check']['args']: if arg == '{id}': arg = executionID + elif arg == '{dst}': + arg = dst args.append(arg) # execute try: @@ -198,6 +205,7 @@ def check_stage_in_status(self, jobspec): stderr = '' stdout = stdout.replace('\n', ' ') stderr = stderr.replace('\n', ' ') + tmpLog.debug("return_code: {0}".format(return_code)) tmpLog.debug("stdout: {0}".format(stdout)) tmpLog.debug("stderr: {0}".format(stderr)) if return_code != 0: @@ -209,7 +217,9 @@ def check_stage_in_status(self, jobspec): allDone = False break if not allDone: + tmpLog.debug("check_stage_in_status: Return : None errMsg : {0}".format(errMsg)) return None, errMsg + tmpLog.debug("check_stage_in_status: Return : True") return True, '' # resolve input file paths From 13993a5f158b923421e2910d3ac72775d2e6842d Mon Sep 17 00:00:00 2001 From: Doug Benjamin Date: Wed, 11 Mar 2020 18:50:17 -0500 Subject: [PATCH 104/181] make requested change per Tadashi --- pandaharvester/harvesterpreparator/analysis_aux_preparator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandaharvester/harvesterpreparator/analysis_aux_preparator.py b/pandaharvester/harvesterpreparator/analysis_aux_preparator.py index bf1ab492..ae5c044b 100644 --- a/pandaharvester/harvesterpreparator/analysis_aux_preparator.py +++ b/pandaharvester/harvesterpreparator/analysis_aux_preparator.py @@ -141,7 +141,7 @@ def trigger_preparation(self, jobspec): executionID = None if return_code == 0 and 'check' in bulkExtCommand[protocol]['command']: executionID = [s for s in stdout.split('\n') if s][-1] - dst = bulkExtCommand[protocol]['dst'][0] + dst = ','.join(bulkExtCommand[protocol]['dst']) executionID = '{0}:{1}:{2}'.format(protocol, executionID, dst) tmpLog.debug('executionID - {0}'.format(executionID)) execIdMap[executionID] = {'lfns': bulkExtCommand[protocol]['lfn'], 'groupStatus': 'active'} From 101827764beb55f7b83c8b03e777f644008989a0 Mon Sep 17 00:00:00 2001 From: fahui Date: Thu, 12 Mar 2020 14:01:04 +0100 Subject: [PATCH 105/181] htcondor: support condor host config file and weighting --- pandaharvester/commit_timestamp.py | 2 +- .../harvestersubmitter/htcondor_submitter.py | 37 +++++++++++++++++-- pandaharvester/panda_pkg_info.py | 2 +- 3 files changed, 36 insertions(+), 5 deletions(-) diff --git a/pandaharvester/commit_timestamp.py b/pandaharvester/commit_timestamp.py index 
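The transfer group ID built in patches 103/104 packs three colon-separated fields, with the destination paths themselves comma-joined; a sketch of the round trip (all values hypothetical). Note that the split assumes neither the execution ID nor the paths contain ':':

    protocol, executionID = 'gsiftp', '42'
    dst = ','.join(['/cache/f1.sif', '/cache/f2.sif'])
    tmpGroupID = '{0}:{1}:{2}'.format(protocol, executionID, dst)
    # later, in check_stage_in_status:
    protocol, executionID, dst = tmpGroupID.split(':')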
72bc14b8..95a18bc2 100644 --- a/pandaharvester/commit_timestamp.py +++ b/pandaharvester/commit_timestamp.py @@ -1 +1 @@ -timestamp = "10-03-2020 11:48:15 on master (by tmaeno)" +timestamp = "12-03-2020 13:01:04 on flin (by fahui)" diff --git a/pandaharvester/harvestersubmitter/htcondor_submitter.py b/pandaharvester/harvestersubmitter/htcondor_submitter.py index 0330270b..4a9727ba 100644 --- a/pandaharvester/harvestersubmitter/htcondor_submitter.py +++ b/pandaharvester/harvestersubmitter/htcondor_submitter.py @@ -4,6 +4,7 @@ import tempfile import threading import random +import json from concurrent.futures import ThreadPoolExecutor import re @@ -437,6 +438,7 @@ def parse_batch_job_filename(value_str, file_dir, batchID, guess=False): class HTCondorSubmitter(PluginBase): # constructor def __init__(self, **kwarg): + tmpLog = core_utils.make_logger(baseLogger, method_name='__init__') self.logBaseURL = None self.templateFile = None PluginBase.__init__(self, **kwarg) @@ -485,7 +487,7 @@ def __init__(self, **kwarg): self.CEtemplateDir except AttributeError: self.CEtemplateDir = '' - # remote condor schedd and pool name (collector), and spool option + # remote condor schedd and pool name (collector) try: self.condorSchedd except AttributeError: @@ -494,6 +496,32 @@ def __init__(self, **kwarg): self.condorPool except AttributeError: self.condorPool = None + # json config file of remote condor host: schedd/pool and weighting. If set, condorSchedd and condorPool are overwritten + try: + self.condorHostConfig + except AttributeError: + self.condorHostConfig = False + if self.condorHostConfig: + try: + self.condorSchedd = [] + self.condorPool = [] + self.condorHostWeight = [] + with open(self.condorHostConfig, 'r') as f: + condor_host_config_map = json.load(f) + for _schedd, _cm in condor_host_config_map.items(): + _pool = _cm['pool'] + _weight = int(_cm['weight']) + self.condorSchedd.append(_schedd) + self.condorPool.append(_pool) + self.condorHostWeight.append(_weight) + except Exception as e: + tmpLog.error('error when parsing condorHostConfig json file; {0}: {1}'.format(e.__class__.__name__, e)) + raise + else: + if isinstance(self.condorSchedd, list): + self.condorHostWeight = [1] * len(self.condorSchedd) + else: + self.condorHostWeight = [1] # condor spool mechanism. 
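The condorHostConfig file parsed above is plain JSON mapping each schedd to its pool and an integer weight; the weights then inflate the (schedd, pool) sampling list in submit_workers() (shown just below) so heavier hosts are drawn proportionally more often by random.sample(). An example, with placeholder host names:

    # condor_host_config.json (hypothetical):
    # {
    #   "schedd1.example.org": {"pool": "pool.example.org:9618", "weight": 3},
    #   "schedd2.example.org": {"pool": "pool.example.org:9618", "weight": 1}
    # }
    condorSchedd = ['schedd1.example.org', 'schedd2.example.org']
    condorPool = ['pool.example.org:9618', 'pool.example.org:9618']
    condorHostWeight = [3, 1]
    orig_list = []
    for _schedd, _pool, _weight in zip(condorSchedd, condorPool, condorHostWeight):
        orig_list.extend([(_schedd, _pool)] * _weight)
    # schedd1 now appears three times in orig_list, schedd2 once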
If False, need shared FS across remote schedd try: self.useSpool @@ -576,10 +604,13 @@ def submit_workers(self, workspec_list): # deal with Condor schedd and central managers; make a random list the choose n_bulks = _div_round_up(nWorkers, self.minBulkToRamdomizedSchedd) if isinstance(self.condorSchedd, list) and len(self.condorSchedd) > 0: + orig_list = [] if isinstance(self.condorPool, list) and len(self.condorPool) > 0: - orig_list = list(zip(self.condorSchedd, self.condorPool)) + for _schedd, _pool, _weight in zip(self.condorSchedd, self.condorPool, self.condorHostWeight): + orig_list.extend([(_schedd, _pool)] * _weight) else: - orig_list = [ (_schedd, self.condorPool) for _schedd in self.condorSchedd ] + for _schedd, _weight in zip(self.condorSchedd, self.condorHostWeight): + orig_list.extend([(_schedd, self.condorPool)] * _weight) if n_bulks < len(orig_list): schedd_pool_choice_list = random.sample(orig_list, n_bulks) else: diff --git a/pandaharvester/panda_pkg_info.py b/pandaharvester/panda_pkg_info.py index 3dbb6482..39128bab 100644 --- a/pandaharvester/panda_pkg_info.py +++ b/pandaharvester/panda_pkg_info.py @@ -1 +1 @@ -release_version = "0.2.1-rc" +release_version = "0.2.2-rc" From 7f3307bb1834bbc9e55383f49e42912284cdc05c Mon Sep 17 00:00:00 2001 From: fbarreir Date: Thu, 12 Mar 2020 14:09:25 +0100 Subject: [PATCH 106/181] added jobType to jdl template completion --- pandaharvester/harvestersubmitter/htcondor_submitter.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandaharvester/harvestersubmitter/htcondor_submitter.py b/pandaharvester/harvestersubmitter/htcondor_submitter.py index 0330270b..b14188bb 100644 --- a/pandaharvester/harvestersubmitter/htcondor_submitter.py +++ b/pandaharvester/harvestersubmitter/htcondor_submitter.py @@ -402,6 +402,7 @@ def make_a_jdl(workspec, template, n_core_per_node, log_dir, panda_queue_name, e logSubdir=log_subdir, gtag=batch_log_dict.get('gtag', 'fake_GTAG_string'), prodSourceLabel=prod_source_label, + jobType=workspec.jobType, resourceType=_get_resource_type(workspec.resourceType, is_unified_queue), pilotResourceTypeOption=_get_resource_type(workspec.resourceType, is_unified_queue, True, pilot_version), ioIntensity=io_intensity, From 1bfc7fa3197a5e338a072b848e6e2c678f382f47 Mon Sep 17 00:00:00 2001 From: fahui Date: Thu, 12 Mar 2020 14:47:40 +0100 Subject: [PATCH 107/181] fix monitor fifo worker with jobpecs --- pandaharvester/commit_timestamp.py | 2 +- pandaharvester/harvestercore/db_proxy.py | 15 +++++++++++++++ 2 files changed, 16 insertions(+), 1 deletion(-) diff --git a/pandaharvester/commit_timestamp.py b/pandaharvester/commit_timestamp.py index 95a18bc2..14a0e7f2 100644 --- a/pandaharvester/commit_timestamp.py +++ b/pandaharvester/commit_timestamp.py @@ -1 +1 @@ -timestamp = "12-03-2020 13:01:04 on flin (by fahui)" +timestamp = "12-03-2020 13:47:40 on flin (by fahui)" diff --git a/pandaharvester/harvestercore/db_proxy.py b/pandaharvester/harvestercore/db_proxy.py index 93120465..fbb313d0 100644 --- a/pandaharvester/harvestercore/db_proxy.py +++ b/pandaharvester/harvestercore/db_proxy.py @@ -4796,6 +4796,10 @@ def get_active_workers(self, n_workers, seconds_ago=0): sqlW += "WHERE status IN (:st_submitted,:st_running,:st_idle) " sqlW += "AND modificationTime<:timeLimit " sqlW += "ORDER BY modificationTime,computingSite LIMIT {0} ".format(n_workers) + # sql to get jobs + sqlJ = "SELECT j.{columns} FROM {jobWorkerTableName} jw, {jobTableName} j ".format(columns=JobSpec.column_names(), jobTableName=jobTableName, 
jobWorkerTableName=jobWorkerTableName) + sqlJ += "WHERE j.PandaID=jw.PandaID AND jw.workerID=:workerID " + # parameter map varMap = dict() varMap[':timeLimit'] = datetime.datetime.utcnow() - datetime.timedelta(seconds=seconds_ago) varMap[':st_submitted'] = WorkSpec.ST_submitted @@ -4806,7 +4810,18 @@ def get_active_workers(self, n_workers, seconds_ago=0): def _get_workspec_from_record(rec): workspec = WorkSpec() workspec.pack(rec) + jobspec_list = [] workspec.pandaid_list = [] + varMap = dict() + varMap[':workerID'] = workspec.workerID + self.execute(sqlJ, varMap) + resJ = self.cur.fetchall() + for one_job in resJ: + jobspec = JobSpec() + jobspec.pack(one_job) + jobspec_list.append(jobspec) + workspec.pandaid_list.append(jobspec.PandaID) + workspec.set_jobspec_list(jobspec_list) return workspec retVal = map(_get_workspec_from_record, resW) tmpLog.debug('got {0} workers'.format(len(resW))) From 42322444e2958245138e0e40c93fe9e892b4054c Mon Sep 17 00:00:00 2001 From: fahui Date: Thu, 12 Mar 2020 23:11:00 +0100 Subject: [PATCH 108/181] fix qconf mapper bug with pilotVersion --- pandaharvester/commit_timestamp.py | 2 +- pandaharvester/harvestercore/queue_config_mapper.py | 3 ++- templates/panda_harvester.cfg.rpmnew.template | 2 +- 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/pandaharvester/commit_timestamp.py b/pandaharvester/commit_timestamp.py index 14a0e7f2..df307c92 100644 --- a/pandaharvester/commit_timestamp.py +++ b/pandaharvester/commit_timestamp.py @@ -1 +1 @@ -timestamp = "12-03-2020 13:47:40 on flin (by fahui)" +timestamp = "12-03-2020 22:11:00 on flin (by fahui)" diff --git a/pandaharvester/harvestercore/queue_config_mapper.py b/pandaharvester/harvestercore/queue_config_mapper.py index 4b912c28..7466d7a1 100644 --- a/pandaharvester/harvestercore/queue_config_mapper.py +++ b/pandaharvester/harvestercore/queue_config_mapper.py @@ -587,7 +587,8 @@ def load_data(self): continue # filter for pilot version if hasattr(harvester_config.qconf, 'pilotVersion') and \ - pandaQueueDict[queueConfig.siteName].get('pilot_version') != str(harvester_config.qconf.pilotVersion): + pandaQueueDict.get(queueConfig.siteName) is not None and \ + pandaQueueDict.get(queueConfig.siteName).get('pilot_version') != str(harvester_config.qconf.pilotVersion): continue if 'ALL' not in harvester_config.qconf.queueList and \ 'DYNAMIC' not in harvester_config.qconf.queueList and \ diff --git a/templates/panda_harvester.cfg.rpmnew.template b/templates/panda_harvester.cfg.rpmnew.template index fce0977a..736d34a6 100644 --- a/templates/panda_harvester.cfg.rpmnew.template +++ b/templates/panda_harvester.cfg.rpmnew.template @@ -182,7 +182,7 @@ resolverClass = PandaQueuesDict autoBlacklist = False # restrict to a certain pilot version (optional) -pilotVersion = 1 +#pilotVersion = 2 From 40d9488582b3c7dd0cdcf2128bda3538978069bc Mon Sep 17 00:00:00 2001 From: Doug Benjamin Date: Fri, 13 Mar 2020 06:23:40 -0500 Subject: [PATCH 109/181] protect against : in container name --- .../harvesterpreparator/analysis_aux_preparator.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/pandaharvester/harvesterpreparator/analysis_aux_preparator.py b/pandaharvester/harvesterpreparator/analysis_aux_preparator.py index ae5c044b..99c177aa 100644 --- a/pandaharvester/harvesterpreparator/analysis_aux_preparator.py +++ b/pandaharvester/harvesterpreparator/analysis_aux_preparator.py @@ -185,7 +185,9 @@ def check_stage_in_status(self, jobspec): for tmpGroupID in transferGroups: if tmpGroupID is None: continue - 
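
The change below switches to a bounded split. A minimal illustration of why, with a hypothetical transfer group ID whose destination itself contains ':':

    # An unbounded split raises "too many values to unpack" when the
    # destination is, e.g., a container image reference with a tag;
    # maxsplit=2 keeps the remainder intact.
    tmp_group_id = 'docker:42:docker://registry.example.org/repo/image:tag'
    protocol, execution_id, dst = tmp_group_id.split(':', 2)  # 2 == maxsplit
    assert protocol == 'docker'
    assert execution_id == '42'
    assert dst == 'docker://registry.example.org/repo/image:tag'
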
protocol, executionID, dst = tmpGroupID.split(':') + tmpGroupID_parts = tmpGroupID.split(':',maxsplit=2) + tmpLog.debug('transfer group ID : {0} components: {1}'.format(tmpGroupID, tmpGroupID_parts)) + protocol, executionID, dst = tmpGroupID.split(':',maxsplit=2) args = [] for arg in self.externalCommand[protocol]['check']['args']: if arg == '{id}': @@ -224,11 +226,15 @@ def check_stage_in_status(self, jobspec): # resolve input file paths def resolve_input_paths(self, jobspec): + # make logger + tmpLog = self.make_logger(baseLogger, 'PandaID={0}'.format(jobspec.PandaID), + method_name='resolve_input_paths') pathInfo = dict() for tmpFileSpec in jobspec.inFiles: url = tmpFileSpec.lfn accPath = self.make_local_access_path(tmpFileSpec.scope, tmpFileSpec.lfn) pathInfo[tmpFileSpec.lfn] = {'path': accPath} + tmpLog.debug('lfn: {0} scope : {1} accPath : {2} pathInfo : {3}'.format(url, tmpFileSpec.scope, accPath, pathInfo)) jobspec.set_input_file_paths(pathInfo) return True, '' From fc85abfc21d83e60a5ffa782b0d633df61169c31 Mon Sep 17 00:00:00 2001 From: Doug Benjamin Date: Fri, 13 Mar 2020 22:54:50 -0500 Subject: [PATCH 110/181] protect for consistency --- pandaharvester/harvestercore/job_spec.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pandaharvester/harvestercore/job_spec.py b/pandaharvester/harvestercore/job_spec.py index 42c2b615..65a6d38e 100644 --- a/pandaharvester/harvestercore/job_spec.py +++ b/pandaharvester/harvestercore/job_spec.py @@ -334,7 +334,9 @@ def set_input_file_paths(self, in_files): lfns = self.get_input_file_attributes().keys() paths = [] for lfn in lfns: - paths.append(in_files[lfn]['path']) + # check for consistency + if lfn in in_files: + paths.append(in_files[lfn]['path']) self.jobParams['inFilePaths'] = ','.join(paths) # trigger updating self.force_update('jobParams') From ef4f7ce2ee627c02511cba9fdcc2765a891aa574 Mon Sep 17 00:00:00 2001 From: fahui Date: Fri, 20 Mar 2020 10:48:08 +0100 Subject: [PATCH 111/181] uwsgi run foreground option --- pandaharvester/commit_timestamp.py | 2 +- templates/init.d/panda_harvester-uwsgi.rpmnew.template | 7 +++++++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/pandaharvester/commit_timestamp.py b/pandaharvester/commit_timestamp.py index df307c92..ea635740 100644 --- a/pandaharvester/commit_timestamp.py +++ b/pandaharvester/commit_timestamp.py @@ -1 +1 @@ -timestamp = "12-03-2020 22:11:00 on flin (by fahui)" +timestamp = "20-03-2020 09:48:08 on flin (by fahui)" diff --git a/templates/init.d/panda_harvester-uwsgi.rpmnew.template b/templates/init.d/panda_harvester-uwsgi.rpmnew.template index 1652c445..16b476b7 100755 --- a/templates/init.d/panda_harvester-uwsgi.rpmnew.template +++ b/templates/init.d/panda_harvester-uwsgi.rpmnew.template @@ -172,6 +172,13 @@ start) fi fi ;; +runfg) + echo Run harvester prescript + ${PRESCRIPT} + echo Run Harvester in foreground + ${PROGNAME} >> ${HSTDOUT} 2>> ${HSTDERR} + rm -f ${PIDFILE} + ;; stop) if [ ! -f ${PIDFILE} ]; then echo "WANRING: pidfile:${PIDFILE} does not exist. 
Nothing done"

From eec588d4be6756d06a77bff66dd67db63c1bbd38 Mon Sep 17 00:00:00 2001
From: fbarreir
Date: Tue, 24 Mar 2020 14:21:17 +0100
Subject: [PATCH 112/181] k8s: changed the cred manager logging id to be able
 to distinguish the queue being patched

---
 pandaharvester/harvesterbody/cred_manager.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/pandaharvester/harvesterbody/cred_manager.py b/pandaharvester/harvesterbody/cred_manager.py
index 72c51c21..1f97536f 100644
--- a/pandaharvester/harvesterbody/cred_manager.py
+++ b/pandaharvester/harvesterbody/cred_manager.py
@@ -74,8 +74,11 @@ def execute(self):
             # do nothing
             if exeCore is None:
                 continue
-            # make logger
-            mainLog = self.make_logger(_logger, "{0} {1}".format(exeCore.__class__.__name__, exeCore.outCertFile),
+
+            # make logger
+            mainLog = self.make_logger(_logger, "{0} {1} {2}".format(exeCore.__class__.__name__,
+                                                                     exeCore.inCertFile,
+                                                                     exeCore.outCertFile),
                                        method_name='execute')
             try:
                 # check credential

From 7909d07d86c59f6f3e8aa43cce1da924a95ac8fc Mon Sep 17 00:00:00 2001
From: fahui
Date: Fri, 27 Mar 2020 21:13:58 +0100
Subject: [PATCH 113/181] htcondor: add pilot version support, remove pilot 1

---
 pandaharvester/commit_timestamp.py | 2 +-
 .../harvestersubmitter/htcondor_submitter.py | 50 +++++++------------
 2 files changed, 20 insertions(+), 32 deletions(-)

diff --git a/pandaharvester/commit_timestamp.py b/pandaharvester/commit_timestamp.py
index ea635740..4ac6ce2f 100644
--- a/pandaharvester/commit_timestamp.py
+++ b/pandaharvester/commit_timestamp.py
@@ -1 +1 @@
-timestamp = "20-03-2020 09:48:08 on flin (by fahui)"
+timestamp = "27-03-2020 20:13:58 on flin (by fahui)"

diff --git a/pandaharvester/harvestersubmitter/htcondor_submitter.py b/pandaharvester/harvestersubmitter/htcondor_submitter.py
index bbbbb3f8..78a7185a 100644
--- a/pandaharvester/harvestersubmitter/htcondor_submitter.py
+++ b/pandaharvester/harvestersubmitter/htcondor_submitter.py
@@ -178,16 +178,13 @@ def _condor_macro_replace(string, **kwarg):

 # Parse resource type from string for Unified PanDA Queue
-def _get_resource_type(string, is_unified_queue, is_pilot_option=False, pilot_version='1'):
+def _get_resource_type(string, is_unified_queue, is_pilot_option=False):
     string = str(string)
     if not is_unified_queue:
         ret = ''
     elif string in set(['SCORE', 'MCORE', 'SCORE_HIMEM', 'MCORE_HIMEM']):
         if is_pilot_option:
-            if pilot_version == '2':
-                ret = '--resource-type {0}'.format(string)
-            else:
-                ret = '-R {0}'.format(string)
+            ret = '--resource-type {0}'.format(string)
         else:
             ret = string
     else:
@@ -196,23 +193,13 @@ def _get_resource_type(string, is_unified_queue, is_pilot_option=False, pilot_ve

 # Map "pilotType" (defined in harvester) to prodSourceLabel and pilotType option (defined in pilot, -i option)
-# and piloturl (pilot option --piloturl)
-# Depending on pilot version 1 or 2
-def _get_prodsourcelabel_pilotypeopt_piloturlstr(pilot_type, pilot_version='1'):
-    if pilot_version == '2':
-        # pilot 2
-        pt_psl_map = {
-            'RC': ('rc_test2', 'RC', '--piloturl http://cern.ch/atlas-panda-pilot/pilot2-dev.tar.gz'),
-            'ALRB': ('rc_alrb', 'ALRB', ''),
-            'PT': ('ptest', 'PR', ''),
-        }
-    else:
-        # pilot 1, need not piloturl since wrapper covers it
-        pt_psl_map = {
-            'RC': ('rc_test', 'RC', ''),
-            'ALRB': ('rc_alrb', 'ALRB', ''),
-            'PT': ('ptest', 'PR', ''),
-        }
+# and piloturl (pilot option --piloturl) for pilot 2
+def _get_complicated_pilot_options(pilot_type):
+    pt_psl_map = {
+        'RC': ('rc_test2', 'RC', '--piloturl 
http://cern.ch/atlas-panda-pilot/pilot2-dev.tar.gz'), + 'ALRB': ('rc_alrb', 'ALRB', ''), + 'PT': ('ptest', 'PR', ''), + } pilot_opt_tuple = pt_psl_map.get(pilot_type, None) return pilot_opt_tuple @@ -329,7 +316,7 @@ def submit_bag_of_workers(data_list): # make a condor jdl for a worker def make_a_jdl(workspec, template, n_core_per_node, log_dir, panda_queue_name, executable_file, x509_user_proxy, log_subdir=None, ce_info_dict=dict(), batch_log_dict=dict(), - special_par='', harvester_queue_config=None, is_unified_queue=False, pilot_version='1', **kwarg): + special_par='', harvester_queue_config=None, is_unified_queue=False, pilot_version='unknown', **kwarg): # make logger tmpLog = core_utils.make_logger(baseLogger, 'workerID={0}'.format(workspec.workerID), method_name='make_a_jdl') @@ -364,7 +351,7 @@ def make_a_jdl(workspec, template, n_core_per_node, log_dir, panda_queue_name, e request_walltime_minute = _div_round_up(request_walltime, 60) request_cputime_minute = _div_round_up(request_cputime, 60) # decide prodSourceLabel - pilot_opt_tuple = _get_prodsourcelabel_pilotypeopt_piloturlstr(workspec.pilotType, pilot_version) + pilot_opt_tuple = _get_complicated_pilot_options(workspec.pilotType) if pilot_opt_tuple is None: prod_source_label = harvester_queue_config.get_source_label(workspec.jobType) pilot_type_opt = workspec.pilotType @@ -405,10 +392,11 @@ def make_a_jdl(workspec, template, n_core_per_node, log_dir, panda_queue_name, e prodSourceLabel=prod_source_label, jobType=workspec.jobType, resourceType=_get_resource_type(workspec.resourceType, is_unified_queue), - pilotResourceTypeOption=_get_resource_type(workspec.resourceType, is_unified_queue, True, pilot_version), + pilotResourceTypeOption=_get_resource_type(workspec.resourceType, is_unified_queue, True), ioIntensity=io_intensity, pilotType=pilot_type_opt, pilotUrlOption=pilot_url_str, + pilotVersion=pilot_version, ) # save jdl to submit description file tmpFile.write(jdl_str) @@ -593,8 +581,8 @@ def submit_workers(self, workspec_list): # get default information from queue info n_core_per_node_from_queue = this_panda_queue_dict.get('corecount', 1) if this_panda_queue_dict.get('corecount', 1) else 1 is_unified_queue = this_panda_queue_dict.get('capability', '') == 'ucore' - pilot_version_orig = str(this_panda_queue_dict.get('pilot_version', '')) - pilot_version_suffix_str = '_pilot2' if pilot_version_orig == '2' else '' + pilot_version = str(this_panda_queue_dict.get('pilot_version', 'current')) + sdf_suffix_str = '_pilot2' # get override requirements from queue configured try: @@ -688,10 +676,10 @@ def _handle_one_worker(workspec, to_submit=to_submit_any): default_port = default_port_map[ce_flavour_str] ce_info_dict['ce_endpoint'] = '{0}:{1}'.format(ce_endpoint_from_queue, default_port) tmpLog.debug('For site {0} got pilot version: "{1}"; CE endpoint: "{2}", flavour: "{3}"'.format( - self.queueName, pilot_version_orig, ce_endpoint_from_queue, ce_flavour_str)) + self.queueName, pilot_version, ce_endpoint_from_queue, ce_flavour_str)) if not self.templateFile and os.path.isdir(self.CEtemplateDir) and ce_flavour_str: - sdf_template_filename = '{ce_flavour_str}{pilot_version_suffix_str}.sdf'.format( - ce_flavour_str=ce_flavour_str, pilot_version_suffix_str=pilot_version_suffix_str) + sdf_template_filename = '{ce_flavour_str}{sdf_suffix_str}.sdf'.format( + ce_flavour_str=ce_flavour_str, sdf_suffix_str=sdf_suffix_str) self.templateFile = os.path.join(self.CEtemplateDir, sdf_template_filename) else: try: @@ -804,7 +792,7 @@ def 
_handle_one_worker(workspec, to_submit=to_submit_any): 'condor_schedd': condor_schedd, 'condor_pool': condor_pool, 'use_spool': self.useSpool, - 'pilot_version': pilot_version_orig, + 'pilot_version': pilot_version, }) return data From 81269f9473aeffea6f269d7b292b365c37ffe995 Mon Sep 17 00:00:00 2001 From: fahui Date: Tue, 7 Apr 2020 13:18:20 +0200 Subject: [PATCH 114/181] add dummy cred_manager --- pandaharvester/commit_timestamp.py | 2 +- .../harvestercredmanager/dummy_cred_manager.py | 17 +++++++++++++++++ 2 files changed, 18 insertions(+), 1 deletion(-) create mode 100644 pandaharvester/harvestercredmanager/dummy_cred_manager.py diff --git a/pandaharvester/commit_timestamp.py b/pandaharvester/commit_timestamp.py index 4ac6ce2f..92088277 100644 --- a/pandaharvester/commit_timestamp.py +++ b/pandaharvester/commit_timestamp.py @@ -1 +1 @@ -timestamp = "27-03-2020 20:13:58 on flin (by fahui)" +timestamp = "07-04-2020 11:18:20 on flin (by fahui)" diff --git a/pandaharvester/harvestercredmanager/dummy_cred_manager.py b/pandaharvester/harvestercredmanager/dummy_cred_manager.py new file mode 100644 index 00000000..c4657d61 --- /dev/null +++ b/pandaharvester/harvestercredmanager/dummy_cred_manager.py @@ -0,0 +1,17 @@ +from pandaharvester.harvestercore.plugin_base import PluginBase + + +# dummy credential manager +class DummyCredManager(PluginBase): + + # constructor + def __init__(self, **kwarg): + PluginBase.__init__(self, **kwarg) + + # check proxy + def check_credential(self): + return True + + # renew proxy + def renew_credential(self): + return True, '' From 446440f7fb8ff603c50eb9bbbd9aefd1722fc100 Mon Sep 17 00:00:00 2001 From: fahui Date: Thu, 9 Apr 2020 19:48:10 +0200 Subject: [PATCH 115/181] htcondor: support of piloturl from AGIS associated params --- pandaharvester/commit_timestamp.py | 2 +- .../harvestersubmitter/htcondor_submitter.py | 53 ++++++++++++++----- 2 files changed, 42 insertions(+), 13 deletions(-) diff --git a/pandaharvester/commit_timestamp.py b/pandaharvester/commit_timestamp.py index 92088277..4b84cd08 100644 --- a/pandaharvester/commit_timestamp.py +++ b/pandaharvester/commit_timestamp.py @@ -1 +1 @@ -timestamp = "07-04-2020 11:18:20 on flin (by fahui)" +timestamp = "09-04-2020 17:48:10 on flin (by fahui)" diff --git a/pandaharvester/harvestersubmitter/htcondor_submitter.py b/pandaharvester/harvestersubmitter/htcondor_submitter.py index 78a7185a..9114449a 100644 --- a/pandaharvester/harvestersubmitter/htcondor_submitter.py +++ b/pandaharvester/harvestersubmitter/htcondor_submitter.py @@ -194,14 +194,28 @@ def _get_resource_type(string, is_unified_queue, is_pilot_option=False): # Map "pilotType" (defined in harvester) to prodSourceLabel and pilotType option (defined in pilot, -i option) # and piloturl (pilot option --piloturl) for pilot 2 -def _get_complicated_pilot_options(pilot_type): +def _get_complicated_pilot_options(pilot_type, pilot_url=None): pt_psl_map = { - 'RC': ('rc_test2', 'RC', '--piloturl http://cern.ch/atlas-panda-pilot/pilot2-dev.tar.gz'), - 'ALRB': ('rc_alrb', 'ALRB', ''), - 'PT': ('ptest', 'PR', ''), - } - pilot_opt_tuple = pt_psl_map.get(pilot_type, None) - return pilot_opt_tuple + 'RC': { + 'prod_source_label': 'rc_test2', + 'pilot_type_opt': 'RC', + 'pilot_url_str': '--piloturl http://cern.ch/atlas-panda-pilot/pilot2-dev.tar.gz', + }, + 'ALRB': { + 'prod_source_label': 'rc_alrb', + 'pilot_type_opt': 'ALRB', + 'pilot_url_str': '', + }, + 'PT': { + 'prod_source_label': 'ptest', + 'pilot_type_opt': 'PR', + 'pilot_url_str': '', + }, + } + 
pilot_opt_dict = pt_psl_map.get(pilot_type, None) + if pilot_url and pilot_opt_dict: + pilot_opt_dict['pilot_url_str'] = '--piloturl {0}'.format(pilot_url) + return pilot_opt_dict # submit a bag of workers @@ -315,7 +329,7 @@ def submit_bag_of_workers(data_list): # make a condor jdl for a worker def make_a_jdl(workspec, template, n_core_per_node, log_dir, panda_queue_name, executable_file, - x509_user_proxy, log_subdir=None, ce_info_dict=dict(), batch_log_dict=dict(), + x509_user_proxy, log_subdir=None, ce_info_dict=dict(), batch_log_dict=dict(), pilot_url=None, special_par='', harvester_queue_config=None, is_unified_queue=False, pilot_version='unknown', **kwarg): # make logger tmpLog = core_utils.make_logger(baseLogger, 'workerID={0}'.format(workspec.workerID), @@ -351,13 +365,15 @@ def make_a_jdl(workspec, template, n_core_per_node, log_dir, panda_queue_name, e request_walltime_minute = _div_round_up(request_walltime, 60) request_cputime_minute = _div_round_up(request_cputime, 60) # decide prodSourceLabel - pilot_opt_tuple = _get_complicated_pilot_options(workspec.pilotType) - if pilot_opt_tuple is None: + pilot_opt_dict = _get_complicated_pilot_options(workspec.pilotType, pilot_url=pilot_url) + if pilot_opt_dict is None: prod_source_label = harvester_queue_config.get_source_label(workspec.jobType) pilot_type_opt = workspec.pilotType - pilot_url_str = '' + pilot_url_str = '--piloturl {0}'.format(pilot_url) if pilot_url else '' else: - prod_source_label, pilot_type_opt, pilot_url_str = pilot_opt_tuple + prod_source_label = pilot_opt_dict['prod_source_label'] + pilot_type_opt = pilot_opt_dict['pilot_type_opt'] + pilot_url_str = pilot_opt_dict['pilot_url_str'] # open tmpfile as submit description file tmpFile = tempfile.NamedTemporaryFile(mode='w', delete=False, suffix='_submit.sdf', dir=workspec.get_access_point()) # fill in template string @@ -524,6 +540,10 @@ def __init__(self, **kwarg): # record of information of CE statistics self.ceStatsLock = threading.Lock() self.ceStats = dict() + # allowed associated parameters from AGIS + self._allowed_agis_attrs = ( + 'pilot_url', + ) # get CE statistics of a site def get_ce_statistics(self, site_name, n_new_workers, time_window=21600): @@ -565,6 +585,9 @@ def submit_workers(self, workspec_list): _queueConfigMapper = QueueConfigMapper() harvester_queue_config = _queueConfigMapper.get_queue(self.queueName) + # associated parameters dict + associated_params_dict = {} + is_grandly_unified_queue = False # get queue info from AGIS by cacher in db if self.useAtlasAGIS: @@ -573,6 +596,10 @@ def submit_workers(self, workspec_list): this_panda_queue_dict = panda_queues_dict.get(self.queueName, dict()) is_grandly_unified_queue = panda_queues_dict.is_grandly_unified_queue(self.queueName) # tmpLog.debug('panda_queues_name and queue_info: {0}, {1}'.format(self.queueName, panda_queues_dict[self.queueName])) + # associated params on AGIS + for key, val in panda_queues_dict.get_harvester_params(self.queueName).items(): + if key in self._allowed_agis_attrs: + associated_params_dict[key] = val else: panda_queues_dict = dict() panda_queue_name = self.queueName @@ -581,6 +608,7 @@ def submit_workers(self, workspec_list): # get default information from queue info n_core_per_node_from_queue = this_panda_queue_dict.get('corecount', 1) if this_panda_queue_dict.get('corecount', 1) else 1 is_unified_queue = this_panda_queue_dict.get('capability', '') == 'ucore' + pilot_url = associated_params_dict.get('pilot_url') pilot_version = 
str(this_panda_queue_dict.get('pilot_version', 'current')) sdf_suffix_str = '_pilot2' @@ -792,6 +820,7 @@ def _handle_one_worker(workspec, to_submit=to_submit_any): 'condor_schedd': condor_schedd, 'condor_pool': condor_pool, 'use_spool': self.useSpool, + 'pilot_url': pilot_url, 'pilot_version': pilot_version, }) return data From eed8e4f38cb3e440f83aac94d614f19c8c22701f Mon Sep 17 00:00:00 2001 From: fbarreir Date: Tue, 14 Apr 2020 16:50:07 +0200 Subject: [PATCH 116/181] k8s: improvements --- examples/k8s/job_cern.yaml | 80 ++++++ .../harvestercloud/pilots_starter.py | 240 ++++++++++++++++++ pandaharvester/harvestermisc/k8s_utils.py | 72 ++++-- .../harvestermonitor/k8s_monitor.py | 128 +++++----- setup.py | 8 + 5 files changed, 436 insertions(+), 92 deletions(-) create mode 100644 examples/k8s/job_cern.yaml create mode 100644 pandaharvester/harvestercloud/pilots_starter.py diff --git a/examples/k8s/job_cern.yaml b/examples/k8s/job_cern.yaml new file mode 100644 index 00000000..c5a08fe1 --- /dev/null +++ b/examples/k8s/job_cern.yaml @@ -0,0 +1,80 @@ +--- +apiVersion: batch/v1 +kind: Job +metadata: + name: grid-job +spec: + ttlSecondsAfterFinished: 172800 + backoffLimit: 0 + template: + spec: + restartPolicy: Never + containers: + - name: atlas-grid-centos7 + image: atlasadc/atlas-grid-centos7 + env: + - name: computingSite + value: "$computingSite" + - name: pandaQueueName + value: "$pandaQueueName" + - name: proxySecretPath + value: "$proxySecretPath" + - name: proxyContent + value: "$proxyContent" + - name: workerID + value: "$workerID" + - name: logs_frontend_w + value: "$logs_frontend_w" + - name: logs_frontend_r + value: "$logs_frontend_r" + - name: resourceType + value: "$resourceType" + - name: HARVESTER_WORKER_ID + value: "$HARVESTER_WORKER_ID" + - name: HARVESTER_ID + value: "$HARVESTER_ID" + - name: PANDA_JSID + value: "$PANDA_JSID" + - name: TMPDIR + value: "/root" + - name: PILOT_NOKILL + value: "True" + command: ["/usr/bin/bash"] + args: ["-c", "cd; wget https://raw.githubusercontent.com/HSF/harvester/k8s_analysis/pandaharvester/harvestercloud/pilots_starter.py; chmod 755 pilots_starter.py; ./pilots_starter.py || true"] + volumeMounts: + - name: atlas + mountPath: /cvmfs/atlas.cern.ch + - name: atlas-condb + mountPath: /cvmfs/atlas-condb.cern.ch + - name: atlas-nightlies + mountPath: /cvmfs/atlas-nightlies.cern.ch + - name: sft + mountPath: /cvmfs/sft.cern.ch + - name: grid + mountPath: /cvmfs/grid.cern.ch + - name: proxy-secret + mountPath: /proxy + volumes: + - name: atlas + persistentVolumeClaim: + claimName: csi-cvmfs-atlas-pvc + readOnly: true + - name: atlas-condb + persistentVolumeClaim: + claimName: csi-cvmfs-atlas-condb-pvc + readOnly: true + - name: atlas-nightlies + persistentVolumeClaim: + claimName: csi-cvmfs-atlas-nightlies-pvc + readOnly: true + - name: sft + persistentVolumeClaim: + claimName: csi-cvmfs-sft-pvc + readOnly: true + - name: grid + persistentVolumeClaim: + claimName: csi-cvmfs-grid-pvc + readOnly: true + - name: proxy-secret + secret: + secretName: proxy-secret \ No newline at end of file diff --git a/pandaharvester/harvestercloud/pilots_starter.py b/pandaharvester/harvestercloud/pilots_starter.py new file mode 100644 index 00000000..150dd7a4 --- /dev/null +++ b/pandaharvester/harvestercloud/pilots_starter.py @@ -0,0 +1,240 @@ +#!/usr/bin/env python + +""" +This script will be executed at container startup +- It will retrieve the proxy and panda queue from the environment +- It will download the pilot wrapper from github and execute it +- It will 
upload the pilot logs to panda cache at the end + +post-multipart code was taken from: https://github.com/haiwen/webapi-examples/blob/master/python/upload-file.py +""" + +try: + import subprocess32 as subprocess +except Exception: + import subprocess +import os +import sys +import shutil +import logging +import httplib +import mimetypes +import ssl +import urlparse +import urllib2 +import traceback + +WORK_DIR = '/scratch' +CONFIG_DIR = '/scratch/jobconfig' +PJD = 'pandaJobData.out' +PFC = 'PoolFileCatalog_H.xml' +CONFIG_FILES = [PJD, PFC] + +logging.basicConfig(level=logging.DEBUG, format='%(asctime)s %(levelname)s %(message)s', stream=sys.stdout) + + +# handlers=[logging.FileHandler('/tmp/vm_script.log'), logging.StreamHandler(sys.stdout)]) +# filename='/tmp/vm_script.log', filemode='w') + + +def post_multipart(host, port, selector, files, proxy_cert): + """ + Post files to an http host as multipart/form-data. + files is a sequence of (name, filename, value) elements for data to be uploaded as files + Return the server's response page. + """ + content_type, body = encode_multipart_formdata(files) + + context = ssl.create_default_context(ssl.Purpose.CLIENT_AUTH) + context.load_cert_chain(certfile=proxy_cert, keyfile=proxy_cert) + + h = httplib.HTTPSConnection(host, port, context=context, timeout=180) + + h.putrequest('POST', selector) + h.putheader('content-type', content_type) + h.putheader('content-length', str(len(body))) + h.endheaders() + h.send(body) + response = h.getresponse() + return response.status, response.reason + + +def encode_multipart_formdata(files): + """ + files is a sequence of (name, filename, value) elements for data to be uploaded as files + Return (content_type, body) ready for httplib.HTTP instance + """ + BOUNDARY = '----------ThIs_Is_tHe_bouNdaRY_$' + CRLF = '\r\n' + L = [] + for (key, filename, value) in files: + L.append('--' + BOUNDARY) + L.append('Content-Disposition: form-data; name="%s"; filename="%s"' % (key, filename)) + L.append('Content-Type: %s' % get_content_type(filename)) + L.append('') + L.append(value) + L.append('--' + BOUNDARY + '--') + L.append('') + body = CRLF.join(L) + content_type = 'multipart/form-data; boundary=%s' % BOUNDARY + return content_type, body + + +def get_content_type(filename): + return mimetypes.guess_type(filename)[0] or 'application/octet-stream' + + +def upload_logs(url, log_file_name, destination_name, proxy_cert): + try: + full_url = url + '/putFile' + urlparts = urlparse.urlsplit(full_url) + + logging.debug('[upload_logs] start') + files = [('file', destination_name, open(log_file_name).read())] + status, reason = post_multipart(urlparts.hostname, urlparts.port, urlparts.path, files, proxy_cert) + logging.debug('[upload_logs] finished with code={0} msg={1}'.format(status, reason)) + if status == 200: + return True + except Exception: + err_type, err_value = sys.exc_info()[:2] + err_messsage = "failed to put with {0}:{1} ".format(err_type, err_value) + err_messsage += traceback.format_exc() + logging.debug('[upload_logs] excepted with:\n {0}'.format(err_messsage)) + + return False + + +def get_url(url, headers=None): + """ + get content from specified URL + TODO: error handling + """ + response = urllib2.urlopen(wrapper_url) + content = response.read() + return content + + +def copy_files_in_dir(src_dir, dst_dir): + # src_files = os.listdir(src_dir) + for file_name in CONFIG_FILES: + full_file_name = os.path.join(src_dir, file_name) + shutil.copy(full_file_name, dst_dir) + + +def get_configuration(): + # get the proxy 
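    # A note on the two delivery modes handled below (a sketch, assuming the
    # submitter flattens the PEM newlines to commas so the proxy fits into a
    # single environment variable):
    #
    #   pem = open('/data/atlpan/x509up', 'r').read()              # hypothetical path
    #   os.environ['proxyContent'] = pem.replace('\n', ',')        # submitter side
    #   restored = os.environ['proxyContent'].replace(',', '\n')   # this script
    #   assert restored == pem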
certificate and save it + if os.environ.get('proxySecretPath'): + # os.symlink(os.environ.get('proxySecretPath'), proxy_path) + proxy_path = os.environ.get('proxySecretPath') + elif os.environ.get('proxyContent'): + proxy_path = "/tmp/x509up" + proxy_string = os.environ.get('proxyContent').replace(",", "\n") + with open(proxy_path, "w") as proxy_file: + proxy_file.write(proxy_string) + del os.environ['proxyContent'] + os.chmod(proxy_path, 0o600) + else: + logging.debug('[main] no proxy specified in env var $proxySecretPath nor $proxyContent') + raise Exception('Found no voms proxy specified') + os.environ['X509_USER_PROXY'] = proxy_path + logging.debug('[main] initialized proxy') + + # get the panda site name + panda_site = os.environ.get('computingSite') + logging.debug('[main] got panda site: {0}'.format(panda_site)) + + # get the panda queue name + panda_queue = os.environ.get('pandaQueueName') + logging.debug('[main] got panda queue: {0}'.format(panda_queue)) + + # get the resource type of the worker + resource_type = os.environ.get('resourceType') + logging.debug('[main] got resource type: {0}'.format(resource_type)) + + # get the Harvester ID + harvester_id = os.environ.get('HARVESTER_ID') + logging.debug('[main] got Harvester ID: {0}'.format(harvester_id)) + + # get the worker id + worker_id = os.environ.get('workerID') + logging.debug('[main] got worker ID: {0}'.format(worker_id)) + + # get the URL (e.g. panda cache) to upload logs + logs_frontend_w = os.environ.get('logs_frontend_w') + logging.debug('[main] got url to upload logs') + + # get the URL (e.g. panda cache) where the logs can be downloaded afterwards + logs_frontend_r = os.environ.get('logs_frontend_r') + logging.debug('[main] got url to download logs') + + # get the filename to use for the stdout log + stdout_name = os.environ.get('stdout_name') + if not stdout_name: + stdout_name = '{0}_{1}.out'.format(harvester_id, worker_id) + + logging.debug('[main] got filename for the stdout log') + + # get the submission mode (push/pull) for the pilot + submit_mode = os.environ.get('submit_mode') + if not submit_mode: + submit_mode = 'PULL' + + # see if there is a work directory specified + tmpdir = os.environ.get('TMPDIR') + if tmpdir: + global WORK_DIR + WORK_DIR = tmpdir + global CONFIG_DIR + CONFIG_DIR = tmpdir + '/jobconfig' + + return proxy_path, panda_site, panda_queue, resource_type, harvester_id, \ + worker_id, logs_frontend_w, logs_frontend_r, stdout_name, submit_mode + + +if __name__ == "__main__": + + # get all the configuration from environment + proxy_path, panda_site, panda_queue, resource_type, harvester_id, worker_id, logs_frontend_w, logs_frontend_r, \ + destination_name, submit_mode = get_configuration() + + # the pilot should propagate the download link via the pilotId field in the job table + log_download_url = '{0}/{1}'.format(logs_frontend_r, destination_name) + os.environ['GTAG'] = log_download_url # GTAG env variable is read by pilot + + # get the pilot wrapper + wrapper_path = "/tmp/runpilot2-wrapper.sh" + wrapper_url = "https://raw.githubusercontent.com/PanDAWMS/pilot-wrapper/master/runpilot2-wrapper.sh" + wrapper_string = get_url(wrapper_url) + with open(wrapper_path, "w") as wrapper_file: + wrapper_file.write(wrapper_string) + os.chmod(wrapper_path, 0o544) # make pilot wrapper executable + logging.debug('[main] downloaded pilot wrapper') + + # execute the pilot wrapper + logging.debug('[main] starting pilot wrapper...') + resource_type_option = '' + if resource_type: + resource_type_option = 
'--resource-type {0}'.format(resource_type) + wrapper_params = '-a {0} -s {1} -r {2} -q {3} {4}'.format(WORK_DIR, panda_site, panda_queue, panda_queue, + resource_type_option) + if 'ANALY' in panda_queue: + wrapper_params = '{0} -j user'.format(wrapper_params) + else: + wrapper_params = '{0} -j managed'.format(wrapper_params) + + if submit_mode == 'PUSH': + # job configuration files need to be copied, because k8s configmap mounts as read-only file system + # and therefore the pilot cannot execute in the same directory + copy_files_in_dir(CONFIG_DIR, WORK_DIR) + + command = "/tmp/runpilot2-wrapper.sh {0} -i PR -w generic --pilot-user=ATLAS --url=https://pandaserver.cern.ch -d --harvester-submit-mode={1} --allow-same-user=False | tee /tmp/wrapper-wid.log". \ + format(wrapper_params, submit_mode) + try: + subprocess.call(command, shell=True) + except: + logging.error(traceback.format_exc()) + logging.debug('[main] pilot wrapper done...') + + # upload logs to e.g. panda cache or similar + upload_logs(logs_frontend_w, '/tmp/wrapper-wid.log', destination_name, proxy_path) + logging.debug('[main] FINISHED') \ No newline at end of file diff --git a/pandaharvester/harvestermisc/k8s_utils.py b/pandaharvester/harvestermisc/k8s_utils.py index 06e70e3a..6d7d58bf 100644 --- a/pandaharvester/harvestermisc/k8s_utils.py +++ b/pandaharvester/harvestermisc/k8s_utils.py @@ -12,6 +12,9 @@ from pandaharvester.harvesterconfig import harvester_config from pandaharvester.harvestermisc.info_utils import PandaQueuesDict +from pandaharvester.harvestercore import core_utils + +base_logger = core_utils.setup_logger('k8s_utils') class k8s_Client(object): @@ -31,7 +34,11 @@ def read_yaml_file(self, yaml_file): return yaml_content - def create_job_from_yaml(self, yaml_content, work_spec, cert, cert_in_secret=True, cpuadjustratio=100, memoryadjustratio=100): + def create_job_from_yaml(self, yaml_content, work_spec, cert, cert_in_secret=True, cpuadjustratio=100, + memoryadjustratio=100): + + tmp_log = core_utils.make_logger(base_logger, method_name='create_job_from_yaml') + panda_queues_dict = PandaQueuesDict() queue_name = panda_queues_dict.get_panda_queue_name(work_spec.computingSite) @@ -42,7 +49,7 @@ def create_job_from_yaml(self, yaml_content, work_spec, cert, cert_in_secret=Tru 'labels': {'resourceType': str(work_spec.resourceType)}}) yaml_containers = yaml_content['spec']['template']['spec']['containers'] - del(yaml_containers[1:len(yaml_containers)]) + del (yaml_containers[1:len(yaml_containers)]) container_env = yaml_containers[0] @@ -62,7 +69,7 @@ def create_job_from_yaml(self, yaml_content, work_spec, cert, cert_in_secret=Tru if 'cpu' not in container_env['resources']['requests']: container_env['resources']['requests']['cpu'] = str(work_spec.nCore * cpuadjustratio / 100.0) - if work_spec.minRamCount > 4: # K8S minimum memory limit = 4 MB + if work_spec.minRamCount > 4: # K8S minimum memory limit = 4 MB # memory limits container_env['resources'].setdefault('limits', {}) if 'memory' not in container_env['resources']['limits']: @@ -70,7 +77,8 @@ def create_job_from_yaml(self, yaml_content, work_spec, cert, cert_in_secret=Tru # memory requests container_env['resources'].setdefault('requests', {}) if 'memory' not in container_env['resources']['requests']: - container_env['resources']['requests']['memory'] = str(work_spec.minRamCount * memoryadjustratio / 100.0) + 'M' + container_env['resources']['requests']['memory'] = str( + work_spec.minRamCount * memoryadjustratio / 100.0) + 'M' container_env.setdefault('env', []) 
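
A minimal sketch of the resources section assembled in this method, for a hypothetical worker with nCore=8 and minRamCount=16000 (MB) and site-level adjust ratios of 90/100; requests are scaled by the ratios while limits stay at the raw values:

    n_core = 8                 # work_spec.nCore
    min_ram_count = 16000      # work_spec.minRamCount, in MB
    cpu_adjust_ratio = 90      # queue-level tuning knobs
    memory_adjust_ratio = 100

    resources = {
        'limits': {
            'cpu': str(n_core),                              # '8'
            'memory': str(min_ram_count) + 'M',              # '16000M'
        },
        'requests': {
            'cpu': str(n_core * cpu_adjust_ratio / 100.0),   # '7.2'
            'memory': str(min_ram_count * memory_adjust_ratio / 100.0) + 'M',  # '16000.0M'
        },
    }
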
@@ -86,11 +94,13 @@ def create_job_from_yaml(self, yaml_content, work_spec, cert, cert_in_secret=Tru {'name': 'PANDA_JSID', 'value': 'harvester-' + harvester_config.master.harvester_id}, {'name': 'HARVESTER_WORKER_ID', 'value': str(work_spec.workerID)}, {'name': 'HARVESTER_ID', 'value': harvester_config.master.harvester_id} - ]) + ]) if 'affinity' not in yaml_content['spec']['template']['spec']: yaml_content = self.set_affinity(yaml_content) + # tmp_log.debug('creating job {0}'.format(yaml_content)) + rsp = self.batchv1.create_namespaced_job(body=yaml_content, namespace=self.namespace) return rsp @@ -102,10 +112,12 @@ def get_pods_info(self): for i in ret.items: pod_info = {} pod_info['name'] = i.metadata.name - pod_info['start_time'] = i.status.start_time.replace(tzinfo=None) if i.status.start_time else i.status.start_time + pod_info['start_time'] = i.status.start_time.replace( + tzinfo=None) if i.status.start_time else i.status.start_time pod_info['status'] = i.status.phase pod_info['status_conditions'] = i.status.conditions - pod_info['job_name'] = i.metadata.labels['job-name'] if i.metadata.labels and 'job-name' in i.metadata.labels else None + pod_info['job_name'] = i.metadata.labels[ + 'job-name'] if i.metadata.labels and 'job-name' in i.metadata.labels else None pod_info['containers_state'] = [] if i.status.container_statuses: for cs in i.status.container_statuses: @@ -117,7 +129,7 @@ def get_pods_info(self): def filter_pods_info(self, pods_list, job_name=None): if job_name: - pods_list = [ i for i in pods_list if i['job_name'] == job_name] + pods_list = [i for i in pods_list if i['job_name'] == job_name] return pods_list def get_jobs_info(self, job_name=None): @@ -127,32 +139,32 @@ def get_jobs_info(self, job_name=None): ret = self.batchv1.list_namespaced_job(namespace=self.namespace, field_selector=field_selector) for i in ret.items: - job_info = {} - job_info['name'] = i.metadata.name - job_info['status'] = i.status.conditions[0].type - job_info['status_reason'] = i.status.conditions[0].reason - job_info['status_message'] = i.status.conditions[0].message + job_info = {'name': i.metadata.name, 'status': i.status.conditions[0].type, + 'status_reason': i.status.conditions[0].reason, + 'status_message': i.status.conditions[0].message} jobs_list.append(job_info) return jobs_list def delete_pods(self, pod_name_list): - retList = list() + ret_list = list() for pod_name in pod_name_list: rsp = {} rsp['name'] = pod_name try: - self.corev1.delete_namespaced_pod(name=pod_name, namespace=self.namespace, body=self.deletev1, grace_period_seconds=0) + self.corev1.delete_namespaced_pod(name=pod_name, namespace=self.namespace, body=self.deletev1, + grace_period_seconds=0) except ApiException as _e: rsp['errMsg'] = '' if _e.status == 404 else _e.reason else: rsp['errMsg'] = '' - retList.append(rsp) + ret_list.append(rsp) - return retList + return ret_list def delete_job(self, job_name): - self.batchv1.delete_namespaced_job(name=job_name, namespace=self.namespace, body=self.deletev1, grace_period_seconds=0) + self.batchv1.delete_namespaced_job(name=job_name, namespace=self.namespace, body=self.deletev1, + grace_period_seconds=0) def set_proxy(self, proxy_path): with open(proxy_path) as f: @@ -170,16 +182,18 @@ def set_affinity(self, yaml_content): 'labelSelector': {'matchExpressions': [ {'key': 'resourceType', 'operator': 'In', 'values': ['SCORE']}]}, 'topologyKey': 'kubernetes.io/hostname'} - }]} + }]} - resourceType = yaml_content['spec']['template']['metadata']['labels']['resourceType'] + 
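    # What set_affinity builds, in words: for a SCORE worker the deep-copied
    # spec above becomes a soft (preferredDuringScheduling) podAffinity on
    # pods labeled resourceType=SCORE, plus a podAntiAffinity on the other
    # member of res_element (presumably MCORE), both keyed on
    # 'kubernetes.io/hostname', so single-core pods tend to pack onto the
    # same nodes and stay off nodes running multi-core pods.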
resource_type = yaml_content['spec']['template']['metadata']['labels']['resourceType'] - if resourceType == 'SCORE': + if resource_type == 'SCORE': yaml_affinity['podAffinity'] = copy.deepcopy(affinity_spec) - yaml_affinity['podAffinity']['preferredDuringSchedulingIgnoredDuringExecution'][0]['podAffinityTerm']['labelSelector']['matchExpressions'][0]['values'][0] = resourceType + yaml_affinity['podAffinity']['preferredDuringSchedulingIgnoredDuringExecution'][0]['podAffinityTerm'][ + 'labelSelector']['matchExpressions'][0]['values'][0] = resource_type yaml_affinity['podAntiAffinity'] = copy.deepcopy(affinity_spec) - yaml_affinity['podAntiAffinity']['preferredDuringSchedulingIgnoredDuringExecution'][0]['podAffinityTerm']['labelSelector']['matchExpressions'][0]['values'][0] = res_element.difference({resourceType}).pop() + yaml_affinity['podAntiAffinity']['preferredDuringSchedulingIgnoredDuringExecution'][0]['podAffinityTerm'][ + 'labelSelector']['matchExpressions'][0]['values'][0] = res_element.difference({resource_type}).pop() return yaml_content @@ -187,26 +201,30 @@ def create_or_patch_secret(self, file_list, secret_name): # api_version = 'v1' # kind = 'Secret' # type='kubernetes.io/tls' + + tmp_log = core_utils.make_logger(base_logger, method_name='create_or_patch_secret') + metadata = {'name': secret_name, 'namespace': self.namespace} data = {} for file in file_list: filename = os.path.basename(file) with open(file, 'rb') as f: - str = f.read() - data[filename] = base64.b64encode(str).decode() + aux = f.read() + data[filename] = base64.b64encode(aux).decode() body = client.V1Secret(data=data, metadata=metadata) try: rsp = self.corev1.patch_namespaced_secret(name=secret_name, body=body, namespace=self.namespace) except ApiException as e: - print('Exception when patch secret: {0} . Try to create secret instead...'.format(e)) + tmp_log.debug('Exception when patch secret: {0} . Try to create secret instead...'.format(e)) rsp = self.corev1.create_namespaced_secret(body=body, namespace=self.namespace) return rsp def get_pod_logs(self, pod_name, previous=False): + tmp_log = core_utils.make_logger(base_logger, method_name='get_pod_logs') try: rsp = self.corev1.read_namespaced_pod_log(name=pod_name, namespace=self.namespace, previous=previous) except ApiException as e: - print('Exception when getting logs from pod {0} : {1} . Skipped'.format(e)) + tmp_log.debug('Exception when getting logs for pod {0} : {1}. 
Skipped'.format(pod_name, e)) raise else: return rsp diff --git a/pandaharvester/harvestermonitor/k8s_monitor.py b/pandaharvester/harvestermonitor/k8s_monitor.py index edb6f3d9..f7b3d013 100644 --- a/pandaharvester/harvestermonitor/k8s_monitor.py +++ b/pandaharvester/harvestermonitor/k8s_monitor.py @@ -1,7 +1,4 @@ -import os -import time import datetime -import re from concurrent.futures import ThreadPoolExecutor @@ -13,7 +10,7 @@ # logger -baseLogger = core_utils.setup_logger('k8s_monitor') +base_logger = core_utils.setup_logger('k8s_monitor') # monitor for K8S @@ -42,127 +39,128 @@ def __init__(self, **kwarg): self._all_pods_list = [] def check_pods_status(self, pods_status_list, containers_state_list): - newStatus = '' - subMesg = '' + sub_msg = '' if 'Unknown' in pods_status_list: if all(item == 'Unknown' for item in pods_status_list): - newStatus = None + new_status = None elif 'Running' in pods_status_list: - newStatus = WorkSpec.ST_running + new_status = WorkSpec.ST_running else: - newStatus = WorkSpec.ST_idle + new_status = WorkSpec.ST_idle else: if all(item == 'Pending' for item in pods_status_list): - newStatus = WorkSpec.ST_submitted + new_status = WorkSpec.ST_submitted # elif all(item == 'Succeeded' for item in pods_status_list): - # newStatus = WorkSpec.ST_finished + # new_status = WorkSpec.ST_finished elif 'Succeeded' in pods_status_list: if all((item.terminated is not None and item.terminated.reason == 'Completed') for item in containers_state_list): - newStatus = WorkSpec.ST_finished + new_status = WorkSpec.ST_finished else: sub_mesg_list = [] for item in containers_state_list: - mesg_str = '' + msg_str = '' if item.terminated is None: state = 'UNKNOWN' if item.running is not None: state = 'running' elif item.waiting is not None: state = 'waiting' - mesg_str = 'container not terminated yet ({0}) while pod Succeeded'.format(state) + msg_str = 'container not terminated yet ({0}) while pod Succeeded'.format(state) elif item.terminated.reason != 'Completed': - mesg_str = 'container termiated by k8s for reason {0}'.format(item.terminated.reason) - sub_mesg_list.append(mesg_str) - subMesg = ';'.join(sub_mesg_list) - newStatus = WorkSpec.ST_cancelled + msg_str = 'container termiated by k8s for reason {0}'.format(item.terminated.reason) + sub_mesg_list.append(msg_str) + sub_msg = ';'.join(sub_mesg_list) + new_status = WorkSpec.ST_cancelled elif 'Running' in pods_status_list: - newStatus = WorkSpec.ST_running + new_status = WorkSpec.ST_running elif 'Failed' in pods_status_list: - newStatus = WorkSpec.ST_failed + new_status = WorkSpec.ST_failed else: - newStatus = WorkSpec.ST_idle + new_status = WorkSpec.ST_idle - return newStatus, subMesg + return new_status, sub_msg def check_a_job(self, workspec): # set logger - tmpLog = self.make_logger(baseLogger, 'workerID={0} batchID={1}'.format(workspec.workerID, workspec.batchID), - method_name='check_a_job') + tmp_log = self.make_logger(base_logger, 'workerID={0} batchID={1}'.format(workspec.workerID, workspec.batchID), + method_name='check_a_job') - ## initialization + # initialization job_id = workspec.batchID - newStatus = workspec.status - errStr = '' + err_str = '' + time_now = datetime.datetime.utcnow() + pods_status_list = [] + pods_name_to_delete_list = [] try: pods_list = self.k8s_client.filter_pods_info(self._all_pods_list, job_name=job_id) - timeNow = datetime.datetime.utcnow() - pods_status_list = [] containers_state_list = [] - pods_name_to_delete_list = [] pods_sup_diag_list = [] for pods_info in pods_list: + # make a list of 
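    # For reference, the phase -> worker-state mapping implemented by
    # check_pods_status above (sub-messages omitted):
    #
    #   all pods Unknown                              -> None (status unchanged)
    #   some Unknown, any Running                     -> running
    #   some Unknown, none Running                    -> idle
    #   no Unknown and:
    #     all Pending                                 -> submitted
    #     any Succeeded, all containers Completed     -> finished
    #     any Succeeded, some container not Completed -> cancelled
    #     any Running                                 -> running
    #     any Failed                                  -> failed
    #     anything else                               -> idle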
pods that have been queued too long if pods_info['status'] in ['Pending', 'Unknown'] and pods_info['start_time'] \ - and timeNow - pods_info['start_time'] > datetime.timedelta(seconds=self.podQueueTimeLimit): + and time_now - pods_info['start_time'] > datetime.timedelta(seconds=self.podQueueTimeLimit): # fetch queuing too long pods pods_name_to_delete_list.append(pods_info['name']) + # make list of status of the pods belonging to our job pods_status_list.append(pods_info['status']) containers_state_list.extend(pods_info['containers_state']) pods_sup_diag_list.append(pods_info['name']) except Exception as _e: - errStr = 'Failed to get POD status of JOB id={0} ; {1}'.format(job_id, _e) - tmpLog.error(errStr) - newStatus = None + err_str = 'Failed to get POD status of JOB id={0} ; {1}'.format(job_id, _e) + tmp_log.error(err_str) + new_status = None else: if not pods_status_list: - errStr = 'JOB id={0} not found'.format(job_id) - tmpLog.error(errStr) - tmpLog.info('Force to cancel the worker due to JOB not found') - newStatus = WorkSpec.ST_cancelled + # there were no pods found belonging to our job + err_str = 'JOB id={0} not found'.format(job_id) + tmp_log.error(err_str) + tmp_log.info('Force to cancel the worker due to JOB not found') + new_status = WorkSpec.ST_cancelled else: - tmpLog.debug('pods_status_list={0}'.format(pods_status_list)) - newStatus, subMesg = self.check_pods_status(pods_status_list, containers_state_list) - if subMesg: - errStr += subMesg - tmpLog.debug('new_status={0}'.format(newStatus)) - # delete queuing too long pods + # we found pods belonging to our job. Obtain the final status + tmp_log.debug('pods_status_list={0}'.format(pods_status_list)) + new_status, sub_msg = self.check_pods_status(pods_status_list, containers_state_list) + if sub_msg: + err_str += sub_msg + tmp_log.debug('new_status={0}'.format(new_status)) + + # delete pods that have been queueing too long if pods_name_to_delete_list: - tmpLog.debug('Deleting pods queuing too long') - retList = self.k8s_client.delete_pods(pods_name_to_delete_list) + tmp_log.debug('Deleting pods queuing too long') + ret_list = self.k8s_client.delete_pods(pods_name_to_delete_list) deleted_pods_list = [] - for item in retList: + for item in ret_list: if item['errMsg'] == '': deleted_pods_list.append(item['name']) - tmpLog.debug('Deleted pods queuing too long: {0}'.format( + tmp_log.debug('Deleted pods queuing too long: {0}'.format( ','.join(deleted_pods_list))) # supplemental diag messages - sup_error_code = WorkerErrors.error_codes.get('GENERAL_ERROR') if errStr else WorkerErrors.error_codes.get('SUCCEEDED') - sup_error_diag = 'PODs=' + ','.join(pods_sup_diag_list) + ' ; ' + errStr + sup_error_code = WorkerErrors.error_codes.get('GENERAL_ERROR') if err_str else WorkerErrors.error_codes.get('SUCCEEDED') + sup_error_diag = 'PODs=' + ','.join(pods_sup_diag_list) + ' ; ' + err_str workspec.set_supplemental_error(error_code=sup_error_code, error_diag=sup_error_diag) - return (newStatus, errStr) + return new_status, err_str - - # check workers def check_workers(self, workspec_list): - tmpLog = self.make_logger(baseLogger, 'k8s query', method_name='check_workers') - tmpLog.debug('start') + tmp_log = self.make_logger(base_logger, 'k8s query', method_name='check_workers') + tmp_log.debug('start') - retList = list() + ret_list = list() if not workspec_list: - errStr = 'empty workspec_list' - tmpLog.debug(errStr) - retList.append(('', errStr)) - return False, retList + err_str = 'empty workspec_list' + tmp_log.debug(err_str) + 
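            # check_workers returns a (bool, ret_list) pair, where ret_list
            # holds one (new_status, err_str) tuple per worker; the
            # ThreadPoolExecutor.map call below preserves the order of
            # workspec_list while the per-worker checks run concurrently.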
ret_list.append(('', err_str)) + return False, ret_list self._all_pods_list = self.k8s_client.get_pods_info() + # resolve status requested workers with ThreadPoolExecutor(self.nProcesses) as thread_pool: - retIterator = thread_pool.map(self.check_a_job, workspec_list) - - retList = list(retIterator) + ret_iterator = thread_pool.map(self.check_a_job, workspec_list) - tmpLog.debug('done') + ret_list = list(ret_iterator) - return True, retList + tmp_log.debug('done') + return True, ret_list diff --git a/setup.py b/setup.py index 957f1a32..95aebfc7 100644 --- a/setup.py +++ b/setup.py @@ -37,6 +37,13 @@ 'psutil >= 5.4.8', 'scandir; python_version < "3.5"' ], + + # optional pip dependencies + extras_require={ + 'kubernetes': ['kubernetes', 'pyyaml'], + 'mysql': ['mysqlclient'] + }, + data_files=[ # config and cron files ('etc/panda', ['templates/panda_harvester.cfg.rpmnew.template', @@ -61,6 +68,7 @@ ] ), ], + scripts=['templates/panda_jedi-renice', 'templates/panda_harvester-sqlite3backup', ] From 3519ac891d6c6c9d00155eb8d34d7a30df6da179 Mon Sep 17 00:00:00 2001 From: fbarreir Date: Tue, 14 Apr 2020 21:14:31 +0200 Subject: [PATCH 117/181] k8s: PUSH mode --- pandaharvester/harvestermisc/k8s_utils.py | 154 ++++++++++++++++++---- 1 file changed, 125 insertions(+), 29 deletions(-) diff --git a/pandaharvester/harvestermisc/k8s_utils.py b/pandaharvester/harvestermisc/k8s_utils.py index 6d7d58bf..1dcb62d2 100644 --- a/pandaharvester/harvestermisc/k8s_utils.py +++ b/pandaharvester/harvestermisc/k8s_utils.py @@ -16,6 +16,9 @@ base_logger = core_utils.setup_logger('k8s_utils') +CONFIG_DIR = '/scratch/jobconfig' +DEF_COMMAND = ["/usr/bin/bash"] +DEF_ARGS = ["-c", "cd; wget https://raw.githubusercontent.com/HSF/harvester/k8s_analysis/pandaharvester/harvestercloud/pilots_starter.py; chmod 755 pilots_starter.py; ./pilots_starter.py || true"] class k8s_Client(object): @@ -34,29 +37,60 @@ def read_yaml_file(self, yaml_file): return yaml_content - def create_job_from_yaml(self, yaml_content, work_spec, cert, cert_in_secret=True, cpuadjustratio=100, - memoryadjustratio=100): + def create_job_from_yaml(self, yaml_content, work_spec, container_image, cert, cert_in_secret=True, + cpu_adjust_ratio=100, memory_adjust_ratio=100, executable=[], args=[]): tmp_log = core_utils.make_logger(base_logger, method_name='create_job_from_yaml') + # consider PULL mode as default, unless specified + submit_mode = 'PULL' + + # create the configmap in push mode + worker_id = None + if work_spec.mapType != 'NoJob': + submit_mode = 'PUSH' + worker_id = str(work_spec.workerID) + res = self.create_configmap(work_spec) + if not res: # if the configmap creation failed, don't submit a job because the pod creation will hang + return res, 'Failed to create a configmap' + + # retrieve panda queue information panda_queues_dict = PandaQueuesDict() queue_name = panda_queues_dict.get_panda_queue_name(work_spec.computingSite) + # set the worker name yaml_content['metadata']['name'] = yaml_content['metadata']['name'] + "-" + str(work_spec.workerID) + # set the resource type and other metadata to filter the pods yaml_content['spec']['template'].setdefault('metadata', {}) - yaml_content['spec']['template']['metadata'].update({ - 'labels': {'resourceType': str(work_spec.resourceType)}}) + yaml_content['spec']['template']['metadata'].update({'labels': + {'resourceType': str(work_spec.resourceType)} + }) + # fill the container details. 
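        # In PUSH mode the job description is shipped via a per-worker
        # configmap and mounted read-only into the pod; the wiring added
        # further down in this commit yields roughly this pod-spec fragment
        # (worker ID 1234 is hypothetical):
        #
        #   volumes:
        #   - name: job-config
        #     configMap:
        #       name: "1234"
        #   containers:
        #   - volumeMounts:
        #     - name: job-config
        #       mountPath: /scratch/jobconfig   # CONFIG_DIR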
we can only handle one container (take the first, delete the rest) yaml_containers = yaml_content['spec']['template']['spec']['containers'] del (yaml_containers[1:len(yaml_containers)]) container_env = yaml_containers[0] container_env.setdefault('resources', {}) + # set the container image + if 'image' not in container_env: + container_env['image'] = container_image + + # if there is no user defined executable, run the default executable + if not executable: + executable = DEF_COMMAND + args = DEF_ARGS + + if 'command' not in container_env: + container_env['command'] = executable + container_env['args'] = args # set the resources (CPU and memory) we need for the container # note that predefined values in the yaml template will NOT be overwritten + # Be familiar with QoS classes: https://kubernetes.io/docs/tasks/configure-pod-container/quality-service-pod + # The CPU & memory settings will affect the QoS for the pod container_env.setdefault('resources', {}) if work_spec.nCore > 0: @@ -67,20 +101,26 @@ def create_job_from_yaml(self, yaml_content, work_spec, cert, cert_in_secret=Tru # CPU requests container_env['resources'].setdefault('requests', {}) if 'cpu' not in container_env['resources']['requests']: - container_env['resources']['requests']['cpu'] = str(work_spec.nCore * cpuadjustratio / 100.0) + container_env['resources']['requests']['cpu'] = str(work_spec.nCore * cpu_adjust_ratio / 100.0) if work_spec.minRamCount > 4: # K8S minimum memory limit = 4 MB # memory limits - container_env['resources'].setdefault('limits', {}) - if 'memory' not in container_env['resources']['limits']: - container_env['resources']['limits']['memory'] = str(work_spec.minRamCount) + 'M' + # container_env['resources'].setdefault('limits', {}) + # if 'memory' not in container_env['resources']['limits']: + # container_env['resources']['limits']['memory'] = str(work_spec.minRamCount) + 'M' # memory requests container_env['resources'].setdefault('requests', {}) if 'memory' not in container_env['resources']['requests']: container_env['resources']['requests']['memory'] = str( - work_spec.minRamCount * memoryadjustratio / 100.0) + 'M' + work_spec.minRamCount * memory_adjust_ratio / 100.0) + 'M' container_env.setdefault('env', []) + # try to retrieve the stdout log file name + try: + log_file_name = work_spec.workAttributes['stdout'] + except (KeyError, AttributeError): + tmp_log.debug('work_spec does not have workAttributes field: {0}'.format(work_spec)) + log_file_name = '' container_env['env'].extend([ {'name': 'computingSite', 'value': work_spec.computingSite}, @@ -91,18 +131,30 @@ def create_job_from_yaml(self, yaml_content, work_spec, cert, cert_in_secret=Tru {'name': 'workerID', 'value': str(work_spec.workerID)}, {'name': 'logs_frontend_w', 'value': harvester_config.pandacon.pandaCacheURL_W}, {'name': 'logs_frontend_r', 'value': harvester_config.pandacon.pandaCacheURL_R}, + {'name': 'stdout_name', 'value': log_file_name}, {'name': 'PANDA_JSID', 'value': 'harvester-' + harvester_config.master.harvester_id}, {'name': 'HARVESTER_WORKER_ID', 'value': str(work_spec.workerID)}, - {'name': 'HARVESTER_ID', 'value': harvester_config.master.harvester_id} + {'name': 'HARVESTER_ID', 'value': harvester_config.master.harvester_id}, + {'name': 'submit_mode', 'value': submit_mode} ]) + # in push mode, add the configmap as a volume to the pod + if submit_mode == 'PUSH' and worker_id: + yaml_content['spec']['template']['spec'].setdefault('volumes', []) + yaml_volumes = yaml_content['spec']['template']['spec']['volumes'] + 
yaml_volumes.append({'name': 'job-config', 'configMap': {'name': worker_id}}) + # mount the volume to the filesystem + container_env.setdefault('volumeMounts', []) + container_env['volumeMounts'].append({'name': 'job-config', 'mountPath': CONFIG_DIR}) + + # set the affinity if 'affinity' not in yaml_content['spec']['template']['spec']: yaml_content = self.set_affinity(yaml_content) # tmp_log.debug('creating job {0}'.format(yaml_content)) rsp = self.batchv1.create_namespaced_job(body=yaml_content, namespace=self.namespace) - return rsp + return rsp, yaml_content def get_pods_info(self): pods_list = list() @@ -110,15 +162,16 @@ def get_pods_info(self): ret = self.corev1.list_namespaced_pod(namespace=self.namespace) for i in ret.items: - pod_info = {} - pod_info['name'] = i.metadata.name - pod_info['start_time'] = i.status.start_time.replace( - tzinfo=None) if i.status.start_time else i.status.start_time - pod_info['status'] = i.status.phase - pod_info['status_conditions'] = i.status.conditions - pod_info['job_name'] = i.metadata.labels[ - 'job-name'] if i.metadata.labels and 'job-name' in i.metadata.labels else None - pod_info['containers_state'] = [] + pod_info = { + 'name': i.metadata.name, + 'start_time': i.status.start_time.replace(tzinfo=None) if i.status.start_time else i.status.start_time, + 'status': i.status.phase, + 'status_conditions': i.status.conditions, + 'job_name': i.metadata.labels['job-name'] if i.metadata.labels and 'job-name' in i.metadata.labels else None, + 'containers_state': [] + } + + # derive worker status with container state, e.g.OOMKilled if i.status.container_statuses: for cs in i.status.container_statuses: if cs.state: @@ -139,9 +192,12 @@ def get_jobs_info(self, job_name=None): ret = self.batchv1.list_namespaced_job(namespace=self.namespace, field_selector=field_selector) for i in ret.items: - job_info = {'name': i.metadata.name, 'status': i.status.conditions[0].type, - 'status_reason': i.status.conditions[0].reason, - 'status_message': i.status.conditions[0].message} + job_info = { + 'name': i.metadata.name, + 'status': i.status.conditions[0].type, + 'status_reason': i.status.conditions[0].reason, + 'status_message': i.status.conditions[0].message + } jobs_list.append(job_info) return jobs_list @@ -166,6 +222,10 @@ def delete_job(self, job_name): self.batchv1.delete_namespaced_job(name=job_name, namespace=self.namespace, body=self.deletev1, grace_period_seconds=0) + def delete_config_map(self, config_map_name): + self.corev1.delete_namespaced_config_map(name=config_map_name, namespace=self.namespace, body=self.deletev1, + grace_period_seconds=0) + def set_proxy(self, proxy_path): with open(proxy_path) as f: content = f.read() @@ -206,19 +266,55 @@ def create_or_patch_secret(self, file_list, secret_name): metadata = {'name': secret_name, 'namespace': self.namespace} data = {} - for file in file_list: - filename = os.path.basename(file) - with open(file, 'rb') as f: - aux = f.read() - data[filename] = base64.b64encode(aux).decode() + for file_name in file_list: + filename = os.path.basename(file_name) + with open(file_name, 'rb') as f: + content = f.read() + data[filename] = base64.b64encode(content).decode() body = client.V1Secret(data=data, metadata=metadata) try: rsp = self.corev1.patch_namespaced_secret(name=secret_name, body=body, namespace=self.namespace) except ApiException as e: - tmp_log.debug('Exception when patch secret: {0} . Try to create secret instead...'.format(e)) + tmp_log.debug('Exception when patching secret: {0} . 
Try to create secret instead...'.format(e)) rsp = self.corev1.create_namespaced_secret(body=body, namespace=self.namespace) return rsp + def create_configmap(self, work_spec): + # useful guide: https://matthewpalmer.net/kubernetes-app-developer/articles/ultimate-configmap-guide-kubernetes.html + + tmp_log = core_utils.make_logger(base_logger, method_name='create_configmap') + + try: + worker_id = str(work_spec.workerID) + + # Get the access point. The messenger should have dropped the input files for the pilot here + access_point = work_spec.get_access_point() + pjd = 'pandaJobData.out' + job_data_file = os.path.join(access_point, pjd) + with open(job_data_file) as f: + job_data_contents = f.read() + + pfc = 'PoolFileCatalog_H.xml' + pool_file_catalog_file = os.path.join(access_point, pfc) + with open(pool_file_catalog_file) as f: + pool_file_catalog_contents = f.read() + + # put the job data and PFC into a dictionary + data = {pjd: job_data_contents, pfc: pool_file_catalog_contents} + + # instantiate the configmap object + metadata = {'name': worker_id, 'namespace': self.namespace} + config_map = client.V1ConfigMap(api_version="v1", kind="ConfigMap", data=data, metadata=metadata) + + # create the configmap object in K8s + api_response = self.corev1.create_namespaced_config_map(namespace=self.namespace, body=config_map) + tmp_log.debug('Created configmap for worker id: {0}'.format(worker_id)) + return True + + except (ApiException, TypeError) as e: + tmp_log.error('Could not create configmap with: {0}'.format(e)) + return False + def get_pod_logs(self, pod_name, previous=False): tmp_log = core_utils.make_logger(base_logger, method_name='get_pod_logs') try: From 72d9d3ff926efbba6abb2073c2f3a91a7607a55c Mon Sep 17 00:00:00 2001 From: fbarreir Date: Tue, 14 Apr 2020 22:14:52 +0200 Subject: [PATCH 118/181] k8s: PUSH mode --- .../harvestersubmitter/k8s_submitter.py | 168 ++++++++++++++---- 1 file changed, 136 insertions(+), 32 deletions(-) diff --git a/pandaharvester/harvestersubmitter/k8s_submitter.py b/pandaharvester/harvestersubmitter/k8s_submitter.py index 663f3fd8..ca7e4309 100644 --- a/pandaharvester/harvestersubmitter/k8s_submitter.py +++ b/pandaharvester/harvestersubmitter/k8s_submitter.py @@ -1,14 +1,24 @@ import os +import argparse +import traceback +try: + from urllib import unquote # Python 2.X +except ImportError: + from urllib.parse import unquote # Python 3+ from concurrent.futures import ThreadPoolExecutor from pandaharvester.harvestercore import core_utils from pandaharvester.harvestercore.plugin_base import PluginBase from pandaharvester.harvestermisc.k8s_utils import k8s_Client - +from pandaharvester.harvesterconfig import harvester_config # logger -baseLogger = core_utils.setup_logger('k8s_submitter') +base_logger = core_utils.setup_logger('k8s_submitter') + +DEF_SLC6_IMAGE = 'atlasadc/atlas-grid-slc6' +DEF_CENTOS7_IMAGE = 'atlasadc/atlas-grid-centos7' +DEF_IMAGE = DEF_CENTOS7_IMAGE # submitter for K8S @@ -20,6 +30,11 @@ def __init__(self, **kwarg): self.k8s_client = k8s_Client(namespace=self.k8s_namespace, config_file=self.k8s_config_file) + # required for parsing jobParams + self.parser = argparse.ArgumentParser() + self.parser.add_argument('-p', dest='executable', type=unquote) + self.parser.add_argument('--containerImage', dest='container_image') + # number of processes try: self.nProcesses @@ -37,63 +52,152 @@ def __init__(self, **kwarg): # CPU adjust ratio try: - self.cpuAdjustRatio + self.cpu_adjust_ratio except AttributeError: - self.cpuAdjustRatio = 100 + 
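+        # cpu_adjust_ratio scales the CPU request sent to K8s as a percentage of
+        # the job core count, e.g. nCore=8 with cpu_adjust_ratio=75 requests '6.0'
+        # CPUs (a sketch; the arithmetic lives in create_job_from_yaml in k8s_utils)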
self.cpu_adjust_ratio = 100 # Memory adjust ratio try: - self.memoryAdjustRatio + self.memory_adjust_ratio except AttributeError: - self.memoryAdjustRatio = 100 + self.memory_adjust_ratio = 100 - def submit_a_job(self, work_spec): - tmp_log = self.make_logger(baseLogger, method_name='submit_a_job') - tmpRetVal = (None, 'Nothing done') + def parse_params(self, job_params): + tmp_log = self.make_logger(base_logger, method_name='parse_params') - yaml_content = self.k8s_client.read_yaml_file(self.k8s_yaml_file) + job_params_list = job_params.split(' ') + args, unknown = self.parser.parse_known_args(job_params_list) + + tmp_log.info('Parsed params: {0}'.format(args)) + return args + + def read_job_configuration(self, work_spec): try: + job_spec_list = work_spec.get_jobspec_list() + if job_spec_list: + job_spec = job_spec_list[0] + job_fields = job_spec.jobParams + job_pars_parsed = self.parse_params(job_fields['jobPars']) + return job_fields, job_pars_parsed + except (KeyError, AttributeError): + return None, None + + def decide_container_image(self, job_fields, job_pars_parsed): + """ + Decide container image: + - job defined image: if we are running in push mode and the job specified an image, use it + - production images: take SLC6 or CentOS7 + - otherwise take default image specified for the queue + """ + tmp_log = self.make_logger(base_logger, method_name='decide_container_image') + try: + container_image = job_pars_parsed.container_image + if container_image: + tmp_log.debug('Taking container image from job params: {0}'.format(container_image)) + return container_image + except AttributeError: + pass + + try: + cmt_config = job_fields['cmtconfig'] + requested_os = cmt_config.split('@')[1] + if 'slc6' in requested_os.lower(): + container_image = DEF_SLC6_IMAGE + else: + container_image = DEF_CENTOS7_IMAGE + tmp_log.debug('Taking container image from cmtconfig: {0}'.format(container_image)) + return container_image + except (KeyError, TypeError): + pass + + container_image = DEF_IMAGE + tmp_log.debug('Taking default container image: {0}'.format(container_image)) + return container_image + + def build_executable(self, job_fields, job_pars_parsed): + executable = [] + args = [] + try: + if 'runcontainer' in job_fields['transformation']: + # remove any quotes + exec_list = job_pars_parsed.executable.strip('"\'').split(' ') + # take first word as executable + executable = [exec_list[0]] + # take rest as arguments + if len(exec_list) > 1: + args = [' '.join(exec_list[1:])] + except (AttributeError, TypeError): + executable = [] + args = [] + + return executable, args + + def submit_k8s_worker(self, work_spec): + tmp_log = self.make_logger(base_logger, method_name='submit_k8s_worker') + + # set the stdout log file + log_file_name = '{0}_{1}.out'.format(harvester_config.master.harvester_id, work_spec.workerID) + work_spec.set_log_file('stdout', '{0}/{1}'.format(self.logBaseURL, log_file_name)) + # TODO: consider if we want to upload the yaml file to PanDA cache + + yaml_content = self.k8s_client.read_yaml_file(self.k8s_yaml_file) + try: + + # read the job configuration (if available, only push model) + job_fields, job_pars_parsed = self.read_job_configuration(work_spec) + + # decide container image and executable to run. 
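+            # (for example, a hypothetical cmtconfig 'x86_64-slc6-gcc49-opt@slc6'
+            # would select DEF_SLC6_IMAGE, while a --containerImage value parsed
+            # from jobPars wins in push mode.)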
In pull mode, defaults are provided + container_image = self.decide_container_image(job_fields, job_pars_parsed) + executable, args = self.build_executable(job_fields, job_pars_parsed) + tmp_log.debug('container_image: "{0}"; executable: "{1}"; args: "{2}"'.format(container_image, executable, + args)) + if hasattr(self, 'proxySecretPath'): - rsp = self.k8s_client.create_job_from_yaml(yaml_content, work_spec, self.proxySecretPath, True, self.cpuAdjustRatio, self.memoryAdjustRatio) + cert = self.proxySecretPath + use_secret = True elif hasattr(self, 'x509UserProxy'): - rsp = self.k8s_client.create_job_from_yaml(yaml_content, work_spec, self.x509UserProxy, False, self.cpuAdjustRatio, self.memoryAdjustRatio) + cert = self.x509UserProxy + use_secret = False else: - errStr = 'No proxy specified in proxySecretPath or x509UserProxy; not submitted' - tmpRetVal = (False, errStr) + err_str = 'No proxy specified in proxySecretPath or x509UserProxy; not submitted' + tmp_return_value = (False, err_str) + return tmp_return_value + + rsp, yaml_content_final = self.k8s_client.create_job_from_yaml(yaml_content, work_spec, container_image, + cert, cert_in_secret=use_secret, + cpu_adjust_ratio=self.cpu_adjust_ratio, + memory_adjust_ratio=self.memory_adjust_ratio, + executable=executable, args=args) except Exception as _e: - errStr = 'Failed to create a JOB; {0}'.format(_e) - tmpRetVal = (False, errStr) + tmp_log.error(traceback.format_exc()) + err_str = 'Failed to create a JOB; {0}'.format(_e) + tmp_return_value = (False, err_str) else: work_spec.batchID = yaml_content['metadata']['name'] - - # set the log files - work_spec.set_log_file('stdout', '{0}/{1}.out'.format(self.logBaseURL, work_spec.workerID)) - tmp_log.debug('Created worker {0} with batchID={1}'.format(work_spec.workerID, work_spec.batchID)) - tmpRetVal = (True, '') - - return tmpRetVal + tmp_return_value = (True, '') + return tmp_return_value # submit workers def submit_workers(self, workspec_list): - tmp_log = self.make_logger(baseLogger, method_name='submit_workers') + tmp_log = self.make_logger(base_logger, method_name='submit_workers') - nWorkers = len(workspec_list) - tmp_log.debug('start, nWorkers={0}'.format(nWorkers)) + n_workers = len(workspec_list) + tmp_log.debug('start, n_workers={0}'.format(n_workers)) - retList = list() + ret_list = list() if not workspec_list: tmp_log.debug('empty workspec_list') - return retList + return ret_list with ThreadPoolExecutor(self.nProcesses) as thread_pool: - retValList = thread_pool.map(self.submit_a_job, workspec_list) - tmp_log.debug('{0} workers submitted'.format(nWorkers)) + ret_val_list = thread_pool.map(self.submit_k8s_worker, workspec_list) + tmp_log.debug('{0} workers submitted'.format(n_workers)) - retList = list(retValList) + ret_list = list(ret_val_list) tmp_log.debug('done') - return retList + return ret_list From d6413a91a93129fea0dd3ded4d00bf5dd535fc91 Mon Sep 17 00:00:00 2001 From: fbarreir Date: Tue, 14 Apr 2020 22:24:35 +0200 Subject: [PATCH 119/181] k8s: PUSH model --- .../harvestersweeper/k8s_sweeper.py | 154 ++++++++++-------- 1 file changed, 86 insertions(+), 68 deletions(-) diff --git a/pandaharvester/harvestersweeper/k8s_sweeper.py b/pandaharvester/harvestersweeper/k8s_sweeper.py index a3b2556b..fa0639ab 100644 --- a/pandaharvester/harvestersweeper/k8s_sweeper.py +++ b/pandaharvester/harvestersweeper/k8s_sweeper.py @@ -1,12 +1,9 @@ -import os - from pandaharvester.harvestercore import core_utils from pandaharvester.harvestersweeper.base_sweeper import BaseSweeper from 
pandaharvester.harvestermisc.k8s_utils import k8s_Client - # logger -baseLogger = core_utils.setup_logger('k8s_sweeper') +base_logger = core_utils.setup_logger('k8s_sweeper') # sweeper for K8S @@ -15,92 +12,113 @@ class K8sSweeper(BaseSweeper): def __init__(self, **kwarg): BaseSweeper.__init__(self, **kwarg) - self.k8s_client = k8s_Client(namespace=self.k8s_namespace, config_file=self.k8s_config_file) + self.k8s_client = k8s_Client(self.k8s_namespace, config_file=self.k8s_config_file) self._all_pods_list = [] # # kill a worker - # def kill_worker(self, workspec): - # tmpLog = self.make_logger(baseLogger, 'workerID={0}'.format(workspec.workerID), + # def kill_worker(self, work_spec): + # tmp_log = self.make_logger(base_logger, 'workerID={0}'.format(work_spec.workerID), # method_name='kill_worker') # - # tmpRetVal = (None, 'Nothing done') + # tmp_ret_val = (None, 'Nothing done') # - # job_id = workspec.batchID + # batch_id = work_spec.batchID # try: - # self.k8s_client.delete_job(job_id) + # self.k8s_client.delete_job(batch_id) # except Exception as _e: - # errStr = 'Failed to delete a JOB with id={0} ; {1}'.format(job_id, _e) - # tmpLog.error(errStr) - # tmpRetVal = (False, errStr) + # err_str = 'Failed to delete a JOB with id={0} ; {1}'.format(batch_id, _e) + # tmp_log.error(err_str) + # tmp_ret_val = (False, err_str) # # self._all_pods_list = self.k8s_client.get_pods_info() - # pods_list = self.k8s_client.filter_pods_info(self._all_pods_list, job_name=job_id) + # pods_list = self.k8s_client.filter_pods_info(self._all_pods_list, job_name=batch_id) # pods_name = [ pods_info['name'] for pods_info in pods_list ] - # job_info = self.k8s_client.get_jobs_info(job_id) + # job_info = self.k8s_client.get_jobs_info(batch_id) # # if not job_info: - # retList = self.k8s_client.delete_pods(pods_name) - # if all(item['errMsg'] == '' for item in retList): - # tmpLog.info('Deleted a JOB & POD with id={0}'.format(job_id)) - # tmpRetVal = (True, '') + # ret_list = self.k8s_client.delete_pods(pods_name) + # if all(item['errMsg'] == '' for item in ret_list): + # tmp_log.info('Deleted a JOB & POD with id={0}'.format(batch_id)) + # tmp_ret_val = (True, '') # else: - # errStrList = list() - # for item in retList: + # err_str_list = list() + # for item in ret_list: # if item['errMsg']: - # errStr = 'Failed to delete a POD with id={0} ; {1}'.format(item['name'], item['errMsg']) - # tmpLog.error(errStr) - # errStrList.append(errStr) - # tmpRetVal = (False, ','.join(errStrList)) + # err_str = 'Failed to delete a POD with id={0} ; {1}'.format(item['name'], item['errMsg']) + # tmp_log.error(err_str) + # err_str_list.append(err_str) + # tmp_ret_val = (False, ','.join(err_str_list)) # - # return tmpRetVal + # return tmp_ret_val # kill workers - def kill_workers(self, workspec_list): - tmpLog = self.make_logger(baseLogger, method_name='kill_workers') + def kill_workers(self, work_spec_list): + tmp_log = self.make_logger(base_logger, method_name='kill_workers') self._all_pods_list = self.k8s_client.get_pods_info() - retList = [] - for workspec in workspec_list: - tmpRetVal = (None, 'Nothing done') - - job_id = workspec.batchID - try: - self.k8s_client.delete_job(job_id) - except Exception as _e: - errStr = 'Failed to delete a JOB with id={0} ; {1}'.format(job_id, _e) - tmpLog.error(errStr) - tmpRetVal = (False, errStr) - - pods_list = self.k8s_client.filter_pods_info(self._all_pods_list, job_name=job_id) - pods_name = [ pods_info['name'] for pods_info in pods_list ] - job_info = self.k8s_client.get_jobs_info(job_id) - - 
if not job_info: - retList = self.k8s_client.delete_pods(pods_name) - if all(item['errMsg'] == '' for item in retList): - tmpLog.info('Deleted a JOB & POD with id={0}'.format(job_id)) - tmpRetVal = (True, '') + ret_list = [] + for work_spec in work_spec_list: + tmp_ret_val = (None, 'Nothing done') + + batch_id = work_spec.batchID + worker_id = str(work_spec.workerID) + if batch_id: # sometimes there are missed workers that were not submitted + + # if push mode, delete the configmap + if work_spec.mapType != 'NoJob': + try: + self.k8s_client.delete_config_map(worker_id) + tmp_log.debug('Deleted configmap {0}'.format(worker_id)) + except Exception as _e: + err_str = 'Failed to delete a CONFIGMAP with id={0} ; {1}'.format(worker_id, _e) + tmp_log.error(err_str) + tmp_ret_val = (False, err_str) else: - errStrList = list() - for item in retList: - if item['errMsg']: - errStr = 'Failed to delete a POD with id={0} ; {1}'.format(item['name'], item['errMsg']) - tmpLog.error(errStr) - errStrList.append(errStr) - tmpRetVal = (False, ','.join(errStrList)) - - retList.append(tmpRetVal) - - return retList - - - # cleanup for a worker - def sweep_worker(self, workspec): - ## Make logger - tmpLog = self.make_logger(baseLogger, 'workerID={0}'.format(workspec.workerID), - method_name='sweep_worker') + tmp_log.debug('No job/configmap associated to worker {0}'.format(work_spec.workerID)) + + # delete the job + try: + self.k8s_client.delete_job(batch_id) + tmp_log.debug('Deleted JOB {0}'.format(batch_id)) + except Exception as _e: + err_str = 'Failed to delete a JOB with id={0} ; {1}'.format(batch_id, _e) + tmp_log.error(err_str) + tmp_ret_val = (False, err_str) + + # retrieve the associated pods + pods_list = self.k8s_client.filter_pods_info(self._all_pods_list, job_name=batch_id) + pods_name = [pods_info['name'] for pods_info in pods_list] + job_info = self.k8s_client.get_jobs_info(batch_id) + # retrieve the associated pods + if not job_info: + ret_list = self.k8s_client.delete_pods(pods_name) + if all(item['errMsg'] == '' for item in ret_list): + tmp_log.info('Deleted a JOB & POD with id={0}'.format(batch_id)) + tmp_ret_val = (True, '') + else: + err_str_list = list() + for item in ret_list: + if item['errMsg']: + err_str = 'Failed to delete a POD with id={0} ; {1}'.format(item['name'], item['errMsg']) + tmp_log.error(err_str) + err_str_list.append(err_str) + tmp_ret_val = (False, ','.join(err_str_list)) + else: # the worker cannot be cleaned + tmp_ret_val = (True, '') + + ret_list.append(tmp_ret_val) + + return ret_list + + def sweep_worker(self, work_spec): + # cleanup for a worker + tmp_log = self.make_logger(base_logger, 'workerID={0}'.format(work_spec.workerID), method_name='sweep_worker') + + # retrieve and upload the logs to panda cache + # batch_id = work_spec.batchID + # log_content = self.k8s_client.retrieve_pod_log(batch_id) # nothing to do - return True, '' + return True, '' \ No newline at end of file From caf447c7f77b864c2d9b58be8463b80ae1517009 Mon Sep 17 00:00:00 2001 From: fbarreir Date: Wed, 15 Apr 2020 12:17:21 +0200 Subject: [PATCH 120/181] k8s: improvements --- pandaharvester/harvestersubmitter/k8s_submitter.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pandaharvester/harvestersubmitter/k8s_submitter.py b/pandaharvester/harvestersubmitter/k8s_submitter.py index ca7e4309..2bcdfdb9 100644 --- a/pandaharvester/harvestersubmitter/k8s_submitter.py +++ b/pandaharvester/harvestersubmitter/k8s_submitter.py @@ -83,6 +83,8 @@ def read_job_configuration(self, work_spec): except (KeyError, 
AttributeError): return None, None + return None, None + def decide_container_image(self, job_fields, job_pars_parsed): """ Decide container image: From 15fe8d90757fb55461e3afe492bbad2c177a7c69 Mon Sep 17 00:00:00 2001 From: fbarreir Date: Wed, 15 Apr 2020 16:10:10 +0200 Subject: [PATCH 121/181] fix against pandaQueueDict info not being refreshed in jobFetcher --- pandaharvester/harvesterbody/job_fetcher.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/pandaharvester/harvesterbody/job_fetcher.py b/pandaharvester/harvesterbody/job_fetcher.py index 78d7fb15..b8eecfa4 100644 --- a/pandaharvester/harvesterbody/job_fetcher.py +++ b/pandaharvester/harvesterbody/job_fetcher.py @@ -25,7 +25,6 @@ def __init__(self, communicator, queue_config_mapper, single_mode=False): self.communicator = communicator self.nodeName = socket.gethostname() self.queueConfigMapper = queue_config_mapper - self.pandaQueueDict = PandaQueuesDict() self.pluginFactory = PluginFactory() # main loop @@ -37,6 +36,10 @@ def run(self): nJobsPerQueue = self.dbProxy.get_num_jobs_to_fetch(harvester_config.jobfetcher.nQueues, harvester_config.jobfetcher.lookupTime) mainLog.debug('got {0} queues'.format(len(nJobsPerQueue))) + + # get up to date queue configuration + pandaQueueDict = PandaQueuesDict() + # loop over all queues for queueName, nJobs in iteritems(nJobsPerQueue): # check queue @@ -53,7 +56,7 @@ def run(self): # get jobs try: - is_grandly_unified_queue = self.pandaQueueDict.is_grandly_unified_queue(siteName) + is_grandly_unified_queue = pandaQueueDict.is_grandly_unified_queue(siteName) except Exception: is_grandly_unified_queue = False @@ -128,8 +131,8 @@ def run(self): jobSpec.trigger_propagation() jobSpecs.append(jobSpec) # insert to DB - tmpLog.debug("Converting of {0} jobs {1}".format(len(jobs),sw_startconvert.get_elapsed_time())) - sw_insertdb =core_utils.get_stopwatch() + tmpLog.debug("Converting of {0} jobs {1}".format(len(jobs), sw_startconvert.get_elapsed_time())) + sw_insertdb = core_utils.get_stopwatch() self.dbProxy.insert_jobs(jobSpecs) tmpLog.debug('Insert of {0} jobs {1}'.format(len(jobSpecs), sw_insertdb.get_elapsed_time())) mainLog.debug('done') From b1422ada67b221beb4b52c79ead00d2db8794e97 Mon Sep 17 00:00:00 2001 From: fbarreir Date: Wed, 15 Apr 2020 16:43:51 +0200 Subject: [PATCH 122/181] k8s: moved default commands and logic to k8s_submitter rather than k8s_utils --- pandaharvester/harvestermisc/k8s_utils.py | 19 ++++++------------- .../harvestersubmitter/k8s_submitter.py | 16 ++++++++++------ 2 files changed, 16 insertions(+), 19 deletions(-) diff --git a/pandaharvester/harvestermisc/k8s_utils.py b/pandaharvester/harvestermisc/k8s_utils.py index 1dcb62d2..25083a7d 100644 --- a/pandaharvester/harvestermisc/k8s_utils.py +++ b/pandaharvester/harvestermisc/k8s_utils.py @@ -17,8 +17,7 @@ base_logger = core_utils.setup_logger('k8s_utils') CONFIG_DIR = '/scratch/jobconfig' -DEF_COMMAND = ["/usr/bin/bash"] -DEF_ARGS = ["-c", "cd; wget https://raw.githubusercontent.com/HSF/harvester/k8s_analysis/pandaharvester/harvestercloud/pilots_starter.py; chmod 755 pilots_starter.py; ./pilots_starter.py || true"] + class k8s_Client(object): @@ -37,8 +36,8 @@ def read_yaml_file(self, yaml_file): return yaml_content - def create_job_from_yaml(self, yaml_content, work_spec, container_image, cert, cert_in_secret=True, - cpu_adjust_ratio=100, memory_adjust_ratio=100, executable=[], args=[]): + def create_job_from_yaml(self, yaml_content, work_spec, container_image, executable, args, + cert, 
cert_in_secret=True, cpu_adjust_ratio=100, memory_adjust_ratio=100,): tmp_log = core_utils.make_logger(base_logger, method_name='create_job_from_yaml') @@ -78,11 +77,6 @@ def create_job_from_yaml(self, yaml_content, work_spec, container_image, cert, c if 'image' not in container_env: container_env['image'] = container_image - # if there is no user defined executable, run the default executable - if not executable: - executable = DEF_COMMAND - args = DEF_ARGS - if 'command' not in container_env: container_env['command'] = executable container_env['args'] = args @@ -151,7 +145,7 @@ def create_job_from_yaml(self, yaml_content, work_spec, container_image, cert, c if 'affinity' not in yaml_content['spec']['template']['spec']: yaml_content = self.set_affinity(yaml_content) - # tmp_log.debug('creating job {0}'.format(yaml_content)) + tmp_log.debug('creating job {0}'.format(yaml_content)) rsp = self.batchv1.create_namespaced_job(body=yaml_content, namespace=self.namespace) return rsp, yaml_content @@ -205,8 +199,7 @@ def delete_pods(self, pod_name_list): ret_list = list() for pod_name in pod_name_list: - rsp = {} - rsp['name'] = pod_name + rsp = {'name': pod_name} try: self.corev1.delete_namespaced_pod(name=pod_name, namespace=self.namespace, body=self.deletev1, grace_period_seconds=0) @@ -224,7 +217,7 @@ def delete_job(self, job_name): def delete_config_map(self, config_map_name): self.corev1.delete_namespaced_config_map(name=config_map_name, namespace=self.namespace, body=self.deletev1, - grace_period_seconds=0) + grace_period_seconds=0) def set_proxy(self, proxy_path): with open(proxy_path) as f: diff --git a/pandaharvester/harvestersubmitter/k8s_submitter.py b/pandaharvester/harvestersubmitter/k8s_submitter.py index 2bcdfdb9..38973080 100644 --- a/pandaharvester/harvestersubmitter/k8s_submitter.py +++ b/pandaharvester/harvestersubmitter/k8s_submitter.py @@ -16,10 +16,15 @@ # logger base_logger = core_utils.setup_logger('k8s_submitter') +# image defaults DEF_SLC6_IMAGE = 'atlasadc/atlas-grid-slc6' DEF_CENTOS7_IMAGE = 'atlasadc/atlas-grid-centos7' DEF_IMAGE = DEF_CENTOS7_IMAGE +# command defaults +DEF_COMMAND = ["/usr/bin/bash"] +DEF_ARGS = ["-c", "cd; wget https://raw.githubusercontent.com/HSF/harvester/master/pandaharvester/harvestercloud/pilots_starter.py; chmod 755 pilots_starter.py; ./pilots_starter.py || true"] + # submitter for K8S class K8sSubmitter(PluginBase): @@ -118,8 +123,8 @@ def decide_container_image(self, job_fields, job_pars_parsed): return container_image def build_executable(self, job_fields, job_pars_parsed): - executable = [] - args = [] + executable = DEF_COMMAND + args = DEF_ARGS try: if 'runcontainer' in job_fields['transformation']: # remove any quotes @@ -130,8 +135,7 @@ def build_executable(self, job_fields, job_pars_parsed): if len(exec_list) > 1: args = [' '.join(exec_list[1:])] except (AttributeError, TypeError): - executable = [] - args = [] + pass return executable, args @@ -167,10 +171,10 @@ def submit_k8s_worker(self, work_spec): return tmp_return_value rsp, yaml_content_final = self.k8s_client.create_job_from_yaml(yaml_content, work_spec, container_image, + executable, args cert, cert_in_secret=use_secret, cpu_adjust_ratio=self.cpu_adjust_ratio, - memory_adjust_ratio=self.memory_adjust_ratio, - executable=executable, args=args) + memory_adjust_ratio=self.memory_adjust_ratio) except Exception as _e: tmp_log.error(traceback.format_exc()) err_str = 'Failed to create a JOB; {0}'.format(_e) From 2e0d54b46597cc3f9a86803aeb9b3510cb81b626 Mon Sep 17 00:00:00 2001 
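As a rough illustration of the executable handling consolidated above (a sketch
with an assumed payload; parse_params and build_executable as defined in the
preceding patch), a quoted 'runcontainer' command would be split like this:

    exec_str = '"/bin/bash -c ./runme.sh"'        # assumed payload from jobPars
    exec_list = exec_str.strip('"\'').split(' ')  # drop surrounding quotes, tokenize
    executable = [exec_list[0]]                   # ['/bin/bash']
    args = [' '.join(exec_list[1:])]              # ['-c ./runme.sh']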
From: fbarreir Date: Wed, 15 Apr 2020 16:48:00 +0200 Subject: [PATCH 123/181] k8s: changed debug message --- pandaharvester/harvestermisc/k8s_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandaharvester/harvestermisc/k8s_utils.py b/pandaharvester/harvestermisc/k8s_utils.py index 25083a7d..0d9dc562 100644 --- a/pandaharvester/harvestermisc/k8s_utils.py +++ b/pandaharvester/harvestermisc/k8s_utils.py @@ -113,7 +113,7 @@ def create_job_from_yaml(self, yaml_content, work_spec, container_image, execut try: log_file_name = work_spec.workAttributes['stdout'] except (KeyError, AttributeError): - tmp_log.debug('work_spec does not have workAttributes field: {0}'.format(work_spec)) + tmp_log.debug('work_spec does not have stdout workAttribute, using default') log_file_name = '' container_env['env'].extend([ From d0210eb91361fa080b0495bcfb469f65ea7d2934 Mon Sep 17 00:00:00 2001 From: fbarreir Date: Wed, 15 Apr 2020 16:51:33 +0200 Subject: [PATCH 124/181] k8s: added some debug info --- pandaharvester/harvestermisc/k8s_utils.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pandaharvester/harvestermisc/k8s_utils.py b/pandaharvester/harvestermisc/k8s_utils.py index 0d9dc562..5078df20 100644 --- a/pandaharvester/harvestermisc/k8s_utils.py +++ b/pandaharvester/harvestermisc/k8s_utils.py @@ -267,9 +267,11 @@ def create_or_patch_secret(self, file_list, secret_name): body = client.V1Secret(data=data, metadata=metadata) try: rsp = self.corev1.patch_namespaced_secret(name=secret_name, body=body, namespace=self.namespace) + tmp_log.debug('Patched secret') except ApiException as e: tmp_log.debug('Exception when patching secret: {0} . Try to create secret instead...'.format(e)) rsp = self.corev1.create_namespaced_secret(body=body, namespace=self.namespace) + tmp_log.debug('Created secret') return rsp def create_configmap(self, work_spec): @@ -312,6 +314,7 @@ def get_pod_logs(self, pod_name, previous=False): tmp_log = core_utils.make_logger(base_logger, method_name='get_pod_logs') try: rsp = self.corev1.read_namespaced_pod_log(name=pod_name, namespace=self.namespace, previous=previous) + tmp_log.debug('Log file retrieved for {0}'.format(pod_name)) except ApiException as e: tmp_log.debug('Exception when getting logs for pod {0} : {1}. 
Skipped'.format(pod_name, e)) raise From 94f97b0bee0e0f914f16710694010e9586fbc210 Mon Sep 17 00:00:00 2001 From: fbarreir Date: Wed, 15 Apr 2020 17:38:12 +0200 Subject: [PATCH 125/181] k8s: oops --- pandaharvester/harvestersubmitter/k8s_submitter.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandaharvester/harvestersubmitter/k8s_submitter.py b/pandaharvester/harvestersubmitter/k8s_submitter.py index 38973080..b8ba61a1 100644 --- a/pandaharvester/harvestersubmitter/k8s_submitter.py +++ b/pandaharvester/harvestersubmitter/k8s_submitter.py @@ -171,7 +171,7 @@ def submit_k8s_worker(self, work_spec): return tmp_return_value rsp, yaml_content_final = self.k8s_client.create_job_from_yaml(yaml_content, work_spec, container_image, - executable, args + executable, args, cert, cert_in_secret=use_secret, cpu_adjust_ratio=self.cpu_adjust_ratio, memory_adjust_ratio=self.memory_adjust_ratio) From 0ccfbec5ed602e585558002e7a0d048b857151d2 Mon Sep 17 00:00:00 2001 From: fbarreir Date: Mon, 20 Apr 2020 16:12:12 +0200 Subject: [PATCH 126/181] prodanaly: on unified queues run neutral prodsourcelabels with production proxy --- .../simple_worker_maker.py | 24 +++++++++++++------ 1 file changed, 17 insertions(+), 7 deletions(-) diff --git a/pandaharvester/harvesterworkermaker/simple_worker_maker.py b/pandaharvester/harvesterworkermaker/simple_worker_maker.py index 20b63ee0..c41730eb 100644 --- a/pandaharvester/harvesterworkermaker/simple_worker_maker.py +++ b/pandaharvester/harvesterworkermaker/simple_worker_maker.py @@ -41,17 +41,27 @@ def get_job_core_and_memory(self, queue_dict, job_spec): return job_corecount, job_memory - def get_job_type(self, job_spec, job_type, queue_dict): + def get_job_type(self, job_spec, job_type, queue_dict, tmp_prodsourcelabel=None): - # 1. get prodSourceLabel from job (push) + queue_type = queue_dict.get('type', None) + + # 1. get prodSourceLabel from job (PUSH) if job_spec and 'prodSourceLabel' in job_spec.jobParams: job_type_final = job_spec.jobParams['prodSourceLabel'] - # 2. get prodSourceLabel from the specified job_type (pull UPS) + + # 2. get prodSourceLabel from the specified job_type (PULL UPS) elif job_type: - job_type_final = job_type - # 3. convert the prodSourcelabel from the queue configuration or leave it empty - else: # 3. get prodSourceLabel from the queue definition (pull) - queue_type = queue_dict.get('type', None) + + if tmp_prodsourcelabel: + if queue_type != 'analysis' and tmp_prodsourcelabel not in ('user', 'panda'): + # for production, unified or other types of queues we need to run neutral prodsourcelabels + # with production proxy since they can't be distinguished and can fail + job_type_final = 'managed' + else: + job_type_final = job_type + + # 3. 
convert the prodSourcelabel from the queue configuration or leave it empty (PULL) + else: # map AGIS types to PanDA types if queue_type == 'analysis': job_type_final = 'user' From 5461c53b7bac1033b66a3d375871a3bf5d21a9d9 Mon Sep 17 00:00:00 2001 From: fbarreir Date: Mon, 20 Apr 2020 16:16:54 +0200 Subject: [PATCH 127/181] k8s: added some debug info --- pandaharvester/harvesterworkermaker/simple_worker_maker.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/pandaharvester/harvesterworkermaker/simple_worker_maker.py b/pandaharvester/harvesterworkermaker/simple_worker_maker.py index c41730eb..43a9fc12 100644 --- a/pandaharvester/harvesterworkermaker/simple_worker_maker.py +++ b/pandaharvester/harvesterworkermaker/simple_worker_maker.py @@ -51,14 +51,12 @@ def get_job_type(self, job_spec, job_type, queue_dict, tmp_prodsourcelabel=None) # 2. get prodSourceLabel from the specified job_type (PULL UPS) elif job_type: - + job_type_final = job_type if tmp_prodsourcelabel: if queue_type != 'analysis' and tmp_prodsourcelabel not in ('user', 'panda'): # for production, unified or other types of queues we need to run neutral prodsourcelabels # with production proxy since they can't be distinguished and can fail job_type_final = 'managed' - else: - job_type_final = job_type # 3. convert the prodSourcelabel from the queue configuration or leave it empty (PULL) else: From ae20f80e87716a38fe9a4e27035f4441345cb9d6 Mon Sep 17 00:00:00 2001 From: fbarreir Date: Mon, 20 Apr 2020 16:25:08 +0200 Subject: [PATCH 128/181] prodanaly: added some debug info --- pandaharvester/harvesterworkermaker/simple_worker_maker.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandaharvester/harvesterworkermaker/simple_worker_maker.py b/pandaharvester/harvesterworkermaker/simple_worker_maker.py index 43a9fc12..bafc95f4 100644 --- a/pandaharvester/harvesterworkermaker/simple_worker_maker.py +++ b/pandaharvester/harvesterworkermaker/simple_worker_maker.py @@ -170,8 +170,8 @@ def make_worker(self, jobspec_list, queue_config, job_type, resource_type): if workSpec.pilotType in ['RC', 'ALRB', 'PT']: tmpLog.info('a worker has pilotType={0}'.format(workSpec.pilotType)) - workSpec.jobType = self.get_job_type(None, job_type, queue_dict) - tmpLog.debug('get_job_type decided for job_type: {0}'.format(workSpec.jobType)) + workSpec.jobType = self.get_job_type(None, job_type, queue_dict, tmp_prodsourcelabel) + tmpLog.debug('get_job_type decided for job_type: {0} (input job_type: {1}, tmp_prodsourcelabel: {2})'.format(workSpec.jobType, job_type, tmp_prodsourcelabel)) if resource_type and resource_type != 'ANY': workSpec.resourceType = resource_type From e879b590b08009b0f62682b36eba82c03fbb850c Mon Sep 17 00:00:00 2001 From: fbarreir Date: Mon, 20 Apr 2020 16:27:24 +0200 Subject: [PATCH 129/181] prodanaly: added some debug info --- pandaharvester/harvesterworkermaker/simple_worker_maker.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandaharvester/harvesterworkermaker/simple_worker_maker.py b/pandaharvester/harvesterworkermaker/simple_worker_maker.py index bafc95f4..147188d5 100644 --- a/pandaharvester/harvesterworkermaker/simple_worker_maker.py +++ b/pandaharvester/harvesterworkermaker/simple_worker_maker.py @@ -171,7 +171,8 @@ def make_worker(self, jobspec_list, queue_config, job_type, resource_type): tmpLog.info('a worker has pilotType={0}'.format(workSpec.pilotType)) workSpec.jobType = self.get_job_type(None, job_type, queue_dict, tmp_prodsourcelabel) - 
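             # e.g. on a unified (non-analysis) queue a neutral tmp_prodsourcelabel
             # such as 'rc_test' makes get_job_type fall back to 'managed', while
             # 'user' or 'panda' keeps the requested job_type (sketch of the logic)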
tmpLog.debug('get_job_type decided for job_type: {0} (input job_type: {1}, tmp_prodsourcelabel: {2})'.format(workSpec.jobType, job_type, tmp_prodsourcelabel)) + tmpLog.debug('get_job_type decided for job_type: {0} (input job_type: {1}, queue_type: {2}, tmp_prodsourcelabel: {3})' + .format(workSpec.jobType, job_type, queue_type = queue_dict.get('type', None), tmp_prodsourcelabel)) if resource_type and resource_type != 'ANY': workSpec.resourceType = resource_type From f9e76fdb343eb77d6d6aece7a572bf159d78255d Mon Sep 17 00:00:00 2001 From: fbarreir Date: Mon, 20 Apr 2020 16:34:01 +0200 Subject: [PATCH 130/181] typo --- pandaharvester/harvesterworkermaker/simple_worker_maker.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandaharvester/harvesterworkermaker/simple_worker_maker.py b/pandaharvester/harvesterworkermaker/simple_worker_maker.py index 147188d5..acb58fcb 100644 --- a/pandaharvester/harvesterworkermaker/simple_worker_maker.py +++ b/pandaharvester/harvesterworkermaker/simple_worker_maker.py @@ -172,7 +172,7 @@ def make_worker(self, jobspec_list, queue_config, job_type, resource_type): workSpec.jobType = self.get_job_type(None, job_type, queue_dict, tmp_prodsourcelabel) tmpLog.debug('get_job_type decided for job_type: {0} (input job_type: {1}, queue_type: {2}, tmp_prodsourcelabel: {3})' - .format(workSpec.jobType, job_type, queue_type = queue_dict.get('type', None), tmp_prodsourcelabel)) + .format(workSpec.jobType, job_type, queue_dict.get('type', None), tmp_prodsourcelabel)) if resource_type and resource_type != 'ANY': workSpec.resourceType = resource_type From bcb94238d034adf745dae8e21eb3b6c5835fb639 Mon Sep 17 00:00:00 2001 From: fahui Date: Tue, 21 Apr 2020 14:39:36 +0200 Subject: [PATCH 131/181] adjustable reconnectTimeout when reconnecting DB --- pandaharvester/commit_timestamp.py | 2 +- pandaharvester/harvestercore/core_utils.py | 13 +++++++++++++ pandaharvester/harvestercore/db_proxy.py | 17 +++++++++++++---- pandaharvester/harvesterfifo/mysql_fifo.py | 17 ++++++++++++++--- templates/panda_harvester.cfg.rpmnew.template | 2 ++ 5 files changed, 43 insertions(+), 8 deletions(-) diff --git a/pandaharvester/commit_timestamp.py b/pandaharvester/commit_timestamp.py index 4b84cd08..99fda7ab 100644 --- a/pandaharvester/commit_timestamp.py +++ b/pandaharvester/commit_timestamp.py @@ -1 +1 @@ -timestamp = "09-04-2020 17:48:10 on flin (by fahui)" +timestamp = "21-04-2020 12:39:36 on flin (by fahui)" diff --git a/pandaharvester/harvestercore/core_utils.py b/pandaharvester/harvestercore/core_utils.py index fcaade8d..148eca12 100644 --- a/pandaharvester/harvestercore/core_utils.py +++ b/pandaharvester/harvestercore/core_utils.py @@ -639,3 +639,16 @@ def pickle_to_text(data): # unpickle from text def unpickle_from_text(text): return pickle.loads(codecs.decode(text.encode(), 'base64')) + + +# increasing retry period after timeout or failure +def retry_period_sec(nth_retry, increment=1, max_retries=None, max_seconds=None, min_seconds=1): + nth = max(nth_retry, 1) + ret_period = max(min_seconds, 1) + if max_retries and nth_retry > max_retries: + return False + else: + ret_period += (nth - 1)*increment + if max_seconds: + ret_period = min(ret_period, max_seconds) + return ret_period diff --git a/pandaharvester/harvestercore/db_proxy.py b/pandaharvester/harvestercore/db_proxy.py index fbb313d0..84785e01 100644 --- a/pandaharvester/harvestercore/db_proxy.py +++ b/pandaharvester/harvestercore/db_proxy.py @@ -60,6 +60,9 @@ def __init__(self, thr_name=None, read_only=False): 
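        # back-off sketch for the reconnection loop below: retry_period_sec(n,
        # increment=2, max_seconds=300, min_seconds=1) yields 1s, 3s, 5s, ... for
        # n = 1, 2, 3, ..., capped at 300s, until reconnectTimeout is exhausted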
self.thrName = thr_name self.verbLog = None self.useInspect = False + self.reconnectTimeout = 300 + if hasattr(harvester_config.db, 'reconnectTimeout'): + self.reconnectTimeout = harvester_config.db.reconnectTimeout if harvester_config.db.verbose: self.verbLog = core_utils.make_logger(_logger, method_name='execute') if self.thrName is None: @@ -139,7 +142,7 @@ def fetchall(self): self.usingAppLock = True # exception handler for type of DBs - def _handle_exception(self, exc, retry_time=30): + def _handle_exception(self, exc): tmpLog = core_utils.make_logger(_logger, 'thr={0}'.format(self.thrName), method_name='_handle_exception') if harvester_config.db.engine == 'mariadb': tmpLog.warning('exception of mysql {0} occurred'.format(exc.__class__.__name__)) @@ -155,14 +158,20 @@ def _handle_exception(self, exc, retry_time=30): isOperationalError = True if isOperationalError: try_timestamp = time.time() - while time.time() - try_timestamp < retry_time: + n_retry = 1 + while time.time() - try_timestamp < self.reconnectTimeout: try: self.__init__() tmpLog.info('renewed connection') break except Exception as e: - tmpLog.error('failed to renew connection; {0}'.format(e)) - time.sleep(1) + tmpLog.error('failed to renew connection ({0} retries); {1}'.format(n_retry, e)) + sleep_time = core_utils.retry_period_sec(n_retry, increment=2, max_seconds=300, min_seconds=1) + if not sleep_time: + break + else: + time.sleep(sleep_time) + n_retry += 1 # convert param dict to list def convert_params(self, sql, varmap): diff --git a/pandaharvester/harvesterfifo/mysql_fifo.py b/pandaharvester/harvesterfifo/mysql_fifo.py index 684ab180..e53976c4 100644 --- a/pandaharvester/harvesterfifo/mysql_fifo.py +++ b/pandaharvester/harvesterfifo/mysql_fifo.py @@ -13,6 +13,11 @@ class MysqlFifo(PluginBase): # constructor def __init__(self, **kwarg): + self.reconnectTimeout = 300 + if hasattr(harvester_config, 'fifo') and hasattr(harvester_config.fifo, 'reconnectTimeout'): + self.reconnectTimeout = harvester_config.db.reconnectTimeout + elif hasattr(harvester_config.db, 'reconnectTimeout'): + self.reconnectTimeout = harvester_config.db.reconnectTimeout PluginBase.__init__(self, **kwarg) self.tableName = '{title}_FIFO'.format(title=self.titleName) # DB access attribues @@ -81,7 +86,7 @@ def fetchall(self): raise _e # decorator exception handler for type of DBs - def _handle_exception(method, retry_time=30): + def _handle_exception(method): def _decorator(_method, *args, **kwargs): @functools.wraps(_method) def _wrapped_method(self, *args, **kwargs): @@ -94,13 +99,19 @@ def _wrapped_method(self, *args, **kwargs): isOperationalError = True if isOperationalError: try_timestamp = time.time() - while time.time() - try_timestamp < retry_time: + n_retry = 1 + while time.time() - try_timestamp < self.reconnectTimeout: try: self.__init__() return except Exception as _e: exc = _e - time.sleep(1) + sleep_time = core_utils.retry_period_sec(n_retry, increment=2, max_seconds=300, min_seconds=1) + if not sleep_time: + break + else: + time.sleep(sleep_time) + n_retry += 1 raise exc else: raise exc diff --git a/templates/panda_harvester.cfg.rpmnew.template b/templates/panda_harvester.cfg.rpmnew.template index 736d34a6..7e49c3a4 100644 --- a/templates/panda_harvester.cfg.rpmnew.template +++ b/templates/panda_harvester.cfg.rpmnew.template @@ -66,6 +66,8 @@ host = localhost # port number for MariaDB. 
N/A for sqlite port = 3306 +# max time in seconds to keep trying to reconnect DB before timeout +reconnectTimeout = 300 From 02524161ab03d841a56c6ab91d34e4740d3c83a8 Mon Sep 17 00:00:00 2001 From: fbarreir Date: Wed, 22 Apr 2020 16:08:55 +0200 Subject: [PATCH 132/181] prodanaly: missing prodsourcelabel in check --- pandaharvester/harvesterworkermaker/simple_worker_maker.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandaharvester/harvesterworkermaker/simple_worker_maker.py b/pandaharvester/harvesterworkermaker/simple_worker_maker.py index acb58fcb..4a1088d4 100644 --- a/pandaharvester/harvesterworkermaker/simple_worker_maker.py +++ b/pandaharvester/harvesterworkermaker/simple_worker_maker.py @@ -53,7 +53,7 @@ def get_job_type(self, job_spec, job_type, queue_dict, tmp_prodsourcelabel=None) elif job_type: job_type_final = job_type if tmp_prodsourcelabel: - if queue_type != 'analysis' and tmp_prodsourcelabel not in ('user', 'panda'): + if queue_type != 'analysis' and tmp_prodsourcelabel not in ('user', 'panda', 'managed'): # for production, unified or other types of queues we need to run neutral prodsourcelabels # with production proxy since they can't be distinguished and can fail job_type_final = 'managed' From 825be2cae8ab137b9194c373030232162db48745 Mon Sep 17 00:00:00 2001 From: tmaeno Date: Wed, 29 Apr 2020 21:56:50 +0200 Subject: [PATCH 133/181] added aux_extractor --- pandaharvester/commit_timestamp.py | 2 +- .../harvesterextractor/analysis_extractor.py | 2 +- .../harvesterextractor/aux_extractor.py | 36 +++++++++++++++++++ 3 files changed, 38 insertions(+), 2 deletions(-) create mode 100644 pandaharvester/harvesterextractor/aux_extractor.py diff --git a/pandaharvester/commit_timestamp.py b/pandaharvester/commit_timestamp.py index 99fda7ab..95f33f0c 100644 --- a/pandaharvester/commit_timestamp.py +++ b/pandaharvester/commit_timestamp.py @@ -1 +1 @@ -timestamp = "21-04-2020 12:39:36 on flin (by fahui)" +timestamp = "29-04-2020 19:56:50 on master (by tmaeno)" diff --git a/pandaharvester/harvesterextractor/analysis_extractor.py b/pandaharvester/harvesterextractor/analysis_extractor.py index 6ab43ecb..f37174fd 100644 --- a/pandaharvester/harvesterextractor/analysis_extractor.py +++ b/pandaharvester/harvesterextractor/analysis_extractor.py @@ -2,7 +2,7 @@ from .base_extractor import BaseExtractor -# extractor for analysis +# OBSOLETE - use aux_extractor class AnalysisExtractor(BaseExtractor): # constructor def __init__(self, **kwarg): diff --git a/pandaharvester/harvesterextractor/aux_extractor.py b/pandaharvester/harvesterextractor/aux_extractor.py new file mode 100644 index 00000000..97aab745 --- /dev/null +++ b/pandaharvester/harvesterextractor/aux_extractor.py @@ -0,0 +1,36 @@ +import re +from .base_extractor import BaseExtractor + + +# extractor for auxiliary input files +class AuxExtractor(BaseExtractor): + # constructor + def __init__(self, **kwarg): + BaseExtractor.__init__(self, **kwarg) + + # get auxiliary input files + def get_aux_inputs(self, jobspec): + url_list = [] + jobPars = jobspec.jobParams['jobPars'] + # transformation + trf = jobspec.jobParams['transformation'] + if trf is not None and trf.startswith('http'): + url_list.append(trf) + # extract source URL + tmpM = re.search(' --sourceURL\s+([^\s]+)', jobPars) + if tmpM is not None: + sourceURL = tmpM.group(1) + # extract sandbox + if jobspec.jobParams['prodSourceLabel'] == 'user': + tmpM = re.search('-a\s+([^\s]+)', jobPars) + else: + tmpM = re.search('-i\s+([^\s]+)', jobPars) + if tmpM is not 
None: + lfn = tmpM.group(1) + url = '{0}/cache/{1}'.format(sourceURL, lfn) + url_list.append(url) + # extract container image + if 'container_name' in jobspec.jobParams: + url = jobspec.jobParams['container_name'] + url_list.append(url) + return self.make_aux_inputs(url_list) From d85e0e38e2194c7d7f59d3397a35becde803aa69 Mon Sep 17 00:00:00 2001 From: tmaeno Date: Thu, 30 Apr 2020 13:37:14 +0200 Subject: [PATCH 134/181] added slurm_squeue_monitor --- pandaharvester/commit_timestamp.py | 2 +- .../harvestermonitor/slurm_squeue_monitor.py | 73 +++++++++++++++++++ 2 files changed, 74 insertions(+), 1 deletion(-) create mode 100644 pandaharvester/harvestermonitor/slurm_squeue_monitor.py diff --git a/pandaharvester/commit_timestamp.py b/pandaharvester/commit_timestamp.py index 95f33f0c..d0b4d3c1 100644 --- a/pandaharvester/commit_timestamp.py +++ b/pandaharvester/commit_timestamp.py @@ -1 +1 @@ -timestamp = "29-04-2020 19:56:50 on master (by tmaeno)" +timestamp = "30-04-2020 11:37:14 on master (by tmaeno)" diff --git a/pandaharvester/harvestermonitor/slurm_squeue_monitor.py b/pandaharvester/harvestermonitor/slurm_squeue_monitor.py new file mode 100644 index 00000000..7942ce0d --- /dev/null +++ b/pandaharvester/harvestermonitor/slurm_squeue_monitor.py @@ -0,0 +1,73 @@ +import re +try: + import subprocess32 as subprocess +except ImportError: + import subprocess + +from pandaharvester.harvestercore import core_utils +from pandaharvester.harvestercore.work_spec import WorkSpec +from pandaharvester.harvestercore.plugin_base import PluginBase + +# logger +baseLogger = core_utils.setup_logger('slurm_squeue_monitor') + + +# monitor for SLURM batch system with squeue +class SlurmSqueueMonitor(PluginBase): + # constructor + def __init__(self, **kwarg): + PluginBase.__init__(self, **kwarg) + + # check workers + def check_workers(self, workspec_list): + retList = [] + for workSpec in workspec_list: + # make logger + tmpLog = self.make_logger(baseLogger, 'workerID={0}'.format(workSpec.workerID), + method_name='check_workers') + # command + comStr = "squeue -j {0}".format(workSpec.batchID) + # check + tmpLog.debug('check with {0}'.format(comStr)) + p = subprocess.Popen(comStr.split(), + shell=False, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE) + newStatus = workSpec.status + # check return code + stdOut, stdErr = p.communicate() + retCode = p.returncode + tmpLog.debug('retCode={0}'.format(retCode)) + errStr = '' + stdOut_str = stdOut if (isinstance(stdOut, str) or stdOut is None) else stdOut.decode() + stdErr_str = stdErr if (isinstance(stdErr, str) or stdErr is None) else stdErr.decode() + if retCode == 0: + for tmpLine in stdOut_str.split('\n'): + tmpMatch = re.search('{0} '.format(workSpec.batchID), tmpLine) + if tmpMatch is not None: + errStr = tmpLine + batchStatus = tmpLine.split()[4] + if batchStatus in ['R', 'RUNNING', 'COMPLETING', 'STOPPED', 'SUSPENDED']: + newStatus = WorkSpec.ST_running + elif batchStatus in ['COMPLETED', 'PREEMPTED', 'TIMEOUT']: + newStatus = WorkSpec.ST_finished + elif batchStatus in ['CANCELLED']: + newStatus = WorkSpec.ST_cancelled + elif batchStatus in ['PD', 'CONFIGURING', 'PENDING']: + newStatus = WorkSpec.ST_submitted + else: + newStatus = WorkSpec.ST_failed + tmpLog.debug('batchStatus {0} -> workerStatus {1}'.format(batchStatus, + newStatus)) + break + retList.append((newStatus, errStr)) + else: + # squeue does not show finished jobs, gives return code 1 + # Assume finished for now. Maybe look in workdir. 
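+            # sketch: while the job is alive, stdout contains a line like
+            #   '12345  grid  pilot  atlas  R  5:02  1  node001'
+            # whose 5th column drives the mapping above; once the job leaves the
+            # queue, squeue returns rc=1 and execution lands here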
+ newStatus = WorkSpec.ST_finished + errStr = '{0} {1}'.format(stdOut_str, stdErr_str) + tmpLog.error(errStr) + #if 'slurm_load_jobs error: Invalid job id specified' in errStr: + # newStatus = WorkSpec.ST_failed + retList.append((newStatus, errStr)) + return True, retList From c49660475bcced003863441200a2992ef9c5e9ab Mon Sep 17 00:00:00 2001 From: tmaeno Date: Mon, 4 May 2020 08:40:47 +0200 Subject: [PATCH 135/181] python3 fix --- pandaharvester/commit_timestamp.py | 2 +- pandaharvester/harvesterstager/rucio_stager_hpc.py | 2 +- pandaharvester/harvesterstager/rucio_stager_hpc_minikui.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pandaharvester/commit_timestamp.py b/pandaharvester/commit_timestamp.py index d0b4d3c1..8c0f1158 100644 --- a/pandaharvester/commit_timestamp.py +++ b/pandaharvester/commit_timestamp.py @@ -1 +1 @@ -timestamp = "30-04-2020 11:37:14 on master (by tmaeno)" +timestamp = "04-05-2020 06:40:47 on master (by tmaeno)" diff --git a/pandaharvester/harvesterstager/rucio_stager_hpc.py b/pandaharvester/harvesterstager/rucio_stager_hpc.py index 7426d9a3..95c49a0b 100644 --- a/pandaharvester/harvesterstager/rucio_stager_hpc.py +++ b/pandaharvester/harvesterstager/rucio_stager_hpc.py @@ -136,7 +136,7 @@ def trigger_stage_out(self, jobspec): stdout, stderr = process.communicate() fileSpec.attemptNr += 1 - stdout = stdout + " attemptNr: %s" % fileSpec.attemptNr + stdout = stdout.decode() + " attemptNr: %s" % fileSpec.attemptNr tmpLog.debug("stdout: %s" % stdout) tmpLog.debug("stderr: %s" % stderr) if process.returncode == 0: diff --git a/pandaharvester/harvesterstager/rucio_stager_hpc_minikui.py b/pandaharvester/harvesterstager/rucio_stager_hpc_minikui.py index 3133c7cc..0abb90bd 100644 --- a/pandaharvester/harvesterstager/rucio_stager_hpc_minikui.py +++ b/pandaharvester/harvesterstager/rucio_stager_hpc_minikui.py @@ -151,7 +151,7 @@ def _stage_one_file(fileSpec): stdout, stderr = process.communicate() fileSpec.attemptNr += 1 - stdout = stdout + " attemptNr: %s" % fileSpec.attemptNr + stdout = stdout.decode() + " attemptNr: %s" % fileSpec.attemptNr tmpLog.debug("stdout: %s" % stdout) tmpLog.debug("stderr: %s" % stderr) if process.returncode == 0: From 5cd64b92c9b95696009b3682428a9c060f36576c Mon Sep 17 00:00:00 2001 From: fahui Date: Mon, 4 May 2020 10:27:55 +0200 Subject: [PATCH 136/181] python3 fix --- pandaharvester/commit_timestamp.py | 2 +- .../harvesterpreparator/pilotmover_mt_preparator_kari.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pandaharvester/commit_timestamp.py b/pandaharvester/commit_timestamp.py index 8c0f1158..ff7e0750 100644 --- a/pandaharvester/commit_timestamp.py +++ b/pandaharvester/commit_timestamp.py @@ -1 +1 @@ -timestamp = "04-05-2020 06:40:47 on master (by tmaeno)" +timestamp = "04-05-2020 08:27:55 on flin (by fahui)" diff --git a/pandaharvester/harvesterpreparator/pilotmover_mt_preparator_kari.py b/pandaharvester/harvesterpreparator/pilotmover_mt_preparator_kari.py index a949105b..0bef1361 100644 --- a/pandaharvester/harvesterpreparator/pilotmover_mt_preparator_kari.py +++ b/pandaharvester/harvesterpreparator/pilotmover_mt_preparator_kari.py @@ -133,7 +133,7 @@ def trigger_preparation(self, jobspec): ErrMsg = 'These files failed to download : ' if files: threads = [] - n_files_per_thread = (len(files) + self.n_threads - 1) / self.n_threads + n_files_per_thread = (len(files) + self.n_threads - 1) // self.n_threads tmpLog.debug('num files per thread: %s' % n_files_per_thread) for i in range(0, 
len(files), n_files_per_thread):
             sub_files = files[i:i + n_files_per_thread]

From 9ac114cbde3ffb66da6c6bd2dcf5900b2fe6c709 Mon Sep 17 00:00:00 2001
From: David Cameron
Date: Tue, 5 May 2020 10:17:49 +0200
Subject: [PATCH 137/181] check for undefined queueconfig

---
 pandaharvester/harvesterbody/propagator.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/pandaharvester/harvesterbody/propagator.py b/pandaharvester/harvesterbody/propagator.py
index 5f4eeb18..22f7d7b1 100644
--- a/pandaharvester/harvesterbody/propagator.py
+++ b/pandaharvester/harvesterbody/propagator.py
@@ -53,7 +53,10 @@ def run(self):
                     if tmpJobSpec.computingSite not in hbSuppressMap:
                         queueConfig = self.queueConfigMapper.get_queue(tmpJobSpec.computingSite,
                                                                        tmpJobSpec.configID)
-                        hbSuppressMap[tmpJobSpec.computingSite] = queueConfig.get_no_heartbeat_status()
+                        if queueConfig:
+                            hbSuppressMap[tmpJobSpec.computingSite] = queueConfig.get_no_heartbeat_status()
+                        else:  # assume truepilot
+                            hbSuppressMap[tmpJobSpec.computingSite] = ['running', 'transferring', 'finished', 'failed']
                     # heartbeat is suppressed
                     if tmpJobSpec.get_status() in hbSuppressMap[tmpJobSpec.computingSite] and \
                             not tmpJobSpec.not_suppress_heartbeat():

From 90e560f7ad2a74341c788219c5b1ba97ff3667e8 Mon Sep 17 00:00:00 2001
From: Doug Benjamin
Date: Thu, 7 May 2020 06:49:48 -0500
Subject: [PATCH 138/181] initial commit xrdcp_stager

---
 .../harvesterstager/xrdcp_stager.py | 261 ++++++++++++++++++
 1 file changed, 261 insertions(+)
 create mode 100644 pandaharvester/harvesterstager/xrdcp_stager.py

diff --git a/pandaharvester/harvesterstager/xrdcp_stager.py b/pandaharvester/harvesterstager/xrdcp_stager.py
new file mode 100644
index 00000000..87ffb4ba
--- /dev/null
+++ b/pandaharvester/harvesterstager/xrdcp_stager.py
@@ -0,0 +1,261 @@
+import os
+import tempfile
+import gc
+
+try:
+    import subprocess32 as subprocess
+except Exception:
+    import subprocess
+
+from pandaharvester.harvestermover import mover_utils
+from pandaharvester.harvestercore import core_utils
+from .base_stager import BaseStager
+
+import uuid
+
+# logger
+_logger = core_utils.setup_logger('xrdcp_stager')
+
+# stager plugin using xrdcp from https://xrootd.slac.stanford.edu/
+"""
+  -- Example of plugin config
+    "stager": {
+        "name": "XrdcpStager",
+        "module": "pandaharvester.harvesterstager.xrdcp_stager",
+        # base path for destination xrdcp server
+        "dstBasePath": "root://dcgftp.usatlas.bnl.gov:1096//pnfs/usatlas.bnl.gov/BNLT0D1/rucio",
+        # base path for local access to the copied files
+        "localBasePath": "/hpcgpfs01/scratch/benjamin/harvester/rucio-data-area",
+        # max number of attempts
+        "maxAttempts": 3,
+        # check paths under localBasePath.
+        "checkLocalPath": true,
+        # options for xrdcp
+        "xrdcpOpts": "--retry 3 --cksum adler32 --debug 1"
+    }
+"""
+
+
+# xrdcp plugin for stager
+class XrdcpStager(BaseStager):
+    # constructor
+    def __init__(self, **kwarg):
+        BaseStager.__init__(self, **kwarg)
+        if not hasattr(self, 'xrdcpOpts'):
+            self.xrdcpOpts = None
+        if not hasattr(self, 'maxAttempts'):
+            self.maxAttempts = 3
+        if not hasattr(self, 'timeout'):
+            self.timeout = None
+        if not hasattr(self, 'checkLocalPath'):
+            self.checkLocalPath = True
+
+    # check status
+    def check_stage_out_status(self, jobspec):
+        """Check the status of stage-out procedure. If staging-out is done synchronously in trigger_stage_out
+        this method should always return True.
+        Output files are available through jobspec.get_outfile_specs(skip_done=False) which gives
+        a list of FileSpecs not yet done. 
+ FileSpec.attemptNr shows how many times the transfer was checked for the file. + If the file was successfully transferred, status should be set to 'finished'. + Or 'failed', if the file failed to be transferred. Once files are set to 'finished' or 'failed', + jobspec.get_outfile_specs(skip_done=False) ignores them. + + :param jobspec: job specifications + :type jobspec: JobSpec + :return: A tuple of return code (True: transfer success, False: fatal transfer failure, + None: on-going or temporary failure) and error dialog + :rtype: (bool, string) + """ + for fileSpec in jobspec.get_output_file_specs(skip_done=True): + fileSpec.status = 'finished' + return True, '' + + # trigger stage out + def trigger_stage_out(self, jobspec): + """Trigger the stage-out procedure for the job. + Output files are available through jobspec.get_outfile_specs(skip_done=False) which gives + a list of FileSpecs not yet done. + FileSpec.attemptNr shows how many times transfer was tried for the file so far. + + :param jobspec: job specifications + :type jobspec: JobSpec + :return: A tuple of return code (True: success, False: fatal failure, None: temporary failure) + and error dialog + :rtype: (bool, string) + """ + + # let gc clean up memory + gc.collect() + + # make logger + tmpLog = self.make_logger(baseLogger, 'PandaID={0}'.format(jobspec.PandaID), + method_name='trigger_preparation') + tmpLog.debug('start') + # get the environment + harvester_env = os.environ.copy() + #tmpLog.debug('Harvester environment : {}'.format(harvester_env)) + + xrdcpOutput = None + allfiles_transfered = True + overall_errMsg = "" + fileAttrs = jobspec.get_output_file_attributes() + # loop over all output files + for fileSpec in jobspec.get_output_file_specs(skip_done=True): + # fileSpec.objstoreID = 123 + # fileSpec.fileAttributes['guid'] + # construct source and destination paths + dstPath = mover_utils.construct_file_path(self.dstBasePath, fileAttrs[fileSpec.lfn]['scope'], + fileSpec.lfn) + # local path + localPath = mover_utils.construct_file_path(self.localBasePath, fileAttrs[fileSpec.lfn]['scope'], + fileSpec.lfn) + if self.checkLocalPath: + # check if already exits + if os.path.exists(localPath): + # calculate checksum + checksum = core_utils.calc_adler32(localPath) + checksum = 'ad:{0}'.format(checksum) + if checksum == fileAttrs[fileSpec.lfn]['checksum']: + continue + # collect list of output files + if xrdcpOutput is None: + xrdcpOutput = [dstPath] + else: + xrdcpOutput.append[dstPath] + # transfer using xrdcp one file at a time + tmpLog.debug('execute xrdcp') + args = ['xrdcp', '--nopbar', '--force'] + args_files = [localPath,dstPath] + if self.xrdcpOpts is not None: + args += self.xrdcpOpts.split() + args += args_files + fileSpec.attemptNr += 1 + try: + xrdcp_cmd = ' '.join(args) + tmpLog.debug('execute: {0}'.format(xrdcp_cmd)) + process = subprocess.Popen(xrdcp_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, env=harvester_env, shell=True) + try: + stdout, stderr = process.communicate(timeout=self.timeout) + except subprocess.TimeoutExpired: + process.kill() + stdout, stderr = process.communicate() + tmpLog.warning('command timeout') + return_code = process.returncode + if stdout is not None: + if not isinstance(stdout, str): + stdout = stdout.decode() + stdout = stdout.replace('\n', ' ') + if stderr is not None: + if not isinstance(stderr, str): + stderr = stderr.decode() + stderr = stderr.replace('\n', ' ') + tmpLog.debug("stdout: %s" % stdout) + tmpLog.debug("stderr: %s" % stderr) + except Exception: + 
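+                # reached e.g. if the xrdcp binary is absent and Popen raises; the
+                # file then stays undone and is retried on a later cycle until
+                # maxAttempts is exceeded (behavior sketch)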
core_utils.dump_error_message(tmpLog) + return_code = 1 + if return_code == 0: + fileSpec.status = 'finished' + else: + overall_errMsg += "file - {0} did not transfer error code {1} ".format(localPath,return_code) + allfiles_transfered = False + errMsg = 'failed with {0}'.format(return_code) + tmpLog.error(errMsg) + # check attemptNr + if fileSpec.attemptNr >= self.maxAttempts: + tmpLog.error('reached maxattempts: {0}, marked it as failed'.format(self.maxAttempts)) + fileSpec.status = 'failed' + + # force update + fileSpec.force_update('status') + tmpLog.debug('file: {0} status: {1}'.format(fileSpec.lfn, fileSpec.status)) + del process, stdout, stderr + + # end loop over output files + + # nothing to transfer + if xrdcpOutput is None: + tmpLog.debug('done with no transfers') + return True, '' + # check if all files were transfered + tmpLog.debug('done') + if allfiles_transfered : + return True, '' + else: + return None, overall_errMsg + + + # zip output files + def zip_output(self, jobspec): + """OBSOLETE : zip functions should be implemented in zipper plugins. + Zip output files. This method loops over jobspec.outFiles, which is a list of zip file's FileSpecs, + to make a zip file for each zip file's FileSpec. FileSpec.associatedFiles is a list of FileSpecs of + associated files to be zipped. The path of each associated file is available in associated + file's FileSpec.path. Once zip files are made, their FileSpec.path, FileSpec.fsize and + FileSpec.chksum need to be set. + + :param jobspec: job specifications + :type jobspec: JobSpec + :return: A tuple of return code (True for success, False otherwise) and error dialog + :rtype: (bool, string) + """ + # make logger + tmpLog = self.make_logger(_logger, 'PandaID={0}'.format(jobspec.PandaID), + method_name='zip_output') + return self.simple_zip_output(jobspec, tmpLog) + + # asynchronous zip output + def async_zip_output(self, jobspec): + """OBSOLETE : zip functions should be implemented in zipper plugins. + Zip output files asynchronously. This method is followed by post_zip_output(), + which is typically useful to trigger an asynchronous zipping mechanism such as batch job. + This method loops over jobspec.outFiles, which is a list of zip file's FileSpecs, to make + a zip file for each zip file's FileSpec. FileSpec.associatedFiles is a list of FileSpecs + of associated files to be zipped. The path of each associated file is available in associated + file's FileSpec.path. + + :param jobspec: job specifications + :type jobspec: JobSpec + :return: A tuple of return code (True for success, False otherwise) and error dialog + :rtype: (bool, string) + """ + # make logger + tmpLog = self.make_logger(_logger, 'PandaID={0}'.format(jobspec.PandaID), + method_name='zip_output') + # set some ID which can be used for lookup in post_zip_output() + groupID = str(uuid.uuid4()) + lfns = [] + for fileSpec in jobspec.outFiles: + lfns.append(fileSpec.lfn) + jobspec.set_groups_to_files({groupID: {'lfns': lfns, + 'groupStatus': 'zipping'} + } + ) + return True, '' + + # post zipping + def post_zip_output(self, jobspec): + """OBSOLETE : zip functions should be implemented in zipper plugins. + This method is executed after async_zip_output(), to do post-processing for zipping. + Once zip files are made, this method needs to look over jobspec.outFiles to set their + FileSpec.path, FileSpec.fsize, and FileSpec.chksum. 
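
For reference, the xrdcp command line that trigger_stage_out assembles in the patch above can be reproduced in isolation. This is a minimal sketch, not part of the plugin: it assumes the example plugin configuration from the module docstring, and the scope/LFN pair is made up for illustration.

    from pandaharvester.harvestermover import mover_utils

    # values copied from the example plugin config above; scope and LFN are assumptions
    dstBasePath = 'root://dcgftp.usatlas.bnl.gov:1096//pnfs/usatlas.bnl.gov/BNLT0D1/rucio'
    localBasePath = '/hpcgpfs01/scratch/benjamin/harvester/rucio-data-area'
    xrdcpOpts = '--retry 3 --cksum adler32 --debug 1'
    scope, lfn = 'mc15_13TeV', 'EVNT.12345._000001.pool.root.1'

    # same path construction and argument order as trigger_stage_out
    dstPath = mover_utils.construct_file_path(dstBasePath, scope, lfn)
    localPath = mover_utils.construct_file_path(localBasePath, scope, lfn)
    args = ['xrdcp', '--nopbar', '--force'] + xrdcpOpts.split() + [localPath, dstPath]
    print(' '.join(args))
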
From bfb6a044eb3853962adfe951e1e5874c675ebe75 Mon Sep 17 00:00:00 2001
From: Doug Benjamin
Date: Thu, 7 May 2020 11:11:37 -0500
Subject: [PATCH 139/181] fix logger reference

---
 pandaharvester/harvesterstager/xrdcp_stager.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pandaharvester/harvesterstager/xrdcp_stager.py b/pandaharvester/harvesterstager/xrdcp_stager.py
index 87ffb4ba..f66e7469 100644
--- a/pandaharvester/harvesterstager/xrdcp_stager.py
+++ b/pandaharvester/harvesterstager/xrdcp_stager.py
@@ -9,7 +9,7 @@
 
 from pandaharvester.harvestermover import mover_utils
 from pandaharvester.harvestercore import core_utils
-from .base_stager import BaseStager
+from pandaharvester.harvesterstager.base_stager import BaseStager
 
 import uuid
 
From 98e500ff1df65050ed1d6af5de2646b8ce5943ec Mon Sep 17 00:00:00 2001
From: Doug Benjamin
Date: Thu, 7 May 2020 11:18:48 -0500
Subject: [PATCH 140/181] fix logger reference

---
 pandaharvester/harvesterstager/xrdcp_stager.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pandaharvester/harvesterstager/xrdcp_stager.py b/pandaharvester/harvesterstager/xrdcp_stager.py
index f66e7469..2909846a 100644
--- a/pandaharvester/harvesterstager/xrdcp_stager.py
+++ b/pandaharvester/harvesterstager/xrdcp_stager.py
@@ -89,8 +89,8 @@ def trigger_stage_out(self, jobspec):
         gc.collect()
 
         # make logger
-        tmpLog = self.make_logger(baseLogger, 'PandaID={0}'.format(jobspec.PandaID),
-                                  method_name='trigger_preparation')
+        tmpLog = self.make_logger(_logger, 'PandaID={0}'.format(jobspec.PandaID),
+                                  method_name='trigger_stage_out')
         tmpLog.debug('start')
         # get the environment
         harvester_env = os.environ.copy()
 
From b7f871130d3b6fd64385add9d75c2901ef082194 Mon Sep 17 00:00:00 2001
From: Doug Benjamin
Date: Thu, 7 May 2020 22:49:34 -0500
Subject: [PATCH 141/181] fix source file path

---
 pandaharvester/harvesterstager/xrdcp_stager.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/pandaharvester/harvesterstager/xrdcp_stager.py b/pandaharvester/harvesterstager/xrdcp_stager.py
index 2909846a..11c9dfbb 100644
--- a/pandaharvester/harvesterstager/xrdcp_stager.py
+++ b/pandaharvester/harvesterstager/xrdcp_stager.py
@@ -110,6 +110,8 @@ def trigger_stage_out(self, jobspec):
             # local path
             localPath = mover_utils.construct_file_path(self.localBasePath, fileAttrs[fileSpec.lfn]['scope'],
                                                         fileSpec.lfn)
+            tmpLog.debug('fileSpec.path - {0} fileSpec.lfn = {1}'.format(fileSpec.path,fileSpec.lfn))
+            localPath = fileSpec.path
             if self.checkLocalPath:
                 # check if already exists
                 if os.path.exists(localPath):

From d918c6955f175bff3596d461d505262030fe72ad Mon Sep 17 00:00:00 2001
From: Doug Benjamin
Date: Fri, 8 May 2020 10:08:08 -0500
Subject: [PATCH 142/181] fix append bug

---
 pandaharvester/harvesterstager/xrdcp_stager.py | 3 ++-
 1 file
11c9dfbb..0905bdfe 100644 --- a/pandaharvester/harvesterstager/xrdcp_stager.py +++ b/pandaharvester/harvesterstager/xrdcp_stager.py @@ -124,7 +124,8 @@ def trigger_stage_out(self, jobspec): if xrdcpOutput is None: xrdcpOutput = [dstPath] else: - xrdcpOutput.append[dstPath] + if dstPath not in xrdcpOutput : + xrdcpOutput.append(dstPath) # transfer using xrdcp one file at a time tmpLog.debug('execute xrdcp') args = ['xrdcp', '--nopbar', '--force'] From ddf0f53632e766d7f4a471f0235d02dcbe12c3fa Mon Sep 17 00:00:00 2001 From: tmaeno Date: Mon, 11 May 2020 15:20:27 +0200 Subject: [PATCH 143/181] to trigger preparation if one or more files have to_prepare. --- pandaharvester/commit_timestamp.py | 2 +- pandaharvester/harvesterbody/preparator.py | 20 +++++++++++++++----- 2 files changed, 16 insertions(+), 6 deletions(-) diff --git a/pandaharvester/commit_timestamp.py b/pandaharvester/commit_timestamp.py index ff7e0750..73f1bfd5 100644 --- a/pandaharvester/commit_timestamp.py +++ b/pandaharvester/commit_timestamp.py @@ -1 +1 @@ -timestamp = "04-05-2020 08:27:55 on flin (by fahui)" +timestamp = "11-05-2020 13:20:27 on master (by tmaeno)" diff --git a/pandaharvester/harvesterbody/preparator.py b/pandaharvester/harvesterbody/preparator.py index 10c1a20a..b49c27b5 100644 --- a/pandaharvester/harvesterbody/preparator.py +++ b/pandaharvester/harvesterbody/preparator.py @@ -195,13 +195,18 @@ def run(self): # check file status if queueConfig.ddmEndpointIn not in fileStatMap: fileStatMap[queueConfig.ddmEndpointIn] = dict() + # check if has to_prepare + hasToPrepare = False + for fileSpec in jobSpec.inFiles: + if fileSpec.status == 'to_prepare': + hasToPrepare = True + break newFileStatusData = [] toWait = False newInFiles = [] for fileSpec in jobSpec.inFiles: if fileSpec.status in ['preparing', 'to_prepare']: newInFiles.append(fileSpec) - if fileSpec.status == 'preparing': updateStatus = False if fileSpec.lfn not in fileStatMap[queueConfig.ddmEndpointIn]: fileStatMap[queueConfig.ddmEndpointIn][fileSpec.lfn] \ @@ -221,14 +226,19 @@ def run(self): fileSpec.groupStatus = groupInfo['groupStatus'] fileSpec.groupUpdateTime = groupInfo['groupUpdateTime'] updateStatus = True - elif 'to_prepare' in fileStatMap[queueConfig.ddmEndpointIn][fileSpec.lfn] or \ - 'triggered' in fileStatMap[queueConfig.ddmEndpointIn][fileSpec.lfn]: + elif (not hasToPrepare and + 'to_prepare' in fileStatMap[queueConfig.ddmEndpointIn][fileSpec.lfn]) or \ + 'triggered' in fileStatMap[queueConfig.ddmEndpointIn][fileSpec.lfn]: # the file is being prepared by another toWait = True + if fileSpec.status != 'preparing': + fileSpec.status = 'preparing' + updateStatus = True else: # change file status if the file is not prepared by another - fileSpec.status = 'to_prepare' - updateStatus = True + if fileSpec.status != 'to_prepare': + fileSpec.status = 'to_prepare' + updateStatus = True # set new status if updateStatus: newFileStatusData.append((fileSpec.fileID, fileSpec.lfn, fileSpec.status)) From c4fcd00b6bad5edf5364ad1ca12b14bb919119ca Mon Sep 17 00:00:00 2001 From: fbarreir Date: Wed, 13 May 2020 15:36:19 +0200 Subject: [PATCH 144/181] k8s: extended exception handling for multiple calls --- pandaharvester/harvestermisc/k8s_utils.py | 52 ++++++++++++++++------- 1 file changed, 37 insertions(+), 15 deletions(-) diff --git a/pandaharvester/harvestermisc/k8s_utils.py b/pandaharvester/harvestermisc/k8s_utils.py index 6d7d58bf..689bd61d 100644 --- a/pandaharvester/harvestermisc/k8s_utils.py +++ b/pandaharvester/harvestermisc/k8s_utils.py @@ -37,7 +37,7 @@ 
def read_yaml_file(self, yaml_file): def create_job_from_yaml(self, yaml_content, work_spec, cert, cert_in_secret=True, cpuadjustratio=100, memoryadjustratio=100): - tmp_log = core_utils.make_logger(base_logger, method_name='create_job_from_yaml') + # tmp_log = core_utils.make_logger(base_logger, method_name='create_job_from_yaml') panda_queues_dict = PandaQueuesDict() queue_name = panda_queues_dict.get_panda_queue_name(work_spec.computingSite) @@ -105,9 +105,15 @@ def create_job_from_yaml(self, yaml_content, work_spec, cert, cert_in_secret=Tru return rsp def get_pods_info(self): + + tmp_log = core_utils.make_logger(base_logger, method_name='get_pods_info') + pods_list = list() - ret = self.corev1.list_namespaced_pod(namespace=self.namespace) + try: + ret = self.corev1.list_namespaced_pod(namespace=self.namespace) + except Exception as _e: + tmp_log.error('Failed call to list_namespaced_pod with: {0}'.format(_e)) for i in ret.items: pod_info = {} @@ -133,16 +139,23 @@ def filter_pods_info(self, pods_list, job_name=None): return pods_list def get_jobs_info(self, job_name=None): + + tmp_log = core_utils.make_logger(base_logger, 'job_name={0}'.format(job_name), method_name='get_jobs_info') + jobs_list = list() field_selector = 'metadata.name=' + job_name if job_name else '' - ret = self.batchv1.list_namespaced_job(namespace=self.namespace, field_selector=field_selector) + try: + ret = self.batchv1.list_namespaced_job(namespace=self.namespace, field_selector=field_selector) + + for i in ret.items: + job_info = {'name': i.metadata.name, 'status': i.status.conditions[0].type, + 'status_reason': i.status.conditions[0].reason, + 'status_message': i.status.conditions[0].message} + jobs_list.append(job_info) + except Exception as _e: + tmp_log.error('Failed call to list_namespaced_job with: {0}'.format(_e)) - for i in ret.items: - job_info = {'name': i.metadata.name, 'status': i.status.conditions[0].type, - 'status_reason': i.status.conditions[0].reason, - 'status_message': i.status.conditions[0].message} - jobs_list.append(job_info) return jobs_list def delete_pods(self, pod_name_list): @@ -156,6 +169,8 @@ def delete_pods(self, pod_name_list): grace_period_seconds=0) except ApiException as _e: rsp['errMsg'] = '' if _e.status == 404 else _e.reason + except Exception as _e: + rsp['errMsg'] = _e.reason else: rsp['errMsg'] = '' ret_list.append(rsp) @@ -163,8 +178,12 @@ def delete_pods(self, pod_name_list): return ret_list def delete_job(self, job_name): - self.batchv1.delete_namespaced_job(name=job_name, namespace=self.namespace, body=self.deletev1, - grace_period_seconds=0) + tmp_log = core_utils.make_logger(base_logger, 'job_name={0}'.format(job_name), method_name='delete_job') + try: + self.batchv1.delete_namespaced_job(name=job_name, namespace=self.namespace, body=self.deletev1, + grace_period_seconds=0) + except Exception as _e: + tmp_log.error('Failed call to delete_namespaced_job with: {0}'.format(_e)) def set_proxy(self, proxy_path): with open(proxy_path) as f: @@ -213,17 +232,20 @@ def create_or_patch_secret(self, file_list, secret_name): data[filename] = base64.b64encode(aux).decode() body = client.V1Secret(data=data, metadata=metadata) try: - rsp = self.corev1.patch_namespaced_secret(name=secret_name, body=body, namespace=self.namespace) - except ApiException as e: - tmp_log.debug('Exception when patch secret: {0} . 
Try to create secret instead...'.format(e))
-            rsp = self.corev1.create_namespaced_secret(body=body, namespace=self.namespace)
+            try:
+                rsp = self.corev1.patch_namespaced_secret(name=secret_name, body=body, namespace=self.namespace)
+            except ApiException as e:
+                tmp_log.debug('Exception when patch secret: {0} . Try to create secret instead...'.format(e))
+                rsp = self.corev1.create_namespaced_secret(body=body, namespace=self.namespace)
+        except Exception as e:
+            tmp_log.error('Exception when patch or creating secret: {0}.'.format(e))
         return rsp
 
     def get_pod_logs(self, pod_name, previous=False):
         tmp_log = core_utils.make_logger(base_logger, method_name='get_pod_logs')
         try:
             rsp = self.corev1.read_namespaced_pod_log(name=pod_name, namespace=self.namespace, previous=previous)
-        except ApiException as e:
+        except Exception as e:
             tmp_log.debug('Exception when getting logs for pod {0} : {1}. Skipped'.format(pod_name, e))
             raise
         else:

From d9937770525595b1147ce4454fb3fc14c6ba7f36 Mon Sep 17 00:00:00 2001
From: fbarreir
Date: Wed, 13 May 2020 15:39:48 +0200
Subject: [PATCH 145/181] k8s: fix

---
 pandaharvester/harvestermisc/k8s_utils.py | 32 +++++++++++------------
 1 file changed, 16 insertions(+), 16 deletions(-)

diff --git a/pandaharvester/harvestermisc/k8s_utils.py b/pandaharvester/harvestermisc/k8s_utils.py
index 689bd61d..2d374648 100644
--- a/pandaharvester/harvestermisc/k8s_utils.py
+++ b/pandaharvester/harvestermisc/k8s_utils.py
@@ -114,22 +114,22 @@ def get_pods_info(self):
             ret = self.corev1.list_namespaced_pod(namespace=self.namespace)
         except Exception as _e:
             tmp_log.error('Failed call to list_namespaced_pod with: {0}'.format(_e))
-
-        for i in ret.items:
-            pod_info = {}
-            pod_info['name'] = i.metadata.name
-            pod_info['start_time'] = i.status.start_time.replace(
-                tzinfo=None) if i.status.start_time else i.status.start_time
-            pod_info['status'] = i.status.phase
-            pod_info['status_conditions'] = i.status.conditions
-            pod_info['job_name'] = i.metadata.labels[
-                'job-name'] if i.metadata.labels and 'job-name' in i.metadata.labels else None
-            pod_info['containers_state'] = []
-            if i.status.container_statuses:
-                for cs in i.status.container_statuses:
-                    if cs.state:
-                        pod_info['containers_state'].append(cs.state)
-            pods_list.append(pod_info)
+        else:
+            for i in ret.items:
+                pod_info = {}
+                pod_info['name'] = i.metadata.name
+                pod_info['start_time'] = i.status.start_time.replace(
+                    tzinfo=None) if i.status.start_time else i.status.start_time
+                pod_info['status'] = i.status.phase
+                pod_info['status_conditions'] = i.status.conditions
+                pod_info['job_name'] = i.metadata.labels[
+                    'job-name'] if i.metadata.labels and 'job-name' in i.metadata.labels else None
+                pod_info['containers_state'] = []
+                if i.status.container_statuses:
+                    for cs in i.status.container_statuses:
+                        if cs.state:
+                            pod_info['containers_state'].append(cs.state)
+                pods_list.append(pod_info)
 
         return pods_list
 
From ea4f1f33e69ac9b315be9a5373b60ba40eb6b19d Mon Sep 17 00:00:00 2001
From: Lincoln Bryant
Date: Wed, 13 May 2020 10:04:51 -0500
Subject: [PATCH 146/181] changed to py3-style print statements

---
 .../harvestertest/stageInTest_globus.py       | 37 +++++++++----------
 1 file changed, 18 insertions(+), 19 deletions(-)

diff --git a/pandaharvester/harvestertest/stageInTest_globus.py b/pandaharvester/harvestertest/stageInTest_globus.py
index 168ee496..ce6db5a0 100644
--- a/pandaharvester/harvestertest/stageInTest_globus.py
+++ b/pandaharvester/harvestertest/stageInTest_globus.py
@@ -9,6 +9,7 @@
 import time
 import threading
 import logging
+from 
__future__ import print_statement from future.utils import iteritems from pandaharvester.harvesterconfig import harvester_config from pandaharvester.harvestercore.job_spec import JobSpec @@ -29,9 +30,9 @@ def dump(obj): for attr in dir(obj): if hasattr( obj, attr ): - print( "obj.%s = %s" % (attr, getattr(obj, attr))) + print("obj.%s = %s" % (attr, getattr(obj, attr))) -print len(sys.argv) +print(len(sys.argv)) queueName = 'ALCF_Theta' job_id = 1111 globus_sleep_time = 15 @@ -249,7 +250,7 @@ def dump(obj): tmpLog.error('Failed to send intial files') sys.exit(3) -print "sleep {0} seconds".format(globus_sleep_time) +print("sleep {0} seconds".format(globus_sleep_time)) time.sleep(globus_sleep_time) # enter polling loop to see if the intial files have transfered @@ -282,7 +283,7 @@ def dump(obj): tmpStr = 'transfer task {0} status: {1}'.format(transferID,transferTasks[transferID]['status']) tmpLog.debug(tmpStr) if NotFound : - print "sleep {0} seconds".format(globus_sleep_time) + print("sleep {0} seconds".format(globus_sleep_time)) time.sleep(globus_sleep_time) ++iloop @@ -293,39 +294,37 @@ def dump(obj): #dump(queueConfig) -print "plugin={0}".format(preparatorCore.__class__.__name__) +print("plugin={0}".format(preparatorCore.__class__.__name__)) -print "testing stagein:" -print "BasePath from preparator configuration: %s " % preparatorCore.basePath +print("testing stagein:") +print("BasePath from preparator configuration: %s " % preparatorCore.basePath) tmpStat, tmpOut = preparatorCore.trigger_preparation(jobSpec) if tmpStat: - print " OK" + print(" OK") else: - print " NG {0}".format(tmpOut) + print(" NG {0}".format(tmpOut)) -print "sleep {0} seconds".format(globus_sleep_time) +print("sleep {0} seconds".format(globus_sleep_time)) time.sleep(globus_sleep_time) -print "testing status check" +print("testing status check") while True: tmpStat, tmpOut = preparatorCore.check_stage_in_status(jobSpec) if tmpStat == True: - print " OK" + print(" OK") break elif tmpStat == False: - print " NG {0}".format(tmpOut) + print(" NG {0}".format(tmpOut)) sys.exit(1) else: - print " still running. sleep 1 min" + print(" still running. 
sleep 1 min") time.sleep(60) -print - -print "checking path resolution" +print("checking path resolution") tmpStat, tmpOut = preparatorCore.resolve_input_paths(jobSpec) if tmpStat: - print " OK {0}".format(jobSpec.jobParams['inFilePaths']) + print(" OK {0}".format(jobSpec.jobParams['inFilePaths'])) else: - print " NG {0}".format(tmpOut) + print(" NG {0}".format(tmpOut)) From b7860eb09462dd30c4d6ee96c2726e049699d2b8 Mon Sep 17 00:00:00 2001 From: Lincoln Bryant Date: Wed, 13 May 2020 10:09:44 -0500 Subject: [PATCH 147/181] iteritems has been replaced with simply items in python 3 --- pandaharvester/harvestertest/stageInTest_globus.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandaharvester/harvestertest/stageInTest_globus.py b/pandaharvester/harvestertest/stageInTest_globus.py index ce6db5a0..2d8e9239 100644 --- a/pandaharvester/harvestertest/stageInTest_globus.py +++ b/pandaharvester/harvestertest/stageInTest_globus.py @@ -61,7 +61,7 @@ def dump(obj): tmpLog = core_utils.make_logger(_logger, method_name='stageInTest_go_preparator') tmpLog.debug('start') -for loggerName, loggerObj in logging.Logger.manager.loggerDict.iteritems(): +for loggerName, loggerObj in logging.Logger.manager.loggerDict.items(): #print "loggerName - {}".format(loggerName) if loggerName.startswith('panda.log'): if len(loggerObj.handlers) == 0: From db50e8a891f041f7dd949a94bf167faa19213879 Mon Sep 17 00:00:00 2001 From: Lincoln Bryant Date: Wed, 13 May 2020 10:17:43 -0500 Subject: [PATCH 148/181] unicode objects need to be encoded before hashing in Py3 --- pandaharvester/harvestertest/stageInTest_globus.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandaharvester/harvestertest/stageInTest_globus.py b/pandaharvester/harvestertest/stageInTest_globus.py index 2d8e9239..2b8ed9bd 100644 --- a/pandaharvester/harvestertest/stageInTest_globus.py +++ b/pandaharvester/harvestertest/stageInTest_globus.py @@ -181,7 +181,7 @@ def dump(obj): assFileSpec.fsize = random.randint(10, 100) # create source file hash = hashlib.md5() - hash.update('%s:%s' % (fileSpec.scope, fileSpec.lfn)) + hash.update(('%s:%s' % (fileSpec.scope, fileSpec.lfn)).encode('utf-8')) hash_hex = hash.hexdigest() correctedscope = "/".join(scope.split('.')) fileSpec.path = "{endPoint}/{scope}/{hash1}/{hash2}/{lfn}".format(endPoint=queueConfig.preparator['Globus_dstPath'], From cfed66066b620e70ad4ecf8fbd939d7e07e1fe9b Mon Sep 17 00:00:00 2001 From: fbarreir Date: Thu, 14 May 2020 11:39:39 +0200 Subject: [PATCH 149/181] k8s: issue with merge --- pandaharvester/harvestermisc/k8s_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandaharvester/harvestermisc/k8s_utils.py b/pandaharvester/harvestermisc/k8s_utils.py index e71c5781..e0e8c9a3 100644 --- a/pandaharvester/harvestermisc/k8s_utils.py +++ b/pandaharvester/harvestermisc/k8s_utils.py @@ -39,7 +39,7 @@ def read_yaml_file(self, yaml_file): def create_job_from_yaml(self, yaml_content, work_spec, container_image, executable, args, cert, cert_in_secret=True, cpu_adjust_ratio=100, memory_adjust_ratio=100,): - # tmp_log = core_utils.make_logger(base_logger, method_name='create_job_from_yaml') + tmp_log = core_utils.make_logger(base_logger, method_name='create_job_from_yaml') # consider PULL mode as default, unless specified submit_mode = 'PULL' From 79e892d1a222921771253a7e5031e7b8d7a41e92 Mon Sep 17 00:00:00 2001 From: fbarreir Date: Fri, 15 May 2020 11:09:45 +0200 Subject: [PATCH 150/181] create_or_patch_secret added rsp variable initialization --- 
pandaharvester/harvestermisc/k8s_utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pandaharvester/harvestermisc/k8s_utils.py b/pandaharvester/harvestermisc/k8s_utils.py
index e0e8c9a3..c298850b 100644
--- a/pandaharvester/harvestermisc/k8s_utils.py
+++ b/pandaharvester/harvestermisc/k8s_utils.py
@@ -271,7 +271,7 @@ def create_or_patch_secret(self, file_list, secret_name):
         # api_version = 'v1'
         # kind = 'Secret'
         # type='kubernetes.io/tls'
-
+        rsp = None
         tmp_log = core_utils.make_logger(base_logger, method_name='create_or_patch_secret')
 
         metadata = {'name': secret_name, 'namespace': self.namespace}

From ddc4c1ca8afd42c36f8ede32f78305f5bff9adbf Mon Sep 17 00:00:00 2001
From: Tadashi Maeno
Date: Fri, 15 May 2020 17:39:09 +0200
Subject: [PATCH 151/181] small cleanup

---
 pandaharvester/harvestertest/stageInTest_globus.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/pandaharvester/harvestertest/stageInTest_globus.py b/pandaharvester/harvestertest/stageInTest_globus.py
index 2b8ed9bd..2957a70e 100644
--- a/pandaharvester/harvestertest/stageInTest_globus.py
+++ b/pandaharvester/harvestertest/stageInTest_globus.py
@@ -9,7 +9,6 @@
 import time
 import threading
 import logging
-from __future__ import print_statement
 from future.utils import iteritems
 from pandaharvester.harvesterconfig import harvester_config
 from pandaharvester.harvestercore.job_spec import JobSpec
@@ -61,7 +60,7 @@ def dump(obj):
 tmpLog = core_utils.make_logger(_logger, method_name='stageInTest_go_preparator')
 tmpLog.debug('start')
 
-for loggerName, loggerObj in logging.Logger.manager.loggerDict.items():
+for loggerName, loggerObj in iteritems(logging.Logger.manager.loggerDict):
     #print "loggerName - {}".format(loggerName)
     if loggerName.startswith('panda.log'):
         if len(loggerObj.handlers) == 0:

From 5d33149f6c5607b7e7ff8dc5ff6c7a719c0a111e Mon Sep 17 00:00:00 2001
From: tmaeno
Date: Wed, 20 May 2020 00:37:10 +0200
Subject: [PATCH 152/181] added containerPrefix to AuxExtractor

---
 pandaharvester/commit_timestamp.py                 | 2 +-
 pandaharvester/harvesterextractor/aux_extractor.py | 3 +++
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/pandaharvester/commit_timestamp.py b/pandaharvester/commit_timestamp.py
index 73f1bfd5..c3bc3441 100644
--- a/pandaharvester/commit_timestamp.py
+++ b/pandaharvester/commit_timestamp.py
@@ -1 +1 @@
-timestamp = "11-05-2020 13:20:27 on master (by tmaeno)"
+timestamp = "19-05-2020 22:37:10 on master (by tmaeno)"

diff --git a/pandaharvester/harvesterextractor/aux_extractor.py b/pandaharvester/harvesterextractor/aux_extractor.py
index 97aab745..67253806 100644
--- a/pandaharvester/harvesterextractor/aux_extractor.py
+++ b/pandaharvester/harvesterextractor/aux_extractor.py
@@ -6,6 +6,7 @@
 class AuxExtractor(BaseExtractor):
     # constructor
     def __init__(self, **kwarg):
+        self.containerPrefix = None
         BaseExtractor.__init__(self, **kwarg)
 
     # get auxiliary input files
@@ -32,5 +33,7 @@ def get_aux_inputs(self, jobspec):
         # extract container image
         if 'container_name' in jobspec.jobParams:
             url = jobspec.jobParams['container_name']
+            if self.containerPrefix is not None and not url.startswith(self.containerPrefix):
+                url = self.containerPrefix + url
             url_list.append(url)
         return self.make_aux_inputs(url_list)
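
A quick sketch of what the new containerPrefix option does in practice; the prefix value and job parameter below are assumptions for illustration, relying on the fact that harvester plugin parameters are applied as constructor keyword arguments:

    from pandaharvester.harvesterextractor.aux_extractor import AuxExtractor

    # hypothetical configuration: force a docker:// scheme on bare image names
    ae = AuxExtractor(containerPrefix='docker://')
    # for a jobspec whose jobParams['container_name'] is 'atlas.athena:21.0.15',
    # get_aux_inputs(jobspec) now collects the prefixed URL 'docker://atlas.athena:21.0.15'
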
From 7a1d6af0b286afb6560be5bd0d34d3b651997470 Mon Sep 17 00:00:00 2001
From: Pavlo Svirin
Date: Sat, 16 May 2020 09:44:08 +0000
Subject: [PATCH 153/181] Added analysis for postmortem file

---
 .../harvestermonitor/slurm_squeue_monitor.py  | 29 +++++++++++++++++++
 1 file changed, 29 insertions(+)

diff --git a/pandaharvester/harvestermonitor/slurm_squeue_monitor.py b/pandaharvester/harvestermonitor/slurm_squeue_monitor.py
index 7942ce0d..d9a68e01 100644
--- a/pandaharvester/harvestermonitor/slurm_squeue_monitor.py
+++ b/pandaharvester/harvestermonitor/slurm_squeue_monitor.py
@@ -4,6 +4,9 @@
 except ImportError:
     import subprocess
 
+import json
+import os
+
 from pandaharvester.harvestercore import core_utils
 from pandaharvester.harvestercore.work_spec import WorkSpec
 from pandaharvester.harvestercore.plugin_base import PluginBase
@@ -14,6 +17,8 @@
 
 # monitor for SLURM batch system with squeue
 class SlurmSqueueMonitor(PluginBase):
+    _HARVESTER_POSTMORTEM_FILENAME="FINISHED"
+
     # constructor
     def __init__(self, **kwarg):
         PluginBase.__init__(self, **kwarg)
@@ -25,6 +30,25 @@ def check_workers(self, workspec_list):
             # make logger
             tmpLog = self.make_logger(baseLogger, 'workerID={0}'.format(workSpec.workerID),
                                       method_name='check_workers')
+            # here try to load the postmortem file
+            current_postmortem_fname = '%s/%s' %(workSpec.accessPoint, SlurmSqueueMonitor._HARVESTER_POSTMORTEM_FILENAME)
+
+            if os.path.exists(current_postmortem_fname):
+                with open(current_postmortem_fname) as postmortem:
+                    try:
+                        worker_status_json = json.load(postmortem)
+                        if 'worker_status' in worker_status_json:
+                            worker_status = None
+                            if worker_status_json['worker_status']=='finished':
+                                worker_status = WorkSpec.ST_finished
+                            if worker_status_json['worker_status']=='failed':
+                                worker_status = WorkSpec.ST_failed
+                            if worker_status is not None:
+                                retList.append((worker_status, ''))
+                                continue
+                    except json.JSONDecodeError:
+                        tmpLog.debug('Not able to parse JSON in postmortem for a worker: %s, continuing with SLURM CLI' % current_postmortem_fname)
+
             # command
             comStr = "squeue -j {0}".format(workSpec.batchID)
             # check
@@ -71,3 +95,8 @@ def check_workers(self, workspec_list):
 #                newStatus = WorkSpec.ST_failed
             retList.append((newStatus, errStr))
         return True, retList
+
+
+    def _get_worker_completion_details():
+        # try to open FINISHED file
+        pass
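
For context, a minimal sketch of the postmortem file this monitor looks for. Only the worker_status key is read, with finished and failed as the recognized values; the access-point path below is an assumed example:

    import json
    import os

    # hypothetical worker-side code dropping the FINISHED marker into its access point
    access_point = '/var/harvester/workers/12345'
    with open(os.path.join(access_point, 'FINISHED'), 'w') as postmortem:
        json.dump({'worker_status': 'finished'}, postmortem)
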
From 3fffe452b5a6265acfdd5ab106f240bf50704707 Mon Sep 17 00:00:00 2001
From: Pavlo Svirin
Date: Sat, 16 May 2020 09:53:26 +0000
Subject: [PATCH 154/181] Added stage-in test for GO

---
 .../harvestertest/stageInTest_GlobusOnline.py | 77 +++++++++++++++++++
 1 file changed, 77 insertions(+)
 create mode 100644 pandaharvester/harvestertest/stageInTest_GlobusOnline.py

diff --git a/pandaharvester/harvestertest/stageInTest_GlobusOnline.py b/pandaharvester/harvestertest/stageInTest_GlobusOnline.py
new file mode 100644
index 00000000..f3fb4b89
--- /dev/null
+++ b/pandaharvester/harvestertest/stageInTest_GlobusOnline.py
@@ -0,0 +1,77 @@
+import sys
+import time
+from pandaharvester.harvestercore.queue_config_mapper import QueueConfigMapper
+from pandaharvester.harvestercore.job_spec import JobSpec
+from pilot.info.filespec import FileSpec
+
+queueName = sys.argv[1]
+
+queueConfigMapper = QueueConfigMapper()
+
+queueConfig = queueConfigMapper.get_queue(queueName)
+
+jobSpec = JobSpec()
+new_file_data = {'scope': 'test',
+                 'lfn': 'TXT.19772875._044894.tar.gz.1', 'attemptNr': 0 }
+new_file_spec = FileSpec(filetype='input', **new_file_data)
+new_file_spec.attemptNr = 0
+new_file_spec.path = '/home/psvirin/harvester3'
+
+jobSpec.inFiles = {new_file_spec}
+jobSpec.outFiles = {}
+jobSpec.jobParams = {
+    'inFiles': 'TXT.19772875._044894.tar.gz.1',
+    'scopeIn': 'mc15_13TeV',
+    'fsize': '658906675',
+    'GUID': '7e3776f9bb0af341b03e59d3de895a13',
+    'checksum': 'ad:3734bdd9',
+    'ddmEndPointIn': 'BNL-OSG2_DATADISK',
+    'realDatasetsIn': 'mc15_13TeV.363638.MGPy8EG_N30NLO_Wmunu_Ht500_700_BFilter.merge.DAOD_STDM4.e4944_s2726_r7772_r7676_p2842_tid09596175_00',
+    }
+jobSpec.computingSite = queueName
+jobSpec.PandaID='11111'
+
+from pandaharvester.harvestercore.plugin_factory import PluginFactory
+
+pluginFactory = PluginFactory()
+
+# get plugin
+preparatorCore = pluginFactory.get_plugin(queueConfig.preparator)
+print ("plugin={0}".format(preparatorCore.__class__.__name__))
+
+print(jobSpec)
+
+print ("testing stagein:")
+print ("BasePath from preparator configuration: %s " % preparatorCore.basePath)
+preparatorCore.basePath = preparatorCore.basePath + "/testdata/"
+print ("basePath redefined for test data: %s " % preparatorCore.basePath)
+
+tmpStat, tmpOut = preparatorCore.trigger_preparation(jobSpec)
+if tmpStat:
+    print (" OK")
+else:
+    print (" NG {0}".format(tmpOut))
+
+print
+
+print ("testing status check")
+while True:
+    tmpStat, tmpOut = preparatorCore.check_stage_in_status(jobSpec)
+    if tmpStat is True:
+        print (" OK")
+        break
+    elif tmpStat is False:
+        print (" NG {0}".format(tmpOut))
+        sys.exit(1)
+    else:
+        print (" still running. sleep 1 min")
+        time.sleep(60)
+
+print
+
+print ("checking path resolution")
+tmpStat, tmpOut = preparatorCore.resolve_input_paths(jobSpec)
+if tmpStat:
+    print (" OK {0}".format(jobSpec.jobParams['inFilePaths']))
+else:
+    print (" NG {0}".format(tmpOut))

From 4d1ad24e773f01741495738cacb698bac63f020c Mon Sep 17 00:00:00 2001
From: Pavlo Svirin
Date: Mon, 18 May 2020 20:03:36 +0000
Subject: [PATCH 155/181] Added draft test for container retrieval

---
 .../container_auxpreparator_test.py           | 23 +++++++++++++++
 1 file changed, 23 insertions(+)
 create mode 100644 pandaharvester/harvestertest/container_auxpreparator_test.py

diff --git a/pandaharvester/harvestertest/container_auxpreparator_test.py b/pandaharvester/harvestertest/container_auxpreparator_test.py
new file mode 100644
index 00000000..372d49bd
--- /dev/null
+++ b/pandaharvester/harvestertest/container_auxpreparator_test.py
@@ -0,0 +1,23 @@
+import json
+import sys
+import time
+from pprint import pprint
+
+#from pandaharvester.harvestercore.queue_config_mapper import QueueConfigMapper
+from pandaharvester.harvestercore.job_spec import JobSpec
+from pandaharvester.harvesterextractor.aux_extractor import AuxExtractor
+
+job_data_json = """{"container_name": "atlas.athena:21.0.15_DBRelease-100.0.2_Patched", "PandaID": 4731765799, "jobsetID": 1,"taskID": 1, "transformation": "Sim_tf.py", "attemptNr": 1,"currentPriority": 1, "outFiles": "", "realDatasets": "", "ddmEndPointOut": "", "scopeOut": "", "scopeLog": "", "logFile": "", "logGUID": "", "files": [{"row_id": 30133581516, "pandaid": 4731765799, "modificationtime": "2020-05-18T10:29:21", "guid": "ADCF2DEC-3412-C64B-B8FB-E8629680AA4D", "lfn": "EVNT.21265061._000036.pool.root.1", "type": "input", "dataset": "mc16_13TeV.830011.H7EG_jetjet_JZ1.merge.EVNT.e7954_e7400_tid21265061_00", "status": "ready", "proddblock": "mc16_13TeV:mc16_13TeV.830011.H7EG_jetjet_JZ1.merge.EVNT.e7954_e7400_tid21265061_00", "proddblocktoken": "", "dispatchdblock": "", "dispatchdblocktoken": "", "destinationdblock": "", "destinationdblocktoken": "", "destinationse": "", "fsize": 307204932, "md5sum": "", "checksum": "ad:698afd11", "scope": "mc16_13TeV", "jeditaskid": 21265064, "datasetid": 311267780, "fileid": 21006488431, "attemptnr": 22, "destination": " ", "fsizemb": "292.97", "ruciodatasetname":
"mc16_13TeV:mc16_13TeV.830011.H7EG_jetjet_JZ1.merge.EVNT.e7954_e7400_tid21265061_00", "datasetname": "mc16_13TeV:mc16_13TeV.830011.H7EG_jetjet_JZ1.merge.EVNT.e7954_e7400_tid21265061_00", "ddmsite": "LRZ-LMU", "creationdate": "2020-05-18T10:29:21", "oldfiletable": 1, "destinationdblock_vis": "", "maxattempt": 30}, {"row_id": 30133581517, "pandaid": 4731765799, "modificationtime": "2020-05-18T10:29:21", "guid": "819a609a-1650-455b-8980-9c190cb77064", "lfn": "352", "type": "pseudo_input", "dataset": "seq_number", "status": "unknown", "proddblock": "seq_number", "proddblocktoken": "", "dispatchdblock": "", "dispatchdblocktoken": "", "destinationdblock": "", "destinationdblocktoken": "", "destinationse": "", "fsize": 0, "md5sum": "", "checksum": "", "scope": "", "jeditaskid": 21265064, "datasetid": 311267781, "fileid": 21006489271, "attemptnr": 21, "destination": " ", "fsizemb": "0.00", "ruciodatasetname": "seq_number", "datasetname": "seq_number", "ddmsite": "LRZ-LMU", "creationdate": "2020-05-18T10:29:21", "oldfiletable": 1, "destinationdblock_vis": ""}, {"row_id": 30133581519, "pandaid": 4731765799, "modificationtime": "2020-05-18T10:29:21", "guid": "", "lfn": "HITS.21265064._002580.pool.root.1", "type": "output", "dataset": "mc16_13TeV.830011.H7EG_jetjet_JZ1.simul.HITS.e7954_e7400_s3126_tid21265064_00", "status": "failed", "proddblock": "", "proddblocktoken": "", "dispatchdblock": "", "dispatchdblocktoken": "", "destinationdblock": "mc16_13TeV.830011.NA.simul.HITS.21265064_sub2290908", "destinationdblocktoken": "dst:TOKYO-LCG2_DATADISK", "destinationse": "nucleus:TOKYO-LCG2", "fsize": 0, "md5sum": "", "checksum": "", "scope": "mc16_13TeV", "jeditaskid": 21265064, "datasetid": 311267782, "fileid": 21092403937, "attemptnr": 0, "destination": " ", "fsizemb": "0.00", "ruciodatasetname": "mc16_13TeV.830011.H7EG_jetjet_JZ1.simul.HITS.e7954_e7400_s3126_tid21265064_00", "datasetname": "mc16_13TeV.830011.H7EG_jetjet_JZ1.simul.HITS.e7954_e7400_s3126_tid21265064_00", "ddmsite": "TOKYO-LCG2", "dsttoken": "DATADISK", "creationdate": "2020-05-18T10:29:21", "oldfiletable": 1, "destinationdblock_vis": "sub2290908"}, {"row_id": 30133581518, "pandaid": 4731765799, "modificationtime": "2020-05-18T10:29:21", "guid": "73a868bd-acd5-4fb4-ade9-a66badd0e5a9", "lfn": "log.21265064._002580.job.log.tgz.1", "type": "log", "dataset": "mc16_13TeV.830011.H7EG_jetjet_JZ1.simul.log.e7954_e7400_s3126_tid21265064_00", "status": "failed", "proddblock": "", "proddblocktoken": "", "dispatchdblock": "", "dispatchdblocktoken": "", "destinationdblock": "mc16_13TeV.830011.NA.simul.log.21265064_sub2290905", "destinationdblocktoken": "ddd:LRZ-LMU_DATADISK", "destinationse": "LRZ-LMU_MUC", "fsize": 0, "md5sum": "", "checksum": "", "scope": "mc16_13TeV", "jeditaskid": 21265064, "datasetid": 311267783, "fileid": 21092403936, "attemptnr": 0, "destination": " ", "fsizemb": "0.00", "ruciodatasetname": "mc16_13TeV.830011.H7EG_jetjet_JZ1.simul.log.e7954_e7400_s3126_tid21265064_00", "datasetname": "mc16_13TeV.830011.H7EG_jetjet_JZ1.simul.log.e7954_e7400_s3126_tid21265064_00", "ddmsite": "LRZ-LMU", "creationdate": "2020-05-18T10:29:21", "oldfiletable": 1, "destinationdblock_vis": "sub2290905"}], "job": {"pandaid": 4731765799, "jobdefinitionid": 0, "schedulerid": "", "pilotid": "", "creationtime": "2020-05-18 03:42:54", "creationhost": "", "modificationtime": "2020-05-18 10:29:21", "modificationhost": "aipanda058.cern.ch", "atlasrelease": "Atlas-21.0.15", "transformation": "Sim_tf.py", "homepackage": "AtlasOffline/21.0.15", "prodserieslabel": 
"pandatest", "prodsourcelabel": "managed", "produserid": "dhirsch", "gshare": "Validation", "assignedpriority": 888, "currentpriority": 888, "attemptnr": 22, "maxattempt": 22, "jobname": "mc16_13TeV.830011.H7EG_jetjet_JZ1.simul.e7954_e7400_s3126.4727254713", "maxcpucount": 4611, "maxcpuunit": "kSI2kseconds", "maxdiskcount": 5255, "maxdiskunit": "MB ", "ipconnectivity": "yes ", "minramcount": 44100, "minramunit": "MB", "starttime": null, "endtime": "2020-05-18T10:29:21", "cpuconsumptiontime": 0, "cpuconsumptionunit": "", "commandtopilot": "tobekilled", "transexitcode": "", "piloterrorcode": 0, "piloterrordiag": "", "exeerrorcode": 0, "exeerrordiag": "", "superrorcode": 0, "superrordiag": "", "ddmerrorcode": 0, "ddmerrordiag": "", "brokerageerrorcode": 0, "brokerageerrordiag": "", "jobdispatchererrorcode": 0, "jobdispatchererrordiag": "", "taskbuffererrorcode": 100, "taskbuffererrordiag": "reassigned by JEDI", "computingsite": "LRZ-LMU_MUC", "computingelement": "", "jobparameters": "", "metadata": "", "proddblock": "mc16_13TeV:mc16_13TeV.830011.H7EG_jetjet_JZ1.merge.EVNT.e7954_e7400_tid21265061_00", "dispatchdblock": "", "destinationdblock": "mc16_13TeV.830011.H7EG_jetjet_JZ1.simul.log.e7954_e7400_s3126_tid21265064_00", "destinationse": "dst:TOKYO-LCG2_DATADISK/ATLASDATADISK", "nevents": 0, "grid": "", "cloud": "WORLD", "cpuconversion": null, "sourcesite": "", "destinationsite": "", "transfertype": "", "taskid": 21265064, "cmtconfig": "x86_64-slc6-gcc49-opt", "statechangetime": "2020-05-18 10:29:21", "proddbupdatetime": "2020-05-18T03:42:54", "lockedby": "jedi", "relocationflag": 1, "jobexecutionid": 0, "vo": "atlas", "pilottiming": "", "workinggroup": "AP_MCGN", "processingtype": "simul", "produsername": "dhirsch", "ninputfiles": null, "countrygroup": "", "batchid": "", "parentid": null, "specialhandling": "ddm:rucio,hc:DE,de", "jobsetid": 30864, "corecount": 96, "ninputdatafiles": 1, "inputfiletype": "EVNT", "inputfileproject": "mc16_13TeV", "inputfilebytes": 307204932, "noutputdatafiles": 0, "outputfilebytes": 0, "jobmetrics": "", "workqueue_id": 16, "jeditaskid": 21265064, "jobstatus": "closed", "actualcorecount": null, "reqid": 30864, "nucleus": "TOKYO-LCG2", "jobsubstatus": "toreassign", "eventservice": "ordinary", "hs06": 1920, "hs06sec": null, "maxrss": null, "maxvmem": null, "maxswap": null, "maxpss": null, "avgrss": null, "avgvmem": null, "avgswap": null, "avgpss": null, "maxwalltime": 4611, "resourcetype": "MCORE", "failedattempt": 4, "totrchar": null, "totwchar": null, "totrbytes": null, "totwbytes": null, "raterchar": null, "ratewchar": null, "raterbytes": null, "ratewbytes": null, "diskio": null, "memoryleak": null, "memoryleakx2": null, "container_name": "atlas.athena:21.0.15_DBRelease-100.0.2_Patched", "outputfiletype": "log", "homecloud": "DE", "errorinfo": "", "jobinfo": "", "duration": "", "durationsec": 0, "durationmin": 0, "waittime": "0:6:46:27", "priorityrange": "800:899", "jobsetrange": "30800:30899"}, "dsfiles": []}""" + +job_data = json.loads(job_data_json) + +job_data["jobPars"] = '--inputEVNTFile=EVNT.21265061._000036.pool.root.1 --maxEvents=1000 --postInclude "default:RecJobTransforms/UseFrontier.py" --preExec "EVNTtoHITS:simFlags.SimBarcodeOffset.set_Value_and_Lock(200000)" "EVNTtoHITS:simFlags.TRTRangeCut=30.0;simFlags.TightMuonStepping=True" --preInclude "EVNTtoHITS:SimulationJobOptions/preInclude.BeamPipeKill.py,SimulationJobOptions/preInclude.FrozenShowersFCalOnly.py" --skipEvents=1000 --firstEvent=331001 --outputHITSFile=HITS.21265064._002580.pool.root.1 
--physicsList=FTFP_BERT_ATL_VALIDATION --randomSeed=352 --DBRelease="all:current" --conditionsTag "default:OFLCOND-MC16-SDR-14" --geometryVersion="default:ATLAS-R2-2016-01-00-01_VALIDATION" --runNumber=830011 --AMITag=s3126 --DataRunNumber=284500 --simulator=FullG4 --truthStrategy=MC15aPlus' + + +jobSpec = JobSpec() +jobSpec.convert_job_json(job_data) + +#pprint(jobSpec.jobParams) + +ae = AuxExtractor() +print(ae.get_aux_inputs(jobSpec)) From 106f5ce4f0353379fc3f4fae43dd409cd89c01fc Mon Sep 17 00:00:00 2001 From: fbarreir Date: Wed, 20 May 2020 17:06:52 +0200 Subject: [PATCH 156/181] k8s: reverted naming for cpu/mem adjust ratio config values --- pandaharvester/harvestersubmitter/k8s_submitter.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/pandaharvester/harvestersubmitter/k8s_submitter.py b/pandaharvester/harvestersubmitter/k8s_submitter.py index b8ba61a1..07edff82 100644 --- a/pandaharvester/harvestersubmitter/k8s_submitter.py +++ b/pandaharvester/harvestersubmitter/k8s_submitter.py @@ -57,15 +57,15 @@ def __init__(self, **kwarg): # CPU adjust ratio try: - self.cpu_adjust_ratio + self.cpuAdjustRatio except AttributeError: - self.cpu_adjust_ratio = 100 + self.cpuAdjustRatio = 100 # Memory adjust ratio try: - self.memory_adjust_ratio + self.memoryAdjustRatio except AttributeError: - self.memory_adjust_ratio = 100 + self.memoryAdjustRatio = 100 def parse_params(self, job_params): tmp_log = self.make_logger(base_logger, method_name='parse_params') @@ -173,8 +173,8 @@ def submit_k8s_worker(self, work_spec): rsp, yaml_content_final = self.k8s_client.create_job_from_yaml(yaml_content, work_spec, container_image, executable, args, cert, cert_in_secret=use_secret, - cpu_adjust_ratio=self.cpu_adjust_ratio, - memory_adjust_ratio=self.memory_adjust_ratio) + cpu_adjust_ratio=self.cpuAdjustRatio, + memory_adjust_ratio=self.memoryAdjustRatio) except Exception as _e: tmp_log.error(traceback.format_exc()) err_str = 'Failed to create a JOB; {0}'.format(_e) From 018ea96b8e92db042daad20c2d7da74d3d05449f Mon Sep 17 00:00:00 2001 From: fbarreir Date: Fri, 22 May 2020 11:08:02 +0200 Subject: [PATCH 157/181] fixed arguments in submitterTest --- .../harvesterbody/worker_adjuster.py | 2 +- pandaharvester/harvestertest/submitterTest.py | 33 +++++++++++++++++-- 2 files changed, 32 insertions(+), 3 deletions(-) diff --git a/pandaharvester/harvesterbody/worker_adjuster.py b/pandaharvester/harvesterbody/worker_adjuster.py index 93e0fd03..bc27a116 100644 --- a/pandaharvester/harvesterbody/worker_adjuster.py +++ b/pandaharvester/harvesterbody/worker_adjuster.py @@ -214,7 +214,7 @@ def define_num_workers(self, static_num_workers, site_name): # TODO: needs to be recalculated simple_rt_nw_list = [] for job_type in _d: # jt: job type - for resource_type in _d[job_type]: # rt: resource type + for resource_type in _d[job_type]: # rt: resource type simple_rt_nw_list.append([resource_type, _d[job_type][resource_type].get('nNewWorkers', 0), 0]) _countdown = n_new_workers_max_agg diff --git a/pandaharvester/harvestertest/submitterTest.py b/pandaharvester/harvestertest/submitterTest.py index 9e995f54..200cd3f7 100644 --- a/pandaharvester/harvestertest/submitterTest.py +++ b/pandaharvester/harvestertest/submitterTest.py @@ -14,10 +14,39 @@ signal_utils.set_suicide_handler(None) os.wait() else: + + if len(sys.argv) not in (2, 4): + print("Wrong number of parameters. 
You can either:")
+        print("  - specify the queue name")
+        print("  - specify the queue name, jobType (managed, user) and resourceType (SCORE, SCORE_HIMEM, MCORE, MCORE_HIMEM)")
+        sys.exit(0)
+
     queueName = sys.argv[1]
 
     queueConfigMapper = QueueConfigMapper()
 
     queueConfig = queueConfigMapper.get_queue(queueName)
 
+    if queueConfig.prodSourceLabel in ('user', 'managed'):
+        jobType = queueConfig.prodSourceLabel
+    else:
+        jobType = 'managed'  # default, can be overwritten by parameters
+
+    resourceType = 'SCORE'  # default, can be overwritten by parameters
+
+    if len(sys.argv) == 4:
+        # jobType should be 'managed' or 'user'. If not specified, it will default to a production job
+        if sys.argv[2] in ('user', 'managed'):
+            jobType = sys.argv[2]
+        else:
+            print ('value for jobType not valid, defaulted to {0}'.format(jobType))
+
+        # resourceType should be 'SCORE', 'SCORE_HIMEM', 'MCORE', 'MCORE_HIMEM'. If not specified defaults to single core
+        if sys.argv[3] in ('SCORE', 'SCORE_HIMEM', 'MCORE', 'MCORE_HIMEM'):
+            resourceType = sys.argv[3]
+        else:
+            print ('value for resourceType not valid, defaulted to {0}'.format(resourceType))
+
+    print ('Running with queueName:{0}, jobType:{1}, resourceType:{2}'.format(queueName, jobType, resourceType))
+
     pluginFactory = PluginFactory()
 
     com = CommunicatorPool()
@@ -28,7 +57,7 @@
     jobs, errStr = com.get_jobs(queueConfig.queueName, 'nodeName', queueConfig.prodSourceLabel,
                                 'computingElement', 1, None)
     if len(jobs) == 0:
-        print ("Failed to get jobs at {0} due to {1}".format(queueConfig.queueName, errStr))
+        print("Failed to get jobs at {0} due to {1}".format(queueConfig.queueName, errStr))
         sys.exit(0)
 
     jobSpec = JobSpec()
@@ -42,7 +71,7 @@
     jobSpecList.append(jobSpec)
 
     maker = pluginFactory.get_plugin(queueConfig.workerMaker)
-    workSpec = maker.make_worker(jobSpecList, queueConfig, 'SCORE')  # TODO: needs to be thought
+    workSpec = maker.make_worker(jobSpecList, queueConfig, jobType, resourceType)
 
     workSpec.accessPoint = queueConfig.messenger['accessPoint']
     workSpec.mapType = queueConfig.mapType
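
As a usage note, the reworked test accepts either one or three command-line arguments; the queue name below is a made-up example:

    # single-core production worker (defaults: jobType=managed, resourceType=SCORE)
    python pandaharvester/harvestertest/submitterTest.py EXAMPLE_QUEUE

    # analysis worker on a multi-core resource type
    python pandaharvester/harvestertest/submitterTest.py EXAMPLE_QUEUE user MCORE
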
From 234f772d168e744d13bc316b9acc1a45269e02fa Mon Sep 17 00:00:00 2001
From: tmaeno
Date: Fri, 22 May 2020 11:34:25 +0200
Subject: [PATCH 158/181] added aux_preparator.py

---
 pandaharvester/commit_timestamp.py              |  2 +-
 .../harvesterpreparator/aux_preparator.py       | 13 +++++++++++++
 2 files changed, 14 insertions(+), 1 deletion(-)
 create mode 100644 pandaharvester/harvesterpreparator/aux_preparator.py

diff --git a/pandaharvester/commit_timestamp.py b/pandaharvester/commit_timestamp.py
index c3bc3441..fcc3b7f0 100644
--- a/pandaharvester/commit_timestamp.py
+++ b/pandaharvester/commit_timestamp.py
@@ -1 +1 @@
-timestamp = "19-05-2020 22:37:10 on master (by tmaeno)"
+timestamp = "22-05-2020 09:34:25 on master (by tmaeno)"

diff --git a/pandaharvester/harvesterpreparator/aux_preparator.py b/pandaharvester/harvesterpreparator/aux_preparator.py
new file mode 100644
index 00000000..4fd306cb
--- /dev/null
+++ b/pandaharvester/harvesterpreparator/aux_preparator.py
@@ -0,0 +1,13 @@
+from . import analysis_aux_preparator
+from .analysis_aux_preparator import AnalysisAuxPreparator
+from pandaharvester.harvestercore import core_utils
+
+# logger
+baseLogger = core_utils.setup_logger('aux_preparator')
+
+analysis_aux_preparator.baseLogger = baseLogger
+
+
+# preparator plugin for auxiliary inputs
+class AuxPreparator (AnalysisAuxPreparator):
+    pass
\ No newline at end of file

From b6a71f3e1d36f1a8761d305b017a268ff0630a0c Mon Sep 17 00:00:00 2001
From: fbarreir
Date: Fri, 22 May 2020 16:50:49 +0200
Subject: [PATCH 159/181] k8s: Implemented GU queues

---
 .../harvestercloud/k8s_startup_script.py      |  5 ++
 .../harvestercloud/pilots_starter.py          | 36 +++++++---
 pandaharvester/harvestermisc/apfmon.py        |  7 +-
 pandaharvester/harvestermisc/generic_utils.py | 14 ----
 pandaharvester/harvestermisc/k8s_utils.py     |  2 +
 .../harvestersubmitter/htcondor_submitter.py  | 18 ++---
 .../harvestersubmitter/k8s_submitter.py       | 69 ++++++++++++++++---
 7 files changed, 106 insertions(+), 45 deletions(-)
 delete mode 100644 pandaharvester/harvestermisc/generic_utils.py

diff --git a/pandaharvester/harvestercloud/k8s_startup_script.py b/pandaharvester/harvestercloud/k8s_startup_script.py
index 51047ef7..be5bd8cd 100644
--- a/pandaharvester/harvestercloud/k8s_startup_script.py
+++ b/pandaharvester/harvestercloud/k8s_startup_script.py
@@ -1,5 +1,10 @@
 #!/usr/bin/env python
+
+########################################################
+# OBSOLETE!!! USE PILOTS_STARTER.PY
+########################################################
+
 """
 This script will be executed at container startup
 - It will retrieve the proxy and panda queue from the environment

diff --git a/pandaharvester/harvestercloud/pilots_starter.py b/pandaharvester/harvestercloud/pilots_starter.py
index 150dd7a4..5ad5feb6 100644
--- a/pandaharvester/harvestercloud/pilots_starter.py
+++ b/pandaharvester/harvestercloud/pilots_starter.py
@@ -151,6 +151,12 @@ def get_configuration():
     resource_type = os.environ.get('resourceType')
     logging.debug('[main] got resource type: {0}'.format(resource_type))
 
+    prodSourceLabel = os.environ.get('prodSourceLabel')
+    logging.debug('[main] got prodSourceLabel: {0}'.format(prodSourceLabel))
+
+    job_type = os.environ.get('jobType')
+    logging.debug('[main] got job type: {0}'.format(job_type))
+
     # get the Harvester ID
     harvester_id = os.environ.get('HARVESTER_ID')
     logging.debug('[main] got Harvester ID: {0}'.format(harvester_id))
@@ -187,15 +193,15 @@ def get_configuration():
     global CONFIG_DIR
     CONFIG_DIR = tmpdir + '/jobconfig'
 
-    return proxy_path, panda_site, panda_queue, resource_type, harvester_id, \
+    return proxy_path, panda_site, panda_queue, resource_type, prodSourceLabel, job_type, harvester_id, \
            worker_id, logs_frontend_w, logs_frontend_r, stdout_name, submit_mode
 
 
 if __name__ == "__main__":
 
     # get all the configuration from environment
-    proxy_path, panda_site, panda_queue, resource_type, harvester_id, worker_id, logs_frontend_w, logs_frontend_r, \
-    destination_name, submit_mode = get_configuration()
+    proxy_path, panda_site, panda_queue, resource_type, prodSourceLabel, job_type, harvester_id, worker_id, \
+    logs_frontend_w, logs_frontend_r, destination_name, submit_mode = get_configuration()
 
     # the pilot should propagate the download link via the pilotId field in the job table
     log_download_url = '{0}/{1}'.format(logs_frontend_r, destination_name)
@@ -215,12 +221,24 @@ def get_configuration():
     resource_type_option = ''
     if resource_type:
         resource_type_option = '--resource-type {0}'.format(resource_type)
-    wrapper_params = '-a {0} 
-s {1} -r {2} -q {3} {4}'.format(WORK_DIR, panda_site, panda_queue, panda_queue, - resource_type_option) - if 'ANALY' in panda_queue: - wrapper_params = '{0} -j user'.format(wrapper_params) - else: - wrapper_params = '{0} -j managed'.format(wrapper_params) + + psl_option = '' + if prodSourceLabel: + psl_option = '-j {0}'.format(prodSourceLabel) + + job_type_option = '' + if job_type: + job_type_option = '-i {0}'.format(job_type) + + wrapper_params = '-a {0} -s {1} -r {2} -q {3} {4} {5} {6}'.format(WORK_DIR, panda_site, panda_queue, panda_queue, + resource_type_option, psl_option, job_type_option) + + # TODO: This should be removed once we start using prodSourceLabel + if not psl_option: + if 'ANALY' in panda_queue: + wrapper_params = '{0} -j user'.format(wrapper_params) + else: + wrapper_params = '{0} -j managed'.format(wrapper_params) if submit_mode == 'PUSH': # job configuration files need to be copied, because k8s configmap mounts as read-only file system diff --git a/pandaharvester/harvestermisc/apfmon.py b/pandaharvester/harvestermisc/apfmon.py index 42625e99..acd26dee 100644 --- a/pandaharvester/harvestermisc/apfmon.py +++ b/pandaharvester/harvestermisc/apfmon.py @@ -8,9 +8,8 @@ import traceback from pandaharvester.harvesterconfig import harvester_config -from pandaharvester.harvestercore import core_utils from pandaharvester import panda_pkg_info -from pandaharvester.harvestermisc import generic_utils +from pandaharvester.harvestermisc import core_utils from pandaharvester.harvestercore.work_spec import WorkSpec from pandaharvester.harvestermisc.info_utils import PandaQueuesDict @@ -117,7 +116,7 @@ def create_labels(self): panda_queues_dict = PandaQueuesDict() # publish the active queues to APF mon in shards - for sites in generic_utils.create_shards(all_sites, 20): + for sites in core_utils.create_shards(all_sites, 20): labels = [] for site in sites: try: @@ -271,7 +270,7 @@ def create_workers(self, worker_spec_list): url = '{0}/jobs'.format(self.base_url) - for worker_spec_shard in generic_utils.create_shards(worker_spec_list, 20): + for worker_spec_shard in core_utils.create_shards(worker_spec_list, 20): apfmon_workers = [] for worker_spec in worker_spec_shard: batch_id = worker_spec.batchID diff --git a/pandaharvester/harvestermisc/generic_utils.py b/pandaharvester/harvestermisc/generic_utils.py deleted file mode 100644 index 4db657da..00000000 --- a/pandaharvester/harvestermisc/generic_utils.py +++ /dev/null @@ -1,14 +0,0 @@ -def create_shards(input_list, size): - """ - Creates shards of size n from the input list. 
- """ - shard, i = [], 0 - for element in input_list: - shard.append(element) - i += 1 - if i == size: - yield shard - shard, i = [], 0 - - if i > 0: - yield shard \ No newline at end of file diff --git a/pandaharvester/harvestermisc/k8s_utils.py b/pandaharvester/harvestermisc/k8s_utils.py index c298850b..63560701 100644 --- a/pandaharvester/harvestermisc/k8s_utils.py +++ b/pandaharvester/harvestermisc/k8s_utils.py @@ -120,6 +120,8 @@ def create_job_from_yaml(self, yaml_content, work_spec, container_image, execut {'name': 'computingSite', 'value': work_spec.computingSite}, {'name': 'pandaQueueName', 'value': queue_name}, {'name': 'resourceType', 'value': work_spec.resourceType}, + {'name': 'prodSourceLabel', 'value': work_spec.prodSourceLabel}, + {'name': 'jobType', 'value': work_spec.jobType}, {'name': 'proxySecretPath', 'value': cert if cert_in_secret else None}, {'name': 'proxyContent', 'value': None if cert_in_secret else self.set_proxy(cert)}, {'name': 'workerID', 'value': str(work_spec.workerID)}, diff --git a/pandaharvester/harvestersubmitter/htcondor_submitter.py b/pandaharvester/harvestersubmitter/htcondor_submitter.py index 9114449a..01e63604 100644 --- a/pandaharvester/harvestersubmitter/htcondor_submitter.py +++ b/pandaharvester/harvestersubmitter/htcondor_submitter.py @@ -825,6 +825,15 @@ def _handle_one_worker(workspec, to_submit=to_submit_any): }) return data + def _propagate_attributes(workspec, tmpVal): + # make logger + tmpLog = core_utils.make_logger(baseLogger, 'workerID={0}'.format(workspec.workerID), + method_name='_propagate_attributes') + (retVal, tmpDict) = tmpVal + workspec.set_attributes_with_dict(tmpDict) + tmpLog.debug('Done workspec attributes propagation') + return retVal + def _choose_proxy(workspec): """ Choose the proxy based on the job type @@ -839,15 +848,6 @@ def _choose_proxy(workspec): return proxy - def _propagate_attributes(workspec, tmpVal): - # make logger - tmpLog = core_utils.make_logger(baseLogger, 'workerID={0}'.format(workspec.workerID), - method_name='_propagate_attributes') - (retVal, tmpDict) = tmpVal - workspec.set_attributes_with_dict(tmpDict) - tmpLog.debug('Done workspec attributes propagation') - return retVal - tmpLog.debug('finished preparing worker attributes') # map(_handle_one_worker, workspec_list) diff --git a/pandaharvester/harvestersubmitter/k8s_submitter.py b/pandaharvester/harvestersubmitter/k8s_submitter.py index 07edff82..484be79b 100644 --- a/pandaharvester/harvestersubmitter/k8s_submitter.py +++ b/pandaharvester/harvestersubmitter/k8s_submitter.py @@ -12,6 +12,7 @@ from pandaharvester.harvestercore.plugin_base import PluginBase from pandaharvester.harvestermisc.k8s_utils import k8s_Client from pandaharvester.harvesterconfig import harvester_config +from pandaharvester.harvestermisc.info_utils import PandaQueuesDict # logger base_logger = core_utils.setup_logger('k8s_submitter') @@ -48,13 +49,33 @@ def __init__(self, **kwarg): else: if (not self.nProcesses) or (self.nProcesses < 1): self.nProcesses = 1 - # x509 proxy + # x509 proxy: obsolete mode try: self.x509UserProxy except AttributeError: if os.getenv('X509_USER_PROXY'): self.x509UserProxy = os.getenv('X509_USER_PROXY') + # x509 proxy for analysis jobs in grandly unified queues + try: + self.x509UserProxyAnalysis + except AttributeError: + self.x509UserProxyAnalysis = os.getenv('X509_USER_PROXY_ANAL') + + # x509 proxy through k8s secrets: preferred way + try: + self.proxySecretPath + except AttributeError: + if os.getenv('PROXY_SECRET_PATH'): + self.proxySecretPath = 
os.getenv('PROXY_SECRET_PATH')
+
+        # analysis x509 proxy through k8s secrets: on GU queues
+        try:
+            self.proxySecretPathAnalysis
+        except AttributeError:
+            if os.getenv('PROXY_SECRET_PATH_ANAL'):
+                self.proxySecretPathAnalysis = os.getenv('PROXY_SECRET_PATH_ANAL')
+
         # CPU adjust ratio
         try:
             self.cpuAdjustRatio
@@ -139,6 +160,37 @@ def build_executable(self, job_fields, job_pars_parsed):
 
         return executable, args
 
+    def _choose_proxy(self, workspec, is_grandly_unified_queue):
+        """
+        Choose the proxy based on the job type and whether k8s secrets are enabled
+        """
+        cert = None
+        use_secret = False
+        job_type = workspec.jobType
+
+        if is_grandly_unified_queue and job_type in ('user', 'panda', 'analysis'):
+            if self.proxySecretPathAnalysis:
+                cert = self.proxySecretPathAnalysis
+                use_secret = True
+            elif self.proxySecretPath:
+                cert = self.proxySecretPath
+                use_secret = True
+            elif self.x509UserProxyAnalysis:
+                cert = self.x509UserProxyAnalysis
+                use_secret = False
+            elif self.x509UserProxy:
+                cert = self.x509UserProxy
+                use_secret = False
+        else:
+            if self.proxySecretPath:
+                cert = self.proxySecretPath
+                use_secret = True
+            elif self.x509UserProxy:
+                cert = self.x509UserProxy
+                use_secret = False
+
+        return cert, use_secret
+
     def submit_k8s_worker(self, work_spec):
         tmp_log = self.make_logger(base_logger, method_name='submit_k8s_worker')
 
@@ -159,17 +211,16 @@ def submit_k8s_worker(self, work_spec):
         tmp_log.debug('container_image: "{0}"; executable: "{1}"; args: "{2}"'.format(container_image, executable, args))
 
-        if hasattr(self, 'proxySecretPath'):
-            cert = self.proxySecretPath
-            use_secret = True
-        elif hasattr(self, 'x509UserProxy'):
-            cert = self.x509UserProxy
-            use_secret = False
-        else:
-            err_str = 'No proxy specified in proxySecretPath or x509UserProxy; not submitted'
+        # choose the appropriate proxy
+        panda_queues_dict = PandaQueuesDict()
+        is_grandly_unified_queue = panda_queues_dict.is_grandly_unified_queue(self.queueName)
+        cert, use_secret = self._choose_proxy(work_spec, is_grandly_unified_queue)
+        if not cert:
+            err_str = 'No proxy specified in proxySecretPath or x509UserProxy.
Not submitted' tmp_return_value = (False, err_str) return tmp_return_value + # submit the worker rsp, yaml_content_final = self.k8s_client.create_job_from_yaml(yaml_content, work_spec, container_image, executable, args, cert, cert_in_secret=use_secret, From c06393a00b74b9018b7b82a8e3aa020f755d298a Mon Sep 17 00:00:00 2001 From: fbarreir Date: Fri, 22 May 2020 17:22:27 +0200 Subject: [PATCH 160/181] wrong import in apfmon --- pandaharvester/harvestermisc/apfmon.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandaharvester/harvestermisc/apfmon.py b/pandaharvester/harvestermisc/apfmon.py index acd26dee..2f7a6675 100644 --- a/pandaharvester/harvestermisc/apfmon.py +++ b/pandaharvester/harvestermisc/apfmon.py @@ -9,7 +9,7 @@ from pandaharvester.harvesterconfig import harvester_config from pandaharvester import panda_pkg_info -from pandaharvester.harvestermisc import core_utils +from pandaharvester.harvestercore import core_utils from pandaharvester.harvestercore.work_spec import WorkSpec from pandaharvester.harvestermisc.info_utils import PandaQueuesDict From 02ee93033d3c01fc2f4b601850e0ed24fb1674ee Mon Sep 17 00:00:00 2001 From: fbarreir Date: Fri, 22 May 2020 19:02:29 +0200 Subject: [PATCH 161/181] k8s: fix in GU queue --- pandaharvester/harvestermisc/k8s_utils.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pandaharvester/harvestermisc/k8s_utils.py b/pandaharvester/harvestermisc/k8s_utils.py index 63560701..25700f5a 100644 --- a/pandaharvester/harvestermisc/k8s_utils.py +++ b/pandaharvester/harvestermisc/k8s_utils.py @@ -36,7 +36,7 @@ def read_yaml_file(self, yaml_file): return yaml_content - def create_job_from_yaml(self, yaml_content, work_spec, container_image, executable, args, + def create_job_from_yaml(self, yaml_content, work_spec, prod_source_label, container_image, executable, args, cert, cert_in_secret=True, cpu_adjust_ratio=100, memory_adjust_ratio=100,): tmp_log = core_utils.make_logger(base_logger, method_name='create_job_from_yaml') @@ -56,6 +56,7 @@ def create_job_from_yaml(self, yaml_content, work_spec, container_image, execut # retrieve panda queue information panda_queues_dict = PandaQueuesDict() queue_name = panda_queues_dict.get_panda_queue_name(work_spec.computingSite) + prod_source_label = harvester_queue_config.get_source_label(workspec.jobType) # set the worker name yaml_content['metadata']['name'] = yaml_content['metadata']['name'] + "-" + str(work_spec.workerID) @@ -120,7 +121,7 @@ def create_job_from_yaml(self, yaml_content, work_spec, container_image, execut {'name': 'computingSite', 'value': work_spec.computingSite}, {'name': 'pandaQueueName', 'value': queue_name}, {'name': 'resourceType', 'value': work_spec.resourceType}, - {'name': 'prodSourceLabel', 'value': work_spec.prodSourceLabel}, + {'name': 'prodSourceLabel', 'value': prod_source_label}, {'name': 'jobType', 'value': work_spec.jobType}, {'name': 'proxySecretPath', 'value': cert if cert_in_secret else None}, {'name': 'proxyContent', 'value': None if cert_in_secret else self.set_proxy(cert)}, From d1995c5f127c789abb6f0e603971995fc45575b5 Mon Sep 17 00:00:00 2001 From: fbarreir Date: Mon, 25 May 2020 12:38:16 +0200 Subject: [PATCH 162/181] k8s: fixes for GU queues --- pandaharvester/harvestermisc/k8s_utils.py | 5 ----- pandaharvester/harvestersubmitter/k8s_submitter.py | 10 ++++++++-- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/pandaharvester/harvestermisc/k8s_utils.py b/pandaharvester/harvestermisc/k8s_utils.py index 25700f5a..c2fe980f 100644 
--- a/pandaharvester/harvestermisc/k8s_utils.py +++ b/pandaharvester/harvestermisc/k8s_utils.py @@ -53,11 +53,6 @@ def create_job_from_yaml(self, yaml_content, work_spec, prod_source_label, conta if not res: # if the configmap creation failed, don't submit a job because the pod creation will hang return res, 'Failed to create a configmap' - # retrieve panda queue information - panda_queues_dict = PandaQueuesDict() - queue_name = panda_queues_dict.get_panda_queue_name(work_spec.computingSite) - prod_source_label = harvester_queue_config.get_source_label(workspec.jobType) - # set the worker name yaml_content['metadata']['name'] = yaml_content['metadata']['name'] + "-" + str(work_spec.workerID) diff --git a/pandaharvester/harvestersubmitter/k8s_submitter.py b/pandaharvester/harvestersubmitter/k8s_submitter.py index 484be79b..e7966868 100644 --- a/pandaharvester/harvestersubmitter/k8s_submitter.py +++ b/pandaharvester/harvestersubmitter/k8s_submitter.py @@ -13,6 +13,7 @@ from pandaharvester.harvestermisc.k8s_utils import k8s_Client from pandaharvester.harvesterconfig import harvester_config from pandaharvester.harvestermisc.info_utils import PandaQueuesDict +from pandaharvester.harvestercore.queue_config_mapper import QueueConfigMapper # logger base_logger = core_utils.setup_logger('k8s_submitter') @@ -194,6 +195,11 @@ def _choose_proxy(self, workspec, is_grandly_unified_queue): def submit_k8s_worker(self, work_spec): tmp_log = self.make_logger(base_logger, method_name='submit_k8s_worker') + # get info from harvester queue config + _queueConfigMapper = QueueConfigMapper() + harvester_queue_config = _queueConfigMapper.get_queue(self.queueName) + prod_source_label = harvester_queue_config.get_source_label(workspec.jobType) + # set the stdout log file log_file_name = '{0}_{1}.out'.format(harvester_config.master.harvester_id, work_spec.workerID) work_spec.set_log_file('stdout', '{0}/{1}'.format(self.logBaseURL, log_file_name)) @@ -221,8 +227,8 @@ def submit_k8s_worker(self, work_spec): return tmp_return_value # submit the worker - rsp, yaml_content_final = self.k8s_client.create_job_from_yaml(yaml_content, work_spec, container_image, - executable, args, + rsp, yaml_content_final = self.k8s_client.create_job_from_yaml(yaml_content, work_spec, prod_source_label, + container_image, executable, args, cert, cert_in_secret=use_secret, cpu_adjust_ratio=self.cpuAdjustRatio, memory_adjust_ratio=self.memoryAdjustRatio) From b8d6c7dcd268bf280b9e027ec11a829fb2a5dc88 Mon Sep 17 00:00:00 2001 From: fbarreir Date: Mon, 25 May 2020 13:13:22 +0200 Subject: [PATCH 163/181] k8s: typo --- pandaharvester/harvestersubmitter/k8s_submitter.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandaharvester/harvestersubmitter/k8s_submitter.py b/pandaharvester/harvestersubmitter/k8s_submitter.py index e7966868..ce649678 100644 --- a/pandaharvester/harvestersubmitter/k8s_submitter.py +++ b/pandaharvester/harvestersubmitter/k8s_submitter.py @@ -198,7 +198,7 @@ def submit_k8s_worker(self, work_spec): # get info from harvester queue config _queueConfigMapper = QueueConfigMapper() harvester_queue_config = _queueConfigMapper.get_queue(self.queueName) - prod_source_label = harvester_queue_config.get_source_label(workspec.jobType) + prod_source_label = harvester_queue_config.get_source_label(work_spec.jobType) # set the stdout log file log_file_name = '{0}_{1}.out'.format(harvester_config.master.harvester_id, work_spec.workerID) From 4259de1b7d03cd19386914938ac2e0ed53d872c7 Mon Sep 17 00:00:00 2001 From: fbarreir 
Date: Mon, 25 May 2020 13:18:20 +0200
Subject: [PATCH 164/181] k8s: typo

---
 pandaharvester/harvestermisc/k8s_utils.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/pandaharvester/harvestermisc/k8s_utils.py b/pandaharvester/harvestermisc/k8s_utils.py
index c2fe980f..ab085042 100644
--- a/pandaharvester/harvestermisc/k8s_utils.py
+++ b/pandaharvester/harvestermisc/k8s_utils.py
@@ -53,6 +53,10 @@ def create_job_from_yaml(self, yaml_content, work_spec, prod_source_label, conta
         if not res:  # if the configmap creation failed, don't submit a job because the pod creation will hang
             return res, 'Failed to create a configmap'
 
+        # retrieve panda queue information
+        panda_queues_dict = PandaQueuesDict()
+        queue_name = panda_queues_dict.get_panda_queue_name(work_spec.computingSite)
+
         # set the worker name
         yaml_content['metadata']['name'] = yaml_content['metadata']['name'] + "-" + str(work_spec.workerID)
 
From c4ead373072e431fc06350fd953cf40beff3d52c Mon Sep 17 00:00:00 2001
From: fbarreir
Date: Wed, 27 May 2020 12:21:05 +0200
Subject: [PATCH 165/181] k8s: added max pod lifetime read from AGIS and defaulted to 4 days

---
 pandaharvester/harvestermisc/k8s_utils.py | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/pandaharvester/harvestermisc/k8s_utils.py b/pandaharvester/harvestermisc/k8s_utils.py
index ab085042..a44f02fe 100644
--- a/pandaharvester/harvestermisc/k8s_utils.py
+++ b/pandaharvester/harvestermisc/k8s_utils.py
@@ -37,7 +37,7 @@ def read_yaml_file(self, yaml_file):
         return yaml_content
 
     def create_job_from_yaml(self, yaml_content, work_spec, prod_source_label, container_image, executable, args,
-                             cert, cert_in_secret=True, cpu_adjust_ratio=100, memory_adjust_ratio=100,):
+                             cert, cert_in_secret=True, cpu_adjust_ratio=100, memory_adjust_ratio=100, max_time=None):
 
         tmp_log = core_utils.make_logger(base_logger, method_name='create_job_from_yaml')
 
@@ -147,6 +147,12 @@ def create_job_from_yaml(self, yaml_content, work_spec, prod_source_label, conta
         if 'affinity' not in yaml_content['spec']['template']['spec']:
             yaml_content = self.set_affinity(yaml_content)
 
+        # set max_time to avoid having a pod running forever
+        if 'activeDeadlineSeconds' not in yaml_content['spec']['template']['spec']:
+            if not max_time:  # 4 days
+                max_time = 4 * 24 * 3600
+            yaml_content['spec']['template']['spec']['activeDeadlineSeconds'] = max_time
+
         tmp_log.debug('creating job {0}'.format(yaml_content))
 
         rsp = self.batchv1.create_namespaced_job(body=yaml_content, namespace=self.namespace)

From d627b374a13d38be235160807524719d95c8fc34 Mon Sep 17 00:00:00 2001
From: fbarreir
Date: Wed, 27 May 2020 12:22:50 +0200
Subject: [PATCH 166/181] k8s: added max time to k8s pods

---
 pandaharvester/harvestersubmitter/k8s_submitter.py | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/pandaharvester/harvestersubmitter/k8s_submitter.py b/pandaharvester/harvestersubmitter/k8s_submitter.py
index ce649678..a36997e1 100644
--- a/pandaharvester/harvestersubmitter/k8s_submitter.py
+++ b/pandaharvester/harvestersubmitter/k8s_submitter.py
@@ -226,12 +226,20 @@ def submit_k8s_worker(self, work_spec):
             tmp_return_value = (False, err_str)
             return tmp_return_value
 
+        # get the walltime limit
+        try:
+            max_time = panda_queues_dict.get(self.queueName)['maxtime']
+        except Exception as e:
+            tmp_log.warning('Could not retrieve maxtime field for queue {0}'.format(self.queueName))
+            max_time = None
+
         # submit the worker
         rsp, yaml_content_final = self.k8s_client.create_job_from_yaml(yaml_content, work_spec,
prod_source_label, container_image, executable, args, cert, cert_in_secret=use_secret, cpu_adjust_ratio=self.cpuAdjustRatio, - memory_adjust_ratio=self.memoryAdjustRatio) + memory_adjust_ratio=self.memoryAdjustRatio, + max_time=max_time) except Exception as _e: tmp_log.error(traceback.format_exc()) err_str = 'Failed to create a JOB; {0}'.format(_e) From b18fb858cf160e7b9483d24701846fe6ea7c4e58 Mon Sep 17 00:00:00 2001 From: fbarreir Date: Thu, 28 May 2020 15:16:43 +0200 Subject: [PATCH 167/181] k8s: usage of label_selector in list_namespaced_pod calls --- pandaharvester/harvestermessenger/k8s_messenger.py | 2 +- pandaharvester/harvestermisc/k8s_utils.py | 14 +++++++++++--- pandaharvester/harvestermonitor/k8s_monitor.py | 8 ++++---- 3 files changed, 16 insertions(+), 8 deletions(-) diff --git a/pandaharvester/harvestermessenger/k8s_messenger.py b/pandaharvester/harvestermessenger/k8s_messenger.py index d28702bb..232f756d 100644 --- a/pandaharvester/harvestermessenger/k8s_messenger.py +++ b/pandaharvester/harvestermessenger/k8s_messenger.py @@ -27,7 +27,7 @@ def __init__(self, **kwargs): def post_processing(self, workspec, jobspec_list, map_type): """ - Do the folloiwing in post_processing, i.e. when workers terminate (finished/failed/cancelled) + Do the following in post_processing, i.e. when workers terminate (finished/failed/cancelled) - Fetch logs of the pod from k8s - Store or upload logs """ diff --git a/pandaharvester/harvestermisc/k8s_utils.py b/pandaharvester/harvestermisc/k8s_utils.py index a44f02fe..7c38f48f 100644 --- a/pandaharvester/harvestermisc/k8s_utils.py +++ b/pandaharvester/harvestermisc/k8s_utils.py @@ -158,14 +158,22 @@ def create_job_from_yaml(self, yaml_content, work_spec, prod_source_label, conta rsp = self.batchv1.create_namespaced_job(body=yaml_content, namespace=self.namespace) return rsp, yaml_content - def get_pods_info(self): + def get_pods_info(self, workspec_list=[]): tmp_log = core_utils.make_logger(base_logger, method_name='get_pods_info') - pods_list = list() + if workspec_list: + batch_ids_list = [workspec.batchID for workspec in workspec_list] + batch_ids_concat = ','.join(batch_ids_list) + label_selector = 'job-name in ({0})'.format(batch_ids_concat) + else: + label_selector = '' + + tmp_log.debug('label_selector: {0}'.format(label_selector)) + try: - ret = self.corev1.list_namespaced_pod(namespace=self.namespace) + ret = self.corev1.list_namespaced_pod(namespace=self.namespace, label_selector=label_selector) except Exception as _e: tmp_log.error('Failed call to list_namespaced_pod with: {0}'.format(_e)) else: diff --git a/pandaharvester/harvestermonitor/k8s_monitor.py b/pandaharvester/harvestermonitor/k8s_monitor.py index f7b3d013..31162711 100644 --- a/pandaharvester/harvestermonitor/k8s_monitor.py +++ b/pandaharvester/harvestermonitor/k8s_monitor.py @@ -81,10 +81,10 @@ def check_pods_status(self, pods_status_list, containers_state_list): return new_status, sub_msg - def check_a_job(self, workspec): + def check_a_worker(self, workspec): # set logger tmp_log = self.make_logger(base_logger, 'workerID={0} batchID={1}'.format(workspec.workerID, workspec.batchID), - method_name='check_a_job') + method_name='check_a_worker') # initialization job_id = workspec.batchID @@ -154,11 +154,11 @@ def check_workers(self, workspec_list): ret_list.append(('', err_str)) return False, ret_list - self._all_pods_list = self.k8s_client.get_pods_info() + self._all_pods_list = self.k8s_client.get_pods_info(workspec_list=workspec_list) # resolve status requested workers with 
ThreadPoolExecutor(self.nProcesses) as thread_pool: - ret_iterator = thread_pool.map(self.check_a_job, workspec_list) + ret_iterator = thread_pool.map(self.check_a_worker, workspec_list) ret_list = list(ret_iterator) From c85eb58530bdf38a73f3c10deca3e39f38d2fac7 Mon Sep 17 00:00:00 2001 From: fbarreir Date: Thu, 28 May 2020 18:38:25 +0200 Subject: [PATCH 168/181] k8s: sweeper using label selector --- pandaharvester/harvestersweeper/k8s_sweeper.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandaharvester/harvestersweeper/k8s_sweeper.py b/pandaharvester/harvestersweeper/k8s_sweeper.py index fa0639ab..71fb64ca 100644 --- a/pandaharvester/harvestersweeper/k8s_sweeper.py +++ b/pandaharvester/harvestersweeper/k8s_sweeper.py @@ -56,7 +56,7 @@ def __init__(self, **kwarg): def kill_workers(self, work_spec_list): tmp_log = self.make_logger(base_logger, method_name='kill_workers') - self._all_pods_list = self.k8s_client.get_pods_info() + self._all_pods_list = self.k8s_client.get_pods_info(work_spec_list=work_spec_list) ret_list = [] for work_spec in work_spec_list: From 79631bace42b15036740562912795bba2be2505c Mon Sep 17 00:00:00 2001 From: fbarreir Date: Thu, 28 May 2020 18:43:59 +0200 Subject: [PATCH 169/181] k8s: typo --- pandaharvester/harvestersweeper/k8s_sweeper.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandaharvester/harvestersweeper/k8s_sweeper.py b/pandaharvester/harvestersweeper/k8s_sweeper.py index 71fb64ca..6421194a 100644 --- a/pandaharvester/harvestersweeper/k8s_sweeper.py +++ b/pandaharvester/harvestersweeper/k8s_sweeper.py @@ -56,7 +56,7 @@ def __init__(self, **kwarg): def kill_workers(self, work_spec_list): tmp_log = self.make_logger(base_logger, method_name='kill_workers') - self._all_pods_list = self.k8s_client.get_pods_info(work_spec_list=work_spec_list) + self._all_pods_list = self.k8s_client.get_pods_info(workspec_list=work_spec_list) ret_list = [] for work_spec in work_spec_list: From 3c803bc415620cbd744104d368106466ca867799 Mon Sep 17 00:00:00 2001 From: fbarreir Date: Thu, 28 May 2020 19:19:35 +0200 Subject: [PATCH 170/181] k8s: get_jobs_info changed from field_selector to label_selector --- pandaharvester/harvestermisc/k8s_utils.py | 24 ++++++++++++------- .../harvestersweeper/k8s_sweeper.py | 2 +- 2 files changed, 16 insertions(+), 10 deletions(-) diff --git a/pandaharvester/harvestermisc/k8s_utils.py b/pandaharvester/harvestermisc/k8s_utils.py index 7c38f48f..1c0ab34b 100644 --- a/pandaharvester/harvestermisc/k8s_utils.py +++ b/pandaharvester/harvestermisc/k8s_utils.py @@ -158,18 +158,22 @@ def create_job_from_yaml(self, yaml_content, work_spec, prod_source_label, conta rsp = self.batchv1.create_namespaced_job(body=yaml_content, namespace=self.namespace) return rsp, yaml_content - def get_pods_info(self, workspec_list=[]): - - tmp_log = core_utils.make_logger(base_logger, method_name='get_pods_info') - pods_list = list() - + def generate_ls_from_wsl(self, workspec_list=[]): if workspec_list: - batch_ids_list = [workspec.batchID for workspec in workspec_list] + batch_ids_list = [workspec.batchID for workspec in workspec_list if workspec.batchID] batch_ids_concat = ','.join(batch_ids_list) label_selector = 'job-name in ({0})'.format(batch_ids_concat) else: label_selector = '' + return label_selector + + def get_pods_info(self, workspec_list=[]): + + tmp_log = core_utils.make_logger(base_logger, method_name='get_pods_info') + pods_list = list() + + label_selector = self.generate_ls_from_wsl(workspec_list) 
tmp_log.debug('label_selector: {0}'.format(label_selector)) try: @@ -199,15 +203,17 @@ def filter_pods_info(self, pods_list, job_name=None): pods_list = [i for i in pods_list if i['job_name'] == job_name] return pods_list - def get_jobs_info(self, job_name=None): + def get_jobs_info(self, workspec_list=[]): tmp_log = core_utils.make_logger(base_logger, 'job_name={0}'.format(job_name), method_name='get_jobs_info') jobs_list = list() - field_selector = 'metadata.name=' + job_name if job_name else '' + label_selector = self.generate_ls_from_wsl(workspec_list) + tmp_log.debug('label_selector: {0}'.format(label_selector)) + try: - ret = self.batchv1.list_namespaced_job(namespace=self.namespace, field_selector=field_selector) + ret = self.batchv1.list_namespaced_job(namespace=self.namespace, label_selector=label_selector) for i in ret.items: job_info = { diff --git a/pandaharvester/harvestersweeper/k8s_sweeper.py b/pandaharvester/harvestersweeper/k8s_sweeper.py index 6421194a..ba4ca877 100644 --- a/pandaharvester/harvestersweeper/k8s_sweeper.py +++ b/pandaharvester/harvestersweeper/k8s_sweeper.py @@ -90,7 +90,7 @@ def kill_workers(self, work_spec_list): # retrieve the associated pods pods_list = self.k8s_client.filter_pods_info(self._all_pods_list, job_name=batch_id) pods_name = [pods_info['name'] for pods_info in pods_list] - job_info = self.k8s_client.get_jobs_info(batch_id) + job_info = self.k8s_client.get_jobs_info(workspec_list=[work_spec]) # retrieve the associated pods if not job_info: ret_list = self.k8s_client.delete_pods(pods_name) From ac8986727a0c1005398bb5bc1bec0df855956d04 Mon Sep 17 00:00:00 2001 From: fbarreir Date: Thu, 28 May 2020 19:22:08 +0200 Subject: [PATCH 171/181] k8s: removed debug comment --- pandaharvester/harvestermisc/k8s_utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandaharvester/harvestermisc/k8s_utils.py b/pandaharvester/harvestermisc/k8s_utils.py index 1c0ab34b..e3088af7 100644 --- a/pandaharvester/harvestermisc/k8s_utils.py +++ b/pandaharvester/harvestermisc/k8s_utils.py @@ -174,7 +174,7 @@ def get_pods_info(self, workspec_list=[]): pods_list = list() label_selector = self.generate_ls_from_wsl(workspec_list) - tmp_log.debug('label_selector: {0}'.format(label_selector)) + # tmp_log.debug('label_selector: {0}'.format(label_selector)) try: ret = self.corev1.list_namespaced_pod(namespace=self.namespace, label_selector=label_selector) @@ -210,7 +210,7 @@ def get_jobs_info(self, workspec_list=[]): jobs_list = list() label_selector = self.generate_ls_from_wsl(workspec_list) - tmp_log.debug('label_selector: {0}'.format(label_selector)) + # tmp_log.debug('label_selector: {0}'.format(label_selector)) try: ret = self.batchv1.list_namespaced_job(namespace=self.namespace, label_selector=label_selector) From 626f29dba85c5ef82fa8adbd726e5841978debc7 Mon Sep 17 00:00:00 2001 From: fbarreir Date: Thu, 28 May 2020 19:24:01 +0200 Subject: [PATCH 172/181] k8s: typo --- pandaharvester/harvestermisc/k8s_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandaharvester/harvestermisc/k8s_utils.py b/pandaharvester/harvestermisc/k8s_utils.py index e3088af7..d9e80235 100644 --- a/pandaharvester/harvestermisc/k8s_utils.py +++ b/pandaharvester/harvestermisc/k8s_utils.py @@ -205,7 +205,7 @@ def filter_pods_info(self, pods_list, job_name=None): def get_jobs_info(self, workspec_list=[]): - tmp_log = core_utils.make_logger(base_logger, 'job_name={0}'.format(job_name), method_name='get_jobs_info') + tmp_log = 
core_utils.make_logger(base_logger, method_name='get_jobs_info') jobs_list = list() From cdd68e3a3733efcd059467432da0aa3d72cfbf71 Mon Sep 17 00:00:00 2001 From: Lincoln Bryant Date: Thu, 28 May 2020 12:42:41 -0500 Subject: [PATCH 173/181] check for a number at the END of the string --- pandaharvester/harvestersubmitter/slurm_submitter.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandaharvester/harvestersubmitter/slurm_submitter.py b/pandaharvester/harvestersubmitter/slurm_submitter.py index 42e91468..feddc8ea 100644 --- a/pandaharvester/harvestersubmitter/slurm_submitter.py +++ b/pandaharvester/harvestersubmitter/slurm_submitter.py @@ -51,7 +51,7 @@ def submit_workers(self, workspec_list): stdErr_str = stdErr if (isinstance(stdErr, str) or stdErr is None) else stdErr.decode() if retCode == 0: # extract batchID - workSpec.batchID = re.search('[^0-9]*([0-9]+)[^0-9]*', '{0}'.format(stdOut_str)).group(1) + workSpec.batchID = re.search('[^0-9]*([0-9]+)[^0-9]*$', '{0}'.format(stdOut_str)).group(1) tmpLog.debug('batchID={0}'.format(workSpec.batchID)) # set log files if self.uploadLog: From 359112f61cd8994af1a034f621b447b74d0832c7 Mon Sep 17 00:00:00 2001 From: fbarreir Date: Thu, 28 May 2020 19:43:40 +0200 Subject: [PATCH 174/181] k8s: delete only job, not pod in addition --- pandaharvester/harvestersweeper/k8s_sweeper.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pandaharvester/harvestersweeper/k8s_sweeper.py b/pandaharvester/harvestersweeper/k8s_sweeper.py index ba4ca877..86c24786 100644 --- a/pandaharvester/harvestersweeper/k8s_sweeper.py +++ b/pandaharvester/harvestersweeper/k8s_sweeper.py @@ -87,6 +87,7 @@ def kill_workers(self, work_spec_list): tmp_log.error(err_str) tmp_ret_val = (False, err_str) + """ # retrieve the associated pods pods_list = self.k8s_client.filter_pods_info(self._all_pods_list, job_name=batch_id) pods_name = [pods_info['name'] for pods_info in pods_list] @@ -105,6 +106,7 @@ def kill_workers(self, work_spec_list): tmp_log.error(err_str) err_str_list.append(err_str) tmp_ret_val = (False, ','.join(err_str_list)) + """ else: # the worker cannot be cleaned tmp_ret_val = (True, '') From 31c368527726180ed15ba6e32881f71720fc4b30 Mon Sep 17 00:00:00 2001 From: fbarreir Date: Thu, 28 May 2020 19:47:51 +0200 Subject: [PATCH 175/181] k8s: disambiguation --- pandaharvester/harvestersweeper/k8s_sweeper.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandaharvester/harvestersweeper/k8s_sweeper.py b/pandaharvester/harvestersweeper/k8s_sweeper.py index 86c24786..9486364c 100644 --- a/pandaharvester/harvestersweeper/k8s_sweeper.py +++ b/pandaharvester/harvestersweeper/k8s_sweeper.py @@ -76,7 +76,7 @@ def kill_workers(self, work_spec_list): tmp_log.error(err_str) tmp_ret_val = (False, err_str) else: - tmp_log.debug('No job/configmap associated to worker {0}'.format(work_spec.workerID)) + tmp_log.debug('No pandajob/configmap associated to worker {0}'.format(work_spec.workerID)) # delete the job try: From d8999cd4a6ec345cc37909fc8861d381929dcd04 Mon Sep 17 00:00:00 2001 From: fbarreir Date: Fri, 29 May 2020 11:37:26 +0200 Subject: [PATCH 176/181] k8s: protection in k8s secret cred manager --- .../harvestercredmanager/k8s_secret_cred_manager.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/pandaharvester/harvestercredmanager/k8s_secret_cred_manager.py b/pandaharvester/harvestercredmanager/k8s_secret_cred_manager.py index ddcfa370..aeee36ef 100644 --- a/pandaharvester/harvestercredmanager/k8s_secret_cred_manager.py 
+++ b/pandaharvester/harvestercredmanager/k8s_secret_cred_manager.py @@ -46,7 +46,11 @@ def __init__(self, **kwarg): e.__class__.__name__, e)) raise # k8s client - self.k8s_client = k8s_Client(namespace=self.k8s_namespace, config_file=self.k8s_config_file) + try: + self.k8s_client = k8s_Client(namespace=self.k8s_namespace, config_file=self.k8s_config_file) + except Exception as e: + mainLog.error('Problem instantiating k8s client for {0}'.format(self.k8s_config_file)) + raise # check proxy def check_credential(self): From d64daf8037e1c1febe4006303b4d05c21438a54c Mon Sep 17 00:00:00 2001 From: fbarreir Date: Fri, 29 May 2020 11:40:41 +0200 Subject: [PATCH 177/181] k8s: protection in cred_manager.py against faulty cred plugins --- pandaharvester/harvesterbody/cred_manager.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/pandaharvester/harvesterbody/cred_manager.py b/pandaharvester/harvesterbody/cred_manager.py index 1f97536f..75774123 100644 --- a/pandaharvester/harvesterbody/cred_manager.py +++ b/pandaharvester/harvesterbody/cred_manager.py @@ -42,8 +42,13 @@ def __init__(self, single_mode=False): pluginPar['inCertFile'] = inCertFile pluginPar['outCertFile'] = outCertFile pluginPar['voms'] = voms - exeCore = self.pluginFactory.get_plugin(pluginPar) - self.exeCores.append(exeCore) + try: + exeCore = self.pluginFactory.get_plugin(pluginPar) + self.exeCores.append(exeCore) + except Exception as e: + _logger.error('Problem instantiating cred manager for {0}'.format(pluginPar)) + _logger.error('Exception {0}'.format(e)) + # get list def get_list(self, data): From 8d0672e27e394ec1f6ac2c4844adcc6f95843070 Mon Sep 17 00:00:00 2001 From: fbarreir Date: Fri, 29 May 2020 17:54:05 +0200 Subject: [PATCH 178/181] monitor: protection against exceptions thrown by plugins --- pandaharvester/harvesterbody/monitor.py | 30 +++++++++++++++++++------ 1 file changed, 23 insertions(+), 7 deletions(-) diff --git a/pandaharvester/harvesterbody/monitor.py b/pandaharvester/harvesterbody/monitor.py index 547a42f1..a1400452 100644 --- a/pandaharvester/harvesterbody/monitor.py +++ b/pandaharvester/harvesterbody/monitor.py @@ -85,7 +85,12 @@ def run(self): # loop over all workers for queueName, configIdWorkSpecs in iteritems(workSpecsPerQueue): for configID, workSpecsList in iteritems(configIdWorkSpecs): - retVal = self.monitor_agent_core(lockedBy, queueName, workSpecsList, config_id=configID, check_source='DB') + try: + retVal = self.monitor_agent_core(lockedBy, queueName, workSpecsList, config_id=configID, check_source='DB') + except Exception as e: + mainLog.error('monitor_agent_core excepted with {0}'.format(e)) + retVal = None # skip the loop + if monitor_fifo.enabled and retVal is not None: workSpecsToEnqueue, workSpecsToEnqueueToHead, timeNow_timestamp, fifoCheckInterval = retVal if workSpecsToEnqueue: @@ -192,8 +197,13 @@ def run(self): else: workSpec.pandaid_list = [] workSpec.force_update('pandaid_list') - retVal = self.monitor_agent_core(lockedBy, queueName, workSpecsList, from_fifo=True, - config_id=configID, check_source='FIFO') + try: + retVal = self.monitor_agent_core(lockedBy, queueName, workSpecsList, from_fifo=True, + config_id=configID, check_source='FIFO') + except Exception as e: + mainLog.error('monitor_agent_core excepted with {0}'.format(e)) + retVal = None # skip the loop + if retVal is not None: workSpecsToEnqueue, workSpecsToEnqueueToHead, timeNow_timestamp, fifoCheckInterval = retVal qc_key = (queueName, configID) @@ -767,10 +777,16 @@ def 
monitor_event_digester(self, locked_by, max_events): for configID, workSpecsList in iteritems(_val): qc_key = (queueName, configID) tmpLog.debug('checking workers of queueName={0} configID={1}'.format(*qc_key)) - retVal = self.monitor_agent_core(locked_by, queueName, workSpecsList, - from_fifo=True, config_id=configID, - check_source='Event') - retMap[qc_key] = retVal + try: + retVal = self.monitor_agent_core(locked_by, queueName, workSpecsList, + from_fifo=True, config_id=configID, + check_source='Event') + except Exception as e: + tmpLog.error('monitor_agent_core excepted with {0}'.format(e)) + retVal = None # skip the loop + + if retVal: + retMap[qc_key] = retVal tmpLog.debug('done') return retMap From 80a15e2668416f3e33a8d785f9e8f6b16b2506bf Mon Sep 17 00:00:00 2001 From: fbarreir Date: Thu, 11 Jun 2020 19:34:51 +0200 Subject: [PATCH 179/181] K8s: disable proxy testing --- pandaharvester/harvestercloud/pilots_starter.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandaharvester/harvestercloud/pilots_starter.py b/pandaharvester/harvestercloud/pilots_starter.py index 5ad5feb6..5b2390f6 100644 --- a/pandaharvester/harvestercloud/pilots_starter.py +++ b/pandaharvester/harvestercloud/pilots_starter.py @@ -245,7 +245,7 @@ def get_configuration(): # and therefore the pilot cannot execute in the same directory copy_files_in_dir(CONFIG_DIR, WORK_DIR) - command = "/tmp/runpilot2-wrapper.sh {0} -i PR -w generic --pilot-user=ATLAS --url=https://pandaserver.cern.ch -d --harvester-submit-mode={1} --allow-same-user=False | tee /tmp/wrapper-wid.log". \ + command = "/tmp/runpilot2-wrapper.sh {0} -t False -i PR -w generic --pilot-user=ATLAS --url=https://pandaserver.cern.ch -d --harvester-submit-mode={1} --allow-same-user=False | tee /tmp/wrapper-wid.log". \ format(wrapper_params, submit_mode) try: subprocess.call(command, shell=True) From b725374d5b81a46495544b5c66a8bb345d528c41 Mon Sep 17 00:00:00 2001 From: fbarreir Date: Fri, 12 Jun 2020 17:22:53 +0200 Subject: [PATCH 180/181] K8s: issues with pilot -t flag --- pandaharvester/harvestercloud/pilots_starter.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandaharvester/harvestercloud/pilots_starter.py b/pandaharvester/harvestercloud/pilots_starter.py index 5b2390f6..5ad5feb6 100644 --- a/pandaharvester/harvestercloud/pilots_starter.py +++ b/pandaharvester/harvestercloud/pilots_starter.py @@ -245,7 +245,7 @@ def get_configuration(): # and therefore the pilot cannot execute in the same directory copy_files_in_dir(CONFIG_DIR, WORK_DIR) - command = "/tmp/runpilot2-wrapper.sh {0} -t False -i PR -w generic --pilot-user=ATLAS --url=https://pandaserver.cern.ch -d --harvester-submit-mode={1} --allow-same-user=False | tee /tmp/wrapper-wid.log". \ + command = "/tmp/runpilot2-wrapper.sh {0} -i PR -w generic --pilot-user=ATLAS --url=https://pandaserver.cern.ch -d --harvester-submit-mode={1} --allow-same-user=False | tee /tmp/wrapper-wid.log". 
\ format(wrapper_params, submit_mode) try: subprocess.call(command, shell=True) From 49b590a754ad2e28d394bab2b5160262f7f1735a Mon Sep 17 00:00:00 2001 From: fbarreir Date: Mon, 15 Jun 2020 13:14:24 +0200 Subject: [PATCH 181/181] k8s: Adding -t option --- pandaharvester/harvestercloud/pilots_starter.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandaharvester/harvestercloud/pilots_starter.py b/pandaharvester/harvestercloud/pilots_starter.py index 5ad5feb6..ab93ea77 100644 --- a/pandaharvester/harvestercloud/pilots_starter.py +++ b/pandaharvester/harvestercloud/pilots_starter.py @@ -245,7 +245,7 @@ def get_configuration(): # and therefore the pilot cannot execute in the same directory copy_files_in_dir(CONFIG_DIR, WORK_DIR) - command = "/tmp/runpilot2-wrapper.sh {0} -i PR -w generic --pilot-user=ATLAS --url=https://pandaserver.cern.ch -d --harvester-submit-mode={1} --allow-same-user=False | tee /tmp/wrapper-wid.log". \ + command = "/tmp/runpilot2-wrapper.sh {0} -i PR -w generic --pilot-user=ATLAS --url=https://pandaserver.cern.ch -d --harvester-submit-mode={1} --allow-same-user=False -t | tee /tmp/wrapper-wid.log". \ format(wrapper_params, submit_mode) try: subprocess.call(command, shell=True)
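
The label-selector pattern introduced in patches 167-172 above narrows every k8s API query to the pods or jobs of the workers in hand, instead of listing the whole namespace. As a minimal usage sketch (the namespace and batch IDs below are hypothetical placeholders, not values taken from these patches), the selector string built by generate_ls_from_wsl plugs directly into the official kubernetes Python client:

    # Sketch: list only the pods belonging to known workers, assuming their
    # batch IDs were stored as the k8s job-name label, as in the patches above.
    from kubernetes import client, config

    config.load_kube_config()  # or config.load_incluster_config() inside a pod
    core_v1 = client.CoreV1Api()
    label_selector = 'job-name in (grid-job-123,grid-job-124)'  # hypothetical IDs
    ret = core_v1.list_namespaced_pod(namespace='panda', label_selector=label_selector)
    for pod in ret.items:
        print(pod.metadata.name, pod.status.phase)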
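
Likewise, the pod-lifetime cap from patches 165-166 reduces to setting activeDeadlineSeconds on the pod template spec whenever the panda queue provides no maxtime, defaulting to 4 days (4 * 24 * 3600 = 345600 seconds). A hedged sketch of that logic, with an illustrative helper name rather than the submitter's actual code path:

    # Sketch: cap a k8s job via activeDeadlineSeconds so pods cannot run forever.
    FOUR_DAYS = 4 * 24 * 3600  # 345600 seconds

    def apply_pod_deadline(yaml_content, max_time=None):
        pod_spec = yaml_content['spec']['template']['spec']
        if 'activeDeadlineSeconds' not in pod_spec:
            # fall back to 4 days when the queue defines no walltime limit
            pod_spec['activeDeadlineSeconds'] = max_time or FOUR_DAYS
        return yaml_content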