From 3ee48638310dcd5443e6e5fa853b9e5496d20096 Mon Sep 17 00:00:00 2001 From: mightqxc Date: Wed, 18 Feb 2026 10:01:05 +0100 Subject: [PATCH 01/38] get job stats with prodsourcelabel for worker adjuster/maker --- pandaharvester/commit_timestamp.py | 2 +- .../harvesterbody/worker_adjuster.py | 24 ++++++++++++------ pandaharvester/harvesterbody/worker_maker.py | 18 +++++++------ .../panda_communicator.py | 25 +++++++++++++++++++ .../simple_worker_maker.py | 15 ++++++----- .../panda/panda_harvester.cfg.rpmnew.template | 1 + 6 files changed, 62 insertions(+), 23 deletions(-) diff --git a/pandaharvester/commit_timestamp.py b/pandaharvester/commit_timestamp.py index a0348a26..5541d728 100644 --- a/pandaharvester/commit_timestamp.py +++ b/pandaharvester/commit_timestamp.py @@ -1 +1 @@ -timestamp = "13-02-2026 07:45:33 on flin (by mightqxc)" +timestamp = "18-02-2026 09:01:05 on flin (by mightqxc)" diff --git a/pandaharvester/harvesterbody/worker_adjuster.py b/pandaharvester/harvesterbody/worker_adjuster.py index ba685bad..f48d66cd 100644 --- a/pandaharvester/harvesterbody/worker_adjuster.py +++ b/pandaharvester/harvesterbody/worker_adjuster.py @@ -145,7 +145,17 @@ def get_activate_worker_factor(self, site_name=None, job_type=None, resource_typ return ret_val # define number of workers to submit based on various information - def define_num_workers(self, static_num_workers, site_name): + def define_num_workers(self, static_num_workers, site_name) -> dict | None: + """ + Define number of workers to submit based on various information, including static site config, queue status, job statistics, and throttler if defined. The function also updates APF monitoring with the decision and the reason. + + Args: + static_num_workers (dict): A dict of the form {queue_name: {job_type: {resource_type: {"nQueue": int, "nReady": int, "nRunning": int, "nNewWorkers": int}}}} defining the static number of workers to submit for each queue, job type and resource type. 
+ site_name (str): The name of the site for which to define the number of workers. + + Returns: + (dict|None): The updated static_num_workers dict with the defined number of new workers to submit in the "nNewWorkers" field, or None if an error occurred. + """ tmp_log = core_utils.make_logger(_logger, f"site={site_name}", method_name="define_num_workers") tmp_log.debug("start") tmp_log.debug(f"static_num_workers: {static_num_workers}") @@ -159,7 +169,7 @@ def define_num_workers(self, static_num_workers, site_name): queue_stat = queue_stat.data # get job statistics - job_stats = self.dbProxy.get_cache("job_statistics.json", None) + job_stats = self.dbProxy.get_cache("job_statistics_new.json", None) if job_stats is not None: job_stats = job_stats.data @@ -300,15 +310,13 @@ def define_num_workers(self, static_num_workers, site_name): if self.get_queue_no_pilots_when_no_active_jobs(queue_name): n_min_pilots = 0 - queue_activated = job_stats[queue_name]["activated"] - tmp_log.debug(f"available activated panda jobs {queue_activated}") + tmp_n_activated = sum(job_stats[queue_name]["activated"].values()) + tmp_log.debug(f"available activated panda jobs {tmp_n_activated}") activate_worker_factor = self.get_activate_worker_factor(queue_name, job_type, resource_type, queue_dict, queue_config) - if job_stats[queue_name]["activated"] * activate_worker_factor > 0: + if tmp_n_activated * activate_worker_factor > 0: n_min_pilots = 1 - n_activated = max( - int(job_stats[queue_name]["activated"] * activate_worker_factor), n_min_pilots - ) # avoid no activity queues + n_activated = max(int(tmp_n_activated * activate_worker_factor), n_min_pilots) # avoid no activity queues except KeyError: # zero job in the queue tmp_log.debug("no job in queue") diff --git a/pandaharvester/harvesterbody/worker_maker.py b/pandaharvester/harvesterbody/worker_maker.py index 01fa0be0..3a1df118 100644 --- a/pandaharvester/harvesterbody/worker_maker.py +++ b/pandaharvester/harvesterbody/worker_maker.py @@ 
-18,8 +18,10 @@ def get_plugin(self, queue_config): return self.pluginFactory.get_plugin(queue_config.workerMaker) # make workers - def make_workers(self, jobchunk_list, queue_config, n_ready, job_type, resource_type, maker=None): - tmpLog = core_utils.make_logger(_logger, f"queue={queue_config.queueName} jtype={job_type} rtype={resource_type}", method_name="make_workers") + def make_workers(self, jobchunk_list, queue_config, n_ready, job_type, resource_type, prod_source_label=None, maker=None): + tmpLog = core_utils.make_logger( + _logger, f"queue={queue_config.queueName} jtype={job_type} rtype={resource_type} pslabel={prod_source_label}", method_name="make_workers" + ) tmpLog.debug("start") try: # get plugin @@ -37,7 +39,7 @@ def make_workers(self, jobchunk_list, queue_config, n_ready, job_type, resource_ for iChunk, jobChunk in enumerate(jobchunk_list): # make a worker if iChunk >= n_ready: - workSpec = maker.make_worker(jobChunk, queue_config, job_type, resource_type) + workSpec = maker.make_worker(jobChunk, queue_config, job_type, resource_type, prod_source_label=prod_source_label) else: # use ready worker if iChunk < len(readyWorkers): @@ -63,35 +65,35 @@ def make_workers(self, jobchunk_list, queue_config, n_ready, job_type, resource_ return [], jobchunk_list # get number of jobs per worker - def get_num_jobs_per_worker(self, queue_config, n_workers, job_type, resource_type, maker=None): + def get_num_jobs_per_worker(self, queue_config, n_workers, job_type, resource_type, prod_source_label=None, maker=None): # get plugin if maker is None: maker = self.pluginFactory.get_plugin(queue_config.workerMaker) return maker.get_num_jobs_per_worker(n_workers) # get number of workers per job - def get_num_workers_per_job(self, queue_config, n_workers, job_type, resource_type, maker=None): + def get_num_workers_per_job(self, queue_config, n_workers, job_type, resource_type, prod_source_label=None, maker=None): # get plugin if maker is None: maker = 
self.pluginFactory.get_plugin(queue_config.workerMaker) return maker.get_num_workers_per_job(n_workers) # check number of ready resources - def num_ready_resources(self, queue_config, job_type, resource_type, maker=None): + def num_ready_resources(self, queue_config, job_type, resource_type, prod_source_label=None, maker=None): # get plugin if maker is None: maker = self.pluginFactory.get_plugin(queue_config.workerMaker) return maker.num_ready_resources() # get upper limit on the cumulative total of workers per job - def get_max_workers_per_job_in_total(self, queue_config, job_type, resource_type, maker=None): + def get_max_workers_per_job_in_total(self, queue_config, job_type, resource_type, prod_source_label=None, maker=None): # get plugin if maker is None: maker = self.pluginFactory.get_plugin(queue_config.workerMaker) return maker.get_max_workers_per_job_in_total() # get upper limit on the number of new workers per job in a cycle - def get_max_workers_per_job_per_cycle(self, queue_config, job_type, resource_type, maker=None): + def get_max_workers_per_job_per_cycle(self, queue_config, job_type, resource_type, prod_source_label=None, maker=None): # get plugin if maker is None: maker = self.pluginFactory.get_plugin(queue_config.workerMaker) diff --git a/pandaharvester/harvestercommunicator/panda_communicator.py b/pandaharvester/harvestercommunicator/panda_communicator.py index 70f8bdc2..2e0a0bb4 100644 --- a/pandaharvester/harvestercommunicator/panda_communicator.py +++ b/pandaharvester/harvestercommunicator/panda_communicator.py @@ -744,6 +744,31 @@ def get_job_stats(self): return stats, "OK" + # get job statistics: new function with prodsourcelabel, under testing and may replace the old one + def get_job_stats_new(self): + tmp_log = self.make_logger(method_name="get_job_stats_new") + tmp_log.debug("Start") + + tmp_status, tmp_response = self.request_ssl("GET", "statistics/active_job_stats_by_site_prodsourcelabel", {}) + stats = {} + ret_message = "FAILED" + + # 
Communication issue + if tmp_status is False: + core_utils.dump_error_message(tmp_log, tmp_response) + return stats, ret_message + + tmp_success = tmp_response.get("success", False) + tmp_message = tmp_response.get("message") + stats = tmp_response.get("data") + + if not tmp_success: + ret_message = tmp_message + core_utils.dump_error_message(tmp_log, ret_message) + return stats, ret_message + + return stats, "OK" + # update workers def update_workers(self, workspec_list): tmp_log = self.make_logger(method_name="update_workers") diff --git a/pandaharvester/harvesterworkermaker/simple_worker_maker.py b/pandaharvester/harvesterworkermaker/simple_worker_maker.py index ab824e27..171868d0 100644 --- a/pandaharvester/harvesterworkermaker/simple_worker_maker.py +++ b/pandaharvester/harvesterworkermaker/simple_worker_maker.py @@ -71,8 +71,8 @@ def get_job_type(self, job_spec, job_type, queue_dict, tmp_prodsourcelabel=None) return job_type_final # make a worker from jobs - def make_worker(self, jobspec_list, queue_config, job_type, resource_type): - tmp_log = self.make_logger(_logger, f"queue={queue_config.queueName}:{job_type}:{resource_type}", method_name="make_worker") + def make_worker(self, jobspec_list, queue_config, job_type, resource_type, prod_source_label=None): + tmp_log = self.make_logger(_logger, f"queue={queue_config.queueName}:{job_type}:{resource_type}:{prod_source_label}", method_name="make_worker") tmp_log.debug(f"jobspec_list: {jobspec_list}") @@ -161,10 +161,13 @@ def make_worker(self, jobspec_list, queue_config, job_type, resource_type): else: # when no job - # randomize pilot type with weighting - pdpm = getattr(queue_config, "prodSourceLabelRandomWeightsPermille", {}) - choice_list = core_utils.make_choice_list(pdpm=pdpm, default="managed") - tmp_prodsourcelabel = random.choice(choice_list) + tmp_prodsourcelabel = prod_source_label + if tmp_prodsourcelabel is None: + # no specified prodsourcelabel; randomize pilot type with weighting + pdpm = 
getattr(queue_config, "prodSourceLabelRandomWeightsPermille", {}) + choice_list = core_utils.make_choice_list(pdpm=pdpm, default="managed") + tmp_prodsourcelabel = random.choice(choice_list) + fake_job = JobSpec() fake_job.jobParams = {"prodSourceLabel": tmp_prodsourcelabel} work_spec.pilotType = fake_job.get_pilot_type() diff --git a/templates/panda/panda_harvester.cfg.rpmnew.template b/templates/panda/panda_harvester.cfg.rpmnew.template index 8f6cb59a..e9b3f2ff 100644 --- a/templates/panda/panda_harvester.cfg.rpmnew.template +++ b/templates/panda/panda_harvester.cfg.rpmnew.template @@ -656,6 +656,7 @@ data = proxy_production||file://path_to/FIXME_proxy_production resource_types.json||panda_server:get_resource_types job_statistics.json||panda_server:get_job_stats + job_statistics_new.json||panda_server:get_job_stats_new worker_statistics.json||panda_server:get_worker_stats_from_panda # BNL_key||panda_cache:BNL_ObjectStoreKey.pub&BNL_ObjectStoreKey # globus_secret||panda_cache:GlobusClientID_1&GlobusRefreshToken_1 From d92df010437baa3bd6c93b73f3214718e168a3ff Mon Sep 17 00:00:00 2001 From: mightqxc Date: Tue, 24 Feb 2026 10:48:26 +0100 Subject: [PATCH 02/38] fix with jobstats API change --- pandaharvester/commit_timestamp.py | 2 +- pandaharvester/harvesterbody/worker_adjuster.py | 4 ++-- pandaharvester/harvestercommunicator/panda_communicator.py | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/pandaharvester/commit_timestamp.py b/pandaharvester/commit_timestamp.py index 5541d728..ac9a6a1f 100644 --- a/pandaharvester/commit_timestamp.py +++ b/pandaharvester/commit_timestamp.py @@ -1 +1 @@ -timestamp = "18-02-2026 09:01:05 on flin (by mightqxc)" +timestamp = "24-02-2026 09:48:27 on flin (by mightqxc)" diff --git a/pandaharvester/harvesterbody/worker_adjuster.py b/pandaharvester/harvesterbody/worker_adjuster.py index f48d66cd..24891e65 100644 --- a/pandaharvester/harvesterbody/worker_adjuster.py +++ b/pandaharvester/harvesterbody/worker_adjuster.py 
@@ -169,7 +169,7 @@ def define_num_workers(self, static_num_workers, site_name) -> dict | None: queue_stat = queue_stat.data # get job statistics - job_stats = self.dbProxy.get_cache("job_statistics_new.json", None) + job_stats = self.dbProxy.get_cache("job_statistics.json", None) if job_stats is not None: job_stats = job_stats.data @@ -310,7 +310,7 @@ def define_num_workers(self, static_num_workers, site_name) -> dict | None: if self.get_queue_no_pilots_when_no_active_jobs(queue_name): n_min_pilots = 0 - tmp_n_activated = sum(job_stats[queue_name]["activated"].values()) + tmp_n_activated = job_stats[queue_name]["activated"] tmp_log.debug(f"available activated panda jobs {tmp_n_activated}") activate_worker_factor = self.get_activate_worker_factor(queue_name, job_type, resource_type, queue_dict, queue_config) diff --git a/pandaharvester/harvestercommunicator/panda_communicator.py b/pandaharvester/harvestercommunicator/panda_communicator.py index 2e0a0bb4..59894f2a 100644 --- a/pandaharvester/harvestercommunicator/panda_communicator.py +++ b/pandaharvester/harvestercommunicator/panda_communicator.py @@ -749,7 +749,7 @@ def get_job_stats_new(self): tmp_log = self.make_logger(method_name="get_job_stats_new") tmp_log.debug("Start") - tmp_status, tmp_response = self.request_ssl("GET", "statistics/active_job_stats_by_site_prodsourcelabel", {}) + tmp_status, tmp_response = self.request_ssl("GET", "statistics/active_job_detailed_stats_by_site", {}) stats = {} ret_message = "FAILED" From ae999a7fdf5b00b338b47a104afd97d34f779f78 Mon Sep 17 00:00:00 2001 From: mightqxc Date: Wed, 25 Mar 2026 10:52:32 +0100 Subject: [PATCH 03/38] iamtokencred: default port of ARC CE --- pandaharvester/commit_timestamp.py | 2 +- pandaharvester/harvestercredmanager/iam_token_cred_manager.py | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/pandaharvester/commit_timestamp.py b/pandaharvester/commit_timestamp.py index 6072a0b5..3309cc3d 100644 --- 
a/pandaharvester/commit_timestamp.py +++ b/pandaharvester/commit_timestamp.py @@ -1 +1 @@ -timestamp = "19-03-2026 14:06:48 on flin (by mightqxc)" +timestamp = "25-03-2026 09:52:32 on flin (by mightqxc)" diff --git a/pandaharvester/harvestercredmanager/iam_token_cred_manager.py b/pandaharvester/harvestercredmanager/iam_token_cred_manager.py index 7f9700e6..9880a520 100644 --- a/pandaharvester/harvestercredmanager/iam_token_cred_manager.py +++ b/pandaharvester/harvestercredmanager/iam_token_cred_manager.py @@ -24,6 +24,7 @@ # default port for CEs default_port_map = { "htcondor-ce": 9619, + "arc-ce": 443, } # credential manager with IAM token From d5b0b79da376896de318cf1ccef545490d2f68c3 Mon Sep 17 00:00:00 2001 From: mightqxc Date: Wed, 25 Mar 2026 10:56:36 +0100 Subject: [PATCH 04/38] pretty --- pandaharvester/commit_timestamp.py | 2 +- pandaharvester/harvesterbody/worker_adjuster.py | 10 ++++------ 2 files changed, 5 insertions(+), 7 deletions(-) diff --git a/pandaharvester/commit_timestamp.py b/pandaharvester/commit_timestamp.py index 3309cc3d..7ffaa74a 100644 --- a/pandaharvester/commit_timestamp.py +++ b/pandaharvester/commit_timestamp.py @@ -1 +1 @@ -timestamp = "25-03-2026 09:52:32 on flin (by mightqxc)" +timestamp = "25-03-2026 09:56:36 on flin (by mightqxc)" diff --git a/pandaharvester/harvesterbody/worker_adjuster.py b/pandaharvester/harvesterbody/worker_adjuster.py index 1c91eff8..94161e85 100644 --- a/pandaharvester/harvesterbody/worker_adjuster.py +++ b/pandaharvester/harvesterbody/worker_adjuster.py @@ -345,15 +345,13 @@ def _normalize_job_type_any(queue_dict): if self.get_queue_no_pilots_when_no_active_jobs(queue_name): n_min_pilots = 0 - queue_activated = job_stats[queue_name]["activated"] - tmp_log.debug(f"available activated panda jobs {queue_activated}") + tmp_n_activated_jobs = job_stats[queue_name]["activated"] + tmp_log.debug(f"available activated panda jobs {tmp_n_activated_jobs}") activate_worker_factor = 
self.get_activate_worker_factor(queue_name, job_type, resource_type, queue_dict, queue_config) - if job_stats[queue_name]["activated"] * activate_worker_factor > 0: + if tmp_n_activated_jobs * activate_worker_factor > 0: n_min_pilots = 1 - n_activated = max( - int(job_stats[queue_name]["activated"] * activate_worker_factor), n_min_pilots - ) # avoid no activity queues + n_activated = max(int(tmp_n_activated_jobs * activate_worker_factor), n_min_pilots) # avoid no activity queues except KeyError: # zero job in the queue tmp_log.debug("no job in queue") From 34f74c654104369c97888f5184497fcf6ce22974 Mon Sep 17 00:00:00 2001 From: mightqxc Date: Wed, 25 Mar 2026 11:19:59 +0100 Subject: [PATCH 05/38] submitter: reduce redundant calls per resource_type --- pandaharvester/commit_timestamp.py | 2 +- pandaharvester/harvesterbody/submitter.py | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/pandaharvester/commit_timestamp.py b/pandaharvester/commit_timestamp.py index 7ffaa74a..6504ab51 100644 --- a/pandaharvester/commit_timestamp.py +++ b/pandaharvester/commit_timestamp.py @@ -1 +1 @@ -timestamp = "25-03-2026 09:56:36 on flin (by mightqxc)" +timestamp = "25-03-2026 10:19:59 on flin (by mightqxc)" diff --git a/pandaharvester/harvesterbody/submitter.py b/pandaharvester/harvesterbody/submitter.py index fac890ec..37406b23 100644 --- a/pandaharvester/harvesterbody/submitter.py +++ b/pandaharvester/harvesterbody/submitter.py @@ -85,6 +85,9 @@ def run(self): # loop over all queues and resource types for queue_name in n_workers_per_queue_jt_rt: job_type = DEFAULT_JOB_TYPE + # get queue + queue_config = self.queue_configMapper.get_queue(queue_name) + workerMakerCore = self.workerMaker.get_plugin(queue_config) for resource_type in n_workers_per_queue_jt_rt[queue_name][job_type]: tmp_val = n_workers_per_queue_jt_rt[queue_name][job_type][resource_type] tmp_log = self.make_logger(_logger, f"id={locked_by} queue={queue_name} jtype={job_type} rtype={resource_type}", 
method_name="run") @@ -103,9 +106,6 @@ def run(self): if nWorkers == 0: tmp_log.debug("skipped since no new worker is needed based on current stats") continue - # get queue - queue_config = self.queue_configMapper.get_queue(queue_name) - workerMakerCore = self.workerMaker.get_plugin(queue_config) # check if resource is ready if hasattr(workerMakerCore, "dynamicSizing") and workerMakerCore.dynamicSizing is True: numReadyResources = self.workerMaker.num_ready_resources(queue_config, job_type, resource_type, workerMakerCore) From 199277041a2fdc91bf781456c8bac75065a63f16 Mon Sep 17 00:00:00 2001 From: mightqxc Date: Wed, 25 Mar 2026 13:02:03 +0100 Subject: [PATCH 06/38] worker_adjuster: add prod_source_label --- pandaharvester/commit_timestamp.py | 2 +- pandaharvester/harvesterbody/job_fetcher.py | 4 +- pandaharvester/harvesterbody/submitter.py | 591 +++++++++--------- .../harvesterbody/worker_adjuster.py | 276 ++++++-- .../panda_communicator.py | 2 +- .../simple_worker_maker.py | 24 +- 6 files changed, 530 insertions(+), 369 deletions(-) diff --git a/pandaharvester/commit_timestamp.py b/pandaharvester/commit_timestamp.py index 6504ab51..66d78b14 100644 --- a/pandaharvester/commit_timestamp.py +++ b/pandaharvester/commit_timestamp.py @@ -1 +1 @@ -timestamp = "25-03-2026 10:19:59 on flin (by mightqxc)" +timestamp = "25-03-2026 12:02:04 on flin (by mightqxc)" diff --git a/pandaharvester/harvesterbody/job_fetcher.py b/pandaharvester/harvesterbody/job_fetcher.py index d836e370..a434b1ca 100644 --- a/pandaharvester/harvesterbody/job_fetcher.py +++ b/pandaharvester/harvesterbody/job_fetcher.py @@ -67,13 +67,13 @@ def run(self): if n_jobs == 0: tmpLog.debug("no job to fetch; skip") continue - # prodsourcelabel + # prod_source_label try: is_grandly_unified_queue = pandaQueueDict.is_grandly_unified_queue(siteName) except Exception: is_grandly_unified_queue = False default_prodSourceLabel = queueConfig.get_source_label(is_gu=is_grandly_unified_queue) - # randomize 
prodsourcelabel if configured + # randomize prod_source_label if configured pdpm = getattr(queueConfig, "prodSourceLabelRandomWeightsPermille", {}) choice_list = core_utils.make_choice_list(pdpm=pdpm, default=default_prodSourceLabel) prodSourceLabel = random.choice(choice_list) diff --git a/pandaharvester/harvesterbody/submitter.py b/pandaharvester/harvesterbody/submitter.py index 37406b23..cc9503c2 100644 --- a/pandaharvester/harvesterbody/submitter.py +++ b/pandaharvester/harvesterbody/submitter.py @@ -88,315 +88,318 @@ def run(self): # get queue queue_config = self.queue_configMapper.get_queue(queue_name) workerMakerCore = self.workerMaker.get_plugin(queue_config) - for resource_type in n_workers_per_queue_jt_rt[queue_name][job_type]: - tmp_val = n_workers_per_queue_jt_rt[queue_name][job_type][resource_type] - tmp_log = self.make_logger(_logger, f"id={locked_by} queue={queue_name} jtype={job_type} rtype={resource_type}", method_name="run") - try: - tmp_log.debug("start") - tmp_log.debug(f"workers status: {tmp_val}") - nWorkers = tmp_val["nNewWorkers"] + tmp_val["nReady"] - nReady = tmp_val["nReady"] + for prod_source_label in n_workers_per_queue_jt_rt[queue_name][job_type]: + for resource_type in n_workers_per_queue_jt_rt[queue_name][job_type][prod_source_label]: + tmp_val = n_workers_per_queue_jt_rt[queue_name][job_type][prod_source_label][resource_type] + tmp_log = self.make_logger( + _logger, f"id={locked_by} queue={queue_name} jtype={job_type} rtype={resource_type}", method_name="run" + ) + try: + tmp_log.debug("start") + tmp_log.debug(f"workers status: {tmp_val}") + nWorkers = tmp_val["nNewWorkers"] + tmp_val["nReady"] + nReady = tmp_val["nReady"] - # check queue - if not self.queue_configMapper.has_queue(queue_name): - tmp_log.error("config not found") - continue + # check queue + if not self.queue_configMapper.has_queue(queue_name): + tmp_log.error("config not found") + continue - # no new workers - if nWorkers == 0: - tmp_log.debug("skipped since no new 
worker is needed based on current stats") - continue - # check if resource is ready - if hasattr(workerMakerCore, "dynamicSizing") and workerMakerCore.dynamicSizing is True: - numReadyResources = self.workerMaker.num_ready_resources(queue_config, job_type, resource_type, workerMakerCore) - tmp_log.debug(f"numReadyResources: {numReadyResources}") - if not numReadyResources: - if hasattr(workerMakerCore, "staticWorkers"): - nQRWorkers = tmp_val["nQueue"] + tmp_val["nRunning"] - tmp_log.debug(f"staticWorkers: {workerMakerCore.staticWorkers}, nQRWorkers(Queue+Running): {nQRWorkers}") - if nQRWorkers >= workerMakerCore.staticWorkers: - tmp_log.debug("No left static workers, skip") - continue + # no new workers + if nWorkers == 0: + tmp_log.debug("skipped since no new worker is needed based on current stats") + continue + # check if resource is ready + if hasattr(workerMakerCore, "dynamicSizing") and workerMakerCore.dynamicSizing is True: + numReadyResources = self.workerMaker.num_ready_resources(queue_config, job_type, resource_type, workerMakerCore) + tmp_log.debug(f"numReadyResources: {numReadyResources}") + if not numReadyResources: + if hasattr(workerMakerCore, "staticWorkers"): + nQRWorkers = tmp_val["nQueue"] + tmp_val["nRunning"] + tmp_log.debug(f"staticWorkers: {workerMakerCore.staticWorkers}, nQRWorkers(Queue+Running): {nQRWorkers}") + if nQRWorkers >= workerMakerCore.staticWorkers: + tmp_log.debug("No left static workers, skip") + continue + else: + nWorkers = min(workerMakerCore.staticWorkers - nQRWorkers, nWorkers) + tmp_log.debug(f"staticWorkers: {workerMakerCore.staticWorkers}, nWorkers: {nWorkers}") else: - nWorkers = min(workerMakerCore.staticWorkers - nQRWorkers, nWorkers) - tmp_log.debug(f"staticWorkers: {workerMakerCore.staticWorkers}, nWorkers: {nWorkers}") + tmp_log.debug("skip since no resources are ready") + continue else: - tmp_log.debug("skip since no resources are ready") - continue + nWorkers = min(nWorkers, numReadyResources) + # post action 
of worker maker + if hasattr(workerMakerCore, "skipOnFail") and workerMakerCore.skipOnFail is True: + skipOnFail = True else: - nWorkers = min(nWorkers, numReadyResources) - # post action of worker maker - if hasattr(workerMakerCore, "skipOnFail") and workerMakerCore.skipOnFail is True: - skipOnFail = True - else: - skipOnFail = False - # actions based on mapping type - if queue_config.mapType == WorkSpec.MT_NoJob: - # workers without jobs - jobChunks = [] - for i in range(nWorkers): - jobChunks.append([]) - elif queue_config.mapType == WorkSpec.MT_OneToOne: - # one worker per one job - jobChunks = self.dbProxy.get_job_chunks_for_workers( - queue_name, - nWorkers, - nReady, - 1, - None, - queue_config.useJobLateBinding, - harvester_config.submitter.checkInterval, - harvester_config.submitter.lockInterval, - locked_by, - ) - elif queue_config.mapType == WorkSpec.MT_MultiJobs: - # one worker for multiple jobs - nJobsPerWorker = self.workerMaker.get_num_jobs_per_worker( - queue_config, nWorkers, job_type, resource_type, maker=workerMakerCore - ) - tmp_log.debug(f"nJobsPerWorker={nJobsPerWorker}") - jobChunks = self.dbProxy.get_job_chunks_for_workers( - queue_name, - nWorkers, - nReady, - nJobsPerWorker, - None, - queue_config.useJobLateBinding, - harvester_config.submitter.checkInterval, - harvester_config.submitter.lockInterval, - locked_by, - queue_config.allowJobMixture, - ) - elif queue_config.mapType == WorkSpec.MT_MultiWorkers: - # multiple workers for one job - nWorkersPerJob = self.workerMaker.get_num_workers_per_job( - queue_config, nWorkers, job_type, resource_type, maker=workerMakerCore - ) - maxWorkersPerJob = self.workerMaker.get_max_workers_per_job_in_total( - queue_config, job_type, resource_type, maker=workerMakerCore - ) - maxWorkersPerJobPerCycle = self.workerMaker.get_max_workers_per_job_per_cycle( - queue_config, job_type, resource_type, maker=workerMakerCore - ) - tmp_log.debug(f"nWorkersPerJob={nWorkersPerJob}") - jobChunks = 
self.dbProxy.get_job_chunks_for_workers( - queue_name, - nWorkers, - nReady, - None, - nWorkersPerJob, - queue_config.useJobLateBinding, - harvester_config.submitter.checkInterval, - harvester_config.submitter.lockInterval, - locked_by, - max_workers_per_job_in_total=maxWorkersPerJob, - max_workers_per_job_per_cycle=maxWorkersPerJobPerCycle, - ) - else: - tmp_log.error(f"unknown mapType={queue_config.mapType}") - continue + skipOnFail = False + # actions based on mapping type + if queue_config.mapType == WorkSpec.MT_NoJob: + # workers without jobs + jobChunks = [] + for i in range(nWorkers): + jobChunks.append([]) + elif queue_config.mapType == WorkSpec.MT_OneToOne: + # one worker per one job + jobChunks = self.dbProxy.get_job_chunks_for_workers( + queue_name, + nWorkers, + nReady, + 1, + None, + queue_config.useJobLateBinding, + harvester_config.submitter.checkInterval, + harvester_config.submitter.lockInterval, + locked_by, + ) + elif queue_config.mapType == WorkSpec.MT_MultiJobs: + # one worker for multiple jobs + nJobsPerWorker = self.workerMaker.get_num_jobs_per_worker( + queue_config, nWorkers, job_type, resource_type, maker=workerMakerCore + ) + tmp_log.debug(f"nJobsPerWorker={nJobsPerWorker}") + jobChunks = self.dbProxy.get_job_chunks_for_workers( + queue_name, + nWorkers, + nReady, + nJobsPerWorker, + None, + queue_config.useJobLateBinding, + harvester_config.submitter.checkInterval, + harvester_config.submitter.lockInterval, + locked_by, + queue_config.allowJobMixture, + ) + elif queue_config.mapType == WorkSpec.MT_MultiWorkers: + # multiple workers for one job + nWorkersPerJob = self.workerMaker.get_num_workers_per_job( + queue_config, nWorkers, job_type, resource_type, maker=workerMakerCore + ) + maxWorkersPerJob = self.workerMaker.get_max_workers_per_job_in_total( + queue_config, job_type, resource_type, maker=workerMakerCore + ) + maxWorkersPerJobPerCycle = self.workerMaker.get_max_workers_per_job_per_cycle( + queue_config, job_type, resource_type, 
maker=workerMakerCore + ) + tmp_log.debug(f"nWorkersPerJob={nWorkersPerJob}") + jobChunks = self.dbProxy.get_job_chunks_for_workers( + queue_name, + nWorkers, + nReady, + None, + nWorkersPerJob, + queue_config.useJobLateBinding, + harvester_config.submitter.checkInterval, + harvester_config.submitter.lockInterval, + locked_by, + max_workers_per_job_in_total=maxWorkersPerJob, + max_workers_per_job_per_cycle=maxWorkersPerJobPerCycle, + ) + else: + tmp_log.error(f"unknown mapType={queue_config.mapType}") + continue - tmp_log.debug(f"got {len(jobChunks)} job chunks") - if len(jobChunks) == 0: - continue - # make workers - okChunks, ngChunks = self.workerMaker.make_workers( - jobChunks, queue_config, nReady, job_type, resource_type, maker=workerMakerCore - ) + tmp_log.debug(f"got {len(jobChunks)} job chunks") + if len(jobChunks) == 0: + continue + # make workers + okChunks, ngChunks = self.workerMaker.make_workers( + jobChunks, queue_config, nReady, job_type, resource_type, prod_source_label=prod_source_label, maker=workerMakerCore + ) - if len(ngChunks) == 0: - tmp_log.debug(f"successfully made {len(okChunks)} workers") - else: - tmp_log.debug(f"made {len(okChunks)} workers, while {len(ngChunks)} workers failed") - timeNow = core_utils.naive_utcnow() - timeNow_timestamp = time.time() - pandaIDs = set() - # NG (=not good) - for ngJobs in ngChunks: - for job_spec in ngJobs: - if skipOnFail: - # release jobs when workers are not made - pandaIDs.add(job_spec.PandaID) - else: - job_spec.status = "failed" - job_spec.subStatus = "failed_to_make" - job_spec.stateChangeTime = timeNow - job_spec.locked_by = None - errStr = "failed to make a worker" - job_spec.set_pilot_error(PilotErrors.SETUPFAILURE, errStr) - job_spec.trigger_propagation() - self.dbProxy.update_job(job_spec, {"locked_by": locked_by, "subStatus": "prepared"}) - # OK - work_specList = [] - if len(okChunks) > 0: - for work_spec, okJobs in okChunks: - # has job - if (queue_config.useJobLateBinding and 
work_spec.workerID is None) or queue_config.mapType == WorkSpec.MT_NoJob: - work_spec.hasJob = 0 - else: - work_spec.hasJob = 1 - if work_spec.nJobsToReFill in [None, 0]: - work_spec.set_jobspec_list(okJobs) + if len(ngChunks) == 0: + tmp_log.debug(f"successfully made {len(okChunks)} workers") + else: + tmp_log.debug(f"made {len(okChunks)} workers, while {len(ngChunks)} workers failed") + timeNow = core_utils.naive_utcnow() + timeNow_timestamp = time.time() + pandaIDs = set() + # NG (=not good) + for ngJobs in ngChunks: + for job_spec in ngJobs: + if skipOnFail: + # release jobs when workers are not made + pandaIDs.add(job_spec.PandaID) else: - # refill free slots during the worker is running - work_spec.set_jobspec_list(okJobs[: work_spec.nJobsToReFill]) - work_spec.nJobsToReFill = None - for job_spec in okJobs[work_spec.nJobsToReFill :]: - pandaIDs.add(job_spec.PandaID) - work_spec.set_num_jobs_with_list() - # map type - work_spec.mapType = queue_config.mapType - # queue name - work_spec.computingSite = queue_config.queueName - # set access point - work_spec.accessPoint = queue_config.messenger["accessPoint"] - # sync level - work_spec.syncLevel = queue_config.get_synchronization_level() - # events - if len(okJobs) > 0 and ( - "eventService" in okJobs[0].jobParams or "cloneJob" in okJobs[0].jobParams or "isHPO" in okJobs[0].jobParams - ): - work_spec.eventsRequest = WorkSpec.EV_useEvents - work_specList.append(work_spec) - if len(work_specList) > 0: - sw = core_utils.get_stopwatch() - # get plugin for submitter - submitterCore = self.pluginFactory.get_plugin(queue_config.submitter) - if submitterCore is None: - # not found - tmp_log.error(f"submitter plugin for {job_spec.computingSite} not found") - continue - # get plugin for messenger - messenger = self.pluginFactory.get_plugin(queue_config.messenger) - if messenger is None: - # not found - tmp_log.error(f"messenger plugin for {job_spec.computingSite} not found") - continue - # setup access points - 
messenger.setup_access_points(work_specList) - # feed jobs - for work_spec in work_specList: - if work_spec.hasJob == 1: - tmpStat = messenger.feed_jobs(work_spec, work_spec.get_jobspec_list()) - if tmpStat is False: - tmp_log.error(f"failed to send jobs to workerID={work_spec.workerID}") + job_spec.status = "failed" + job_spec.subStatus = "failed_to_make" + job_spec.stateChangeTime = timeNow + job_spec.locked_by = None + errStr = "failed to make a worker" + job_spec.set_pilot_error(PilotErrors.SETUPFAILURE, errStr) + job_spec.trigger_propagation() + self.dbProxy.update_job(job_spec, {"locked_by": locked_by, "subStatus": "prepared"}) + # OK + work_specList = [] + if len(okChunks) > 0: + for work_spec, okJobs in okChunks: + # has job + if (queue_config.useJobLateBinding and work_spec.workerID is None) or queue_config.mapType == WorkSpec.MT_NoJob: + work_spec.hasJob = 0 else: - tmp_log.debug(f"sent jobs to workerID={work_spec.workerID} with {tmpStat}") - # insert workers - self.dbProxy.insert_workers(work_specList, locked_by) - # submit - sw.reset() - tmp_log.info(f"submitting {len(work_specList)} workers") - work_specList, tmpRetList, tmpStrList = self.submit_workers(submitterCore, work_specList) - tmp_log.debug(f"done submitting {len(work_specList)} workers" + sw.get_elapsed_time()) - # collect successful jobs - okPandaIDs = set() - for iWorker, (tmpRet, tmpStr) in enumerate(zip(tmpRetList, tmpStrList)): - if tmpRet: + work_spec.hasJob = 1 + if work_spec.nJobsToReFill in [None, 0]: + work_spec.set_jobspec_list(okJobs) + else: + # refill free slots during the worker is running + work_spec.set_jobspec_list(okJobs[: work_spec.nJobsToReFill]) + work_spec.nJobsToReFill = None + for job_spec in okJobs[work_spec.nJobsToReFill :]: + pandaIDs.add(job_spec.PandaID) + work_spec.set_num_jobs_with_list() + # map type + work_spec.mapType = queue_config.mapType + # queue name + work_spec.computingSite = queue_config.queueName + # set access point + work_spec.accessPoint = 
queue_config.messenger["accessPoint"] + # sync level + work_spec.syncLevel = queue_config.get_synchronization_level() + # events + if len(okJobs) > 0 and ( + "eventService" in okJobs[0].jobParams or "cloneJob" in okJobs[0].jobParams or "isHPO" in okJobs[0].jobParams + ): + work_spec.eventsRequest = WorkSpec.EV_useEvents + work_specList.append(work_spec) + if len(work_specList) > 0: + sw = core_utils.get_stopwatch() + # get plugin for submitter + submitterCore = self.pluginFactory.get_plugin(queue_config.submitter) + if submitterCore is None: + # not found + tmp_log.error(f"submitter plugin for {job_spec.computingSite} not found") + continue + # get plugin for messenger + messenger = self.pluginFactory.get_plugin(queue_config.messenger) + if messenger is None: + # not found + tmp_log.error(f"messenger plugin for {job_spec.computingSite} not found") + continue + # setup access points + messenger.setup_access_points(work_specList) + # feed jobs + for work_spec in work_specList: + if work_spec.hasJob == 1: + tmpStat = messenger.feed_jobs(work_spec, work_spec.get_jobspec_list()) + if tmpStat is False: + tmp_log.error(f"failed to send jobs to workerID={work_spec.workerID}") + else: + tmp_log.debug(f"sent jobs to workerID={work_spec.workerID} with {tmpStat}") + # insert workers + self.dbProxy.insert_workers(work_specList, locked_by) + # submit + sw.reset() + tmp_log.info(f"submitting {len(work_specList)} workers") + work_specList, tmpRetList, tmpStrList = self.submit_workers(submitterCore, work_specList) + tmp_log.debug(f"done submitting {len(work_specList)} workers" + sw.get_elapsed_time()) + # collect successful jobs + okPandaIDs = set() + for iWorker, (tmpRet, tmpStr) in enumerate(zip(tmpRetList, tmpStrList)): + if tmpRet: + work_spec, jobList = okChunks[iWorker] + jobList = work_spec.get_jobspec_list() + if jobList is not None: + for job_spec in jobList: + okPandaIDs.add(job_spec.PandaID) + # loop over all workers + for iWorker, (tmpRet, tmpStr) in 
enumerate(zip(tmpRetList, tmpStrList)): work_spec, jobList = okChunks[iWorker] + # set harvesterHost + work_spec.harvesterHost = socket.gethostname() + # use associated job list since it can be truncated for re-filling jobList = work_spec.get_jobspec_list() - if jobList is not None: + # set status + if not tmpRet: + # failed submission + errStr = f"failed to submit a workerID={work_spec.workerID} with {tmpStr}" + tmp_log.error(errStr) + work_spec.set_status(WorkSpec.ST_missed) + work_spec.set_dialog_message(tmpStr) + work_spec.set_pilot_error(PilotErrors.SETUPFAILURE, errStr) + work_spec.set_pilot_closed() + if jobList is not None: + # increment attempt number + newJobList = [] + for job_spec in jobList: + # skip if successful with another worker + if job_spec.PandaID in okPandaIDs: + continue + if job_spec.submissionAttempts is None: + job_spec.submissionAttempts = 0 + job_spec.submissionAttempts += 1 + # max attempt or permanent error + if tmpRet is False or job_spec.submissionAttempts >= queue_config.maxSubmissionAttempts: + newJobList.append(job_spec) + else: + self.dbProxy.increment_submission_attempt(job_spec.PandaID, job_spec.submissionAttempts) + jobList = newJobList + elif queue_config.useJobLateBinding and work_spec.hasJob == 1: + # directly go to running after feeding jobs for late biding + work_spec.set_status(WorkSpec.ST_running) + else: + # normal successful submission + work_spec.set_status(WorkSpec.ST_submitted) + work_spec.submitTime = timeNow + work_spec.modificationTime = timeNow + work_spec.checkTime = timeNow + if self.monitor_fifo.enabled: + work_spec.set_work_params({"lastCheckAt": timeNow_timestamp}) + # prefetch events + if ( + tmpRet + and work_spec.hasJob == 1 + and work_spec.eventsRequest == WorkSpec.EV_useEvents + and queue_config.prefetchEvents + ): + work_spec.eventsRequest = WorkSpec.EV_requestEvents + eventsRequestParams = dict() for job_spec in jobList: - okPandaIDs.add(job_spec.PandaID) - # loop over all workers - for iWorker, 
(tmpRet, tmpStr) in enumerate(zip(tmpRetList, tmpStrList)): - work_spec, jobList = okChunks[iWorker] - # set harvesterHost - work_spec.harvesterHost = socket.gethostname() - # use associated job list since it can be truncated for re-filling - jobList = work_spec.get_jobspec_list() - # set status - if not tmpRet: - # failed submission - errStr = f"failed to submit a workerID={work_spec.workerID} with {tmpStr}" - tmp_log.error(errStr) - work_spec.set_status(WorkSpec.ST_missed) - work_spec.set_dialog_message(tmpStr) - work_spec.set_pilot_error(PilotErrors.SETUPFAILURE, errStr) - work_spec.set_pilot_closed() + eventsRequestParams[job_spec.PandaID] = { + "pandaID": job_spec.PandaID, + "taskID": job_spec.taskID, + "jobsetID": job_spec.jobParams["jobsetID"], + "nRanges": max(int(math.ceil(work_spec.nCore / len(jobList))), job_spec.jobParams["coreCount"]) + * queue_config.initEventsMultipler, + } + if "isHPO" in job_spec.jobParams: + if "sourceURL" in job_spec.jobParams: + sourceURL = job_spec.jobParams["sourceURL"] + else: + sourceURL = None + eventsRequestParams[job_spec.PandaID].update({"isHPO": True, "jobsetID": 0, "sourceURL": sourceURL}) + work_spec.eventsRequestParams = eventsRequestParams + # register worker + tmpStat = self.dbProxy.register_worker(work_spec, jobList, locked_by) if jobList is not None: - # increment attempt number - newJobList = [] for job_spec in jobList: - # skip if successful with another worker - if job_spec.PandaID in okPandaIDs: - continue - if job_spec.submissionAttempts is None: - job_spec.submissionAttempts = 0 - job_spec.submissionAttempts += 1 - # max attempt or permanent error - if tmpRet is False or job_spec.submissionAttempts >= queue_config.maxSubmissionAttempts: - newJobList.append(job_spec) - else: - self.dbProxy.increment_submission_attempt(job_spec.PandaID, job_spec.submissionAttempts) - jobList = newJobList - elif queue_config.useJobLateBinding and work_spec.hasJob == 1: - # directly go to running after feeding jobs for late 
biding - work_spec.set_status(WorkSpec.ST_running) - else: - # normal successful submission - work_spec.set_status(WorkSpec.ST_submitted) - work_spec.submitTime = timeNow - work_spec.modificationTime = timeNow - work_spec.checkTime = timeNow - if self.monitor_fifo.enabled: - work_spec.set_work_params({"lastCheckAt": timeNow_timestamp}) - # prefetch events - if ( - tmpRet - and work_spec.hasJob == 1 - and work_spec.eventsRequest == WorkSpec.EV_useEvents - and queue_config.prefetchEvents - ): - work_spec.eventsRequest = WorkSpec.EV_requestEvents - eventsRequestParams = dict() - for job_spec in jobList: - eventsRequestParams[job_spec.PandaID] = { - "pandaID": job_spec.PandaID, - "taskID": job_spec.taskID, - "jobsetID": job_spec.jobParams["jobsetID"], - "nRanges": max(int(math.ceil(work_spec.nCore / len(jobList))), job_spec.jobParams["coreCount"]) - * queue_config.initEventsMultipler, - } - if "isHPO" in job_spec.jobParams: - if "sourceURL" in job_spec.jobParams: - sourceURL = job_spec.jobParams["sourceURL"] - else: - sourceURL = None - eventsRequestParams[job_spec.PandaID].update({"isHPO": True, "jobsetID": 0, "sourceURL": sourceURL}) - work_spec.eventsRequestParams = eventsRequestParams - # register worker - tmpStat = self.dbProxy.register_worker(work_spec, jobList, locked_by) - if jobList is not None: - for job_spec in jobList: - pandaIDs.add(job_spec.PandaID) - if tmpStat: - if tmpRet: - tmpStr = "submitted a workerID={0} for PandaID={1} with submissionHost={2} batchID={3}" - tmp_log.info( - tmpStr.format(work_spec.workerID, job_spec.PandaID, work_spec.submissionHost, work_spec.batchID) - ) + pandaIDs.add(job_spec.PandaID) + if tmpStat: + if tmpRet: + tmpStr = "submitted a workerID={0} for PandaID={1} with submissionHost={2} batchID={3}" + tmp_log.info( + tmpStr.format(work_spec.workerID, job_spec.PandaID, work_spec.submissionHost, work_spec.batchID) + ) + else: + tmpStr = "failed to submit a workerID={0} for PandaID={1}" + 
tmp_log.error(tmpStr.format(work_spec.workerID, job_spec.PandaID)) else: - tmpStr = "failed to submit a workerID={0} for PandaID={1}" - tmp_log.error(tmpStr.format(work_spec.workerID, job_spec.PandaID)) - else: - tmpStr = "failed to register a worker for PandaID={0} with submissionHost={1} batchID={2}" - tmp_log.error(tmpStr.format(job_spec.PandaID, work_spec.submissionHost, work_spec.batchID)) - # enqueue to monitor fifo - if self.monitor_fifo.enabled and queue_config.mapType != WorkSpec.MT_MultiWorkers: - work_specsToEnqueue = [[w] for w in work_specList if w.status in (WorkSpec.ST_submitted, WorkSpec.ST_running)] - check_delay = min( - getattr(harvester_config.monitor, "eventBasedCheckInterval", harvester_config.monitor.checkInterval), - getattr(harvester_config.monitor, "fifoCheckInterval", harvester_config.monitor.checkInterval), - ) - monitor_fifo.put((queue_name, work_specsToEnqueue), time.time() + check_delay) - main_log.debug("put workers to monitor FIFO") - submitted = True - # release jobs - self.dbProxy.release_jobs(pandaIDs, locked_by) - tmp_log.info("done") - except Exception: - core_utils.dump_error_message(tmp_log) + tmpStr = "failed to register a worker for PandaID={0} with submissionHost={1} batchID={2}" + tmp_log.error(tmpStr.format(job_spec.PandaID, work_spec.submissionHost, work_spec.batchID)) + # enqueue to monitor fifo + if self.monitor_fifo.enabled and queue_config.mapType != WorkSpec.MT_MultiWorkers: + work_specsToEnqueue = [[w] for w in work_specList if w.status in (WorkSpec.ST_submitted, WorkSpec.ST_running)] + check_delay = min( + getattr(harvester_config.monitor, "eventBasedCheckInterval", harvester_config.monitor.checkInterval), + getattr(harvester_config.monitor, "fifoCheckInterval", harvester_config.monitor.checkInterval), + ) + monitor_fifo.put((queue_name, work_specsToEnqueue), time.time() + check_delay) + main_log.debug("put workers to monitor FIFO") + submitted = True + # release jobs + self.dbProxy.release_jobs(pandaIDs, 
locked_by) + tmp_log.info("done") + except Exception: + core_utils.dump_error_message(tmp_log) # release the site self.dbProxy.release_site(site_name, locked_by) if sw_main.get_elapsed_time_in_sec() > queue_lock_interval: diff --git a/pandaharvester/harvesterbody/worker_adjuster.py b/pandaharvester/harvesterbody/worker_adjuster.py index 94161e85..6c478a51 100644 --- a/pandaharvester/harvesterbody/worker_adjuster.py +++ b/pandaharvester/harvesterbody/worker_adjuster.py @@ -2,6 +2,8 @@ import math import traceback +import polars as pl + from pandaharvester.harvesterconfig import harvester_config from pandaharvester.harvestercore import core_utils from pandaharvester.harvestercore.db_proxy_pool import DBProxyPool as DBProxy @@ -146,6 +148,85 @@ def get_activate_worker_factor(self, site_name=None, job_type=None, resource_typ tmp_log.debug(f"ret_val={ret_val}") return ret_val + # convert nested dict structure to polars dataframe + def _dict_to_dataframe(self, static_num_workers: dict) -> pl.DataFrame: + """ + Convert nested dict structure to polars dataframe. 
+ + Input dict structure: + {queue_name: {job_type: {resource_type: {nQueue, nReady, nRunning, nNewWorkers}}}} + + Output dataframe columns: + [job_type, resource_type, prod_source_label, nQueue, nReady, nRunning, nNewWorkers] + """ + rows = [] + for queue_name, queue_dict in static_num_workers.items(): + for job_type, rt_dict in queue_dict.items(): + for resource_type, stats in rt_dict.items(): + rows.append( + { + "job_type": job_type, + "resource_type": resource_type, + "prod_source_label": None, # Placeholder for new field + "nQueue": stats.get("nQueue", 0), + "nReady": stats.get("nReady", 0), + "nRunning": stats.get("nRunning", 0), + "nNewWorkers": stats.get("nNewWorkers", 0), + } + ) + + if not rows: + # Return empty dataframe with correct schema + return pl.DataFrame( + { + "job_type": [], + "resource_type": [], + "prod_source_label": [], + "nQueue": [], + "nReady": [], + "nRunning": [], + "nNewWorkers": [], + } + ) + + return pl.DataFrame(rows) + + # convert polars dataframe back to nested dict structure + def _dataframe_to_dict(self, df: pl.DataFrame, queue_name: str) -> dict: + """ + Convert polars dataframe back to nested dict structure. + + Input dataframe columns: + [job_type, resource_type, prod_source_label, nQueue, nReady, nRunning, nNewWorkers] + + Output dict structure: + {queue_name: {job_type: {prod_source_label: {resource_type: {nQueue, nReady, nRunning, nNewWorkers}}}}} + + Note: prod_source_label is now a dimension in the output dict, nested before resource_type. 
+ """ + result = {queue_name: {}} + + for row in df.iter_rows(named=True): + job_type = row["job_type"] + prod_source_label = row["prod_source_label"] + resource_type = row["resource_type"] + + # Initialize nested dicts as needed + if job_type not in result[queue_name]: + result[queue_name][job_type] = {} + if prod_source_label not in result[queue_name][job_type]: + result[queue_name][job_type][prod_source_label] = {} + + # Store the stats for this resource type + result[queue_name][job_type][prod_source_label][resource_type] = { + "nQueue": row["nQueue"], + "nReady": row["nReady"], + "nRunning": row["nRunning"], + "nNewWorkers": row["nNewWorkers"], + } + + return result + # define number of workers to submit based on various information def define_num_workers(self, static_num_workers, site_name) -> dict | None: """ @@ -197,6 +278,7 @@ def _normalize_job_type_any(queue_dict): _normalize_job_type_any(queue_dict) dyn_num_workers = copy.deepcopy(static_num_workers) + try: # get queue status queue_stat = self.dbProxy.get_cache("panda_queues.json", None) @@ -216,8 +298,18 @@ def _normalize_job_type_any(queue_dict): # get resource type mapper rt_mapper = ResourceTypeMapper() - # define num of new workers + # Track results for all queues + result_dict = {} + + # define num of new workers - process by queue for queue_name in static_num_workers: + # Create dataframe for current queue only + df_queue_dict = {queue_name: static_num_workers[queue_name]} + df_queue = self._dict_to_dataframe(df_queue_dict) + + # You can add sorting here if needed, e.g.: + # df_queue = df_queue.sort(by=["job_type", "resource_type", "prod_source_label"]) + # get queue queue_config = self.queue_configMapper.get_queue(queue_name) worker_limits_dict = {} @@ -238,8 +330,21 @@ def _normalize_job_type_any(queue_dict): apf_msg = None apf_data = None job_type = DEFAULT_JOB_TYPE - for resource_type, tmp_val in static_num_workers[queue_name][job_type].items(): - tmp_log.debug(f"Processing queue {queue_name} 
job_type {job_type} resource_type {resource_type} with static_num_workers {tmp_val}") + + # Single loop over dataframe rows - replaces nested for-loops + for row_idx, row in enumerate(df_queue.iter_rows(named=True)): + job_type = row["job_type"] + resource_type = row["resource_type"] + prod_source_label = row["prod_source_label"] + n_queue = row["nQueue"] + n_ready = row["nReady"] + n_running = row["nRunning"] + + tmp_log.debug( + f"Processing queue {queue_name} job_type {job_type} resource_type {resource_type} " + f"prod_source_label {prod_source_label} with static_num_workers " + f"nQueue={n_queue} nReady={n_ready} nRunning={n_running}" + ) # get cores and memory request per worker of this resource_type queue_dict = panda_queues_dict.get(queue_name, {}) @@ -247,7 +352,7 @@ def _normalize_job_type_any(queue_dict): # set 0 to num of new workers when the queue is disabled if queue_name in queue_stat and queue_stat[queue_name]["status"] in ["offline", "standby", "maintenance"]: - dyn_num_workers[queue_name][job_type][resource_type]["nNewWorkers"] = 0 + df_queue = df_queue.with_columns(pl.when(pl.col("job_type") == job_type).then(0).otherwise(pl.col("nNewWorkers")).alias("nNewWorkers")) ret_msg = f"set n_new_workers=0 since status={queue_stat[queue_name]['status']}" tmp_log.debug(ret_msg) apf_msg = f"Not submitting workers since queue status = {queue_stat[queue_name]['status']}" @@ -255,7 +360,7 @@ def _normalize_job_type_any(queue_dict): # protection against not-up-to-date queue config if queue_config is None: - dyn_num_workers[queue_name][job_type][resource_type]["nNewWorkers"] = 0 + df_queue = df_queue.with_columns(pl.when(pl.col("job_type") == job_type).then(0).otherwise(pl.col("nNewWorkers")).alias("nNewWorkers")) ret_msg = "set n_new_workers=0 due to missing queue_config" tmp_log.debug(ret_msg) apf_msg = "Not submitting workers because of missing queue_config" @@ -274,23 +379,32 @@ def _normalize_job_type_any(queue_dict): if throttler is not None: to_throttle, 
tmp_msg = throttler.to_be_throttled(queue_config, queue_config_mapper=self.queue_configMapper) if to_throttle: - dyn_num_workers[queue_name][job_type][resource_type]["nNewWorkers"] = 0 + df_queue = df_queue.with_columns( + pl.when(pl.col("job_type") == job_type).then(0).otherwise(pl.col("nNewWorkers")).alias("nNewWorkers") + ) ret_msg = f"set n_new_workers=0 by {throttler.__class__.__name__}:{tmp_msg}" tmp_log.debug(ret_msg) continue # check stats - n_queue = tmp_val["nQueue"] - n_ready = tmp_val["nReady"] - n_running = tmp_val["nRunning"] if resource_type != "ANY" and job_type != "ANY" and job_type is not None: n_queue_total += n_queue n_ready_total += n_ready n_running_total += n_running + if queue_config.runMode == "slave": - n_new_workers_def = tmp_val["nNewWorkers"] + n_new_workers_def = row["nNewWorkers"] if n_new_workers_def == 0: - dyn_num_workers[queue_name][job_type][resource_type]["nNewWorkers"] = 0 + df_queue = df_queue.with_columns( + pl.when( + (pl.col("job_type") == job_type) + & (pl.col("resource_type") == resource_type) + & (pl.col("prod_source_label") == prod_source_label) + ) + .then(0) + .otherwise(pl.col("nNewWorkers")) + .alias("nNewWorkers") + ) ret_msg = "set n_new_workers=0 by panda in slave mode" tmp_log.debug(ret_msg) continue @@ -306,17 +420,17 @@ def _normalize_job_type_any(queue_dict): pass elif (n_queue + n_ready + n_running) >= max_workers > 0: # enough workers in the system - ret_msg = f"No n_new_workers since n_queue({n_queue}) + n_ready({n_ready}) + n_running({n_running}) " f">= max_workers({max_workers})" + ret_msg = f"No n_new_workers since n_queue({n_queue}) + n_ready({n_ready}) + n_running({n_running}) >= max_workers({max_workers})" tmp_log.debug(ret_msg) pass elif queue_limit_cores is not None and cores_queue >= queue_limit_cores: # enough queuing cores - ret_msg = f"No n_new_workers since cores_queue({cores_queue}) >= " f"queue_limit_cores({queue_limit_cores})" + ret_msg = f"No n_new_workers since cores_queue({cores_queue}) 
>= queue_limit_cores({queue_limit_cores})" tmp_log.debug(ret_msg) pass elif queue_limit_memory is not None and memory_queue >= queue_limit_memory: - # enough queuing cores - ret_msg = f"No n_new_workers since memory_queue({memory_queue} MB) >= " f"queue_limit_memory({queue_limit_memory} MB)" + # enough queuing memory + ret_msg = f"No n_new_workers since memory_queue({memory_queue} MB) >= queue_limit_memory({queue_limit_memory} MB)" tmp_log.debug(ret_msg) pass else: @@ -387,7 +501,14 @@ def _normalize_job_type_any(queue_dict): if self.maxNewWorkers is not None and self.maxNewWorkers > 0: n_new_workers = min(n_new_workers, self.maxNewWorkers) tmp_log.debug(f"setting n_new_workers to {n_new_workers} in order to respect universal maxNewWorkers") - dyn_num_workers[queue_name][job_type][resource_type]["nNewWorkers"] = n_new_workers + df_queue = df_queue.with_columns( + pl.when( + (pl.col("job_type") == job_type) & (pl.col("resource_type") == resource_type) & (pl.col("prod_source_label") == prod_source_label) + ) + .then(n_new_workers) + .otherwise(pl.col("nNewWorkers")) + .alias("nNewWorkers") + ) # adjust n_new_workers for UCORE to let aggregations over RT respect nQueueLimitWorker and max_workers if queue_config is None: @@ -396,12 +517,13 @@ def _normalize_job_type_any(queue_dict): tmp_log.debug(ret_msg) else: max_new_workers_per_cycle = queue_config.maxNewWorkersPerCycle - if len(dyn_num_workers[queue_name]) > 1: - total_new_workers_rts = 0 - for _jt in dyn_num_workers[queue_name]: - for _rt in dyn_num_workers[queue_name][_jt]: - if _jt != "ANY" and _rt != "ANY": - total_new_workers_rts = total_new_workers_rts + dyn_num_workers[queue_name][_jt][_rt]["nNewWorkers"] + + # Check if we have multiple job types for this queue + unique_job_types = df_queue["job_type"].unique() + + if len(unique_job_types) > 1: + total_new_workers_rts = df_queue.filter((pl.col("job_type") != "ANY") & (pl.col("resource_type") != "ANY"))["nNewWorkers"].sum() + n_new_workers_max_agg = 
min(max(n_queue_limit - n_queue_total, 0), max(max_workers - n_queue_total - n_ready_total - n_running_total, 0)) if max_new_workers_per_cycle >= 0: n_new_workers_max_agg = min(n_new_workers_max_agg, max_new_workers_per_cycle) @@ -411,52 +533,88 @@ def _normalize_job_type_any(queue_dict): # exceeded max, to adjust if total_new_workers_rts > n_new_workers_max_agg: if n_new_workers_max_agg == 0: - for job_type in dyn_num_workers[queue_name]: - for resource_type in dyn_num_workers[queue_name][job_type]: - dyn_num_workers[queue_name][job_type][resource_type]["nNewWorkers"] = 0 + df_queue = df_queue.with_columns(pl.when(True).then(0).otherwise(pl.col("nNewWorkers")).alias("nNewWorkers")) tmp_log.debug("No n_new_workers since n_new_workers_max_agg=0 for UCORE") else: tmp_log.debug(f"n_new_workers_max_agg={n_new_workers_max_agg} for UCORE") - _d = dyn_num_workers[queue_name].copy() - del _d["ANY"] - - # TODO: needs to be recalculated - simple_rt_nw_list = [] - for job_type in _d: # jt: job type - for resource_type in _d[job_type]: # rt: resource type - simple_rt_nw_list.append([(resource_type, job_type), _d[job_type][resource_type].get("nNewWorkers", 0), 0]) - - _countdown = n_new_workers_max_agg - for _rt_list in simple_rt_nw_list: - (resource_type, job_type), n_new_workers_orig, _r = _rt_list - n_new_workers, remainder = divmod(n_new_workers_orig * n_new_workers_max_agg, total_new_workers_rts) - dyn_num_workers[queue_name][job_type].setdefault(resource_type, {"nReady": 0, "nRunning": 0, "nQueue": 0, "nNewWorkers": 0}) - dyn_num_workers[queue_name][job_type][resource_type]["nNewWorkers"] = n_new_workers - _rt_list[2] = remainder - _countdown -= n_new_workers - _s_list = sorted(simple_rt_nw_list, key=(lambda x: x[1])) - sorted_rt_nw_list = sorted(_s_list, key=(lambda x: x[2]), reverse=True) - for (resource_type, job_type), n_new_workers_orig, remainder in sorted_rt_nw_list: - if _countdown <= 0: - break - dyn_num_workers[queue_name][job_type][resource_type]["nNewWorkers"] 
+= 1 - _countdown -= 1 - for job_type in dyn_num_workers[queue_name]: - for resource_type in dyn_num_workers[queue_name][job_type]: - if job_type == "ANY" or resource_type == "ANY": - continue - n_new_workers = dyn_num_workers[queue_name][job_type][resource_type]["nNewWorkers"] - tmp_log.debug( - "setting n_new_workers to {0} of job_type {1} resource_type {2} in order to respect RT aggregations for UCORE".format( - n_new_workers, job_type, resource_type + + # Build a list of (resource_type, job_type, nNewWorkers, original_index) for redistribution + df_to_adjust = df_queue.filter((pl.col("job_type") != "ANY") & (pl.col("resource_type") != "ANY")) + + if len(df_to_adjust) > 0: + # Calculate distribution factors + simple_rt_nw_list = [] + for row_data in df_to_adjust.iter_rows(named=True): + n_new_workers_orig = row_data["nNewWorkers"] + simple_rt_nw_list.append([(row_data["resource_type"], row_data["job_type"]), n_new_workers_orig, 0]) # remainder + + # Distribute workers proportionally + _countdown = n_new_workers_max_agg + for _rt_list in simple_rt_nw_list: + (resource_type, job_type), n_new_workers_orig, _r = _rt_list + if total_new_workers_rts > 0: + n_new_workers, remainder = divmod(n_new_workers_orig * n_new_workers_max_agg, total_new_workers_rts) + else: + n_new_workers, remainder = 0, 0 + + # Update the dataframe + df_queue = df_queue.with_columns( + pl.when( + (pl.col("resource_type") == resource_type) + & (pl.col("job_type") == job_type) + & (pl.col("job_type") != "ANY") + & (pl.col("resource_type") != "ANY") + ) + .then(n_new_workers) + .otherwise(pl.col("nNewWorkers")) + .alias("nNewWorkers") ) - ) + _rt_list[2] = remainder + _countdown -= n_new_workers + + # Distribute remaining workers by remainder + _s_list = sorted(simple_rt_nw_list, key=(lambda x: x[1])) + sorted_rt_nw_list = sorted(_s_list, key=(lambda x: x[2]), reverse=True) + for (resource_type, job_type), n_new_workers_orig, remainder in sorted_rt_nw_list: + if _countdown <= 0: + break + 
df_queue = df_queue.with_columns( + pl.when( + (pl.col("resource_type") == resource_type) + & (pl.col("job_type") == job_type) + & (pl.col("job_type") != "ANY") + & (pl.col("resource_type") != "ANY") + ) + .then(pl.col("nNewWorkers") + 1) + .otherwise(pl.col("nNewWorkers")) + .alias("nNewWorkers") + ) + _countdown -= 1 + + # Log adjustments + df_queue_final = df_queue.filter((pl.col("job_type") != "ANY") & (pl.col("resource_type") != "ANY")) + for row_data in df_queue_final.iter_rows(named=True): + n_new_workers = row_data["nNewWorkers"] + tmp_log.debug( + f"setting n_new_workers to {n_new_workers} of job_type {row_data['job_type']} " + f"resource_type {row_data['resource_type']} prod_source_label {row_data['prod_source_label']} " + f"in order to respect RT aggregations for UCORE" + ) if not apf_msg: - apf_data = copy.deepcopy(dyn_num_workers[queue_name]) + # Convert current queue back to dict format for APF monitoring + dict_queue_apf = self._dataframe_to_dict(df_queue, queue_name) + apf_data = dict_queue_apf.get(queue_name, {}) self.apf_mon.update_label(queue_name, apf_msg, apf_data) + # Store the updated queue version + dict_result = self._dataframe_to_dict(df_queue, queue_name) + result_dict.update(dict_result) + + # Return the final result + dyn_num_workers = result_dict + # dump tmp_log.debug(f"defined {str(dyn_num_workers)}") return dyn_num_workers diff --git a/pandaharvester/harvestercommunicator/panda_communicator.py b/pandaharvester/harvestercommunicator/panda_communicator.py index 59894f2a..7b1b5113 100644 --- a/pandaharvester/harvestercommunicator/panda_communicator.py +++ b/pandaharvester/harvestercommunicator/panda_communicator.py @@ -744,7 +744,7 @@ def get_job_stats(self): return stats, "OK" - # get job statistics: new function with prodsourcelabel, under testing and may replace the old one + # get job statistics: new function with prod_source_label, under testing and may replace the old one def get_job_stats_new(self): tmp_log = 
self.make_logger(method_name="get_job_stats_new") tmp_log.debug("Start") diff --git a/pandaharvester/harvesterworkermaker/simple_worker_maker.py b/pandaharvester/harvesterworkermaker/simple_worker_maker.py index 171868d0..77a20381 100644 --- a/pandaharvester/harvesterworkermaker/simple_worker_maker.py +++ b/pandaharvester/harvesterworkermaker/simple_worker_maker.py @@ -42,7 +42,7 @@ def get_job_core_and_memory(self, queue_dict, job_spec): return job_core_count, job_memory - def get_job_type(self, job_spec, job_type, queue_dict, tmp_prodsourcelabel=None): + def get_job_type(self, job_spec, job_type, queue_dict, tmp_prod_source_label=None): queue_type = queue_dict.get("type", None) # 1. get prodSourceLabel from job (PUSH) @@ -52,9 +52,9 @@ def get_job_type(self, job_spec, job_type, queue_dict, tmp_prodsourcelabel=None) # 2. get prodSourceLabel from the specified job_type (PULL UPS) elif job_type: job_type_final = job_type - if tmp_prodsourcelabel: - if queue_type != "analysis" and tmp_prodsourcelabel not in ("user", "panda", "managed"): - # for production, unified or other types of queues we need to run neutral prodsourcelabels + if tmp_prod_source_label: + if queue_type != "analysis" and tmp_prod_source_label not in ("user", "panda", "managed"): + # for production, unified or other types of queues we need to run neutral prod_source_labels # with production proxy since they can't be distinguished and can fail job_type_final = "managed" @@ -161,24 +161,24 @@ def make_worker(self, jobspec_list, queue_config, job_type, resource_type, prod_ else: # when no job - tmp_prodsourcelabel = prod_source_label - if tmp_prodsourcelabel is None: - # no specified prodsourcelabel; randomize pilot type with weighting + tmp_prod_source_label = prod_source_label + if tmp_prod_source_label is None: + # no specified prod_source_label; randomize pilot type with weighting pdpm = getattr(queue_config, "prodSourceLabelRandomWeightsPermille", {}) choice_list = 
core_utils.make_choice_list(pdpm=pdpm, default="managed") - tmp_prodsourcelabel = random.choice(choice_list) + tmp_prod_source_label = random.choice(choice_list) fake_job = JobSpec() - fake_job.jobParams = {"prodSourceLabel": tmp_prodsourcelabel} + fake_job.jobParams = {"prodSourceLabel": tmp_prod_source_label} work_spec.pilotType = fake_job.get_pilot_type() del fake_job if work_spec.pilotType in ["RC", "ALRB", "PT"]: tmp_log.info(f"a worker has pilotType={work_spec.pilotType}") - work_spec.jobType = self.get_job_type(None, job_type, queue_dict, tmp_prodsourcelabel) + work_spec.jobType = self.get_job_type(None, job_type, queue_dict, tmp_prod_source_label) tmp_log.debug( - "get_job_type decided for job_type: {0} (input job_type: {1}, queue_type: {2}, tmp_prodsourcelabel: {3})".format( - work_spec.jobType, job_type, queue_dict.get("type", None), tmp_prodsourcelabel + "get_job_type decided for job_type: {0} (input job_type: {1}, queue_type: {2}, tmp_prod_source_label: {3})".format( + work_spec.jobType, job_type, queue_dict.get("type", None), tmp_prod_source_label ) ) From 33607ae2cc6fbed1b352fa5884d457093681e99e Mon Sep 17 00:00:00 2001 From: mightqxc Date: Wed, 25 Mar 2026 13:17:05 +0100 Subject: [PATCH 07/38] prod_source_label default ANY ; fixes --- pandaharvester/commit_timestamp.py | 2 +- .../harvesterbody/worker_adjuster.py | 2 +- pandaharvester/harvesterbody/worker_maker.py | 12 ++--- pandaharvester/harvestermisc/apfmon.py | 50 ++++++++++++++----- .../harvestersubmitter/submitter_common.py | 2 +- .../simple_worker_maker.py | 8 +-- 6 files changed, 51 insertions(+), 25 deletions(-) diff --git a/pandaharvester/commit_timestamp.py b/pandaharvester/commit_timestamp.py index 66d78b14..011a521d 100644 --- a/pandaharvester/commit_timestamp.py +++ b/pandaharvester/commit_timestamp.py @@ -1 +1 @@ -timestamp = "25-03-2026 12:02:04 on flin (by mightqxc)" +timestamp = "25-03-2026 12:17:05 on flin (by mightqxc)" diff --git a/pandaharvester/harvesterbody/worker_adjuster.py 
b/pandaharvester/harvesterbody/worker_adjuster.py index 6c478a51..7b7e6e4c 100644 --- a/pandaharvester/harvesterbody/worker_adjuster.py +++ b/pandaharvester/harvesterbody/worker_adjuster.py @@ -167,7 +167,7 @@ def _dict_to_dataframe(self, static_num_workers: dict) -> pl.DataFrame: { "job_type": job_type, "resource_type": resource_type, - "prod_source_label": None, # Placeholder for new field + "prod_source_label": "ANY", "nQueue": stats.get("nQueue", 0), "nReady": stats.get("nReady", 0), "nRunning": stats.get("nRunning", 0), diff --git a/pandaharvester/harvesterbody/worker_maker.py b/pandaharvester/harvesterbody/worker_maker.py index 3a1df118..14cd7f3a 100644 --- a/pandaharvester/harvesterbody/worker_maker.py +++ b/pandaharvester/harvesterbody/worker_maker.py @@ -18,7 +18,7 @@ def get_plugin(self, queue_config): return self.pluginFactory.get_plugin(queue_config.workerMaker) # make workers - def make_workers(self, jobchunk_list, queue_config, n_ready, job_type, resource_type, prod_source_label=None, maker=None): + def make_workers(self, jobchunk_list, queue_config, n_ready, job_type, resource_type, prod_source_label="ANY", maker=None): tmpLog = core_utils.make_logger( _logger, f"queue={queue_config.queueName} jtype={job_type} rtype={resource_type} pslabel={prod_source_label}", method_name="make_workers" ) @@ -65,35 +65,35 @@ def make_workers(self, jobchunk_list, queue_config, n_ready, job_type, resource_ return [], jobchunk_list # get number of jobs per worker - def get_num_jobs_per_worker(self, queue_config, n_workers, job_type, resource_type, prod_source_label=None, maker=None): + def get_num_jobs_per_worker(self, queue_config, n_workers, job_type, resource_type, prod_source_label="ANY", maker=None): # get plugin if maker is None: maker = self.pluginFactory.get_plugin(queue_config.workerMaker) return maker.get_num_jobs_per_worker(n_workers) # get number of workers per job - def get_num_workers_per_job(self, queue_config, n_workers, job_type, resource_type, 
prod_source_label=None, maker=None): + def get_num_workers_per_job(self, queue_config, n_workers, job_type, resource_type, prod_source_label="ANY", maker=None): # get plugin if maker is None: maker = self.pluginFactory.get_plugin(queue_config.workerMaker) return maker.get_num_workers_per_job(n_workers) # check number of ready resources - def num_ready_resources(self, queue_config, job_type, resource_type, prod_source_label=None, maker=None): + def num_ready_resources(self, queue_config, job_type, resource_type, prod_source_label="ANY", maker=None): # get plugin if maker is None: maker = self.pluginFactory.get_plugin(queue_config.workerMaker) return maker.num_ready_resources() # get upper limit on the cumulative total of workers per job - def get_max_workers_per_job_in_total(self, queue_config, job_type, resource_type, prod_source_label=None, maker=None): + def get_max_workers_per_job_in_total(self, queue_config, job_type, resource_type, prod_source_label="ANY", maker=None): # get plugin if maker is None: maker = self.pluginFactory.get_plugin(queue_config.workerMaker) return maker.get_max_workers_per_job_in_total() # get upper limit on the number of new workers per job in a cycle - def get_max_workers_per_job_per_cycle(self, queue_config, job_type, resource_type, prod_source_label=None, maker=None): + def get_max_workers_per_job_per_cycle(self, queue_config, job_type, resource_type, prod_source_label="ANY", maker=None): # get plugin if maker is None: maker = self.pluginFactory.get_plugin(queue_config.workerMaker) diff --git a/pandaharvester/harvestermisc/apfmon.py b/pandaharvester/harvestermisc/apfmon.py index 95e549b4..721ac02f 100644 --- a/pandaharvester/harvestermisc/apfmon.py +++ b/pandaharvester/harvestermisc/apfmon.py @@ -7,6 +7,7 @@ import traceback import requests + from pandaharvester import panda_pkg_info from pandaharvester.harvesterconfig import harvester_config from pandaharvester.harvestercore import core_utils @@ -163,20 +164,45 @@ def 
massage_label_data(self, data): return data try: - any = data["ANY"] - agg = {} - for rtype in data: - if rtype == "ANY": - continue - else: - for value in data[rtype]: - agg.setdefault(value, 0) - agg[value] += data[rtype][value] + # First aggregate over prodsourcelabel, then over resource_type + # Data structure: {prodsourcelabel: {resource_type: {values}}, "ANY": {...}} - if agg: - data["ANY"] = agg + # Extract the "ANY" prodsourcelabel if it exists + any_data = data.get("ANY", {}) + + # Aggregate across all prodsourcelabels for each resource_type + agg_by_rtype = {} + for prodsourcelabel in data: + if prodsourcelabel == "ANY": + continue + # data[prodsourcelabel] is like {resource_type: {values}} + for rtype in data[prodsourcelabel]: + if rtype == "ANY": + continue + # Aggregate values across all prodsourcelabels for this resource_type + if rtype not in agg_by_rtype: + agg_by_rtype[rtype] = {} + for value_key, value_count in data[prodsourcelabel][rtype].items(): + agg_by_rtype[rtype].setdefault(value_key, 0) + agg_by_rtype[rtype][value_key] += value_count + + # Now aggregate across all resource_types to create final "ANY" + final_agg = {} + for rtype in agg_by_rtype: + for value_key, value_count in agg_by_rtype[rtype].items(): + final_agg.setdefault(value_key, 0) + final_agg[value_key] += value_count + + # Update data structure: keep prodsourcelabel level but aggregate to "ANY" + if final_agg: + # Rebuild data with aggregated "ANY" at the prodsourcelabel level + result = {} + for prodsourcelabel in data: + result[prodsourcelabel] = data[prodsourcelabel] + result["ANY"] = final_agg + data = result else: - data["ANY"] = any + data["ANY"] = any_data tmp_log.debug(f"Massaged to data: {data}") diff --git a/pandaharvester/harvestersubmitter/submitter_common.py b/pandaharvester/harvestersubmitter/submitter_common.py index ac4ea3ff..845ad0dd 100644 --- a/pandaharvester/harvestersubmitter/submitter_common.py +++ b/pandaharvester/harvestersubmitter/submitter_common.py @@ 
-10,7 +10,7 @@ # and piloturl (pilot option --piloturl) for pilot 2 -def get_complicated_pilot_options(pilot_type, pilot_url=None, pilot_version="", prod_source_label=None, prod_rc_permille=0): +def get_complicated_pilot_options(pilot_type, pilot_url=None, pilot_version="", prod_source_label="ANY", prod_rc_permille=0): # for pilot 3 is_pilot3 = True if pilot_version.startswith("3") else False # basic map diff --git a/pandaharvester/harvesterworkermaker/simple_worker_maker.py b/pandaharvester/harvesterworkermaker/simple_worker_maker.py index 77a20381..6d636ab9 100644 --- a/pandaharvester/harvesterworkermaker/simple_worker_maker.py +++ b/pandaharvester/harvesterworkermaker/simple_worker_maker.py @@ -42,7 +42,7 @@ def get_job_core_and_memory(self, queue_dict, job_spec): return job_core_count, job_memory - def get_job_type(self, job_spec, job_type, queue_dict, tmp_prod_source_label=None): + def get_job_type(self, job_spec, job_type, queue_dict, tmp_prod_source_label="ANY"): queue_type = queue_dict.get("type", None) # 1. get prodSourceLabel from job (PUSH) @@ -52,7 +52,7 @@ def get_job_type(self, job_spec, job_type, queue_dict, tmp_prod_source_label=Non # 2. 
get prodSourceLabel from the specified job_type (PULL UPS) elif job_type: job_type_final = job_type - if tmp_prod_source_label: + if tmp_prod_source_label != "ANY": if queue_type != "analysis" and tmp_prod_source_label not in ("user", "panda", "managed"): # for production, unified or other types of queues we need to run neutral prod_source_labels # with production proxy since they can't be distinguished and can fail @@ -71,7 +71,7 @@ def get_job_type(self, job_spec, job_type, queue_dict, tmp_prod_source_label=Non return job_type_final # make a worker from jobs - def make_worker(self, jobspec_list, queue_config, job_type, resource_type, prod_source_label=None): + def make_worker(self, jobspec_list, queue_config, job_type, resource_type, prod_source_label="ANY"): tmp_log = self.make_logger(_logger, f"queue={queue_config.queueName}:{job_type}:{resource_type}:{prod_source_label}", method_name="make_worker") tmp_log.debug(f"jobspec_list: {jobspec_list}") @@ -162,7 +162,7 @@ def make_worker(self, jobspec_list, queue_config, job_type, resource_type, prod_ else: # when no job tmp_prod_source_label = prod_source_label - if tmp_prod_source_label is None: + if tmp_prod_source_label == "ANY": # no specified prod_source_label; randomize pilot type with weighting pdpm = getattr(queue_config, "prodSourceLabelRandomWeightsPermille", {}) choice_list = core_utils.make_choice_list(pdpm=pdpm, default="managed") From 3c8d009dfe2cc6da19b295f1e5f89333f1764bc6 Mon Sep 17 00:00:00 2001 From: mightqxc Date: Wed, 25 Mar 2026 14:43:05 +0100 Subject: [PATCH 08/38] swap resource_type and prod_source_label in nested dict --- pandaharvester/commit_timestamp.py | 2 +- pandaharvester/harvesterbody/submitter.py | 10 ++-- .../harvesterbody/worker_adjuster.py | 29 +++++++---- pandaharvester/harvestermisc/apfmon.py | 48 +++++++++---------- 4 files changed, 50 insertions(+), 39 deletions(-) diff --git a/pandaharvester/commit_timestamp.py b/pandaharvester/commit_timestamp.py index 011a521d..37fb8e2c
100644 --- a/pandaharvester/commit_timestamp.py +++ b/pandaharvester/commit_timestamp.py @@ -1 +1 @@ -timestamp = "25-03-2026 12:17:05 on flin (by mightqxc)" +timestamp = "25-03-2026 13:43:05 on flin (by mightqxc)" diff --git a/pandaharvester/harvesterbody/submitter.py b/pandaharvester/harvesterbody/submitter.py index cc9503c2..145a8471 100644 --- a/pandaharvester/harvesterbody/submitter.py +++ b/pandaharvester/harvesterbody/submitter.py @@ -88,11 +88,13 @@ def run(self): # get queue queue_config = self.queue_configMapper.get_queue(queue_name) workerMakerCore = self.workerMaker.get_plugin(queue_config) - for prod_source_label in n_workers_per_queue_jt_rt[queue_name][job_type]: - for resource_type in n_workers_per_queue_jt_rt[queue_name][job_type][prod_source_label]: - tmp_val = n_workers_per_queue_jt_rt[queue_name][job_type][prod_source_label][resource_type] + for resource_type in n_workers_per_queue_jt_rt[queue_name][job_type]: + for prod_source_label in n_workers_per_queue_jt_rt[queue_name][job_type][resource_type]: + tmp_val = n_workers_per_queue_jt_rt[queue_name][job_type][resource_type][prod_source_label] tmp_log = self.make_logger( - _logger, f"id={locked_by} queue={queue_name} jtype={job_type} rtype={resource_type}", method_name="run" + _logger, + f"id={locked_by} queue={queue_name} jtype={job_type} rtype={resource_type} pslabel={prod_source_label}", + method_name="run", ) try: tmp_log.debug("start") diff --git a/pandaharvester/harvesterbody/worker_adjuster.py b/pandaharvester/harvesterbody/worker_adjuster.py index 7b7e6e4c..30bec74b 100644 --- a/pandaharvester/harvesterbody/worker_adjuster.py +++ b/pandaharvester/harvesterbody/worker_adjuster.py @@ -200,9 +200,7 @@ def _dataframe_to_dict(self, df: pl.DataFrame, queue_name: str) -> dict: [job_type, resource_type, prod_source_label, nQueue, nReady, nRunning, nNewWorkers] Output dict structure: - {queue_name: {job_type: {prod_source_label: {resource_type: {nQueue, nReady, nRunning, nNewWorkers}}}}} - - Note: 
prod_source_label is now a dimension in the output dict, nested before resource_type. + {queue_name: {job_type: {resource_type: {prod_source_label: {nQueue, nReady, nRunning, nNewWorkers}}}}} """ result = {queue_name: {}} @@ -214,11 +212,13 @@ def _dataframe_to_dict(self, df: pl.DataFrame, queue_name: str) -> dict: # Initialize nested dicts as needed if job_type not in result[queue_name]: result[queue_name][job_type] = {} - if prod_source_label not in result[queue_name][job_type]: - result[queue_name][job_type][prod_source_label] = {} + if resource_type not in result[queue_name][job_type]: + result[queue_name][job_type][resource_type] = {} + if prod_source_label not in result[queue_name][job_type][resource_type]: + result[queue_name][job_type][resource_type][prod_source_label] = {} - # Store the stats for this resource type - result[queue_name][job_type][prod_source_label][resource_type] = { + # Store the stats for this prod_source_label + result[queue_name][job_type][resource_type][prod_source_label] = { "nQueue": row["nQueue"], "nReady": row["nReady"], "nRunning": row["nRunning"], @@ -237,7 +237,7 @@ def define_num_workers(self, static_num_workers, site_name) -> dict | None: site_name (str): The name of the site for which to define the number of workers. Returns: - (dict|None): The updated static_num_workers dict with the defined number of new workers to submit in the "nNewWorkers" field, or None if an error occurred. + (dict|None): A dict of the form {queue_name: {job_type: {resource_type: {prod_source_label: {"nQueue": int, "nReady": int, "nRunning": int, "nNewWorkers": int}}}}} with the defined number of new workers to submit in the "nNewWorkers" field, or None if an error occurred. 
""" tmp_log = core_utils.make_logger(_logger, f"site={site_name}", method_name="define_num_workers") tmp_log.debug("start") @@ -292,6 +292,10 @@ def _normalize_job_type_any(queue_dict): if job_stats is not None: job_stats = job_stats.data + job_stats_new = self.dbProxy.get_cache("job_statistics_new.json", None) + if job_stats_new is not None: + job_stats_new = job_stats_new.data + # get panda queues dict from CRIC panda_queues_dict = PandaQueuesDict() @@ -301,12 +305,17 @@ def _normalize_job_type_any(queue_dict): # Track results for all queues result_dict = {} + # define a priority list for prod_source_label to ensure consistent ordering in the dataframe and processing. + prioritized_pslabels = ["rc_alrb"] + # define num of new workers - process by queue for queue_name in static_num_workers: # Create dataframe for current queue only df_queue_dict = {queue_name: static_num_workers[queue_name]} df_queue = self._dict_to_dataframe(df_queue_dict) + # Get activated jobs stats of prioritized prod_source_labels for this queue + # You can add sorting here if needed, e.g.: # df_queue = df_queue.sort(by=["job_type", "resource_type", "prod_source_label"]) @@ -341,8 +350,8 @@ def _normalize_job_type_any(queue_dict): n_running = row["nRunning"] tmp_log.debug( - f"Processing queue {queue_name} job_type {job_type} resource_type {resource_type} " - f"prod_source_label {prod_source_label} with static_num_workers " + f"Processing queue={queue_name} job_type={job_type} resource_type={resource_type} " + f"prod_source_label={prod_source_label} with static_num_workers " f"nQueue={n_queue} nReady={n_ready} nRunning={n_running}" ) diff --git a/pandaharvester/harvestermisc/apfmon.py b/pandaharvester/harvestermisc/apfmon.py index 721ac02f..a607220f 100644 --- a/pandaharvester/harvestermisc/apfmon.py +++ b/pandaharvester/harvestermisc/apfmon.py @@ -164,41 +164,41 @@ def massage_label_data(self, data): return data try: - # First aggregate over prodsourcelabel, then over resource_type - # Data 
structure: {prodsourcelabel: {resource_type: {values}}, "ANY": {...}} + # First aggregate over resource_type, then over prod_source_label + # Data structure: {resource_type: {prod_source_label: {values}}, "ANY": {...}} - # Extract the "ANY" prodsourcelabel if it exists + # Extract the "ANY" resource_type if it exists any_data = data.get("ANY", {}) - # Aggregate across all prodsourcelabels for each resource_type - agg_by_rtype = {} - for prodsourcelabel in data: - if prodsourcelabel == "ANY": + # Aggregate across all resource_types for each prod_source_label + agg_by_pslabel = {} + for rtype in data: + if rtype == "ANY": continue - # data[prodsourcelabel] is like {resource_type: {values}} - for rtype in data[prodsourcelabel]: - if rtype == "ANY": + # data[rtype] is like {prod_source_label: {values}} + for prod_source_label in data[rtype]: + if prod_source_label == "ANY": continue - # Aggregate values across all prodsourcelabels for this resource_type - if rtype not in agg_by_rtype: - agg_by_rtype[rtype] = {} - for value_key, value_count in data[prodsourcelabel][rtype].items(): - agg_by_rtype[rtype].setdefault(value_key, 0) - agg_by_rtype[rtype][value_key] += value_count - - # Now aggregate across all resource_types to create final "ANY" + # Aggregate values across all resource_types for this prod_source_label + if prod_source_label not in agg_by_pslabel: + agg_by_pslabel[prod_source_label] = {} + for value_key, value_count in data[rtype][prod_source_label].items(): + agg_by_pslabel[prod_source_label].setdefault(value_key, 0) + agg_by_pslabel[prod_source_label][value_key] += value_count + + # Now aggregate across all prod_source_labels to create final "ANY" final_agg = {} - for rtype in agg_by_rtype: - for value_key, value_count in agg_by_rtype[rtype].items(): + for prod_source_label in agg_by_pslabel: + for value_key, value_count in agg_by_pslabel[prod_source_label].items(): final_agg.setdefault(value_key, 0) final_agg[value_key] += value_count - # Update data 
structure: keep prodsourcelabel level but aggregate to "ANY" + # Update data structure: keep resource_type level but aggregate to "ANY" if final_agg: - # Rebuild data with aggregated "ANY" at the prodsourcelabel level + # Rebuild data with aggregated "ANY" at the resource_type level result = {} - for prodsourcelabel in data: - result[prodsourcelabel] = data[prodsourcelabel] + for rtype in data: + result[rtype] = data[rtype] result["ANY"] = final_agg data = result else: From 242a6cae301714e77eda6e5eb6f460be116dcd20 Mon Sep 17 00:00:00 2001 From: mightqxc Date: Thu, 26 Mar 2026 00:04:35 +0100 Subject: [PATCH 09/38] Add map of pilotType and prodSourceLabel --- pandaharvester/commit_timestamp.py | 2 +- pandaharvester/harvesterbody/submitter.py | 3 +- .../harvesterbody/worker_adjuster.py | 448 ++++++++++-------- pandaharvester/harvestercore/core_utils.py | 36 ++ pandaharvester/harvestercore/db_proxy.py | 56 ++- 5 files changed, 328 insertions(+), 217 deletions(-) diff --git a/pandaharvester/commit_timestamp.py b/pandaharvester/commit_timestamp.py index 37fb8e2c..37f8f10f 100644 --- a/pandaharvester/commit_timestamp.py +++ b/pandaharvester/commit_timestamp.py @@ -1 +1 @@ -timestamp = "25-03-2026 13:43:05 on flin (by mightqxc)" +timestamp = "25-03-2026 23:04:35 on flin (by mightqxc)" diff --git a/pandaharvester/harvesterbody/submitter.py b/pandaharvester/harvesterbody/submitter.py index 145a8471..5554c753 100644 --- a/pandaharvester/harvesterbody/submitter.py +++ b/pandaharvester/harvesterbody/submitter.py @@ -69,7 +69,8 @@ def run(self): if tmp_resource_type in res_map[DEFAULT_JOB_TYPE]: tmp_queue_name = res_map[DEFAULT_JOB_TYPE][tmp_resource_type] if tmp_queue_name in current_workers: - current_workers[tmp_queue_name][DEFAULT_JOB_TYPE][tmp_resource_type]["nNewWorkers"] = tmp_new_val + for tmp_pilot_type in current_workers[tmp_queue_name][DEFAULT_JOB_TYPE][tmp_resource_type]: + 
current_workers[tmp_queue_name][DEFAULT_JOB_TYPE][tmp_resource_type][tmp_pilot_type]["nNewWorkers"] = tmp_new_val # define number of new workers if len(current_workers) == 0: diff --git a/pandaharvester/harvesterbody/worker_adjuster.py b/pandaharvester/harvesterbody/worker_adjuster.py index 30bec74b..aad7636a 100644 --- a/pandaharvester/harvesterbody/worker_adjuster.py +++ b/pandaharvester/harvesterbody/worker_adjuster.py @@ -154,7 +154,7 @@ def _dict_to_dataframe(self, static_num_workers: dict) -> pl.DataFrame: Convert nested dict structure to polars dataframe. Input dict structure: - {queue_name: {job_type: {resource_type: {nQueue, nReady, nRunning, nNewWorkers}}}} + {queue_name: {job_type: {resource_type: {tmp_pilot_type: {nQueue, nReady, nRunning, nNewWorkers}}}}} Output dataframe columns: [job_type, resource_type, prod_source_label, nQueue, nReady, nRunning, nNewWorkers] @@ -162,18 +162,38 @@ def _dict_to_dataframe(self, static_num_workers: dict) -> pl.DataFrame: rows = [] for queue_name, queue_dict in static_num_workers.items(): for job_type, rt_dict in queue_dict.items(): - for resource_type, stats in rt_dict.items(): - rows.append( - { - "job_type": job_type, - "resource_type": resource_type, - "prod_source_label": "ANY", - "nQueue": stats.get("nQueue", 0), - "nReady": stats.get("nReady", 0), - "nRunning": stats.get("nRunning", 0), - "nNewWorkers": stats.get("nNewWorkers", 0), - } - ) + for resource_type, pt_dict in rt_dict.items(): + # Handle case where the value is still the old structure (without pilotType) + # or new structure (with pilotType dimension) + if isinstance(pt_dict, dict): + # Check if this looks like stats dict (has nQueue, nReady, etc.) 
or pilot type dict + if "nQueue" in pt_dict or "nReady" in pt_dict or "nRunning" in pt_dict or "nNewWorkers" in pt_dict: + # Old structure - pt_dict contains the stats directly + rows.append( + { + "job_type": job_type, + "resource_type": resource_type, + "prod_source_label": "ANY", + "nQueue": pt_dict.get("nQueue", 0), + "nReady": pt_dict.get("nReady", 0), + "nRunning": pt_dict.get("nRunning", 0), + "nNewWorkers": pt_dict.get("nNewWorkers", 0), + } + ) + else: + # New structure - pt_dict contains pilot types as keys + for tmp_pilot_type, stats in pt_dict.items(): + rows.append( + { + "job_type": job_type, + "resource_type": resource_type, + "prod_source_label": core_utils.special_pilot_type_to_prod_source_label(tmp_pilot_type) or "ANY", + "nQueue": stats.get("nQueue", 0), + "nReady": stats.get("nReady", 0), + "nRunning": stats.get("nRunning", 0), + "nNewWorkers": stats.get("nNewWorkers", 0), + } + ) if not rows: # Return empty dataframe with correct schema @@ -227,13 +247,64 @@ def _dataframe_to_dict(self, df: pl.DataFrame, queue_name: str) -> dict: return result + # convert job statistics dict to polars dataframe for better accessibility + def _job_stats_to_dataframe(self, job_stats: dict | None = None) -> pl.DataFrame: + """ + Convert job statistics dict to polars dataframe. 
+ + Input dict structure: + {computing_site: {resource_type: {prod_source_label: {job_status: n_jobs}}}} + + Output dataframe columns: + [computing_site, resource_type, prod_source_label, job_status, n_jobs] + """ + rows = [] + if job_stats is None: + return pl.DataFrame( + { + "computing_site": [], + "resource_type": [], + "prod_source_label": [], + "job_status": [], + "n_jobs": [], + } + ) + + for computing_site, rt_dict in job_stats.items(): + for resource_type, psl_dict in rt_dict.items(): + for prod_source_label, job_status_dict in psl_dict.items(): + for job_status, n_jobs in job_status_dict.items(): + rows.append( + { + "computing_site": computing_site, + "resource_type": resource_type, + "prod_source_label": prod_source_label, + "job_status": job_status, + "n_jobs": n_jobs, + } + ) + + if not rows: + # Return empty dataframe with correct schema + return pl.DataFrame( + { + "computing_site": [], + "resource_type": [], + "prod_source_label": [], + "job_status": [], + "n_jobs": [], + } + ) + + return pl.DataFrame(rows) + # define number of workers to submit based on various information def define_num_workers(self, static_num_workers, site_name) -> dict | None: """ Define number of workers to submit based on various information, including static site config, queue status, job statistics, and throttler if defined. The function also updates APF monitoring with the decision and the reason. Args: - static_num_workers (dict): A dict of the form {queue_name: {job_type: {resource_type: {"nQueue": int, "nReady": int, "nRunning": int, "nNewWorkers": int}}}} defining the static number of workers to submit for each queue, job type and resource type. + static_num_workers (dict): A dict of the form {queue_name: {job_type: {resource_type: {tmp_pilot_type: {"nQueue": int, "nReady": int, "nRunning": int, "nNewWorkers": int}}}}} defining the static number of workers to submit for each queue, job type, resource type and pilot type. 
site_name (str): The name of the site for which to define the number of workers. Returns: @@ -292,9 +363,11 @@ def _normalize_job_type_any(queue_dict): if job_stats is not None: job_stats = job_stats.data + df_job_stats_new = self._job_stats_to_dataframe(None) job_stats_new = self.dbProxy.get_cache("job_statistics_new.json", None) if job_stats_new is not None: - job_stats_new = job_stats_new.data + # Convert to dataframe for better accessibility + df_job_stats_new = self._job_stats_to_dataframe(job_stats_new.data) # get panda queues dict from CRIC panda_queues_dict = PandaQueuesDict() @@ -310,14 +383,18 @@ def _normalize_job_type_any(queue_dict): # define num of new workers - process by queue for queue_name in static_num_workers: - # Create dataframe for current queue only - df_queue_dict = {queue_name: static_num_workers[queue_name]} - df_queue = self._dict_to_dataframe(df_queue_dict) + # Work with nested dict instead of dataframe for updates + queue_dict_updates = copy.deepcopy(static_num_workers[queue_name]) - # Get activated jobs stats of prioritized prod_source_labels for this queue + # # fill in prioritized prod_source_labels according to activated jobs stats of this queue + # tmp_df_queue = df_queue + # for prod_source_label in prioritized_pslabels: + # # number of activated jobs with this prod_source_label + # n_activated_pslabel = df_job_stats_new.filter(...) + # tmp_df_queue = ... 
- # You can add sorting here if needed, e.g.: - # df_queue = df_queue.sort(by=["job_type", "resource_type", "prod_source_label"]) + # # You can add sorting here if needed, e.g.: + # # df_queue = df_queue.sort(by=["job_type", "resource_type", "prod_source_label"]) # get queue queue_config = self.queue_configMapper.get_queue(queue_name) @@ -340,184 +417,170 @@ def _normalize_job_type_any(queue_dict): apf_data = None job_type = DEFAULT_JOB_TYPE - # Single loop over dataframe rows - replaces nested for-loops - for row_idx, row in enumerate(df_queue.iter_rows(named=True)): - job_type = row["job_type"] - resource_type = row["resource_type"] - prod_source_label = row["prod_source_label"] - n_queue = row["nQueue"] - n_ready = row["nReady"] - n_running = row["nRunning"] - - tmp_log.debug( - f"Processing queue={queue_name} job_type={job_type} resource_type={resource_type} " - f"prod_source_label={prod_source_label} with static_num_workers " - f"nQueue={n_queue} nReady={n_ready} nRunning={n_running}" - ) - - # get cores and memory request per worker of this resource_type - queue_dict = panda_queues_dict.get(queue_name, {}) - rtype_request_cores, rtype_request_memory = rt_mapper.calculate_worker_requirements(resource_type, queue_dict) - - # set 0 to num of new workers when the queue is disabled - if queue_name in queue_stat and queue_stat[queue_name]["status"] in ["offline", "standby", "maintenance"]: - df_queue = df_queue.with_columns(pl.when(pl.col("job_type") == job_type).then(0).otherwise(pl.col("nNewWorkers")).alias("nNewWorkers")) - ret_msg = f"set n_new_workers=0 since status={queue_stat[queue_name]['status']}" - tmp_log.debug(ret_msg) - apf_msg = f"Not submitting workers since queue status = {queue_stat[queue_name]['status']}" - continue + # Iterate through dict to process each row + for job_type_key, rt_dict in queue_dict_updates.items(): + for resource_type, psl_dict in rt_dict.items(): + for prod_source_label, stats in psl_dict.items(): + job_type = job_type_key + 
n_queue = stats["nQueue"] + n_ready = stats["nReady"] + n_running = stats["nRunning"] - # protection against not-up-to-date queue config - if queue_config is None: - df_queue = df_queue.with_columns(pl.when(pl.col("job_type") == job_type).then(0).otherwise(pl.col("nNewWorkers")).alias("nNewWorkers")) - ret_msg = "set n_new_workers=0 due to missing queue_config" - tmp_log.debug(ret_msg) - apf_msg = "Not submitting workers because of missing queue_config" - continue - - # get throttler - if queue_name not in self.throttlerMap: - if hasattr(queue_config, "throttler"): - throttler = self.pluginFactory.get_plugin(queue_config.throttler) - else: - throttler = None - self.throttlerMap[queue_name] = throttler - - # check throttler - throttler = self.throttlerMap[queue_name] - if throttler is not None: - to_throttle, tmp_msg = throttler.to_be_throttled(queue_config, queue_config_mapper=self.queue_configMapper) - if to_throttle: - df_queue = df_queue.with_columns( - pl.when(pl.col("job_type") == job_type).then(0).otherwise(pl.col("nNewWorkers")).alias("nNewWorkers") + tmp_log.debug( + f"Processing queue={queue_name} job_type={job_type} resource_type={resource_type} " + f"prod_source_label={prod_source_label} with static_num_workers " + f"nQueue={n_queue} nReady={n_ready} nRunning={n_running}" ) - ret_msg = f"set n_new_workers=0 by {throttler.__class__.__name__}:{tmp_msg}" - tmp_log.debug(ret_msg) - continue - - # check stats - if resource_type != "ANY" and job_type != "ANY" and job_type is not None: - n_queue_total += n_queue - n_ready_total += n_ready - n_running_total += n_running - - if queue_config.runMode == "slave": - n_new_workers_def = row["nNewWorkers"] - if n_new_workers_def == 0: - df_queue = df_queue.with_columns( - pl.when( - (pl.col("job_type") == job_type) - & (pl.col("resource_type") == resource_type) - & (pl.col("prod_source_label") == prod_source_label) + + # get cores and memory request per worker of this resource_type + queue_dict = 
panda_queues_dict.get(queue_name, {}) + rtype_request_cores, rtype_request_memory = rt_mapper.calculate_worker_requirements(resource_type, queue_dict) + + # set 0 to num of new workers when the queue is disabled + if queue_name in queue_stat and queue_stat[queue_name]["status"] in ["offline", "standby", "maintenance"]: + stats["nNewWorkers"] = 0 + ret_msg = f"set n_new_workers=0 since status={queue_stat[queue_name]['status']}" + tmp_log.debug(ret_msg) + apf_msg = f"Not submitting workers since queue status = {queue_stat[queue_name]['status']}" + continue + + # protection against not-up-to-date queue config + if queue_config is None: + stats["nNewWorkers"] = 0 + ret_msg = "set n_new_workers=0 due to missing queue_config" + tmp_log.debug(ret_msg) + apf_msg = "Not submitting workers because of missing queue_config" + continue + + # get throttler + if queue_name not in self.throttlerMap: + if hasattr(queue_config, "throttler"): + throttler = self.pluginFactory.get_plugin(queue_config.throttler) + else: + throttler = None + self.throttlerMap[queue_name] = throttler + + # check throttler + throttler = self.throttlerMap[queue_name] + if throttler is not None: + to_throttle, tmp_msg = throttler.to_be_throttled(queue_config, queue_config_mapper=self.queue_configMapper) + if to_throttle: + stats["nNewWorkers"] = 0 + ret_msg = f"set n_new_workers=0 by {throttler.__class__.__name__}:{tmp_msg}" + tmp_log.debug(ret_msg) + continue + + # check stats + if resource_type != "ANY" and job_type != "ANY" and job_type is not None: + n_queue_total += n_queue + n_ready_total += n_ready + n_running_total += n_running + + if queue_config.runMode == "slave": + n_new_workers_def = stats["nNewWorkers"] + if n_new_workers_def == 0: + stats["nNewWorkers"] = 0 + ret_msg = "set n_new_workers=0 by panda in slave mode" + tmp_log.debug(ret_msg) + continue + else: + n_new_workers_def = None + + # define num of new workers based on static site config + n_new_workers = 0 + if n_queue >= 
n_queue_limit_per_rt > 0: + # enough queued workers + ret_msg = f"No n_new_workers since n_queue({n_queue})>=n_queue_limit_per_rt({n_queue_limit_per_rt})" + tmp_log.debug(ret_msg) + pass + elif (n_queue + n_ready + n_running) >= max_workers > 0: + # enough workers in the system + ret_msg = ( + f"No n_new_workers since n_queue({n_queue}) + n_ready({n_ready}) + n_running({n_running}) >= max_workers({max_workers})" ) - .then(0) - .otherwise(pl.col("nNewWorkers")) - .alias("nNewWorkers") - ) - ret_msg = "set n_new_workers=0 by panda in slave mode" - tmp_log.debug(ret_msg) - continue - else: - n_new_workers_def = None - - # define num of new workers based on static site config - n_new_workers = 0 - if n_queue >= n_queue_limit_per_rt > 0: - # enough queued workers - ret_msg = f"No n_new_workers since n_queue({n_queue})>=n_queue_limit_per_rt({n_queue_limit_per_rt})" - tmp_log.debug(ret_msg) - pass - elif (n_queue + n_ready + n_running) >= max_workers > 0: - # enough workers in the system - ret_msg = f"No n_new_workers since n_queue({n_queue}) + n_ready({n_ready}) + n_running({n_running}) >= max_workers({max_workers})" - tmp_log.debug(ret_msg) - pass - elif queue_limit_cores is not None and cores_queue >= queue_limit_cores: - # enough queuing cores - ret_msg = f"No n_new_workers since cores_queue({cores_queue}) >= queue_limit_cores({queue_limit_cores})" - tmp_log.debug(ret_msg) - pass - elif queue_limit_memory is not None and memory_queue >= queue_limit_memory: - # enough queuing memory - ret_msg = f"No n_new_workers since memory_queue({memory_queue} MB) >= queue_limit_memory({queue_limit_memory} MB)" - tmp_log.debug(ret_msg) - pass - else: - max_queued_workers = None + tmp_log.debug(ret_msg) + pass + elif queue_limit_cores is not None and cores_queue >= queue_limit_cores: + # enough queuing cores + ret_msg = f"No n_new_workers since cores_queue({cores_queue}) >= queue_limit_cores({queue_limit_cores})" + tmp_log.debug(ret_msg) + pass + elif queue_limit_memory is not None 
and memory_queue >= queue_limit_memory: + # enough queuing memory + ret_msg = f"No n_new_workers since memory_queue({memory_queue} MB) >= queue_limit_memory({queue_limit_memory} MB)" + tmp_log.debug(ret_msg) + pass + else: + max_queued_workers = None - if n_queue_limit_per_rt > 0: # there is a limit set for the queue - max_queued_workers = n_queue_limit_per_rt + if n_queue_limit_per_rt > 0: # there is a limit set for the queue + max_queued_workers = n_queue_limit_per_rt - # Reset the maxQueueWorkers according to particular - if n_new_workers_def is not None: # don't surpass limits given centrally + # Reset the maxQueueWorkers according to particular + if n_new_workers_def is not None: # don't surpass limits given centrally - maxQueuedWorkers_slave = n_new_workers_def + n_queue - if max_queued_workers is not None: - max_queued_workers = min(maxQueuedWorkers_slave, max_queued_workers) - else: - max_queued_workers = maxQueuedWorkers_slave + maxQueuedWorkers_slave = n_new_workers_def + n_queue + if max_queued_workers is not None: + max_queued_workers = min(maxQueuedWorkers_slave, max_queued_workers) + else: + max_queued_workers = maxQueuedWorkers_slave - elif queue_config.mapType == "NoJob": # for pull mode, limit to activated jobs - if job_stats is None: - tmp_log.warning("n_activated not defined, defaulting to configured queue limits") - pass - else: - # limit the queue to the number of activated jobs to avoid empty pilots - try: - n_min_pilots = 1 - if self.get_queue_no_pilots_when_no_active_jobs(queue_name): - n_min_pilots = 0 - - tmp_n_activated_jobs = job_stats[queue_name]["activated"] - tmp_log.debug(f"available activated panda jobs {tmp_n_activated_jobs}") - - activate_worker_factor = self.get_activate_worker_factor(queue_name, job_type, resource_type, queue_dict, queue_config) - if tmp_n_activated_jobs * activate_worker_factor > 0: - n_min_pilots = 1 - n_activated = max(int(tmp_n_activated_jobs * activate_worker_factor), n_min_pilots) # avoid no activity 
queues - except KeyError: - # zero job in the queue - tmp_log.debug("no job in queue") - if self.get_queue_no_pilots_when_no_active_jobs(queue_name): - n_activated = 0 + elif queue_config.mapType == "NoJob": # for pull mode, limit to activated jobs + if job_stats is None: + tmp_log.warning("n_activated not defined, defaulting to configured queue limits") + pass else: - n_activated = max(1 - n_queue - n_ready - n_running, 0) - finally: - queue_limit = max_queued_workers - max_queued_workers = min(n_activated, max_queued_workers) - tmp_log.debug(f"limiting max_queued_workers to min(n_activated={n_activated}, queue_limit={queue_limit})") - - if max_queued_workers is None: # no value found, use default value - max_queued_workers = 1 - - # new workers - n_new_workers = max(max_queued_workers - n_queue, 0) - tmp_log.debug(f"setting n_new_workers to {n_new_workers} in max_queued_workers calculation") - if max_workers > 0: - n_new_workers = min(n_new_workers, max(max_workers - n_queue - n_ready - n_running, 0)) - tmp_log.debug(f"setting n_new_workers to {n_new_workers} to respect max_workers") - if queue_limit_cores: - new_worker_cores_max = max(queue_limit_cores - cores_queue, 0) - n_new_workers = min(n_new_workers, math.ceil(new_worker_cores_max / rtype_request_cores)) - tmp_log.debug(f"setting n_new_workers to {n_new_workers} to respect queue_limit_cores") - if queue_limit_memory: - new_worker_memory_max = max(queue_limit_memory - memory_queue, 0) - n_new_workers = min(n_new_workers, math.ceil(new_worker_memory_max / rtype_request_memory)) - tmp_log.debug(f"setting n_new_workers to {n_new_workers} to respect queue_limit_memory") - if queue_config.maxNewWorkersPerCycle > 0: - n_new_workers = min(n_new_workers, queue_config.maxNewWorkersPerCycle) - tmp_log.debug(f"setting n_new_workers to {n_new_workers} in order to respect maxNewWorkersPerCycle") - if self.maxNewWorkers is not None and self.maxNewWorkers > 0: - n_new_workers = min(n_new_workers, self.maxNewWorkers) - 
tmp_log.debug(f"setting n_new_workers to {n_new_workers} in order to respect universal maxNewWorkers") - df_queue = df_queue.with_columns( - pl.when( - (pl.col("job_type") == job_type) & (pl.col("resource_type") == resource_type) & (pl.col("prod_source_label") == prod_source_label) - ) - .then(n_new_workers) - .otherwise(pl.col("nNewWorkers")) - .alias("nNewWorkers") - ) + # limit the queue to the number of activated jobs to avoid empty pilots + try: + n_min_pilots = 1 + if self.get_queue_no_pilots_when_no_active_jobs(queue_name): + n_min_pilots = 0 + + tmp_n_activated_jobs = job_stats[queue_name]["activated"] + tmp_log.debug(f"available activated panda jobs {tmp_n_activated_jobs}") + + activate_worker_factor = self.get_activate_worker_factor( + queue_name, job_type, resource_type, queue_dict, queue_config + ) + if tmp_n_activated_jobs * activate_worker_factor > 0: + n_min_pilots = 1 + n_activated = max(int(tmp_n_activated_jobs * activate_worker_factor), n_min_pilots) # avoid no activity queues + except KeyError: + # zero job in the queue + tmp_log.debug("no job in queue") + if self.get_queue_no_pilots_when_no_active_jobs(queue_name): + n_activated = 0 + else: + n_activated = max(1 - n_queue - n_ready - n_running, 0) + finally: + queue_limit = max_queued_workers + max_queued_workers = min(n_activated, max_queued_workers) + tmp_log.debug(f"limiting max_queued_workers to min(n_activated={n_activated}, queue_limit={queue_limit})") + + if max_queued_workers is None: # no value found, use default value + max_queued_workers = 1 + + # new workers + n_new_workers = max(max_queued_workers - n_queue, 0) + tmp_log.debug(f"setting n_new_workers to {n_new_workers} in max_queued_workers calculation") + if max_workers > 0: + n_new_workers = min(n_new_workers, max(max_workers - n_queue - n_ready - n_running, 0)) + tmp_log.debug(f"setting n_new_workers to {n_new_workers} to respect max_workers") + if queue_limit_cores: + new_worker_cores_max = max(queue_limit_cores - cores_queue, 
0) + n_new_workers = min(n_new_workers, math.ceil(new_worker_cores_max / rtype_request_cores)) + tmp_log.debug(f"setting n_new_workers to {n_new_workers} to respect queue_limit_cores") + if queue_limit_memory: + new_worker_memory_max = max(queue_limit_memory - memory_queue, 0) + n_new_workers = min(n_new_workers, math.ceil(new_worker_memory_max / rtype_request_memory)) + tmp_log.debug(f"setting n_new_workers to {n_new_workers} to respect queue_limit_memory") + if queue_config.maxNewWorkersPerCycle > 0: + n_new_workers = min(n_new_workers, queue_config.maxNewWorkersPerCycle) + tmp_log.debug(f"setting n_new_workers to {n_new_workers} in order to respect maxNewWorkersPerCycle") + if self.maxNewWorkers is not None and self.maxNewWorkers > 0: + n_new_workers = min(n_new_workers, self.maxNewWorkers) + tmp_log.debug(f"setting n_new_workers to {n_new_workers} in order to respect universal maxNewWorkers") + stats["nNewWorkers"] = n_new_workers # adjust n_new_workers for UCORE to let aggregations over RT respect nQueueLimitWorker and max_workers if queue_config is None: @@ -527,6 +590,9 @@ def _normalize_job_type_any(queue_dict): else: max_new_workers_per_cycle = queue_config.maxNewWorkersPerCycle + # Convert updated dict to dataframe for aggregation operations + df_queue = self._dict_to_dataframe({queue_name: queue_dict_updates}) + # Check if we have multiple job types for this queue unique_job_types = df_queue["job_type"].unique() diff --git a/pandaharvester/harvestercore/core_utils.py b/pandaharvester/harvestercore/core_utils.py index a3dcd60f..fc283c52 100644 --- a/pandaharvester/harvestercore/core_utils.py +++ b/pandaharvester/harvestercore/core_utils.py @@ -706,3 +706,39 @@ def naive_utcfromtimestamp(timestamp: float) -> datetime: datetime: current UTC date and time, without tzinfo """ return aware_utcfromtimestamp(timestamp).replace(tzinfo=None) + + +def special_pilot_type_to_prod_source_label(pilot_type: str) -> str | None: + """ + Convert special pilotType of worker 
to prodSourceLabel of PanDA job. + + Args: + pilot_type (str): pilotType of worker, e.g. "RC", "ALRB", "PT" (except "PR" which is production) + + Returns: + str: prodSourceLabel of PanDA job, e.g. "rc_test2", "rc_alrb", "ptest"; None if no mapping is defined for the given pilot_type + """ + pilot_type_to_prod_source_label_map = { + "RC": "rc_test2", + "ALRB": "rc_alrb", + "PT": "ptest", + } + return pilot_type_to_prod_source_label_map.get(pilot_type, None) + + +def prod_source_label_to_pilot_type(prod_source_label: str) -> str: + """ + Convert prodSourceLabel of PanDA job to pilotType of worker. + + Args: + prod_source_label (str): prodSourceLabel of PanDA job, e.g. "rc_test2", "rc_alrb", "ptest" + + Returns: + str: pilotType of worker, e.g. "RC", "ALRB", "PT"; default to "PR" (production) if no mapping is defined for the given prod_source_label + """ + prod_source_label_to_pilot_type_map = { + "rc_test2": "RC", + "rc_alrb": "ALRB", + "ptest": "PT", + } + return prod_source_label_to_pilot_type_map.get(prod_source_label, "PR") diff --git a/pandaharvester/harvestercore/db_proxy.py b/pandaharvester/harvestercore/db_proxy.py index 23fc4e70..4e6bc03c 100644 --- a/pandaharvester/harvestercore/db_proxy.py +++ b/pandaharvester/harvestercore/db_proxy.py @@ -1672,7 +1672,7 @@ def get_queues_to_submit(self, lookup_interval, lock_interval, locked_by, queue_ sql_delete_orphaned_worker = f"DELETE FROM {workTableName} WHERE workerID=:workerID " # sql to count nQueue - sql_count_workers = f"SELECT status, COUNT(*) cnt FROM {workTableName} WHERE computingSite=:computingSite " + sql_count_workers = f"SELECT pilotType, status, COUNT(*) cnt FROM {workTableName} WHERE computingSite=:computingSite " # sql to count re-fillers sql_count_refillers = ( @@ -1747,12 +1747,13 @@ def get_queues_to_submit(self, lookup_interval, lock_interval, locked_by, queue_ if resourceType != "ANY": varMap[":resourceType"] = resourceType sql_count_workers_tmp += "AND resourceType=:resourceType " - 
sql_count_workers_tmp += "GROUP BY status " + sql_count_workers_tmp += "GROUP BY pilotType, status " self.execute(sql_count_workers_tmp, varMap) - nQueue = 0 - nReady = 0 - nRunning = 0 - for workerStatus, tmpNum in self.cur.fetchall(): + + for pilotType, workerStatus, tmpNum in self.cur.fetchall(): + nQueue = 0 + nReady = 0 + nRunning = 0 if workerStatus in [WorkSpec.ST_submitted, WorkSpec.ST_pending, WorkSpec.ST_idle]: nQueue += tmpNum elif workerStatus in [WorkSpec.ST_ready]: @@ -1760,24 +1761,31 @@ def get_queues_to_submit(self, lookup_interval, lock_interval, locked_by, queue_ elif workerStatus in [WorkSpec.ST_running]: nRunning += tmpNum - # count nFillers - varMap = dict() - varMap[":computingSite"] = queueName - varMap[":status"] = WorkSpec.ST_running - sql_count_refillers_tmp = sql_count_refillers - if jobType != "ANY": - varMap[":jobType"] = jobType - sql_count_refillers_tmp += "AND jobType=:jobType " - if resourceType != "ANY": - varMap[":resourceType"] = resourceType - sql_count_refillers_tmp += "AND resourceType=:resourceType " - self.execute(sql_count_refillers_tmp, varMap) - (nReFill,) = self.cur.fetchone() - nReady += nReFill - - retMap.setdefault(queueName, {}) - retMap[queueName].setdefault(jobType, {}) - retMap[queueName][jobType][resourceType] = {"nReady": nReady, "nRunning": nRunning, "nQueue": nQueue, "nNewWorkers": nNewWorkers} + # count nFillers + varMap = dict() + varMap[":computingSite"] = queueName + varMap[":status"] = WorkSpec.ST_running + sql_count_refillers_tmp = sql_count_refillers + if jobType != "ANY": + varMap[":jobType"] = jobType + sql_count_refillers_tmp += "AND jobType=:jobType " + if resourceType != "ANY": + varMap[":resourceType"] = resourceType + sql_count_refillers_tmp += "AND resourceType=:resourceType " + self.execute(sql_count_refillers_tmp, varMap) + (nReFill,) = self.cur.fetchone() + nReady += nReFill + + retMap.setdefault(queueName, {}) + retMap[queueName].setdefault(jobType, {}) + 
retMap[queueName][jobType].setdefault(resourceType, {}) + retMap[queueName][jobType][resourceType][pilotType] = { + "nReady": nReady, + "nRunning": nRunning, + "nQueue": nQueue, + "nNewWorkers": nNewWorkers, + } + resourceMap.setdefault(jobType, {}) resourceMap[jobType][resourceType] = queueName From 86cc3ac34ccf6a93a3577ba14b5f612ee372a87f Mon Sep 17 00:00:00 2001 From: mightqxc Date: Thu, 26 Mar 2026 00:11:20 +0100 Subject: [PATCH 10/38] revert worker_adjuster --- pandaharvester/commit_timestamp.py | 2 +- .../harvesterbody/worker_adjuster.py | 611 ++++++------------ 2 files changed, 190 insertions(+), 423 deletions(-) diff --git a/pandaharvester/commit_timestamp.py b/pandaharvester/commit_timestamp.py index 37f8f10f..04a194f8 100644 --- a/pandaharvester/commit_timestamp.py +++ b/pandaharvester/commit_timestamp.py @@ -1 +1 @@ -timestamp = "25-03-2026 23:04:35 on flin (by mightqxc)" +timestamp = "25-03-2026 23:11:20 on flin (by mightqxc)" diff --git a/pandaharvester/harvesterbody/worker_adjuster.py b/pandaharvester/harvesterbody/worker_adjuster.py index aad7636a..94161e85 100644 --- a/pandaharvester/harvesterbody/worker_adjuster.py +++ b/pandaharvester/harvesterbody/worker_adjuster.py @@ -2,8 +2,6 @@ import math import traceback -import polars as pl - from pandaharvester.harvesterconfig import harvester_config from pandaharvester.harvestercore import core_utils from pandaharvester.harvestercore.db_proxy_pool import DBProxyPool as DBProxy @@ -148,167 +146,17 @@ def get_activate_worker_factor(self, site_name=None, job_type=None, resource_typ tmp_log.debug(f"ret_val={ret_val}") return ret_val - # convert nested dict structure to polars dataframe - def _dict_to_dataframe(self, static_num_workers: dict) -> pl.DataFrame: - """ - Convert nested dict structure to polars dataframe. 
- - Input dict structure: - {queue_name: {job_type: {resource_type: {tmp_pilot_type: {nQueue, nReady, nRunning, nNewWorkers}}}}} - - Output dataframe columns: - [job_type, resource_type, prod_source_label, nQueue, nReady, nRunning, nNewWorkers] - """ - rows = [] - for queue_name, queue_dict in static_num_workers.items(): - for job_type, rt_dict in queue_dict.items(): - for resource_type, pt_dict in rt_dict.items(): - # Handle case where the value is still the old structure (without pilotType) - # or new structure (with pilotType dimension) - if isinstance(pt_dict, dict): - # Check if this looks like stats dict (has nQueue, nReady, etc.) or pilot type dict - if "nQueue" in pt_dict or "nReady" in pt_dict or "nRunning" in pt_dict or "nNewWorkers" in pt_dict: - # Old structure - pt_dict contains the stats directly - rows.append( - { - "job_type": job_type, - "resource_type": resource_type, - "prod_source_label": "ANY", - "nQueue": pt_dict.get("nQueue", 0), - "nReady": pt_dict.get("nReady", 0), - "nRunning": pt_dict.get("nRunning", 0), - "nNewWorkers": pt_dict.get("nNewWorkers", 0), - } - ) - else: - # New structure - pt_dict contains pilot types as keys - for tmp_pilot_type, stats in pt_dict.items(): - rows.append( - { - "job_type": job_type, - "resource_type": resource_type, - "prod_source_label": core_utils.special_pilot_type_to_prod_source_label(tmp_pilot_type) or "ANY", - "nQueue": stats.get("nQueue", 0), - "nReady": stats.get("nReady", 0), - "nRunning": stats.get("nRunning", 0), - "nNewWorkers": stats.get("nNewWorkers", 0), - } - ) - - if not rows: - # Return empty dataframe with correct schema - return pl.DataFrame( - { - "job_type": [], - "resource_type": [], - "prod_source_label": [], - "nQueue": [], - "nReady": [], - "nRunning": [], - "nNewWorkers": [], - } - ) - - return pl.DataFrame(rows) - - # convert polars dataframe back to nested dict structure - def _dataframe_to_dict(self, df: pl.DataFrame, queue_name: str) -> dict: - """ - Convert polars dataframe 
back to nested dict structure. - - Input dataframe columns: - [job_type, resource_type, prod_source_label, nQueue, nReady, nRunning, nNewWorkers] - - Output dict structure: - {queue_name: {job_type: {resource_type: {prod_source_label: {nQueue, nReady, nRunning, nNewWorkers}}}}} - """ - result = {queue_name: {}} - - for row in df.iter_rows(named=True): - job_type = row["job_type"] - prod_source_label = row["prod_source_label"] - resource_type = row["resource_type"] - - # Initialize nested dicts as needed - if job_type not in result[queue_name]: - result[queue_name][job_type] = {} - if resource_type not in result[queue_name][job_type]: - result[queue_name][job_type][resource_type] = {} - if prod_source_label not in result[queue_name][job_type][resource_type]: - result[queue_name][job_type][resource_type][prod_source_label] = {} - - # Store the stats for this prod_source_label - result[queue_name][job_type][resource_type][prod_source_label] = { - "nQueue": row["nQueue"], - "nReady": row["nReady"], - "nRunning": row["nRunning"], - "nNewWorkers": row["nNewWorkers"], - } - - return result - - # convert job statistics dict to polars dataframe for better accessibility - def _job_stats_to_dataframe(self, job_stats: dict | None = None) -> pl.DataFrame: - """ - Convert job statistics dict to polars dataframe. 
- - Input dict structure: - {computing_site: {resource_type: {prod_source_label: {job_status: n_jobs}}}} - - Output dataframe columns: - [computing_site, resource_type, prod_source_label, job_status, n_jobs] - """ - rows = [] - if job_stats is None: - return pl.DataFrame( - { - "computing_site": [], - "resource_type": [], - "prod_source_label": [], - "job_status": [], - "n_jobs": [], - } - ) - - for computing_site, rt_dict in job_stats.items(): - for resource_type, psl_dict in rt_dict.items(): - for prod_source_label, job_status_dict in psl_dict.items(): - for job_status, n_jobs in job_status_dict.items(): - rows.append( - { - "computing_site": computing_site, - "resource_type": resource_type, - "prod_source_label": prod_source_label, - "job_status": job_status, - "n_jobs": n_jobs, - } - ) - - if not rows: - # Return empty dataframe with correct schema - return pl.DataFrame( - { - "computing_site": [], - "resource_type": [], - "prod_source_label": [], - "job_status": [], - "n_jobs": [], - } - ) - - return pl.DataFrame(rows) - # define number of workers to submit based on various information def define_num_workers(self, static_num_workers, site_name) -> dict | None: """ Define number of workers to submit based on various information, including static site config, queue status, job statistics, and throttler if defined. The function also updates APF monitoring with the decision and the reason. Args: - static_num_workers (dict): A dict of the form {queue_name: {job_type: {resource_type: {tmp_pilot_type: {"nQueue": int, "nReady": int, "nRunning": int, "nNewWorkers": int}}}}} defining the static number of workers to submit for each queue, job type, resource type and pilot type. + static_num_workers (dict): A dict of the form {queue_name: {job_type: {resource_type: {"nQueue": int, "nReady": int, "nRunning": int, "nNewWorkers": int}}}} defining the static number of workers to submit for each queue, job type and resource type. 
site_name (str): The name of the site for which to define the number of workers. Returns: - (dict|None): A dict of the form {queue_name: {job_type: {resource_type: {prod_source_label: {"nQueue": int, "nReady": int, "nRunning": int, "nNewWorkers": int}}}}} with the defined number of new workers to submit in the "nNewWorkers" field, or None if an error occurred. + (dict|None): The updated static_num_workers dict with the defined number of new workers to submit in the "nNewWorkers" field, or None if an error occurred. """ tmp_log = core_utils.make_logger(_logger, f"site={site_name}", method_name="define_num_workers") tmp_log.debug("start") @@ -349,7 +197,6 @@ def _normalize_job_type_any(queue_dict): _normalize_job_type_any(queue_dict) dyn_num_workers = copy.deepcopy(static_num_workers) - try: # get queue status queue_stat = self.dbProxy.get_cache("panda_queues.json", None) @@ -363,39 +210,14 @@ def _normalize_job_type_any(queue_dict): if job_stats is not None: job_stats = job_stats.data - df_job_stats_new = self._job_stats_to_dataframe(None) - job_stats_new = self.dbProxy.get_cache("job_statistics_new.json", None) - if job_stats_new is not None: - # Convert to dataframe for better accessibility - df_job_stats_new = self._job_stats_to_dataframe(job_stats_new.data) - # get panda queues dict from CRIC panda_queues_dict = PandaQueuesDict() # get resource type mapper rt_mapper = ResourceTypeMapper() - # Track results for all queues - result_dict = {} - - # define a priority list for prod_source_label to ensure consistent ordering in the dataframe and processing. 
- prioritized_pslabels = ["rc_alrb"] - - # define num of new workers - process by queue + # define num of new workers for queue_name in static_num_workers: - # Work with nested dict instead of dataframe for updates - queue_dict_updates = copy.deepcopy(static_num_workers[queue_name]) - - # # fill in prioritized prod_source_labels according to activated jobs stats of this queue - # tmp_df_queue = df_queue - # for prod_source_label in prioritized_pslabels: - # # number of activated jobs with this prod_source_label - # n_activated_pslabel = df_job_stats_new.filter(...) - # tmp_df_queue = ... - - # # You can add sorting here if needed, e.g.: - # # df_queue = df_queue.sort(by=["job_type", "resource_type", "prod_source_label"]) - # get queue queue_config = self.queue_configMapper.get_queue(queue_name) worker_limits_dict = {} @@ -416,171 +238,156 @@ def _normalize_job_type_any(queue_dict): apf_msg = None apf_data = None job_type = DEFAULT_JOB_TYPE + for resource_type, tmp_val in static_num_workers[queue_name][job_type].items(): + tmp_log.debug(f"Processing queue {queue_name} job_type {job_type} resource_type {resource_type} with static_num_workers {tmp_val}") + + # get cores and memory request per worker of this resource_type + queue_dict = panda_queues_dict.get(queue_name, {}) + rtype_request_cores, rtype_request_memory = rt_mapper.calculate_worker_requirements(resource_type, queue_dict) + + # set 0 to num of new workers when the queue is disabled + if queue_name in queue_stat and queue_stat[queue_name]["status"] in ["offline", "standby", "maintenance"]: + dyn_num_workers[queue_name][job_type][resource_type]["nNewWorkers"] = 0 + ret_msg = f"set n_new_workers=0 since status={queue_stat[queue_name]['status']}" + tmp_log.debug(ret_msg) + apf_msg = f"Not submitting workers since queue status = {queue_stat[queue_name]['status']}" + continue - # Iterate through dict to process each row - for job_type_key, rt_dict in queue_dict_updates.items(): - for resource_type, psl_dict in 
rt_dict.items(): - for prod_source_label, stats in psl_dict.items(): - job_type = job_type_key - n_queue = stats["nQueue"] - n_ready = stats["nReady"] - n_running = stats["nRunning"] - - tmp_log.debug( - f"Processing queue={queue_name} job_type={job_type} resource_type={resource_type} " - f"prod_source_label={prod_source_label} with static_num_workers " - f"nQueue={n_queue} nReady={n_ready} nRunning={n_running}" - ) - - # get cores and memory request per worker of this resource_type - queue_dict = panda_queues_dict.get(queue_name, {}) - rtype_request_cores, rtype_request_memory = rt_mapper.calculate_worker_requirements(resource_type, queue_dict) - - # set 0 to num of new workers when the queue is disabled - if queue_name in queue_stat and queue_stat[queue_name]["status"] in ["offline", "standby", "maintenance"]: - stats["nNewWorkers"] = 0 - ret_msg = f"set n_new_workers=0 since status={queue_stat[queue_name]['status']}" - tmp_log.debug(ret_msg) - apf_msg = f"Not submitting workers since queue status = {queue_stat[queue_name]['status']}" - continue - - # protection against not-up-to-date queue config - if queue_config is None: - stats["nNewWorkers"] = 0 - ret_msg = "set n_new_workers=0 due to missing queue_config" - tmp_log.debug(ret_msg) - apf_msg = "Not submitting workers because of missing queue_config" - continue - - # get throttler - if queue_name not in self.throttlerMap: - if hasattr(queue_config, "throttler"): - throttler = self.pluginFactory.get_plugin(queue_config.throttler) - else: - throttler = None - self.throttlerMap[queue_name] = throttler - - # check throttler - throttler = self.throttlerMap[queue_name] - if throttler is not None: - to_throttle, tmp_msg = throttler.to_be_throttled(queue_config, queue_config_mapper=self.queue_configMapper) - if to_throttle: - stats["nNewWorkers"] = 0 - ret_msg = f"set n_new_workers=0 by {throttler.__class__.__name__}:{tmp_msg}" - tmp_log.debug(ret_msg) - continue + # protection against not-up-to-date queue config + if 
queue_config is None: + dyn_num_workers[queue_name][job_type][resource_type]["nNewWorkers"] = 0 + ret_msg = "set n_new_workers=0 due to missing queue_config" + tmp_log.debug(ret_msg) + apf_msg = "Not submitting workers because of missing queue_config" + continue - # check stats - if resource_type != "ANY" and job_type != "ANY" and job_type is not None: - n_queue_total += n_queue - n_ready_total += n_ready - n_running_total += n_running - - if queue_config.runMode == "slave": - n_new_workers_def = stats["nNewWorkers"] - if n_new_workers_def == 0: - stats["nNewWorkers"] = 0 - ret_msg = "set n_new_workers=0 by panda in slave mode" - tmp_log.debug(ret_msg) - continue - else: - n_new_workers_def = None - - # define num of new workers based on static site config - n_new_workers = 0 - if n_queue >= n_queue_limit_per_rt > 0: - # enough queued workers - ret_msg = f"No n_new_workers since n_queue({n_queue})>=n_queue_limit_per_rt({n_queue_limit_per_rt})" - tmp_log.debug(ret_msg) - pass - elif (n_queue + n_ready + n_running) >= max_workers > 0: - # enough workers in the system - ret_msg = ( - f"No n_new_workers since n_queue({n_queue}) + n_ready({n_ready}) + n_running({n_running}) >= max_workers({max_workers})" - ) - tmp_log.debug(ret_msg) - pass - elif queue_limit_cores is not None and cores_queue >= queue_limit_cores: - # enough queuing cores - ret_msg = f"No n_new_workers since cores_queue({cores_queue}) >= queue_limit_cores({queue_limit_cores})" - tmp_log.debug(ret_msg) - pass - elif queue_limit_memory is not None and memory_queue >= queue_limit_memory: - # enough queuing memory - ret_msg = f"No n_new_workers since memory_queue({memory_queue} MB) >= queue_limit_memory({queue_limit_memory} MB)" - tmp_log.debug(ret_msg) - pass - else: - max_queued_workers = None + # get throttler + if queue_name not in self.throttlerMap: + if hasattr(queue_config, "throttler"): + throttler = self.pluginFactory.get_plugin(queue_config.throttler) + else: + throttler = None + 
self.throttlerMap[queue_name] = throttler + + # check throttler + throttler = self.throttlerMap[queue_name] + if throttler is not None: + to_throttle, tmp_msg = throttler.to_be_throttled(queue_config, queue_config_mapper=self.queue_configMapper) + if to_throttle: + dyn_num_workers[queue_name][job_type][resource_type]["nNewWorkers"] = 0 + ret_msg = f"set n_new_workers=0 by {throttler.__class__.__name__}:{tmp_msg}" + tmp_log.debug(ret_msg) + continue + + # check stats + n_queue = tmp_val["nQueue"] + n_ready = tmp_val["nReady"] + n_running = tmp_val["nRunning"] + if resource_type != "ANY" and job_type != "ANY" and job_type is not None: + n_queue_total += n_queue + n_ready_total += n_ready + n_running_total += n_running + if queue_config.runMode == "slave": + n_new_workers_def = tmp_val["nNewWorkers"] + if n_new_workers_def == 0: + dyn_num_workers[queue_name][job_type][resource_type]["nNewWorkers"] = 0 + ret_msg = "set n_new_workers=0 by panda in slave mode" + tmp_log.debug(ret_msg) + continue + else: + n_new_workers_def = None + + # define num of new workers based on static site config + n_new_workers = 0 + if n_queue >= n_queue_limit_per_rt > 0: + # enough queued workers + ret_msg = f"No n_new_workers since n_queue({n_queue})>=n_queue_limit_per_rt({n_queue_limit_per_rt})" + tmp_log.debug(ret_msg) + pass + elif (n_queue + n_ready + n_running) >= max_workers > 0: + # enough workers in the system + ret_msg = f"No n_new_workers since n_queue({n_queue}) + n_ready({n_ready}) + n_running({n_running}) " f">= max_workers({max_workers})" + tmp_log.debug(ret_msg) + pass + elif queue_limit_cores is not None and cores_queue >= queue_limit_cores: + # enough queuing cores + ret_msg = f"No n_new_workers since cores_queue({cores_queue}) >= " f"queue_limit_cores({queue_limit_cores})" + tmp_log.debug(ret_msg) + pass + elif queue_limit_memory is not None and memory_queue >= queue_limit_memory: + # enough queuing cores + ret_msg = f"No n_new_workers since memory_queue({memory_queue} MB) 
>= " f"queue_limit_memory({queue_limit_memory} MB)" + tmp_log.debug(ret_msg) + pass + else: + max_queued_workers = None - if n_queue_limit_per_rt > 0: # there is a limit set for the queue - max_queued_workers = n_queue_limit_per_rt + if n_queue_limit_per_rt > 0: # there is a limit set for the queue + max_queued_workers = n_queue_limit_per_rt - # Reset the maxQueueWorkers according to particular - if n_new_workers_def is not None: # don't surpass limits given centrally + # Reset the maxQueueWorkers according to particular + if n_new_workers_def is not None: # don't surpass limits given centrally - maxQueuedWorkers_slave = n_new_workers_def + n_queue - if max_queued_workers is not None: - max_queued_workers = min(maxQueuedWorkers_slave, max_queued_workers) - else: - max_queued_workers = maxQueuedWorkers_slave + maxQueuedWorkers_slave = n_new_workers_def + n_queue + if max_queued_workers is not None: + max_queued_workers = min(maxQueuedWorkers_slave, max_queued_workers) + else: + max_queued_workers = maxQueuedWorkers_slave - elif queue_config.mapType == "NoJob": # for pull mode, limit to activated jobs - if job_stats is None: - tmp_log.warning("n_activated not defined, defaulting to configured queue limits") - pass + elif queue_config.mapType == "NoJob": # for pull mode, limit to activated jobs + if job_stats is None: + tmp_log.warning("n_activated not defined, defaulting to configured queue limits") + pass + else: + # limit the queue to the number of activated jobs to avoid empty pilots + try: + n_min_pilots = 1 + if self.get_queue_no_pilots_when_no_active_jobs(queue_name): + n_min_pilots = 0 + + tmp_n_activated_jobs = job_stats[queue_name]["activated"] + tmp_log.debug(f"available activated panda jobs {tmp_n_activated_jobs}") + + activate_worker_factor = self.get_activate_worker_factor(queue_name, job_type, resource_type, queue_dict, queue_config) + if tmp_n_activated_jobs * activate_worker_factor > 0: + n_min_pilots = 1 + n_activated = max(int(tmp_n_activated_jobs * 
activate_worker_factor), n_min_pilots) # avoid no activity queues + except KeyError: + # zero job in the queue + tmp_log.debug("no job in queue") + if self.get_queue_no_pilots_when_no_active_jobs(queue_name): + n_activated = 0 else: - # limit the queue to the number of activated jobs to avoid empty pilots - try: - n_min_pilots = 1 - if self.get_queue_no_pilots_when_no_active_jobs(queue_name): - n_min_pilots = 0 - - tmp_n_activated_jobs = job_stats[queue_name]["activated"] - tmp_log.debug(f"available activated panda jobs {tmp_n_activated_jobs}") - - activate_worker_factor = self.get_activate_worker_factor( - queue_name, job_type, resource_type, queue_dict, queue_config - ) - if tmp_n_activated_jobs * activate_worker_factor > 0: - n_min_pilots = 1 - n_activated = max(int(tmp_n_activated_jobs * activate_worker_factor), n_min_pilots) # avoid no activity queues - except KeyError: - # zero job in the queue - tmp_log.debug("no job in queue") - if self.get_queue_no_pilots_when_no_active_jobs(queue_name): - n_activated = 0 - else: - n_activated = max(1 - n_queue - n_ready - n_running, 0) - finally: - queue_limit = max_queued_workers - max_queued_workers = min(n_activated, max_queued_workers) - tmp_log.debug(f"limiting max_queued_workers to min(n_activated={n_activated}, queue_limit={queue_limit})") - - if max_queued_workers is None: # no value found, use default value - max_queued_workers = 1 - - # new workers - n_new_workers = max(max_queued_workers - n_queue, 0) - tmp_log.debug(f"setting n_new_workers to {n_new_workers} in max_queued_workers calculation") - if max_workers > 0: - n_new_workers = min(n_new_workers, max(max_workers - n_queue - n_ready - n_running, 0)) - tmp_log.debug(f"setting n_new_workers to {n_new_workers} to respect max_workers") - if queue_limit_cores: - new_worker_cores_max = max(queue_limit_cores - cores_queue, 0) - n_new_workers = min(n_new_workers, math.ceil(new_worker_cores_max / rtype_request_cores)) - tmp_log.debug(f"setting n_new_workers to 
{n_new_workers} to respect queue_limit_cores") - if queue_limit_memory: - new_worker_memory_max = max(queue_limit_memory - memory_queue, 0) - n_new_workers = min(n_new_workers, math.ceil(new_worker_memory_max / rtype_request_memory)) - tmp_log.debug(f"setting n_new_workers to {n_new_workers} to respect queue_limit_memory") - if queue_config.maxNewWorkersPerCycle > 0: - n_new_workers = min(n_new_workers, queue_config.maxNewWorkersPerCycle) - tmp_log.debug(f"setting n_new_workers to {n_new_workers} in order to respect maxNewWorkersPerCycle") - if self.maxNewWorkers is not None and self.maxNewWorkers > 0: - n_new_workers = min(n_new_workers, self.maxNewWorkers) - tmp_log.debug(f"setting n_new_workers to {n_new_workers} in order to respect universal maxNewWorkers") - stats["nNewWorkers"] = n_new_workers + n_activated = max(1 - n_queue - n_ready - n_running, 0) + finally: + queue_limit = max_queued_workers + max_queued_workers = min(n_activated, max_queued_workers) + tmp_log.debug(f"limiting max_queued_workers to min(n_activated={n_activated}, queue_limit={queue_limit})") + + if max_queued_workers is None: # no value found, use default value + max_queued_workers = 1 + + # new workers + n_new_workers = max(max_queued_workers - n_queue, 0) + tmp_log.debug(f"setting n_new_workers to {n_new_workers} in max_queued_workers calculation") + if max_workers > 0: + n_new_workers = min(n_new_workers, max(max_workers - n_queue - n_ready - n_running, 0)) + tmp_log.debug(f"setting n_new_workers to {n_new_workers} to respect max_workers") + if queue_limit_cores: + new_worker_cores_max = max(queue_limit_cores - cores_queue, 0) + n_new_workers = min(n_new_workers, math.ceil(new_worker_cores_max / rtype_request_cores)) + tmp_log.debug(f"setting n_new_workers to {n_new_workers} to respect queue_limit_cores") + if queue_limit_memory: + new_worker_memory_max = max(queue_limit_memory - memory_queue, 0) + n_new_workers = min(n_new_workers, math.ceil(new_worker_memory_max / 
rtype_request_memory)) + tmp_log.debug(f"setting n_new_workers to {n_new_workers} to respect queue_limit_memory") + if queue_config.maxNewWorkersPerCycle > 0: + n_new_workers = min(n_new_workers, queue_config.maxNewWorkersPerCycle) + tmp_log.debug(f"setting n_new_workers to {n_new_workers} in order to respect maxNewWorkersPerCycle") + if self.maxNewWorkers is not None and self.maxNewWorkers > 0: + n_new_workers = min(n_new_workers, self.maxNewWorkers) + tmp_log.debug(f"setting n_new_workers to {n_new_workers} in order to respect universal maxNewWorkers") + dyn_num_workers[queue_name][job_type][resource_type]["nNewWorkers"] = n_new_workers # adjust n_new_workers for UCORE to let aggregations over RT respect nQueueLimitWorker and max_workers if queue_config is None: @@ -589,16 +396,12 @@ def _normalize_job_type_any(queue_dict): tmp_log.debug(ret_msg) else: max_new_workers_per_cycle = queue_config.maxNewWorkersPerCycle - - # Convert updated dict to dataframe for aggregation operations - df_queue = self._dict_to_dataframe({queue_name: queue_dict_updates}) - - # Check if we have multiple job types for this queue - unique_job_types = df_queue["job_type"].unique() - - if len(unique_job_types) > 1: - total_new_workers_rts = df_queue.filter((pl.col("job_type") != "ANY") & (pl.col("resource_type") != "ANY"))["nNewWorkers"].sum() - + if len(dyn_num_workers[queue_name]) > 1: + total_new_workers_rts = 0 + for _jt in dyn_num_workers[queue_name]: + for _rt in dyn_num_workers[queue_name][_jt]: + if _jt != "ANY" and _rt != "ANY": + total_new_workers_rts = total_new_workers_rts + dyn_num_workers[queue_name][_jt][_rt]["nNewWorkers"] n_new_workers_max_agg = min(max(n_queue_limit - n_queue_total, 0), max(max_workers - n_queue_total - n_ready_total - n_running_total, 0)) if max_new_workers_per_cycle >= 0: n_new_workers_max_agg = min(n_new_workers_max_agg, max_new_workers_per_cycle) @@ -608,88 +411,52 @@ def _normalize_job_type_any(queue_dict): # exceeded max, to adjust if 
total_new_workers_rts > n_new_workers_max_agg: if n_new_workers_max_agg == 0: - df_queue = df_queue.with_columns(pl.when(True).then(0).otherwise(pl.col("nNewWorkers")).alias("nNewWorkers")) + for job_type in dyn_num_workers[queue_name]: + for resource_type in dyn_num_workers[queue_name][job_type]: + dyn_num_workers[queue_name][job_type][resource_type]["nNewWorkers"] = 0 tmp_log.debug("No n_new_workers since n_new_workers_max_agg=0 for UCORE") else: tmp_log.debug(f"n_new_workers_max_agg={n_new_workers_max_agg} for UCORE") - - # Build a list of (resource_type, job_type, nNewWorkers, original_index) for redistribution - df_to_adjust = df_queue.filter((pl.col("job_type") != "ANY") & (pl.col("resource_type") != "ANY")) - - if len(df_to_adjust) > 0: - # Calculate distribution factors - simple_rt_nw_list = [] - for row_data in df_to_adjust.iter_rows(named=True): - n_new_workers_orig = row_data["nNewWorkers"] - simple_rt_nw_list.append([(row_data["resource_type"], row_data["job_type"]), n_new_workers_orig, 0]) # remainder - - # Distribute workers proportionally - _countdown = n_new_workers_max_agg - for _rt_list in simple_rt_nw_list: - (resource_type, job_type), n_new_workers_orig, _r = _rt_list - if total_new_workers_rts > 0: - n_new_workers, remainder = divmod(n_new_workers_orig * n_new_workers_max_agg, total_new_workers_rts) - else: - n_new_workers, remainder = 0, 0 - - # Update the dataframe - df_queue = df_queue.with_columns( - pl.when( - (pl.col("resource_type") == resource_type) - & (pl.col("job_type") == job_type) - & (pl.col("job_type") != "ANY") - & (pl.col("resource_type") != "ANY") - ) - .then(n_new_workers) - .otherwise(pl.col("nNewWorkers")) - .alias("nNewWorkers") - ) - _rt_list[2] = remainder - _countdown -= n_new_workers - - # Distribute remaining workers by remainder - _s_list = sorted(simple_rt_nw_list, key=(lambda x: x[1])) - sorted_rt_nw_list = sorted(_s_list, key=(lambda x: x[2]), reverse=True) - for (resource_type, job_type), n_new_workers_orig, 
remainder in sorted_rt_nw_list: - if _countdown <= 0: - break - df_queue = df_queue.with_columns( - pl.when( - (pl.col("resource_type") == resource_type) - & (pl.col("job_type") == job_type) - & (pl.col("job_type") != "ANY") - & (pl.col("resource_type") != "ANY") - ) - .then(pl.col("nNewWorkers") + 1) - .otherwise(pl.col("nNewWorkers")) - .alias("nNewWorkers") + _d = dyn_num_workers[queue_name].copy() + del _d["ANY"] + + # TODO: needs to be recalculated + simple_rt_nw_list = [] + for job_type in _d: # jt: job type + for resource_type in _d[job_type]: # rt: resource type + simple_rt_nw_list.append([(resource_type, job_type), _d[job_type][resource_type].get("nNewWorkers", 0), 0]) + + _countdown = n_new_workers_max_agg + for _rt_list in simple_rt_nw_list: + (resource_type, job_type), n_new_workers_orig, _r = _rt_list + n_new_workers, remainder = divmod(n_new_workers_orig * n_new_workers_max_agg, total_new_workers_rts) + dyn_num_workers[queue_name][job_type].setdefault(resource_type, {"nReady": 0, "nRunning": 0, "nQueue": 0, "nNewWorkers": 0}) + dyn_num_workers[queue_name][job_type][resource_type]["nNewWorkers"] = n_new_workers + _rt_list[2] = remainder + _countdown -= n_new_workers + _s_list = sorted(simple_rt_nw_list, key=(lambda x: x[1])) + sorted_rt_nw_list = sorted(_s_list, key=(lambda x: x[2]), reverse=True) + for (resource_type, job_type), n_new_workers_orig, remainder in sorted_rt_nw_list: + if _countdown <= 0: + break + dyn_num_workers[queue_name][job_type][resource_type]["nNewWorkers"] += 1 + _countdown -= 1 + for job_type in dyn_num_workers[queue_name]: + for resource_type in dyn_num_workers[queue_name][job_type]: + if job_type == "ANY" or resource_type == "ANY": + continue + n_new_workers = dyn_num_workers[queue_name][job_type][resource_type]["nNewWorkers"] + tmp_log.debug( + "setting n_new_workers to {0} of job_type {1} resource_type {2} in order to respect RT aggregations for UCORE".format( + n_new_workers, job_type, resource_type ) - _countdown -= 1 - - 
# Log adjustments - df_queue_final = df_queue.filter((pl.col("job_type") != "ANY") & (pl.col("resource_type") != "ANY")) - for row_data in df_queue_final.iter_rows(named=True): - n_new_workers = row_data["nNewWorkers"] - tmp_log.debug( - f"setting n_new_workers to {n_new_workers} of job_type {row_data['job_type']} " - f"resource_type {row_data['resource_type']} prod_source_label {row_data['prod_source_label']} " - f"in order to respect RT aggregations for UCORE" - ) + ) if not apf_msg: - # Convert current queue back to dict format for APF monitoring - dict_queue_apf = self._dataframe_to_dict(df_queue, queue_name) - apf_data = dict_queue_apf.get(queue_name, {}) + apf_data = copy.deepcopy(dyn_num_workers[queue_name]) self.apf_mon.update_label(queue_name, apf_msg, apf_data) - # Store the updated queue version - dict_result = self._dataframe_to_dict(df_queue, queue_name) - result_dict.update(dict_result) - - # Return the final result - dyn_num_workers = result_dict - # dump tmp_log.debug(f"defined {str(dyn_num_workers)}") return dyn_num_workers From c326f7291ca5487b580da106f6c627ff25cd5919 Mon Sep 17 00:00:00 2001 From: mightqxc Date: Thu, 26 Mar 2026 00:38:44 +0100 Subject: [PATCH 11/38] worker_adjuster and submitter: add pilot_type and prod_source_label --- pandaharvester/commit_timestamp.py | 2 +- pandaharvester/harvesterbody/submitter.py | 6 +- .../harvesterbody/worker_adjuster.py | 322 +++++++++--------- 3 files changed, 172 insertions(+), 158 deletions(-) diff --git a/pandaharvester/commit_timestamp.py b/pandaharvester/commit_timestamp.py index 04a194f8..0cb76980 100644 --- a/pandaharvester/commit_timestamp.py +++ b/pandaharvester/commit_timestamp.py @@ -1 +1 @@ -timestamp = "25-03-2026 23:11:20 on flin (by mightqxc)" +timestamp = "25-03-2026 23:38:44 on flin (by mightqxc)" diff --git a/pandaharvester/harvesterbody/submitter.py b/pandaharvester/harvesterbody/submitter.py index 5554c753..0aaf4059 100644 --- a/pandaharvester/harvesterbody/submitter.py +++ 
b/pandaharvester/harvesterbody/submitter.py @@ -90,8 +90,10 @@ def run(self): queue_config = self.queue_configMapper.get_queue(queue_name) workerMakerCore = self.workerMaker.get_plugin(queue_config) for resource_type in n_workers_per_queue_jt_rt[queue_name][job_type]: - for prod_source_label in n_workers_per_queue_jt_rt[queue_name][job_type][resource_type]: - tmp_val = n_workers_per_queue_jt_rt[queue_name][job_type][resource_type][prod_source_label] + for pilot_type in n_workers_per_queue_jt_rt[queue_name][job_type][resource_type]: + tmp_val = n_workers_per_queue_jt_rt[queue_name][job_type][resource_type][pilot_type] + # get prod_source_label from pilot_type for worker maker + prod_source_label = core_utils.special_pilot_type_to_prod_source_label(pilot_type) tmp_log = self.make_logger( _logger, f"id={locked_by} queue={queue_name} jtype={job_type} rtype={resource_type} pslabel={prod_source_label}", diff --git a/pandaharvester/harvesterbody/worker_adjuster.py b/pandaharvester/harvesterbody/worker_adjuster.py index 94161e85..f6a0ba90 100644 --- a/pandaharvester/harvesterbody/worker_adjuster.py +++ b/pandaharvester/harvesterbody/worker_adjuster.py @@ -152,7 +152,7 @@ def define_num_workers(self, static_num_workers, site_name) -> dict | None: Define number of workers to submit based on various information, including static site config, queue status, job statistics, and throttler if defined. The function also updates APF monitoring with the decision and the reason. Args: - static_num_workers (dict): A dict of the form {queue_name: {job_type: {resource_type: {"nQueue": int, "nReady": int, "nRunning": int, "nNewWorkers": int}}}} defining the static number of workers to submit for each queue, job type and resource type. 
+ static_num_workers (dict): A dict of the form {queue_name: {job_type: {resource_type: {pilot_type: {"nQueue": int, "nReady": int, "nRunning": int, "nNewWorkers": int}}}}} defining the static number of workers to submit for each queue, job type, resource type and pilot type. site_name (str): The name of the site for which to define the number of workers. Returns: @@ -238,156 +238,161 @@ def _normalize_job_type_any(queue_dict): apf_msg = None apf_data = None job_type = DEFAULT_JOB_TYPE - for resource_type, tmp_val in static_num_workers[queue_name][job_type].items(): - tmp_log.debug(f"Processing queue {queue_name} job_type {job_type} resource_type {resource_type} with static_num_workers {tmp_val}") - - # get cores and memory request per worker of this resource_type - queue_dict = panda_queues_dict.get(queue_name, {}) - rtype_request_cores, rtype_request_memory = rt_mapper.calculate_worker_requirements(resource_type, queue_dict) - - # set 0 to num of new workers when the queue is disabled - if queue_name in queue_stat and queue_stat[queue_name]["status"] in ["offline", "standby", "maintenance"]: - dyn_num_workers[queue_name][job_type][resource_type]["nNewWorkers"] = 0 - ret_msg = f"set n_new_workers=0 since status={queue_stat[queue_name]['status']}" - tmp_log.debug(ret_msg) - apf_msg = f"Not submitting workers since queue status = {queue_stat[queue_name]['status']}" - continue - - # protection against not-up-to-date queue config - if queue_config is None: - dyn_num_workers[queue_name][job_type][resource_type]["nNewWorkers"] = 0 - ret_msg = "set n_new_workers=0 due to missing queue_config" - tmp_log.debug(ret_msg) - apf_msg = "Not submitting workers because of missing queue_config" - continue - - # get throttler - if queue_name not in self.throttlerMap: - if hasattr(queue_config, "throttler"): - throttler = self.pluginFactory.get_plugin(queue_config.throttler) - else: - throttler = None - self.throttlerMap[queue_name] = throttler - - # check throttler - throttler = 
self.throttlerMap[queue_name] - if throttler is not None: - to_throttle, tmp_msg = throttler.to_be_throttled(queue_config, queue_config_mapper=self.queue_configMapper) - if to_throttle: - dyn_num_workers[queue_name][job_type][resource_type]["nNewWorkers"] = 0 - ret_msg = f"set n_new_workers=0 by {throttler.__class__.__name__}:{tmp_msg}" + for resource_type, pilot_type_dict in static_num_workers[queue_name][job_type].items(): + for pilot_type, tmp_val in pilot_type_dict.items(): + tmp_log.debug( + f"Processing queue={queue_name} job_type={job_type} resource_type={resource_type} pilot_type={pilot_type} with static_num_workers={tmp_val}" + ) + + # get cores and memory request per worker of this resource_type + queue_dict = panda_queues_dict.get(queue_name, {}) + rtype_request_cores, rtype_request_memory = rt_mapper.calculate_worker_requirements(resource_type, queue_dict) + + # set 0 to num of new workers when the queue is disabled + if queue_name in queue_stat and queue_stat[queue_name]["status"] in ["offline", "standby", "maintenance"]: + dyn_num_workers[queue_name][job_type][resource_type][pilot_type]["nNewWorkers"] = 0 + ret_msg = f"set n_new_workers=0 since status={queue_stat[queue_name]['status']}" tmp_log.debug(ret_msg) + apf_msg = f"Not submitting workers since queue status = {queue_stat[queue_name]['status']}" continue - # check stats - n_queue = tmp_val["nQueue"] - n_ready = tmp_val["nReady"] - n_running = tmp_val["nRunning"] - if resource_type != "ANY" and job_type != "ANY" and job_type is not None: - n_queue_total += n_queue - n_ready_total += n_ready - n_running_total += n_running - if queue_config.runMode == "slave": - n_new_workers_def = tmp_val["nNewWorkers"] - if n_new_workers_def == 0: - dyn_num_workers[queue_name][job_type][resource_type]["nNewWorkers"] = 0 - ret_msg = "set n_new_workers=0 by panda in slave mode" + # protection against not-up-to-date queue config + if queue_config is None: + 
dyn_num_workers[queue_name][job_type][resource_type][pilot_type]["nNewWorkers"] = 0 + ret_msg = "set n_new_workers=0 due to missing queue_config" tmp_log.debug(ret_msg) + apf_msg = "Not submitting workers because of missing queue_config" continue - else: - n_new_workers_def = None - - # define num of new workers based on static site config - n_new_workers = 0 - if n_queue >= n_queue_limit_per_rt > 0: - # enough queued workers - ret_msg = f"No n_new_workers since n_queue({n_queue})>=n_queue_limit_per_rt({n_queue_limit_per_rt})" - tmp_log.debug(ret_msg) - pass - elif (n_queue + n_ready + n_running) >= max_workers > 0: - # enough workers in the system - ret_msg = f"No n_new_workers since n_queue({n_queue}) + n_ready({n_ready}) + n_running({n_running}) " f">= max_workers({max_workers})" - tmp_log.debug(ret_msg) - pass - elif queue_limit_cores is not None and cores_queue >= queue_limit_cores: - # enough queuing cores - ret_msg = f"No n_new_workers since cores_queue({cores_queue}) >= " f"queue_limit_cores({queue_limit_cores})" - tmp_log.debug(ret_msg) - pass - elif queue_limit_memory is not None and memory_queue >= queue_limit_memory: - # enough queuing cores - ret_msg = f"No n_new_workers since memory_queue({memory_queue} MB) >= " f"queue_limit_memory({queue_limit_memory} MB)" - tmp_log.debug(ret_msg) - pass - else: - max_queued_workers = None - if n_queue_limit_per_rt > 0: # there is a limit set for the queue - max_queued_workers = n_queue_limit_per_rt - - # Reset the maxQueueWorkers according to particular - if n_new_workers_def is not None: # don't surpass limits given centrally - - maxQueuedWorkers_slave = n_new_workers_def + n_queue - if max_queued_workers is not None: - max_queued_workers = min(maxQueuedWorkers_slave, max_queued_workers) + # get throttler + if queue_name not in self.throttlerMap: + if hasattr(queue_config, "throttler"): + throttler = self.pluginFactory.get_plugin(queue_config.throttler) else: - max_queued_workers = maxQueuedWorkers_slave - - elif 
queue_config.mapType == "NoJob": # for pull mode, limit to activated jobs - if job_stats is None: - tmp_log.warning("n_activated not defined, defaulting to configured queue limits") - pass - else: - # limit the queue to the number of activated jobs to avoid empty pilots - try: - n_min_pilots = 1 - if self.get_queue_no_pilots_when_no_active_jobs(queue_name): - n_min_pilots = 0 - - tmp_n_activated_jobs = job_stats[queue_name]["activated"] - tmp_log.debug(f"available activated panda jobs {tmp_n_activated_jobs}") + throttler = None + self.throttlerMap[queue_name] = throttler + + # check throttler + throttler = self.throttlerMap[queue_name] + if throttler is not None: + to_throttle, tmp_msg = throttler.to_be_throttled(queue_config, queue_config_mapper=self.queue_configMapper) + if to_throttle: + dyn_num_workers[queue_name][job_type][resource_type][pilot_type]["nNewWorkers"] = 0 + ret_msg = f"set n_new_workers=0 by {throttler.__class__.__name__}:{tmp_msg}" + tmp_log.debug(ret_msg) + continue + + # check stats + n_queue = tmp_val["nQueue"] + n_ready = tmp_val["nReady"] + n_running = tmp_val["nRunning"] + if resource_type != "ANY" and job_type != "ANY" and job_type is not None: + n_queue_total += n_queue + n_ready_total += n_ready + n_running_total += n_running + if queue_config.runMode == "slave": + n_new_workers_def = tmp_val["nNewWorkers"] + if n_new_workers_def == 0: + dyn_num_workers[queue_name][job_type][resource_type][pilot_type]["nNewWorkers"] = 0 + ret_msg = "set n_new_workers=0 by panda in slave mode" + tmp_log.debug(ret_msg) + continue + else: + n_new_workers_def = None - activate_worker_factor = self.get_activate_worker_factor(queue_name, job_type, resource_type, queue_dict, queue_config) - if tmp_n_activated_jobs * activate_worker_factor > 0: + # define num of new workers based on static site config + n_new_workers = 0 + if n_queue >= n_queue_limit_per_rt > 0: + # enough queued workers + ret_msg = f"No n_new_workers since 
n_queue({n_queue})>=n_queue_limit_per_rt({n_queue_limit_per_rt})" + tmp_log.debug(ret_msg) + pass + elif (n_queue + n_ready + n_running) >= max_workers > 0: + # enough workers in the system + ret_msg = ( + f"No n_new_workers since n_queue({n_queue}) + n_ready({n_ready}) + n_running({n_running}) " f">= max_workers({max_workers})" + ) + tmp_log.debug(ret_msg) + pass + elif queue_limit_cores is not None and cores_queue >= queue_limit_cores: + # enough queuing cores + ret_msg = f"No n_new_workers since cores_queue({cores_queue}) >= " f"queue_limit_cores({queue_limit_cores})" + tmp_log.debug(ret_msg) + pass + elif queue_limit_memory is not None and memory_queue >= queue_limit_memory: + # enough queuing cores + ret_msg = f"No n_new_workers since memory_queue({memory_queue} MB) >= " f"queue_limit_memory({queue_limit_memory} MB)" + tmp_log.debug(ret_msg) + pass + else: + max_queued_workers = None + + if n_queue_limit_per_rt > 0: # there is a limit set for the queue + max_queued_workers = n_queue_limit_per_rt + + # Reset the maxQueueWorkers according to particular + if n_new_workers_def is not None: # don't surpass limits given centrally + + maxQueuedWorkers_slave = n_new_workers_def + n_queue + if max_queued_workers is not None: + max_queued_workers = min(maxQueuedWorkers_slave, max_queued_workers) + else: + max_queued_workers = maxQueuedWorkers_slave + + elif queue_config.mapType == "NoJob": # for pull mode, limit to activated jobs + if job_stats is None: + tmp_log.warning("n_activated not defined, defaulting to configured queue limits") + pass + else: + # limit the queue to the number of activated jobs to avoid empty pilots + try: n_min_pilots = 1 - n_activated = max(int(tmp_n_activated_jobs * activate_worker_factor), n_min_pilots) # avoid no activity queues - except KeyError: - # zero job in the queue - tmp_log.debug("no job in queue") - if self.get_queue_no_pilots_when_no_active_jobs(queue_name): - n_activated = 0 - else: - n_activated = max(1 - n_queue - n_ready - 
n_running, 0) - finally: - queue_limit = max_queued_workers - max_queued_workers = min(n_activated, max_queued_workers) - tmp_log.debug(f"limiting max_queued_workers to min(n_activated={n_activated}, queue_limit={queue_limit})") - - if max_queued_workers is None: # no value found, use default value - max_queued_workers = 1 - - # new workers - n_new_workers = max(max_queued_workers - n_queue, 0) - tmp_log.debug(f"setting n_new_workers to {n_new_workers} in max_queued_workers calculation") - if max_workers > 0: - n_new_workers = min(n_new_workers, max(max_workers - n_queue - n_ready - n_running, 0)) - tmp_log.debug(f"setting n_new_workers to {n_new_workers} to respect max_workers") - if queue_limit_cores: - new_worker_cores_max = max(queue_limit_cores - cores_queue, 0) - n_new_workers = min(n_new_workers, math.ceil(new_worker_cores_max / rtype_request_cores)) - tmp_log.debug(f"setting n_new_workers to {n_new_workers} to respect queue_limit_cores") - if queue_limit_memory: - new_worker_memory_max = max(queue_limit_memory - memory_queue, 0) - n_new_workers = min(n_new_workers, math.ceil(new_worker_memory_max / rtype_request_memory)) - tmp_log.debug(f"setting n_new_workers to {n_new_workers} to respect queue_limit_memory") - if queue_config.maxNewWorkersPerCycle > 0: - n_new_workers = min(n_new_workers, queue_config.maxNewWorkersPerCycle) - tmp_log.debug(f"setting n_new_workers to {n_new_workers} in order to respect maxNewWorkersPerCycle") - if self.maxNewWorkers is not None and self.maxNewWorkers > 0: - n_new_workers = min(n_new_workers, self.maxNewWorkers) - tmp_log.debug(f"setting n_new_workers to {n_new_workers} in order to respect universal maxNewWorkers") - dyn_num_workers[queue_name][job_type][resource_type]["nNewWorkers"] = n_new_workers + if self.get_queue_no_pilots_when_no_active_jobs(queue_name): + n_min_pilots = 0 + + tmp_n_activated_jobs = job_stats[queue_name]["activated"] + tmp_log.debug(f"available activated panda jobs {tmp_n_activated_jobs}") + + 
activate_worker_factor = self.get_activate_worker_factor(queue_name, job_type, resource_type, queue_dict, queue_config) + if tmp_n_activated_jobs * activate_worker_factor > 0: + n_min_pilots = 1 + n_activated = max(int(tmp_n_activated_jobs * activate_worker_factor), n_min_pilots) # avoid no activity queues + except KeyError: + # zero job in the queue + tmp_log.debug("no job in queue") + if self.get_queue_no_pilots_when_no_active_jobs(queue_name): + n_activated = 0 + else: + n_activated = max(1 - n_queue - n_ready - n_running, 0) + finally: + queue_limit = max_queued_workers + max_queued_workers = min(n_activated, max_queued_workers) + tmp_log.debug(f"limiting max_queued_workers to min(n_activated={n_activated}, queue_limit={queue_limit})") + + if max_queued_workers is None: # no value found, use default value + max_queued_workers = 1 + + # new workers + n_new_workers = max(max_queued_workers - n_queue, 0) + tmp_log.debug(f"setting n_new_workers to {n_new_workers} in max_queued_workers calculation") + if max_workers > 0: + n_new_workers = min(n_new_workers, max(max_workers - n_queue - n_ready - n_running, 0)) + tmp_log.debug(f"setting n_new_workers to {n_new_workers} to respect max_workers") + if queue_limit_cores: + new_worker_cores_max = max(queue_limit_cores - cores_queue, 0) + n_new_workers = min(n_new_workers, math.ceil(new_worker_cores_max / rtype_request_cores)) + tmp_log.debug(f"setting n_new_workers to {n_new_workers} to respect queue_limit_cores") + if queue_limit_memory: + new_worker_memory_max = max(queue_limit_memory - memory_queue, 0) + n_new_workers = min(n_new_workers, math.ceil(new_worker_memory_max / rtype_request_memory)) + tmp_log.debug(f"setting n_new_workers to {n_new_workers} to respect queue_limit_memory") + if queue_config.maxNewWorkersPerCycle > 0: + n_new_workers = min(n_new_workers, queue_config.maxNewWorkersPerCycle) + tmp_log.debug(f"setting n_new_workers to {n_new_workers} in order to respect maxNewWorkersPerCycle") + if 
self.maxNewWorkers is not None and self.maxNewWorkers > 0: + n_new_workers = min(n_new_workers, self.maxNewWorkers) + tmp_log.debug(f"setting n_new_workers to {n_new_workers} in order to respect universal maxNewWorkers") + dyn_num_workers[queue_name][job_type][resource_type][pilot_type]["nNewWorkers"] = n_new_workers # adjust n_new_workers for UCORE to let aggregations over RT respect nQueueLimitWorker and max_workers if queue_config is None: @@ -401,7 +406,8 @@ def _normalize_job_type_any(queue_dict): for _jt in dyn_num_workers[queue_name]: for _rt in dyn_num_workers[queue_name][_jt]: if _jt != "ANY" and _rt != "ANY": - total_new_workers_rts = total_new_workers_rts + dyn_num_workers[queue_name][_jt][_rt]["nNewWorkers"] + for _pt in dyn_num_workers[queue_name][_jt][_rt]: + total_new_workers_rts = total_new_workers_rts + dyn_num_workers[queue_name][_jt][_rt][_pt]["nNewWorkers"] n_new_workers_max_agg = min(max(n_queue_limit - n_queue_total, 0), max(max_workers - n_queue_total - n_ready_total - n_running_total, 0)) if max_new_workers_per_cycle >= 0: n_new_workers_max_agg = min(n_new_workers_max_agg, max_new_workers_per_cycle) @@ -413,7 +419,8 @@ def _normalize_job_type_any(queue_dict): if n_new_workers_max_agg == 0: for job_type in dyn_num_workers[queue_name]: for resource_type in dyn_num_workers[queue_name][job_type]: - dyn_num_workers[queue_name][job_type][resource_type]["nNewWorkers"] = 0 + for pilot_type in dyn_num_workers[queue_name][job_type][resource_type]: + dyn_num_workers[queue_name][job_type][resource_type][pilot_type]["nNewWorkers"] = 0 tmp_log.debug("No n_new_workers since n_new_workers_max_agg=0 for UCORE") else: tmp_log.debug(f"n_new_workers_max_agg={n_new_workers_max_agg} for UCORE") @@ -424,33 +431,38 @@ def _normalize_job_type_any(queue_dict): simple_rt_nw_list = [] for job_type in _d: # jt: job type for resource_type in _d[job_type]: # rt: resource type - simple_rt_nw_list.append([(resource_type, job_type), 
_d[job_type][resource_type].get("nNewWorkers", 0), 0]) + for pilot_type in _d[job_type][resource_type]: # pt: pilot type + simple_rt_nw_list.append( + [(resource_type, job_type, pilot_type), _d[job_type][resource_type][pilot_type].get("nNewWorkers", 0), 0] + ) _countdown = n_new_workers_max_agg for _rt_list in simple_rt_nw_list: - (resource_type, job_type), n_new_workers_orig, _r = _rt_list + (resource_type, job_type, pilot_type), n_new_workers_orig, _r = _rt_list n_new_workers, remainder = divmod(n_new_workers_orig * n_new_workers_max_agg, total_new_workers_rts) - dyn_num_workers[queue_name][job_type].setdefault(resource_type, {"nReady": 0, "nRunning": 0, "nQueue": 0, "nNewWorkers": 0}) - dyn_num_workers[queue_name][job_type][resource_type]["nNewWorkers"] = n_new_workers + dyn_num_workers[queue_name][job_type].setdefault(resource_type, {}) + dyn_num_workers[queue_name][job_type][resource_type].setdefault( + pilot_type, {"nReady": 0, "nRunning": 0, "nQueue": 0, "nNewWorkers": 0} + ) + dyn_num_workers[queue_name][job_type][resource_type][pilot_type]["nNewWorkers"] = n_new_workers _rt_list[2] = remainder _countdown -= n_new_workers _s_list = sorted(simple_rt_nw_list, key=(lambda x: x[1])) sorted_rt_nw_list = sorted(_s_list, key=(lambda x: x[2]), reverse=True) - for (resource_type, job_type), n_new_workers_orig, remainder in sorted_rt_nw_list: + for (resource_type, job_type, pilot_type), n_new_workers_orig, remainder in sorted_rt_nw_list: if _countdown <= 0: break - dyn_num_workers[queue_name][job_type][resource_type]["nNewWorkers"] += 1 + dyn_num_workers[queue_name][job_type][resource_type][pilot_type]["nNewWorkers"] += 1 _countdown -= 1 for job_type in dyn_num_workers[queue_name]: for resource_type in dyn_num_workers[queue_name][job_type]: if job_type == "ANY" or resource_type == "ANY": continue - n_new_workers = dyn_num_workers[queue_name][job_type][resource_type]["nNewWorkers"] - tmp_log.debug( - "setting n_new_workers to {0} of job_type {1} resource_type {2} in 
order to respect RT aggregations for UCORE".format( - n_new_workers, job_type, resource_type + for pilot_type in dyn_num_workers[queue_name][job_type][resource_type]: + n_new_workers = dyn_num_workers[queue_name][job_type][resource_type][pilot_type]["nNewWorkers"] + tmp_log.debug( + f"setting n_new_workers to {n_new_workers} of job_type={job_type} resource_type={resource_type} pilot_type={pilot_type} in order to respect RT aggregations for UCORE" ) - ) if not apf_msg: apf_data = copy.deepcopy(dyn_num_workers[queue_name]) From be4fe53be6cd66138468bdda5d15338e1361cfa6 Mon Sep 17 00:00:00 2001 From: mightqxc Date: Thu, 26 Mar 2026 13:46:47 +0100 Subject: [PATCH 12/38] worker_adjuster: add logic for prod_source_label --- pandaharvester/commit_timestamp.py | 2 +- .../harvesterbody/worker_adjuster.py | 131 +++++++++++++++++- pandaharvester/harvestercore/core_utils.py | 6 +- pandaharvester/harvestercore/db_proxy.py | 10 +- 4 files changed, 143 insertions(+), 6 deletions(-) diff --git a/pandaharvester/commit_timestamp.py b/pandaharvester/commit_timestamp.py index 0cb76980..33aabcf1 100644 --- a/pandaharvester/commit_timestamp.py +++ b/pandaharvester/commit_timestamp.py @@ -1 +1 @@ -timestamp = "25-03-2026 23:38:44 on flin (by mightqxc)" +timestamp = "26-03-2026 12:46:47 on flin (by mightqxc)" diff --git a/pandaharvester/harvesterbody/worker_adjuster.py b/pandaharvester/harvesterbody/worker_adjuster.py index f6a0ba90..0f635f73 100644 --- a/pandaharvester/harvesterbody/worker_adjuster.py +++ b/pandaharvester/harvesterbody/worker_adjuster.py @@ -2,6 +2,8 @@ import math import traceback +import polars + from pandaharvester.harvesterconfig import harvester_config from pandaharvester.harvestercore import core_utils from pandaharvester.harvestercore.db_proxy_pool import DBProxyPool as DBProxy @@ -53,6 +55,95 @@ def __init__(self, queue_config_mapper): tmp_log.warning("default no_pilots_when_no_active_jobs = False") self.no_pilots_when_no_active_jobs = False + # transform job 
statistics dict to polars dataframe + def _job_stats_to_df(self, job_stats_dict: dict | None) -> polars.DataFrame: + """ + Transform nested job statistics dict into a polars dataframe. + + Args: + job_stats_dict (dict|None): Dict structure from getDetailedJobStatistics with form + {computing_site: {resource_type: {prod_source_label: {job_status: n_jobs}}}} + If None, returns an empty dataframe with the correct schema. + + Returns: + polars.DataFrame: Dataframe with columns: computing_site (Utf8), resource_type (Utf8), + prod_source_label (Utf8), job_status (Utf8), n_jobs (Int64) + """ + schema = { + "computing_site": polars.Utf8, + "resource_type": polars.Utf8, + "prod_source_label": polars.Utf8, + "job_status": polars.Utf8, + "n_jobs": polars.Int64, + } + if job_stats_dict is None: + return polars.DataFrame(schema=schema) + else: + return polars.from_records( + [ + { + "computing_site": computing_site, + "resource_type": resource_type, + "prod_source_label": prod_source_label, + "job_status": job_status, + "n_jobs": n_jobs, + } + for computing_site, resource_types in job_stats_dict.items() + for resource_type, prod_labels in resource_types.items() + for prod_source_label, statuses in prod_labels.items() + for job_status, n_jobs in statuses.items() + ], + schema=schema, + ) + + # transform num workers dict to polars dataframe + def _num_workers_dict_to_df(self, num_workers_dict: dict | None) -> polars.DataFrame: + """ + Transform nested num workers dict into a polars dataframe. + + Args: + num_workers_dict (dict|None): Dict structure with form + {queue_name: {job_type: {resource_type: {pilot_type: {"nQueue": int, "nReady": int, "nRunning": int, "nNewWorkers": int}}}}} + If None, returns an empty dataframe with the correct schema. 
+ + Returns: + polars.DataFrame: Dataframe with columns: queue_name (Utf8), job_type (Utf8), + resource_type (Utf8), pilot_type (Utf8), nQueue (Int64), nReady (Int64), nRunning (Int64), nNewWorkers (Int64) + """ + schema = { + "queue_name": polars.Utf8, + "job_type": polars.Utf8, + "resource_type": polars.Utf8, + "pilot_type": polars.Utf8, + "nQueue": polars.Int64, + "nReady": polars.Int64, + "nRunning": polars.Int64, + "nNewWorkers": polars.Int64, + } + + if num_workers_dict is None: + return polars.DataFrame(schema=schema) + else: + return polars.from_records( + [ + { + "queue_name": queue_name, + "job_type": job_type, + "resource_type": resource_type, + "pilot_type": pilot_type, + "nQueue": pilot_data.get("nQueue", 0), + "nReady": pilot_data.get("nReady", 0), + "nRunning": pilot_data.get("nRunning", 0), + "nNewWorkers": pilot_data.get("nNewWorkers", 0), + } + for queue_name, job_types in num_workers_dict.items() + for job_type, resource_types in job_types.items() + for resource_type, pilot_types in resource_types.items() + for pilot_type, pilot_data in pilot_types.items() + ], + schema=schema, + ) + # get queue noPilotsWhenNoActiveJobs def get_queue_no_pilots_when_no_active_jobs(self, site_name=None): tmp_log = core_utils.make_logger(_logger, f"site={site_name}", method_name="get_queue_no_pilots_when_no_active_jobs") @@ -196,7 +287,6 @@ def _normalize_job_type_any(queue_dict): for queue_name, queue_dict in static_num_workers.items(): _normalize_job_type_any(queue_dict) - dyn_num_workers = copy.deepcopy(static_num_workers) try: # get queue status queue_stat = self.dbProxy.get_cache("panda_queues.json", None) @@ -210,12 +300,47 @@ def _normalize_job_type_any(queue_dict): if job_stats is not None: job_stats = job_stats.data + job_stats_new_df = self._job_stats_to_df(None) + job_stats_new = self.dbProxy.get_cache("job_statistics_new.json", None) + if job_stats_new is not None: + job_stats_new_df = self._job_stats_to_df(job_stats_new.data) + + # prioritized 
prod_source_labels for pilot submission + PRIORITIZED_PROD_SOURCE_LABELS = ["rc_alrb"] + # get panda queues dict from CRIC panda_queues_dict = PandaQueuesDict() # get resource type mapper rt_mapper = ResourceTypeMapper() + # set initial nNewWorkers for pilot types based on number of activated jobs + tmp_new_workers_df = self._static_num_workers_to_df(static_num_workers) + tmp_master_df = ( + job_stats_new_df.filter(polars.col("job_status") == "activated") + .with_columns( + polars.col("prod_source_label").map_elements(core_utils.prod_source_label_to_pilot_type, return_dtype=polars.Utf8).alias("pilot_type") + ) + .join( + tmp_new_workers_df, + left_on=["computing_site", "resource_type", "pilot_type"], + right_on=["queue_name", "resource_type", "pilot_type"], + how="right", + ) + .group_by(["queue_name", "job_type", "resource_type", "pilot_type"]) + .agg( + polars.col("nQueue").max(), + polars.col("nReady").max(), + polars.col("nRunning").max(), + polars.col("nNewWorkers").max(), + polars.col("n_jobs").sum().alias("n_activated_jobs"), + ) + ) + tmp_log.debug(f"master_df: \n{tmp_master_df}") + ... 
+ + dyn_num_workers = copy.deepcopy(static_num_workers) + # define num of new workers for queue_name in static_num_workers: # get queue @@ -238,8 +363,12 @@ def _normalize_job_type_any(queue_dict): apf_msg = None apf_data = None job_type = DEFAULT_JOB_TYPE + # loop over resource types and pilot types to define nNewWorkers for resource_type, pilot_type_dict in static_num_workers[queue_name][job_type].items(): for pilot_type, tmp_val in pilot_type_dict.items(): + if pilot_type == "ANY": + continue + tmp_log.debug( f"Processing queue={queue_name} job_type={job_type} resource_type={resource_type} pilot_type={pilot_type} with static_num_workers={tmp_val}" ) diff --git a/pandaharvester/harvestercore/core_utils.py b/pandaharvester/harvestercore/core_utils.py index fc283c52..16da5901 100644 --- a/pandaharvester/harvestercore/core_utils.py +++ b/pandaharvester/harvestercore/core_utils.py @@ -708,7 +708,7 @@ def naive_utcfromtimestamp(timestamp: float) -> datetime: return aware_utcfromtimestamp(timestamp).replace(tzinfo=None) -def special_pilot_type_to_prod_source_label(pilot_type: str) -> str | None: +def special_pilot_type_to_prod_source_label(pilot_type: str) -> str: """ Convert special pilotType of worker to prodSourceLabel of PanDA job. @@ -716,14 +716,14 @@ def special_pilot_type_to_prod_source_label(pilot_type: str) -> str | None: pilot_type (str): pilotType of worker, e.g. "RC", "ALRB", "PT" (except "PR" which is production) Returns: - str: prodSourceLabel of PanDA job, e.g. "rc_test2", "rc_alrb", "ptest"; None if no mapping is defined for the given pilot_type + str: prodSourceLabel of PanDA job, e.g. 
"rc_test2", "rc_alrb", "ptest"; "ANY" if no mapping is defined for the given pilot_type """ pilot_type_to_prod_source_label_map = { "RC": "rc_test2", "ALRB": "rc_alrb", "PT": "ptest", } - return pilot_type_to_prod_source_label_map.get(pilot_type, None) + return pilot_type_to_prod_source_label_map.get(pilot_type, "ANY") def prod_source_label_to_pilot_type(prod_source_label: str) -> str: diff --git a/pandaharvester/harvestercore/db_proxy.py b/pandaharvester/harvestercore/db_proxy.py index 4e6bc03c..96602d80 100644 --- a/pandaharvester/harvestercore/db_proxy.py +++ b/pandaharvester/harvestercore/db_proxy.py @@ -1783,8 +1783,16 @@ def get_queues_to_submit(self, lookup_interval, lock_interval, locked_by, queue_ "nReady": nReady, "nRunning": nRunning, "nQueue": nQueue, - "nNewWorkers": nNewWorkers, + "nNewWorkers": 0, } + # ANY pilotType + retMap[queueName][jobType][resourceType]["ANY"].setdefault(pilotType, {"nReady": 0, "nRunning": 0, "nQueue": 0, "nNewWorkers": 0}) + retMap[queueName][jobType][resourceType]["ANY"][pilotType]["nReady"] += nReady + retMap[queueName][jobType][resourceType]["ANY"][pilotType]["nRunning"] += nRunning + retMap[queueName][jobType][resourceType]["ANY"][pilotType]["nQueue"] += nQueue + + # set nNewWorkers only in ANY pilotType + retMap[queueName][jobType][resourceType]["ANY"]["nNewWorkers"] = nNewWorkers resourceMap.setdefault(jobType, {}) resourceMap[jobType][resourceType] = queueName From 5486bfe5efb6d1fa77321ea38078327549f628a7 Mon Sep 17 00:00:00 2001 From: mightqxc Date: Thu, 26 Mar 2026 16:29:45 +0100 Subject: [PATCH 13/38] work_adjuster: more logic, dependency --- pandaharvester/commit_timestamp.py | 2 +- .../harvesterbody/worker_adjuster.py | 168 +++++++++++++----- pandaharvester/harvestercore/db_proxy.py | 1 + setup.py | 1 + 4 files changed, 128 insertions(+), 44 deletions(-) diff --git a/pandaharvester/commit_timestamp.py b/pandaharvester/commit_timestamp.py index 33aabcf1..58297b4a 100644 --- a/pandaharvester/commit_timestamp.py +++ 
b/pandaharvester/commit_timestamp.py @@ -1 +1 @@ -timestamp = "26-03-2026 12:46:47 on flin (by mightqxc)" +timestamp = "26-03-2026 15:29:46 on flin (by mightqxc)" diff --git a/pandaharvester/harvesterbody/worker_adjuster.py b/pandaharvester/harvesterbody/worker_adjuster.py index 0f635f73..8a11e1f2 100644 --- a/pandaharvester/harvesterbody/worker_adjuster.py +++ b/pandaharvester/harvesterbody/worker_adjuster.py @@ -2,7 +2,7 @@ import math import traceback -import polars +import polars as pl from pandaharvester.harvesterconfig import harvester_config from pandaharvester.harvestercore import core_utils @@ -17,6 +17,14 @@ DEFAULT_JOB_TYPE = "managed" +# polars config +pl.Config.set_ascii_tables(True) +pl.Config.set_tbl_hide_dataframe_shape(True) +pl.Config.set_tbl_hide_column_data_types(True) +pl.Config.set_tbl_rows(-1) +pl.Config.set_tbl_cols(-1) +pl.Config.set_tbl_width_chars(140) + # class to define number of workers to submit class WorkerAdjuster(object): @@ -56,7 +64,7 @@ def __init__(self, queue_config_mapper): self.no_pilots_when_no_active_jobs = False # transform job statistics dict to polars dataframe - def _job_stats_to_df(self, job_stats_dict: dict | None) -> polars.DataFrame: + def _job_stats_to_df(self, job_stats_dict: dict | None) -> pl.DataFrame: """ Transform nested job statistics dict into a polars dataframe. 
@@ -70,16 +78,16 @@ def _job_stats_to_df(self, job_stats_dict: dict | None) -> polars.DataFrame: prod_source_label (Utf8), job_status (Utf8), n_jobs (Int64) """ schema = { - "computing_site": polars.Utf8, - "resource_type": polars.Utf8, - "prod_source_label": polars.Utf8, - "job_status": polars.Utf8, - "n_jobs": polars.Int64, + "computing_site": pl.Utf8, + "resource_type": pl.Utf8, + "prod_source_label": pl.Utf8, + "job_status": pl.Utf8, + "n_jobs": pl.Int64, } if job_stats_dict is None: - return polars.DataFrame(schema=schema) + return pl.DataFrame(schema=schema) else: - return polars.from_records( + return pl.from_records( [ { "computing_site": computing_site, @@ -97,7 +105,7 @@ def _job_stats_to_df(self, job_stats_dict: dict | None) -> polars.DataFrame: ) # transform num workers dict to polars dataframe - def _num_workers_dict_to_df(self, num_workers_dict: dict | None) -> polars.DataFrame: + def _num_workers_dict_to_df(self, num_workers_dict: dict | None) -> pl.DataFrame: """ Transform nested num workers dict into a polars dataframe. 
@@ -111,20 +119,20 @@ def _num_workers_dict_to_df(self, num_workers_dict: dict | None) -> polars.DataF resource_type (Utf8), pilot_type (Utf8), nQueue (Int64), nReady (Int64), nRunning (Int64), nNewWorkers (Int64) """ schema = { - "queue_name": polars.Utf8, - "job_type": polars.Utf8, - "resource_type": polars.Utf8, - "pilot_type": polars.Utf8, - "nQueue": polars.Int64, - "nReady": polars.Int64, - "nRunning": polars.Int64, - "nNewWorkers": polars.Int64, + "queue_name": pl.Utf8, + "job_type": pl.Utf8, + "resource_type": pl.Utf8, + "pilot_type": pl.Utf8, + "nQueue": pl.Int64, + "nReady": pl.Int64, + "nRunning": pl.Int64, + "nNewWorkers": pl.Int64, } if num_workers_dict is None: - return polars.DataFrame(schema=schema) + return pl.DataFrame(schema=schema) else: - return polars.from_records( + return pl.from_records( [ { "queue_name": queue_name, @@ -314,30 +322,107 @@ def _normalize_job_type_any(queue_dict): # get resource type mapper rt_mapper = ResourceTypeMapper() - # set initial nNewWorkers for pilot types based on number of activated jobs - tmp_new_workers_df = self._static_num_workers_to_df(static_num_workers) - tmp_master_df = ( - job_stats_new_df.filter(polars.col("job_status") == "activated") - .with_columns( - polars.col("prod_source_label").map_elements(core_utils.prod_source_label_to_pilot_type, return_dtype=polars.Utf8).alias("pilot_type") + for queue_name in static_num_workers: + # set initial nNewWorkers for pilot types based on number of activated jobs + tmp_new_workers_df = ( + self._num_workers_dict_to_df(static_num_workers) + .filter(pl.col("queue_name") == queue_name) + .with_columns( + [ + pl.col("queue_name").fill_null(pl.lit(queue_name)), + pl.col("resource_type").fill_null(pl.lit("ANY")), + pl.col("pilot_type").fill_null(pl.lit("ANY")), + pl.col("nQueue").fill_null(0), + pl.col("nReady").fill_null(0), + pl.col("nRunning").fill_null(0), + pl.col("nNewWorkers").fill_null(0), + ] + ) + ) + # tmp_log.debug(f"DEBUG: tmp_new_workers_df after filter 
shape: {tmp_new_workers_df.shape}") + # tmp_log.debug(f"DEBUG: tmp_new_workers_df columns: {tmp_new_workers_df.columns}") + # tmp_log.debug(f"DEBUG: tmp_new_workers_df:\n{tmp_new_workers_df}") + + activated_df = ( + job_stats_new_df.filter((pl.col("computing_site") == queue_name) & (pl.col("job_status") == "activated")) + .with_columns( + pl.col("computing_site").alias("queue_name"), + pl.col("prod_source_label").map_elements(core_utils.prod_source_label_to_pilot_type, return_dtype=pl.Utf8).alias("pilot_type"), + ) + .select(["queue_name", "resource_type", "pilot_type", "n_jobs"]) + ) + # Add aggregated rows with resource_type="ANY" (sum over all resource_types for each pilot_type) + activated_df_any_rt = ( + activated_df.group_by(["queue_name", "pilot_type"]) + .agg(pl.col("n_jobs").sum()) + .with_columns(pl.lit("ANY").alias("resource_type")) + .select(["queue_name", "resource_type", "pilot_type", "n_jobs"]) + ) + # Add aggregated rows with pilot_type="ANY" (sum over all pilot_types for each resource_type) + activated_df_any_pt = ( + activated_df.group_by(["queue_name", "resource_type"]) + .agg(pl.col("n_jobs").sum()) + .with_columns(pl.lit("ANY").alias("pilot_type")) + .select(["queue_name", "resource_type", "pilot_type", "n_jobs"]) + ) + # Add aggregated row with both resource_type="ANY" and pilot_type="ANY" (sum over all) + activated_df_any_both = ( + activated_df.select(pl.col("n_jobs").sum()) + .with_columns(pl.lit(queue_name).alias("queue_name"), pl.lit("ANY").alias("resource_type"), pl.lit("ANY").alias("pilot_type")) + .select(["queue_name", "resource_type", "pilot_type", "n_jobs"]) ) - .join( + + activated_df = pl.concat([activated_df, activated_df_any_rt, activated_df_any_pt, activated_df_any_both]) + # tmp_log.debug(f"DEBUG: activated_df after filter shape: {activated_df.shape}") + # tmp_log.debug(f"DEBUG: activated_df columns: {activated_df.columns}") + # tmp_log.debug(f"DEBUG: activated_df:\n{activated_df}") + + joined_df = activated_df.join( 
tmp_new_workers_df, - left_on=["computing_site", "resource_type", "pilot_type"], - right_on=["queue_name", "resource_type", "pilot_type"], - how="right", + on=["queue_name", "resource_type", "pilot_type"], + how="full", + ).with_columns( + [ + pl.col("queue_name").fill_null(pl.lit(queue_name)), + pl.col("resource_type").fill_null(pl.lit("ANY")), + pl.col("pilot_type").fill_null(pl.lit("ANY")), + pl.col("nQueue").fill_null(0), + pl.col("nReady").fill_null(0), + pl.col("nRunning").fill_null(0), + pl.col("nNewWorkers").fill_null(0), + pl.col("job_type").fill_null(DEFAULT_JOB_TYPE), + ] ) - .group_by(["queue_name", "job_type", "resource_type", "pilot_type"]) - .agg( - polars.col("nQueue").max(), - polars.col("nReady").max(), - polars.col("nRunning").max(), - polars.col("nNewWorkers").max(), - polars.col("n_jobs").sum().alias("n_activated_jobs"), + # tmp_log.debug(f"DEBUG: joined_df shape: {joined_df.shape}") + # tmp_log.debug(f"DEBUG: joined_df columns: {joined_df.columns}") + # tmp_log.debug(f"DEBUG: joined_df:\n{joined_df}") + + tmp_master_df = ( + joined_df.group_by(["queue_name", "job_type", "resource_type", "pilot_type"]) + .agg( + pl.col("nQueue").max(), + pl.col("nReady").max(), + pl.col("nRunning").max(), + pl.col("nNewWorkers").max(), + pl.col("n_jobs").fill_null(0).sum().alias("n_activated_jobs"), + ) + .sort( + [ + "queue_name", + "job_type", + pl.when(pl.col("resource_type") == "ANY").then(1).otherwise(0), + "resource_type", + pl.when(pl.col("pilot_type") == "ANY").then(1).otherwise(0), + "pilot_type", + ] + ) ) - ) - tmp_log.debug(f"master_df: \n{tmp_master_df}") - ... + tmp_log.debug(f"master_df: \n{tmp_master_df}") + ... 
+ for job_type in static_num_workers[queue_name]: + # remove pilot type ANY + for resource_type, pilot_type_dict in static_num_workers[queue_name][job_type].items(): + del pilot_type_dict["ANY"] dyn_num_workers = copy.deepcopy(static_num_workers) @@ -366,9 +451,6 @@ def _normalize_job_type_any(queue_dict): # loop over resource types and pilot types to define nNewWorkers for resource_type, pilot_type_dict in static_num_workers[queue_name][job_type].items(): for pilot_type, tmp_val in pilot_type_dict.items(): - if pilot_type == "ANY": - continue - tmp_log.debug( f"Processing queue={queue_name} job_type={job_type} resource_type={resource_type} pilot_type={pilot_type} with static_num_workers={tmp_val}" ) diff --git a/pandaharvester/harvestercore/db_proxy.py b/pandaharvester/harvestercore/db_proxy.py index 96602d80..d0b44ca3 100644 --- a/pandaharvester/harvestercore/db_proxy.py +++ b/pandaharvester/harvestercore/db_proxy.py @@ -1786,6 +1786,7 @@ def get_queues_to_submit(self, lookup_interval, lock_interval, locked_by, queue_ "nNewWorkers": 0, } # ANY pilotType + retMap[queueName][jobType][resourceType].setdefault("ANY", {}) retMap[queueName][jobType][resourceType]["ANY"].setdefault(pilotType, {"nReady": 0, "nRunning": 0, "nQueue": 0, "nNewWorkers": 0}) retMap[queueName][jobType][resourceType]["ANY"][pilotType]["nReady"] += nReady retMap[queueName][jobType][resourceType]["ANY"][pilotType]["nRunning"] += nRunning diff --git a/setup.py b/setup.py index e70194d2..8997b1ce 100644 --- a/setup.py +++ b/setup.py @@ -36,6 +36,7 @@ "pexpect", "psutil >= 5.4.8", "panda-pilot >= 2.7.2.1", + "polars", ], # optional pip dependencies extras_require={ From 8d9a70791a95796ccf922a1d38d171edd59b7be7 Mon Sep 17 00:00:00 2001 From: mightqxc Date: Thu, 26 Mar 2026 18:52:14 +0100 Subject: [PATCH 14/38] fixes --- pandaharvester/commit_timestamp.py | 2 +- pandaharvester/harvesterbody/submitter.py | 2 +- .../harvesterbody/worker_adjuster.py | 118 ++++++++++++++++-- 
pandaharvester/harvestercore/db_proxy.py | 6 +- 4 files changed, 116 insertions(+), 12 deletions(-) diff --git a/pandaharvester/commit_timestamp.py b/pandaharvester/commit_timestamp.py index 58297b4a..8b32d745 100644 --- a/pandaharvester/commit_timestamp.py +++ b/pandaharvester/commit_timestamp.py @@ -1 +1 @@ -timestamp = "26-03-2026 15:29:46 on flin (by mightqxc)" +timestamp = "26-03-2026 17:52:15 on flin (by mightqxc)" diff --git a/pandaharvester/harvesterbody/submitter.py b/pandaharvester/harvesterbody/submitter.py index 0aaf4059..e1f05ea7 100644 --- a/pandaharvester/harvesterbody/submitter.py +++ b/pandaharvester/harvesterbody/submitter.py @@ -96,7 +96,7 @@ def run(self): prod_source_label = core_utils.special_pilot_type_to_prod_source_label(pilot_type) tmp_log = self.make_logger( _logger, - f"id={locked_by} queue={queue_name} jtype={job_type} rtype={resource_type} pslabel={prod_source_label}", + f"id={locked_by} queue={queue_name} jtype={job_type} rtype={resource_type} ptype={pilot_type} pslabel={prod_source_label}", method_name="run", ) try: diff --git a/pandaharvester/harvesterbody/worker_adjuster.py b/pandaharvester/harvesterbody/worker_adjuster.py index 8a11e1f2..d3799a33 100644 --- a/pandaharvester/harvesterbody/worker_adjuster.py +++ b/pandaharvester/harvesterbody/worker_adjuster.py @@ -315,6 +315,7 @@ def _normalize_job_type_any(queue_dict): # prioritized prod_source_labels for pilot submission PRIORITIZED_PROD_SOURCE_LABELS = ["rc_alrb"] + PRIORITIZED_PILOT_TYPES = [core_utils.prod_source_label_to_pilot_type(label) for label in PRIORITIZED_PROD_SOURCE_LABELS] # get panda queues dict from CRIC panda_queues_dict = PandaQueuesDict() @@ -323,7 +324,6 @@ def _normalize_job_type_any(queue_dict): rt_mapper = ResourceTypeMapper() for queue_name in static_num_workers: - # set initial nNewWorkers for pilot types based on number of activated jobs tmp_new_workers_df = ( self._num_workers_dict_to_df(static_num_workers) .filter(pl.col("queue_name") == queue_name) @@ 
-417,14 +417,118 @@ def _normalize_job_type_any(queue_dict): ] ) ) - tmp_log.debug(f"master_df: \n{tmp_master_df}") - ... - for job_type in static_num_workers[queue_name]: - # remove pilot type ANY - for resource_type, pilot_type_dict in static_num_workers[queue_name][job_type].items(): - del pilot_type_dict["ANY"] + # tmp_log.debug(f"master_df: \n{tmp_master_df}") + master_df = tmp_master_df.clone() + + tmp_static_num_workers = copy.deepcopy(static_num_workers) + + # update tmp_static_num_workers with tmp_master_df + # for row in tmp_master_df.iter_rows(named=True): + # queue_name_from_row = row["queue_name"] + # job_type = row["job_type"] + # resource_type = row["resource_type"] + # pilot_type = row["pilot_type"] + # # create missing keys in nested dictionary + # if queue_name_from_row not in tmp_static_num_workers: + # tmp_static_num_workers[queue_name_from_row] = {} + # if job_type not in tmp_static_num_workers[queue_name_from_row]: + # tmp_static_num_workers[queue_name_from_row][job_type] = {} + # if resource_type not in tmp_static_num_workers[queue_name_from_row][job_type]: + # tmp_static_num_workers[queue_name_from_row][job_type][resource_type] = {} + # if pilot_type not in tmp_static_num_workers[queue_name_from_row][job_type][resource_type]: + # tmp_static_num_workers[queue_name_from_row][job_type][resource_type][pilot_type] = {} + # # update values + # tmp_static_num_workers[queue_name_from_row][job_type][resource_type][pilot_type].update({ + # "nQueue": row["nQueue"], + # "nReady": row["nReady"], + # "nRunning": row["nRunning"], + # "nNewWorkers": row["nNewWorkers"], + # }) + + queue_config = self.queue_configMapper.get_queue(queue_name) + queue_dict = panda_queues_dict.get(queue_name, {}) + # set initial nNewWorkers for pilot types based on number of activated jobs and the activate worker factor + for job_type in tmp_static_num_workers[queue_name]: + for resource_type, pilot_type_dict in tmp_static_num_workers[queue_name][job_type].items(): + 
total_n_new_workers = pilot_type_dict["ANY"]["nNewWorkers"] + if total_n_new_workers <= 0: + continue + # calculate the total number of new workers needed for prioritized pilot types + remaining_n_new_workers = total_n_new_workers + activate_worker_factor = self.get_activate_worker_factor(queue_name, job_type, resource_type, queue_dict, queue_config) + prio_ptype_result = tmp_master_df.filter( + (pl.col("queue_name") == queue_name) + & (pl.col("job_type") == job_type) + & (pl.col("resource_type") == resource_type) + & (pl.col("pilot_type").is_in(PRIORITIZED_PILOT_TYPES)) + ).select([pl.col("n_activated_jobs").sum(), pl.col("nQueue").sum()]) + if prio_ptype_result.shape[0] > 0: + total_prio_ptype_n_activated_jobs, total_prio_ptype_nQueue = prio_ptype_result.row(0) + else: + total_prio_ptype_n_activated_jobs, total_prio_ptype_nQueue = 0, 0 + total_prio_ptype_calculated_n_new_workers = max( + int(total_prio_ptype_n_activated_jobs * activate_worker_factor) - total_prio_ptype_nQueue, 0 + ) + if total_prio_ptype_calculated_n_new_workers > 0: + adjust_ratio = min(total_n_new_workers / total_prio_ptype_calculated_n_new_workers, 1) + for pilot_type, tmp_val in pilot_type_dict.items(): + if pilot_type in PRIORITIZED_PILOT_TYPES: + pt_result = tmp_master_df.filter( + (pl.col("queue_name") == queue_name) + & (pl.col("job_type") == job_type) + & (pl.col("resource_type") == resource_type) + & (pl.col("pilot_type") == pilot_type) + ).select([pl.col("n_activated_jobs"), pl.col("nQueue")]) + if pt_result.shape[0] > 0: + n_activated_jobs, nQueue = pt_result.row(0) + else: + n_activated_jobs, nQueue = 0, 0 + calculated_n_new_workers = int(max(int(n_activated_jobs * activate_worker_factor) - nQueue, 0) * adjust_ratio) + if calculated_n_new_workers <= 0: + continue + tmp_static_num_workers[queue_name][job_type][resource_type][pilot_type]["nNewWorkers"] = calculated_n_new_workers + remaining_n_new_workers -= calculated_n_new_workers + master_df = master_df.with_columns( + pl.when( + 
(pl.col("queue_name") == queue_name) + & (pl.col("job_type") == job_type) + & (pl.col("resource_type") == resource_type) + & (pl.col("pilot_type") == pilot_type) + ) + .then(pl.lit(calculated_n_new_workers)) + .otherwise(pl.col("nNewWorkers")) + .alias("nNewWorkers") + ) + tmp_log.debug( + f"set initial nNewWorkers to {calculated_n_new_workers} for queue={queue_name} job_type={job_type} resource_type={resource_type} pilot_type={pilot_type}" + ) + if remaining_n_new_workers > 0: + # add remaining n_new_workers to PR pilot_type + tmp_static_num_workers[queue_name][job_type][resource_type]["PR"]["nNewWorkers"] += remaining_n_new_workers + master_df = master_df.with_columns( + pl.when( + (pl.col("queue_name") == queue_name) + & (pl.col("job_type") == job_type) + & (pl.col("resource_type") == resource_type) + & (pl.col("pilot_type") == "PR") + ) + .then(pl.lit(remaining_n_new_workers)) + .otherwise(pl.col("nNewWorkers")) + .alias("nNewWorkers") + ) + tmp_log.debug(f"master_df: \n{master_df}") + # remove pilot type ANY + for job_type in tmp_static_num_workers[queue_name]: + for resource_type, pilot_type_dict in tmp_static_num_workers[queue_name][job_type].items(): + if "ANY" in pilot_type_dict: + del pilot_type_dict["ANY"] dyn_num_workers = copy.deepcopy(static_num_workers) + for queue_name in dyn_num_workers: + for job_type in dyn_num_workers[queue_name]: + for resource_type, pilot_type_dict in dyn_num_workers[queue_name][job_type].items(): + if "ANY" in pilot_type_dict: + del pilot_type_dict["ANY"] # define num of new workers for queue_name in static_num_workers: diff --git a/pandaharvester/harvestercore/db_proxy.py b/pandaharvester/harvestercore/db_proxy.py index d0b44ca3..6138f0f8 100644 --- a/pandaharvester/harvestercore/db_proxy.py +++ b/pandaharvester/harvestercore/db_proxy.py @@ -1788,9 +1788,9 @@ def get_queues_to_submit(self, lookup_interval, lock_interval, locked_by, queue_ # ANY pilotType retMap[queueName][jobType][resourceType].setdefault("ANY", {}) 
retMap[queueName][jobType][resourceType]["ANY"].setdefault(pilotType, {"nReady": 0, "nRunning": 0, "nQueue": 0, "nNewWorkers": 0}) - retMap[queueName][jobType][resourceType]["ANY"][pilotType]["nReady"] += nReady - retMap[queueName][jobType][resourceType]["ANY"][pilotType]["nRunning"] += nRunning - retMap[queueName][jobType][resourceType]["ANY"][pilotType]["nQueue"] += nQueue + retMap[queueName][jobType][resourceType]["ANY"]["nReady"] += nReady + retMap[queueName][jobType][resourceType]["ANY"]["nRunning"] += nRunning + retMap[queueName][jobType][resourceType]["ANY"]["nQueue"] += nQueue # set nNewWorkers only in ANY pilotType retMap[queueName][jobType][resourceType]["ANY"]["nNewWorkers"] = nNewWorkers From a89323f59368eb4593877a875ed95184ebc879c7 Mon Sep 17 00:00:00 2001 From: mightqxc Date: Thu, 26 Mar 2026 23:44:35 +0100 Subject: [PATCH 15/38] fix --- pandaharvester/commit_timestamp.py | 2 +- pandaharvester/harvestercore/db_proxy.py | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/pandaharvester/commit_timestamp.py b/pandaharvester/commit_timestamp.py index 8b32d745..772e2ced 100644 --- a/pandaharvester/commit_timestamp.py +++ b/pandaharvester/commit_timestamp.py @@ -1 +1 @@ -timestamp = "26-03-2026 17:52:15 on flin (by mightqxc)" +timestamp = "26-03-2026 22:44:37 on flin (by mightqxc)" diff --git a/pandaharvester/harvestercore/db_proxy.py b/pandaharvester/harvestercore/db_proxy.py index 6138f0f8..d5008e15 100644 --- a/pandaharvester/harvestercore/db_proxy.py +++ b/pandaharvester/harvestercore/db_proxy.py @@ -1786,8 +1786,7 @@ def get_queues_to_submit(self, lookup_interval, lock_interval, locked_by, queue_ "nNewWorkers": 0, } # ANY pilotType - retMap[queueName][jobType][resourceType].setdefault("ANY", {}) - retMap[queueName][jobType][resourceType]["ANY"].setdefault(pilotType, {"nReady": 0, "nRunning": 0, "nQueue": 0, "nNewWorkers": 0}) + retMap[queueName][jobType][resourceType].setdefault("ANY", {"nReady": 0, "nRunning": 0, "nQueue": 0, 
"nNewWorkers": 0}) retMap[queueName][jobType][resourceType]["ANY"]["nReady"] += nReady retMap[queueName][jobType][resourceType]["ANY"]["nRunning"] += nRunning retMap[queueName][jobType][resourceType]["ANY"]["nQueue"] += nQueue From 97671c0561bac4213a1a0ee8d3d88bdf79364676 Mon Sep 17 00:00:00 2001 From: mightqxc Date: Thu, 26 Mar 2026 23:53:06 +0100 Subject: [PATCH 16/38] fix --- pandaharvester/commit_timestamp.py | 2 +- .../harvesterbody/worker_adjuster.py | 48 ++++++++++--------- 2 files changed, 26 insertions(+), 24 deletions(-) diff --git a/pandaharvester/commit_timestamp.py b/pandaharvester/commit_timestamp.py index 772e2ced..ace6d35c 100644 --- a/pandaharvester/commit_timestamp.py +++ b/pandaharvester/commit_timestamp.py @@ -1 +1 @@ -timestamp = "26-03-2026 22:44:37 on flin (by mightqxc)" +timestamp = "26-03-2026 22:53:06 on flin (by mightqxc)" diff --git a/pandaharvester/harvesterbody/worker_adjuster.py b/pandaharvester/harvesterbody/worker_adjuster.py index d3799a33..638736a2 100644 --- a/pandaharvester/harvesterbody/worker_adjuster.py +++ b/pandaharvester/harvesterbody/worker_adjuster.py @@ -423,27 +423,29 @@ def _normalize_job_type_any(queue_dict): tmp_static_num_workers = copy.deepcopy(static_num_workers) # update tmp_static_num_workers with tmp_master_df - # for row in tmp_master_df.iter_rows(named=True): - # queue_name_from_row = row["queue_name"] - # job_type = row["job_type"] - # resource_type = row["resource_type"] - # pilot_type = row["pilot_type"] - # # create missing keys in nested dictionary - # if queue_name_from_row not in tmp_static_num_workers: - # tmp_static_num_workers[queue_name_from_row] = {} - # if job_type not in tmp_static_num_workers[queue_name_from_row]: - # tmp_static_num_workers[queue_name_from_row][job_type] = {} - # if resource_type not in tmp_static_num_workers[queue_name_from_row][job_type]: - # tmp_static_num_workers[queue_name_from_row][job_type][resource_type] = {} - # if pilot_type not in 
tmp_static_num_workers[queue_name_from_row][job_type][resource_type]: - # tmp_static_num_workers[queue_name_from_row][job_type][resource_type][pilot_type] = {} - # # update values - # tmp_static_num_workers[queue_name_from_row][job_type][resource_type][pilot_type].update({ - # "nQueue": row["nQueue"], - # "nReady": row["nReady"], - # "nRunning": row["nRunning"], - # "nNewWorkers": row["nNewWorkers"], - # }) + for row in tmp_master_df.iter_rows(named=True): + queue_name_from_row = row["queue_name"] + job_type = row["job_type"] + resource_type = row["resource_type"] + pilot_type = row["pilot_type"] + # create missing keys in nested dictionary + if queue_name_from_row not in tmp_static_num_workers: + tmp_static_num_workers[queue_name_from_row] = {} + if job_type not in tmp_static_num_workers[queue_name_from_row]: + tmp_static_num_workers[queue_name_from_row][job_type] = {} + if resource_type not in tmp_static_num_workers[queue_name_from_row][job_type]: + tmp_static_num_workers[queue_name_from_row][job_type][resource_type] = {} + if pilot_type not in tmp_static_num_workers[queue_name_from_row][job_type][resource_type]: + tmp_static_num_workers[queue_name_from_row][job_type][resource_type][pilot_type] = {} + # update values + tmp_static_num_workers[queue_name_from_row][job_type][resource_type][pilot_type].update( + { + "nQueue": row["nQueue"], + "nReady": row["nReady"], + "nRunning": row["nRunning"], + "nNewWorkers": row["nNewWorkers"], + } + ) queue_config = self.queue_configMapper.get_queue(queue_name) queue_dict = panda_queues_dict.get(queue_name, {}) @@ -518,8 +520,8 @@ def _normalize_job_type_any(queue_dict): ) tmp_log.debug(f"master_df: \n{master_df}") # remove pilot type ANY - for job_type in tmp_static_num_workers[queue_name]: - for resource_type, pilot_type_dict in tmp_static_num_workers[queue_name][job_type].items(): + for job_type in static_num_workers[queue_name]: + for resource_type, pilot_type_dict in static_num_workers[queue_name][job_type].items(): if 
"ANY" in pilot_type_dict: del pilot_type_dict["ANY"] From 4f4868f290af7b1e9b960800516bf43d4b27d403 Mon Sep 17 00:00:00 2001 From: mightqxc Date: Fri, 27 Mar 2026 09:15:34 +0100 Subject: [PATCH 17/38] worker_adjuster: fix behavior --- pandaharvester/commit_timestamp.py | 2 +- .../harvesterbody/worker_adjuster.py | 22 +++++++++++++++---- .../simple_worker_maker.py | 2 +- 3 files changed, 20 insertions(+), 6 deletions(-) diff --git a/pandaharvester/commit_timestamp.py b/pandaharvester/commit_timestamp.py index ace6d35c..e74fba4a 100644 --- a/pandaharvester/commit_timestamp.py +++ b/pandaharvester/commit_timestamp.py @@ -1 +1 @@ -timestamp = "26-03-2026 22:53:06 on flin (by mightqxc)" +timestamp = "27-03-2026 08:15:34 on flin (by mightqxc)" diff --git a/pandaharvester/harvesterbody/worker_adjuster.py b/pandaharvester/harvesterbody/worker_adjuster.py index 638736a2..135a207c 100644 --- a/pandaharvester/harvesterbody/worker_adjuster.py +++ b/pandaharvester/harvesterbody/worker_adjuster.py @@ -16,6 +16,7 @@ _logger = core_utils.setup_logger("worker_adjuster") DEFAULT_JOB_TYPE = "managed" +DEFAULT_PILOT_TYPE = "PR" # polars config pl.Config.set_ascii_tables(True) @@ -489,6 +490,9 @@ def _normalize_job_type_any(queue_dict): if calculated_n_new_workers <= 0: continue tmp_static_num_workers[queue_name][job_type][resource_type][pilot_type]["nNewWorkers"] = calculated_n_new_workers + static_num_workers[queue_name].setdefault(job_type, {}).setdefault(resource_type, {}).setdefault( + pilot_type, {"nReady": 0, "nRunning": 0, "nQueue": 0, "nNewWorkers": 0} + )["nNewWorkers"] = calculated_n_new_workers remaining_n_new_workers -= calculated_n_new_workers master_df = master_df.with_columns( pl.when( @@ -502,17 +506,20 @@ def _normalize_job_type_any(queue_dict): .alias("nNewWorkers") ) tmp_log.debug( - f"set initial nNewWorkers to {calculated_n_new_workers} for queue={queue_name} job_type={job_type} resource_type={resource_type} pilot_type={pilot_type}" + f"Set initial nNewWorkers to 
{calculated_n_new_workers} for queue={queue_name} job_type={job_type} resource_type={resource_type} pilot_type={pilot_type}" ) if remaining_n_new_workers > 0: - # add remaining n_new_workers to PR pilot_type - tmp_static_num_workers[queue_name][job_type][resource_type]["PR"]["nNewWorkers"] += remaining_n_new_workers + # add remaining n_new_workers to DEFAULT_PILOT_TYPE PR + tmp_static_num_workers[queue_name][job_type][resource_type][DEFAULT_PILOT_TYPE]["nNewWorkers"] += remaining_n_new_workers + static_num_workers[queue_name].setdefault(job_type, {}).setdefault(resource_type, {}).setdefault( + DEFAULT_PILOT_TYPE, {"nReady": 0, "nRunning": 0, "nQueue": 0, "nNewWorkers": 0} + )["nNewWorkers"] = tmp_static_num_workers[queue_name][job_type][resource_type][DEFAULT_PILOT_TYPE]["nNewWorkers"] master_df = master_df.with_columns( pl.when( (pl.col("queue_name") == queue_name) & (pl.col("job_type") == job_type) & (pl.col("resource_type") == resource_type) - & (pl.col("pilot_type") == "PR") + & (pl.col("pilot_type") == DEFAULT_PILOT_TYPE) ) .then(pl.lit(remaining_n_new_workers)) .otherwise(pl.col("nNewWorkers")) @@ -616,6 +623,13 @@ def _normalize_job_type_any(queue_dict): continue else: n_new_workers_def = None + if pilot_type != DEFAULT_PILOT_TYPE: + n_new_workers_def = tmp_val["nNewWorkers"] + if n_new_workers_def == 0: + dyn_num_workers[queue_name][job_type][resource_type][pilot_type]["nNewWorkers"] = 0 + ret_msg = f"got n_new_workers=0 for non-{DEFAULT_PILOT_TYPE} pilot_type in self mode; skipped" + tmp_log.debug(ret_msg) + continue # define num of new workers based on static site config n_new_workers = 0 diff --git a/pandaharvester/harvesterworkermaker/simple_worker_maker.py b/pandaharvester/harvesterworkermaker/simple_worker_maker.py index 6d636ab9..929842c6 100644 --- a/pandaharvester/harvesterworkermaker/simple_worker_maker.py +++ b/pandaharvester/harvesterworkermaker/simple_worker_maker.py @@ -162,7 +162,7 @@ def make_worker(self, jobspec_list, queue_config, 
job_type, resource_type, prod_ else: # when no job tmp_prod_source_label = prod_source_label - if tmp_prod_source_label != "ANY": + if tmp_prod_source_label == "ANY": # no specified prod_source_label; randomize pilot type with weighting pdpm = getattr(queue_config, "prodSourceLabelRandomWeightsPermille", {}) choice_list = core_utils.make_choice_list(pdpm=pdpm, default="managed") From 4a642ac74c9cc2aca1f198b88a1eb82bddf059ff Mon Sep 17 00:00:00 2001 From: mightqxc Date: Fri, 27 Mar 2026 09:45:04 +0100 Subject: [PATCH 18/38] fix on get_queues_to_submit stats --- pandaharvester/commit_timestamp.py | 2 +- pandaharvester/harvesterbody/worker_adjuster.py | 2 +- pandaharvester/harvestercore/db_proxy.py | 17 +++++++++++------ 3 files changed, 13 insertions(+), 8 deletions(-) diff --git a/pandaharvester/commit_timestamp.py b/pandaharvester/commit_timestamp.py index e74fba4a..3619502f 100644 --- a/pandaharvester/commit_timestamp.py +++ b/pandaharvester/commit_timestamp.py @@ -1 +1 @@ -timestamp = "27-03-2026 08:15:34 on flin (by mightqxc)" +timestamp = "27-03-2026 08:45:06 on flin (by mightqxc)" diff --git a/pandaharvester/harvesterbody/worker_adjuster.py b/pandaharvester/harvesterbody/worker_adjuster.py index 135a207c..8174490b 100644 --- a/pandaharvester/harvesterbody/worker_adjuster.py +++ b/pandaharvester/harvesterbody/worker_adjuster.py @@ -413,7 +413,7 @@ def _normalize_job_type_any(queue_dict): "job_type", pl.when(pl.col("resource_type") == "ANY").then(1).otherwise(0), "resource_type", - pl.when(pl.col("pilot_type") == "ANY").then(1).otherwise(0), + pl.when(pl.col("pilot_type") == "ANY").then(2).when(pl.col("pilot_type") == DEFAULT_PILOT_TYPE).then(0).otherwise(1), "pilot_type", ] ) diff --git a/pandaharvester/harvestercore/db_proxy.py b/pandaharvester/harvestercore/db_proxy.py index d5008e15..f3e9ed4d 100644 --- a/pandaharvester/harvestercore/db_proxy.py +++ b/pandaharvester/harvestercore/db_proxy.py @@ -1779,12 +1779,17 @@ def get_queues_to_submit(self, 
lookup_interval, lock_interval, locked_by, queue_ retMap.setdefault(queueName, {}) retMap[queueName].setdefault(jobType, {}) retMap[queueName][jobType].setdefault(resourceType, {}) - retMap[queueName][jobType][resourceType][pilotType] = { - "nReady": nReady, - "nRunning": nRunning, - "nQueue": nQueue, - "nNewWorkers": 0, - } + # Initialize or update pilot type entry + if pilotType not in retMap[queueName][jobType][resourceType]: + retMap[queueName][jobType][resourceType][pilotType] = { + "nReady": 0, + "nRunning": 0, + "nQueue": 0, + "nNewWorkers": 0, + } + retMap[queueName][jobType][resourceType][pilotType]["nReady"] += nReady + retMap[queueName][jobType][resourceType][pilotType]["nRunning"] += nRunning + retMap[queueName][jobType][resourceType][pilotType]["nQueue"] += nQueue # ANY pilotType retMap[queueName][jobType][resourceType].setdefault("ANY", {"nReady": 0, "nRunning": 0, "nQueue": 0, "nNewWorkers": 0}) retMap[queueName][jobType][resourceType]["ANY"]["nReady"] += nReady From 406467cc797b05b7437f2322f524ab6fac81cacf Mon Sep 17 00:00:00 2001 From: mightqxc Date: Fri, 27 Mar 2026 10:21:05 +0100 Subject: [PATCH 19/38] revert apfmon; pretty --- pandaharvester/commit_timestamp.py | 2 +- .../harvesterbody/worker_adjuster.py | 5 ++- pandaharvester/harvestermisc/apfmon.py | 45 +++++-------------- 3 files changed, 15 insertions(+), 37 deletions(-) diff --git a/pandaharvester/commit_timestamp.py b/pandaharvester/commit_timestamp.py index 3619502f..11d4ff2d 100644 --- a/pandaharvester/commit_timestamp.py +++ b/pandaharvester/commit_timestamp.py @@ -1 +1 @@ -timestamp = "27-03-2026 08:45:06 on flin (by mightqxc)" +timestamp = "27-03-2026 09:21:06 on flin (by mightqxc)" diff --git a/pandaharvester/harvesterbody/worker_adjuster.py b/pandaharvester/harvesterbody/worker_adjuster.py index 8174490b..697b571d 100644 --- a/pandaharvester/harvesterbody/worker_adjuster.py +++ b/pandaharvester/harvesterbody/worker_adjuster.py @@ -525,7 +525,10 @@ def 
_normalize_job_type_any(queue_dict): .otherwise(pl.col("nNewWorkers")) .alias("nNewWorkers") ) - tmp_log.debug(f"master_df: \n{master_df}") + display_master_df = master_df.select( + ["job_type", "resource_type", "pilot_type", "nQueue", "nReady", "nRunning", "nNewWorkers", "n_activated_jobs"] + ) + tmp_log.debug(f"master_df: \n{display_master_df}") # remove pilot type ANY for job_type in static_num_workers[queue_name]: for resource_type, pilot_type_dict in static_num_workers[queue_name][job_type].items(): diff --git a/pandaharvester/harvestermisc/apfmon.py b/pandaharvester/harvestermisc/apfmon.py index a607220f..d73325f1 100644 --- a/pandaharvester/harvestermisc/apfmon.py +++ b/pandaharvester/harvestermisc/apfmon.py @@ -164,45 +164,20 @@ def massage_label_data(self, data): return data try: - # First aggregate over resource_type, then over prod_source_label - # Data structure: {resource_type: {prod_source_label: {values}}, "ANY": {...}} - - # Extract the "ANY" resource_type if it exists - any_data = data.get("ANY", {}) - - # Aggregate across all resource_types for each prod_source_label - agg_by_pslabel = {} + any = data["ANY"] + agg = {} for rtype in data: if rtype == "ANY": continue - # data[rtype] is like {prod_source_label: {values}} - for prod_source_label in data[rtype]: - if prod_source_label == "ANY": - continue - # Aggregate values across all resource_types for this prod_source_label - if prod_source_label not in agg_by_pslabel: - agg_by_pslabel[prod_source_label] = {} - for value_key, value_count in data[rtype][prod_source_label].items(): - agg_by_pslabel[prod_source_label].setdefault(value_key, 0) - agg_by_pslabel[prod_source_label][value_key] += value_count - - # Now aggregate across all prod_source_labels to create final "ANY" - final_agg = {} - for prod_source_label in agg_by_pslabel: - for value_key, value_count in agg_by_pslabel[prod_source_label].items(): - final_agg.setdefault(value_key, 0) - final_agg[value_key] += value_count - - # Update data 
structure: keep resource_type level but aggregate to "ANY" - if final_agg: - # Rebuild data with aggregated "ANY" at the resource_type level - result = {} - for rtype in data: - result[rtype] = data[rtype] - result["ANY"] = final_agg - data = result + else: + for value in data[rtype]: + agg.setdefault(value, 0) + agg[value] += data[rtype][value] + + if agg: + data["ANY"] = agg else: - data["ANY"] = any_data + data["ANY"] = any tmp_log.debug(f"Massaged to data: {data}") From f3ea7d3abf4300167ff7b14a27249b38f6fe2fa6 Mon Sep 17 00:00:00 2001 From: mightqxc Date: Fri, 27 Mar 2026 10:27:23 +0100 Subject: [PATCH 20/38] update apfmon accordingly --- pandaharvester/commit_timestamp.py | 2 +- pandaharvester/harvestermisc/apfmon.py | 44 ++++++++++++++++++++------ 2 files changed, 35 insertions(+), 11 deletions(-) diff --git a/pandaharvester/commit_timestamp.py b/pandaharvester/commit_timestamp.py index 11d4ff2d..62ad6e29 100644 --- a/pandaharvester/commit_timestamp.py +++ b/pandaharvester/commit_timestamp.py @@ -1 +1 @@ -timestamp = "27-03-2026 09:21:06 on flin (by mightqxc)" +timestamp = "27-03-2026 09:27:24 on flin (by mightqxc)" diff --git a/pandaharvester/harvestermisc/apfmon.py b/pandaharvester/harvestermisc/apfmon.py index d73325f1..2a8a35c8 100644 --- a/pandaharvester/harvestermisc/apfmon.py +++ b/pandaharvester/harvestermisc/apfmon.py @@ -164,20 +164,44 @@ def massage_label_data(self, data): return data try: - any = data["ANY"] + # Preserve original ANY if it exists + any_backup = data.get("ANY", {}) agg = {} - for rtype in data: - if rtype == "ANY": + + # Iterate through job_type, resource_type, pilot_type dimensions + # and aggregate all metrics for non-ANY combinations + for job_type in data: + if job_type == "ANY": continue - else: - for value in data[rtype]: - agg.setdefault(value, 0) - agg[value] += data[rtype][value] + for resource_type in data[job_type]: + if resource_type == "ANY": + continue + + for pilot_type in data[job_type][resource_type]: + if 
pilot_type == "ANY": + continue + + # Get all metrics for this combination + metrics = data[job_type][resource_type][pilot_type] + for metric_key, metric_value in metrics.items(): + if isinstance(metric_value, (int, float)): + agg.setdefault(metric_key, 0) + agg[metric_key] += metric_value + + # Update the ANY entry with aggregated values if agg: - data["ANY"] = agg - else: - data["ANY"] = any + # Initialize ANY structure if needed + if "ANY" not in data: + data["ANY"] = {} + if "ANY" not in data["ANY"]: + data["ANY"]["ANY"] = {} + if "ANY" not in data["ANY"]["ANY"]: + data["ANY"]["ANY"]["ANY"] = {} + + data["ANY"]["ANY"]["ANY"].update(agg) + elif any_backup: + data["ANY"] = any_backup tmp_log.debug(f"Massaged to data: {data}") From 7de58948e2345a76eba5381f1ff9338b3bb5b828 Mon Sep 17 00:00:00 2001 From: mightqxc Date: Mon, 30 Mar 2026 16:07:31 +0200 Subject: [PATCH 21/38] submitter: fixing nNewworkers from commands --- pandaharvester/commit_timestamp.py | 2 +- pandaharvester/harvesterbody/submitter.py | 4 ++-- pandaharvester/harvesterbody/worker_adjuster.py | 6 ++++++ 3 files changed, 9 insertions(+), 3 deletions(-) diff --git a/pandaharvester/commit_timestamp.py b/pandaharvester/commit_timestamp.py index 62ad6e29..e91e1b3e 100644 --- a/pandaharvester/commit_timestamp.py +++ b/pandaharvester/commit_timestamp.py @@ -1 +1 @@ -timestamp = "27-03-2026 09:27:24 on flin (by mightqxc)" +timestamp = "30-03-2026 14:07:32 on flin (by mightqxc)" diff --git a/pandaharvester/harvesterbody/submitter.py b/pandaharvester/harvesterbody/submitter.py index e1f05ea7..d538e887 100644 --- a/pandaharvester/harvesterbody/submitter.py +++ b/pandaharvester/harvesterbody/submitter.py @@ -69,8 +69,8 @@ def run(self): if tmp_resource_type in res_map[DEFAULT_JOB_TYPE]: tmp_queue_name = res_map[DEFAULT_JOB_TYPE][tmp_resource_type] if tmp_queue_name in current_workers: - for tmp_pilot_type in current_workers[tmp_queue_name][DEFAULT_JOB_TYPE][tmp_resource_type]: - 
current_workers[tmp_queue_name][DEFAULT_JOB_TYPE][tmp_resource_type][tmp_pilot_type]["nNewWorkers"] = tmp_new_val + # pilot_type "ANY" to collect all nNewWorkers from the command + current_workers[tmp_queue_name][DEFAULT_JOB_TYPE][tmp_resource_type]["ANY"]["nNewWorkers"] = tmp_new_val # define number of new workers if len(current_workers) == 0: diff --git a/pandaharvester/harvesterbody/worker_adjuster.py b/pandaharvester/harvesterbody/worker_adjuster.py index 697b571d..0e7e78fa 100644 --- a/pandaharvester/harvesterbody/worker_adjuster.py +++ b/pandaharvester/harvesterbody/worker_adjuster.py @@ -510,10 +510,16 @@ def _normalize_job_type_any(queue_dict): ) if remaining_n_new_workers > 0: # add remaining n_new_workers to DEFAULT_PILOT_TYPE PR + tmp_static_num_workers[queue_name][job_type][resource_type].setdefault( + DEFAULT_PILOT_TYPE, {"nReady": 0, "nRunning": 0, "nQueue": 0, "nNewWorkers": 0} + ) tmp_static_num_workers[queue_name][job_type][resource_type][DEFAULT_PILOT_TYPE]["nNewWorkers"] += remaining_n_new_workers static_num_workers[queue_name].setdefault(job_type, {}).setdefault(resource_type, {}).setdefault( DEFAULT_PILOT_TYPE, {"nReady": 0, "nRunning": 0, "nQueue": 0, "nNewWorkers": 0} )["nNewWorkers"] = tmp_static_num_workers[queue_name][job_type][resource_type][DEFAULT_PILOT_TYPE]["nNewWorkers"] + tmp_log.debug( + f"Set remaining nNewWorkers to {remaining_n_new_workers} for queue={queue_name} job_type={job_type} resource_type={resource_type} pilot_type={DEFAULT_PILOT_TYPE}" + ) master_df = master_df.with_columns( pl.when( (pl.col("queue_name") == queue_name) From f329b44e06cd64c9b24cdca7f7606947514a8873 Mon Sep 17 00:00:00 2001 From: mightqxc Date: Mon, 30 Mar 2026 16:15:39 +0200 Subject: [PATCH 22/38] log pretty --- pandaharvester/commit_timestamp.py | 2 +- pandaharvester/harvesterbody/submitter.py | 2 +- pandaharvester/harvesterbody/worker_adjuster.py | 1 + 3 files changed, 3 insertions(+), 2 deletions(-) diff --git a/pandaharvester/commit_timestamp.py 
b/pandaharvester/commit_timestamp.py index e91e1b3e..09543ff3 100644 --- a/pandaharvester/commit_timestamp.py +++ b/pandaharvester/commit_timestamp.py @@ -1 +1 @@ -timestamp = "30-03-2026 14:07:32 on flin (by mightqxc)" +timestamp = "30-03-2026 14:15:40 on flin (by mightqxc)" diff --git a/pandaharvester/harvesterbody/submitter.py b/pandaharvester/harvesterbody/submitter.py index d538e887..60453cc5 100644 --- a/pandaharvester/harvesterbody/submitter.py +++ b/pandaharvester/harvesterbody/submitter.py @@ -96,7 +96,7 @@ def run(self): prod_source_label = core_utils.special_pilot_type_to_prod_source_label(pilot_type) tmp_log = self.make_logger( _logger, - f"id={locked_by} queue={queue_name} jtype={job_type} rtype={resource_type} ptype={pilot_type} pslabel={prod_source_label}", + f"id={locked_by} queue={queue_name} jtype={job_type} rtype={resource_type} ptype={pilot_type}", method_name="run", ) try: diff --git a/pandaharvester/harvesterbody/worker_adjuster.py b/pandaharvester/harvesterbody/worker_adjuster.py index 0e7e78fa..1b4a0e5f 100644 --- a/pandaharvester/harvesterbody/worker_adjuster.py +++ b/pandaharvester/harvesterbody/worker_adjuster.py @@ -410,6 +410,7 @@ def _normalize_job_type_any(queue_dict): .sort( [ "queue_name", + pl.when(pl.col("job_type") == "ANY").then(1).otherwise(0), "job_type", pl.when(pl.col("resource_type") == "ANY").then(1).otherwise(0), "resource_type", From a14c2381f007627517efb6ceea0a49ec638dddc1 Mon Sep 17 00:00:00 2001 From: mightqxc Date: Mon, 30 Mar 2026 16:52:10 +0200 Subject: [PATCH 23/38] db_proxy get_queues_to_submit: fix about ANY pilotType --- pandaharvester/commit_timestamp.py | 2 +- pandaharvester/harvestercore/db_proxy.py | 9 ++++++--- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/pandaharvester/commit_timestamp.py b/pandaharvester/commit_timestamp.py index 09543ff3..3c733dfd 100644 --- a/pandaharvester/commit_timestamp.py +++ b/pandaharvester/commit_timestamp.py @@ -1 +1 @@ -timestamp = "30-03-2026 14:15:40 on 
flin (by mightqxc)" +timestamp = "30-03-2026 14:52:11 on flin (by mightqxc)" diff --git a/pandaharvester/harvestercore/db_proxy.py b/pandaharvester/harvestercore/db_proxy.py index f3e9ed4d..45cba59d 100644 --- a/pandaharvester/harvestercore/db_proxy.py +++ b/pandaharvester/harvestercore/db_proxy.py @@ -1750,6 +1750,11 @@ def get_queues_to_submit(self, lookup_interval, lock_interval, locked_by, queue_ sql_count_workers_tmp += "GROUP BY pilotType, status " self.execute(sql_count_workers_tmp, varMap) + # Initialize nested dict structure before the loop + retMap.setdefault(queueName, {}) + retMap[queueName].setdefault(jobType, {}) + retMap[queueName][jobType].setdefault(resourceType, {}) + for pilotType, workerStatus, tmpNum in self.cur.fetchall(): nQueue = 0 nReady = 0 @@ -1776,9 +1781,6 @@ def get_queues_to_submit(self, lookup_interval, lock_interval, locked_by, queue_ (nReFill,) = self.cur.fetchone() nReady += nReFill - retMap.setdefault(queueName, {}) - retMap[queueName].setdefault(jobType, {}) - retMap[queueName][jobType].setdefault(resourceType, {}) # Initialize or update pilot type entry if pilotType not in retMap[queueName][jobType][resourceType]: retMap[queueName][jobType][resourceType][pilotType] = { @@ -1797,6 +1799,7 @@ def get_queues_to_submit(self, lookup_interval, lock_interval, locked_by, queue_ retMap[queueName][jobType][resourceType]["ANY"]["nQueue"] += nQueue # set nNewWorkers only in ANY pilotType + retMap[queueName][jobType][resourceType].setdefault("ANY", {"nReady": 0, "nRunning": 0, "nQueue": 0, "nNewWorkers": 0}) retMap[queueName][jobType][resourceType]["ANY"]["nNewWorkers"] = nNewWorkers resourceMap.setdefault(jobType, {}) From 851992f7a91421974c4bb38a22053e0ba49c41b0 Mon Sep 17 00:00:00 2001 From: mightqxc Date: Mon, 30 Mar 2026 17:00:25 +0200 Subject: [PATCH 24/38] fix --- pandaharvester/commit_timestamp.py | 2 +- pandaharvester/harvestercore/db_proxy.py | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git 
a/pandaharvester/commit_timestamp.py b/pandaharvester/commit_timestamp.py index 3c733dfd..53cc006f 100644 --- a/pandaharvester/commit_timestamp.py +++ b/pandaharvester/commit_timestamp.py @@ -1 +1 @@ -timestamp = "30-03-2026 14:52:11 on flin (by mightqxc)" +timestamp = "30-03-2026 15:00:26 on flin (by mightqxc)" diff --git a/pandaharvester/harvestercore/db_proxy.py b/pandaharvester/harvestercore/db_proxy.py index 45cba59d..becbb464 100644 --- a/pandaharvester/harvestercore/db_proxy.py +++ b/pandaharvester/harvestercore/db_proxy.py @@ -1754,6 +1754,7 @@ def get_queues_to_submit(self, lookup_interval, lock_interval, locked_by, queue_ retMap.setdefault(queueName, {}) retMap[queueName].setdefault(jobType, {}) retMap[queueName][jobType].setdefault(resourceType, {}) + retMap[queueName][jobType][resourceType].setdefault("ANY", {"nReady": 0, "nRunning": 0, "nQueue": 0, "nNewWorkers": 0}) for pilotType, workerStatus, tmpNum in self.cur.fetchall(): nQueue = 0 @@ -1793,13 +1794,11 @@ def get_queues_to_submit(self, lookup_interval, lock_interval, locked_by, queue_ retMap[queueName][jobType][resourceType][pilotType]["nRunning"] += nRunning retMap[queueName][jobType][resourceType][pilotType]["nQueue"] += nQueue # ANY pilotType - retMap[queueName][jobType][resourceType].setdefault("ANY", {"nReady": 0, "nRunning": 0, "nQueue": 0, "nNewWorkers": 0}) retMap[queueName][jobType][resourceType]["ANY"]["nReady"] += nReady retMap[queueName][jobType][resourceType]["ANY"]["nRunning"] += nRunning retMap[queueName][jobType][resourceType]["ANY"]["nQueue"] += nQueue # set nNewWorkers only in ANY pilotType - retMap[queueName][jobType][resourceType].setdefault("ANY", {"nReady": 0, "nRunning": 0, "nQueue": 0, "nNewWorkers": 0}) retMap[queueName][jobType][resourceType]["ANY"]["nNewWorkers"] = nNewWorkers resourceMap.setdefault(jobType, {}) From a6f9e1657aed2d8c3d6b1ae390770407615767b8 Mon Sep 17 00:00:00 2001 From: mightqxc Date: Tue, 31 Mar 2026 11:08:51 +0200 Subject: [PATCH 25/38] 
worker_adjuster: fix stats table --- pandaharvester/commit_timestamp.py | 2 +- .../harvesterbody/worker_adjuster.py | 22 ++++++++----------- 2 files changed, 10 insertions(+), 14 deletions(-) diff --git a/pandaharvester/commit_timestamp.py b/pandaharvester/commit_timestamp.py index 53cc006f..bfc709b2 100644 --- a/pandaharvester/commit_timestamp.py +++ b/pandaharvester/commit_timestamp.py @@ -1 +1 @@ -timestamp = "30-03-2026 15:00:26 on flin (by mightqxc)" +timestamp = "31-03-2026 09:08:52 on flin (by mightqxc)" diff --git a/pandaharvester/harvesterbody/worker_adjuster.py b/pandaharvester/harvesterbody/worker_adjuster.py index 1b4a0e5f..1effd08c 100644 --- a/pandaharvester/harvesterbody/worker_adjuster.py +++ b/pandaharvester/harvesterbody/worker_adjuster.py @@ -328,11 +328,11 @@ def _normalize_job_type_any(queue_dict): tmp_new_workers_df = ( self._num_workers_dict_to_df(static_num_workers) .filter(pl.col("queue_name") == queue_name) + .filter(pl.col("resource_type").is_not_null()) + .filter(pl.col("pilot_type").is_not_null()) .with_columns( [ pl.col("queue_name").fill_null(pl.lit(queue_name)), - pl.col("resource_type").fill_null(pl.lit("ANY")), - pl.col("pilot_type").fill_null(pl.lit("ANY")), pl.col("nQueue").fill_null(0), pl.col("nReady").fill_null(0), pl.col("nRunning").fill_null(0), @@ -352,28 +352,24 @@ def _normalize_job_type_any(queue_dict): ) .select(["queue_name", "resource_type", "pilot_type", "n_jobs"]) ) - # Add aggregated rows with resource_type="ANY" (sum over all resource_types for each pilot_type) - activated_df_any_rt = ( - activated_df.group_by(["queue_name", "pilot_type"]) - .agg(pl.col("n_jobs").sum()) - .with_columns(pl.lit("ANY").alias("resource_type")) - .select(["queue_name", "resource_type", "pilot_type", "n_jobs"]) - ) # Add aggregated rows with pilot_type="ANY" (sum over all pilot_types for each resource_type) activated_df_any_pt = ( - activated_df.group_by(["queue_name", "resource_type"]) + activated_df.select(["queue_name", 
"resource_type", "n_jobs"]) + .group_by(["queue_name", "resource_type"]) .agg(pl.col("n_jobs").sum()) .with_columns(pl.lit("ANY").alias("pilot_type")) .select(["queue_name", "resource_type", "pilot_type", "n_jobs"]) ) # Add aggregated row with both resource_type="ANY" and pilot_type="ANY" (sum over all) activated_df_any_both = ( - activated_df.select(pl.col("n_jobs").sum()) - .with_columns(pl.lit(queue_name).alias("queue_name"), pl.lit("ANY").alias("resource_type"), pl.lit("ANY").alias("pilot_type")) + activated_df.select(["queue_name", "n_jobs"]) + .group_by(["queue_name"]) + .agg(pl.col("n_jobs").sum()) + .with_columns(pl.lit("ANY").alias("resource_type"), pl.lit("ANY").alias("pilot_type")) .select(["queue_name", "resource_type", "pilot_type", "n_jobs"]) ) - activated_df = pl.concat([activated_df, activated_df_any_rt, activated_df_any_pt, activated_df_any_both]) + activated_df = pl.concat([activated_df, activated_df_any_pt, activated_df_any_both]) # tmp_log.debug(f"DEBUG: activated_df after filter shape: {activated_df.shape}") # tmp_log.debug(f"DEBUG: activated_df columns: {activated_df.columns}") # tmp_log.debug(f"DEBUG: activated_df:\n{activated_df}") From 767d3a20f9be1f51cee3359ee978969e96897d7e Mon Sep 17 00:00:00 2001 From: mightqxc Date: Tue, 31 Mar 2026 11:31:47 +0200 Subject: [PATCH 26/38] worker_adjuster: configurable prioritizedProdSourceLabels --- pandaharvester/commit_timestamp.py | 2 +- .../harvesterbody/worker_adjuster.py | 18 ++++++++++-------- 2 files changed, 11 insertions(+), 9 deletions(-) diff --git a/pandaharvester/commit_timestamp.py b/pandaharvester/commit_timestamp.py index bfc709b2..4a3ae744 100644 --- a/pandaharvester/commit_timestamp.py +++ b/pandaharvester/commit_timestamp.py @@ -1 +1 @@ -timestamp = "31-03-2026 09:08:52 on flin (by mightqxc)" +timestamp = "31-03-2026 09:31:48 on flin (by mightqxc)" diff --git a/pandaharvester/harvesterbody/worker_adjuster.py b/pandaharvester/harvesterbody/worker_adjuster.py index 1effd08c..acf872a4 
100644 --- a/pandaharvester/harvesterbody/worker_adjuster.py +++ b/pandaharvester/harvesterbody/worker_adjuster.py @@ -17,6 +17,7 @@ DEFAULT_JOB_TYPE = "managed" DEFAULT_PILOT_TYPE = "PR" +DEFAULT_PRIORITIZED_PROD_SOURCE_LABELS = ["rc_alrb"] # polars config pl.Config.set_ascii_tables(True) @@ -314,10 +315,6 @@ def _normalize_job_type_any(queue_dict): if job_stats_new is not None: job_stats_new_df = self._job_stats_to_df(job_stats_new.data) - # prioritized prod_source_labels for pilot submission - PRIORITIZED_PROD_SOURCE_LABELS = ["rc_alrb"] - PRIORITIZED_PILOT_TYPES = [core_utils.prod_source_label_to_pilot_type(label) for label in PRIORITIZED_PROD_SOURCE_LABELS] - # get panda queues dict from CRIC panda_queues_dict = PandaQueuesDict() @@ -325,6 +322,13 @@ def _normalize_job_type_any(queue_dict): rt_mapper = ResourceTypeMapper() for queue_name in static_num_workers: + queue_config = self.queue_configMapper.get_queue(queue_name) + queue_dict = panda_queues_dict.get(queue_name, {}) + + # prioritized prod_source_labels for pilot submission + prioritized_pslabels = queue_config.get("prioritizedProdSourceLabels", DEFAULT_PRIORITIZED_PROD_SOURCE_LABELS) + prioritized_pilot_types = [core_utils.prod_source_label_to_pilot_type(label) for label in prioritized_pslabels] + tmp_new_workers_df = ( self._num_workers_dict_to_df(static_num_workers) .filter(pl.col("queue_name") == queue_name) @@ -445,8 +449,6 @@ def _normalize_job_type_any(queue_dict): } ) - queue_config = self.queue_configMapper.get_queue(queue_name) - queue_dict = panda_queues_dict.get(queue_name, {}) # set initial nNewWorkers for pilot types based on number of activated jobs and the activate worker factor for job_type in tmp_static_num_workers[queue_name]: for resource_type, pilot_type_dict in tmp_static_num_workers[queue_name][job_type].items(): @@ -460,7 +462,7 @@ def _normalize_job_type_any(queue_dict): (pl.col("queue_name") == queue_name) & (pl.col("job_type") == job_type) & (pl.col("resource_type") == 
resource_type) - & (pl.col("pilot_type").is_in(PRIORITIZED_PILOT_TYPES)) + & (pl.col("pilot_type").is_in(prioritized_pilot_types)) ).select([pl.col("n_activated_jobs").sum(), pl.col("nQueue").sum()]) if prio_ptype_result.shape[0] > 0: total_prio_ptype_n_activated_jobs, total_prio_ptype_nQueue = prio_ptype_result.row(0) @@ -472,7 +474,7 @@ def _normalize_job_type_any(queue_dict): if total_prio_ptype_calculated_n_new_workers > 0: adjust_ratio = min(total_n_new_workers / total_prio_ptype_calculated_n_new_workers, 1) for pilot_type, tmp_val in pilot_type_dict.items(): - if pilot_type in PRIORITIZED_PILOT_TYPES: + if pilot_type in prioritized_pilot_types: pt_result = tmp_master_df.filter( (pl.col("queue_name") == queue_name) & (pl.col("job_type") == job_type) From 3e069d0cd99df78c31f9e92e4c76ff1454f5d240 Mon Sep 17 00:00:00 2001 From: mightqxc Date: Tue, 31 Mar 2026 12:30:56 +0200 Subject: [PATCH 27/38] fix --- pandaharvester/commit_timestamp.py | 2 +- pandaharvester/harvesterbody/worker_adjuster.py | 7 ++++--- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/pandaharvester/commit_timestamp.py b/pandaharvester/commit_timestamp.py index 4a3ae744..101ad756 100644 --- a/pandaharvester/commit_timestamp.py +++ b/pandaharvester/commit_timestamp.py @@ -1 +1 @@ -timestamp = "31-03-2026 09:31:48 on flin (by mightqxc)" +timestamp = "31-03-2026 10:30:57 on flin (by mightqxc)" diff --git a/pandaharvester/harvesterbody/worker_adjuster.py b/pandaharvester/harvesterbody/worker_adjuster.py index acf872a4..d213cd04 100644 --- a/pandaharvester/harvesterbody/worker_adjuster.py +++ b/pandaharvester/harvesterbody/worker_adjuster.py @@ -326,7 +326,7 @@ def _normalize_job_type_any(queue_dict): queue_dict = panda_queues_dict.get(queue_name, {}) # prioritized prod_source_labels for pilot submission - prioritized_pslabels = queue_config.get("prioritizedProdSourceLabels", DEFAULT_PRIORITIZED_PROD_SOURCE_LABELS) + prioritized_pslabels = getattr(queue_config, 
"prioritizedProdSourceLabels", DEFAULT_PRIORITIZED_PROD_SOURCE_LABELS) prioritized_pilot_types = [core_utils.prod_source_label_to_pilot_type(label) for label in prioritized_pslabels] tmp_new_workers_df = ( @@ -385,13 +385,14 @@ def _normalize_job_type_any(queue_dict): ).with_columns( [ pl.col("queue_name").fill_null(pl.lit(queue_name)), + pl.col("job_type").fill_null(DEFAULT_JOB_TYPE), pl.col("resource_type").fill_null(pl.lit("ANY")), pl.col("pilot_type").fill_null(pl.lit("ANY")), pl.col("nQueue").fill_null(0), pl.col("nReady").fill_null(0), pl.col("nRunning").fill_null(0), pl.col("nNewWorkers").fill_null(0), - pl.col("job_type").fill_null(DEFAULT_JOB_TYPE), + pl.col("n_jobs").fill_null(0), ] ) # tmp_log.debug(f"DEBUG: joined_df shape: {joined_df.shape}") @@ -405,7 +406,7 @@ def _normalize_job_type_any(queue_dict): pl.col("nReady").max(), pl.col("nRunning").max(), pl.col("nNewWorkers").max(), - pl.col("n_jobs").fill_null(0).sum().alias("n_activated_jobs"), + pl.col("n_jobs").sum().alias("n_activated_jobs"), ) .sort( [ From c37466b26d0a53be6f60b7a7a33c4bca41bd71b8 Mon Sep 17 00:00:00 2001 From: mightqxc Date: Tue, 31 Mar 2026 12:31:08 +0200 Subject: [PATCH 28/38] fix --- pandaharvester/commit_timestamp.py | 2 +- .../harvesterbody/worker_adjuster.py | 41 +++++++++++-------- 2 files changed, 26 insertions(+), 17 deletions(-) diff --git a/pandaharvester/commit_timestamp.py b/pandaharvester/commit_timestamp.py index 101ad756..2ca718d7 100644 --- a/pandaharvester/commit_timestamp.py +++ b/pandaharvester/commit_timestamp.py @@ -1 +1 @@ -timestamp = "31-03-2026 10:30:57 on flin (by mightqxc)" +timestamp = "31-03-2026 10:31:09 on flin (by mightqxc)" diff --git a/pandaharvester/harvesterbody/worker_adjuster.py b/pandaharvester/harvesterbody/worker_adjuster.py index d213cd04..8b39a320 100644 --- a/pandaharvester/harvesterbody/worker_adjuster.py +++ b/pandaharvester/harvesterbody/worker_adjuster.py @@ -378,22 +378,31 @@ def _normalize_job_type_any(queue_dict): # 
tmp_log.debug(f"DEBUG: activated_df columns: {activated_df.columns}") # tmp_log.debug(f"DEBUG: activated_df:\n{activated_df}") - joined_df = activated_df.join( - tmp_new_workers_df, - on=["queue_name", "resource_type", "pilot_type"], - how="full", - ).with_columns( - [ - pl.col("queue_name").fill_null(pl.lit(queue_name)), - pl.col("job_type").fill_null(DEFAULT_JOB_TYPE), - pl.col("resource_type").fill_null(pl.lit("ANY")), - pl.col("pilot_type").fill_null(pl.lit("ANY")), - pl.col("nQueue").fill_null(0), - pl.col("nReady").fill_null(0), - pl.col("nRunning").fill_null(0), - pl.col("nNewWorkers").fill_null(0), - pl.col("n_jobs").fill_null(0), - ] + joined_df = ( + activated_df.join( + tmp_new_workers_df, + on=["queue_name", "resource_type", "pilot_type"], + how="full", + suffix="_right", + ) + .with_columns( + [ + pl.col("queue_name").fill_null(pl.lit(queue_name)), + pl.col("job_type").fill_null(DEFAULT_JOB_TYPE), + # Use coalesce to prefer left side if not null, otherwise use right side + pl.coalesce(pl.col("resource_type"), pl.col("resource_type_right")).fill_null(pl.lit("ANY")).alias("resource_type"), + pl.coalesce(pl.col("pilot_type"), pl.col("pilot_type_right")).fill_null(pl.lit("ANY")).alias("pilot_type"), + pl.col("nQueue").fill_null(0), + pl.col("nReady").fill_null(0), + pl.col("nRunning").fill_null(0), + pl.col("nNewWorkers").fill_null(0), + pl.col("n_jobs").fill_null(0), + ] + ) + .select( + # Drop the temporary *_right columns after coalesce + pl.all().exclude(["resource_type_right", "pilot_type_right"]) + ) ) # tmp_log.debug(f"DEBUG: joined_df shape: {joined_df.shape}") # tmp_log.debug(f"DEBUG: joined_df columns: {joined_df.columns}") From 78d41cec53cb1f01d2adae14d59bce403f20b87b Mon Sep 17 00:00:00 2001 From: mightqxc Date: Tue, 31 Mar 2026 16:28:06 +0200 Subject: [PATCH 29/38] worker_adjuster: sort considering pilot_type --- pandaharvester/commit_timestamp.py | 2 +- pandaharvester/harvesterbody/worker_adjuster.py | 8 ++++++-- 2 files changed, 7 
insertions(+), 3 deletions(-) diff --git a/pandaharvester/commit_timestamp.py b/pandaharvester/commit_timestamp.py index 2ca718d7..570d25e1 100644 --- a/pandaharvester/commit_timestamp.py +++ b/pandaharvester/commit_timestamp.py @@ -1 +1 @@ -timestamp = "31-03-2026 10:31:09 on flin (by mightqxc)" +timestamp = "31-03-2026 14:28:07 on flin (by mightqxc)" diff --git a/pandaharvester/harvesterbody/worker_adjuster.py b/pandaharvester/harvesterbody/worker_adjuster.py index 8b39a320..51968e3a 100644 --- a/pandaharvester/harvesterbody/worker_adjuster.py +++ b/pandaharvester/harvesterbody/worker_adjuster.py @@ -568,6 +568,10 @@ def _normalize_job_type_any(queue_dict): worker_limits_dict, worker_stats_map = self.dbProxy.get_worker_limits(queue_name, queue_config) else: tmp_log.warning("missing queue_config") + # prioritized prod_source_labels for pilot submission + prioritized_pslabels = getattr(queue_config, "prioritizedProdSourceLabels", DEFAULT_PRIORITIZED_PROD_SOURCE_LABELS) + prioritized_pilot_types = [core_utils.prod_source_label_to_pilot_type(label) for label in prioritized_pslabels] + # get limits from queue config max_workers = worker_limits_dict.get("maxWorkers", 0) n_queue_limit = worker_limits_dict.get("nQueueLimitWorker", 0) n_queue_limit_per_rt = n_queue_limit @@ -796,8 +800,8 @@ def _normalize_job_type_any(queue_dict): dyn_num_workers[queue_name][job_type][resource_type][pilot_type]["nNewWorkers"] = n_new_workers _rt_list[2] = remainder _countdown -= n_new_workers - _s_list = sorted(simple_rt_nw_list, key=(lambda x: x[1])) - sorted_rt_nw_list = sorted(_s_list, key=(lambda x: x[2]), reverse=True) + # sort by pilot_type (favor prioritized), then by remainder (descending), then by original n_new_workers (favor smaller ones) + sorted_rt_nw_list = sorted(simple_rt_nw_list, key=(lambda x: (x[0][2] not in prioritized_pilot_types, -x[2], x[1]))) for (resource_type, job_type, pilot_type), n_new_workers_orig, remainder in sorted_rt_nw_list: if _countdown <= 0: break 
From 8e60ca4ea7bc1ca04a441406f0ecef9081e91723 Mon Sep 17 00:00:00 2001 From: mightqxc Date: Wed, 1 Apr 2026 10:51:24 +0200 Subject: [PATCH 30/38] worker_adjuster: fix sorted --- pandaharvester/commit_timestamp.py | 2 +- pandaharvester/harvesterbody/worker_adjuster.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pandaharvester/commit_timestamp.py b/pandaharvester/commit_timestamp.py index 570d25e1..071272ac 100644 --- a/pandaharvester/commit_timestamp.py +++ b/pandaharvester/commit_timestamp.py @@ -1 +1 @@ -timestamp = "31-03-2026 14:28:07 on flin (by mightqxc)" +timestamp = "01-04-2026 08:51:25 on flin (by mightqxc)" diff --git a/pandaharvester/harvesterbody/worker_adjuster.py b/pandaharvester/harvesterbody/worker_adjuster.py index 51968e3a..f556a4e4 100644 --- a/pandaharvester/harvesterbody/worker_adjuster.py +++ b/pandaharvester/harvesterbody/worker_adjuster.py @@ -800,8 +800,8 @@ def _normalize_job_type_any(queue_dict): dyn_num_workers[queue_name][job_type][resource_type][pilot_type]["nNewWorkers"] = n_new_workers _rt_list[2] = remainder _countdown -= n_new_workers - # sort by pilot_type (favor prioritized), then by remainder (descending), then by original n_new_workers (favor smaller ones) - sorted_rt_nw_list = sorted(simple_rt_nw_list, key=(lambda x: (x[0][2] not in prioritized_pilot_types, -x[2], x[1]))) + # sort by whether n_new_workers_orig > 0 (favor positive over 0), then by pilot_type (favor prioritized), then by remainder (descending), then by original n_new_workers (favor smaller ones) + sorted_rt_nw_list = sorted(simple_rt_nw_list, key=(lambda x: (not (x[1] > 0), x[0][2] not in prioritized_pilot_types, -x[2], x[1]))) for (resource_type, job_type, pilot_type), n_new_workers_orig, remainder in sorted_rt_nw_list: if _countdown <= 0: break From 4cb5083227e8f517fd2c476fb3063d961605fe95 Mon Sep 17 00:00:00 2001 From: mightqxc Date: Wed, 1 Apr 2026 17:47:47 +0200 Subject: [PATCH 31/38] version up for dependency --- 
pandaharvester/commit_timestamp.py | 2 +- pandaharvester/panda_pkg_info.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pandaharvester/commit_timestamp.py b/pandaharvester/commit_timestamp.py index 071272ac..9861d1d2 100644 --- a/pandaharvester/commit_timestamp.py +++ b/pandaharvester/commit_timestamp.py @@ -1 +1 @@ -timestamp = "01-04-2026 08:51:25 on flin (by mightqxc)" +timestamp = "01-04-2026 15:47:48 on flin (by mightqxc)" diff --git a/pandaharvester/panda_pkg_info.py b/pandaharvester/panda_pkg_info.py index 8ad31d59..504f8628 100644 --- a/pandaharvester/panda_pkg_info.py +++ b/pandaharvester/panda_pkg_info.py @@ -1 +1 @@ -release_version = "0.7.3" +release_version = "0.7.4" From 4e7c8ab7e69f276b2ffb9ec161f126282424ca16 Mon Sep 17 00:00:00 2001 From: mightqxc Date: Thu, 2 Apr 2026 09:28:42 +0200 Subject: [PATCH 32/38] pretty --- pandaharvester/commit_timestamp.py | 2 +- pandaharvester/harvesterbody/worker_adjuster.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pandaharvester/commit_timestamp.py b/pandaharvester/commit_timestamp.py index 9861d1d2..72af4506 100644 --- a/pandaharvester/commit_timestamp.py +++ b/pandaharvester/commit_timestamp.py @@ -1 +1 @@ -timestamp = "01-04-2026 15:47:48 on flin (by mightqxc)" +timestamp = "02-04-2026 07:28:43 on flin (by mightqxc)" diff --git a/pandaharvester/harvesterbody/worker_adjuster.py b/pandaharvester/harvesterbody/worker_adjuster.py index f556a4e4..471330c0 100644 --- a/pandaharvester/harvesterbody/worker_adjuster.py +++ b/pandaharvester/harvesterbody/worker_adjuster.py @@ -747,7 +747,7 @@ def _normalize_job_type_any(queue_dict): tmp_log.debug(f"setting n_new_workers to {n_new_workers} in order to respect universal maxNewWorkers") dyn_num_workers[queue_name][job_type][resource_type][pilot_type]["nNewWorkers"] = n_new_workers - # adjust n_new_workers for UCORE to let aggregations over RT respect nQueueLimitWorker and max_workers + # adjust n_new_workers for UCORE to 
let aggregations over rtype respect nQueueLimitWorker and max_workers if queue_config is None: max_new_workers_per_cycle = 0 ret_msg = "set max_new_workers_per_cycle=0 in UCORE aggregation due to missing queue_config" @@ -814,7 +814,7 @@ def _normalize_job_type_any(queue_dict): for pilot_type in dyn_num_workers[queue_name][job_type][resource_type]: n_new_workers = dyn_num_workers[queue_name][job_type][resource_type][pilot_type]["nNewWorkers"] tmp_log.debug( - f"setting n_new_workers to {n_new_workers} of job_type={job_type} resource_type={resource_type} pilot_type={pilot_type} in order to respect RT aggregations for UCORE" + f"setting n_new_workers to {n_new_workers} of job_type={job_type} resource_type={resource_type} pilot_type={pilot_type} in order to respect rtype aggregations for UCORE" ) if not apf_msg: From 35568639dd9df8f6941f936a960fce2a7107127c Mon Sep 17 00:00:00 2001 From: mightqxc Date: Thu, 2 Apr 2026 11:33:11 +0200 Subject: [PATCH 33/38] worker_adjuster: log result in table --- pandaharvester/commit_timestamp.py | 2 +- .../harvesterbody/worker_adjuster.py | 20 +++++++++++++++++++ 2 files changed, 21 insertions(+), 1 deletion(-) diff --git a/pandaharvester/commit_timestamp.py b/pandaharvester/commit_timestamp.py index 72af4506..8c40f5f1 100644 --- a/pandaharvester/commit_timestamp.py +++ b/pandaharvester/commit_timestamp.py @@ -1 +1 @@ -timestamp = "02-04-2026 07:28:43 on flin (by mightqxc)" +timestamp = "02-04-2026 09:33:11 on flin (by mightqxc)" diff --git a/pandaharvester/harvesterbody/worker_adjuster.py b/pandaharvester/harvesterbody/worker_adjuster.py index 471330c0..2caf967e 100644 --- a/pandaharvester/harvesterbody/worker_adjuster.py +++ b/pandaharvester/harvesterbody/worker_adjuster.py @@ -824,6 +824,26 @@ def _normalize_job_type_any(queue_dict): # dump tmp_log.debug(f"defined {str(dyn_num_workers)}") + # print result in table + dyn_num_workers_rows = [] + for queue_name, job_types in dyn_num_workers.items(): + for job_type, resource_types in 
job_types.items(): + for resource_type, pilot_types in resource_types.items(): + for pilot_type, worker_data in pilot_types.items(): + dyn_num_workers_rows.append( + { + "queue_name": queue_name, + "job_type": job_type, + "resource_type": resource_type, + "pilot_type": pilot_type, + "nQueue": worker_data.get("nQueue", 0), + "nReady": worker_data.get("nReady", 0), + "nRunning": worker_data.get("nRunning", 0), + "nNewWorkers": worker_data.get("nNewWorkers", 0), + } + ) + result_df = pl.DataFrame(dyn_num_workers_rows).select(pl.all().exclude(["queue_name"])) + tmp_log.debug(f"result_df:\n{result_df}") return dyn_num_workers except Exception: # dump error From 68006907cd8512bdc06fa9d265dc13c49561cef9 Mon Sep 17 00:00:00 2001 From: mightqxc Date: Thu, 2 Apr 2026 11:39:56 +0200 Subject: [PATCH 34/38] log pretty --- pandaharvester/commit_timestamp.py | 2 +- pandaharvester/harvesterbody/worker_adjuster.py | 15 ++++++++++++++- 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/pandaharvester/commit_timestamp.py b/pandaharvester/commit_timestamp.py index 8c40f5f1..ff64f355 100644 --- a/pandaharvester/commit_timestamp.py +++ b/pandaharvester/commit_timestamp.py @@ -1 +1 @@ -timestamp = "02-04-2026 09:33:11 on flin (by mightqxc)" +timestamp = "02-04-2026 09:39:56 on flin (by mightqxc)" diff --git a/pandaharvester/harvesterbody/worker_adjuster.py b/pandaharvester/harvesterbody/worker_adjuster.py index 2caf967e..350354f1 100644 --- a/pandaharvester/harvesterbody/worker_adjuster.py +++ b/pandaharvester/harvesterbody/worker_adjuster.py @@ -842,7 +842,20 @@ def _normalize_job_type_any(queue_dict): "nNewWorkers": worker_data.get("nNewWorkers", 0), } ) - result_df = pl.DataFrame(dyn_num_workers_rows).select(pl.all().exclude(["queue_name"])) + result_df = ( + pl.DataFrame(dyn_num_workers_rows) + .select(pl.all().exclude(["queue_name"])) + .sort( + [ + pl.when(pl.col("job_type") == "ANY").then(1).otherwise(0), + "job_type", + pl.when(pl.col("resource_type") == 
"ANY").then(1).otherwise(0), + "resource_type", + pl.when(pl.col("pilot_type") == "ANY").then(2).when(pl.col("pilot_type") == DEFAULT_PILOT_TYPE).then(0).otherwise(1), + "pilot_type", + ] + ) + ) tmp_log.debug(f"result_df:\n{result_df}") return dyn_num_workers except Exception: From 4bd4dce7329aeac69485c70de90cb2a30291d10b Mon Sep 17 00:00:00 2001 From: mightqxc Date: Thu, 2 Apr 2026 12:52:46 +0200 Subject: [PATCH 35/38] fix --- pandaharvester/commit_timestamp.py | 2 +- .../harvesterbody/worker_adjuster.py | 31 ++++++++++--------- 2 files changed, 18 insertions(+), 15 deletions(-) diff --git a/pandaharvester/commit_timestamp.py b/pandaharvester/commit_timestamp.py index ff64f355..491e4ec9 100644 --- a/pandaharvester/commit_timestamp.py +++ b/pandaharvester/commit_timestamp.py @@ -1 +1 @@ -timestamp = "02-04-2026 09:39:56 on flin (by mightqxc)" +timestamp = "02-04-2026 10:52:46 on flin (by mightqxc)" diff --git a/pandaharvester/harvesterbody/worker_adjuster.py b/pandaharvester/harvesterbody/worker_adjuster.py index 350354f1..ccffe0d8 100644 --- a/pandaharvester/harvesterbody/worker_adjuster.py +++ b/pandaharvester/harvesterbody/worker_adjuster.py @@ -842,21 +842,24 @@ def _normalize_job_type_any(queue_dict): "nNewWorkers": worker_data.get("nNewWorkers", 0), } ) - result_df = ( - pl.DataFrame(dyn_num_workers_rows) - .select(pl.all().exclude(["queue_name"])) - .sort( - [ - pl.when(pl.col("job_type") == "ANY").then(1).otherwise(0), - "job_type", - pl.when(pl.col("resource_type") == "ANY").then(1).otherwise(0), - "resource_type", - pl.when(pl.col("pilot_type") == "ANY").then(2).when(pl.col("pilot_type") == DEFAULT_PILOT_TYPE).then(0).otherwise(1), - "pilot_type", - ] + if dyn_num_workers_rows: + result_df = ( + pl.DataFrame(dyn_num_workers_rows) + .select(pl.all().exclude(["queue_name"])) + .sort( + [ + pl.when(pl.col("job_type") == "ANY").then(1).otherwise(0), + "job_type", + pl.when(pl.col("resource_type") == "ANY").then(1).otherwise(0), + "resource_type", + 
pl.when(pl.col("pilot_type") == "ANY").then(2).when(pl.col("pilot_type") == DEFAULT_PILOT_TYPE).then(0).otherwise(1), + "pilot_type", + ] + ) ) - ) - tmp_log.debug(f"result_df:\n{result_df}") + tmp_log.debug(f"result_df:\n{result_df}") + else: + tmp_log.debug("result_df: nothing to display") return dyn_num_workers except Exception: # dump error From 4283cd941f3605a72399adf539e8d1f8feded9ad Mon Sep 17 00:00:00 2001 From: mightqxc Date: Tue, 7 Apr 2026 14:18:03 +0200 Subject: [PATCH 36/38] fixes suggested by github copilot --- pandaharvester/commit_timestamp.py | 2 +- pandaharvester/harvesterbody/submitter.py | 8 ++++- .../harvesterbody/worker_adjuster.py | 6 ++++ pandaharvester/harvestercore/db_proxy.py | 32 ++++++++++--------- .../dummy_dynamic_worker_maker.py | 2 +- .../multijob_worker_maker.py | 2 +- .../multinode_worker_maker.py | 2 +- .../simple_bf_es_worker_maker.py | 3 +- pyproject.toml | 2 ++ setup.py | 2 +- 10 files changed, 39 insertions(+), 22 deletions(-) diff --git a/pandaharvester/commit_timestamp.py b/pandaharvester/commit_timestamp.py index 491e4ec9..f99aa7c3 100644 --- a/pandaharvester/commit_timestamp.py +++ b/pandaharvester/commit_timestamp.py @@ -1 +1 @@ -timestamp = "02-04-2026 10:52:46 on flin (by mightqxc)" +timestamp = "07-04-2026 12:18:04 on flin (by mightqxc)" diff --git a/pandaharvester/harvesterbody/submitter.py b/pandaharvester/harvesterbody/submitter.py index 60453cc5..35c4e2f5 100644 --- a/pandaharvester/harvesterbody/submitter.py +++ b/pandaharvester/harvesterbody/submitter.py @@ -209,7 +209,13 @@ def run(self): continue # make workers okChunks, ngChunks = self.workerMaker.make_workers( - jobChunks, queue_config, nReady, job_type, resource_type, prod_source_label=prod_source_label, maker=workerMakerCore + jobChunks, + queue_config, + nReady, + job_type, + resource_type, + prod_source_label=prod_source_label, + maker=workerMakerCore, ) if len(ngChunks) == 0: diff --git a/pandaharvester/harvesterbody/worker_adjuster.py 
b/pandaharvester/harvesterbody/worker_adjuster.py index ccffe0d8..fbee0a86 100644 --- a/pandaharvester/harvesterbody/worker_adjuster.py +++ b/pandaharvester/harvesterbody/worker_adjuster.py @@ -323,6 +323,12 @@ def _normalize_job_type_any(queue_dict): for queue_name in static_num_workers: queue_config = self.queue_configMapper.get_queue(queue_name) + + # protection against not-up-to-date queue config + if queue_config is None: + tmp_log.debug(f"skipping queue {queue_name} due to missing queue_config in preprocessing loop") + continue + queue_dict = panda_queues_dict.get(queue_name, {}) # prioritized prod_source_labels for pilot submission diff --git a/pandaharvester/harvestercore/db_proxy.py b/pandaharvester/harvestercore/db_proxy.py index becbb464..7377a5ec 100644 --- a/pandaharvester/harvestercore/db_proxy.py +++ b/pandaharvester/harvestercore/db_proxy.py @@ -1750,6 +1750,20 @@ def get_queues_to_submit(self, lookup_interval, lock_interval, locked_by, queue_ sql_count_workers_tmp += "GROUP BY pilotType, status " self.execute(sql_count_workers_tmp, varMap) + # count nFillers once per queue/jobType/resourceType combination + varMap = dict() + varMap[":computingSite"] = queueName + varMap[":status"] = WorkSpec.ST_running + sql_count_refillers_tmp = sql_count_refillers + if jobType != "ANY": + varMap[":jobType"] = jobType + sql_count_refillers_tmp += "AND jobType=:jobType " + if resourceType != "ANY": + varMap[":resourceType"] = resourceType + sql_count_refillers_tmp += "AND resourceType=:resourceType " + self.execute(sql_count_refillers_tmp, varMap) + (nReFill,) = self.cur.fetchone() + # Initialize nested dict structure before the loop retMap.setdefault(queueName, {}) retMap[queueName].setdefault(jobType, {}) @@ -1767,21 +1781,6 @@ def get_queues_to_submit(self, lookup_interval, lock_interval, locked_by, queue_ elif workerStatus in [WorkSpec.ST_running]: nRunning += tmpNum - # count nFillers - varMap = dict() - varMap[":computingSite"] = queueName - varMap[":status"] 
= WorkSpec.ST_running - sql_count_refillers_tmp = sql_count_refillers - if jobType != "ANY": - varMap[":jobType"] = jobType - sql_count_refillers_tmp += "AND jobType=:jobType " - if resourceType != "ANY": - varMap[":resourceType"] = resourceType - sql_count_refillers_tmp += "AND resourceType=:resourceType " - self.execute(sql_count_refillers_tmp, varMap) - (nReFill,) = self.cur.fetchone() - nReady += nReFill - # Initialize or update pilot type entry if pilotType not in retMap[queueName][jobType][resourceType]: retMap[queueName][jobType][resourceType][pilotType] = { @@ -1798,6 +1797,9 @@ def get_queues_to_submit(self, lookup_interval, lock_interval, locked_by, queue_ retMap[queueName][jobType][resourceType]["ANY"]["nRunning"] += nRunning retMap[queueName][jobType][resourceType]["ANY"]["nQueue"] += nQueue + # Add refiller count once to the ANY pilotType entry + retMap[queueName][jobType][resourceType]["ANY"]["nReady"] += nReFill + # set nNewWorkers only in ANY pilotType retMap[queueName][jobType][resourceType]["ANY"]["nNewWorkers"] = nNewWorkers diff --git a/pandaharvester/harvesterworkermaker/dummy_dynamic_worker_maker.py b/pandaharvester/harvesterworkermaker/dummy_dynamic_worker_maker.py index 676ba14b..4b972559 100644 --- a/pandaharvester/harvesterworkermaker/dummy_dynamic_worker_maker.py +++ b/pandaharvester/harvesterworkermaker/dummy_dynamic_worker_maker.py @@ -13,7 +13,7 @@ def __init__(self, **kwarg): BaseWorkerMaker.__init__(self, **kwarg) # make a worker from jobs - def make_worker(self, jobspec_list, queue_config, job_type, resource_type): + def make_worker(self, jobspec_list, queue_config, job_type, resource_type, **kwargs): workSpec = WorkSpec() workSpec.resourceType = resource_type if len(jobspec_list) > 0: diff --git a/pandaharvester/harvesterworkermaker/multijob_worker_maker.py b/pandaharvester/harvesterworkermaker/multijob_worker_maker.py index f3d1c83e..db966459 100644 --- a/pandaharvester/harvesterworkermaker/multijob_worker_maker.py +++ 
b/pandaharvester/harvesterworkermaker/multijob_worker_maker.py @@ -46,7 +46,7 @@ def _get_executable(self, queue_config): return exe_str # make a worker from a job with a disk access point - def make_worker(self, jobspec_list, queue_config, job_type, resource_type): + def make_worker(self, jobspec_list, queue_config, job_type, resource_type, **kwargs): tmpLog = self.make_logger(baseLogger, method_name="make_worker") workSpec = WorkSpec() self.nJobsPerWorker = len(jobspec_list) diff --git a/pandaharvester/harvesterworkermaker/multinode_worker_maker.py b/pandaharvester/harvesterworkermaker/multinode_worker_maker.py index 0bd409c0..8e727258 100644 --- a/pandaharvester/harvesterworkermaker/multinode_worker_maker.py +++ b/pandaharvester/harvesterworkermaker/multinode_worker_maker.py @@ -59,7 +59,7 @@ def _get_executable(self): return exe_str # make a worker from jobs - def make_worker(self, jobspec_list, queue_config, job_type, resource_type): + def make_worker(self, jobspec_list, queue_config, job_type, resource_type, **kwargs): tmpLog = core_utils.make_logger(baseLogger, f"queue={queue_config.queueName}", method_name="make_worker") tmpLog.info("Multi node worker preparation started.") diff --git a/pandaharvester/harvesterworkermaker/simple_bf_es_worker_maker.py b/pandaharvester/harvesterworkermaker/simple_bf_es_worker_maker.py index 09382f6b..7400d520 100644 --- a/pandaharvester/harvesterworkermaker/simple_bf_es_worker_maker.py +++ b/pandaharvester/harvesterworkermaker/simple_bf_es_worker_maker.py @@ -17,6 +17,7 @@ class SimpleBackfillESWorkerMaker(BaseWorkerMaker): """Worker maker plugin for simple backfill event service workers.""" + # constructor def __init__(self, **kwarg): self.jobAttributesToUse = ["nCore", "minRamCount", "maxDiskCount", "maxWalltime"] @@ -26,7 +27,7 @@ def __init__(self, **kwarg): self.dyn_resources = None # make a worker from jobs - def make_worker(self, jobspec_list, queue_config, job_type, resource_type): + def make_worker(self, 
jobspec_list, queue_config, job_type, resource_type, **kwargs): tmpLog = self.make_logger(_logger, f"queue={queue_config.queueName}", method_name="make_worker") tmpLog.debug(f"jobspec_list: {jobspec_list}") diff --git a/pyproject.toml b/pyproject.toml index 21068cfe..abeada50 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -9,6 +9,7 @@ name = "pandaharvester" dynamic = ["version"] description = "Harvester Package" readme = "README.md" +requires-python = ">=3.10" license = {text = "Apache-2.0"} authors = [ { name = "PanDA Team", email = "panda-support@cern.ch" }, @@ -25,6 +26,7 @@ dependencies = [ 'pexpect', 'psutil >= 5.4.8', 'panda-pilot >= 2.7.2.1', + 'polars', ] [project.optional-dependencies] diff --git a/setup.py b/setup.py index 8997b1ce..49413b78 100644 --- a/setup.py +++ b/setup.py @@ -23,7 +23,7 @@ author="Panda Team", author_email="atlas-adc-panda@cern.ch", url="https://github.com/PanDAWMS/panda-harvester/wiki", - python_requires=">=2.7", + python_requires=">=3.10", packages=find_packages(), install_requires=[ "requests", From e3b015e3b194a120a9d3c4a3c461ed379be8bf46 Mon Sep 17 00:00:00 2001 From: mightqxc Date: Tue, 7 Apr 2026 14:28:41 +0200 Subject: [PATCH 37/38] fix --- pandaharvester/commit_timestamp.py | 2 +- pandaharvester/harvesterbody/worker_adjuster.py | 17 ++++++++--------- 2 files changed, 9 insertions(+), 10 deletions(-) diff --git a/pandaharvester/commit_timestamp.py b/pandaharvester/commit_timestamp.py index f99aa7c3..8e0d3a68 100644 --- a/pandaharvester/commit_timestamp.py +++ b/pandaharvester/commit_timestamp.py @@ -1 +1 @@ -timestamp = "07-04-2026 12:18:04 on flin (by mightqxc)" +timestamp = "07-04-2026 12:28:42 on flin (by mightqxc)" diff --git a/pandaharvester/harvesterbody/worker_adjuster.py b/pandaharvester/harvesterbody/worker_adjuster.py index fbee0a86..c4c3b838 100644 --- a/pandaharvester/harvesterbody/worker_adjuster.py +++ b/pandaharvester/harvesterbody/worker_adjuster.py @@ -322,17 +322,16 @@ def 
_normalize_job_type_any(queue_dict): rt_mapper = ResourceTypeMapper() for queue_name in static_num_workers: - queue_config = self.queue_configMapper.get_queue(queue_name) - - # protection against not-up-to-date queue config - if queue_config is None: - tmp_log.debug(f"skipping queue {queue_name} due to missing queue_config in preprocessing loop") - continue - queue_dict = panda_queues_dict.get(queue_name, {}) + queue_config = self.queue_configMapper.get_queue(queue_name) # prioritized prod_source_labels for pilot submission - prioritized_pslabels = getattr(queue_config, "prioritizedProdSourceLabels", DEFAULT_PRIORITIZED_PROD_SOURCE_LABELS) + prioritized_pslabels = DEFAULT_PRIORITIZED_PROD_SOURCE_LABELS + if queue_config: + prioritized_pslabels = getattr(queue_config, "prioritizedProdSourceLabels", DEFAULT_PRIORITIZED_PROD_SOURCE_LABELS) + else: + tmp_log.warning(f"missing queue_config for queue: {queue_name}") + prioritized_pilot_types = [core_utils.prod_source_label_to_pilot_type(label) for label in prioritized_pslabels] tmp_new_workers_df = ( @@ -573,7 +572,7 @@ def _normalize_job_type_any(queue_dict): if queue_config: worker_limits_dict, worker_stats_map = self.dbProxy.get_worker_limits(queue_name, queue_config) else: - tmp_log.warning("missing queue_config") + tmp_log.warning(f"missing queue_config for queue: {queue_name}") # prioritized prod_source_labels for pilot submission prioritized_pslabels = getattr(queue_config, "prioritizedProdSourceLabels", DEFAULT_PRIORITIZED_PROD_SOURCE_LABELS) prioritized_pilot_types = [core_utils.prod_source_label_to_pilot_type(label) for label in prioritized_pslabels] From 04aa0729360fc239f96293a98061de01a97b2f17 Mon Sep 17 00:00:00 2001 From: mightqxc Date: Tue, 7 Apr 2026 14:44:10 +0200 Subject: [PATCH 38/38] fix --- pandaharvester/commit_timestamp.py | 2 +- pandaharvester/harvestercore/db_proxy.py | 4 +++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/pandaharvester/commit_timestamp.py 
b/pandaharvester/commit_timestamp.py index 8e0d3a68..996caf4c 100644 --- a/pandaharvester/commit_timestamp.py +++ b/pandaharvester/commit_timestamp.py @@ -1 +1 @@ -timestamp = "07-04-2026 12:28:42 on flin (by mightqxc)" +timestamp = "07-04-2026 12:44:11 on flin (by mightqxc)" diff --git a/pandaharvester/harvestercore/db_proxy.py b/pandaharvester/harvestercore/db_proxy.py index 7377a5ec..8f125963 100644 --- a/pandaharvester/harvestercore/db_proxy.py +++ b/pandaharvester/harvestercore/db_proxy.py @@ -1749,6 +1749,8 @@ def get_queues_to_submit(self, lookup_interval, lock_interval, locked_by, queue_ sql_count_workers_tmp += "AND resourceType=:resourceType " sql_count_workers_tmp += "GROUP BY pilotType, status " self.execute(sql_count_workers_tmp, varMap) + # Fetch worker count results BEFORE executing any other query to preserve cursor state + resW = self.cur.fetchall() # count nFillers once per queue/jobType/resourceType combination varMap = dict() @@ -1770,7 +1772,7 @@ def get_queues_to_submit(self, lookup_interval, lock_interval, locked_by, queue_ retMap[queueName][jobType].setdefault(resourceType, {}) retMap[queueName][jobType][resourceType].setdefault("ANY", {"nReady": 0, "nRunning": 0, "nQueue": 0, "nNewWorkers": 0}) - for pilotType, workerStatus, tmpNum in self.cur.fetchall(): + for pilotType, workerStatus, tmpNum in resW: nQueue = 0 nReady = 0 nRunning = 0