From de526432c7ea05e1be616483dfd2d380a6ffe66d Mon Sep 17 00:00:00 2001 From: Tobias Macey Date: Wed, 11 Mar 2026 12:22:42 -0400 Subject: [PATCH 01/46] feat(ol_types): add xqwatcher to Services and Application enums Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- src/ol_infrastructure/lib/ol_types.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/ol_infrastructure/lib/ol_types.py b/src/ol_infrastructure/lib/ol_types.py index d9cf4272e8..9c45df42af 100644 --- a/src/ol_infrastructure/lib/ol_types.py +++ b/src/ol_infrastructure/lib/ol_types.py @@ -93,6 +93,7 @@ class Services(StrEnum): vector_log_proxy = "vector-log-proxy" xpro = "xpro" xqueue = "xqueue" + xqwatcher = "xqwatcher" @unique @@ -130,6 +131,7 @@ class Application(StrEnum): vector_log_proxy = "vector-log-proxy" xpro = "xpro" xqueue = "xqueue" + xqwatcher = "xqwatcher" @unique From c8d154ab71ef645b5679419d9f8f45368d87def9 Mon Sep 17 00:00:00 2001 From: Tobias Macey Date: Wed, 11 Mar 2026 12:22:47 -0400 Subject: [PATCH 02/46] feat(xqwatcher): update Vault policy to include xqueue credentials path Add read access to secret-DEPLOYMENT/edx-xqueue so the xqwatcher service can retrieve the xqueue server URL and authentication password needed by the ContainerGrader handler config. 
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../applications/xqwatcher/xqwatcher_server_policy.hcl | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/ol_infrastructure/applications/xqwatcher/xqwatcher_server_policy.hcl b/src/ol_infrastructure/applications/xqwatcher/xqwatcher_server_policy.hcl index f0ba478bf2..19e86627ef 100644 --- a/src/ol_infrastructure/applications/xqwatcher/xqwatcher_server_policy.hcl +++ b/src/ol_infrastructure/applications/xqwatcher/xqwatcher_server_policy.hcl @@ -5,3 +5,7 @@ path "sys/leases/renew" { path "secret-xqwatcher/*" { capabilities = [ "read" ] } + +path "secret-DEPLOYMENT/edx-xqueue" { + capabilities = [ "read" ] +} From 3f6c0ca155cc65b215d12a3b8492511e714f439e Mon Sep 17 00:00:00 2001 From: Tobias Macey Date: Wed, 11 Mar 2026 12:23:14 -0400 Subject: [PATCH 03/46] feat(xqwatcher): replace EC2 ASG deployment with Kubernetes Deployment Completely rewrite the xqwatcher Pulumi stack to deploy on Kubernetes instead of EC2 Auto Scaling Groups with AppArmor/codejail. 
Changes: - Replace IAM instance profile + Vault AWS auth with OLEKSAuthBinding (IRSA + Vault K8s auth backend) - Add OLVaultK8SSecret to sync grader handler config from Vault KV to a Kubernetes Secret via the Vault Secrets Operator CRD - Add a ConfigMap for base poll settings and a JSON logging config that writes plain-text records to a console handler (no log rotation in containers) - Add RBAC Role + RoleBinding granting the xqwatcher service account permission to create/delete Kubernetes Jobs and read pod logs, required by ContainerGrader's kubernetes backend - Create a Kubernetes Deployment with: - ghcr.io/mitodl/xqueue-watcher image - Security context (non-root, drop ALL capabilities) - Resource requests + memory limit - Liveness probe via python -c import xqueue_watcher - Topology spread for HA across nodes - Vault grader config + base config mounted into /xqwatcher/conf.d/ - Preserve vault.kv.SecretV2 write so grader config remains managed in Pulumi - Export k8s_deployment_name, k8s_namespace and grader_config_secret Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../applications/xqwatcher/__main__.py | 543 +++++++++++------- 1 file changed, 342 insertions(+), 201 deletions(-) diff --git a/src/ol_infrastructure/applications/xqwatcher/__main__.py b/src/ol_infrastructure/applications/xqwatcher/__main__.py index 378bfc2c35..0cfc866f8d 100644 --- a/src/ol_infrastructure/applications/xqwatcher/__main__.py +++ b/src/ol_infrastructure/applications/xqwatcher/__main__.py @@ -1,168 +1,102 @@ -"""Create the resources needed to run a xqwatcher server. # noqa: D200""" +"""Create the Kubernetes resources needed to run xqueue-watcher. # noqa: D200 -# Note: This stack has a silent dependency on an peering connection between the VPC -# that it is installed in and the VPC(s) that contain the xqueue instances. +xqueue-watcher polls an xqueue server for student code submissions and grades +them by spawning an isolated container (ContainerGrader) per submission. 
This +stack replaces the previous EC2 AMI-based deployment with a Kubernetes +Deployment on the shared applications EKS cluster. + +Secrets are managed via the Vault Secrets Operator (VaultStaticSecret CRD). +""" -import base64 import json -import textwrap +import os from pathlib import Path +import pulumi_kubernetes as kubernetes import pulumi_vault as vault -import yaml -from pulumi import Config, StackReference, export -from pulumi_aws import ec2, get_caller_identity, iam +from pulumi import Config, ResourceOptions, StackReference, export +from pulumi_aws import get_caller_identity -from bridge.secrets.sops import read_yaml_secrets from bridge.settings.openedx.version_matrix import OpenLearningOpenEdxDeployment -from ol_infrastructure.components.aws.auto_scale_group import ( - BlockDeviceMapping, - OLAutoScaleGroupConfig, - OLAutoScaling, - OLLaunchTemplateConfig, - TagSpecification, +from ol_infrastructure.components.applications.eks import ( + OLEKSAuthBinding, + OLEKSAuthBindingConfig, +) +from ol_infrastructure.components.services.vault import ( + OLVaultK8SSecret, + OLVaultK8SStaticSecretConfig, ) -from ol_infrastructure.lib.aws.ec2_helper import InstanceTypes, default_egress_args -from ol_infrastructure.lib.consul import get_consul_provider -from ol_infrastructure.lib.ol_types import AWSBase +from ol_infrastructure.lib.aws.eks_helper import setup_k8s_provider +from ol_infrastructure.lib.ol_types import AWSBase, K8sGlobalLabels, Services from ol_infrastructure.lib.pulumi_helper import parse_stack from ol_infrastructure.lib.vault import setup_vault_provider +from bridge.secrets.sops import read_yaml_secrets + ################################## -## Setup + Config Retrival ## +## Setup + Config Retrieval ## ################################## -if Config("vault_server").get("env_namespace"): +if Config("vault_server").get("env_namespace") or Config("vault").get("address"): setup_vault_provider() + stack_info = parse_stack() xqwatcher_config = Config("xqwatcher") 
+ network_stack = StackReference(f"infrastructure.aws.network.{stack_info.name}") -policy_stack = StackReference("infrastructure.aws.policies") -dns_stack = StackReference("infrastructure.aws.dns") -consul_stack = StackReference( - f"infrastructure.consul.{stack_info.env_prefix}.{stack_info.name}" +vault_mount_stack = StackReference( + f"substructure.vault.static_mounts.operations.{stack_info.name}" ) -env_name = f"{stack_info.env_prefix}-{stack_info.env_suffix}" - -target_vpc_name = xqwatcher_config.get("target_vpc") -target_vpc = network_stack.require_output(target_vpc_name) -vpc_id = target_vpc["id"] +cluster_name = xqwatcher_config.get("cluster") or "applications" +cluster_stack = StackReference( + f"infrastructure.aws.eks.{cluster_name}.{stack_info.name}" +) -consul_security_groups = consul_stack.require_output("security_groups") -consul_provider = get_consul_provider(stack_info) +env_name = f"{stack_info.env_prefix}-{stack_info.env_suffix}" -vault_mount_stack = StackReference( - f"substructure.vault.static_mounts.operations.{stack_info.name}" +openedx_release = ( + OpenLearningOpenEdxDeployment.get_item(stack_info.env_prefix) + .release_by_env(stack_info.name) + .value ) aws_account = get_caller_identity() aws_config = AWSBase( tags={ - "OU": xqwatcher_config.get("business_unit"), + "OU": xqwatcher_config.require("business_unit"), "Environment": env_name, "Application": "open-edx-xqwatcher", "Owner": "platform-engineering", } ) -xqwatcher_server_tag = f"open-edx-xqwatcher-server-{env_name}" -openedx_release = ( - OpenLearningOpenEdxDeployment.get_item(stack_info.env_prefix) - .release_by_env(stack_info.name) - .value +k8s_global_labels = K8sGlobalLabels( + service=Services.xqwatcher, + ou=xqwatcher_config.require("business_unit"), + stack=stack_info, ) -xqwatcher_server_ami = ec2.get_ami( - filters=[ - ec2.GetAmiFilterArgs(name="name", values=["open-edx-xqwatcher-server-*"]), - ec2.GetAmiFilterArgs(name="virtualization-type", values=["hvm"]), - 
ec2.GetAmiFilterArgs(name="root-device-type", values=["ebs"]), - ec2.GetAmiFilterArgs(name="tag:deployment", values=[stack_info.env_prefix]), - ec2.GetAmiFilterArgs(name="tag:openedx_release", values=[openedx_release]), - ], - most_recent=True, - owners=[aws_account.account_id], -) +setup_k8s_provider(kubeconfig=cluster_stack.require_output("kube_config")) -############################### -## General Resources ## -############################### - -# IAM and instance profile -xqwatcher_server_instance_role = iam.Role( - f"xqwatcher-server-instance-role-{env_name}", - assume_role_policy=json.dumps( - { - "Version": "2012-10-17", - "Statement": { - "Effect": "Allow", - "Action": "sts:AssumeRole", - "Principal": {"Service": "ec2.amazonaws.com"}, - }, - } - ), - path="/ol-infrastructure/xqwatcher-server/role/", - tags=aws_config.tags, -) -iam.RolePolicyAttachment( - f"xqwatcher-server-describe-instance-role-policy-{env_name}", - policy_arn=policy_stack.require_output("iam_policies")["describe_instances"], - role=xqwatcher_server_instance_role.name, -) -xqwatcher_server_instance_profile = iam.InstanceProfile( - f"xqwatcher-server-instance-profile-{env_name}", - role=xqwatcher_server_instance_role.name, - path="/ol-infrastructure/xqwatcher-server/profile/", -) +namespace = xqwatcher_config.get("namespace") or f"{stack_info.env_prefix}-openedx" -# Vault policy definition -xqwatcher_server_vault_policy = vault.Policy( - f"xqwatcher-server-vault-policy-{env_name}", - name=f"xqwatcher-server-{stack_info.env_prefix}", - policy=Path(__file__) - .parent.joinpath("xqwatcher_server_policy.hcl") - .read_text() - .replace("DEPLOYMENT", f"{stack_info.env_prefix}"), -) -# Register xqwatcher AMI for Vault AWS auth -vault.aws.AuthBackendRole( - f"xqwatcher-server-ami-ec2-vault-auth-{env_name}", - backend=f"aws-{stack_info.env_prefix}", - auth_type="iam", - role="xqwatcher-server", - inferred_entity_type="ec2_instance", - inferred_aws_region=aws_config.region, - 
bound_iam_instance_profile_arns=[xqwatcher_server_instance_profile.arn], - bound_ami_ids=[xqwatcher_server_ami.id], - bound_account_ids=[aws_account.account_id], - bound_vpc_ids=[vpc_id], - token_policies=[xqwatcher_server_vault_policy.name], +docker_image_tag = ( + os.environ.get("XQWATCHER_DOCKER_DIGEST") + or xqwatcher_config.get("docker_tag") + or openedx_release ) +min_replicas = xqwatcher_config.get_int("min_replicas") or 1 +max_replicas = xqwatcher_config.get_int("max_replicas") or 2 + ################################## -# Network Access Control # +## Vault Secret Data ## ################################## -# Create security group -xqwatcher_server_security_group = ec2.SecurityGroup( - f"xqwatcher-server-security-group-{env_name}", - name=f"xqwatcher-server-operations-{env_name}", - description="Access control for xqwatcher servers", - ingress=[], # no listeners on xqwatcher nodes - egress=default_egress_args, - vpc_id=vpc_id, -) - -################################### -# Web Node EC2 Deployment # -################################### - -consul_datacenter = consul_stack.require_output("datacenter") -grafana_credentials = read_yaml_secrets( - Path(f"vector/grafana.{stack_info.env_suffix}.yaml") -) +# Preserve management of the grader config secret in Vault KV. +# The VaultStaticSecret CRD (below) will sync this into the cluster. 
vault_secrets = read_yaml_secrets( Path(f"xqwatcher/secrets.{stack_info.env_prefix}.{stack_info.env_suffix}.yaml") ) @@ -174,92 +108,299 @@ data_json=json.dumps(vault_secrets), ) -block_device_mappings = [BlockDeviceMapping(volume_size=50)] -tag_specs = [ - TagSpecification( - resource_type="instance", - tags=aws_config.merged_tags({"Name": xqwatcher_server_tag}), +################################## +## Vault Policy + K8s Auth ## +################################## + +vault_policy_template = ( + Path(__file__).parent.joinpath("xqwatcher_server_policy.hcl").read_text() +) +vault_policy_text = vault_policy_template.replace( + "DEPLOYMENT", stack_info.env_prefix +) + +xqwatcher_app = OLEKSAuthBinding( + OLEKSAuthBindingConfig( + application_name=f"xqwatcher-{stack_info.env_prefix}", + namespace=namespace, + stack_info=stack_info, + aws_config=aws_config, + iam_policy_document=None, # no direct AWS resource access required + vault_policy_text=vault_policy_text, + cluster_name=cluster_stack.require_output("cluster_name"), + cluster_identities=cluster_stack.require_output("cluster_identities"), + vault_auth_endpoint=cluster_stack.require_output("vault_auth_endpoint"), + irsa_service_account_name="xqwatcher", + vault_sync_service_account_names=f"xqwatcher-{stack_info.env_prefix}-vault", + k8s_labels=k8s_global_labels, + ) +) + +vault_k8s_resources = xqwatcher_app.vault_k8s_resources + +################################## +## Vault Secrets ## +################################## + +# Grader handler config (queue names, ContainerGrader KWARGS, xqueue URL+auth). +# Stored as `confd_json` in the Vault KV entry written above. 
+grader_config_secret_name = "xqwatcher-grader-config" # pragma: allowlist secret +grader_config_secret = OLVaultK8SSecret( + f"xqwatcher-{env_name}-grader-config-secret", + OLVaultK8SStaticSecretConfig( + name=grader_config_secret_name, + namespace=namespace, + dest_secret_name=grader_config_secret_name, + dest_secret_labels=k8s_global_labels.model_dump(), + labels=k8s_global_labels.model_dump(), + mount=xqwatcher_vault_mount_name, + mount_type="kv-v2", + path=f"{stack_info.env_prefix}-grader-config", + refresh_after="1h", + restart_target_kind="Deployment", + restart_target_name="xqwatcher", + # Expose just the rendered JSON as a file-friendly key. + templates={ + "grader_config.json": "{{ .Secrets.confd_json }}", + }, + vaultauth=vault_k8s_resources.auth_name, ), - TagSpecification( - resource_type="volume", - tags=aws_config.merged_tags({"Name": xqwatcher_server_tag}), + opts=ResourceOptions( + delete_before_replace=True, + depends_on=[vault_k8s_resources], ), -] - -lt_config = OLLaunchTemplateConfig( - block_device_mappings=block_device_mappings, - image_id=xqwatcher_server_ami.id, - instance_type=xqwatcher_config.get("instance_type") - or InstanceTypes.burstable_small, - instance_profile_arn=xqwatcher_server_instance_profile.arn, - security_groups=[ - xqwatcher_server_security_group, - consul_security_groups["consul_agent"], - ], - tags=aws_config.merged_tags({"Name": xqwatcher_server_tag}), - tag_specifications=tag_specs, - user_data=consul_datacenter.apply( - lambda consul_dc: base64.b64encode( - "#cloud-config\n{}".format( - yaml.dump( - { - "write_files": [ - { - "path": "/etc/consul.d/02-autojoin.json", - "content": json.dumps( - { - "retry_join": [ - "provider=aws tag_key=consul_env " - f"tag_value={consul_dc}" - ], - "datacenter": consul_dc, - } - ), - "owner": "consul:consul", - }, - { - "path": "/etc/default/vector", - "content": textwrap.dedent( - f"""\ - ENVIRONMENT={consul_dc} - APPLICATION=xqwatcher-{stack_info.env_prefix} - 
VECTOR_CONFIG_DIR=/etc/vector/ - VECTOR_STRICT_ENV_VARS=false - AWS_REGION={aws_config.region} - GRAFANA_CLOUD_API_KEY={grafana_credentials["api_key"]} - GRAFANA_CLOUD_PROMETHEUS_API_USER={grafana_credentials["prometheus_user_id"]} - GRAFANA_CLOUD_LOKI_API_USER={grafana_credentials["loki_user_id"]} - """ - ), - "owner": "root:root", - }, - ] - }, - sort_keys=True, - ) - ).encode("utf8") - ).decode("utf8") +) + +################################## +## ConfigMap ## +################################## + +# Base xqueue-watcher config (poll settings, logging). +# Per-queue grader config comes from the Vault-synced secret above. +xqwatcher_configmap = kubernetes.core.v1.ConfigMap( + f"xqwatcher-{env_name}-configmap", + metadata=kubernetes.meta.v1.ObjectMetaArgs( + name="xqwatcher-config", + namespace=namespace, + labels=k8s_global_labels.model_dump(), + ), + data={ + "xqwatcher.json": json.dumps( + { + "FOLLOW_CLIENT_REDIRECTS": True, + "POLL_INTERVAL": 10, + "POLL_TIME": 10, + "REQUESTS_TIMEOUT": 10, + } + ), + # Emit logs to stdout only; no file rotation needed in containers. + "logging.json": json.dumps( + { + "version": 1, + "disable_existing_loggers": False, + "formatters": { + "default": { + "format": "%(asctime)s - %(filename)s:%(lineno)d -- %(funcName)s [%(levelname)s]: %(message)s", + } + }, + "handlers": { + "console": { + "class": "logging.StreamHandler", + "formatter": "default", + "level": "INFO", + } + }, + "loggers": { + "": { + "handlers": ["console"], + "level": "INFO", + } + }, + } + ), + }, +) + +################################## +## RBAC for ContainerGrader ## +################################## + +# xqwatcher uses the ContainerGrader backend which creates a Kubernetes Job +# per submission. The service account running xqwatcher pods needs permission +# to create/delete Jobs and read pod logs in the same namespace. 
+ +xqwatcher_grader_role = kubernetes.rbac.v1.Role( + f"xqwatcher-{env_name}-grader-role", + metadata=kubernetes.meta.v1.ObjectMetaArgs( + name="xqwatcher-grader", + namespace=namespace, + labels=k8s_global_labels.model_dump(), ), + rules=[ + kubernetes.rbac.v1.PolicyRuleArgs( + api_groups=["batch"], + resources=["jobs"], + verbs=["create", "delete", "get", "list", "watch"], + ), + kubernetes.rbac.v1.PolicyRuleArgs( + api_groups=[""], + resources=["pods", "pods/log"], + verbs=["get", "list", "watch"], + ), + ], ) -auto_scale_config = xqwatcher_config.get_object("auto_scale") or { - "desired": 2, - "min": 1, - "max": 3, -} -asg_config = OLAutoScaleGroupConfig( - asg_name=f"xqwatcher-server-{env_name}", - aws_config=aws_config, - desired_size=auto_scale_config["desired"] or 2, - min_size=auto_scale_config["min"] or 1, - max_size=auto_scale_config["max"] or 3, - vpc_zone_identifiers=target_vpc["subnet_ids"], - tags=aws_config.merged_tags({"Name": xqwatcher_server_tag}), +xqwatcher_grader_rolebinding = kubernetes.rbac.v1.RoleBinding( + f"xqwatcher-{env_name}-grader-rolebinding", + metadata=kubernetes.meta.v1.ObjectMetaArgs( + name="xqwatcher-grader", + namespace=namespace, + labels=k8s_global_labels.model_dump(), + ), + role_ref=kubernetes.rbac.v1.RoleRefArgs( + api_group="rbac.authorization.k8s.io", + kind="Role", + name=xqwatcher_grader_role.metadata.name, + ), + subjects=[ + kubernetes.rbac.v1.SubjectArgs( + kind="ServiceAccount", + name="xqwatcher", + namespace=namespace, + ), + ], ) -as_setup = OLAutoScaling( - asg_config=asg_config, - lt_config=lt_config, +################################## +## Deployment ## +################################## + +app_labels = {**k8s_global_labels.model_dump(), "app": "xqwatcher"} + +xqwatcher_deployment = kubernetes.apps.v1.Deployment( + f"xqwatcher-{env_name}-deployment", + metadata=kubernetes.meta.v1.ObjectMetaArgs( + name="xqwatcher", + namespace=namespace, + labels=k8s_global_labels.model_dump(), + ), + 
spec=kubernetes.apps.v1.DeploymentSpecArgs( + replicas=min_replicas, + selector=kubernetes.meta.v1.LabelSelectorArgs( + match_labels={"app": "xqwatcher"}, + ), + strategy=kubernetes.apps.v1.DeploymentStrategyArgs( + type="RollingUpdate", + rolling_update=kubernetes.apps.v1.RollingUpdateDeploymentArgs( + max_surge=1, + max_unavailable=0, + ), + ), + template=kubernetes.core.v1.PodTemplateSpecArgs( + metadata=kubernetes.meta.v1.ObjectMetaArgs( + labels=app_labels, + ), + spec=kubernetes.core.v1.PodSpecArgs( + service_account_name="xqwatcher", + # Spread replicas across nodes for HA + topology_spread_constraints=[ + kubernetes.core.v1.TopologySpreadConstraintArgs( + max_skew=1, + topology_key="kubernetes.io/hostname", + when_unsatisfiable="ScheduleAnyway", + label_selector=kubernetes.meta.v1.LabelSelectorArgs( + match_labels={"app": "xqwatcher"}, + ), + ) + ], + containers=[ + kubernetes.core.v1.ContainerArgs( + name="xqueue-watcher", + image=f"ghcr.io/mitodl/xqueue-watcher:{docker_image_tag}", + image_pull_policy="IfNotPresent", + command=["xqueue-watcher"], + args=[ + "--config", "/xqwatcher/conf.d/xqwatcher.json", + "--logging-config", "/xqwatcher/conf.d/logging.json", + "-d", "/xqwatcher/conf.d", + ], + # Liveness: verify the Python runtime is functional. + # The process will crash (and K8s will restart) on + # persistent xqueue connectivity failures, so we rely on + # the restart policy for connectivity-level health. 
+ liveness_probe=kubernetes.core.v1.ProbeArgs( + exec_=kubernetes.core.v1.ExecActionArgs( + command=[ + "python", + "-c", + "import xqueue_watcher; import sys; sys.exit(0)", + ] + ), + initial_delay_seconds=30, + period_seconds=60, + failure_threshold=3, + timeout_seconds=10, + ), + resources=kubernetes.core.v1.ResourceRequirementsArgs( + requests={"cpu": "250m", "memory": "256Mi"}, + limits={"memory": "512Mi"}, + ), + security_context=kubernetes.core.v1.SecurityContextArgs( + allow_privilege_escalation=False, + run_as_non_root=True, + run_as_user=1000, + capabilities=kubernetes.core.v1.CapabilitiesArgs( + drop=["ALL"], + ), + ), + volume_mounts=[ + # Base poll settings from ConfigMap + kubernetes.core.v1.VolumeMountArgs( + name="xqwatcher-config", + mount_path="/xqwatcher/conf.d/xqwatcher.json", + sub_path="xqwatcher.json", + read_only=True, + ), + kubernetes.core.v1.VolumeMountArgs( + name="xqwatcher-config", + mount_path="/xqwatcher/conf.d/logging.json", + sub_path="logging.json", + read_only=True, + ), + # Per-queue grader handler config from Vault secret + kubernetes.core.v1.VolumeMountArgs( + name="grader-config", + mount_path="/xqwatcher/conf.d/grader_config.json", + sub_path="grader_config.json", + read_only=True, + ), + ], + ), + ], + volumes=[ + kubernetes.core.v1.VolumeArgs( + name="xqwatcher-config", + config_map=kubernetes.core.v1.ConfigMapVolumeSourceArgs( + name=xqwatcher_configmap.metadata.name, + ), + ), + kubernetes.core.v1.VolumeArgs( + name="grader-config", + secret=kubernetes.core.v1.SecretVolumeSourceArgs( + secret_name=grader_config_secret_name, + ), + ), + ], + ), + ), + ), + opts=ResourceOptions(depends_on=[grader_config_secret]), ) -export("xqwatcher_security_group", xqwatcher_server_security_group.id) +################################## +## Exports ## +################################## + +export("k8s_deployment_name", "xqwatcher") +export("k8s_namespace", namespace) +export("grader_config_secret", grader_config_secret_name) From 
fb4441328bc8662bc008e134862cf43b8384f387 Mon Sep 17 00:00:00 2001 From: Tobias Macey Date: Wed, 11 Mar 2026 12:23:23 -0400 Subject: [PATCH 04/46] feat(xqwatcher): update all 9 stack configs for Kubernetes deployment MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Remove EC2-specific settings (consul:address, auto_scale, instance_type) and add Kubernetes-specific settings for all stacks: - xqwatcher:cluster — EKS cluster name (residential or applications) - xqwatcher:namespace — target Kubernetes namespace - xqwatcher:min_replicas — minimum pod count (maps from auto_scale.desired, except mitxonline Production, which is lowered from 3 to 2) - xqwatcher:max_replicas — maximum pod count (maps from auto_scale.max) - xqwatcher:docker_tag — container image tag (set to latest in every stack; when unset the stack code falls back to the Open edX release name) Cluster assignments: - mitx, mitx-staging → residential cluster - mitxonline → applications cluster (xqwatcher:target_vpc changes from mitxonline_vpc to applications_vpc) Namespace assignments follow xqueue convention: - mitx → mitx-openedx - mitxonline → mitxonline-openedx - mitx-staging → mitx-staging-openedx Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- ...lumi.applications.xqwatcher.mitx-staging.CI.yaml | 11 +++++------ ...lications.xqwatcher.mitx-staging.Production.yaml | 11 +++++------ ...lumi.applications.xqwatcher.mitx-staging.QA.yaml | 11 +++++------ .../Pulumi.applications.xqwatcher.mitx.CI.yaml | 11 +++++------ ...lumi.applications.xqwatcher.mitx.Production.yaml | 11 +++++------ .../Pulumi.applications.xqwatcher.mitx.QA.yaml | 11 +++++------ ...Pulumi.applications.xqwatcher.mitxonline.CI.yaml | 13 ++++++------- ...pplications.xqwatcher.mitxonline.Production.yaml | 13 ++++++------- ...Pulumi.applications.xqwatcher.mitxonline.QA.yaml | 13 ++++++------- 9 files changed, 48 insertions(+), 57 deletions(-) diff --git a/src/ol_infrastructure/applications/xqwatcher/Pulumi.applications.xqwatcher.mitx-staging.CI.yaml b/src/ol_infrastructure/applications/xqwatcher/Pulumi.applications.xqwatcher.mitx-staging.CI.yaml index a023e8c5df..8a584c95db 100644 --- 
a/src/ol_infrastructure/applications/xqwatcher/Pulumi.applications.xqwatcher.mitx-staging.CI.yaml +++ b/src/ol_infrastructure/applications/xqwatcher/Pulumi.applications.xqwatcher.mitx-staging.CI.yaml @@ -3,13 +3,12 @@ secretsprovider: awskms://alias/infrastructure-secrets-ci encryptedkey: AQICAHjnbqe9AmEW1Js10nySybyuAG7Fb5E9EHUgkmqFDv7PxQGTfGgSk9EZ4ZNb/wbJfXd+AAAAfjB8BgkqhkiG9w0BBwagbzBtAgEAMGgGCSqGSIb3DQEHATAeBglghkgBZQMEAS4wEQQMA0HqkgIE4odoJJA1AgEQgDvV0/Ss11eXyMaVbNAyMdRYYZtma1v9dVZa+p4MuzZFJn9xBZU9Fsa1suYQRgBa5jhg3XsmZDnN8st/aw== config: aws:region: us-east-1 - consul:address: https://consul-mitx-staging-ci.odl.mit.edu - xqwatcher:auto_scale: - desired: 1 - max: 2 - min: 1 - xqwatcher:instance_type: t3a.small xqwatcher:business_unit: residential-staging xqwatcher:target_vpc: residential_mitx_staging_vpc + xqwatcher:cluster: residential + xqwatcher:namespace: mitx-staging-openedx + xqwatcher:min_replicas: 1 + xqwatcher:max_replicas: 2 + xqwatcher:docker_tag: latest vault:address: https://vault-ci.odl.mit.edu vault_server:env_namespace: operations.ci diff --git a/src/ol_infrastructure/applications/xqwatcher/Pulumi.applications.xqwatcher.mitx-staging.Production.yaml b/src/ol_infrastructure/applications/xqwatcher/Pulumi.applications.xqwatcher.mitx-staging.Production.yaml index 396551cb4b..29e15f7cce 100644 --- a/src/ol_infrastructure/applications/xqwatcher/Pulumi.applications.xqwatcher.mitx-staging.Production.yaml +++ b/src/ol_infrastructure/applications/xqwatcher/Pulumi.applications.xqwatcher.mitx-staging.Production.yaml @@ -3,13 +3,12 @@ secretsprovider: awskms://alias/infrastructure-secrets-production encryptedkey: AQICAHjmo6C0sCNz3fdkFlhbu0tdBZxnHmPYSnqtmocvGiuNygF8Luz2qeEEBP5Xlrjd6nabAAAAfjB8BgkqhkiG9w0BBwagbzBtAgEAMGgGCSqGSIb3DQEHATAeBglghkgBZQMEAS4wEQQMx2cfVhpQzWg5NpHZAgEQgDtL008GopRA6ADVsjgvqT7eMirUDc8R1jusrAhd7rHx016K9nC2OI23eapgxQyW3fgAomXkVJQir5fHYA== config: aws:region: us-east-1 - consul:address: https://consul-mitx-staging-production.odl.mit.edu - 
xqwatcher:auto_scale: - desired: 1 - max: 2 - min: 1 - xqwatcher:instance_type: t3a.small xqwatcher:business_unit: residential-staging xqwatcher:target_vpc: residential_mitx_staging_vpc + xqwatcher:cluster: residential + xqwatcher:namespace: mitx-staging-openedx + xqwatcher:min_replicas: 1 + xqwatcher:max_replicas: 2 + xqwatcher:docker_tag: latest vault:address: https://vault-production.odl.mit.edu vault_server:env_namespace: operations.production diff --git a/src/ol_infrastructure/applications/xqwatcher/Pulumi.applications.xqwatcher.mitx-staging.QA.yaml b/src/ol_infrastructure/applications/xqwatcher/Pulumi.applications.xqwatcher.mitx-staging.QA.yaml index 9a501d67ea..f3c3fa4b28 100644 --- a/src/ol_infrastructure/applications/xqwatcher/Pulumi.applications.xqwatcher.mitx-staging.QA.yaml +++ b/src/ol_infrastructure/applications/xqwatcher/Pulumi.applications.xqwatcher.mitx-staging.QA.yaml @@ -3,13 +3,12 @@ secretsprovider: awskms://alias/infrastructure-secrets-qa encryptedkey: AQICAHgQW+3bag/cl2fPG3dPdqAPbfcsZuwI7rETXZsx85HRpgHz947ZuJNR+i0BzvgqRXZMAAAAfjB8BgkqhkiG9w0BBwagbzBtAgEAMGgGCSqGSIb3DQEHATAeBglghkgBZQMEAS4wEQQMSLaA5CcNOfBjGw2fAgEQgDv2PNuHpexnToW8k4+LZa/O4CHA+8dn0qTB9vNd+rPFMlShc4mt37WhEY/KHAmUhkLvkDsaySxcdelxrA== config: aws:region: us-east-1 - consul:address: https://consul-mitx-staging-qa.odl.mit.edu - xqwatcher:auto_scale: - desired: 1 - max: 2 - min: 1 - xqwatcher:instance_type: t3a.small xqwatcher:business_unit: residential-staging xqwatcher:target_vpc: residential_mitx_staging_vpc + xqwatcher:cluster: residential + xqwatcher:namespace: mitx-staging-openedx + xqwatcher:min_replicas: 1 + xqwatcher:max_replicas: 2 + xqwatcher:docker_tag: latest vault:address: https://vault-qa.odl.mit.edu vault_server:env_namespace: operations.qa diff --git a/src/ol_infrastructure/applications/xqwatcher/Pulumi.applications.xqwatcher.mitx.CI.yaml b/src/ol_infrastructure/applications/xqwatcher/Pulumi.applications.xqwatcher.mitx.CI.yaml index 32563028d1..7651a8d173 100644 --- 
a/src/ol_infrastructure/applications/xqwatcher/Pulumi.applications.xqwatcher.mitx.CI.yaml +++ b/src/ol_infrastructure/applications/xqwatcher/Pulumi.applications.xqwatcher.mitx.CI.yaml @@ -3,13 +3,12 @@ secretsprovider: awskms://alias/infrastructure-secrets-ci encryptedkey: AQICAHjnbqe9AmEW1Js10nySybyuAG7Fb5E9EHUgkmqFDv7PxQHQ1nYxdMdGpUV3lkCYkPCCAAAAfjB8BgkqhkiG9w0BBwagbzBtAgEAMGgGCSqGSIb3DQEHATAeBglghkgBZQMEAS4wEQQMsqc+FPq0+d14aeiEAgEQgDtOLAH8o/ueXOQKwFgUIv0reMzktDtSL+DF4pec6zPtl0qaaP7mXGY9WECr4y4YGaZ6uHtgh1pHncqqIQ== config: aws:region: us-east-1 - consul:address: https://consul-mitx-ci.odl.mit.edu - xqwatcher:auto_scale: - desired: 1 - max: 2 - min: 1 - xqwatcher:instance_type: t3a.small xqwatcher:business_unit: residential xqwatcher:target_vpc: residential_mitx_vpc + xqwatcher:cluster: residential + xqwatcher:namespace: mitx-openedx + xqwatcher:min_replicas: 1 + xqwatcher:max_replicas: 2 + xqwatcher:docker_tag: latest vault:address: https://vault-ci.odl.mit.edu vault_server:env_namespace: operations.ci diff --git a/src/ol_infrastructure/applications/xqwatcher/Pulumi.applications.xqwatcher.mitx.Production.yaml b/src/ol_infrastructure/applications/xqwatcher/Pulumi.applications.xqwatcher.mitx.Production.yaml index 7b34ef48eb..6507938648 100644 --- a/src/ol_infrastructure/applications/xqwatcher/Pulumi.applications.xqwatcher.mitx.Production.yaml +++ b/src/ol_infrastructure/applications/xqwatcher/Pulumi.applications.xqwatcher.mitx.Production.yaml @@ -3,13 +3,12 @@ secretsprovider: awskms://alias/infrastructure-secrets-production encryptedkey: AQICAHjmo6C0sCNz3fdkFlhbu0tdBZxnHmPYSnqtmocvGiuNygG2bHWpHlBF4YM4HIMysk4IAAAAfjB8BgkqhkiG9w0BBwagbzBtAgEAMGgGCSqGSIb3DQEHATAeBglghkgBZQMEAS4wEQQMawAEgwsc+Nx69O7TAgEQgDtiAIRJusPXgD/M0b49KX75IkX36QN7kgXzYkq5KijA7xXU9pJkECwS0ZF9eQikfX6Po8sm4e+frmhCWg== config: aws:region: us-east-1 - consul:address: https://consul-mitx-production.odl.mit.edu - xqwatcher:auto_scale: - desired: 2 - max: 3 - min: 1 - xqwatcher:instance_type: r5a.large 
xqwatcher:business_unit: residential xqwatcher:target_vpc: residential_mitx_vpc + xqwatcher:cluster: residential + xqwatcher:namespace: mitx-openedx + xqwatcher:min_replicas: 2 + xqwatcher:max_replicas: 3 + xqwatcher:docker_tag: latest vault:address: https://vault-production.odl.mit.edu vault_server:env_namespace: operations.production diff --git a/src/ol_infrastructure/applications/xqwatcher/Pulumi.applications.xqwatcher.mitx.QA.yaml b/src/ol_infrastructure/applications/xqwatcher/Pulumi.applications.xqwatcher.mitx.QA.yaml index bcc3bcf6a7..44013a9da2 100644 --- a/src/ol_infrastructure/applications/xqwatcher/Pulumi.applications.xqwatcher.mitx.QA.yaml +++ b/src/ol_infrastructure/applications/xqwatcher/Pulumi.applications.xqwatcher.mitx.QA.yaml @@ -3,13 +3,12 @@ secretsprovider: awskms://alias/infrastructure-secrets-qa encryptedkey: AQICAHgQW+3bag/cl2fPG3dPdqAPbfcsZuwI7rETXZsx85HRpgFh6uALQ+g4+ZnTTRntlQCIAAAAfjB8BgkqhkiG9w0BBwagbzBtAgEAMGgGCSqGSIb3DQEHATAeBglghkgBZQMEAS4wEQQMAxL8o+aBjXxOHc8xAgEQgDuxzj+qX9ZtqoBbvEyXA4VRvbWDhFOHIRbGsJ1NCgU+Hmy8R3gsBN45UE7Wu71yWe6oINNqRbsuDw10EQ== config: aws:region: us-east-1 - consul:address: https://consul-mitx-qa.odl.mit.edu - xqwatcher:auto_scale: - desired: 1 - max: 2 - min: 1 - xqwatcher:instance_type: r5a.large xqwatcher:business_unit: residential xqwatcher:target_vpc: residential_mitx_vpc + xqwatcher:cluster: residential + xqwatcher:namespace: mitx-openedx + xqwatcher:min_replicas: 1 + xqwatcher:max_replicas: 2 + xqwatcher:docker_tag: latest vault:address: https://vault-qa.odl.mit.edu vault_server:env_namespace: operations.qa diff --git a/src/ol_infrastructure/applications/xqwatcher/Pulumi.applications.xqwatcher.mitxonline.CI.yaml b/src/ol_infrastructure/applications/xqwatcher/Pulumi.applications.xqwatcher.mitxonline.CI.yaml index 6899ca4608..f593e1044a 100644 --- a/src/ol_infrastructure/applications/xqwatcher/Pulumi.applications.xqwatcher.mitxonline.CI.yaml +++ 
b/src/ol_infrastructure/applications/xqwatcher/Pulumi.applications.xqwatcher.mitxonline.CI.yaml @@ -3,13 +3,12 @@ secretsprovider: awskms://alias/infrastructure-secrets-ci encryptedkey: AQICAHi3MZ/Pjy2dahB1Qm+zKkKDPV1b9MYPGp7k649HPjmOHAG+XE3l7voVbQN9bQ80XZRMAAAAfjB8BgkqhkiG9w0BBwagbzBtAgEAMGgGCSqGSIb3DQEHATAeBglghkgBZQMEAS4wEQQMq5xwinh6H/9mPuyeAgEQgDtUAuLW2HbXCdstaU6dZEtTOQ2SXq67YUzDbnJeE2FNb49KxWEgsXeUzz/r6XiML/cTwq1cTIHq7LDi0w== config: aws:region: us-east-1 - consul:address: https://consul-mitxonline-ci.odl.mit.edu - xqwatcher:auto_scale: - desired: 1 - max: 2 - min: 1 - xqwatcher:instance_type: t3a.small xqwatcher:business_unit: mitxonline - xqwatcher:target_vpc: mitxonline_vpc + xqwatcher:target_vpc: applications_vpc + xqwatcher:cluster: applications + xqwatcher:namespace: mitxonline-openedx + xqwatcher:min_replicas: 1 + xqwatcher:max_replicas: 2 + xqwatcher:docker_tag: latest vault:address: https://vault-ci.odl.mit.edu vault_server:env_namespace: operations.ci diff --git a/src/ol_infrastructure/applications/xqwatcher/Pulumi.applications.xqwatcher.mitxonline.Production.yaml b/src/ol_infrastructure/applications/xqwatcher/Pulumi.applications.xqwatcher.mitxonline.Production.yaml index 2247500801..d525f6334b 100644 --- a/src/ol_infrastructure/applications/xqwatcher/Pulumi.applications.xqwatcher.mitxonline.Production.yaml +++ b/src/ol_infrastructure/applications/xqwatcher/Pulumi.applications.xqwatcher.mitxonline.Production.yaml @@ -3,13 +3,12 @@ secretsprovider: awskms://alias/infrastructure-secrets-production encryptedkey: AQICAHjmo6C0sCNz3fdkFlhbu0tdBZxnHmPYSnqtmocvGiuNygG5AzdO0QY0yXbhDGt3drvfAAAAfjB8BgkqhkiG9w0BBwagbzBtAgEAMGgGCSqGSIb3DQEHATAeBglghkgBZQMEAS4wEQQMevv1o+gghWthYkifAgEQgDtMc3j8K7A1ne9ZjHtpgBo9wlSor6yW7KOQpjGjToqweQ5wvlLlkOQibnSKKxi6Vhsm3gXz7nlzNuliIg== config: aws:region: us-east-1 - consul:address: https://consul-mitxonline-production.odl.mit.edu - xqwatcher:auto_scale: - desired: 3 - max: 8 - min: 1 - xqwatcher:instance_type: r7a.large 
xqwatcher:business_unit: mitxonline - xqwatcher:target_vpc: mitxonline_vpc + xqwatcher:target_vpc: applications_vpc + xqwatcher:cluster: applications + xqwatcher:namespace: mitxonline-openedx + xqwatcher:min_replicas: 2 + xqwatcher:max_replicas: 8 + xqwatcher:docker_tag: latest vault:address: https://vault-production.odl.mit.edu vault_server:env_namespace: operations.production diff --git a/src/ol_infrastructure/applications/xqwatcher/Pulumi.applications.xqwatcher.mitxonline.QA.yaml b/src/ol_infrastructure/applications/xqwatcher/Pulumi.applications.xqwatcher.mitxonline.QA.yaml index bd41dd3103..978b5be73f 100644 --- a/src/ol_infrastructure/applications/xqwatcher/Pulumi.applications.xqwatcher.mitxonline.QA.yaml +++ b/src/ol_infrastructure/applications/xqwatcher/Pulumi.applications.xqwatcher.mitxonline.QA.yaml @@ -3,13 +3,12 @@ secretsprovider: awskms://alias/infrastructure-secrets-qa encryptedkey: AQICAHgQW+3bag/cl2fPG3dPdqAPbfcsZuwI7rETXZsx85HRpgHzGaCdqKWGOJ49SaKpOTIJAAAAfjB8BgkqhkiG9w0BBwagbzBtAgEAMGgGCSqGSIb3DQEHATAeBglghkgBZQMEAS4wEQQMolKUc2XHdg9utgvHAgEQgDt45yPejgGlntgwIkd0Eu0ButJHb8w1TpovLlcZAowgytdZ2JXjqvmRyncU1GOtcf7/NYjfjqj5WE5iSg== config: aws:region: us-east-1 - consul:address: https://consul-mitxonline-qa.odl.mit.edu - xqwatcher:auto_scale: - desired: 1 - max: 2 - min: 1 - xqwatcher:instance_type: t3a.small xqwatcher:business_unit: mitxonline - xqwatcher:target_vpc: mitxonline_vpc + xqwatcher:target_vpc: applications_vpc + xqwatcher:cluster: applications + xqwatcher:namespace: mitxonline-openedx + xqwatcher:min_replicas: 1 + xqwatcher:max_replicas: 2 + xqwatcher:docker_tag: latest vault:address: https://vault-qa.odl.mit.edu vault_server:env_namespace: operations.qa From 561346033676b93f6439e6d66d38830ed837dbc4 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 11 Mar 2026 16:24:53 +0000 Subject: [PATCH 05/46] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see 
https://pre-commit.ci --- .../applications/xqwatcher/__main__.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/src/ol_infrastructure/applications/xqwatcher/__main__.py b/src/ol_infrastructure/applications/xqwatcher/__main__.py index 0cfc866f8d..e31ede09e2 100644 --- a/src/ol_infrastructure/applications/xqwatcher/__main__.py +++ b/src/ol_infrastructure/applications/xqwatcher/__main__.py @@ -17,6 +17,7 @@ from pulumi import Config, ResourceOptions, StackReference, export from pulumi_aws import get_caller_identity +from bridge.secrets.sops import read_yaml_secrets from bridge.settings.openedx.version_matrix import OpenLearningOpenEdxDeployment from ol_infrastructure.components.applications.eks import ( OLEKSAuthBinding, @@ -31,8 +32,6 @@ from ol_infrastructure.lib.pulumi_helper import parse_stack from ol_infrastructure.lib.vault import setup_vault_provider -from bridge.secrets.sops import read_yaml_secrets - ################################## ## Setup + Config Retrieval ## ################################## @@ -115,9 +114,7 @@ vault_policy_template = ( Path(__file__).parent.joinpath("xqwatcher_server_policy.hcl").read_text() ) -vault_policy_text = vault_policy_template.replace( - "DEPLOYMENT", stack_info.env_prefix -) +vault_policy_text = vault_policy_template.replace("DEPLOYMENT", stack_info.env_prefix) xqwatcher_app = OLEKSAuthBinding( OLEKSAuthBindingConfig( @@ -320,9 +317,12 @@ image_pull_policy="IfNotPresent", command=["xqueue-watcher"], args=[ - "--config", "/xqwatcher/conf.d/xqwatcher.json", - "--logging-config", "/xqwatcher/conf.d/logging.json", - "-d", "/xqwatcher/conf.d", + "--config", + "/xqwatcher/conf.d/xqwatcher.json", + "--logging-config", + "/xqwatcher/conf.d/logging.json", + "-d", + "/xqwatcher/conf.d", ], # Liveness: verify the Python runtime is functional. 
# The process will crash (and K8s will restart) on From 1eedc5e03d36ae9d26ff203f8489da20127ac8c0 Mon Sep 17 00:00:00 2001 From: Tobias Macey Date: Wed, 18 Mar 2026 15:35:21 -0400 Subject: [PATCH 06/46] fix(xqwatcher): address PR review feedback and deployment issues - Add create_irsa_service_account flag to OLEKSAuthBinding to optionally create the K8s ServiceAccount with IRSA annotation; use it in xqwatcher to fix 'serviceaccount not found' pod error - Add XQWATCHER_* env vars to Deployment matching env_settings.py; expose http_basic_auth from Vault-synced secret via VSO template - Fix image reference from ghcr.io to mitodl/ (DockerHub) - Change imagePullPolicy to Always for mutable 'latest' tag - Rename XQWATCHER_DOCKER_DIGEST to XQWATCHER_DOCKER_TAG - Remove unused network_stack StackReference - Remove dead xqwatcher:target_vpc config key from all 9 stacks - Remove unimplemented xqwatcher:max_replicas from all 9 stacks Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- ...pplications.xqwatcher.mitx-staging.CI.yaml | 2 - ...ons.xqwatcher.mitx-staging.Production.yaml | 2 - ...pplications.xqwatcher.mitx-staging.QA.yaml | 2 - ...Pulumi.applications.xqwatcher.mitx.CI.yaml | 2 - ...pplications.xqwatcher.mitx.Production.yaml | 2 - ...Pulumi.applications.xqwatcher.mitx.QA.yaml | 2 - ....applications.xqwatcher.mitxonline.CI.yaml | 2 - ...tions.xqwatcher.mitxonline.Production.yaml | 2 - ....applications.xqwatcher.mitxonline.QA.yaml | 2 - .../applications/xqwatcher/__main__.py | 53 ++++++++++++++++--- .../components/applications/eks.py | 33 ++++++++++++ 11 files changed, 78 insertions(+), 26 deletions(-) diff --git a/src/ol_infrastructure/applications/xqwatcher/Pulumi.applications.xqwatcher.mitx-staging.CI.yaml b/src/ol_infrastructure/applications/xqwatcher/Pulumi.applications.xqwatcher.mitx-staging.CI.yaml index 8a584c95db..995cc3b520 100644 --- a/src/ol_infrastructure/applications/xqwatcher/Pulumi.applications.xqwatcher.mitx-staging.CI.yaml +++ 
b/src/ol_infrastructure/applications/xqwatcher/Pulumi.applications.xqwatcher.mitx-staging.CI.yaml @@ -4,11 +4,9 @@ encryptedkey: AQICAHjnbqe9AmEW1Js10nySybyuAG7Fb5E9EHUgkmqFDv7PxQGTfGgSk9EZ4ZNb/w config: aws:region: us-east-1 xqwatcher:business_unit: residential-staging - xqwatcher:target_vpc: residential_mitx_staging_vpc xqwatcher:cluster: residential xqwatcher:namespace: mitx-staging-openedx xqwatcher:min_replicas: 1 - xqwatcher:max_replicas: 2 xqwatcher:docker_tag: latest vault:address: https://vault-ci.odl.mit.edu vault_server:env_namespace: operations.ci diff --git a/src/ol_infrastructure/applications/xqwatcher/Pulumi.applications.xqwatcher.mitx-staging.Production.yaml b/src/ol_infrastructure/applications/xqwatcher/Pulumi.applications.xqwatcher.mitx-staging.Production.yaml index 29e15f7cce..d61999f625 100644 --- a/src/ol_infrastructure/applications/xqwatcher/Pulumi.applications.xqwatcher.mitx-staging.Production.yaml +++ b/src/ol_infrastructure/applications/xqwatcher/Pulumi.applications.xqwatcher.mitx-staging.Production.yaml @@ -4,11 +4,9 @@ encryptedkey: AQICAHjmo6C0sCNz3fdkFlhbu0tdBZxnHmPYSnqtmocvGiuNygF8Luz2qeEEBP5Xlr config: aws:region: us-east-1 xqwatcher:business_unit: residential-staging - xqwatcher:target_vpc: residential_mitx_staging_vpc xqwatcher:cluster: residential xqwatcher:namespace: mitx-staging-openedx xqwatcher:min_replicas: 1 - xqwatcher:max_replicas: 2 xqwatcher:docker_tag: latest vault:address: https://vault-production.odl.mit.edu vault_server:env_namespace: operations.production diff --git a/src/ol_infrastructure/applications/xqwatcher/Pulumi.applications.xqwatcher.mitx-staging.QA.yaml b/src/ol_infrastructure/applications/xqwatcher/Pulumi.applications.xqwatcher.mitx-staging.QA.yaml index f3c3fa4b28..154f44eafa 100644 --- a/src/ol_infrastructure/applications/xqwatcher/Pulumi.applications.xqwatcher.mitx-staging.QA.yaml +++ b/src/ol_infrastructure/applications/xqwatcher/Pulumi.applications.xqwatcher.mitx-staging.QA.yaml @@ -4,11 +4,9 @@ 
encryptedkey: AQICAHgQW+3bag/cl2fPG3dPdqAPbfcsZuwI7rETXZsx85HRpgHz947ZuJNR+i0Bzv config: aws:region: us-east-1 xqwatcher:business_unit: residential-staging - xqwatcher:target_vpc: residential_mitx_staging_vpc xqwatcher:cluster: residential xqwatcher:namespace: mitx-staging-openedx xqwatcher:min_replicas: 1 - xqwatcher:max_replicas: 2 xqwatcher:docker_tag: latest vault:address: https://vault-qa.odl.mit.edu vault_server:env_namespace: operations.qa diff --git a/src/ol_infrastructure/applications/xqwatcher/Pulumi.applications.xqwatcher.mitx.CI.yaml b/src/ol_infrastructure/applications/xqwatcher/Pulumi.applications.xqwatcher.mitx.CI.yaml index 7651a8d173..c4164b2634 100644 --- a/src/ol_infrastructure/applications/xqwatcher/Pulumi.applications.xqwatcher.mitx.CI.yaml +++ b/src/ol_infrastructure/applications/xqwatcher/Pulumi.applications.xqwatcher.mitx.CI.yaml @@ -4,11 +4,9 @@ encryptedkey: AQICAHjnbqe9AmEW1Js10nySybyuAG7Fb5E9EHUgkmqFDv7PxQHQ1nYxdMdGpUV3lk config: aws:region: us-east-1 xqwatcher:business_unit: residential - xqwatcher:target_vpc: residential_mitx_vpc xqwatcher:cluster: residential xqwatcher:namespace: mitx-openedx xqwatcher:min_replicas: 1 - xqwatcher:max_replicas: 2 xqwatcher:docker_tag: latest vault:address: https://vault-ci.odl.mit.edu vault_server:env_namespace: operations.ci diff --git a/src/ol_infrastructure/applications/xqwatcher/Pulumi.applications.xqwatcher.mitx.Production.yaml b/src/ol_infrastructure/applications/xqwatcher/Pulumi.applications.xqwatcher.mitx.Production.yaml index 6507938648..fd469e35ee 100644 --- a/src/ol_infrastructure/applications/xqwatcher/Pulumi.applications.xqwatcher.mitx.Production.yaml +++ b/src/ol_infrastructure/applications/xqwatcher/Pulumi.applications.xqwatcher.mitx.Production.yaml @@ -4,11 +4,9 @@ encryptedkey: AQICAHjmo6C0sCNz3fdkFlhbu0tdBZxnHmPYSnqtmocvGiuNygG2bHWpHlBF4YM4HI config: aws:region: us-east-1 xqwatcher:business_unit: residential - xqwatcher:target_vpc: residential_mitx_vpc xqwatcher:cluster: residential 
xqwatcher:namespace: mitx-openedx xqwatcher:min_replicas: 2 - xqwatcher:max_replicas: 3 xqwatcher:docker_tag: latest vault:address: https://vault-production.odl.mit.edu vault_server:env_namespace: operations.production diff --git a/src/ol_infrastructure/applications/xqwatcher/Pulumi.applications.xqwatcher.mitx.QA.yaml b/src/ol_infrastructure/applications/xqwatcher/Pulumi.applications.xqwatcher.mitx.QA.yaml index 44013a9da2..45d68c947e 100644 --- a/src/ol_infrastructure/applications/xqwatcher/Pulumi.applications.xqwatcher.mitx.QA.yaml +++ b/src/ol_infrastructure/applications/xqwatcher/Pulumi.applications.xqwatcher.mitx.QA.yaml @@ -4,11 +4,9 @@ encryptedkey: AQICAHgQW+3bag/cl2fPG3dPdqAPbfcsZuwI7rETXZsx85HRpgFh6uALQ+g4+ZnTTR config: aws:region: us-east-1 xqwatcher:business_unit: residential - xqwatcher:target_vpc: residential_mitx_vpc xqwatcher:cluster: residential xqwatcher:namespace: mitx-openedx xqwatcher:min_replicas: 1 - xqwatcher:max_replicas: 2 xqwatcher:docker_tag: latest vault:address: https://vault-qa.odl.mit.edu vault_server:env_namespace: operations.qa diff --git a/src/ol_infrastructure/applications/xqwatcher/Pulumi.applications.xqwatcher.mitxonline.CI.yaml b/src/ol_infrastructure/applications/xqwatcher/Pulumi.applications.xqwatcher.mitxonline.CI.yaml index f593e1044a..2338a2380e 100644 --- a/src/ol_infrastructure/applications/xqwatcher/Pulumi.applications.xqwatcher.mitxonline.CI.yaml +++ b/src/ol_infrastructure/applications/xqwatcher/Pulumi.applications.xqwatcher.mitxonline.CI.yaml @@ -4,11 +4,9 @@ encryptedkey: AQICAHi3MZ/Pjy2dahB1Qm+zKkKDPV1b9MYPGp7k649HPjmOHAG+XE3l7voVbQN9bQ config: aws:region: us-east-1 xqwatcher:business_unit: mitxonline - xqwatcher:target_vpc: applications_vpc xqwatcher:cluster: applications xqwatcher:namespace: mitxonline-openedx xqwatcher:min_replicas: 1 - xqwatcher:max_replicas: 2 xqwatcher:docker_tag: latest vault:address: https://vault-ci.odl.mit.edu vault_server:env_namespace: operations.ci diff --git 
a/src/ol_infrastructure/applications/xqwatcher/Pulumi.applications.xqwatcher.mitxonline.Production.yaml b/src/ol_infrastructure/applications/xqwatcher/Pulumi.applications.xqwatcher.mitxonline.Production.yaml index d525f6334b..f36414513d 100644 --- a/src/ol_infrastructure/applications/xqwatcher/Pulumi.applications.xqwatcher.mitxonline.Production.yaml +++ b/src/ol_infrastructure/applications/xqwatcher/Pulumi.applications.xqwatcher.mitxonline.Production.yaml @@ -4,11 +4,9 @@ encryptedkey: AQICAHjmo6C0sCNz3fdkFlhbu0tdBZxnHmPYSnqtmocvGiuNygG5AzdO0QY0yXbhDG config: aws:region: us-east-1 xqwatcher:business_unit: mitxonline - xqwatcher:target_vpc: applications_vpc xqwatcher:cluster: applications xqwatcher:namespace: mitxonline-openedx xqwatcher:min_replicas: 2 - xqwatcher:max_replicas: 8 xqwatcher:docker_tag: latest vault:address: https://vault-production.odl.mit.edu vault_server:env_namespace: operations.production diff --git a/src/ol_infrastructure/applications/xqwatcher/Pulumi.applications.xqwatcher.mitxonline.QA.yaml b/src/ol_infrastructure/applications/xqwatcher/Pulumi.applications.xqwatcher.mitxonline.QA.yaml index 978b5be73f..6ed6539fa4 100644 --- a/src/ol_infrastructure/applications/xqwatcher/Pulumi.applications.xqwatcher.mitxonline.QA.yaml +++ b/src/ol_infrastructure/applications/xqwatcher/Pulumi.applications.xqwatcher.mitxonline.QA.yaml @@ -4,11 +4,9 @@ encryptedkey: AQICAHgQW+3bag/cl2fPG3dPdqAPbfcsZuwI7rETXZsx85HRpgHzGaCdqKWGOJ49Sa config: aws:region: us-east-1 xqwatcher:business_unit: mitxonline - xqwatcher:target_vpc: applications_vpc xqwatcher:cluster: applications xqwatcher:namespace: mitxonline-openedx xqwatcher:min_replicas: 1 - xqwatcher:max_replicas: 2 xqwatcher:docker_tag: latest vault:address: https://vault-qa.odl.mit.edu vault_server:env_namespace: operations.qa diff --git a/src/ol_infrastructure/applications/xqwatcher/__main__.py b/src/ol_infrastructure/applications/xqwatcher/__main__.py index e31ede09e2..e3af7ed50a 100644 --- 
a/src/ol_infrastructure/applications/xqwatcher/__main__.py +++ b/src/ol_infrastructure/applications/xqwatcher/__main__.py @@ -42,7 +42,6 @@ stack_info = parse_stack() xqwatcher_config = Config("xqwatcher") -network_stack = StackReference(f"infrastructure.aws.network.{stack_info.name}") vault_mount_stack = StackReference( f"substructure.vault.static_mounts.operations.{stack_info.name}" ) @@ -82,13 +81,12 @@ namespace = xqwatcher_config.get("namespace") or f"{stack_info.env_prefix}-openedx" docker_image_tag = ( - os.environ.get("XQWATCHER_DOCKER_DIGEST") + os.environ.get("XQWATCHER_DOCKER_TAG") or xqwatcher_config.get("docker_tag") or openedx_release ) min_replicas = xqwatcher_config.get_int("min_replicas") or 1 -max_replicas = xqwatcher_config.get_int("max_replicas") or 2 ################################## ## Vault Secret Data ## @@ -130,6 +128,7 @@ irsa_service_account_name="xqwatcher", vault_sync_service_account_names=f"xqwatcher-{stack_info.env_prefix}-vault", k8s_labels=k8s_global_labels, + create_irsa_service_account=True, ) ) @@ -141,7 +140,9 @@ # Grader handler config (queue names, ContainerGrader KWARGS, xqueue URL+auth). # Stored as `confd_json` in the Vault KV entry written above. -grader_config_secret_name = "xqwatcher-grader-config" # pragma: allowlist secret +grader_config_secret_name = ( + "xqwatcher-grader-config" # pragma: allowlist secret # noqa: S105 +) grader_config_secret = OLVaultK8SSecret( f"xqwatcher-{env_name}-grader-config-secret", OLVaultK8SStaticSecretConfig( @@ -156,9 +157,11 @@ refresh_after="1h", restart_target_kind="Deployment", restart_target_name="xqwatcher", - # Expose just the rendered JSON as a file-friendly key. + # Expose the rendered grader JSON and the HTTP Basic Auth credential + # used by the xqueue-watcher manager to authenticate with xqueue. 
templates={ "grader_config.json": "{{ .Secrets.confd_json }}", + "http_basic_auth": "{{ .Secrets.http_basic_auth }}", }, vaultauth=vault_k8s_resources.auth_name, ), @@ -197,7 +200,7 @@ "disable_existing_loggers": False, "formatters": { "default": { - "format": "%(asctime)s - %(filename)s:%(lineno)d -- %(funcName)s [%(levelname)s]: %(message)s", + "format": "%(asctime)s - %(filename)s:%(lineno)d -- %(funcName)s [%(levelname)s]: %(message)s", # noqa: E501 } }, "handlers": { @@ -313,8 +316,8 @@ containers=[ kubernetes.core.v1.ContainerArgs( name="xqueue-watcher", - image=f"ghcr.io/mitodl/xqueue-watcher:{docker_image_tag}", - image_pull_policy="IfNotPresent", + image=f"mitodl/xqueue-watcher:{docker_image_tag}", + image_pull_policy="Always", command=["xqueue-watcher"], args=[ "--config", @@ -324,6 +327,40 @@ "-d", "/xqwatcher/conf.d", ], + env=[ + # HTTP Basic Auth for the xqueue server endpoint. + # Value is "username:password"; sourced from the + # Vault-synced secret so it never appears in the + # Deployment spec. + kubernetes.core.v1.EnvVarArgs( + name="XQWATCHER_HTTP_BASIC_AUTH", + value_from=kubernetes.core.v1.EnvVarSourceArgs( + secret_key_ref=kubernetes.core.v1.SecretKeySelectorArgs( + name=grader_config_secret_name, + key="http_basic_auth", + optional=True, + ) + ), + ), + # Non-sensitive manager config values — match + # MANAGER_CONFIG_DEFAULTS in env_settings.py. + kubernetes.core.v1.EnvVarArgs( + name="XQWATCHER_POLL_TIME", value="10" + ), + kubernetes.core.v1.EnvVarArgs( + name="XQWATCHER_REQUESTS_TIMEOUT", value="1" + ), + kubernetes.core.v1.EnvVarArgs( + name="XQWATCHER_POLL_INTERVAL", value="1" + ), + kubernetes.core.v1.EnvVarArgs( + name="XQWATCHER_LOGIN_POLL_INTERVAL", value="5" + ), + kubernetes.core.v1.EnvVarArgs( + name="XQWATCHER_FOLLOW_CLIENT_REDIRECTS", + value="true", + ), + ], # Liveness: verify the Python runtime is functional. 
# The process will crash (and K8s will restart) on # persistent xqueue connectivity failures, so we rely on diff --git a/src/ol_infrastructure/components/applications/eks.py b/src/ol_infrastructure/components/applications/eks.py index bc7ef3e11e..af5bf75271 100644 --- a/src/ol_infrastructure/components/applications/eks.py +++ b/src/ol_infrastructure/components/applications/eks.py @@ -3,6 +3,7 @@ from pathlib import Path from typing import Any +import pulumi_kubernetes as kubernetes from pulumi import ComponentResource, Config, Output, ResourceOptions from pulumi_aws import get_caller_identity, iam from pulumi_vault import Policy @@ -49,6 +50,11 @@ class OLEKSAuthBindingConfig(BaseModel): k8s_labels: K8sGlobalLabels # Optional parliament config for IAM policy linting parliament_config: dict[str, Any] | None = None + # When True, create the K8s ServiceAccount object(s) for irsa_service_account_name + # with the eks.amazonaws.com/role-arn annotation so pods can reference them. + # Set to False (default) when the ServiceAccount is managed externally (e.g. by + # Helm) or already exists in the cluster. 
+ create_irsa_service_account: bool = False @model_validator(mode="after") def validate_vault_policy(self): @@ -73,6 +79,7 @@ class OLEKSAuthBinding(ComponentResource): irsa_role: iam.Role iam_policy: iam.Policy | None vault_k8s_resources: OLVaultK8SResources + irsa_service_accounts: list[kubernetes.core.v1.ServiceAccount] def __init__( self, @@ -141,6 +148,31 @@ def __init__( ) self.irsa_role = self.trust_role.role + if config.create_irsa_service_account: + sa_names = ( + [config.irsa_service_account_name] + if isinstance(config.irsa_service_account_name, str) + else config.irsa_service_account_name + ) + self.irsa_service_accounts = [ + kubernetes.core.v1.ServiceAccount( + f"{config.application_name}-{sa_name}-irsa-service-account", + metadata=kubernetes.meta.v1.ObjectMetaArgs( + name=sa_name, + namespace=config.namespace, + labels=config.k8s_labels.model_dump(), + annotations={ + "eks.amazonaws.com/role-arn": self.irsa_role.arn, + }, + ), + automount_service_account_token=False, + opts=ResourceOptions(parent=self), + ) + for sa_name in sa_names + ] + else: + self.irsa_service_accounts = [] + # Read Vault policy from file or use provided text vault_policy_text = ( config.vault_policy_path.read_text() @@ -195,6 +227,7 @@ def __init__( { "iam_policy": self.iam_policy, "irsa_role": self.irsa_role, + "irsa_service_accounts": self.irsa_service_accounts, "vault_policy": vault_policy, "vault_k8s_auth_role": k8s_auth_backend_role, "vault_k8s_resources": self.vault_k8s_resources, From 3b62704a883343f8dcda801a68acb34962159136 Mon Sep 17 00:00:00 2001 From: Tobias Macey Date: Wed, 18 Mar 2026 15:53:16 -0400 Subject: [PATCH 07/46] chore: Use xqwatcher image from dockerhub pull-through cache --- src/ol_infrastructure/applications/xqwatcher/__main__.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/ol_infrastructure/applications/xqwatcher/__main__.py b/src/ol_infrastructure/applications/xqwatcher/__main__.py index e3af7ed50a..d11c1ecdb1 100644 --- 
a/src/ol_infrastructure/applications/xqwatcher/__main__.py +++ b/src/ol_infrastructure/applications/xqwatcher/__main__.py @@ -27,7 +27,7 @@ OLVaultK8SSecret, OLVaultK8SStaticSecretConfig, ) -from ol_infrastructure.lib.aws.eks_helper import setup_k8s_provider +from ol_infrastructure.lib.aws.eks_helper import cached_image_uri, setup_k8s_provider from ol_infrastructure.lib.ol_types import AWSBase, K8sGlobalLabels, Services from ol_infrastructure.lib.pulumi_helper import parse_stack from ol_infrastructure.lib.vault import setup_vault_provider @@ -316,7 +316,9 @@ containers=[ kubernetes.core.v1.ContainerArgs( name="xqueue-watcher", - image=f"mitodl/xqueue-watcher:{docker_image_tag}", + image=cached_image_uri( + f"mitodl/xqueue-watcher:{docker_image_tag}" + ), image_pull_policy="Always", command=["xqueue-watcher"], args=[ From 0d21de115e381ae2e4d0cf0727dc1826aead6d89 Mon Sep 17 00:00:00 2001 From: Tobias Macey Date: Wed, 18 Mar 2026 16:27:42 -0400 Subject: [PATCH 08/46] fix(xqwatcher): use correct xqueue-watcher CLI arguments The manager CLI only accepts -d/--config_root; it auto-discovers xqwatcher.json and logging.json from that directory. Remove the non-existent --config and --logging-config flags. 
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- src/ol_infrastructure/applications/xqwatcher/__main__.py | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/src/ol_infrastructure/applications/xqwatcher/__main__.py b/src/ol_infrastructure/applications/xqwatcher/__main__.py index d11c1ecdb1..35a0b6a48b 100644 --- a/src/ol_infrastructure/applications/xqwatcher/__main__.py +++ b/src/ol_infrastructure/applications/xqwatcher/__main__.py @@ -321,14 +321,7 @@ ), image_pull_policy="Always", command=["xqueue-watcher"], - args=[ - "--config", - "/xqwatcher/conf.d/xqwatcher.json", - "--logging-config", - "/xqwatcher/conf.d/logging.json", - "-d", - "/xqwatcher/conf.d", - ], + args=["-d", "/xqwatcher/conf.d"], env=[ # HTTP Basic Auth for the xqueue server endpoint. # Value is "username:password"; sourced from the From 105f767ad3335abcc38dcefec72a14fb8248481b Mon Sep 17 00:00:00 2001 From: Tobias Macey Date: Wed, 18 Mar 2026 16:44:57 -0400 Subject: [PATCH 09/46] fix(xqwatcher): enable automount_service_account_token on Deployment pods ContainerGrader calls k8s_config.load_incluster_config() which reads the service account token from the projected volume at /var/run/secrets/kubernetes.io/serviceaccount/token. The xqwatcher ServiceAccount has automount_service_account_token=False (secure default), so the PodSpec must explicitly opt in to have the token mounted, otherwise all Kubernetes Job API calls will fail with a ConfigException. 
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- src/ol_infrastructure/applications/xqwatcher/__main__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/ol_infrastructure/applications/xqwatcher/__main__.py b/src/ol_infrastructure/applications/xqwatcher/__main__.py index 35a0b6a48b..bdee10bc16 100644 --- a/src/ol_infrastructure/applications/xqwatcher/__main__.py +++ b/src/ol_infrastructure/applications/xqwatcher/__main__.py @@ -302,6 +302,7 @@ ), spec=kubernetes.core.v1.PodSpecArgs( service_account_name="xqwatcher", + automount_service_account_token=True, # Spread replicas across nodes for HA topology_spread_constraints=[ kubernetes.core.v1.TopologySpreadConstraintArgs( From 365d19b17b1dae9651c9a5bbdbb3005d5f63ee7d Mon Sep 17 00:00:00 2001 From: Tobias Macey Date: Wed, 18 Mar 2026 16:51:03 -0400 Subject: [PATCH 10/46] fix(xqwatcher): reference container image by digest instead of mutable tag When the Concourse pipeline populates XQWATCHER_DOCKER_DIGEST, build the image ref as mitodl/xqueue-watcher@sha256:... (immutable digest) so Kubernetes always pulls exactly the image that was built and tested. Fall back to :tag from stack config only when the digest is unavailable (e.g. manual deploys). imagePullPolicy: Always is retained so new digests are always pulled on rollout. 
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../applications/xqwatcher/__main__.py | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/src/ol_infrastructure/applications/xqwatcher/__main__.py b/src/ol_infrastructure/applications/xqwatcher/__main__.py index bdee10bc16..4ec9241300 100644 --- a/src/ol_infrastructure/applications/xqwatcher/__main__.py +++ b/src/ol_infrastructure/applications/xqwatcher/__main__.py @@ -80,11 +80,15 @@ namespace = xqwatcher_config.get("namespace") or f"{stack_info.env_prefix}-openedx" -docker_image_tag = ( - os.environ.get("XQWATCHER_DOCKER_TAG") - or xqwatcher_config.get("docker_tag") - or openedx_release -) +docker_image_digest = os.environ.get("XQWATCHER_DOCKER_DIGEST") +docker_image_tag = xqwatcher_config.get("docker_tag") or openedx_release + +# Prefer an immutable digest (sha256:...) supplied by the pipeline over a +# mutable tag from stack config. Digests are referenced as image@sha256:... +if docker_image_digest: + docker_image_ref = f"mitodl/xqueue-watcher@{docker_image_digest}" +else: + docker_image_ref = f"mitodl/xqueue-watcher:{docker_image_tag}" min_replicas = xqwatcher_config.get_int("min_replicas") or 1 @@ -317,9 +321,7 @@ containers=[ kubernetes.core.v1.ContainerArgs( name="xqueue-watcher", - image=cached_image_uri( - f"mitodl/xqueue-watcher:{docker_image_tag}" - ), + image=cached_image_uri(docker_image_ref), image_pull_policy="Always", command=["xqueue-watcher"], args=["-d", "/xqwatcher/conf.d"], From 738d91a69919043bb9eac34d6c83e0cf021c8087 Mon Sep 17 00:00:00 2001 From: Tobias Macey Date: Wed, 18 Mar 2026 17:13:25 -0400 Subject: [PATCH 11/46] fix(xqwatcher): invoke entrypoint via uv run The uv virtualenv bin directory is not on PATH in the container, so the 'xqueue-watcher' console script can't be found directly. Use 'uv run xqueue-watcher' to invoke it through uv's environment, which correctly resolves the script installed in the project virtualenv. 
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- src/ol_infrastructure/applications/xqwatcher/__main__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ol_infrastructure/applications/xqwatcher/__main__.py b/src/ol_infrastructure/applications/xqwatcher/__main__.py index 4ec9241300..4928d2e773 100644 --- a/src/ol_infrastructure/applications/xqwatcher/__main__.py +++ b/src/ol_infrastructure/applications/xqwatcher/__main__.py @@ -323,7 +323,7 @@ name="xqueue-watcher", image=cached_image_uri(docker_image_ref), image_pull_policy="Always", - command=["xqueue-watcher"], + command=["uv", "run", "xqueue-watcher"], args=["-d", "/xqwatcher/conf.d"], env=[ # HTTP Basic Auth for the xqueue server endpoint. From 763968b3e96aa291d9932ed663508444a30bc702 Mon Sep 17 00:00:00 2001 From: Tobias Macey Date: Wed, 18 Mar 2026 17:15:58 -0400 Subject: [PATCH 12/46] fix(xqwatcher): pass --no-sync to uv run to skip dependency installation uv run without --no-sync attempts to sync the virtualenv at startup, which fails in the container (no write access / network). Use --no-sync to run the already-installed entrypoint as-is. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- src/ol_infrastructure/applications/xqwatcher/__main__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ol_infrastructure/applications/xqwatcher/__main__.py b/src/ol_infrastructure/applications/xqwatcher/__main__.py index 4928d2e773..3a3b7fbdb2 100644 --- a/src/ol_infrastructure/applications/xqwatcher/__main__.py +++ b/src/ol_infrastructure/applications/xqwatcher/__main__.py @@ -323,7 +323,7 @@ name="xqueue-watcher", image=cached_image_uri(docker_image_ref), image_pull_policy="Always", - command=["uv", "run", "xqueue-watcher"], + command=["uv", "run", "--no-sync", "xqueue-watcher"], args=["-d", "/xqwatcher/conf.d"], env=[ # HTTP Basic Auth for the xqueue server endpoint. 
From 950ebf2e56d6afa7f3799e416bb384868c4ed552 Mon Sep 17 00:00:00 2001 From: Tobias Macey Date: Wed, 18 Mar 2026 17:22:56 -0400 Subject: [PATCH 13/46] fix(xqwatcher): fix config directory structure for watcher discovery configure_from_directory(path) reads xqwatcher.json and logging.json directly from path, then globs path/conf.d/*.json for queue watcher configs. We were passing -d /xqwatcher/conf.d and mounting everything flat there, so the manager looked for watchers at /xqwatcher/conf.d/conf.d/*.json (not found). Fix: pass -d /xqwatcher and restructure mounts: /xqwatcher/xqwatcher.json <- manager config (ConfigMap) /xqwatcher/logging.json <- logging config (ConfigMap) /xqwatcher/conf.d/grader_config.json <- queue watchers (Vault secret) Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../applications/xqwatcher/__main__.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/src/ol_infrastructure/applications/xqwatcher/__main__.py b/src/ol_infrastructure/applications/xqwatcher/__main__.py index 3a3b7fbdb2..d1696fc2b1 100644 --- a/src/ol_infrastructure/applications/xqwatcher/__main__.py +++ b/src/ol_infrastructure/applications/xqwatcher/__main__.py @@ -324,7 +324,7 @@ image=cached_image_uri(docker_image_ref), image_pull_policy="Always", command=["uv", "run", "--no-sync", "xqueue-watcher"], - args=["-d", "/xqwatcher/conf.d"], + args=["-d", "/xqwatcher"], env=[ # HTTP Basic Auth for the xqueue server endpoint. # Value is "username:password"; sourced from the @@ -389,20 +389,22 @@ ), ), volume_mounts=[ - # Base poll settings from ConfigMap + # Manager config and logging config at the root of + # the -d directory; conf.d/ holds queue watcher configs. 
kubernetes.core.v1.VolumeMountArgs( name="xqwatcher-config", - mount_path="/xqwatcher/conf.d/xqwatcher.json", + mount_path="/xqwatcher/xqwatcher.json", sub_path="xqwatcher.json", read_only=True, ), kubernetes.core.v1.VolumeMountArgs( name="xqwatcher-config", - mount_path="/xqwatcher/conf.d/logging.json", + mount_path="/xqwatcher/logging.json", sub_path="logging.json", read_only=True, ), - # Per-queue grader handler config from Vault secret + # Per-queue grader handler config from Vault secret, + # placed under conf.d/ so the manager discovers it. kubernetes.core.v1.VolumeMountArgs( name="grader-config", mount_path="/xqwatcher/conf.d/grader_config.json", From d2cd7bcc71b976f21adab5416a114d7f4fbc7826 Mon Sep 17 00:00:00 2001 From: Tobias Macey Date: Wed, 18 Mar 2026 17:27:34 -0400 Subject: [PATCH 14/46] fix(xqwatcher): serialize confd_json to string before writing to Vault VSO renders secret values via Go templates: {{ .Secrets.confd_json }}. When confd_json is stored as a nested object, VSO renders a Go map literal (map[...]) rather than valid JSON, causing a JSONDecodeError at startup. Pre-serialize confd_json to a JSON string so the template renders parseable JSON. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- src/ol_infrastructure/applications/xqwatcher/__main__.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/ol_infrastructure/applications/xqwatcher/__main__.py b/src/ol_infrastructure/applications/xqwatcher/__main__.py index d1696fc2b1..d67e300c38 100644 --- a/src/ol_infrastructure/applications/xqwatcher/__main__.py +++ b/src/ol_infrastructure/applications/xqwatcher/__main__.py @@ -101,6 +101,12 @@ vault_secrets = read_yaml_secrets( Path(f"xqwatcher/secrets.{stack_info.env_prefix}.{stack_info.env_suffix}.yaml") ) +# VSO renders secret values using Go templates: {{ .Secrets.confd_json }}. +# If confd_json is stored as a nested object, VSO renders it as a Go map +# literal rather than JSON. 
Pre-serialize it to a JSON string so the +# template output is valid JSON that xqueue-watcher can parse. +if "confd_json" in vault_secrets and not isinstance(vault_secrets["confd_json"], str): + vault_secrets["confd_json"] = json.dumps(vault_secrets["confd_json"]) xqwatcher_vault_mount_name = vault_mount_stack.require_output("xqwatcher_kv")["path"] vault.kv.SecretV2( f"xqwatcher-{env_name}-grader-static-secrets", From e6a8d6566b4c94837a31253800ec76b8b492e40a Mon Sep 17 00:00:00 2001 From: Tobias Macey Date: Wed, 18 Mar 2026 19:17:12 -0400 Subject: [PATCH 15/46] fix(xqwatcher): require XQWATCHER_DOCKER_DIGEST; remove docker_tag fallback Match the keycloak pattern: require the digest env var so the image is always pinned to an immutable digest. Remove the mutable :latest tag fallback that allowed manual pulumi-up runs to silently deploy an uncontrolled image. Also remove the unused xqwatcher:docker_tag config key from all stack YAML files. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- ...umi.applications.xqwatcher.mitx-staging.CI.yaml | 1 - ...ications.xqwatcher.mitx-staging.Production.yaml | 1 - ...umi.applications.xqwatcher.mitx-staging.QA.yaml | 1 - .../Pulumi.applications.xqwatcher.mitx.CI.yaml | 1 - ...umi.applications.xqwatcher.mitx.Production.yaml | 1 - .../Pulumi.applications.xqwatcher.mitx.QA.yaml | 1 - ...ulumi.applications.xqwatcher.mitxonline.CI.yaml | 1 - ...plications.xqwatcher.mitxonline.Production.yaml | 1 - ...ulumi.applications.xqwatcher.mitxonline.QA.yaml | 1 - .../applications/xqwatcher/__main__.py | 14 +++++--------- 10 files changed, 5 insertions(+), 18 deletions(-) diff --git a/src/ol_infrastructure/applications/xqwatcher/Pulumi.applications.xqwatcher.mitx-staging.CI.yaml b/src/ol_infrastructure/applications/xqwatcher/Pulumi.applications.xqwatcher.mitx-staging.CI.yaml index 995cc3b520..9dd17a1cda 100644 --- a/src/ol_infrastructure/applications/xqwatcher/Pulumi.applications.xqwatcher.mitx-staging.CI.yaml +++ 
b/src/ol_infrastructure/applications/xqwatcher/Pulumi.applications.xqwatcher.mitx-staging.CI.yaml @@ -7,6 +7,5 @@ config: xqwatcher:cluster: residential xqwatcher:namespace: mitx-staging-openedx xqwatcher:min_replicas: 1 - xqwatcher:docker_tag: latest vault:address: https://vault-ci.odl.mit.edu vault_server:env_namespace: operations.ci diff --git a/src/ol_infrastructure/applications/xqwatcher/Pulumi.applications.xqwatcher.mitx-staging.Production.yaml b/src/ol_infrastructure/applications/xqwatcher/Pulumi.applications.xqwatcher.mitx-staging.Production.yaml index d61999f625..21f696187d 100644 --- a/src/ol_infrastructure/applications/xqwatcher/Pulumi.applications.xqwatcher.mitx-staging.Production.yaml +++ b/src/ol_infrastructure/applications/xqwatcher/Pulumi.applications.xqwatcher.mitx-staging.Production.yaml @@ -7,6 +7,5 @@ config: xqwatcher:cluster: residential xqwatcher:namespace: mitx-staging-openedx xqwatcher:min_replicas: 1 - xqwatcher:docker_tag: latest vault:address: https://vault-production.odl.mit.edu vault_server:env_namespace: operations.production diff --git a/src/ol_infrastructure/applications/xqwatcher/Pulumi.applications.xqwatcher.mitx-staging.QA.yaml b/src/ol_infrastructure/applications/xqwatcher/Pulumi.applications.xqwatcher.mitx-staging.QA.yaml index 154f44eafa..fe7dd890fd 100644 --- a/src/ol_infrastructure/applications/xqwatcher/Pulumi.applications.xqwatcher.mitx-staging.QA.yaml +++ b/src/ol_infrastructure/applications/xqwatcher/Pulumi.applications.xqwatcher.mitx-staging.QA.yaml @@ -7,6 +7,5 @@ config: xqwatcher:cluster: residential xqwatcher:namespace: mitx-staging-openedx xqwatcher:min_replicas: 1 - xqwatcher:docker_tag: latest vault:address: https://vault-qa.odl.mit.edu vault_server:env_namespace: operations.qa diff --git a/src/ol_infrastructure/applications/xqwatcher/Pulumi.applications.xqwatcher.mitx.CI.yaml b/src/ol_infrastructure/applications/xqwatcher/Pulumi.applications.xqwatcher.mitx.CI.yaml index c4164b2634..a83669d67f 100644 --- 
a/src/ol_infrastructure/applications/xqwatcher/Pulumi.applications.xqwatcher.mitx.CI.yaml +++ b/src/ol_infrastructure/applications/xqwatcher/Pulumi.applications.xqwatcher.mitx.CI.yaml @@ -7,6 +7,5 @@ config: xqwatcher:cluster: residential xqwatcher:namespace: mitx-openedx xqwatcher:min_replicas: 1 - xqwatcher:docker_tag: latest vault:address: https://vault-ci.odl.mit.edu vault_server:env_namespace: operations.ci diff --git a/src/ol_infrastructure/applications/xqwatcher/Pulumi.applications.xqwatcher.mitx.Production.yaml b/src/ol_infrastructure/applications/xqwatcher/Pulumi.applications.xqwatcher.mitx.Production.yaml index fd469e35ee..e3ca466f59 100644 --- a/src/ol_infrastructure/applications/xqwatcher/Pulumi.applications.xqwatcher.mitx.Production.yaml +++ b/src/ol_infrastructure/applications/xqwatcher/Pulumi.applications.xqwatcher.mitx.Production.yaml @@ -7,6 +7,5 @@ config: xqwatcher:cluster: residential xqwatcher:namespace: mitx-openedx xqwatcher:min_replicas: 2 - xqwatcher:docker_tag: latest vault:address: https://vault-production.odl.mit.edu vault_server:env_namespace: operations.production diff --git a/src/ol_infrastructure/applications/xqwatcher/Pulumi.applications.xqwatcher.mitx.QA.yaml b/src/ol_infrastructure/applications/xqwatcher/Pulumi.applications.xqwatcher.mitx.QA.yaml index 45d68c947e..1776ffe84b 100644 --- a/src/ol_infrastructure/applications/xqwatcher/Pulumi.applications.xqwatcher.mitx.QA.yaml +++ b/src/ol_infrastructure/applications/xqwatcher/Pulumi.applications.xqwatcher.mitx.QA.yaml @@ -7,6 +7,5 @@ config: xqwatcher:cluster: residential xqwatcher:namespace: mitx-openedx xqwatcher:min_replicas: 1 - xqwatcher:docker_tag: latest vault:address: https://vault-qa.odl.mit.edu vault_server:env_namespace: operations.qa diff --git a/src/ol_infrastructure/applications/xqwatcher/Pulumi.applications.xqwatcher.mitxonline.CI.yaml b/src/ol_infrastructure/applications/xqwatcher/Pulumi.applications.xqwatcher.mitxonline.CI.yaml index 2338a2380e..39748de90a 100644 
--- a/src/ol_infrastructure/applications/xqwatcher/Pulumi.applications.xqwatcher.mitxonline.CI.yaml +++ b/src/ol_infrastructure/applications/xqwatcher/Pulumi.applications.xqwatcher.mitxonline.CI.yaml @@ -7,6 +7,5 @@ config: xqwatcher:cluster: applications xqwatcher:namespace: mitxonline-openedx xqwatcher:min_replicas: 1 - xqwatcher:docker_tag: latest vault:address: https://vault-ci.odl.mit.edu vault_server:env_namespace: operations.ci diff --git a/src/ol_infrastructure/applications/xqwatcher/Pulumi.applications.xqwatcher.mitxonline.Production.yaml b/src/ol_infrastructure/applications/xqwatcher/Pulumi.applications.xqwatcher.mitxonline.Production.yaml index f36414513d..2ff68415aa 100644 --- a/src/ol_infrastructure/applications/xqwatcher/Pulumi.applications.xqwatcher.mitxonline.Production.yaml +++ b/src/ol_infrastructure/applications/xqwatcher/Pulumi.applications.xqwatcher.mitxonline.Production.yaml @@ -7,6 +7,5 @@ config: xqwatcher:cluster: applications xqwatcher:namespace: mitxonline-openedx xqwatcher:min_replicas: 2 - xqwatcher:docker_tag: latest vault:address: https://vault-production.odl.mit.edu vault_server:env_namespace: operations.production diff --git a/src/ol_infrastructure/applications/xqwatcher/Pulumi.applications.xqwatcher.mitxonline.QA.yaml b/src/ol_infrastructure/applications/xqwatcher/Pulumi.applications.xqwatcher.mitxonline.QA.yaml index 6ed6539fa4..f4a438ef4c 100644 --- a/src/ol_infrastructure/applications/xqwatcher/Pulumi.applications.xqwatcher.mitxonline.QA.yaml +++ b/src/ol_infrastructure/applications/xqwatcher/Pulumi.applications.xqwatcher.mitxonline.QA.yaml @@ -7,6 +7,5 @@ config: xqwatcher:cluster: applications xqwatcher:namespace: mitxonline-openedx xqwatcher:min_replicas: 1 - xqwatcher:docker_tag: latest vault:address: https://vault-qa.odl.mit.edu vault_server:env_namespace: operations.qa diff --git a/src/ol_infrastructure/applications/xqwatcher/__main__.py b/src/ol_infrastructure/applications/xqwatcher/__main__.py index 
d67e300c38..0b47bbe105 100644 --- a/src/ol_infrastructure/applications/xqwatcher/__main__.py +++ b/src/ol_infrastructure/applications/xqwatcher/__main__.py @@ -80,15 +80,11 @@ namespace = xqwatcher_config.get("namespace") or f"{stack_info.env_prefix}-openedx" -docker_image_digest = os.environ.get("XQWATCHER_DOCKER_DIGEST") -docker_image_tag = xqwatcher_config.get("docker_tag") or openedx_release - -# Prefer an immutable digest (sha256:...) supplied by the pipeline over a -# mutable tag from stack config. Digests are referenced as image@sha256:... -if docker_image_digest: - docker_image_ref = f"mitodl/xqueue-watcher@{docker_image_digest}" -else: - docker_image_ref = f"mitodl/xqueue-watcher:{docker_image_tag}" +if "XQWATCHER_DOCKER_DIGEST" not in os.environ: + msg = "XQWATCHER_DOCKER_DIGEST must be set" + raise ValueError(msg) +docker_image_digest = os.environ["XQWATCHER_DOCKER_DIGEST"] +docker_image_ref = f"mitodl/xqueue-watcher@{docker_image_digest}" min_replicas = xqwatcher_config.get_int("min_replicas") or 1 From 708b03ea71ddccfe02bd2cb63bba120b0d7533d9 Mon Sep 17 00:00:00 2001 From: Tobias Macey Date: Wed, 18 Mar 2026 19:29:00 -0400 Subject: [PATCH 16/46] feat(xqwatcher): rewrite ContainerGrader image through ECR pull-through cache When the SOPS secret's confd_json contains a ContainerGrader handler whose KWARGS include an 'image' key, rewrite that value through cached_image_uri() before writing to Vault. This means the SOPS secret stores a plain DockerHub reference (e.g. mitodl/mit-600x-grader:latest) and Pulumi transforms it to the ECR pull-through cache URI at deploy time, keeping grading Jobs free from DockerHub rate limits. 
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../applications/xqwatcher/__main__.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/src/ol_infrastructure/applications/xqwatcher/__main__.py b/src/ol_infrastructure/applications/xqwatcher/__main__.py index 0b47bbe105..fea4e191bb 100644 --- a/src/ol_infrastructure/applications/xqwatcher/__main__.py +++ b/src/ol_infrastructure/applications/xqwatcher/__main__.py @@ -97,6 +97,19 @@ vault_secrets = read_yaml_secrets( Path(f"xqwatcher/secrets.{stack_info.env_prefix}.{stack_info.env_suffix}.yaml") ) +# For ContainerGrader handlers: if the SOPS secret supplies a plain DockerHub +# image reference in KWARGS.image, rewrite it to use the ECR pull-through +# cache so the grading Jobs are not subject to DockerHub rate limits. +if isinstance(vault_secrets.get("confd_json"), dict): + for _queue_cfg in vault_secrets["confd_json"].values(): + for handler_cfg in _queue_cfg.get("HANDLERS", []): + if handler_cfg.get("HANDLER", "").endswith( + "ContainerGrader" + ) and "image" in handler_cfg.get("KWARGS", {}): + handler_cfg["KWARGS"]["image"] = cached_image_uri( + handler_cfg["KWARGS"]["image"] + ) + # VSO renders secret values using Go templates: {{ .Secrets.confd_json }}. # If confd_json is stored as a nested object, VSO renders it as a Go map # literal rather than JSON. Pre-serialize it to a JSON string so the From 37ed37b455748aa8e7fe007745fb78331137c9dc Mon Sep 17 00:00:00 2001 From: Tobias Macey Date: Thu, 19 Mar 2026 12:33:54 -0400 Subject: [PATCH 17/46] ci: suppress CodeQL actions extractor failure The CodeQL 'Analyze (actions)' job (exit code 32) fails because the extractor finds .github/workflows/*.yml and .github/actions/**/*.yml but cannot process any of them. This is a known extractor-level issue with CodeQL 2.24.x on Erk agent workflow patterns. Excluding .github from CodeQL's path analysis silences the fatal error while leaving Python and JavaScript/TypeScript scans unaffected. 
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .github/codeql/codeql-config.yml | 9 +++++++++ 1 file changed, 9 insertions(+) create mode 100644 .github/codeql/codeql-config.yml diff --git a/.github/codeql/codeql-config.yml b/.github/codeql/codeql-config.yml new file mode 100644 index 0000000000..0da51bc345 --- /dev/null +++ b/.github/codeql/codeql-config.yml @@ -0,0 +1,9 @@ +--- +# CodeQL configuration for Default Setup. +# The "actions" extractor (CodeQL 2.24+) fails on this repository because it +# detects the Erk agent workflow files but cannot extract any source code from +# them (exit code 32: "no code was successfully extracted"). Excluding the +# .github tree from actions analysis suppresses the fatal extractor error while +# keeping Python and JavaScript/TypeScript analysis fully intact. +paths-ignore: +- ".github" From 273bb960d3085bfb3e1f54f7e673f503140179bd Mon Sep 17 00:00:00 2001 From: Tobias Macey Date: Thu, 19 Mar 2026 14:31:40 -0400 Subject: [PATCH 18/46] feat(concourse): add reusable grader image build pipelines Add src/ol_concourse/pipelines/open_edx/grader_images/ with three pipeline definitions for building and publishing containerized course grader images to private ECR. base_image_pipeline.py: Builds grader_support/Dockerfile.base from the xqueue-watcher repo and pushes to both DockerHub (mitodl/xqueue-watcher-grader-base, public) and ECR (610119931565.dkr.ecr.us-east-1.amazonaws.com/mitodl/xqueue-watcher- grader-base, private). Triggered by changes to grader_support/ in the xqueue-watcher repo. The ECR push is the trigger source for downstream per-grader build pipelines. build_pipeline.py: GraderPipelineConfig dataclass and grader_image_pipeline() factory for per-grader-repo build pipelines. Triggered by new commits to the grader repo OR a new base image digest in ECR. The Docker build receives GRADER_BASE_IMAGE=repo@sha256:... 
resolved at runtime via a sh wrapper around oci-build-task's build script (the only way to inject a file-derived BUILD_ARG in Concourse; params are static strings). Pushes to private ECR only. GRADER_PIPELINES list seeded with graders-mit-600x. meta.py: Self-updating meta pipeline that creates and maintains the base image pipeline and one build pipeline per GRADER_PIPELINES entry. Triggered by changes to the grader_images/ pipeline code in ol-infrastructure. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../open_edx/grader_images/__init__.py | 0 .../grader_images/base_image_pipeline.py | 118 +++++++++ .../open_edx/grader_images/build_pipeline.py | 234 ++++++++++++++++++ .../pipelines/open_edx/grader_images/meta.py | 174 +++++++++++++ 4 files changed, 526 insertions(+) create mode 100644 src/ol_concourse/pipelines/open_edx/grader_images/__init__.py create mode 100644 src/ol_concourse/pipelines/open_edx/grader_images/base_image_pipeline.py create mode 100644 src/ol_concourse/pipelines/open_edx/grader_images/build_pipeline.py create mode 100644 src/ol_concourse/pipelines/open_edx/grader_images/meta.py diff --git a/src/ol_concourse/pipelines/open_edx/grader_images/__init__.py b/src/ol_concourse/pipelines/open_edx/grader_images/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/src/ol_concourse/pipelines/open_edx/grader_images/base_image_pipeline.py b/src/ol_concourse/pipelines/open_edx/grader_images/base_image_pipeline.py new file mode 100644 index 0000000000..2f5a506919 --- /dev/null +++ b/src/ol_concourse/pipelines/open_edx/grader_images/base_image_pipeline.py @@ -0,0 +1,118 @@ +""" +Pipeline that builds the xqueue-watcher grader base image and pushes it to +both DockerHub and ECR. + +The base image (grader_support/Dockerfile.base) is the foundation for all +course-specific grader images. 
Publishing it to both registries allows: + - DockerHub (mitodl/xqueue-watcher-grader-base): public reference usable + without AWS credentials; used in grader repo Dockerfiles as the default + GRADER_BASE_IMAGE build arg. + - ECR (mitodl/xqueue-watcher-grader-base): used by the per-grader Concourse + build pipelines as the trigger source, so a base image rebuild + automatically triggers downstream grader image rebuilds. + +Triggers: + - Push to the xqueue-watcher repo on paths under grader_support/. +""" + +import sys + +from ol_concourse.lib.containers import container_build_task +from ol_concourse.lib.models.fragment import PipelineFragment +from ol_concourse.lib.models.pipeline import ( + GetStep, + Identifier, + Input, + Job, + Pipeline, + PutStep, +) +from ol_concourse.lib.resources import git_repo, registry_image + +_AWS_ACCOUNT_ID = "610119931565" +_AWS_REGION = "us-east-1" +_BASE_IMAGE_REPO = "mitodl/xqueue-watcher-grader-base" +_ECR_BASE_IMAGE_REPO = ( + f"{_AWS_ACCOUNT_ID}.dkr.ecr.{_AWS_REGION}.amazonaws.com/{_BASE_IMAGE_REPO}" +) + + +def grader_base_image_pipeline() -> Pipeline: + """Return the pipeline that builds and publishes the grader base image.""" + xqwatcher_repo = git_repo( + name=Identifier("xqueue-watcher-code"), + uri="https://github.com/mitodl/xqueue-watcher", + branch="master", + paths=["grader_support/"], + ) + + # DockerHub push target — public, used by grader repo Dockerfiles as default + # GRADER_BASE_IMAGE build arg and accessible without AWS credentials. + dockerhub_base_image = registry_image( + name=Identifier("grader-base-dockerhub"), + image_repository=_BASE_IMAGE_REPO, + image_tag="latest", + username="((dockerhub.username))", + password="((dockerhub.password))", # noqa: S106 + ) + + # ECR push target — used as the trigger source for per-grader build + # pipelines so that a base image rebuild causes downstream rebuilds. 
+ ecr_base_image = registry_image( + name=Identifier("grader-base-ecr"), + image_repository=_ECR_BASE_IMAGE_REPO, + image_tag="latest", + ecr_region=_AWS_REGION, + ) + + build_job = Job( + name=Identifier("build-grader-base-image"), + plan=[ + GetStep(get=xqwatcher_repo.name, trigger=True), + container_build_task( + inputs=[Input(name=xqwatcher_repo.name)], + build_parameters={ + "CONTEXT": f"{xqwatcher_repo.name}/grader_support", + "DOCKERFILE": f"{xqwatcher_repo.name}/grader_support/Dockerfile.base", + }, + ), + # Push to DockerHub first — fail fast if credentials are wrong + # before consuming the ECR push quota. + PutStep( + put=dockerhub_base_image.name, + params={ + "image": "image/image.tar", + "additional_tags": f"./{xqwatcher_repo.name}/.git/describe_ref", + }, + ), + PutStep( + put=ecr_base_image.name, + params={ + "image": "image/image.tar", + "additional_tags": f"./{xqwatcher_repo.name}/.git/describe_ref", + }, + ), + ], + ) + + fragment = PipelineFragment( + resources=[xqwatcher_repo, dockerhub_base_image, ecr_base_image], + jobs=[build_job], + ) + + return Pipeline( + resource_types=fragment.resource_types, + resources=fragment.resources, + jobs=fragment.jobs, + ) + + +if __name__ == "__main__": + pipeline_json = grader_base_image_pipeline().model_dump_json(indent=2) + with open("definition.json", "w") as definition: # noqa: PTH123 + definition.write(pipeline_json) + sys.stdout.write(pipeline_json) + sys.stdout.write( + "\nfly -t set-pipeline" + " -p build-grader-base-image -c definition.json\n" + ) diff --git a/src/ol_concourse/pipelines/open_edx/grader_images/build_pipeline.py b/src/ol_concourse/pipelines/open_edx/grader_images/build_pipeline.py new file mode 100644 index 0000000000..3f1f15bc03 --- /dev/null +++ b/src/ol_concourse/pipelines/open_edx/grader_images/build_pipeline.py @@ -0,0 +1,234 @@ +""" +Reusable pipeline builder for course-specific grader images. + +Each grader repository (e.g. 
graders-mit-600x) extends the xqueue-watcher +grader base image with course-specific grader scripts and dependencies. +This module provides a ``GraderPipelineConfig`` dataclass and a +``grader_image_pipeline()`` factory that returns a ``Pipeline`` for building +and pushing that course image to a private ECR repository. + +Triggers: + - New commit to the grader repo (grader scripts or Dockerfile changed). + - New digest of the grader base image in ECR (base image rebuilt / security + patch applied). + +The base image digest is resolved at build time by reading the ``repository`` +and ``digest`` files that Concourse's ``registry-image`` resource writes for +every fetched image. The resolved ``repo@sha256:…`` reference is injected +into the Docker build as ``GRADER_BASE_IMAGE`` via a shell wrapper around the +``oci-build-task``'s ``build`` script so that the build layer cache is +correctly invalidated and the published image records the exact base used. +""" + +import dataclasses +import sys + +from ol_concourse.lib.models.fragment import PipelineFragment +from ol_concourse.lib.models.pipeline import ( + Cache, + Command, + GetStep, + Identifier, + Input, + Job, + Output, + Pipeline, + Platform, + PutStep, + TaskConfig, + TaskStep, +) +from ol_concourse.lib.resources import git_repo, registry_image + +_AWS_ACCOUNT_ID = "610119931565" +_AWS_REGION = "us-east-1" +_ECR_BASE_IMAGE_REPO = ( + f"{_AWS_ACCOUNT_ID}.dkr.ecr.{_AWS_REGION}.amazonaws.com" + "/mitodl/xqueue-watcher-grader-base" +) + + +@dataclasses.dataclass +class GraderPipelineConfig: + """Parameters for building and publishing one course-specific grader image. + + Attributes: + pipeline_name: Short identifier used in resource/job names and the + Concourse pipeline name, e.g. ``"graders-mit-600x"``. + grader_repo_url: HTTPS URL of the grader repository, e.g. + ``"https://github.com/mitodl/graders-mit-600x"``. + grader_repo_branch: Branch to track, e.g. ``"main"`` or ``"master"``. 
+ ecr_repo_name: ECR repository path (without the registry host), e.g. + ``"mitodl/graders-mit-600x"``. The full URI is constructed as + ``{aws_account_id}.dkr.ecr.{aws_region}.amazonaws.com/{ecr_repo_name}``. + grader_base_ecr_repo: Full ECR URI of the grader base image used as the + build trigger and ``GRADER_BASE_IMAGE`` build arg. Defaults to + the standard MIT OL base image URI. + aws_account_id: AWS account ID that hosts the ECR registry. + aws_region: AWS region for ECR authentication and URI construction. + """ + + pipeline_name: str + grader_repo_url: str + grader_repo_branch: str + ecr_repo_name: str + grader_base_ecr_repo: str = _ECR_BASE_IMAGE_REPO + aws_account_id: str = _AWS_ACCOUNT_ID + aws_region: str = _AWS_REGION + + @property + def ecr_image_uri(self) -> str: + return ( + f"{self.aws_account_id}.dkr.ecr.{self.aws_region}.amazonaws.com" + f"/{self.ecr_repo_name}" + ) + + +def grader_image_pipeline(config: GraderPipelineConfig) -> Pipeline: + """Return a Pipeline that builds and pushes a course-specific grader image. + + The pipeline contains a single build job that: + 1. Watches the grader repo for new commits (trigger). + 2. Watches the grader base image in ECR for updates (trigger). + 3. Builds the Dockerfile in the root of the grader repo. A shell + wrapper reads the ``repository`` and ``digest`` files written by the + ``registry-image`` resource and sets ``BUILD_ARG_GRADER_BASE_IMAGE`` + to the immutable ``repo@sha256:…`` reference before invoking the + ``oci-build-task``'s ``build`` script. + 4. Pushes the resulting image to private ECR. + + Args: + config: Pipeline configuration for the grader repository. + + Returns: + A ``Pipeline`` object suitable for serialisation to Concourse YAML/JSON. 
+ """ + grader_repo = git_repo( + name=Identifier(f"{config.pipeline_name}-code"), + uri=config.grader_repo_url, + branch=config.grader_repo_branch, + ) + + # Grader base image in ECR — used as a build trigger so that rebuilding + # the base image automatically causes this pipeline to run. + grader_base_image = registry_image( + name=Identifier("grader-base-image"), + image_repository=config.grader_base_ecr_repo, + image_tag="latest", + ecr_region=config.aws_region, + ) + + # Private ECR image for this course's grader. + grader_ecr_image = registry_image( + name=Identifier(f"{config.pipeline_name}-image"), + image_repository=config.ecr_image_uri, + image_tag="latest", + ecr_region=config.aws_region, + ) + + # The registry-image resource writes `repository` and `digest` files into + # the fetched directory. We read them inside the task via a shell wrapper + # that sets BUILD_ARG_GRADER_BASE_IMAGE=repo@sha256:… before exec-ing the + # oci-build-task `build` script. This pins the base image to the exact + # digest that triggered the pipeline run, ensuring reproducibility and + # correct Docker layer-cache invalidation. + # + # Note: oci-build-task `params` are env vars injected verbatim — shell + # expressions like $(cat …) are NOT evaluated there. The `run.args` shell + # wrapper is the only way to dynamically set a BUILD_ARG from a file. 
+ base_ref = grader_base_image.name + build_job = Job( + name=Identifier(f"build-{config.pipeline_name}-image"), + plan=[ + GetStep(get=grader_repo.name, trigger=True), + GetStep(get=grader_base_image.name, trigger=True), + TaskStep( + task=Identifier("build-container-image"), + privileged=True, + config=TaskConfig( + platform=Platform.linux, + image_resource={ + "type": "registry-image", + "source": {"repository": "concourse/oci-build-task"}, + }, + params={ + "CONTEXT": str(grader_repo.name), + "DOCKERFILE": f"{grader_repo.name}/Dockerfile", + }, + caches=[Cache(path="cache")], + inputs=[ + Input(name=grader_repo.name), + Input(name=grader_base_image.name), + ], + outputs=[Output(name=Identifier("image"))], + # Read the base image digest file at runtime and export it + # as BUILD_ARG_GRADER_BASE_IMAGE before running `build`. + run=Command( + path="sh", + args=[ + "-euc", + ( + f"export BUILD_ARG_GRADER_BASE_IMAGE=" + f'"$(cat {base_ref}/repository)@$(cat {base_ref}/digest)"' + " && exec build" + ), + ], + ), + ), + ), + PutStep( + put=grader_ecr_image.name, + params={ + "image": "image/image.tar", + "additional_tags": ( + f"./{grader_repo.name}/.git/describe_ref" + ), + }, + ), + ], + ) + + fragment = PipelineFragment( + resources=[grader_repo, grader_base_image, grader_ecr_image], + jobs=[build_job], + ) + + return Pipeline( + resource_types=fragment.resource_types, + resources=fragment.resources, + jobs=fragment.jobs, + ) + + +# --------------------------------------------------------------------------- +# Configured grader pipelines +# --------------------------------------------------------------------------- + +GRADER_PIPELINES: list[GraderPipelineConfig] = [ + GraderPipelineConfig( + pipeline_name="graders-mit-600x", + grader_repo_url="https://github.com/mitodl/graders-mit-600x", + grader_repo_branch="main", + ecr_repo_name="mitodl/graders-mit-600x", + ), +] + + +if __name__ == "__main__": + pipeline_name = sys.argv[1] + config = next( + (p for p in 
GRADER_PIPELINES if p.pipeline_name == pipeline_name), None + ) + if config is None: + sys.exit( + f"Unknown pipeline name {pipeline_name!r}. " + f"Available: {[p.pipeline_name for p in GRADER_PIPELINES]}" + ) + pipeline_json = grader_image_pipeline(config).model_dump_json(indent=2) + with open("definition.json", "w") as definition: # noqa: PTH123 + definition.write(pipeline_json) + sys.stdout.write(pipeline_json) + sys.stdout.write( + f"\nfly -t set-pipeline" + f" -p build-{pipeline_name}-image -c definition.json\n" + ) diff --git a/src/ol_concourse/pipelines/open_edx/grader_images/meta.py b/src/ol_concourse/pipelines/open_edx/grader_images/meta.py new file mode 100644 index 0000000000..f285841980 --- /dev/null +++ b/src/ol_concourse/pipelines/open_edx/grader_images/meta.py @@ -0,0 +1,174 @@ +""" +Meta pipeline for grader image build pipelines. + +Creates and maintains two types of pipelines: + 1. A base image pipeline (build-grader-base-image) that builds + grader_support/Dockerfile.base from the xqueue-watcher repo and pushes + to both DockerHub and ECR. + 2. One build pipeline per entry in GRADER_PIPELINES that builds and pushes + a course-specific grader image to private ECR. + +This meta pipeline is self-updating: the "create-grader-images-meta-pipeline" +job re-sets itself whenever the pipeline code in ol-infrastructure changes. 
+ +Usage: + fly -t <target> set-pipeline -p grader-images-meta -c definition.json +""" + +import sys + +from ol_concourse.lib.models.pipeline import ( + AnonymousResource, + Command, + GetStep, + Identifier, + Input, + Job, + Output, + Pipeline, + Platform, + SetPipelineStep, + TaskConfig, + TaskStep, +) +from ol_concourse.lib.resources import git_repo +from ol_concourse.pipelines.open_edx.grader_images.build_pipeline import ( + GRADER_PIPELINES, +) + +_PIPELINE_CODE_PATHS = [ + "src/ol_concourse/lib/", + "src/ol_concourse/pipelines/open_edx/grader_images/", +] + +pipeline_code = git_repo( + name=Identifier("grader-images-pipeline-code"), + uri="https://github.com/mitodl/ol-infrastructure", + branch="main", + paths=_PIPELINE_CODE_PATHS, +) + +_OL_INFRA_IMAGE = AnonymousResource( + type="registry-image", + source={ + "repository": "mitodl/ol-infrastructure", + "tag": "latest", + }, +) + + +def _generate_pipeline_task( + task_name: str, script_path: str, script_args: list[str] +) -> TaskStep: + """Return a TaskStep that runs a pipeline-definition script. + + The script writes ``definition.json`` to the ``pipeline`` output directory, + which the subsequent ``SetPipelineStep`` reads. 
+ """ + return TaskStep( + task=Identifier(task_name), + config=TaskConfig( + platform=Platform.linux, + image_resource=_OL_INFRA_IMAGE, + inputs=[Input(name=pipeline_code.name)], + outputs=[Output(name=Identifier("pipeline"))], + run=Command( + path="python", + dir="pipeline", + user="root", + args=[f"../{pipeline_code.name}/{script_path}", *script_args], + ), + ), + ) + + +def _build_base_image_meta_job() -> Job: + """Job that creates/updates the grader base image build pipeline.""" + return Job( + name=Identifier("create-grader-base-image-pipeline"), + plan=[ + GetStep(get=pipeline_code.name, trigger=True), + _generate_pipeline_task( + task_name="generate-base-image-pipeline-definition", + script_path=( + "src/ol_concourse/pipelines/open_edx/" + "grader_images/base_image_pipeline.py" + ), + script_args=[], + ), + SetPipelineStep( + team="infrastructure", + set_pipeline=Identifier("build-grader-base-image"), + file="pipeline/definition.json", + ), + ], + ) + + +def _build_grader_meta_job(pipeline_name: str) -> Job: + """Job that creates/updates the build pipeline for one grader repo.""" + return Job( + name=Identifier(f"create-{pipeline_name}-pipeline"), + plan=[ + GetStep(get=pipeline_code.name, trigger=True), + _generate_pipeline_task( + task_name=f"generate-{pipeline_name}-pipeline-definition", + script_path=( + "src/ol_concourse/pipelines/open_edx/" + "grader_images/build_pipeline.py" + ), + script_args=[pipeline_name], + ), + SetPipelineStep( + team="infrastructure", + set_pipeline=Identifier(f"build-{pipeline_name}-image"), + file="pipeline/definition.json", + ), + ], + ) + + +def _build_self_update_job() -> Job: + """Job that keeps the meta pipeline itself in sync with the repo.""" + return Job( + name=Identifier("create-grader-images-meta-pipeline"), + plan=[ + GetStep(get=pipeline_code.name, trigger=True), + _generate_pipeline_task( + task_name="generate-meta-pipeline-definition", + script_path=( + "src/ol_concourse/pipelines/open_edx/" + 
"grader_images/meta.py" + ), + script_args=[], + ), + SetPipelineStep( + team="main", + set_pipeline="self", + file="pipeline/definition.json", + ), + ], + ) + + +meta_jobs = [ + _build_self_update_job(), + _build_base_image_meta_job(), + *[ + _build_grader_meta_job(config.pipeline_name) + for config in GRADER_PIPELINES + ], +] + +meta_pipeline = Pipeline(resources=[pipeline_code], jobs=meta_jobs) + + +if __name__ == "__main__": + pipeline_json = meta_pipeline.model_dump_json(indent=2) + with open("definition.json", "w") as definition: # noqa: PTH123 + definition.write(pipeline_json) + sys.stdout.write(pipeline_json) + sys.stdout.write( + "\nfly -t set-pipeline" + " -p grader-images-meta -c definition.json\n" + ) From 02460a5132bd827447bf838e15f1e9ab20ab9d76 Mon Sep 17 00:00:00 2001 From: Tobias Macey Date: Thu, 19 Mar 2026 14:39:47 -0400 Subject: [PATCH 19/46] fix(concourse): track current working branches for grader image pipelines - base_image_pipeline: use chore/migrate-to-uv-and-k8s-container-grader branch of xqueue-watcher (where Dockerfile.base updates live) - build_pipeline: track feat/containerized-grader for graders-mit-600x - Fix E501 in both files: split long strings to stay within 88-char limit Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../open_edx/grader_images/base_image_pipeline.py | 9 +++++---- .../pipelines/open_edx/grader_images/build_pipeline.py | 9 ++++----- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/src/ol_concourse/pipelines/open_edx/grader_images/base_image_pipeline.py b/src/ol_concourse/pipelines/open_edx/grader_images/base_image_pipeline.py index 2f5a506919..4b06cec78e 100644 --- a/src/ol_concourse/pipelines/open_edx/grader_images/base_image_pipeline.py +++ b/src/ol_concourse/pipelines/open_edx/grader_images/base_image_pipeline.py @@ -42,7 +42,7 @@ def grader_base_image_pipeline() -> Pipeline: xqwatcher_repo = git_repo( name=Identifier("xqueue-watcher-code"), 
uri="https://github.com/mitodl/xqueue-watcher", - branch="master", + branch="chore/migrate-to-uv-and-k8s-container-grader", paths=["grader_support/"], ) @@ -73,7 +73,9 @@ def grader_base_image_pipeline() -> Pipeline: inputs=[Input(name=xqwatcher_repo.name)], build_parameters={ "CONTEXT": f"{xqwatcher_repo.name}/grader_support", - "DOCKERFILE": f"{xqwatcher_repo.name}/grader_support/Dockerfile.base", + "DOCKERFILE": ( + f"{xqwatcher_repo.name}/grader_support/Dockerfile.base" + ), }, ), # Push to DockerHub first — fail fast if credentials are wrong @@ -113,6 +115,5 @@ def grader_base_image_pipeline() -> Pipeline: definition.write(pipeline_json) sys.stdout.write(pipeline_json) sys.stdout.write( - "\nfly -t set-pipeline" - " -p build-grader-base-image -c definition.json\n" + "\nfly -t set-pipeline -p build-grader-base-image -c definition.json\n" ) diff --git a/src/ol_concourse/pipelines/open_edx/grader_images/build_pipeline.py b/src/ol_concourse/pipelines/open_edx/grader_images/build_pipeline.py index 3f1f15bc03..53d3acb3db 100644 --- a/src/ol_concourse/pipelines/open_edx/grader_images/build_pipeline.py +++ b/src/ol_concourse/pipelines/open_edx/grader_images/build_pipeline.py @@ -169,7 +169,8 @@ def grader_image_pipeline(config: GraderPipelineConfig) -> Pipeline: "-euc", ( f"export BUILD_ARG_GRADER_BASE_IMAGE=" - f'"$(cat {base_ref}/repository)@$(cat {base_ref}/digest)"' + f'"$(cat {base_ref}/repository)' + f'@$(cat {base_ref}/digest)"' " && exec build" ), ], @@ -180,9 +181,7 @@ def grader_image_pipeline(config: GraderPipelineConfig) -> Pipeline: put=grader_ecr_image.name, params={ "image": "image/image.tar", - "additional_tags": ( - f"./{grader_repo.name}/.git/describe_ref" - ), + "additional_tags": (f"./{grader_repo.name}/.git/describe_ref"), }, ), ], @@ -208,7 +207,7 @@ def grader_image_pipeline(config: GraderPipelineConfig) -> Pipeline: GraderPipelineConfig( pipeline_name="graders-mit-600x", grader_repo_url="https://github.com/mitodl/graders-mit-600x", - 
grader_repo_branch="main", + grader_repo_branch="feat/containerized-grader", ecr_repo_name="mitodl/graders-mit-600x", ), ] From f8c786c9fd47d67e3698ac126b4692f216d48732 Mon Sep 17 00:00:00 2001 From: Tobias Macey Date: Thu, 19 Mar 2026 14:40:40 -0400 Subject: [PATCH 20/46] fix(concourse): set build context to repo root for grader base image The CONTEXT was grader_support/ which caused the COPY grader_support/ instruction in Dockerfile.base to fail (no nested grader_support/ inside the context). Use the repo root as CONTEXT so the COPY can locate the directory relative to it. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../pipelines/open_edx/grader_images/base_image_pipeline.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ol_concourse/pipelines/open_edx/grader_images/base_image_pipeline.py b/src/ol_concourse/pipelines/open_edx/grader_images/base_image_pipeline.py index 4b06cec78e..a05d6dd7d7 100644 --- a/src/ol_concourse/pipelines/open_edx/grader_images/base_image_pipeline.py +++ b/src/ol_concourse/pipelines/open_edx/grader_images/base_image_pipeline.py @@ -72,7 +72,7 @@ def grader_base_image_pipeline() -> Pipeline: container_build_task( inputs=[Input(name=xqwatcher_repo.name)], build_parameters={ - "CONTEXT": f"{xqwatcher_repo.name}/grader_support", + "CONTEXT": str(xqwatcher_repo.name), "DOCKERFILE": ( f"{xqwatcher_repo.name}/grader_support/Dockerfile.base" ), From 1a3cc07d8274b5152e060f94a16f91ba76c87f36 Mon Sep 17 00:00:00 2001 From: Tobias Macey Date: Thu, 19 Mar 2026 14:51:20 -0400 Subject: [PATCH 21/46] feat(concourse): ensure ECR repositories exist before pushing grader images Add ensure_ecr_task() helper to ol_concourse/lib/containers.py (mirrors the pattern used in the dagster docker_pulumi_pipeline). The task runs the AWS CLI to check for the ECR repository and creates it if missing, so the first pipeline run does not fail on a missing registry. 
Apply to both grader image pipelines: - base_image_pipeline: ensures mitodl/xqueue-watcher-grader-base exists before pushing to ECR - build_pipeline: ensures the per-grader ECR repo (config.ecr_repo_name) exists before pushing the course grader image Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- src/ol_concourse/lib/containers.py | 41 +++++++++++++++++++ .../grader_images/base_image_pipeline.py | 3 +- .../open_edx/grader_images/build_pipeline.py | 2 + 3 files changed, 45 insertions(+), 1 deletion(-) diff --git a/src/ol_concourse/lib/containers.py b/src/ol_concourse/lib/containers.py index ff4bb72653..c275da4c81 100644 --- a/src/ol_concourse/lib/containers.py +++ b/src/ol_concourse/lib/containers.py @@ -1,5 +1,6 @@ from ol_concourse.lib.jobs.infrastructure import Output from ol_concourse.lib.models.pipeline import ( + AnonymousResource, Cache, Command, Identifier, @@ -35,3 +36,43 @@ def container_build_task( outputs=[Output(name=Identifier("image"))], ), ) + + +def ensure_ecr_task(ecr_repo_name: str) -> TaskStep: + """Return a TaskStep that creates an ECR repository if it does not exist. + + Uses the AWS CLI with instance credentials (IRSA / worker IAM role). + Safe to run on every pipeline execution: ``describe-repositories`` is a + no-op when the repo already exists, and ``create-repository`` only runs + when it does not. + + Args: + ecr_repo_name: The ECR repository name *without* the registry host, + e.g. ``"mitodl/graders-mit-600x"``. 
+ """ + return TaskStep( + task=Identifier("ensure-ecr-repository"), + config=TaskConfig( + platform="linux", + image_resource=AnonymousResource( + type="registry-image", + source={"repository": "amazon/aws-cli", "tag": "latest"}, + ), + params={ + "REPO_NAME": ecr_repo_name, + "AWS_PAGER": "cat", + }, + run=Command( + path="sh", + args=[ + "-exc", + ( + "aws ecr describe-repositories" + " --repository-names ${REPO_NAME}" + " || aws ecr create-repository" + " --repository-name ${REPO_NAME}" + ), + ], + ), + ), + ) diff --git a/src/ol_concourse/pipelines/open_edx/grader_images/base_image_pipeline.py b/src/ol_concourse/pipelines/open_edx/grader_images/base_image_pipeline.py index a05d6dd7d7..06c19e4703 100644 --- a/src/ol_concourse/pipelines/open_edx/grader_images/base_image_pipeline.py +++ b/src/ol_concourse/pipelines/open_edx/grader_images/base_image_pipeline.py @@ -17,7 +17,7 @@ import sys -from ol_concourse.lib.containers import container_build_task +from ol_concourse.lib.containers import container_build_task, ensure_ecr_task from ol_concourse.lib.models.fragment import PipelineFragment from ol_concourse.lib.models.pipeline import ( GetStep, @@ -78,6 +78,7 @@ def grader_base_image_pipeline() -> Pipeline: ), }, ), + ensure_ecr_task(_BASE_IMAGE_REPO), # Push to DockerHub first — fail fast if credentials are wrong # before consuming the ECR push quota. 
PutStep( diff --git a/src/ol_concourse/pipelines/open_edx/grader_images/build_pipeline.py b/src/ol_concourse/pipelines/open_edx/grader_images/build_pipeline.py index 53d3acb3db..71ac2e3ef6 100644 --- a/src/ol_concourse/pipelines/open_edx/grader_images/build_pipeline.py +++ b/src/ol_concourse/pipelines/open_edx/grader_images/build_pipeline.py @@ -23,6 +23,7 @@ import dataclasses import sys +from ol_concourse.lib.containers import ensure_ecr_task from ol_concourse.lib.models.fragment import PipelineFragment from ol_concourse.lib.models.pipeline import ( Cache, @@ -177,6 +178,7 @@ def grader_image_pipeline(config: GraderPipelineConfig) -> Pipeline: ), ), ), + ensure_ecr_task(config.ecr_repo_name), PutStep( put=grader_ecr_image.name, params={ From 019873471acc7ff235366874a24055ff615e3ceb Mon Sep 17 00:00:00 2001 From: Tobias Macey Date: Thu, 19 Mar 2026 15:26:15 -0400 Subject: [PATCH 22/46] fix: pass repo-name-only to registry-image ECR resources When ecr_region is set, the registry-image resource automatically constructs the full ECR URI as {account}.dkr.ecr.{region}.amazonaws.com/{repository}. Passing the full URI in image_repository caused the hostname to be doubled in API calls, resulting in NAME_UNKNOWN errors. 
- Remove ecr_image_uri property from GraderPipelineConfig - Fix grader_base_ecr_repo default to use repo-name-only string - Change registry_image(image_repository=config.ecr_image_uri) to registry_image(image_repository=config.ecr_repo_name) Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../open_edx/grader_images/build_pipeline.py | 30 +++++++------------ 1 file changed, 10 insertions(+), 20 deletions(-) diff --git a/src/ol_concourse/pipelines/open_edx/grader_images/build_pipeline.py b/src/ol_concourse/pipelines/open_edx/grader_images/build_pipeline.py index 71ac2e3ef6..4f42233c36 100644 --- a/src/ol_concourse/pipelines/open_edx/grader_images/build_pipeline.py +++ b/src/ol_concourse/pipelines/open_edx/grader_images/build_pipeline.py @@ -43,10 +43,6 @@ _AWS_ACCOUNT_ID = "610119931565" _AWS_REGION = "us-east-1" -_ECR_BASE_IMAGE_REPO = ( - f"{_AWS_ACCOUNT_ID}.dkr.ecr.{_AWS_REGION}.amazonaws.com" - "/mitodl/xqueue-watcher-grader-base" -) @dataclasses.dataclass @@ -59,31 +55,25 @@ class GraderPipelineConfig: grader_repo_url: HTTPS URL of the grader repository, e.g. ``"https://github.com/mitodl/graders-mit-600x"``. grader_repo_branch: Branch to track, e.g. ``"main"`` or ``"master"``. - ecr_repo_name: ECR repository path (without the registry host), e.g. - ``"mitodl/graders-mit-600x"``. The full URI is constructed as - ``{aws_account_id}.dkr.ecr.{aws_region}.amazonaws.com/{ecr_repo_name}``. - grader_base_ecr_repo: Full ECR URI of the grader base image used as the - build trigger and ``GRADER_BASE_IMAGE`` build arg. Defaults to - the standard MIT OL base image URI. + ecr_repo_name: ECR repository name (without the registry host), e.g. + ``"mitodl/graders-mit-600x"``. Passed directly to the + ``registry-image`` resource; ``ecr_region`` causes Concourse to + infer the correct registry host automatically. + grader_base_ecr_repo: ECR repository name (without the registry host) + for the grader base image used as the build trigger. 
Defaults to + the standard MIT OL base image repo name. aws_account_id: AWS account ID that hosts the ECR registry. - aws_region: AWS region for ECR authentication and URI construction. + aws_region: AWS region for ECR authentication. """ pipeline_name: str grader_repo_url: str grader_repo_branch: str ecr_repo_name: str - grader_base_ecr_repo: str = _ECR_BASE_IMAGE_REPO + grader_base_ecr_repo: str = "mitodl/xqueue-watcher-grader-base" aws_account_id: str = _AWS_ACCOUNT_ID aws_region: str = _AWS_REGION - @property - def ecr_image_uri(self) -> str: - return ( - f"{self.aws_account_id}.dkr.ecr.{self.aws_region}.amazonaws.com" - f"/{self.ecr_repo_name}" - ) - def grader_image_pipeline(config: GraderPipelineConfig) -> Pipeline: """Return a Pipeline that builds and pushes a course-specific grader image. @@ -122,7 +112,7 @@ def grader_image_pipeline(config: GraderPipelineConfig) -> Pipeline: # Private ECR image for this course's grader. grader_ecr_image = registry_image( name=Identifier(f"{config.pipeline_name}-image"), - image_repository=config.ecr_image_uri, + image_repository=config.ecr_repo_name, image_tag="latest", ecr_region=config.aws_region, ) From 0e0e8d001de8f0c7d03fabe688a3d7157010072c Mon Sep 17 00:00:00 2001 From: Tobias Macey Date: Thu, 19 Mar 2026 15:42:28 -0400 Subject: [PATCH 23/46] fix: track feature branch in meta pipeline pipeline-code resource The grader-images-pipeline-code git resource was tracking 'main', but the pipeline files don't exist on main yet. Switch to the feature branch until this work is merged. 
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../pipelines/open_edx/grader_images/meta.py | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/src/ol_concourse/pipelines/open_edx/grader_images/meta.py b/src/ol_concourse/pipelines/open_edx/grader_images/meta.py index f285841980..ce6b4f0133 100644 --- a/src/ol_concourse/pipelines/open_edx/grader_images/meta.py +++ b/src/ol_concourse/pipelines/open_edx/grader_images/meta.py @@ -44,7 +44,7 @@ pipeline_code = git_repo( name=Identifier("grader-images-pipeline-code"), uri="https://github.com/mitodl/ol-infrastructure", - branch="main", + branch="feat/xqwatcher-kubernetes-migration", paths=_PIPELINE_CODE_PATHS, ) @@ -137,8 +137,7 @@ def _build_self_update_job() -> Job: _generate_pipeline_task( task_name="generate-meta-pipeline-definition", script_path=( - "src/ol_concourse/pipelines/open_edx/" - "grader_images/meta.py" + "src/ol_concourse/pipelines/open_edx/grader_images/meta.py" ), script_args=[], ), @@ -154,10 +153,7 @@ def _build_self_update_job() -> Job: meta_jobs = [ _build_self_update_job(), _build_base_image_meta_job(), - *[ - _build_grader_meta_job(config.pipeline_name) - for config in GRADER_PIPELINES - ], + *[_build_grader_meta_job(config.pipeline_name) for config in GRADER_PIPELINES], ] meta_pipeline = Pipeline(resources=[pipeline_code], jobs=meta_jobs) @@ -169,6 +165,5 @@ def _build_self_update_job() -> Job: definition.write(pipeline_json) sys.stdout.write(pipeline_json) sys.stdout.write( - "\nfly -t set-pipeline" - " -p grader-images-meta -c definition.json\n" + "\nfly -t set-pipeline -p grader-images-meta -c definition.json\n" ) From 21929ee4553e3b61ef580d840f910f0064848625 Mon Sep 17 00:00:00 2001 From: Tobias Macey Date: Thu, 19 Mar 2026 15:48:31 -0400 Subject: [PATCH 24/46] feat: use SSH credentials for private grader repository access The graders-mit-600x repository is private. 
Switch the git resource from an HTTPS git_repo to an ssh_git_repo so Concourse can clone it. The SSH private key is read from Vault at ((github.ssh_private_key)). - Import ssh_git_repo instead of git_repo - Add github_private_key field to GraderPipelineConfig (defaults to ((github.ssh_private_key))) - Update grader_repo_url in GRADER_PIPELINES to use SSH form (git@github.com:mitodl/graders-mit-600x) Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../open_edx/grader_images/build_pipeline.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/src/ol_concourse/pipelines/open_edx/grader_images/build_pipeline.py b/src/ol_concourse/pipelines/open_edx/grader_images/build_pipeline.py index 4f42233c36..ff081fc125 100644 --- a/src/ol_concourse/pipelines/open_edx/grader_images/build_pipeline.py +++ b/src/ol_concourse/pipelines/open_edx/grader_images/build_pipeline.py @@ -39,7 +39,7 @@ TaskConfig, TaskStep, ) -from ol_concourse.lib.resources import git_repo, registry_image +from ol_concourse.lib.resources import registry_image, ssh_git_repo _AWS_ACCOUNT_ID = "610119931565" _AWS_REGION = "us-east-1" @@ -52,8 +52,8 @@ class GraderPipelineConfig: Attributes: pipeline_name: Short identifier used in resource/job names and the Concourse pipeline name, e.g. ``"graders-mit-600x"``. - grader_repo_url: HTTPS URL of the grader repository, e.g. - ``"https://github.com/mitodl/graders-mit-600x"``. + grader_repo_url: SSH URL of the grader repository, e.g. + ``"git@github.com:mitodl/graders-mit-600x"``. grader_repo_branch: Branch to track, e.g. ``"main"`` or ``"master"``. ecr_repo_name: ECR repository name (without the registry host), e.g. ``"mitodl/graders-mit-600x"``. Passed directly to the @@ -62,6 +62,8 @@ class GraderPipelineConfig: grader_base_ecr_repo: ECR repository name (without the registry host) for the grader base image used as the build trigger. Defaults to the standard MIT OL base image repo name. 
+ github_private_key: Vault path for the SSH private key used to clone + the (private) grader repository. aws_account_id: AWS account ID that hosts the ECR registry. aws_region: AWS region for ECR authentication. """ @@ -71,6 +73,7 @@ class GraderPipelineConfig: grader_repo_branch: str ecr_repo_name: str grader_base_ecr_repo: str = "mitodl/xqueue-watcher-grader-base" + github_private_key: str = "((github.ssh_private_key))" aws_account_id: str = _AWS_ACCOUNT_ID aws_region: str = _AWS_REGION @@ -94,10 +97,11 @@ def grader_image_pipeline(config: GraderPipelineConfig) -> Pipeline: Returns: A ``Pipeline`` object suitable for serialisation to Concourse YAML/JSON. """ - grader_repo = git_repo( + grader_repo = ssh_git_repo( name=Identifier(f"{config.pipeline_name}-code"), uri=config.grader_repo_url, branch=config.grader_repo_branch, + private_key=config.github_private_key, ) # Grader base image in ECR — used as a build trigger so that rebuilding @@ -198,7 +202,7 @@ def grader_image_pipeline(config: GraderPipelineConfig) -> Pipeline: GRADER_PIPELINES: list[GraderPipelineConfig] = [ GraderPipelineConfig( pipeline_name="graders-mit-600x", - grader_repo_url="https://github.com/mitodl/graders-mit-600x", + grader_repo_url="git@github.com:mitodl/graders-mit-600x", grader_repo_branch="feat/containerized-grader", ecr_repo_name="mitodl/graders-mit-600x", ), From 6a8e14ee62d1994532f6200a1109e5966451d3b9 Mon Sep 17 00:00:00 2001 From: Tobias Macey Date: Thu, 19 Mar 2026 15:52:35 -0400 Subject: [PATCH 25/46] fix: use odlbot SSH key for private grader repo access infrastructure/github has no generic SSH key. The correct key for cloning private mitodl repos from the infrastructure Concourse team is odlbot_private_ssh_key in infrastructure/open_api_clients. 
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../pipelines/open_edx/grader_images/build_pipeline.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/ol_concourse/pipelines/open_edx/grader_images/build_pipeline.py b/src/ol_concourse/pipelines/open_edx/grader_images/build_pipeline.py index ff081fc125..8bfc337696 100644 --- a/src/ol_concourse/pipelines/open_edx/grader_images/build_pipeline.py +++ b/src/ol_concourse/pipelines/open_edx/grader_images/build_pipeline.py @@ -63,7 +63,8 @@ class GraderPipelineConfig: for the grader base image used as the build trigger. Defaults to the standard MIT OL base image repo name. github_private_key: Vault path for the SSH private key used to clone - the (private) grader repository. + the (private) grader repository. Defaults to the odlbot SSH key + stored at ``infrastructure/open_api_clients`` in Vault. aws_account_id: AWS account ID that hosts the ECR registry. aws_region: AWS region for ECR authentication. """ @@ -73,7 +74,7 @@ class GraderPipelineConfig: grader_repo_branch: str ecr_repo_name: str grader_base_ecr_repo: str = "mitodl/xqueue-watcher-grader-base" - github_private_key: str = "((github.ssh_private_key))" + github_private_key: str = "((open_api_clients.odlbot_private_ssh_key))" aws_account_id: str = _AWS_ACCOUNT_ID aws_region: str = _AWS_REGION From 7c66896d71d03998efe08c08a56671b964a9659a Mon Sep 17 00:00:00 2001 From: Tobias Macey Date: Thu, 19 Mar 2026 15:58:35 -0400 Subject: [PATCH 26/46] fix: use DockerHub grader base image as build trigger Switch the grader-base-image registry-image resource from ECR to DockerHub (mitodl/xqueue-watcher-grader-base). The base image pipeline pushes to both DockerHub and ECR; DockerHub is public and simpler to poll as a trigger without needing AWS credentials. 
- Rename GraderPipelineConfig.grader_base_ecr_repo to grader_base_dockerhub_repo - Remove ecr_region from the base image resource - Add DockerHub credentials ((dockerhub.username/password)) Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../open_edx/grader_images/build_pipeline.py | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/src/ol_concourse/pipelines/open_edx/grader_images/build_pipeline.py b/src/ol_concourse/pipelines/open_edx/grader_images/build_pipeline.py index 8bfc337696..c1bcfe1468 100644 --- a/src/ol_concourse/pipelines/open_edx/grader_images/build_pipeline.py +++ b/src/ol_concourse/pipelines/open_edx/grader_images/build_pipeline.py @@ -59,9 +59,9 @@ class GraderPipelineConfig: ``"mitodl/graders-mit-600x"``. Passed directly to the ``registry-image`` resource; ``ecr_region`` causes Concourse to infer the correct registry host automatically. - grader_base_ecr_repo: ECR repository name (without the registry host) - for the grader base image used as the build trigger. Defaults to - the standard MIT OL base image repo name. + grader_base_dockerhub_repo: DockerHub repository name for the grader + base image used as the build trigger, e.g. + ``"mitodl/xqueue-watcher-grader-base"``. github_private_key: Vault path for the SSH private key used to clone the (private) grader repository. Defaults to the odlbot SSH key stored at ``infrastructure/open_api_clients`` in Vault. @@ -73,7 +73,7 @@ class GraderPipelineConfig: grader_repo_url: str grader_repo_branch: str ecr_repo_name: str - grader_base_ecr_repo: str = "mitodl/xqueue-watcher-grader-base" + grader_base_dockerhub_repo: str = "mitodl/xqueue-watcher-grader-base" github_private_key: str = "((open_api_clients.odlbot_private_ssh_key))" aws_account_id: str = _AWS_ACCOUNT_ID aws_region: str = _AWS_REGION @@ -84,7 +84,7 @@ def grader_image_pipeline(config: GraderPipelineConfig) -> Pipeline: The pipeline contains a single build job that: 1. 
Watches the grader repo for new commits (trigger). - 2. Watches the grader base image in ECR for updates (trigger). + 2. Watches the grader base image on DockerHub for updates (trigger). 3. Builds the Dockerfile in the root of the grader repo. A shell wrapper reads the ``repository`` and ``digest`` files written by the ``registry-image`` resource and sets ``BUILD_ARG_GRADER_BASE_IMAGE`` @@ -105,13 +105,14 @@ def grader_image_pipeline(config: GraderPipelineConfig) -> Pipeline: private_key=config.github_private_key, ) - # Grader base image in ECR — used as a build trigger so that rebuilding - # the base image automatically causes this pipeline to run. + # Grader base image on DockerHub — used as a build trigger so that + # rebuilding the base image automatically causes this pipeline to run. grader_base_image = registry_image( name=Identifier("grader-base-image"), - image_repository=config.grader_base_ecr_repo, + image_repository=config.grader_base_dockerhub_repo, image_tag="latest", - ecr_region=config.aws_region, + username="((dockerhub.username))", + password="((dockerhub.password))", # noqa: S106 ) # Private ECR image for this course's grader. From ff1334113ad98a47b84d4a6d6db90c77ab9936a8 Mon Sep 17 00:00:00 2001 From: Tobias Macey Date: Thu, 19 Mar 2026 16:14:39 -0400 Subject: [PATCH 27/46] feat: wire XQWATCHER_GRADER_* env vars and fix ECR image rewrite Three related fixes for the containerized grader deployment: 1. Add XQWATCHER_GRADER_NAMESPACE env var set to the deployment namespace. Without this, ContainerGrader defaults to spawning Jobs in 'default', breaking the RBAC Role binding and landing Jobs in the wrong namespace. 2. Add XQWATCHER_GRADER_BACKEND, CPU_LIMIT, MEMORY_LIMIT, TIMEOUT env vars driven by new stack config keys (grader_namespace, grader_cpu_limit, grader_memory_limit, grader_timeout). These set deployment-wide defaults so individual conf.d queue files don't need to repeat them. 3. 
Fix the DockerHub pull-through cache rewrite to skip images that already have a registry hostname (e.g. private ECR URIs, ghcr.io). Previously cached_image_uri() was called unconditionally, which would mangle a full ECR URI into an invalid doubled-host path. Images are now only rewritten if the first path component contains no '.'. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../applications/xqwatcher/__main__.py | 51 +++++++++++++++++-- 1 file changed, 48 insertions(+), 3 deletions(-) diff --git a/src/ol_infrastructure/applications/xqwatcher/__main__.py b/src/ol_infrastructure/applications/xqwatcher/__main__.py index fea4e191bb..64612ebf98 100644 --- a/src/ol_infrastructure/applications/xqwatcher/__main__.py +++ b/src/ol_infrastructure/applications/xqwatcher/__main__.py @@ -88,6 +88,14 @@ min_replicas = xqwatcher_config.get_int("min_replicas") or 1 +# Deployment-wide ContainerGrader defaults. These become XQWATCHER_GRADER_* +# environment variables on the xqwatcher pod so operators don't have to repeat +# them in every conf.d queue JSON file. Per-queue KWARGS still override these. +grader_namespace = xqwatcher_config.get("grader_namespace") or namespace +grader_cpu_limit = xqwatcher_config.get("grader_cpu_limit") or "500m" +grader_memory_limit = xqwatcher_config.get("grader_memory_limit") or "256Mi" +grader_timeout = xqwatcher_config.get("grader_timeout") or "20" + ################################## ## Vault Secret Data ## ################################## @@ -97,18 +105,29 @@ vault_secrets = read_yaml_secrets( Path(f"xqwatcher/secrets.{stack_info.env_prefix}.{stack_info.env_suffix}.yaml") ) + + # For ContainerGrader handlers: if the SOPS secret supplies a plain DockerHub # image reference in KWARGS.image, rewrite it to use the ECR pull-through # cache so the grading Jobs are not subject to DockerHub rate limits. +# Images that already have a registry hostname (e.g. 
private ECR URIs like +# 610119931565.dkr.ecr.us-east-1.amazonaws.com/…, or ghcr.io/…) are left +# unchanged — the hostname is identified by a "." in the first path component. +def _needs_pullthrough_rewrite(image: str) -> bool: + """Return True only for bare DockerHub image refs (no registry hostname).""" + first_component = image.split("/", maxsplit=1)[0] + return "." not in first_component and ":" not in first_component + + if isinstance(vault_secrets.get("confd_json"), dict): for _queue_cfg in vault_secrets["confd_json"].values(): for handler_cfg in _queue_cfg.get("HANDLERS", []): if handler_cfg.get("HANDLER", "").endswith( "ContainerGrader" ) and "image" in handler_cfg.get("KWARGS", {}): - handler_cfg["KWARGS"]["image"] = cached_image_uri( - handler_cfg["KWARGS"]["image"] - ) + image_ref = handler_cfg["KWARGS"]["image"] + if _needs_pullthrough_rewrite(image_ref): + handler_cfg["KWARGS"]["image"] = cached_image_uri(image_ref) # VSO renders secret values using Go templates: {{ .Secrets.confd_json }}. # If confd_json is stored as a nested object, VSO renders it as a Go map @@ -373,6 +392,32 @@ name="XQWATCHER_FOLLOW_CLIENT_REDIRECTS", value="true", ), + # ContainerGrader deployment-wide defaults. + # These are used when a queue's KWARGS block does not + # specify the value explicitly. + kubernetes.core.v1.EnvVarArgs( + name="XQWATCHER_GRADER_BACKEND", + value="kubernetes", + ), + # Critical: grading Jobs must land in the same + # namespace as xqwatcher so the RBAC Role binding + # above grants the necessary permissions. 
+ kubernetes.core.v1.EnvVarArgs( + name="XQWATCHER_GRADER_NAMESPACE", + value=grader_namespace, + ), + kubernetes.core.v1.EnvVarArgs( + name="XQWATCHER_GRADER_CPU_LIMIT", + value=grader_cpu_limit, + ), + kubernetes.core.v1.EnvVarArgs( + name="XQWATCHER_GRADER_MEMORY_LIMIT", + value=grader_memory_limit, + ), + kubernetes.core.v1.EnvVarArgs( + name="XQWATCHER_GRADER_TIMEOUT", + value=grader_timeout, + ), ], # Liveness: verify the Python runtime is functional. # The process will crash (and K8s will restart) on From 0f8f7f5207578d543015663c90d051cbc1926e46 Mon Sep 17 00:00:00 2001 From: Tobias Macey Date: Thu, 19 Mar 2026 16:23:22 -0400 Subject: [PATCH 28/46] config: Update MITx CI watcher config for use on K8s --- .../secrets/xqwatcher/secrets.mitx.ci.yaml | 56 +++++++------------ .../grader_images/base_image_pipeline.py | 5 +- 2 files changed, 22 insertions(+), 39 deletions(-) diff --git a/src/bridge/secrets/xqwatcher/secrets.mitx.ci.yaml b/src/bridge/secrets/xqwatcher/secrets.mitx.ci.yaml index 5bbb88e1e0..e4fab27e2f 100644 --- a/src/bridge/secrets/xqwatcher/secrets.mitx.ci.yaml +++ b/src/bridge/secrets/xqwatcher/secrets.mitx.ci.yaml @@ -1,16 +1,4 @@ --- -graders_yaml: - graders: - - name: ENC[AES256_GCM,data:p5T9MPzwVzB2SmsbB7kFcEuBlR9HJGbyPVZw,iv:iFC4JC+oyLBtKH06ZGKOLdZDEleN2f9j4iXJxQpwWt8=,tag:588sITOlP+vcF66ohqyRLw==,type:str] - address: ENC[AES256_GCM,data:0ZoJISrNH1Y/fcbioWNDlvNpPzELSdN+jZolfqYOkWlYOnzufqUMlQ==,iv:OTS2obQKZd/yJray6BxhpOKEaOamXQf2RdsZibHklSc=,tag:Jusz+4i6wysepofR9M0mQA==,type:str] - git_ref: ENC[AES256_GCM,data:M5tIe1zv,iv:yB5QnVIDuBmYauTQzx6rzLwQNd6V+t15DUcAirIDySI=,tag:SsgXOI1nmtqfkhmCgVJbrg==,type:str] - env: - GIT_SSH_COMMAND: ENC[AES256_GCM,data:/tHzs53lMHrUmndEgR0jOhOPW0ned4A1USAZ2j9jLM+4KYjPKlicXEeWnOKL7wAwNZnlpN7O+iD0tE3UjkfkCrF9+MMMjzeh6Cxi7HjaX8B7yNVGXpt3CMBbVIduza0Qta6bqo0Ixao1xID4jhq2Hx/K0XTGHr8sq1WDo5RWjhgNni0HrYfQ7LCAT4/h8pNR,iv:3JZxMQWw95j4KGAL/bMqTo+irhdtlDEpVJjPrkK/IWQ=,tag:Uh0vwfZzZVN8nkxz0V6lDw==,type:str] - - name: 
ENC[AES256_GCM,data:guJV2vkSNpqaWWnlQgOoZhGXzLpPekf+GZABiO4=,iv:IJIIE0cFWPEhwcWKBQtcPGnv5XF4vy8ENEjETpIrnWc=,tag:lc82HaHmcJSy7Waxv7WN2Q==,type:str] - address: ENC[AES256_GCM,data:0ZoJISrNH1Y/fcbioWNDlvNpPzELSdN+jZolfqYOkWlYOnzufqUMlQ==,iv:OTS2obQKZd/yJray6BxhpOKEaOamXQf2RdsZibHklSc=,tag:Jusz+4i6wysepofR9M0mQA==,type:str] - git_ref: ENC[AES256_GCM,data:M5tIe1zv,iv:yB5QnVIDuBmYauTQzx6rzLwQNd6V+t15DUcAirIDySI=,tag:SsgXOI1nmtqfkhmCgVJbrg==,type:str] - env: - GIT_SSH_COMMAND: ENC[AES256_GCM,data:/tHzs53lMHrUmndEgR0jOhOPW0ned4A1USAZ2j9jLM+4KYjPKlicXEeWnOKL7wAwNZnlpN7O+iD0tE3UjkfkCrF9+MMMjzeh6Cxi7HjaX8B7yNVGXpt3CMBbVIduza0Qta6bqo0Ixao1xID4jhq2Hx/K0XTGHr8sq1WDo5RWjhgNni0HrYfQ7LCAT4/h8pNR,iv:3JZxMQWw95j4KGAL/bMqTo+irhdtlDEpVJjPrkK/IWQ=,tag:Uh0vwfZzZVN8nkxz0V6lDw==,type:str] confd_json: Watcher-MITx-6.00x: AUTH: @@ -18,48 +6,46 @@ confd_json: - ENC[AES256_GCM,data:/FqGfC4XbHZ2AIaJiUHJA+hebqDA4eag+qwjiG2lywpzOOMJIwblQw==,iv:dmDX989O8vyVtgfcCcSG1IPoX/+Vy/CHifk+4omnwV8=,tag:QFkUcAaojGs/+c+5h3r5lg==,type:str] CONNECTIONS: ENC[AES256_GCM,data:Gg==,iv:d7E2fNOoSSflC2UNEnIUg9bBCHI3rlbn8pgmHCnxX68=,tag:5V0iQ94DuAdt0+zxQXV39A==,type:int] HANDLERS: - - CODEJAIL: - bin_path: ENC[AES256_GCM,data:kXgOjRNdv5RNXL2JcleS3u/W2FUelW/OWS6u9Dd3ASC2mOxHMAjxQK0VoivVPj0ZqSzlqhxFPg21sg==,iv:jtm/LiMJapsXJEIZGgOzknJ/Nmd9zFofyNtQG4jJLHs=,tag:gIBJjPWkIydCIzoan9V6kw==,type:str] - lang: ENC[AES256_GCM,data:vxjPnbN7oQ==,iv:m25r3jtmoUjIWYDKyKUX82awfO98K9UdFPzUGlmdcF4=,tag:8yMYlavG9/pPCzIl5prLcg==,type:str] - name: ENC[AES256_GCM,data:1ISqevS5v+s=,iv:pX1qlTFPdlrgqP16/Vm9OGJ/hKnuMScGwjJ3M+YWeA4=,tag:d88eMeekFkpm0mRbNy741A==,type:str] - user: ENC[AES256_GCM,data:jiWGB4mhGX4=,iv:WOiO7BPAsvjl9tQz8NwLLTYcCi/XwcixYei6ybWiDYg=,tag:yGfU4XhPE0hkkVFM3c3Hcg==,type:str] - HANDLER: ENC[AES256_GCM,data:xKNmClln1J5leWUOsHXo6d0XMm+tNAN9bek4soM5b49o6zLbv3s8TQ==,iv:kZ7jGM5B4n2RsOCppRTt0Cn8TDH2sSnAOuYbQl95dY4=,tag:OTw+Y0uhByeYXY8eP13dqw==,type:str] + - HANDLER: 
ENC[AES256_GCM,data:SUKLJX/arQHo7OMbzoOCQgqACuwEDVgJVJ0HGqbLWbDZR3u6dK6zvxJUrNVM7w==,iv:JRwutReGuNwudxa6sRZVLZjx34ZaiVqAQSmwdUgn5AI=,tag:w4hyFq+uznC/+u1uciYlwQ==,type:str] KWARGS: - grader_root: ENC[AES256_GCM,data:A+hHIVLX0mTz1DHc7MMmVAf60hQ/EJjgzG5h9lnZppBLZIscVvhyryczyTS+RKimzqWQQYTT32FAsqYqvtI=,iv:dIMitVcAwebHlO3LRKoD13dVkMkuL2z6rX/gJbuGxKc=,tag:HDCOyEUMMK1/ErgXk6q7Kw==,type:str] - SERVER: ENC[AES256_GCM,data:fgyBM4dTnNCyUgb0O/nruyu5WgsGEZaYGHP+oV/Qi7llIUkOFDN9sQY=,iv:7GIz4gNKMqINA+i1WzGfEvptip3vLo6XnA/EScQbEy4=,tag:tjyrPKIx9dONJMgb6vyIPQ==,type:str] + grader_root: ENC[AES256_GCM,data:AAhj8GEJMqHJktLWYVFgqivcaBGn01IrOS87FGIxB7dlEGsV85c/qlVNbjisF3/4t2UMDpV9/L4DDzM6HnI=,iv:6HrqSIZUbMkdZt0ug6++Vpz4bMAtqiFiiaK+LZFOGj0=,tag:3f3Ku1T54aTyrtbafgDtfA==,type:str] + image: ENC[AES256_GCM,data:K/smElRfcbDfioQwlbSO7nMoW7yI39IzNFY2TvSgrWNaFqo5MnZSnyFOPC021WzHRZWMdQR1kTLB2zImRZuFZcEcnN0XNyHPiYrL,iv:Zikm+Z2QR4b7UEXrfrQvGh5EdywYPYxUL0lSECx7chQ=,tag:SeUO/AQmu6/eolH3+jmAYQ==,type:str] + backend: ENC[AES256_GCM,data:Xhkvv02Rh5aNkw==,iv:ZN1iF4gQ7Krm99YjrrWkWbWC3PYROk9gsOqFmqS3X/E=,tag:klNdz4BlVdHTnT8sOQ8/dA==,type:str] + cpu_limit: ENC[AES256_GCM,data:47Kuqa8=,iv:uZRate/HQzNH0QggGZdzguU0vZxxS5M8WYRcXUnQvo8=,tag:XnCarT23s5qOsbMim4Xs+A==,type:str] + memory_limit: ENC[AES256_GCM,data:hpEAzVQ=,iv:K9Uz/YVxopA7x6amvG9GMzuvsBssTYdrFt0O8qfdcnU=,tag:XIg9CpujBmzCM/fFebxIjA==,type:str] + timeout: ENC[AES256_GCM,data:jV0=,iv:pw+yw/Wzt0rtnoJmwnXzETb2bCeeGnD/nNHRq/FJH20=,tag:hwUcPI6in9/Yvqea7Hxn4Q==,type:int] + image_pull_policy: ENC[AES256_GCM,data:XiQx6roJ,iv:pORRAsnjXxV/c2cHzepkSVgs18hhkHAAtOT/Npv9lME=,tag:VgUaG0Uqmj5JWo+iWujolw==,type:str] + SERVER: ENC[AES256_GCM,data:9+nHYHNrnrHmTw0UqzyvU4CoC0YeLPE4wQb5eTWFMXb0yw==,iv:NdCWF86cPGN180Puc6y37cuhV1H0vsMZgQBEG6zLUTg=,tag:VjWyEn/Bve29esECOBMotg==,type:str] Watcher-MITx-6.0001r: AUTH: - ENC[AES256_GCM,data:GeoKMl2An+NT,iv:4bpjXM5pBTGk43L4aFffe/i7Xuiqqma4x1bjwqr2vZc=,tag:cRJlbd1X9ScKyIMXc56Mng==,type:str] - 
ENC[AES256_GCM,data:2IRNwrwVNeQ/N5B3Ka0E8JKa2rMoY+g1h078CWy82O0EYrsfUAEIxw==,iv:sl5rkCOZxVIM2xkgAHvsg9wPl4arEOm7NTq4B67Dz5s=,tag:dpxlezEbjFndERvvvGQSkQ==,type:str] CONNECTIONS: ENC[AES256_GCM,data:0g==,iv:6tyg9WvnRhnpLm0vKGefnd70VfpmEFy0ErgwnNiFSAU=,tag:fl01agv/QL6uosiMu0rDzA==,type:int] HANDLERS: - - CODEJAIL: - bin_path: ENC[AES256_GCM,data:lsEJDjU3WT6woU5bptLcYR1wXqWdWUfTg95Rsi+KBHbJgB2WCSApOL4NokZErBfF9b2Gp6Oo8lbTnw==,iv:A2I+IE3Sb2sMMt3jzwQahVeHVVSKFegWe4lOYq7d+kA=,tag:fcR1QNOhmBa60VW62XcEwg==,type:str] - lang: ENC[AES256_GCM,data:UkMtJmLV7w==,iv:pbuB6D7FVuwpjJU8Aq/94McxvrADBG6q9NK1dInRaWA=,tag:keZqRH/pnb7hJGK/5IB6Dw==,type:str] - name: ENC[AES256_GCM,data:FNj9NpiNTLA=,iv:/NO4SppIdQ2sk4b4v0yFy5uhUA6xJe3Ln4CcbNOwoYQ=,tag:kPIcKyFzwlXawgYNuEFGQg==,type:str] - user: ENC[AES256_GCM,data:AgacvZvMofI=,iv:M8eB7g7NIieNs+WeQjemUfcyYiPxRSsxdQhKCHcehLg=,tag:kEFkDWkdhqWhrkSVnpPjRg==,type:str] - HANDLER: ENC[AES256_GCM,data:ptWfCuhtmp1XaCxZsxoyFryVbS82cJkYLOgeTfI883Abtal/nUK58g==,iv:Y2vuOaLH94pc9iY0V6qUfmlj9m3ijK/vxnyljFhBC4w=,tag:dBe2y1yG4y/MDqKeUlobow==,type:str] + - HANDLER: ENC[AES256_GCM,data:GxcBCI9x7tzIoP+2Ssgz92doSvmldQtDDx6Jl9duFWcXP/bwRKodTLfYDEwmaw==,iv:AXZJHneW4i1z90TK7SIr6WOG7HLz4EvSP2o0rWALMIM=,tag:5iOqnbcO53qOcgWYRS3O9Q==,type:str] KWARGS: - grader_root: ENC[AES256_GCM,data:6UFR02eSi21NXOU6SQcZyRm3SbFqgMXV4yJiMTW8+lMiVRfRVyz0cFnQjFEAlwflHxgWgXNymHNTkxYFzSUpUw==,iv:AiEGSD68isqsL0Ften6dn+/DTpHfdbUGwBN5vdBd1w8=,tag:lCXfJNrV9hkrSRxFVesPsQ==,type:str] - SERVER: ENC[AES256_GCM,data:FHHp0nXRKUsYrfccjmv0TF1Qb6ms7sCRnvA/V2Pm0snJznW/YA66LRo=,iv:7l7VZyh2wyVT7q1j43fIWdDBsODzK9IGGZt9+D+Z55I=,tag:8zW+XIdqJjuuN2QY3ndSgg==,type:str] -xqwatcher_grader_code_ssh_identity: 
ENC[AES256_GCM,data:2ogEHew5oYkdHOhn/Td92T0tLF59cC0ts6kQddqEgUu/7/cSadFoGvb9RfS4TUOzhAARnMdMRT5JclXlp4Oi8T0X2nenAM/zZkokd5GMmC0J2mESCm2bbdXcG2qat9Rjga7d+1bLyP1+1heIHPb5uRsGJYkA87d1bvv8vlmwtEQN3VeBrD/0I+jwTUcLamVg4T602RxWMnEm9nXw0NTDUp4TsOMtIrzdBvs9IsNEqdDT7hwSkHpmC69dulUJzJGOSFYmkYFHcDAK8qTRdc8qp5vvMTz4k7X8JF8yp7LfEGuE5w==,iv:c2o8vAGEB8DFq9VtW8Zhm5Xat4s49x+oQf7212ZeFFk=,tag:9gI1u/SBMIjLDfY3b1X8+w==,type:str] + grader_root: ENC[AES256_GCM,data:G0CAaCLj5ejxuJnPIDrXh/2cWJmWRp56HV5J05scwkq4UOOGjIcnnKJgN1NnBNXTmS4TL/H4dtnxQBIpnLE=,iv:Ye7wv7Jnu36qmHXA7d5vx4AKdwoGcKg/uJnSK/J29OA=,tag:kqVpn7RwYNSKQU/t/oPMYQ==,type:str] + image: ENC[AES256_GCM,data:xbI2u7mnzujef23MXabbxrTtqFzlM6SoH8koIHvJhsViugeRGYMQTpd7gLaShLLJDPGMoQ/8MGuNuZp1IY1Rn+xoFGE8W5owiYHu,iv:hIn5E/VXuPuunHkxUkFoMhWeUw8Ex+x+pfHN+cTxf/8=,tag:EMY867H0qn83N5v0OpnDrg==,type:str] + backend: ENC[AES256_GCM,data:OaGBiwiRT0m/Pg==,iv:RBUzueYrobo6vv2LXXhTMPEY8q7ZvkDN86x8fBc4lOU=,tag:hqS+ZaFHjs0xhTO5ldc6pw==,type:str] + cpu_limit: ENC[AES256_GCM,data:FjtcqBs=,iv:/4awwScRxzctMgesO+Akjh+xawRxjQYM6FwctaUr6to=,tag:HLUnAYyAhknuBQXTRHs3ng==,type:str] + memory_limit: ENC[AES256_GCM,data:gISNMOg=,iv:y842XxT45ipoWCXrLO5Pjh6WgqtUyZ9qRfkCWfVx/6E=,tag:sdMFIVYXzUj9LeOA2XXnIg==,type:str] + timeout: ENC[AES256_GCM,data:KYY=,iv:YiBTO75tTrTB8N1sjLPA0YTJX48sls7EppkfmsYAbWg=,tag:vHJR93RPrWf+qtt7+aEIOA==,type:int] + image_pull_policy: ENC[AES256_GCM,data:++zPPhK8,iv:0mPD6jGWeGVsEGxmCeiJxyE9wYz0vMqVtLLYCguLg0s=,tag:ounenzLmuxsYg9tYsm86wg==,type:str] + SERVER: ENC[AES256_GCM,data:ScOVWtueEbmPj/cA42MlN58fxYYKjBWGh982ebrLp6qbaQ==,iv:uOWEPN5qBeZ9Yw6xr7hfZ6GWzgYGBYZkBFsvM0sfzyg=,tag:SyNl6UGl6yilFPNVqa1aGg==,type:str] sops: kms: - arn: arn:aws:kms:us-east-1:610119931565:alias/infrastructure-secrets-ci created_at: "2024-04-16T18:44:13Z" enc: 
AQICAHjnbqe9AmEW1Js10nySybyuAG7Fb5E9EHUgkmqFDv7PxQGybcfmnUvB5N3pkXc+9ch5AAAAfjB8BgkqhkiG9w0BBwagbzBtAgEAMGgGCSqGSIb3DQEHATAeBglghkgBZQMEAS4wEQQM/4CSiwztDCKuyN3oAgEQgDvUtEGftvJN4EzTdnZS00yMzsZhgmq3lCXovEvM6fFJFBZYZjGAeKZYnhW69ITlOIUi8K4iZmlUy9eQFw== aws_profile: "" - gcp_kms: [] - azure_kv: [] hc_vault: - vault_address: https://vault-ci.odl.mit.edu engine_path: infrastructure key_name: sops created_at: "2024-04-16T18:44:13Z" enc: vault:v1:DFw1gsayFWeGxTCrU0HCQzWk4YBPHQdKHpValoIHi4bO/jHn+eZv+Nr2d4FubYiq8jKrKREm/UgsizDS - age: [] - lastmodified: "2024-04-25T18:35:23Z" - mac: ENC[AES256_GCM,data:X4CfjxG/ps13sYRd2PzeXl0MLz0pxkSKky23dIN/PmsH1xyhPnVXQ8wMV85vjN55LrGcSHs9GWAlJjqna43apnoggHfdqO/bYUeHhUjUJ8RiqXTFCi9NfRkCq5x1upmiXTR95fKdZUrykFZTGx+8JvTWLjs+2HfgHWL0kDk1HDU=,iv:RVkdd8V9rHgFbMP7ZqormU7TYXF2uwe46QCKHpmhi8E=,tag:xEdvYfirIpM/rwhKDAouHw==,type:str] + lastmodified: "2026-03-19T20:20:19Z" + mac: ENC[AES256_GCM,data:8f0XRO62GjWYYvnhY8HZkuVzoAUD6qXSpJ4Ok3DuzRhX/WybqKx77Joi+GUzG60nBV7g/uz0eBk/zh4HyII0z//SfEsBXEN1DJkDO+dah2ZzZoQIipnTXtWiR4vTNfZ1gEFUzx/vTcsLE31JMB+0QbCuByaDv3JJodpPhNLuHIA=,iv:+jNex5GLcrPa/cSPdwIEcdCDMgxXgUPXcfGVVYKS++g=,tag:R7tQdp61Po7f0FqpCzbm2A==,type:str] pgp: - created_at: "2024-04-16T18:44:13Z" enc: |- @@ -142,4 +128,4 @@ sops: -----END PGP MESSAGE----- fp: 3582AE9F12CE295BDAF545ED17A5F53F11681446 unencrypted_suffix: _unencrypted - version: 3.8.1 + version: 3.12.1 diff --git a/src/ol_concourse/pipelines/open_edx/grader_images/base_image_pipeline.py b/src/ol_concourse/pipelines/open_edx/grader_images/base_image_pipeline.py index 06c19e4703..543042d29c 100644 --- a/src/ol_concourse/pipelines/open_edx/grader_images/base_image_pipeline.py +++ b/src/ol_concourse/pipelines/open_edx/grader_images/base_image_pipeline.py @@ -32,9 +32,6 @@ _AWS_ACCOUNT_ID = "610119931565" _AWS_REGION = "us-east-1" _BASE_IMAGE_REPO = "mitodl/xqueue-watcher-grader-base" -_ECR_BASE_IMAGE_REPO = ( - f"{_AWS_ACCOUNT_ID}.dkr.ecr.{_AWS_REGION}.amazonaws.com/{_BASE_IMAGE_REPO}" -) def 
grader_base_image_pipeline() -> Pipeline: @@ -60,7 +57,7 @@ def grader_base_image_pipeline() -> Pipeline: # pipelines so that a base image rebuild causes downstream rebuilds. ecr_base_image = registry_image( name=Identifier("grader-base-ecr"), - image_repository=_ECR_BASE_IMAGE_REPO, + image_repository=_BASE_IMAGE_REPO, image_tag="latest", ecr_region=_AWS_REGION, ) From 5e38539e5d1c17449dbd14b27fedbf2e958574f4 Mon Sep 17 00:00:00 2001 From: Tobias Macey Date: Thu, 19 Mar 2026 17:03:37 -0400 Subject: [PATCH 29/46] config: Get grader path to strip erroneous prefix --- src/bridge/secrets/xqwatcher/secrets.mitx.ci.yaml | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/bridge/secrets/xqwatcher/secrets.mitx.ci.yaml b/src/bridge/secrets/xqwatcher/secrets.mitx.ci.yaml index e4fab27e2f..37e630b965 100644 --- a/src/bridge/secrets/xqwatcher/secrets.mitx.ci.yaml +++ b/src/bridge/secrets/xqwatcher/secrets.mitx.ci.yaml @@ -15,6 +15,7 @@ confd_json: memory_limit: ENC[AES256_GCM,data:hpEAzVQ=,iv:K9Uz/YVxopA7x6amvG9GMzuvsBssTYdrFt0O8qfdcnU=,tag:XIg9CpujBmzCM/fFebxIjA==,type:str] timeout: ENC[AES256_GCM,data:jV0=,iv:pw+yw/Wzt0rtnoJmwnXzETb2bCeeGnD/nNHRq/FJH20=,tag:hwUcPI6in9/Yvqea7Hxn4Q==,type:int] image_pull_policy: ENC[AES256_GCM,data:XiQx6roJ,iv:pORRAsnjXxV/c2cHzepkSVgs18hhkHAAtOT/Npv9lME=,tag:VgUaG0Uqmj5JWo+iWujolw==,type:str] + strip_path_components: ENC[AES256_GCM,data:tA==,iv:kP2IbFJdbSMa4ywsaFde8DCooTZfvkzDIFtGDAtNdBg=,tag:iTwfngyFd7rnHwigxXCQ/w==,type:int] SERVER: ENC[AES256_GCM,data:9+nHYHNrnrHmTw0UqzyvU4CoC0YeLPE4wQb5eTWFMXb0yw==,iv:NdCWF86cPGN180Puc6y37cuhV1H0vsMZgQBEG6zLUTg=,tag:VjWyEn/Bve29esECOBMotg==,type:str] Watcher-MITx-6.0001r: AUTH: @@ -31,6 +32,7 @@ confd_json: memory_limit: ENC[AES256_GCM,data:gISNMOg=,iv:y842XxT45ipoWCXrLO5Pjh6WgqtUyZ9qRfkCWfVx/6E=,tag:sdMFIVYXzUj9LeOA2XXnIg==,type:str] timeout: ENC[AES256_GCM,data:KYY=,iv:YiBTO75tTrTB8N1sjLPA0YTJX48sls7EppkfmsYAbWg=,tag:vHJR93RPrWf+qtt7+aEIOA==,type:int] image_pull_policy: 
ENC[AES256_GCM,data:++zPPhK8,iv:0mPD6jGWeGVsEGxmCeiJxyE9wYz0vMqVtLLYCguLg0s=,tag:ounenzLmuxsYg9tYsm86wg==,type:str] + strip_path_components: ENC[AES256_GCM,data:cQ==,iv:6igJgOzpjaOQVAvQ58cWt0rS8+1fV31eOc4JEIfYW+8=,tag:i0co134v7YmeHmVEKAWVMw==,type:int] SERVER: ENC[AES256_GCM,data:ScOVWtueEbmPj/cA42MlN58fxYYKjBWGh982ebrLp6qbaQ==,iv:uOWEPN5qBeZ9Yw6xr7hfZ6GWzgYGBYZkBFsvM0sfzyg=,tag:SyNl6UGl6yilFPNVqa1aGg==,type:str] sops: kms: @@ -44,8 +46,8 @@ sops: key_name: sops created_at: "2024-04-16T18:44:13Z" enc: vault:v1:DFw1gsayFWeGxTCrU0HCQzWk4YBPHQdKHpValoIHi4bO/jHn+eZv+Nr2d4FubYiq8jKrKREm/UgsizDS - lastmodified: "2026-03-19T20:20:19Z" - mac: ENC[AES256_GCM,data:8f0XRO62GjWYYvnhY8HZkuVzoAUD6qXSpJ4Ok3DuzRhX/WybqKx77Joi+GUzG60nBV7g/uz0eBk/zh4HyII0z//SfEsBXEN1DJkDO+dah2ZzZoQIipnTXtWiR4vTNfZ1gEFUzx/vTcsLE31JMB+0QbCuByaDv3JJodpPhNLuHIA=,iv:+jNex5GLcrPa/cSPdwIEcdCDMgxXgUPXcfGVVYKS++g=,tag:R7tQdp61Po7f0FqpCzbm2A==,type:str] + lastmodified: "2026-03-19T21:03:12Z" + mac: ENC[AES256_GCM,data:nofHx78MVy+o+t8arslFn5bSZcTv8BkfwzSD8lu5S8JW5kSjYAQ5I6wewd5fLAgjKuJF+hVxAi6tfOyb01Bl7IGmHBcS/b6qxBTZD2Gc9K37KSLtEoUF3lSs11E633IrckOgFSFsINX84pYCfeire07JN6Y/vmv2WW9pXfLaEIo=,iv:3r0o0mtgBjMJxvLsktxNE/I7pE44CGuVFebQ0FaEPwU=,tag:mnel1Hk5kJUIdnMGNj+dTQ==,type:str] pgp: - created_at: "2024-04-16T18:44:13Z" enc: |- From 23298edc31151a02e0c6878f09e9f401ea655622 Mon Sep 17 00:00:00 2001 From: Tobias Macey Date: Thu, 19 Mar 2026 19:00:53 -0400 Subject: [PATCH 30/46] fix: Set proper grader root for dockerized graders --- src/bridge/secrets/xqwatcher/secrets.mitx.ci.yaml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/bridge/secrets/xqwatcher/secrets.mitx.ci.yaml b/src/bridge/secrets/xqwatcher/secrets.mitx.ci.yaml index 37e630b965..b18d686d1e 100644 --- a/src/bridge/secrets/xqwatcher/secrets.mitx.ci.yaml +++ b/src/bridge/secrets/xqwatcher/secrets.mitx.ci.yaml @@ -8,7 +8,7 @@ confd_json: HANDLERS: - HANDLER: 
ENC[AES256_GCM,data:SUKLJX/arQHo7OMbzoOCQgqACuwEDVgJVJ0HGqbLWbDZR3u6dK6zvxJUrNVM7w==,iv:JRwutReGuNwudxa6sRZVLZjx34ZaiVqAQSmwdUgn5AI=,tag:w4hyFq+uznC/+u1uciYlwQ==,type:str] KWARGS: - grader_root: ENC[AES256_GCM,data:AAhj8GEJMqHJktLWYVFgqivcaBGn01IrOS87FGIxB7dlEGsV85c/qlVNbjisF3/4t2UMDpV9/L4DDzM6HnI=,iv:6HrqSIZUbMkdZt0ug6++Vpz4bMAtqiFiiaK+LZFOGj0=,tag:3f3Ku1T54aTyrtbafgDtfA==,type:str] + grader_root: ENC[AES256_GCM,data:5zGvVtmLO3LA,iv:Ke3uqZ/ojRSiYh7Y+IhbqwFvo6PHkM7TyqBkL79d5Z4=,tag:I7wcHg3MumwDUIo1vFP2CA==,type:str] image: ENC[AES256_GCM,data:K/smElRfcbDfioQwlbSO7nMoW7yI39IzNFY2TvSgrWNaFqo5MnZSnyFOPC021WzHRZWMdQR1kTLB2zImRZuFZcEcnN0XNyHPiYrL,iv:Zikm+Z2QR4b7UEXrfrQvGh5EdywYPYxUL0lSECx7chQ=,tag:SeUO/AQmu6/eolH3+jmAYQ==,type:str] backend: ENC[AES256_GCM,data:Xhkvv02Rh5aNkw==,iv:ZN1iF4gQ7Krm99YjrrWkWbWC3PYROk9gsOqFmqS3X/E=,tag:klNdz4BlVdHTnT8sOQ8/dA==,type:str] cpu_limit: ENC[AES256_GCM,data:47Kuqa8=,iv:uZRate/HQzNH0QggGZdzguU0vZxxS5M8WYRcXUnQvo8=,tag:XnCarT23s5qOsbMim4Xs+A==,type:str] @@ -25,7 +25,7 @@ confd_json: HANDLERS: - HANDLER: ENC[AES256_GCM,data:GxcBCI9x7tzIoP+2Ssgz92doSvmldQtDDx6Jl9duFWcXP/bwRKodTLfYDEwmaw==,iv:AXZJHneW4i1z90TK7SIr6WOG7HLz4EvSP2o0rWALMIM=,tag:5iOqnbcO53qOcgWYRS3O9Q==,type:str] KWARGS: - grader_root: ENC[AES256_GCM,data:G0CAaCLj5ejxuJnPIDrXh/2cWJmWRp56HV5J05scwkq4UOOGjIcnnKJgN1NnBNXTmS4TL/H4dtnxQBIpnLE=,iv:Ye7wv7Jnu36qmHXA7d5vx4AKdwoGcKg/uJnSK/J29OA=,tag:kqVpn7RwYNSKQU/t/oPMYQ==,type:str] + grader_root: ENC[AES256_GCM,data:P9s0aP2ZcBJ2,iv:h9SVYsDasC6xW758EqrlEyCjkeEX+qLE/Rklcx/x80Y=,tag:YrrGSmOS0ATgofSMDeZqRQ==,type:str] image: ENC[AES256_GCM,data:xbI2u7mnzujef23MXabbxrTtqFzlM6SoH8koIHvJhsViugeRGYMQTpd7gLaShLLJDPGMoQ/8MGuNuZp1IY1Rn+xoFGE8W5owiYHu,iv:hIn5E/VXuPuunHkxUkFoMhWeUw8Ex+x+pfHN+cTxf/8=,tag:EMY867H0qn83N5v0OpnDrg==,type:str] backend: ENC[AES256_GCM,data:OaGBiwiRT0m/Pg==,iv:RBUzueYrobo6vv2LXXhTMPEY8q7ZvkDN86x8fBc4lOU=,tag:hqS+ZaFHjs0xhTO5ldc6pw==,type:str] cpu_limit: 
ENC[AES256_GCM,data:FjtcqBs=,iv:/4awwScRxzctMgesO+Akjh+xawRxjQYM6FwctaUr6to=,tag:HLUnAYyAhknuBQXTRHs3ng==,type:str] @@ -46,8 +46,8 @@ sops: key_name: sops created_at: "2024-04-16T18:44:13Z" enc: vault:v1:DFw1gsayFWeGxTCrU0HCQzWk4YBPHQdKHpValoIHi4bO/jHn+eZv+Nr2d4FubYiq8jKrKREm/UgsizDS - lastmodified: "2026-03-19T21:03:12Z" - mac: ENC[AES256_GCM,data:nofHx78MVy+o+t8arslFn5bSZcTv8BkfwzSD8lu5S8JW5kSjYAQ5I6wewd5fLAgjKuJF+hVxAi6tfOyb01Bl7IGmHBcS/b6qxBTZD2Gc9K37KSLtEoUF3lSs11E633IrckOgFSFsINX84pYCfeire07JN6Y/vmv2WW9pXfLaEIo=,iv:3r0o0mtgBjMJxvLsktxNE/I7pE44CGuVFebQ0FaEPwU=,tag:mnel1Hk5kJUIdnMGNj+dTQ==,type:str] + lastmodified: "2026-03-19T23:00:25Z" + mac: ENC[AES256_GCM,data:tjYZxJzwuOHPzs7vuG1M8P2qCCjIYV5FMjGlfy3YCsBnxcPG4D89fzh0TvS8MZuCF51GH41ETuO+fSZk29ia30P96Ht1xqLmrJHgrO/xnCQUZ8U86aShscI0exo1d3pOTbvBKWwdX6AWfgNJYUgKm9qcZroLs7hGqo3WkbObPJQ=,iv:w1Jl9AfYXepCHia+ZPY6f4UxOp+Rerr8NJe6TEp6lyY=,tag:tR3N7TZ6gyTt5I5KZ04u+w==,type:str] pgp: - created_at: "2024-04-16T18:44:13Z" enc: |- From 3392b9fe7952933719d9823645fba742abfe6bc4 Mon Sep 17 00:00:00 2001 From: Tobias Macey Date: Thu, 19 Mar 2026 19:18:27 -0400 Subject: [PATCH 31/46] fix: Don't strip path components either --- src/bridge/secrets/xqwatcher/secrets.mitx.ci.yaml | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/src/bridge/secrets/xqwatcher/secrets.mitx.ci.yaml b/src/bridge/secrets/xqwatcher/secrets.mitx.ci.yaml index b18d686d1e..a6596b31a2 100644 --- a/src/bridge/secrets/xqwatcher/secrets.mitx.ci.yaml +++ b/src/bridge/secrets/xqwatcher/secrets.mitx.ci.yaml @@ -8,14 +8,13 @@ confd_json: HANDLERS: - HANDLER: ENC[AES256_GCM,data:SUKLJX/arQHo7OMbzoOCQgqACuwEDVgJVJ0HGqbLWbDZR3u6dK6zvxJUrNVM7w==,iv:JRwutReGuNwudxa6sRZVLZjx34ZaiVqAQSmwdUgn5AI=,tag:w4hyFq+uznC/+u1uciYlwQ==,type:str] KWARGS: - grader_root: ENC[AES256_GCM,data:5zGvVtmLO3LA,iv:Ke3uqZ/ojRSiYh7Y+IhbqwFvo6PHkM7TyqBkL79d5Z4=,tag:I7wcHg3MumwDUIo1vFP2CA==,type:str] + grader_root: 
ENC[AES256_GCM,data:yW8aK1I4cgoeu/mWBVZic2lPPtFekwuG,iv:+Xp/VtAeSF843ov2dx9XMrTKvRPTZ0sSxt44abcg5so=,tag:SRC0qFcYk1X2XBF4OqpENg==,type:str] image: ENC[AES256_GCM,data:K/smElRfcbDfioQwlbSO7nMoW7yI39IzNFY2TvSgrWNaFqo5MnZSnyFOPC021WzHRZWMdQR1kTLB2zImRZuFZcEcnN0XNyHPiYrL,iv:Zikm+Z2QR4b7UEXrfrQvGh5EdywYPYxUL0lSECx7chQ=,tag:SeUO/AQmu6/eolH3+jmAYQ==,type:str] backend: ENC[AES256_GCM,data:Xhkvv02Rh5aNkw==,iv:ZN1iF4gQ7Krm99YjrrWkWbWC3PYROk9gsOqFmqS3X/E=,tag:klNdz4BlVdHTnT8sOQ8/dA==,type:str] cpu_limit: ENC[AES256_GCM,data:47Kuqa8=,iv:uZRate/HQzNH0QggGZdzguU0vZxxS5M8WYRcXUnQvo8=,tag:XnCarT23s5qOsbMim4Xs+A==,type:str] memory_limit: ENC[AES256_GCM,data:hpEAzVQ=,iv:K9Uz/YVxopA7x6amvG9GMzuvsBssTYdrFt0O8qfdcnU=,tag:XIg9CpujBmzCM/fFebxIjA==,type:str] timeout: ENC[AES256_GCM,data:jV0=,iv:pw+yw/Wzt0rtnoJmwnXzETb2bCeeGnD/nNHRq/FJH20=,tag:hwUcPI6in9/Yvqea7Hxn4Q==,type:int] image_pull_policy: ENC[AES256_GCM,data:XiQx6roJ,iv:pORRAsnjXxV/c2cHzepkSVgs18hhkHAAtOT/Npv9lME=,tag:VgUaG0Uqmj5JWo+iWujolw==,type:str] - strip_path_components: ENC[AES256_GCM,data:tA==,iv:kP2IbFJdbSMa4ywsaFde8DCooTZfvkzDIFtGDAtNdBg=,tag:iTwfngyFd7rnHwigxXCQ/w==,type:int] SERVER: ENC[AES256_GCM,data:9+nHYHNrnrHmTw0UqzyvU4CoC0YeLPE4wQb5eTWFMXb0yw==,iv:NdCWF86cPGN180Puc6y37cuhV1H0vsMZgQBEG6zLUTg=,tag:VjWyEn/Bve29esECOBMotg==,type:str] Watcher-MITx-6.0001r: AUTH: @@ -25,14 +24,13 @@ confd_json: HANDLERS: - HANDLER: ENC[AES256_GCM,data:GxcBCI9x7tzIoP+2Ssgz92doSvmldQtDDx6Jl9duFWcXP/bwRKodTLfYDEwmaw==,iv:AXZJHneW4i1z90TK7SIr6WOG7HLz4EvSP2o0rWALMIM=,tag:5iOqnbcO53qOcgWYRS3O9Q==,type:str] KWARGS: - grader_root: ENC[AES256_GCM,data:P9s0aP2ZcBJ2,iv:h9SVYsDasC6xW758EqrlEyCjkeEX+qLE/Rklcx/x80Y=,tag:YrrGSmOS0ATgofSMDeZqRQ==,type:str] + grader_root: ENC[AES256_GCM,data:IOvzEu688/L+NAhVumJegc8hY8Gq0JSR,iv:aBiBcv3PXlV4ZAAE+v44tjCgTNoWblQsvSyXBWu99VY=,tag:/+Tw3yjQyBNc1iDhnvmqQg==,type:str] image: 
ENC[AES256_GCM,data:xbI2u7mnzujef23MXabbxrTtqFzlM6SoH8koIHvJhsViugeRGYMQTpd7gLaShLLJDPGMoQ/8MGuNuZp1IY1Rn+xoFGE8W5owiYHu,iv:hIn5E/VXuPuunHkxUkFoMhWeUw8Ex+x+pfHN+cTxf/8=,tag:EMY867H0qn83N5v0OpnDrg==,type:str] backend: ENC[AES256_GCM,data:OaGBiwiRT0m/Pg==,iv:RBUzueYrobo6vv2LXXhTMPEY8q7ZvkDN86x8fBc4lOU=,tag:hqS+ZaFHjs0xhTO5ldc6pw==,type:str] cpu_limit: ENC[AES256_GCM,data:FjtcqBs=,iv:/4awwScRxzctMgesO+Akjh+xawRxjQYM6FwctaUr6to=,tag:HLUnAYyAhknuBQXTRHs3ng==,type:str] memory_limit: ENC[AES256_GCM,data:gISNMOg=,iv:y842XxT45ipoWCXrLO5Pjh6WgqtUyZ9qRfkCWfVx/6E=,tag:sdMFIVYXzUj9LeOA2XXnIg==,type:str] timeout: ENC[AES256_GCM,data:KYY=,iv:YiBTO75tTrTB8N1sjLPA0YTJX48sls7EppkfmsYAbWg=,tag:vHJR93RPrWf+qtt7+aEIOA==,type:int] image_pull_policy: ENC[AES256_GCM,data:++zPPhK8,iv:0mPD6jGWeGVsEGxmCeiJxyE9wYz0vMqVtLLYCguLg0s=,tag:ounenzLmuxsYg9tYsm86wg==,type:str] - strip_path_components: ENC[AES256_GCM,data:cQ==,iv:6igJgOzpjaOQVAvQ58cWt0rS8+1fV31eOc4JEIfYW+8=,tag:i0co134v7YmeHmVEKAWVMw==,type:int] SERVER: ENC[AES256_GCM,data:ScOVWtueEbmPj/cA42MlN58fxYYKjBWGh982ebrLp6qbaQ==,iv:uOWEPN5qBeZ9Yw6xr7hfZ6GWzgYGBYZkBFsvM0sfzyg=,tag:SyNl6UGl6yilFPNVqa1aGg==,type:str] sops: kms: @@ -46,8 +44,8 @@ sops: key_name: sops created_at: "2024-04-16T18:44:13Z" enc: vault:v1:DFw1gsayFWeGxTCrU0HCQzWk4YBPHQdKHpValoIHi4bO/jHn+eZv+Nr2d4FubYiq8jKrKREm/UgsizDS - lastmodified: "2026-03-19T23:00:25Z" - mac: ENC[AES256_GCM,data:tjYZxJzwuOHPzs7vuG1M8P2qCCjIYV5FMjGlfy3YCsBnxcPG4D89fzh0TvS8MZuCF51GH41ETuO+fSZk29ia30P96Ht1xqLmrJHgrO/xnCQUZ8U86aShscI0exo1d3pOTbvBKWwdX6AWfgNJYUgKm9qcZroLs7hGqo3WkbObPJQ=,iv:w1Jl9AfYXepCHia+ZPY6f4UxOp+Rerr8NJe6TEp6lyY=,tag:tR3N7TZ6gyTt5I5KZ04u+w==,type:str] + lastmodified: "2026-03-19T23:02:07Z" + mac: ENC[AES256_GCM,data:p8QH7qCtN2PTtTC0Gq/ZvyLr8jWwjK7J84l2RaBA4dBRekYl9aNRbMDDpoMzd9XWdGmt7diHLJXWGYktWO8mvNQDJ2f7U/+rB2wLaB8/BOPsnoWR+R1mYS+1IPQAwvmP2UfElvDp65gpi0ccvWKNugaWWNpaDnWxikmrhCD9i2E=,iv:MQWDAQHeVHXauM7eyMod0haEC1kv7CEsHqNayOI4AlI=,tag:b3xLkafRivjymyBoU4Df+w==,type:str] pgp: - 
created_at: "2024-04-16T18:44:13Z" enc: |- From a4fc006b0ffa21b7bb1ef38b135dbb7b145832f4 Mon Sep 17 00:00:00 2001 From: Tobias Macey Date: Fri, 20 Mar 2026 11:28:34 -0400 Subject: [PATCH 32/46] fix: Update mitx CI watcher password to match xqueue --- src/bridge/secrets/xqwatcher/secrets.mitx.ci.yaml | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/bridge/secrets/xqwatcher/secrets.mitx.ci.yaml b/src/bridge/secrets/xqwatcher/secrets.mitx.ci.yaml index a6596b31a2..811a656e68 100644 --- a/src/bridge/secrets/xqwatcher/secrets.mitx.ci.yaml +++ b/src/bridge/secrets/xqwatcher/secrets.mitx.ci.yaml @@ -3,7 +3,7 @@ confd_json: Watcher-MITx-6.00x: AUTH: - ENC[AES256_GCM,data:SCKbOmao2Gpl,iv:Hb7ZVtcobrj2E6hFBrmqVpq1oNnJ+j2YOQMcfLFN6Dc=,tag:3rvoooddYT3G04AQOsKxEA==,type:str] - - ENC[AES256_GCM,data:/FqGfC4XbHZ2AIaJiUHJA+hebqDA4eag+qwjiG2lywpzOOMJIwblQw==,iv:dmDX989O8vyVtgfcCcSG1IPoX/+Vy/CHifk+4omnwV8=,tag:QFkUcAaojGs/+c+5h3r5lg==,type:str] + - ENC[AES256_GCM,data:huly/No4UkykZIlKre7Lr5bJLH+4zmha3gS7m9bFbXO85TF/QPUpAg==,iv:CMds+DvCv7E9qyAlgxRvO2XAxkX4wso/21Q9zXQrf8I=,tag:xcsHlo32rScjVagNmndv8Q==,type:str] CONNECTIONS: ENC[AES256_GCM,data:Gg==,iv:d7E2fNOoSSflC2UNEnIUg9bBCHI3rlbn8pgmHCnxX68=,tag:5V0iQ94DuAdt0+zxQXV39A==,type:int] HANDLERS: - HANDLER: ENC[AES256_GCM,data:SUKLJX/arQHo7OMbzoOCQgqACuwEDVgJVJ0HGqbLWbDZR3u6dK6zvxJUrNVM7w==,iv:JRwutReGuNwudxa6sRZVLZjx34ZaiVqAQSmwdUgn5AI=,tag:w4hyFq+uznC/+u1uciYlwQ==,type:str] @@ -19,7 +19,7 @@ confd_json: Watcher-MITx-6.0001r: AUTH: - ENC[AES256_GCM,data:GeoKMl2An+NT,iv:4bpjXM5pBTGk43L4aFffe/i7Xuiqqma4x1bjwqr2vZc=,tag:cRJlbd1X9ScKyIMXc56Mng==,type:str] - - ENC[AES256_GCM,data:2IRNwrwVNeQ/N5B3Ka0E8JKa2rMoY+g1h078CWy82O0EYrsfUAEIxw==,iv:sl5rkCOZxVIM2xkgAHvsg9wPl4arEOm7NTq4B67Dz5s=,tag:dpxlezEbjFndERvvvGQSkQ==,type:str] + - ENC[AES256_GCM,data:VKQGDGtNq+FQVFLFO6IHxos/nHs+GHihFZND/mnUngNFGSLj+NSi5w==,iv:Obb9DEHraRwAf/nCaWKyFJh5wK3dTL3gzwkEDVbhV7M=,tag:HXrvSTWBPuH9vPiFKLBQKA==,type:str] CONNECTIONS: 
ENC[AES256_GCM,data:0g==,iv:6tyg9WvnRhnpLm0vKGefnd70VfpmEFy0ErgwnNiFSAU=,tag:fl01agv/QL6uosiMu0rDzA==,type:int] HANDLERS: - HANDLER: ENC[AES256_GCM,data:GxcBCI9x7tzIoP+2Ssgz92doSvmldQtDDx6Jl9duFWcXP/bwRKodTLfYDEwmaw==,iv:AXZJHneW4i1z90TK7SIr6WOG7HLz4EvSP2o0rWALMIM=,tag:5iOqnbcO53qOcgWYRS3O9Q==,type:str] @@ -44,8 +44,8 @@ sops: key_name: sops created_at: "2024-04-16T18:44:13Z" enc: vault:v1:DFw1gsayFWeGxTCrU0HCQzWk4YBPHQdKHpValoIHi4bO/jHn+eZv+Nr2d4FubYiq8jKrKREm/UgsizDS - lastmodified: "2026-03-19T23:02:07Z" - mac: ENC[AES256_GCM,data:p8QH7qCtN2PTtTC0Gq/ZvyLr8jWwjK7J84l2RaBA4dBRekYl9aNRbMDDpoMzd9XWdGmt7diHLJXWGYktWO8mvNQDJ2f7U/+rB2wLaB8/BOPsnoWR+R1mYS+1IPQAwvmP2UfElvDp65gpi0ccvWKNugaWWNpaDnWxikmrhCD9i2E=,iv:MQWDAQHeVHXauM7eyMod0haEC1kv7CEsHqNayOI4AlI=,tag:b3xLkafRivjymyBoU4Df+w==,type:str] + lastmodified: "2026-03-20T15:27:59Z" + mac: ENC[AES256_GCM,data:YuUS49U2swqS76nxsRcSxx3/66S4qxedfZeGyBKH3MsZRNCwy4BdP5xa45CwLT6xH/sWIoJPo6Drw7ECekY+wdjwqV2cM7hCtD+kH8CVqkx4CCIUNBvQyYY+d4bdiaSdty75pGgYQzD090AKtcE5acFMF3K/wrmYTPWzh8hBDRw=,iv:quJO4m+SuoiVPTAncLYJUl0jJhBWg64Abwp/oXeN9Hc=,tag:pHFGW/Prf8A2aYf2kCmSIQ==,type:str] pgp: - created_at: "2024-04-16T18:44:13Z" enc: |- @@ -128,4 +128,4 @@ sops: -----END PGP MESSAGE----- fp: 3582AE9F12CE295BDAF545ED17A5F53F11681446 unencrypted_suffix: _unencrypted - version: 3.12.1 + version: 3.12.2 From 94edcd742fac9ad9a57b18025cf850827b62e453 Mon Sep 17 00:00:00 2001 From: Tobias Macey Date: Fri, 20 Mar 2026 12:52:32 -0400 Subject: [PATCH 33/46] feat(xqwatcher): migrate grader config from SOPS/Vault KV to Pulumi config + SERVER_REF Queue configs (CONNECTIONS, HANDLERS, ContainerGrader KWARGS) are now stored as plaintext in Pulumi stack YAML files under xqwatcher:queues. The xqueue server URL is stored under xqwatcher:xqueue_server_url. SERVER_REF is injected at deploy time so xqueue-watcher resolves credentials at runtime from xqueue_servers.json, which is mounted from a Vault-synced Kubernetes Secret. 
The secret is sourced from the same secret-{env_prefix}/edx-xqueue Vault KV path already used by the xqueue and edxapp deployments (xqwatcher_password field), eliminating the separate xqwatcher-specific KV mount and SOPS secrets files. Changes: - __main__.py: remove SOPS read, vault.kv.SecretV2, vault_mount_stack StackReference, and XQWATCHER_HTTP_BASIC_AUTH env var; read queues config from Pulumi config; inject SERVER_REF into each queue entry; move grader_config.json into ConfigMap; add xqueue_servers.json Vault-synced secret from secret-{env_prefix}/edx-xqueue; update Deployment volumes/mounts accordingly - xqwatcher_server_policy.hcl: remove secret-xqwatcher/* path - All 9 stack YAML files: add xqueue_server_url and queues config Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- ...pplications.xqwatcher.mitx-staging.CI.yaml | 26 +++ ...ons.xqwatcher.mitx-staging.Production.yaml | 26 +++ ...pplications.xqwatcher.mitx-staging.QA.yaml | 26 +++ ...Pulumi.applications.xqwatcher.mitx.CI.yaml | 26 +++ ...pplications.xqwatcher.mitx.Production.yaml | 26 +++ ...Pulumi.applications.xqwatcher.mitx.QA.yaml | 26 +++ ....applications.xqwatcher.mitxonline.CI.yaml | 14 ++ ...tions.xqwatcher.mitxonline.Production.yaml | 14 ++ ....applications.xqwatcher.mitxonline.QA.yaml | 14 ++ .../applications/xqwatcher/__main__.py | 160 +++++++----------- .../xqwatcher/xqwatcher_server_policy.hcl | 4 - 11 files changed, 261 insertions(+), 101 deletions(-) diff --git a/src/ol_infrastructure/applications/xqwatcher/Pulumi.applications.xqwatcher.mitx-staging.CI.yaml b/src/ol_infrastructure/applications/xqwatcher/Pulumi.applications.xqwatcher.mitx-staging.CI.yaml index 9dd17a1cda..5fa3b1e484 100644 --- a/src/ol_infrastructure/applications/xqwatcher/Pulumi.applications.xqwatcher.mitx-staging.CI.yaml +++ b/src/ol_infrastructure/applications/xqwatcher/Pulumi.applications.xqwatcher.mitx-staging.CI.yaml @@ -9,3 +9,29 @@ config: xqwatcher:min_replicas: 1 vault:address: 
https://vault-ci.odl.mit.edu vault_server:env_namespace: operations.ci + xqwatcher:xqueue_server_url: https://xqueue.mitx-staging-ci.odl.mit.edu + xqwatcher:queues: + Watcher-MITx-6.00x: + CONNECTIONS: 5 + HANDLERS: + - HANDLER: xqueue_watcher.containergrader.ContainerGrader + KWARGS: + grader_root: /graders/python3graders/ + image: 610119931565.dkr.ecr.us-east-1.amazonaws.com/mitodl/graders-mit-600x:latest + backend: kubernetes + cpu_limit: 1000m + memory_limit: 512Mi + timeout: 60 + image_pull_policy: always + Watcher-MITx-6.0001r: + CONNECTIONS: 5 + HANDLERS: + - HANDLER: xqueue_watcher.containergrader.ContainerGrader + KWARGS: + grader_root: /graders/python3graders/ + image: 610119931565.dkr.ecr.us-east-1.amazonaws.com/mitodl/graders-mit-600x:latest + backend: kubernetes + cpu_limit: 1000m + memory_limit: 512Mi + timeout: 60 + image_pull_policy: always diff --git a/src/ol_infrastructure/applications/xqwatcher/Pulumi.applications.xqwatcher.mitx-staging.Production.yaml b/src/ol_infrastructure/applications/xqwatcher/Pulumi.applications.xqwatcher.mitx-staging.Production.yaml index 21f696187d..3c55f1b327 100644 --- a/src/ol_infrastructure/applications/xqwatcher/Pulumi.applications.xqwatcher.mitx-staging.Production.yaml +++ b/src/ol_infrastructure/applications/xqwatcher/Pulumi.applications.xqwatcher.mitx-staging.Production.yaml @@ -9,3 +9,29 @@ config: xqwatcher:min_replicas: 1 vault:address: https://vault-production.odl.mit.edu vault_server:env_namespace: operations.production + xqwatcher:xqueue_server_url: https://xqueue.mitx-staging-prod.odl.mit.edu + xqwatcher:queues: + Watcher-MITx-staging-6.00x: + CONNECTIONS: 5 + HANDLERS: + - HANDLER: xqueue_watcher.containergrader.ContainerGrader + KWARGS: + grader_root: /graders/python3graders/ + image: 610119931565.dkr.ecr.us-east-1.amazonaws.com/mitodl/graders-mit-600x:latest + backend: kubernetes + cpu_limit: 1000m + memory_limit: 512Mi + timeout: 60 + image_pull_policy: always + Watcher-MITx-staging-6.0001r: + 
CONNECTIONS: 5 + HANDLERS: + - HANDLER: xqueue_watcher.containergrader.ContainerGrader + KWARGS: + grader_root: /graders/python3graders/ + image: 610119931565.dkr.ecr.us-east-1.amazonaws.com/mitodl/graders-mit-600x:latest + backend: kubernetes + cpu_limit: 1000m + memory_limit: 512Mi + timeout: 60 + image_pull_policy: always diff --git a/src/ol_infrastructure/applications/xqwatcher/Pulumi.applications.xqwatcher.mitx-staging.QA.yaml b/src/ol_infrastructure/applications/xqwatcher/Pulumi.applications.xqwatcher.mitx-staging.QA.yaml index fe7dd890fd..538de15860 100644 --- a/src/ol_infrastructure/applications/xqwatcher/Pulumi.applications.xqwatcher.mitx-staging.QA.yaml +++ b/src/ol_infrastructure/applications/xqwatcher/Pulumi.applications.xqwatcher.mitx-staging.QA.yaml @@ -9,3 +9,29 @@ config: xqwatcher:min_replicas: 1 vault:address: https://vault-qa.odl.mit.edu vault_server:env_namespace: operations.qa + xqwatcher:xqueue_server_url: https://xqueue.mitx-staging-qa.odl.mit.edu + xqwatcher:queues: + Watcher-MITx-staging-6.00x: + CONNECTIONS: 5 + HANDLERS: + - HANDLER: xqueue_watcher.containergrader.ContainerGrader + KWARGS: + grader_root: /graders/python3graders/ + image: 610119931565.dkr.ecr.us-east-1.amazonaws.com/mitodl/graders-mit-600x:latest + backend: kubernetes + cpu_limit: 1000m + memory_limit: 512Mi + timeout: 60 + image_pull_policy: always + Watcher-MITx-staging-6.0001r: + CONNECTIONS: 5 + HANDLERS: + - HANDLER: xqueue_watcher.containergrader.ContainerGrader + KWARGS: + grader_root: /graders/python3graders/ + image: 610119931565.dkr.ecr.us-east-1.amazonaws.com/mitodl/graders-mit-600x:latest + backend: kubernetes + cpu_limit: 1000m + memory_limit: 512Mi + timeout: 60 + image_pull_policy: always diff --git a/src/ol_infrastructure/applications/xqwatcher/Pulumi.applications.xqwatcher.mitx.CI.yaml b/src/ol_infrastructure/applications/xqwatcher/Pulumi.applications.xqwatcher.mitx.CI.yaml index a83669d67f..74b8886a89 100644 --- 
a/src/ol_infrastructure/applications/xqwatcher/Pulumi.applications.xqwatcher.mitx.CI.yaml +++ b/src/ol_infrastructure/applications/xqwatcher/Pulumi.applications.xqwatcher.mitx.CI.yaml @@ -9,3 +9,29 @@ config: xqwatcher:min_replicas: 1 vault:address: https://vault-ci.odl.mit.edu vault_server:env_namespace: operations.ci + xqwatcher:xqueue_server_url: https://xqueue.mitx.ci.odl.mit.edu + xqwatcher:queues: + Watcher-MITx-6.00x: + CONNECTIONS: 5 + HANDLERS: + - HANDLER: xqueue_watcher.containergrader.ContainerGrader + KWARGS: + grader_root: /graders/python3graders/ + image: 610119931565.dkr.ecr.us-east-1.amazonaws.com/mitodl/graders-mit-600x:latest + backend: kubernetes + cpu_limit: 1000m + memory_limit: 512Mi + timeout: 60 + image_pull_policy: always + Watcher-MITx-6.0001r: + CONNECTIONS: 5 + HANDLERS: + - HANDLER: xqueue_watcher.containergrader.ContainerGrader + KWARGS: + grader_root: /graders/python3graders/ + image: 610119931565.dkr.ecr.us-east-1.amazonaws.com/mitodl/graders-mit-600x:latest + backend: kubernetes + cpu_limit: 1000m + memory_limit: 512Mi + timeout: 60 + image_pull_policy: always diff --git a/src/ol_infrastructure/applications/xqwatcher/Pulumi.applications.xqwatcher.mitx.Production.yaml b/src/ol_infrastructure/applications/xqwatcher/Pulumi.applications.xqwatcher.mitx.Production.yaml index e3ca466f59..53936aecb3 100644 --- a/src/ol_infrastructure/applications/xqwatcher/Pulumi.applications.xqwatcher.mitx.Production.yaml +++ b/src/ol_infrastructure/applications/xqwatcher/Pulumi.applications.xqwatcher.mitx.Production.yaml @@ -9,3 +9,29 @@ config: xqwatcher:min_replicas: 2 vault:address: https://vault-production.odl.mit.edu vault_server:env_namespace: operations.production + xqwatcher:xqueue_server_url: https://xqueue.mitx.odl.mit.edu + xqwatcher:queues: + Watcher-MITx-6.00x: + CONNECTIONS: 5 + HANDLERS: + - HANDLER: xqueue_watcher.containergrader.ContainerGrader + KWARGS: + grader_root: /graders/python3graders/ + image: 
610119931565.dkr.ecr.us-east-1.amazonaws.com/mitodl/graders-mit-600x:latest + backend: kubernetes + cpu_limit: 1000m + memory_limit: 512Mi + timeout: 60 + image_pull_policy: always + Watcher-MITx-6.0001r: + CONNECTIONS: 5 + HANDLERS: + - HANDLER: xqueue_watcher.containergrader.ContainerGrader + KWARGS: + grader_root: /graders/python3graders/ + image: 610119931565.dkr.ecr.us-east-1.amazonaws.com/mitodl/graders-mit-600x:latest + backend: kubernetes + cpu_limit: 1000m + memory_limit: 512Mi + timeout: 60 + image_pull_policy: always diff --git a/src/ol_infrastructure/applications/xqwatcher/Pulumi.applications.xqwatcher.mitx.QA.yaml b/src/ol_infrastructure/applications/xqwatcher/Pulumi.applications.xqwatcher.mitx.QA.yaml index 1776ffe84b..3b6bc1b926 100644 --- a/src/ol_infrastructure/applications/xqwatcher/Pulumi.applications.xqwatcher.mitx.QA.yaml +++ b/src/ol_infrastructure/applications/xqwatcher/Pulumi.applications.xqwatcher.mitx.QA.yaml @@ -9,3 +9,29 @@ config: xqwatcher:min_replicas: 1 vault:address: https://vault-qa.odl.mit.edu vault_server:env_namespace: operations.qa + xqwatcher:xqueue_server_url: https://xqueue.mitx.qa.odl.mit.edu + xqwatcher:queues: + Watcher-MITx-6.00x: + CONNECTIONS: 5 + HANDLERS: + - HANDLER: xqueue_watcher.containergrader.ContainerGrader + KWARGS: + grader_root: /graders/python3graders/ + image: 610119931565.dkr.ecr.us-east-1.amazonaws.com/mitodl/graders-mit-600x:latest + backend: kubernetes + cpu_limit: 1000m + memory_limit: 512Mi + timeout: 60 + image_pull_policy: always + Watcher-MITx-6.0001r: + CONNECTIONS: 5 + HANDLERS: + - HANDLER: xqueue_watcher.containergrader.ContainerGrader + KWARGS: + grader_root: /graders/python3graders/ + image: 610119931565.dkr.ecr.us-east-1.amazonaws.com/mitodl/graders-mit-600x:latest + backend: kubernetes + cpu_limit: 1000m + memory_limit: 512Mi + timeout: 60 + image_pull_policy: always diff --git a/src/ol_infrastructure/applications/xqwatcher/Pulumi.applications.xqwatcher.mitxonline.CI.yaml 
b/src/ol_infrastructure/applications/xqwatcher/Pulumi.applications.xqwatcher.mitxonline.CI.yaml index 39748de90a..5c01f5a82f 100644 --- a/src/ol_infrastructure/applications/xqwatcher/Pulumi.applications.xqwatcher.mitxonline.CI.yaml +++ b/src/ol_infrastructure/applications/xqwatcher/Pulumi.applications.xqwatcher.mitxonline.CI.yaml @@ -9,3 +9,17 @@ config: xqwatcher:min_replicas: 1 vault:address: https://vault-ci.odl.mit.edu vault_server:env_namespace: operations.ci + xqwatcher:xqueue_server_url: https://xqueue.mitxonline.ci.odl.mit.edu + xqwatcher:queues: + Watcher-MITx-6.00x: + CONNECTIONS: 5 + HANDLERS: + - HANDLER: xqueue_watcher.containergrader.ContainerGrader + KWARGS: + grader_root: /graders/python3graders/ + image: 610119931565.dkr.ecr.us-east-1.amazonaws.com/mitodl/graders-mit-600x:latest + backend: kubernetes + cpu_limit: 1000m + memory_limit: 512Mi + timeout: 60 + image_pull_policy: always diff --git a/src/ol_infrastructure/applications/xqwatcher/Pulumi.applications.xqwatcher.mitxonline.Production.yaml b/src/ol_infrastructure/applications/xqwatcher/Pulumi.applications.xqwatcher.mitxonline.Production.yaml index 2ff68415aa..e3830c39a7 100644 --- a/src/ol_infrastructure/applications/xqwatcher/Pulumi.applications.xqwatcher.mitxonline.Production.yaml +++ b/src/ol_infrastructure/applications/xqwatcher/Pulumi.applications.xqwatcher.mitxonline.Production.yaml @@ -9,3 +9,17 @@ config: xqwatcher:min_replicas: 2 vault:address: https://vault-production.odl.mit.edu vault_server:env_namespace: operations.production + xqwatcher:xqueue_server_url: https://xqueue.mitxonline.odl.mit.edu + xqwatcher:queues: + Watcher-MITx-6.00x: + CONNECTIONS: 5 + HANDLERS: + - HANDLER: xqueue_watcher.containergrader.ContainerGrader + KWARGS: + grader_root: /graders/python3graders/ + image: 610119931565.dkr.ecr.us-east-1.amazonaws.com/mitodl/graders-mit-600x:latest + backend: kubernetes + cpu_limit: 1000m + memory_limit: 512Mi + timeout: 60 + image_pull_policy: always diff --git 
a/src/ol_infrastructure/applications/xqwatcher/Pulumi.applications.xqwatcher.mitxonline.QA.yaml b/src/ol_infrastructure/applications/xqwatcher/Pulumi.applications.xqwatcher.mitxonline.QA.yaml index f4a438ef4c..5b4317485b 100644 --- a/src/ol_infrastructure/applications/xqwatcher/Pulumi.applications.xqwatcher.mitxonline.QA.yaml +++ b/src/ol_infrastructure/applications/xqwatcher/Pulumi.applications.xqwatcher.mitxonline.QA.yaml @@ -9,3 +9,17 @@ config: xqwatcher:min_replicas: 1 vault:address: https://vault-qa.odl.mit.edu vault_server:env_namespace: operations.qa + xqwatcher:xqueue_server_url: https://xqueue.mitxonline.qa.odl.mit.edu + xqwatcher:queues: + Watcher-MITx-6.00x: + CONNECTIONS: 5 + HANDLERS: + - HANDLER: xqueue_watcher.containergrader.ContainerGrader + KWARGS: + grader_root: /graders/python3graders/ + image: 610119931565.dkr.ecr.us-east-1.amazonaws.com/mitodl/graders-mit-600x:latest + backend: kubernetes + cpu_limit: 1000m + memory_limit: 512Mi + timeout: 60 + image_pull_policy: always diff --git a/src/ol_infrastructure/applications/xqwatcher/__main__.py b/src/ol_infrastructure/applications/xqwatcher/__main__.py index 64612ebf98..9da408e4da 100644 --- a/src/ol_infrastructure/applications/xqwatcher/__main__.py +++ b/src/ol_infrastructure/applications/xqwatcher/__main__.py @@ -8,17 +8,15 @@ Secrets are managed via the Vault Secrets Operator (VaultStaticSecret CRD). 
""" +import copy import json import os from pathlib import Path +from typing import Any import pulumi_kubernetes as kubernetes -import pulumi_vault as vault from pulumi import Config, ResourceOptions, StackReference, export -from pulumi_aws import get_caller_identity -from bridge.secrets.sops import read_yaml_secrets -from bridge.settings.openedx.version_matrix import OpenLearningOpenEdxDeployment from ol_infrastructure.components.applications.eks import ( OLEKSAuthBinding, OLEKSAuthBindingConfig, @@ -42,10 +40,6 @@ stack_info = parse_stack() xqwatcher_config = Config("xqwatcher") -vault_mount_stack = StackReference( - f"substructure.vault.static_mounts.operations.{stack_info.name}" -) - cluster_name = xqwatcher_config.get("cluster") or "applications" cluster_stack = StackReference( f"infrastructure.aws.eks.{cluster_name}.{stack_info.name}" @@ -53,14 +47,6 @@ env_name = f"{stack_info.env_prefix}-{stack_info.env_suffix}" -openedx_release = ( - OpenLearningOpenEdxDeployment.get_item(stack_info.env_prefix) - .release_by_env(stack_info.name) - .value -) - -aws_account = get_caller_identity() - aws_config = AWSBase( tags={ "OU": xqwatcher_config.require("business_unit"), @@ -97,51 +83,28 @@ grader_timeout = xqwatcher_config.get("grader_timeout") or "20" ################################## -## Vault Secret Data ## +## Grader Queue Config ## ################################## -# Preserve management of the grader config secret in Vault KV. -# The VaultStaticSecret CRD (below) will sync this into the cluster. -vault_secrets = read_yaml_secrets( - Path(f"xqwatcher/secrets.{stack_info.env_prefix}.{stack_info.env_suffix}.yaml") -) - - -# For ContainerGrader handlers: if the SOPS secret supplies a plain DockerHub -# image reference in KWARGS.image, rewrite it to use the ECR pull-through -# cache so the grading Jobs are not subject to DockerHub rate limits. -# Images that already have a registry hostname (e.g. 
private ECR URIs like -# 610119931565.dkr.ecr.us-east-1.amazonaws.com/…, or ghcr.io/…) are left -# unchanged — the hostname is identified by a "." in the first path component. -def _needs_pullthrough_rewrite(image: str) -> bool: - """Return True only for bare DockerHub image refs (no registry hostname).""" - first_component = image.split("/", maxsplit=1)[0] - return "." not in first_component and ":" not in first_component - - -if isinstance(vault_secrets.get("confd_json"), dict): - for _queue_cfg in vault_secrets["confd_json"].values(): - for handler_cfg in _queue_cfg.get("HANDLERS", []): - if handler_cfg.get("HANDLER", "").endswith( - "ContainerGrader" - ) and "image" in handler_cfg.get("KWARGS", {}): - image_ref = handler_cfg["KWARGS"]["image"] - if _needs_pullthrough_rewrite(image_ref): - handler_cfg["KWARGS"]["image"] = cached_image_uri(image_ref) - -# VSO renders secret values using Go templates: {{ .Secrets.confd_json }}. -# If confd_json is stored as a nested object, VSO renders it as a Go map -# literal rather than JSON. Pre-serialize it to a JSON string so the -# template output is valid JSON that xqueue-watcher can parse. -if "confd_json" in vault_secrets and not isinstance(vault_secrets["confd_json"], str): - vault_secrets["confd_json"] = json.dumps(vault_secrets["confd_json"]) -xqwatcher_vault_mount_name = vault_mount_stack.require_output("xqwatcher_kv")["path"] -vault.kv.SecretV2( - f"xqwatcher-{env_name}-grader-static-secrets", - mount=xqwatcher_vault_mount_name, - name=f"{stack_info.env_prefix}-grader-config", - data_json=json.dumps(vault_secrets), -) +xqueue_server_url = xqwatcher_config.require("xqueue_server_url") + +# Read the non-secret queue configs from Pulumi stack config and inject +# SERVER_REF so credentials are resolved from xqueue_servers.json at runtime. 
+_queues_raw: dict[str, Any] = xqwatcher_config.require_object("queues") +queues_config: dict[str, Any] = {} +for queue_name, queue_cfg in _queues_raw.items(): + entry = copy.deepcopy(queue_cfg) + # Rewrite bare DockerHub image refs to use the ECR pull-through cache. + for handler_cfg in entry.get("HANDLERS", []): + if handler_cfg.get("HANDLER", "").endswith( + "ContainerGrader" + ) and "image" in handler_cfg.get("KWARGS", {}): + image_ref = handler_cfg["KWARGS"]["image"] + first_component = image_ref.split("/", maxsplit=1)[0] + if "." not in first_component and ":" not in first_component: + handler_cfg["KWARGS"]["image"] = cached_image_uri(image_ref) + entry["SERVER_REF"] = "default" + queues_config[queue_name] = entry ################################## ## Vault Policy + K8s Auth ## @@ -176,30 +139,35 @@ def _needs_pullthrough_rewrite(image: str) -> bool: ## Vault Secrets ## ################################## -# Grader handler config (queue names, ContainerGrader KWARGS, xqueue URL+auth). -# Stored as `confd_json` in the Vault KV entry written above. -grader_config_secret_name = ( - "xqwatcher-grader-config" # pragma: allowlist secret # noqa: S105 +# xqueue_servers.json — the only secret: xqueue URL and xqwatcher credentials. +# Sourced from the same Vault KV entry used by the xqueue and edxapp deployments. 
+xqueue_servers_secret_name = ( + "xqwatcher-xqueue-servers" # pragma: allowlist secret # noqa: S105 ) -grader_config_secret = OLVaultK8SSecret( - f"xqwatcher-{env_name}-grader-config-secret", +xqueue_servers_template = json.dumps( + { + "default": { + "SERVER": xqueue_server_url, + "AUTH": ["xqwatcher", "{{ .Secrets.xqwatcher_password }}"], + } + } +) +xqueue_servers_secret = OLVaultK8SSecret( + f"xqwatcher-{env_name}-xqueue-servers-secret", OLVaultK8SStaticSecretConfig( - name=grader_config_secret_name, + name=xqueue_servers_secret_name, namespace=namespace, - dest_secret_name=grader_config_secret_name, + dest_secret_name=xqueue_servers_secret_name, dest_secret_labels=k8s_global_labels.model_dump(), labels=k8s_global_labels.model_dump(), - mount=xqwatcher_vault_mount_name, - mount_type="kv-v2", - path=f"{stack_info.env_prefix}-grader-config", + mount=f"secret-{stack_info.env_prefix}", + mount_type="kv-v1", + path="edx-xqueue", refresh_after="1h", restart_target_kind="Deployment", restart_target_name="xqwatcher", - # Expose the rendered grader JSON and the HTTP Basic Auth credential - # used by the xqueue-watcher manager to authenticate with xqueue. templates={ - "grader_config.json": "{{ .Secrets.confd_json }}", - "http_basic_auth": "{{ .Secrets.http_basic_auth }}", + "xqueue_servers.json": xqueue_servers_template, }, vaultauth=vault_k8s_resources.auth_name, ), @@ -213,8 +181,8 @@ def _needs_pullthrough_rewrite(image: str) -> bool: ## ConfigMap ## ################################## -# Base xqueue-watcher config (poll settings, logging). -# Per-queue grader config comes from the Vault-synced secret above. +# Base xqueue-watcher config (poll settings, logging) and non-secret grader +# queue configs. The Vault-synced secret provides xqueue_servers.json. 
xqwatcher_configmap = kubernetes.core.v1.ConfigMap( f"xqwatcher-{env_name}-configmap", metadata=kubernetes.meta.v1.ObjectMetaArgs( @@ -256,6 +224,9 @@ def _needs_pullthrough_rewrite(image: str) -> bool: }, } ), + # Non-secret queue configs; SERVER_REF resolves credentials at runtime + # from xqueue_servers.json (mounted from the Vault-synced secret). + "grader_config.json": json.dumps(queues_config), }, ) @@ -360,20 +331,6 @@ def _needs_pullthrough_rewrite(image: str) -> bool: command=["uv", "run", "--no-sync", "xqueue-watcher"], args=["-d", "/xqwatcher"], env=[ - # HTTP Basic Auth for the xqueue server endpoint. - # Value is "username:password"; sourced from the - # Vault-synced secret so it never appears in the - # Deployment spec. - kubernetes.core.v1.EnvVarArgs( - name="XQWATCHER_HTTP_BASIC_AUTH", - value_from=kubernetes.core.v1.EnvVarSourceArgs( - secret_key_ref=kubernetes.core.v1.SecretKeySelectorArgs( - name=grader_config_secret_name, - key="http_basic_auth", - optional=True, - ) - ), - ), # Non-sensitive manager config values — match # MANAGER_CONFIG_DEFAULTS in env_settings.py. kubernetes.core.v1.EnvVarArgs( @@ -463,14 +420,23 @@ def _needs_pullthrough_rewrite(image: str) -> bool: sub_path="logging.json", read_only=True, ), - # Per-queue grader handler config from Vault secret, - # placed under conf.d/ so the manager discovers it. + # Per-queue grader handler config from the ConfigMap + # (non-secret: no SERVER/AUTH, uses SERVER_REF). kubernetes.core.v1.VolumeMountArgs( - name="grader-config", + name="xqwatcher-config", mount_path="/xqwatcher/conf.d/grader_config.json", sub_path="grader_config.json", read_only=True, ), + # Named server definitions (SERVER URL + AUTH credentials) + # from the Vault-synced secret, mounted at the config + # root so xqueue-watcher can resolve SERVER_REF entries. 
+ kubernetes.core.v1.VolumeMountArgs( + name="xqueue-servers", + mount_path="/xqwatcher/xqueue_servers.json", + sub_path="xqueue_servers.json", + read_only=True, + ), ], ), ], @@ -482,16 +448,16 @@ def _needs_pullthrough_rewrite(image: str) -> bool: ), ), kubernetes.core.v1.VolumeArgs( - name="grader-config", + name="xqueue-servers", secret=kubernetes.core.v1.SecretVolumeSourceArgs( - secret_name=grader_config_secret_name, + secret_name=xqueue_servers_secret_name, ), ), ], ), ), ), - opts=ResourceOptions(depends_on=[grader_config_secret]), + opts=ResourceOptions(depends_on=[xqueue_servers_secret]), ) ################################## @@ -500,4 +466,4 @@ def _needs_pullthrough_rewrite(image: str) -> bool: export("k8s_deployment_name", "xqwatcher") export("k8s_namespace", namespace) -export("grader_config_secret", grader_config_secret_name) +export("xqueue_servers_secret", xqueue_servers_secret_name) diff --git a/src/ol_infrastructure/applications/xqwatcher/xqwatcher_server_policy.hcl b/src/ol_infrastructure/applications/xqwatcher/xqwatcher_server_policy.hcl index 19e86627ef..65b51c67f6 100644 --- a/src/ol_infrastructure/applications/xqwatcher/xqwatcher_server_policy.hcl +++ b/src/ol_infrastructure/applications/xqwatcher/xqwatcher_server_policy.hcl @@ -2,10 +2,6 @@ path "sys/leases/renew" { capabilities = [ "update" ] } -path "secret-xqwatcher/*" { - capabilities = [ "read" ] -} - path "secret-DEPLOYMENT/edx-xqueue" { capabilities = [ "read" ] } From 41b0abaf966bd802fd52babba204dffad36c791e Mon Sep 17 00:00:00 2001 From: Tobias Macey Date: Fri, 20 Mar 2026 13:05:59 -0400 Subject: [PATCH 34/46] fix: address PR review feedback - Add AWS_DEFAULT_REGION=us-east-1 to ensure_ecr_task params so the AWS CLI knows which region to use without relying on worker defaults - Remove spurious service_account_name kwarg from OLVaultK8SResourcesConfig instantiation in OLEKSAuthBinding; the field does not exist on the model and the name is derived internally from application_name - 
Fix liveness probe to use 'uv run --no-sync python' instead of bare 'python', which would fail with ModuleNotFoundError because xqueue_watcher is only available inside the uv virtual environment Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- src/ol_concourse/lib/containers.py | 1 + src/ol_infrastructure/applications/xqwatcher/__main__.py | 3 +++ src/ol_infrastructure/components/applications/eks.py | 1 - 3 files changed, 4 insertions(+), 1 deletion(-) diff --git a/src/ol_concourse/lib/containers.py b/src/ol_concourse/lib/containers.py index c275da4c81..9e1a82b0a3 100644 --- a/src/ol_concourse/lib/containers.py +++ b/src/ol_concourse/lib/containers.py @@ -60,6 +60,7 @@ def ensure_ecr_task(ecr_repo_name: str) -> TaskStep: ), params={ "REPO_NAME": ecr_repo_name, + "AWS_DEFAULT_REGION": "us-east-1", "AWS_PAGER": "cat", }, run=Command( diff --git a/src/ol_infrastructure/applications/xqwatcher/__main__.py b/src/ol_infrastructure/applications/xqwatcher/__main__.py index 9da408e4da..e9b00a40fc 100644 --- a/src/ol_infrastructure/applications/xqwatcher/__main__.py +++ b/src/ol_infrastructure/applications/xqwatcher/__main__.py @@ -383,6 +383,9 @@ liveness_probe=kubernetes.core.v1.ProbeArgs( exec_=kubernetes.core.v1.ExecActionArgs( command=[ + "uv", + "run", + "--no-sync", "python", "-c", "import xqueue_watcher; import sys; sys.exit(0)", diff --git a/src/ol_infrastructure/components/applications/eks.py b/src/ol_infrastructure/components/applications/eks.py index af5bf75271..649a43cd25 100644 --- a/src/ol_infrastructure/components/applications/eks.py +++ b/src/ol_infrastructure/components/applications/eks.py @@ -212,7 +212,6 @@ def __init__( or f"https://vault-{stack_info.env_suffix}.odl.mit.edu", vault_auth_endpoint=config.vault_auth_endpoint, vault_auth_role_name=k8s_auth_backend_role.role_name, - service_account_name=service_account_names[0], ) self.vault_k8s_resources = OLVaultK8SResources( resource_config=vault_k8s_resources_config, From 
7811043266093100a55e3a055a93caf0c4c7520a Mon Sep 17 00:00:00 2001 From: Tobias Macey Date: Fri, 20 Mar 2026 13:22:38 -0400 Subject: [PATCH 35/46] feat(xqwatcher): add Docker+Pulumi Concourse pipeline Replace the old Packer-based xqwatcher pipeline with a Docker+Pulumi pipeline that mirrors the xqueue pattern: - Watches mitodl/xqueue-watcher (main) for new commits - Builds and pushes the Docker image to DockerHub as mitodl/xqueue-watcher:{release} - Passes the built image digest as XQWATCHER_DOCKER_DIGEST to each Pulumi stack so the Deployment rolls to the exact image SHA Update meta.py to generate docker-pulumi-xqwatcher-{release} pipelines instead of the retired packer-pulumi ones. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../xqwatcher/docker_pulumi_pipeline.py | 120 ++++++++++++++++++ .../pipelines/open_edx/xqwatcher/meta.py | 4 +- 2 files changed, 122 insertions(+), 2 deletions(-) create mode 100644 src/ol_concourse/pipelines/open_edx/xqwatcher/docker_pulumi_pipeline.py diff --git a/src/ol_concourse/pipelines/open_edx/xqwatcher/docker_pulumi_pipeline.py b/src/ol_concourse/pipelines/open_edx/xqwatcher/docker_pulumi_pipeline.py new file mode 100644 index 0000000000..38fc3000c0 --- /dev/null +++ b/src/ol_concourse/pipelines/open_edx/xqwatcher/docker_pulumi_pipeline.py @@ -0,0 +1,120 @@ +import sys + +from bridge.settings.openedx.accessors import filter_deployments_by_application +from ol_concourse.lib.containers import container_build_task +from ol_concourse.lib.jobs.infrastructure import pulumi_jobs_chain +from ol_concourse.lib.models.fragment import PipelineFragment +from ol_concourse.lib.models.pipeline import ( + GetStep, + Identifier, + Input, + Job, + Pipeline, + PutStep, +) +from ol_concourse.lib.resources import git_repo, registry_image +from ol_concourse.pipelines.constants import PULUMI_CODE_PATH, PULUMI_WATCHED_PATHS + + +def build_xqwatcher_pipeline(release_name: str) -> Pipeline: + xqwatcher_repo = git_repo( + 
name=Identifier("xqueue-watcher-code"), + uri="https://github.com/mitodl/xqueue-watcher", + branch="main", + ) + + xqwatcher_registry_image = registry_image( + name=Identifier("xqueue-watcher-container"), + image_repository="mitodl/xqueue-watcher", + image_tag=release_name, + username="((dockerhub.username))", + password="((dockerhub.password))", # noqa: S106 + ) + + xqwatcher_pulumi_code = git_repo( + name=Identifier("ol-infrastructure-deploy"), + uri="https://github.com/mitodl/ol-infrastructure", + branch="main", + paths=[ + *PULUMI_WATCHED_PATHS, + PULUMI_CODE_PATH.joinpath("applications/xqwatcher/"), + "src/bridge/settings/openedx/", + ], + ) + + image_build_job = Job( + name=Identifier("build-xqueue-watcher-image"), + plan=[ + GetStep(get=xqwatcher_repo.name, trigger=True), + container_build_task( + inputs=[Input(name=xqwatcher_repo.name)], + build_parameters={"CONTEXT": str(xqwatcher_repo.name)}, + ), + PutStep( + put=xqwatcher_registry_image.name, + params={ + "image": "image/image.tar", + "additional_tags": f"./{xqwatcher_repo.name}/.git/describe_ref", + }, + ), + ], + ) + + container_fragment = PipelineFragment( + resources=[xqwatcher_repo, xqwatcher_registry_image], + jobs=[image_build_job], + ) + + loop_fragments = [] + for deployment in filter_deployments_by_application(release_name, "xqwatcher"): + pulumi_fragment = pulumi_jobs_chain( + xqwatcher_pulumi_code, + stack_names=[ + f"applications.xqwatcher.{deployment.deployment_name}.{stage}" + for stage in deployment.envs_by_release(release_name) + ], + project_name="ol-infrastructure-xqwatcher-server", + project_source_path=PULUMI_CODE_PATH.joinpath("applications/xqwatcher/"), + dependencies=[ + GetStep( + get=xqwatcher_registry_image.name, + trigger=True, + passed=[image_build_job.name], + ), + ], + env_vars_from_files={ + "XQWATCHER_DOCKER_DIGEST": f"{xqwatcher_registry_image.name}/digest" + }, + ) + loop_fragments.append(pulumi_fragment) + + combined_fragments = PipelineFragment.combine_fragments( + 
container_fragment, + *loop_fragments, + ) + + return Pipeline( + resource_types=combined_fragments.resource_types, + resources=[ + *combined_fragments.resources, + xqwatcher_pulumi_code, + ], + jobs=combined_fragments.jobs, + ) + + +if __name__ == "__main__": + release_name = sys.argv[1] + pipeline_json = build_xqwatcher_pipeline(release_name).model_dump_json(indent=2) + with open("definition.json", "w") as definition: # noqa: PTH123 + definition.write(pipeline_json) + sys.stdout.write(pipeline_json) + sys.stdout.writelines( + ( + "\n", + ( + "fly -t set-pipeline -p" + f" docker-pulumi-xqwatcher-{release_name} -c definition.json" + ), + ) + ) diff --git a/src/ol_concourse/pipelines/open_edx/xqwatcher/meta.py b/src/ol_concourse/pipelines/open_edx/xqwatcher/meta.py index 6d79c62c4a..6ee836d7fb 100644 --- a/src/ol_concourse/pipelines/open_edx/xqwatcher/meta.py +++ b/src/ol_concourse/pipelines/open_edx/xqwatcher/meta.py @@ -37,10 +37,10 @@ def build_meta_job(release_name): pipeline_id = "self" else: pipeline_definition_path = ( - "src/ol_concourse/pipelines/open_edx/xqwatcher/packer_pulumi_pipeline.py" + "src/ol_concourse/pipelines/open_edx/xqwatcher/docker_pulumi_pipeline.py" ) pipeline_team = "infrastructure" - pipeline_id = f"packer-pulumi-xqwatcher-{release_name}" + pipeline_id = f"docker-pulumi-xqwatcher-{release_name}" return Job( name=Identifier(f"create-xqwatcher-{release_name}-pipeline"), plan=[ From db24a457984ac29e77de60b6bbdbc01e5d288088 Mon Sep 17 00:00:00 2001 From: Tobias Macey Date: Fri, 20 Mar 2026 13:34:35 -0400 Subject: [PATCH 36/46] fix: address remaining Copilot PR review feedback - Unpin grader_images meta pipeline from feature branch; track main - Unpin xqueue-watcher base image source from dev branch; track main - Unpin graders-mit-600x grader repo from feature branch; track main - Fix base_image_pipeline.py docstring: downstream pipelines trigger off the DockerHub push, not the ECR push - Add xqwatcher:docker_tag config fallback for 
XQWATCHER_DOCKER_DIGEST so pulumi up can run without the env var set (matches xqueue pattern) - Remove env vars that duplicate xqwatcher.json ConfigMap values (POLL_TIME, REQUESTS_TIMEOUT, POLL_INTERVAL, FOLLOW_CLIENT_REDIRECTS); keep only LOGIN_POLL_INTERVAL and GRADER_* which are not in the ConfigMap - Update PR description: image is on DockerHub, not GHCR Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../grader_images/base_image_pipeline.py | 11 ++++---- .../open_edx/grader_images/build_pipeline.py | 2 +- .../pipelines/open_edx/grader_images/meta.py | 2 +- .../applications/xqwatcher/__main__.py | 25 ++++++------------- 4 files changed, 15 insertions(+), 25 deletions(-) diff --git a/src/ol_concourse/pipelines/open_edx/grader_images/base_image_pipeline.py b/src/ol_concourse/pipelines/open_edx/grader_images/base_image_pipeline.py index 543042d29c..5f1fe414b4 100644 --- a/src/ol_concourse/pipelines/open_edx/grader_images/base_image_pipeline.py +++ b/src/ol_concourse/pipelines/open_edx/grader_images/base_image_pipeline.py @@ -6,10 +6,11 @@ course-specific grader images. Publishing it to both registries allows: - DockerHub (mitodl/xqueue-watcher-grader-base): public reference usable without AWS credentials; used in grader repo Dockerfiles as the default - GRADER_BASE_IMAGE build arg. - - ECR (mitodl/xqueue-watcher-grader-base): used by the per-grader Concourse - build pipelines as the trigger source, so a base image rebuild - automatically triggers downstream grader image rebuilds. + GRADER_BASE_IMAGE build arg. The per-grader Concourse build pipelines + trigger off this DockerHub image so a base image rebuild automatically + triggers downstream grader image rebuilds. + - ECR (mitodl/xqueue-watcher-grader-base): private mirror for use inside + AWS without DockerHub rate-limit concerns. Triggers: - Push to the xqueue-watcher repo on paths under grader_support/. 
@@ -39,7 +40,7 @@ def grader_base_image_pipeline() -> Pipeline: xqwatcher_repo = git_repo( name=Identifier("xqueue-watcher-code"), uri="https://github.com/mitodl/xqueue-watcher", - branch="chore/migrate-to-uv-and-k8s-container-grader", + branch="main", paths=["grader_support/"], ) diff --git a/src/ol_concourse/pipelines/open_edx/grader_images/build_pipeline.py b/src/ol_concourse/pipelines/open_edx/grader_images/build_pipeline.py index c1bcfe1468..8dce1acd24 100644 --- a/src/ol_concourse/pipelines/open_edx/grader_images/build_pipeline.py +++ b/src/ol_concourse/pipelines/open_edx/grader_images/build_pipeline.py @@ -205,7 +205,7 @@ def grader_image_pipeline(config: GraderPipelineConfig) -> Pipeline: GraderPipelineConfig( pipeline_name="graders-mit-600x", grader_repo_url="git@github.com:mitodl/graders-mit-600x", - grader_repo_branch="feat/containerized-grader", + grader_repo_branch="main", ecr_repo_name="mitodl/graders-mit-600x", ), ] diff --git a/src/ol_concourse/pipelines/open_edx/grader_images/meta.py b/src/ol_concourse/pipelines/open_edx/grader_images/meta.py index ce6b4f0133..fbf4d4e149 100644 --- a/src/ol_concourse/pipelines/open_edx/grader_images/meta.py +++ b/src/ol_concourse/pipelines/open_edx/grader_images/meta.py @@ -44,7 +44,7 @@ pipeline_code = git_repo( name=Identifier("grader-images-pipeline-code"), uri="https://github.com/mitodl/ol-infrastructure", - branch="feat/xqwatcher-kubernetes-migration", + branch="main", paths=_PIPELINE_CODE_PATHS, ) diff --git a/src/ol_infrastructure/applications/xqwatcher/__main__.py b/src/ol_infrastructure/applications/xqwatcher/__main__.py index e9b00a40fc..1d2500e7e0 100644 --- a/src/ol_infrastructure/applications/xqwatcher/__main__.py +++ b/src/ol_infrastructure/applications/xqwatcher/__main__.py @@ -66,10 +66,12 @@ namespace = xqwatcher_config.get("namespace") or f"{stack_info.env_prefix}-openedx" -if "XQWATCHER_DOCKER_DIGEST" not in os.environ: - msg = "XQWATCHER_DOCKER_DIGEST must be set" +docker_image_digest = 
os.environ.get("XQWATCHER_DOCKER_DIGEST") or xqwatcher_config.get( + "docker_tag" +) +if not docker_image_digest: + msg = "Either XQWATCHER_DOCKER_DIGEST env var or xqwatcher:docker_tag config must be set" # noqa: E501 raise ValueError(msg) -docker_image_digest = os.environ["XQWATCHER_DOCKER_DIGEST"] docker_image_ref = f"mitodl/xqueue-watcher@{docker_image_digest}" min_replicas = xqwatcher_config.get_int("min_replicas") or 1 @@ -331,24 +333,11 @@ command=["uv", "run", "--no-sync", "xqueue-watcher"], args=["-d", "/xqwatcher"], env=[ - # Non-sensitive manager config values — match - # MANAGER_CONFIG_DEFAULTS in env_settings.py. - kubernetes.core.v1.EnvVarArgs( - name="XQWATCHER_POLL_TIME", value="10" - ), - kubernetes.core.v1.EnvVarArgs( - name="XQWATCHER_REQUESTS_TIMEOUT", value="1" - ), - kubernetes.core.v1.EnvVarArgs( - name="XQWATCHER_POLL_INTERVAL", value="1" - ), + # Non-sensitive manager config values that are not + # already covered by the mounted xqwatcher.json ConfigMap. kubernetes.core.v1.EnvVarArgs( name="XQWATCHER_LOGIN_POLL_INTERVAL", value="5" ), - kubernetes.core.v1.EnvVarArgs( - name="XQWATCHER_FOLLOW_CLIENT_REDIRECTS", - value="true", - ), # ContainerGrader deployment-wide defaults. # These are used when a queue's KWARGS block does not # specify the value explicitly. From f5eb04a7af4f07b91110b69206d5f5845998168d Mon Sep 17 00:00:00 2001 From: Tobias Macey Date: Fri, 20 Mar 2026 14:06:39 -0400 Subject: [PATCH 37/46] Add graders-mit-686x image build pipeline Register the MIT 6.686x course-specific grader image in GRADER_PIPELINES so the meta pipeline creates a build-graders-mit-686x-image Concourse pipeline that tracks the graders-mit-686x repo and pushes to ECR at mitodl/graders-mit-686x. 
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../pipelines/open_edx/grader_images/build_pipeline.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/ol_concourse/pipelines/open_edx/grader_images/build_pipeline.py b/src/ol_concourse/pipelines/open_edx/grader_images/build_pipeline.py index 8dce1acd24..a8802f50be 100644 --- a/src/ol_concourse/pipelines/open_edx/grader_images/build_pipeline.py +++ b/src/ol_concourse/pipelines/open_edx/grader_images/build_pipeline.py @@ -208,6 +208,12 @@ def grader_image_pipeline(config: GraderPipelineConfig) -> Pipeline: grader_repo_branch="main", ecr_repo_name="mitodl/graders-mit-600x", ), + GraderPipelineConfig( + pipeline_name="graders-mit-686x", + grader_repo_url="git@github.com:mitodl/graders-mit-686x", + grader_repo_branch="main", + ecr_repo_name="mitodl/graders-mit-686x", + ), ] From d1f4d77a1677b6b1c66c67caabb794ee213eec58 Mon Sep 17 00:00:00 2001 From: Tobias Macey Date: Fri, 20 Mar 2026 14:42:15 -0400 Subject: [PATCH 38/46] Delete .github/codeql/codeql-config.yml --- .github/codeql/codeql-config.yml | 9 --------- 1 file changed, 9 deletions(-) delete mode 100644 .github/codeql/codeql-config.yml diff --git a/.github/codeql/codeql-config.yml b/.github/codeql/codeql-config.yml deleted file mode 100644 index 0da51bc345..0000000000 --- a/.github/codeql/codeql-config.yml +++ /dev/null @@ -1,9 +0,0 @@ ---- -# CodeQL configuration for Default Setup. -# The "actions" extractor (CodeQL 2.24+) fails on this repository because it -# detects the Erk agent workflow files but cannot extract any source code from -# them (exit code 32: "no code was successfully extracted"). Excluding the -# .github tree from actions analysis suppresses the fatal extractor error while -# keeping Python and JavaScript/TypeScript analysis fully intact. 
-paths-ignore: -- ".github" From d001f6e8da60d2504ecdaba4cbd345915e1dc1b2 Mon Sep 17 00:00:00 2001 From: Tobias Macey Date: Fri, 20 Mar 2026 15:24:04 -0400 Subject: [PATCH 39/46] Add 686x ContainerGrader to mitxonline production stack Add the edxorg-686x queue to the mitxonline production xqwatcher stack using the ContainerGrader handler, replacing the legacy JailedGrader configuration in confd_json. This is in preparation for deployment of the xqueue-watcher changes in https://github.com/mitodl/xqueue-watcher/pull/14. The memory limit is set to 1Gi (vs 512Mi for 600x) to accommodate the torch dependency used by the mnist problem set graders. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- ...applications.xqwatcher.mitxonline.Production.yaml | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/src/ol_infrastructure/applications/xqwatcher/Pulumi.applications.xqwatcher.mitxonline.Production.yaml b/src/ol_infrastructure/applications/xqwatcher/Pulumi.applications.xqwatcher.mitxonline.Production.yaml index e3830c39a7..9043f059da 100644 --- a/src/ol_infrastructure/applications/xqwatcher/Pulumi.applications.xqwatcher.mitxonline.Production.yaml +++ b/src/ol_infrastructure/applications/xqwatcher/Pulumi.applications.xqwatcher.mitxonline.Production.yaml @@ -23,3 +23,15 @@ config: memory_limit: 512Mi timeout: 60 image_pull_policy: always + edxorg-686x: + CONNECTIONS: 5 + HANDLERS: + - HANDLER: xqueue_watcher.containergrader.ContainerGrader + KWARGS: + grader_root: /graders/ + image: 610119931565.dkr.ecr.us-east-1.amazonaws.com/mitodl/graders-mit-686x:latest + backend: kubernetes + cpu_limit: 1000m + memory_limit: 1Gi + timeout: 60 + image_pull_policy: always From ab4683a86b4221edff05b1d46fed6477fc4b7893 Mon Sep 17 00:00:00 2001 From: Tobias Macey Date: Fri, 20 Mar 2026 15:29:23 -0400 Subject: [PATCH 40/46] Add edxorg xqueue server ref for 686x grader Add an "edxorg" entry to the xqueue_servers.json Vault template so that queues using 
SERVER_REF "edxorg" resolve credentials for https://xqueue.edx.org. The template variables edxorg_xqueue_username and edxorg_xqueue_password must be added to the existing edx-xqueue Vault KV secret. Update the queue config loop to use setdefault so that queues can declare their own SERVER_REF in the Pulumi stack config rather than always being assigned "default". Set SERVER_REF: edxorg on the edxorg-686x queue in the mitxonline production stack config. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- ....applications.xqwatcher.mitxonline.Production.yaml | 1 + .../applications/xqwatcher/__main__.py | 11 +++++++++-- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/src/ol_infrastructure/applications/xqwatcher/Pulumi.applications.xqwatcher.mitxonline.Production.yaml b/src/ol_infrastructure/applications/xqwatcher/Pulumi.applications.xqwatcher.mitxonline.Production.yaml index 9043f059da..bfc18a6c71 100644 --- a/src/ol_infrastructure/applications/xqwatcher/Pulumi.applications.xqwatcher.mitxonline.Production.yaml +++ b/src/ol_infrastructure/applications/xqwatcher/Pulumi.applications.xqwatcher.mitxonline.Production.yaml @@ -24,6 +24,7 @@ config: timeout: 60 image_pull_policy: always edxorg-686x: + SERVER_REF: edxorg CONNECTIONS: 5 HANDLERS: - HANDLER: xqueue_watcher.containergrader.ContainerGrader diff --git a/src/ol_infrastructure/applications/xqwatcher/__main__.py b/src/ol_infrastructure/applications/xqwatcher/__main__.py index 1d2500e7e0..f2c6ab4c4c 100644 --- a/src/ol_infrastructure/applications/xqwatcher/__main__.py +++ b/src/ol_infrastructure/applications/xqwatcher/__main__.py @@ -105,7 +105,7 @@ first_component = image_ref.split("/", maxsplit=1)[0] if "." 
not in first_component and ":" not in first_component: handler_cfg["KWARGS"]["image"] = cached_image_uri(image_ref) - entry["SERVER_REF"] = "default" + entry.setdefault("SERVER_REF", "default") queues_config[queue_name] = entry ################################## @@ -151,7 +151,14 @@ "default": { "SERVER": xqueue_server_url, "AUTH": ["xqwatcher", "{{ .Secrets.xqwatcher_password }}"], - } + }, + "edxorg": { + "SERVER": "https://xqueue.edx.org", + "AUTH": [ + "{{ .Secrets.edxorg_xqueue_username }}", + "{{ .Secrets.edxorg_xqueue_password }}", + ], + }, } ) xqueue_servers_secret = OLVaultK8SSecret( From 0d529f8048fce744017498c50935c001fa8e281a Mon Sep 17 00:00:00 2001 From: Tobias Macey Date: Mon, 23 Mar 2026 12:57:26 -0400 Subject: [PATCH 41/46] feat(xqwatcher): surface new security env vars and add pod seccomp profile Accommodates changes from xqueue-watcher commit 15fdd86 (security: harden containergrader and XQueue client): - Expose XQWATCHER_VERIFY_TLS via xqwatcher:verify_tls Pulumi config (default "true"; set "false" only for dev envs with self-signed certs). - Expose XQWATCHER_SUBMISSION_SIZE_LIMIT via xqwatcher:submission_size_limit Pulumi config (default 1 MB, matching containergrader default). - Add RuntimeDefault seccomp profile to the xqwatcher pod's PodSecurityContextArgs, mirroring the profile now applied to grading Jobs in containergrader.py for defence-in-depth consistency. 
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../applications/xqwatcher/__main__.py | 26 +++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/src/ol_infrastructure/applications/xqwatcher/__main__.py b/src/ol_infrastructure/applications/xqwatcher/__main__.py index f2c6ab4c4c..11b14676ca 100644 --- a/src/ol_infrastructure/applications/xqwatcher/__main__.py +++ b/src/ol_infrastructure/applications/xqwatcher/__main__.py @@ -83,6 +83,10 @@ grader_cpu_limit = xqwatcher_config.get("grader_cpu_limit") or "500m" grader_memory_limit = xqwatcher_config.get("grader_memory_limit") or "256Mi" grader_timeout = xqwatcher_config.get("grader_timeout") or "20" +verify_tls = xqwatcher_config.get("verify_tls") or "true" +submission_size_limit = xqwatcher_config.get("submission_size_limit") or str( + 1024 * 1024 +) # 1 MB default, matching containergrader ################################## ## Grader Queue Config ## @@ -321,6 +325,14 @@ spec=kubernetes.core.v1.PodSpecArgs( service_account_name="xqwatcher", automount_service_account_token=True, + # Apply RuntimeDefault seccomp to the xqwatcher pod itself, + # mirroring the profile applied to grading Jobs in + # containergrader.py for defence-in-depth. + security_context=kubernetes.core.v1.PodSecurityContextArgs( + seccomp_profile=kubernetes.core.v1.SeccompProfileArgs( + type="RuntimeDefault", + ), + ), # Spread replicas across nodes for HA topology_spread_constraints=[ kubernetes.core.v1.TopologySpreadConstraintArgs( @@ -371,6 +383,20 @@ name="XQWATCHER_GRADER_TIMEOUT", value=grader_timeout, ), + # TLS verification for outbound xqueue HTTPS requests. + # Default "true" (safe for production). Set "false" + # only for dev environments with self-signed certs. + kubernetes.core.v1.EnvVarArgs( + name="XQWATCHER_VERIFY_TLS", + value=verify_tls, + ), + # Hard cap on submission size (bytes) before a grading + # container is launched. Prevents etcd object-size + # overflows and resource-exhaustion attacks. 
+ kubernetes.core.v1.EnvVarArgs( + name="XQWATCHER_SUBMISSION_SIZE_LIMIT", + value=submission_size_limit, + ), ], # Liveness: verify the Python runtime is functional. # The process will crash (and K8s will restart) on From 12ffdcafb4f6d0ae010b0ee1e22ee985d10dad6f Mon Sep 17 00:00:00 2001 From: Tobias Macey Date: Mon, 23 Mar 2026 13:51:27 -0400 Subject: [PATCH 42/46] fix: address unresolved PR review feedback - Use ':' separator for tags and '@' only for sha256 digests when building the xqwatcher docker_image_ref; rename env var from XQWATCHER_DOCKER_DIGEST to XQWATCHER_DOCKER_TAG to match the config key name - Fix misleading comment on ECR base image resource in base_image_pipeline.py: downstream grader pipelines trigger off DockerHub, not ECR - Remove automount_service_account_token=False from IRSA ServiceAccount created by OLEKSAuthBinding so the projected token is mounted and IRSA can authenticate via sts:AssumeRoleWithWebIdentity Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../open_edx/grader_images/base_image_pipeline.py | 5 +++-- .../applications/xqwatcher/__main__.py | 12 ++++++++---- src/ol_infrastructure/components/applications/eks.py | 1 - 3 files changed, 11 insertions(+), 7 deletions(-) diff --git a/src/ol_concourse/pipelines/open_edx/grader_images/base_image_pipeline.py b/src/ol_concourse/pipelines/open_edx/grader_images/base_image_pipeline.py index 5f1fe414b4..d63b325f9f 100644 --- a/src/ol_concourse/pipelines/open_edx/grader_images/base_image_pipeline.py +++ b/src/ol_concourse/pipelines/open_edx/grader_images/base_image_pipeline.py @@ -54,8 +54,9 @@ def grader_base_image_pipeline() -> Pipeline: password="((dockerhub.password))", # noqa: S106 ) - # ECR push target — used as the trigger source for per-grader build - # pipelines so that a base image rebuild causes downstream rebuilds. + # ECR push target — private mirror for use inside AWS without DockerHub + # rate-limit concerns. 
The per-grader Concourse build pipelines trigger + # off the DockerHub base image (grader_base_dockerhub_repo), not ECR. ecr_base_image = registry_image( name=Identifier("grader-base-ecr"), image_repository=_BASE_IMAGE_REPO, diff --git a/src/ol_infrastructure/applications/xqwatcher/__main__.py b/src/ol_infrastructure/applications/xqwatcher/__main__.py index 11b14676ca..8b73325efc 100644 --- a/src/ol_infrastructure/applications/xqwatcher/__main__.py +++ b/src/ol_infrastructure/applications/xqwatcher/__main__.py @@ -66,13 +66,17 @@ namespace = xqwatcher_config.get("namespace") or f"{stack_info.env_prefix}-openedx" -docker_image_digest = os.environ.get("XQWATCHER_DOCKER_DIGEST") or xqwatcher_config.get( +docker_image_tag = os.environ.get("XQWATCHER_DOCKER_TAG") or xqwatcher_config.get( "docker_tag" ) -if not docker_image_digest: - msg = "Either XQWATCHER_DOCKER_DIGEST env var or xqwatcher:docker_tag config must be set" # noqa: E501 +if not docker_image_tag: + msg = ( + "Either XQWATCHER_DOCKER_TAG env var or xqwatcher:docker_tag config must be set" + ) raise ValueError(msg) -docker_image_ref = f"mitodl/xqueue-watcher@{docker_image_digest}" +# Digests use @ (e.g. sha256:abc…), tags use : (e.g. 
latest, v1.2.3) +_sep = "@" if docker_image_tag.startswith("sha256:") else ":" +docker_image_ref = f"mitodl/xqueue-watcher{_sep}{docker_image_tag}" min_replicas = xqwatcher_config.get_int("min_replicas") or 1 diff --git a/src/ol_infrastructure/components/applications/eks.py b/src/ol_infrastructure/components/applications/eks.py index 649a43cd25..061743bd96 100644 --- a/src/ol_infrastructure/components/applications/eks.py +++ b/src/ol_infrastructure/components/applications/eks.py @@ -165,7 +165,6 @@ def __init__( "eks.amazonaws.com/role-arn": self.irsa_role.arn, }, ), - automount_service_account_token=False, opts=ResourceOptions(parent=self), ) for sa_name in sa_names From a9df683b1072f846e631f64aaeed5ea49c96c761 Mon Sep 17 00:00:00 2001 From: Chris Patti Date: Tue, 24 Mar 2026 11:52:09 -0400 Subject: [PATCH 43/46] Apply suggestion from @Copilot Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- .../pipelines/open_edx/grader_images/build_pipeline.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/ol_concourse/pipelines/open_edx/grader_images/build_pipeline.py b/src/ol_concourse/pipelines/open_edx/grader_images/build_pipeline.py index a8802f50be..0c45283426 100644 --- a/src/ol_concourse/pipelines/open_edx/grader_images/build_pipeline.py +++ b/src/ol_concourse/pipelines/open_edx/grader_images/build_pipeline.py @@ -9,8 +9,8 @@ Triggers: - New commit to the grader repo (grader scripts or Dockerfile changed). - - New digest of the grader base image in ECR (base image rebuilt / security - patch applied). + - New digest of the Docker Hub grader base image (base image rebuilt / + security patch applied). 
The base image digest is resolved at build time by reading the ``repository`` and ``digest`` files that Concourse's ``registry-image`` resource writes for From f5ffad6c72d32ae945fb1b48a4f3c1113a50bac2 Mon Sep 17 00:00:00 2001 From: Tobias Macey Date: Tue, 24 Mar 2026 12:28:19 -0400 Subject: [PATCH 44/46] chore: Update watcher pipeline --- .../xqwatcher/docker_pulumi_pipeline.py | 30 ++++++++++++++----- 1 file changed, 23 insertions(+), 7 deletions(-) diff --git a/src/ol_concourse/pipelines/open_edx/xqwatcher/docker_pulumi_pipeline.py b/src/ol_concourse/pipelines/open_edx/xqwatcher/docker_pulumi_pipeline.py index 38fc3000c0..875896420b 100644 --- a/src/ol_concourse/pipelines/open_edx/xqwatcher/docker_pulumi_pipeline.py +++ b/src/ol_concourse/pipelines/open_edx/xqwatcher/docker_pulumi_pipeline.py @@ -16,7 +16,7 @@ from ol_concourse.pipelines.constants import PULUMI_CODE_PATH, PULUMI_WATCHED_PATHS -def build_xqwatcher_pipeline(release_name: str) -> Pipeline: +def build_xqwatcher_pipeline(release_name: str): xqwatcher_repo = git_repo( name=Identifier("xqueue-watcher-code"), uri="https://github.com/mitodl/xqueue-watcher", @@ -26,7 +26,7 @@ def build_xqwatcher_pipeline(release_name: str) -> Pipeline: xqwatcher_registry_image = registry_image( name=Identifier("xqueue-watcher-container"), image_repository="mitodl/xqueue-watcher", - image_tag=release_name, + image_tag="latest", username="((dockerhub.username))", password="((dockerhub.password))", # noqa: S106 ) @@ -47,8 +47,13 @@ def build_xqwatcher_pipeline(release_name: str) -> Pipeline: plan=[ GetStep(get=xqwatcher_repo.name, trigger=True), container_build_task( - inputs=[Input(name=xqwatcher_repo.name)], - build_parameters={"CONTEXT": str(xqwatcher_repo.name)}, + inputs=[ + Input(name=xqwatcher_repo.name), + ], + build_parameters={ + "CONTEXT": xqwatcher_repo.name, + "DOCKERFILE": f"{xqwatcher_repo.name}/Dockerfile", + }, ), PutStep( put=xqwatcher_registry_image.name, @@ -77,9 +82,9 @@ def 
build_xqwatcher_pipeline(release_name: str) -> Pipeline: project_source_path=PULUMI_CODE_PATH.joinpath("applications/xqwatcher/"), dependencies=[ GetStep( - get=xqwatcher_registry_image.name, + get=container_fragment.resources[-1].name, trigger=True, - passed=[image_build_job.name], + passed=[container_fragment.jobs[-1].name], ), ], env_vars_from_files={ @@ -104,8 +109,19 @@ def build_xqwatcher_pipeline(release_name: str) -> Pipeline: if __name__ == "__main__": + from bridge.settings.openedx.types import OpenEdxSupportedRelease + + if len(sys.argv) < 2: # noqa: PLR2004 + releases = [r.name for r in OpenEdxSupportedRelease] + sys.stderr.write( + f"Usage: {sys.argv[0]} \n" + f"Available releases: {', '.join(releases)}\n" + ) + sys.exit(1) release_name = sys.argv[1] - pipeline_json = build_xqwatcher_pipeline(release_name).model_dump_json(indent=2) + pipeline_json = build_xqwatcher_pipeline( + release_name, + ).model_dump_json(indent=2) with open("definition.json", "w") as definition: # noqa: PTH123 definition.write(pipeline_json) sys.stdout.write(pipeline_json) From 1ac0e767109bc8808bedb68708442cedda5a382c Mon Sep 17 00:00:00 2001 From: Tobias Macey Date: Tue, 24 Mar 2026 12:33:54 -0400 Subject: [PATCH 45/46] feat(xqwatcher): add HPA for CPU/memory-based autoscaling MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a HorizontalPodAutoscaler (autoscaling/v2) targeting the xqwatcher Deployment, scaling on: - CPU: 60% average utilization - Memory: 80% average utilization Scale-up is aggressive (up to 100% more pods per minute, 60s stabilization) to handle submission bursts; scale-down is conservative (≤25% reduction per minute, 5-minute stabilization) to avoid thrashing. Min/max replica bounds are configurable via xqwatcher:min_replicas and xqwatcher:max_replicas stack config (defaults: 1 and 5). 
The Deployment gains ignore_changes=["spec.replicas"] so Pulumi does not revert the replica count that the HPA manages between stack updates. Exports k8s_hpa_name for stack consumers. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../applications/xqwatcher/__main__.py | 81 ++++++++++++++++++- 1 file changed, 80 insertions(+), 1 deletion(-) diff --git a/src/ol_infrastructure/applications/xqwatcher/__main__.py b/src/ol_infrastructure/applications/xqwatcher/__main__.py index 8b73325efc..a9a50a9558 100644 --- a/src/ol_infrastructure/applications/xqwatcher/__main__.py +++ b/src/ol_infrastructure/applications/xqwatcher/__main__.py @@ -79,6 +79,7 @@ docker_image_ref = f"mitodl/xqueue-watcher{_sep}{docker_image_tag}" min_replicas = xqwatcher_config.get_int("min_replicas") or 1 +max_replicas = xqwatcher_config.get_int("max_replicas") or 5 # Deployment-wide ContainerGrader defaults. These become XQWATCHER_GRADER_* # environment variables on the xqwatcher pod so operators don't have to repeat @@ -486,7 +487,84 @@ ), ), ), - opts=ResourceOptions(depends_on=[xqueue_servers_secret]), + opts=ResourceOptions( + depends_on=[xqueue_servers_secret], + # Allow the HPA to manage replica count without Pulumi reverting it. + ignore_changes=["spec.replicas"], + ), +) + +################################## +## Horizontal Pod Autoscaler ## +################################## + +# Scale on CPU (60 % utilization) and memory (80 % utilization). +# Scale-up is aggressive (double pods per minute) while scale-down is +# conservative (25 % reduction per minute, 5-minute stabilization window) to +# avoid thrashing during bursty submission activity. 
+xqwatcher_hpa = kubernetes.autoscaling.v2.HorizontalPodAutoscaler( + f"xqwatcher-{env_name}-hpa", + metadata=kubernetes.meta.v1.ObjectMetaArgs( + name="xqwatcher", + namespace=namespace, + labels=k8s_global_labels.model_dump(), + ), + spec=kubernetes.autoscaling.v2.HorizontalPodAutoscalerSpecArgs( + scale_target_ref=kubernetes.autoscaling.v2.CrossVersionObjectReferenceArgs( + api_version="apps/v1", + kind="Deployment", + name="xqwatcher", + ), + min_replicas=min_replicas, + max_replicas=max_replicas, + metrics=[ + kubernetes.autoscaling.v2.MetricSpecArgs( + type="Resource", + resource=kubernetes.autoscaling.v2.ResourceMetricSourceArgs( + name="cpu", + target=kubernetes.autoscaling.v2.MetricTargetArgs( + type="Utilization", + average_utilization=60, + ), + ), + ), + kubernetes.autoscaling.v2.MetricSpecArgs( + type="Resource", + resource=kubernetes.autoscaling.v2.ResourceMetricSourceArgs( + name="memory", + target=kubernetes.autoscaling.v2.MetricTargetArgs( + type="Utilization", + average_utilization=80, + ), + ), + ), + ], + behavior=kubernetes.autoscaling.v2.HorizontalPodAutoscalerBehaviorArgs( + scale_up=kubernetes.autoscaling.v2.HPAScalingRulesArgs( + stabilization_window_seconds=60, + select_policy="Max", + policies=[ + kubernetes.autoscaling.v2.HPAScalingPolicyArgs( + type="Percent", + value=100, + period_seconds=60, + ), + ], + ), + scale_down=kubernetes.autoscaling.v2.HPAScalingRulesArgs( + stabilization_window_seconds=300, + select_policy="Min", + policies=[ + kubernetes.autoscaling.v2.HPAScalingPolicyArgs( + type="Percent", + value=25, + period_seconds=60, + ), + ], + ), + ), + ), + opts=ResourceOptions(depends_on=[xqwatcher_deployment]), ) ################################## @@ -495,4 +573,5 @@ export("k8s_deployment_name", "xqwatcher") export("k8s_namespace", namespace) +export("k8s_hpa_name", "xqwatcher") export("xqueue_servers_secret", xqueue_servers_secret_name) From 5148008f1a756fe934b2eea90f19071e61b4d4d0 Mon Sep 17 00:00:00 2001 From: Tobias Macey 
Date: Tue, 24 Mar 2026 13:25:41 -0400 Subject: [PATCH 46/46] feat(xqwatcher): separate edxorg server into its own Deployment The original design embedded edxorg credentials in the same VaultStaticSecret as the MIT-hosted xqueue server, referencing edxorg_xqueue_username / edxorg_xqueue_password keys that do not exist in secret-/edx-xqueue (which only holds edxapp_password and xqwatcher_password). Instead, create a fully independent Deployment per xqueue server: - xqwatcher (default): watches queues targeting the MIT-hosted xqueue. Reads credentials from secret-/edx-xqueue. Only queues with SERVER_REF="default" (or no SERVER_REF) are included in its ConfigMap. - xqwatcher-edxorg (optional): watches queues with SERVER_REF="edxorg". Reads credentials from a separate secret-/edxorg-xqueue Vault path, created only when xqwatcher:edxorg_xqueue_enabled=true. Each Deployment has its own ConfigMap (scoped to its own queue subset), VaultStaticSecret, and HPA; they share the xqwatcher ServiceAccount and RBAC Role since both need identical permissions to manage grading Jobs. The Vault policy gains a read grant for secret-DEPLOYMENT/edxorg-xqueue. This eliminates the need for a file-merge initContainer and gives each server integration independent observability, scaling, and secret access. 
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../applications/xqwatcher/__main__.py | 349 +++++++++++++++++- .../xqwatcher/xqwatcher_server_policy.hcl | 4 + 2 files changed, 340 insertions(+), 13 deletions(-) diff --git a/src/ol_infrastructure/applications/xqwatcher/__main__.py b/src/ol_infrastructure/applications/xqwatcher/__main__.py index a9a50a9558..7b5d992d5b 100644 --- a/src/ol_infrastructure/applications/xqwatcher/__main__.py +++ b/src/ol_infrastructure/applications/xqwatcher/__main__.py @@ -80,6 +80,10 @@ min_replicas = xqwatcher_config.get_int("min_replicas") or 1 max_replicas = xqwatcher_config.get_int("max_replicas") or 5 +# When true, a second VaultStaticSecret reads edx.org xqueue credentials from +# secret-/edxorg-xqueue and merges them into xqueue_servers.json at pod +# start. Set this only for stacks that actually watch edx.org queues. +edxorg_xqueue_enabled = xqwatcher_config.get_bool("edxorg_xqueue_enabled") or False # Deployment-wide ContainerGrader defaults. These become XQWATCHER_GRADER_* # environment variables on the xqwatcher pod so operators don't have to repeat @@ -117,6 +121,24 @@ entry.setdefault("SERVER_REF", "default") queues_config[queue_name] = entry +# Split by SERVER_REF so each Deployment only ships configs for its own server. +# Queues with no SERVER_REF (or SERVER_REF="default") belong to the MIT-hosted +# server; queues with SERVER_REF="edxorg" belong to the edx.org server. 
+default_queues: dict[str, Any] = { + name: cfg + for name, cfg in queues_config.items() + if cfg.get("SERVER_REF", "default") == "default" +} +edxorg_queues: dict[str, Any] = ( + { + name: cfg + for name, cfg in queues_config.items() + if cfg.get("SERVER_REF") == "edxorg" + } + if edxorg_xqueue_enabled + else {} +) + ################################## ## Vault Policy + K8s Auth ## ################################## @@ -150,8 +172,9 @@ ## Vault Secrets ## ################################## -# xqueue_servers.json — the only secret: xqueue URL and xqwatcher credentials. -# Sourced from the same Vault KV entry used by the xqueue and edxapp deployments. +# ── Default (MIT-hosted) xqueue server ────────────────────────────────────── +# Credentials live at secret-/edx-xqueue alongside the xqueue and edxapp +# deployments. Only xqwatcher_password is needed here. xqueue_servers_secret_name = ( "xqwatcher-xqueue-servers" # pragma: allowlist secret # noqa: S105 ) @@ -161,13 +184,6 @@ "SERVER": xqueue_server_url, "AUTH": ["xqwatcher", "{{ .Secrets.xqwatcher_password }}"], }, - "edxorg": { - "SERVER": "https://xqueue.edx.org", - "AUTH": [ - "{{ .Secrets.edxorg_xqueue_username }}", - "{{ .Secrets.edxorg_xqueue_password }}", - ], - }, } ) xqueue_servers_secret = OLVaultK8SSecret( @@ -195,6 +211,54 @@ ), ) +# ── edx.org (external) xqueue server ──────────────────────────────────────── +# Credentials are entirely separate from the MIT-hosted instance and live at +# secret-/edxorg-xqueue. Only created for stacks that watch edx.org +# queues (edxorg_xqueue_enabled = true in stack config). 
+edxorg_servers_secret_name = ( + "xqwatcher-edxorg-servers" # pragma: allowlist secret # noqa: S105 +) +edxorg_servers_template = json.dumps( + { + "edxorg": { + "SERVER": "https://xqueue.edx.org", + "AUTH": [ + "{{ .Secrets.edxorg_xqueue_username }}", + "{{ .Secrets.edxorg_xqueue_password }}", + ], + }, + } +) + +edxorg_servers_secret = ( + OLVaultK8SSecret( + f"xqwatcher-{env_name}-edxorg-servers-secret", + OLVaultK8SStaticSecretConfig( + name=edxorg_servers_secret_name, + namespace=namespace, + dest_secret_name=edxorg_servers_secret_name, + dest_secret_labels=k8s_global_labels.model_dump(), + labels=k8s_global_labels.model_dump(), + mount=f"secret-{stack_info.env_prefix}", + mount_type="kv-v1", + path="edxorg-xqueue", + refresh_after="1h", + restart_target_kind="Deployment", + restart_target_name="xqwatcher-edxorg", + templates={ + "edxorg_servers.json": edxorg_servers_template, + }, + vaultauth=vault_k8s_resources.auth_name, + ), + opts=ResourceOptions( + delete_before_replace=True, + depends_on=[vault_k8s_resources], + ), + ) + if edxorg_xqueue_enabled + else None +) + ################################## ## ConfigMap ## ################################## @@ -244,10 +308,31 @@ ), # Non-secret queue configs; SERVER_REF resolves credentials at runtime # from xqueue_servers.json (mounted from the Vault-synced secret). - "grader_config.json": json.dumps(queues_config), + # Only queues for the MIT-hosted server (SERVER_REF="default"). + "grader_config.json": json.dumps(default_queues), }, ) +# edxorg-specific ConfigMap: only the queues that target the external edx.org +# server. Created only when edxorg_xqueue_enabled is True. 
+xqwatcher_edxorg_configmap = ( + kubernetes.core.v1.ConfigMap( + f"xqwatcher-{env_name}-edxorg-configmap", + metadata=kubernetes.meta.v1.ObjectMetaArgs( + name="xqwatcher-edxorg-config", + namespace=namespace, + labels=k8s_global_labels.model_dump(), + ), + data={ + "xqwatcher.json": xqwatcher_configmap.data["xqwatcher.json"], + "logging.json": xqwatcher_configmap.data["logging.json"], + "grader_config.json": json.dumps(edxorg_queues), + }, + ) + if edxorg_xqueue_enabled + else None +) + ################################## ## RBAC for ContainerGrader ## ################################## @@ -458,9 +543,7 @@ sub_path="grader_config.json", read_only=True, ), - # Named server definitions (SERVER URL + AUTH credentials) - # from the Vault-synced secret, mounted at the config - # root so xqueue-watcher can resolve SERVER_REF entries. + # MIT-hosted server definitions only (SERVER_REF="default"). kubernetes.core.v1.VolumeMountArgs( name="xqueue-servers", mount_path="/xqwatcher/xqueue_servers.json", @@ -567,6 +650,246 @@ opts=ResourceOptions(depends_on=[xqwatcher_deployment]), ) +################################## +## edx.org Watcher Deployment ## +################################## + +# A fully independent Deployment for queues that target the external edx.org +# xqueue server. It shares the service account and RBAC role with the default +# Deployment (both need identical permissions to manage grading Jobs) but has +# its own ConfigMap and VaultStaticSecret so the edxorg credentials are never +# co-located with the MIT-hosted xqueue credentials. 
+if edxorg_xqueue_enabled and edxorg_servers_secret and xqwatcher_edxorg_configmap: + edxorg_app_labels = {**k8s_global_labels.model_dump(), "app": "xqwatcher-edxorg"} + + xqwatcher_edxorg_deployment = kubernetes.apps.v1.Deployment( + f"xqwatcher-{env_name}-edxorg-deployment", + metadata=kubernetes.meta.v1.ObjectMetaArgs( + name="xqwatcher-edxorg", + namespace=namespace, + labels=k8s_global_labels.model_dump(), + ), + spec=kubernetes.apps.v1.DeploymentSpecArgs( + replicas=min_replicas, + selector=kubernetes.meta.v1.LabelSelectorArgs( + match_labels={"app": "xqwatcher-edxorg"}, + ), + strategy=kubernetes.apps.v1.DeploymentStrategyArgs( + type="RollingUpdate", + rolling_update=kubernetes.apps.v1.RollingUpdateDeploymentArgs( + max_surge=1, + max_unavailable=0, + ), + ), + template=kubernetes.core.v1.PodTemplateSpecArgs( + metadata=kubernetes.meta.v1.ObjectMetaArgs( + labels=edxorg_app_labels, + ), + spec=kubernetes.core.v1.PodSpecArgs( + service_account_name="xqwatcher", + automount_service_account_token=True, + security_context=kubernetes.core.v1.PodSecurityContextArgs( + seccomp_profile=kubernetes.core.v1.SeccompProfileArgs( + type="RuntimeDefault", + ), + ), + topology_spread_constraints=[ + kubernetes.core.v1.TopologySpreadConstraintArgs( + max_skew=1, + topology_key="kubernetes.io/hostname", + when_unsatisfiable="ScheduleAnyway", + label_selector=kubernetes.meta.v1.LabelSelectorArgs( + match_labels={"app": "xqwatcher-edxorg"}, + ), + ) + ], + containers=[ + kubernetes.core.v1.ContainerArgs( + name="xqueue-watcher", + image=cached_image_uri(docker_image_ref), + image_pull_policy="Always", + command=["uv", "run", "--no-sync", "xqueue-watcher"], + args=["-d", "/xqwatcher"], + env=[ + kubernetes.core.v1.EnvVarArgs( + name="XQWATCHER_LOGIN_POLL_INTERVAL", value="5" + ), + kubernetes.core.v1.EnvVarArgs( + name="XQWATCHER_GRADER_BACKEND", + value="kubernetes", + ), + kubernetes.core.v1.EnvVarArgs( + name="XQWATCHER_GRADER_NAMESPACE", + value=grader_namespace, + ), + 
kubernetes.core.v1.EnvVarArgs( + name="XQWATCHER_GRADER_CPU_LIMIT", + value=grader_cpu_limit, + ), + kubernetes.core.v1.EnvVarArgs( + name="XQWATCHER_GRADER_MEMORY_LIMIT", + value=grader_memory_limit, + ), + kubernetes.core.v1.EnvVarArgs( + name="XQWATCHER_GRADER_TIMEOUT", + value=grader_timeout, + ), + kubernetes.core.v1.EnvVarArgs( + name="XQWATCHER_VERIFY_TLS", + value=verify_tls, + ), + kubernetes.core.v1.EnvVarArgs( + name="XQWATCHER_SUBMISSION_SIZE_LIMIT", + value=submission_size_limit, + ), + ], + liveness_probe=kubernetes.core.v1.ProbeArgs( + exec_=kubernetes.core.v1.ExecActionArgs( + command=[ + "uv", + "run", + "--no-sync", + "python", + "-c", + "import xqueue_watcher; import sys;" + " sys.exit(0)", + ] + ), + initial_delay_seconds=30, + period_seconds=60, + failure_threshold=3, + timeout_seconds=10, + ), + resources=kubernetes.core.v1.ResourceRequirementsArgs( + requests={"cpu": "250m", "memory": "256Mi"}, + limits={"memory": "512Mi"}, + ), + security_context=kubernetes.core.v1.SecurityContextArgs( + allow_privilege_escalation=False, + run_as_non_root=True, + run_as_user=1000, + capabilities=kubernetes.core.v1.CapabilitiesArgs( + drop=["ALL"], + ), + ), + volume_mounts=[ + kubernetes.core.v1.VolumeMountArgs( + name="xqwatcher-edxorg-config", + mount_path="/xqwatcher/xqwatcher.json", + sub_path="xqwatcher.json", + read_only=True, + ), + kubernetes.core.v1.VolumeMountArgs( + name="xqwatcher-edxorg-config", + mount_path="/xqwatcher/logging.json", + sub_path="logging.json", + read_only=True, + ), + kubernetes.core.v1.VolumeMountArgs( + name="xqwatcher-edxorg-config", + mount_path="/xqwatcher/conf.d/grader_config.json", + sub_path="grader_config.json", + read_only=True, + ), + # edx.org server defs (SERVER_REF="edxorg"); + # mounted under the filename xqueue-watcher expects. 
+ kubernetes.core.v1.VolumeMountArgs( + name="edxorg-servers", + mount_path="/xqwatcher/xqueue_servers.json", + sub_path="edxorg_servers.json", + read_only=True, + ), + ], + ), + ], + volumes=[ + kubernetes.core.v1.VolumeArgs( + name="xqwatcher-edxorg-config", + config_map=kubernetes.core.v1.ConfigMapVolumeSourceArgs( + name=xqwatcher_edxorg_configmap.metadata.name, + ), + ), + kubernetes.core.v1.VolumeArgs( + name="edxorg-servers", + secret=kubernetes.core.v1.SecretVolumeSourceArgs( + secret_name=edxorg_servers_secret_name, + ), + ), + ], + ), + ), + ), + opts=ResourceOptions( + depends_on=[edxorg_servers_secret], + ignore_changes=["spec.replicas"], + ), + ) + + xqwatcher_edxorg_hpa = kubernetes.autoscaling.v2.HorizontalPodAutoscaler( + f"xqwatcher-{env_name}-edxorg-hpa", + metadata=kubernetes.meta.v1.ObjectMetaArgs( + name="xqwatcher-edxorg", + namespace=namespace, + labels=k8s_global_labels.model_dump(), + ), + spec=kubernetes.autoscaling.v2.HorizontalPodAutoscalerSpecArgs( + scale_target_ref=kubernetes.autoscaling.v2.CrossVersionObjectReferenceArgs( + api_version="apps/v1", + kind="Deployment", + name="xqwatcher-edxorg", + ), + min_replicas=min_replicas, + max_replicas=max_replicas, + metrics=[ + kubernetes.autoscaling.v2.MetricSpecArgs( + type="Resource", + resource=kubernetes.autoscaling.v2.ResourceMetricSourceArgs( + name="cpu", + target=kubernetes.autoscaling.v2.MetricTargetArgs( + type="Utilization", + average_utilization=60, + ), + ), + ), + kubernetes.autoscaling.v2.MetricSpecArgs( + type="Resource", + resource=kubernetes.autoscaling.v2.ResourceMetricSourceArgs( + name="memory", + target=kubernetes.autoscaling.v2.MetricTargetArgs( + type="Utilization", + average_utilization=80, + ), + ), + ), + ], + behavior=kubernetes.autoscaling.v2.HorizontalPodAutoscalerBehaviorArgs( + scale_up=kubernetes.autoscaling.v2.HPAScalingRulesArgs( + stabilization_window_seconds=60, + select_policy="Max", + policies=[ + kubernetes.autoscaling.v2.HPAScalingPolicyArgs( + 
type="Percent", + value=100, + period_seconds=60, + ), + ], + ), + scale_down=kubernetes.autoscaling.v2.HPAScalingRulesArgs( + stabilization_window_seconds=300, + select_policy="Min", + policies=[ + kubernetes.autoscaling.v2.HPAScalingPolicyArgs( + type="Percent", + value=25, + period_seconds=60, + ), + ], + ), + ), + ), + opts=ResourceOptions(depends_on=[xqwatcher_edxorg_deployment]), + ) + ################################## ## Exports ## ################################## diff --git a/src/ol_infrastructure/applications/xqwatcher/xqwatcher_server_policy.hcl b/src/ol_infrastructure/applications/xqwatcher/xqwatcher_server_policy.hcl index 65b51c67f6..fe49a0b9b0 100644 --- a/src/ol_infrastructure/applications/xqwatcher/xqwatcher_server_policy.hcl +++ b/src/ol_infrastructure/applications/xqwatcher/xqwatcher_server_policy.hcl @@ -5,3 +5,7 @@ path "sys/leases/renew" { path "secret-DEPLOYMENT/edx-xqueue" { capabilities = [ "read" ] } + +path "secret-DEPLOYMENT/edxorg-xqueue" { + capabilities = [ "read" ] +}