diff --git a/src/bridge/secrets/xqwatcher/secrets.mitx.ci.yaml b/src/bridge/secrets/xqwatcher/secrets.mitx.ci.yaml index 5bbb88e1e0..811a656e68 100644 --- a/src/bridge/secrets/xqwatcher/secrets.mitx.ci.yaml +++ b/src/bridge/secrets/xqwatcher/secrets.mitx.ci.yaml @@ -1,65 +1,51 @@ --- -graders_yaml: - graders: - - name: ENC[AES256_GCM,data:p5T9MPzwVzB2SmsbB7kFcEuBlR9HJGbyPVZw,iv:iFC4JC+oyLBtKH06ZGKOLdZDEleN2f9j4iXJxQpwWt8=,tag:588sITOlP+vcF66ohqyRLw==,type:str] - address: ENC[AES256_GCM,data:0ZoJISrNH1Y/fcbioWNDlvNpPzELSdN+jZolfqYOkWlYOnzufqUMlQ==,iv:OTS2obQKZd/yJray6BxhpOKEaOamXQf2RdsZibHklSc=,tag:Jusz+4i6wysepofR9M0mQA==,type:str] - git_ref: ENC[AES256_GCM,data:M5tIe1zv,iv:yB5QnVIDuBmYauTQzx6rzLwQNd6V+t15DUcAirIDySI=,tag:SsgXOI1nmtqfkhmCgVJbrg==,type:str] - env: - GIT_SSH_COMMAND: ENC[AES256_GCM,data:/tHzs53lMHrUmndEgR0jOhOPW0ned4A1USAZ2j9jLM+4KYjPKlicXEeWnOKL7wAwNZnlpN7O+iD0tE3UjkfkCrF9+MMMjzeh6Cxi7HjaX8B7yNVGXpt3CMBbVIduza0Qta6bqo0Ixao1xID4jhq2Hx/K0XTGHr8sq1WDo5RWjhgNni0HrYfQ7LCAT4/h8pNR,iv:3JZxMQWw95j4KGAL/bMqTo+irhdtlDEpVJjPrkK/IWQ=,tag:Uh0vwfZzZVN8nkxz0V6lDw==,type:str] - - name: ENC[AES256_GCM,data:guJV2vkSNpqaWWnlQgOoZhGXzLpPekf+GZABiO4=,iv:IJIIE0cFWPEhwcWKBQtcPGnv5XF4vy8ENEjETpIrnWc=,tag:lc82HaHmcJSy7Waxv7WN2Q==,type:str] - address: ENC[AES256_GCM,data:0ZoJISrNH1Y/fcbioWNDlvNpPzELSdN+jZolfqYOkWlYOnzufqUMlQ==,iv:OTS2obQKZd/yJray6BxhpOKEaOamXQf2RdsZibHklSc=,tag:Jusz+4i6wysepofR9M0mQA==,type:str] - git_ref: ENC[AES256_GCM,data:M5tIe1zv,iv:yB5QnVIDuBmYauTQzx6rzLwQNd6V+t15DUcAirIDySI=,tag:SsgXOI1nmtqfkhmCgVJbrg==,type:str] - env: - GIT_SSH_COMMAND: ENC[AES256_GCM,data:/tHzs53lMHrUmndEgR0jOhOPW0ned4A1USAZ2j9jLM+4KYjPKlicXEeWnOKL7wAwNZnlpN7O+iD0tE3UjkfkCrF9+MMMjzeh6Cxi7HjaX8B7yNVGXpt3CMBbVIduza0Qta6bqo0Ixao1xID4jhq2Hx/K0XTGHr8sq1WDo5RWjhgNni0HrYfQ7LCAT4/h8pNR,iv:3JZxMQWw95j4KGAL/bMqTo+irhdtlDEpVJjPrkK/IWQ=,tag:Uh0vwfZzZVN8nkxz0V6lDw==,type:str] confd_json: Watcher-MITx-6.00x: AUTH: - 
ENC[AES256_GCM,data:SCKbOmao2Gpl,iv:Hb7ZVtcobrj2E6hFBrmqVpq1oNnJ+j2YOQMcfLFN6Dc=,tag:3rvoooddYT3G04AQOsKxEA==,type:str] - - ENC[AES256_GCM,data:/FqGfC4XbHZ2AIaJiUHJA+hebqDA4eag+qwjiG2lywpzOOMJIwblQw==,iv:dmDX989O8vyVtgfcCcSG1IPoX/+Vy/CHifk+4omnwV8=,tag:QFkUcAaojGs/+c+5h3r5lg==,type:str] + - ENC[AES256_GCM,data:huly/No4UkykZIlKre7Lr5bJLH+4zmha3gS7m9bFbXO85TF/QPUpAg==,iv:CMds+DvCv7E9qyAlgxRvO2XAxkX4wso/21Q9zXQrf8I=,tag:xcsHlo32rScjVagNmndv8Q==,type:str] CONNECTIONS: ENC[AES256_GCM,data:Gg==,iv:d7E2fNOoSSflC2UNEnIUg9bBCHI3rlbn8pgmHCnxX68=,tag:5V0iQ94DuAdt0+zxQXV39A==,type:int] HANDLERS: - - CODEJAIL: - bin_path: ENC[AES256_GCM,data:kXgOjRNdv5RNXL2JcleS3u/W2FUelW/OWS6u9Dd3ASC2mOxHMAjxQK0VoivVPj0ZqSzlqhxFPg21sg==,iv:jtm/LiMJapsXJEIZGgOzknJ/Nmd9zFofyNtQG4jJLHs=,tag:gIBJjPWkIydCIzoan9V6kw==,type:str] - lang: ENC[AES256_GCM,data:vxjPnbN7oQ==,iv:m25r3jtmoUjIWYDKyKUX82awfO98K9UdFPzUGlmdcF4=,tag:8yMYlavG9/pPCzIl5prLcg==,type:str] - name: ENC[AES256_GCM,data:1ISqevS5v+s=,iv:pX1qlTFPdlrgqP16/Vm9OGJ/hKnuMScGwjJ3M+YWeA4=,tag:d88eMeekFkpm0mRbNy741A==,type:str] - user: ENC[AES256_GCM,data:jiWGB4mhGX4=,iv:WOiO7BPAsvjl9tQz8NwLLTYcCi/XwcixYei6ybWiDYg=,tag:yGfU4XhPE0hkkVFM3c3Hcg==,type:str] - HANDLER: ENC[AES256_GCM,data:xKNmClln1J5leWUOsHXo6d0XMm+tNAN9bek4soM5b49o6zLbv3s8TQ==,iv:kZ7jGM5B4n2RsOCppRTt0Cn8TDH2sSnAOuYbQl95dY4=,tag:OTw+Y0uhByeYXY8eP13dqw==,type:str] + - HANDLER: ENC[AES256_GCM,data:SUKLJX/arQHo7OMbzoOCQgqACuwEDVgJVJ0HGqbLWbDZR3u6dK6zvxJUrNVM7w==,iv:JRwutReGuNwudxa6sRZVLZjx34ZaiVqAQSmwdUgn5AI=,tag:w4hyFq+uznC/+u1uciYlwQ==,type:str] KWARGS: - grader_root: ENC[AES256_GCM,data:A+hHIVLX0mTz1DHc7MMmVAf60hQ/EJjgzG5h9lnZppBLZIscVvhyryczyTS+RKimzqWQQYTT32FAsqYqvtI=,iv:dIMitVcAwebHlO3LRKoD13dVkMkuL2z6rX/gJbuGxKc=,tag:HDCOyEUMMK1/ErgXk6q7Kw==,type:str] - SERVER: ENC[AES256_GCM,data:fgyBM4dTnNCyUgb0O/nruyu5WgsGEZaYGHP+oV/Qi7llIUkOFDN9sQY=,iv:7GIz4gNKMqINA+i1WzGfEvptip3vLo6XnA/EScQbEy4=,tag:tjyrPKIx9dONJMgb6vyIPQ==,type:str] + grader_root: 
ENC[AES256_GCM,data:yW8aK1I4cgoeu/mWBVZic2lPPtFekwuG,iv:+Xp/VtAeSF843ov2dx9XMrTKvRPTZ0sSxt44abcg5so=,tag:SRC0qFcYk1X2XBF4OqpENg==,type:str] + image: ENC[AES256_GCM,data:K/smElRfcbDfioQwlbSO7nMoW7yI39IzNFY2TvSgrWNaFqo5MnZSnyFOPC021WzHRZWMdQR1kTLB2zImRZuFZcEcnN0XNyHPiYrL,iv:Zikm+Z2QR4b7UEXrfrQvGh5EdywYPYxUL0lSECx7chQ=,tag:SeUO/AQmu6/eolH3+jmAYQ==,type:str] + backend: ENC[AES256_GCM,data:Xhkvv02Rh5aNkw==,iv:ZN1iF4gQ7Krm99YjrrWkWbWC3PYROk9gsOqFmqS3X/E=,tag:klNdz4BlVdHTnT8sOQ8/dA==,type:str] + cpu_limit: ENC[AES256_GCM,data:47Kuqa8=,iv:uZRate/HQzNH0QggGZdzguU0vZxxS5M8WYRcXUnQvo8=,tag:XnCarT23s5qOsbMim4Xs+A==,type:str] + memory_limit: ENC[AES256_GCM,data:hpEAzVQ=,iv:K9Uz/YVxopA7x6amvG9GMzuvsBssTYdrFt0O8qfdcnU=,tag:XIg9CpujBmzCM/fFebxIjA==,type:str] + timeout: ENC[AES256_GCM,data:jV0=,iv:pw+yw/Wzt0rtnoJmwnXzETb2bCeeGnD/nNHRq/FJH20=,tag:hwUcPI6in9/Yvqea7Hxn4Q==,type:int] + image_pull_policy: ENC[AES256_GCM,data:XiQx6roJ,iv:pORRAsnjXxV/c2cHzepkSVgs18hhkHAAtOT/Npv9lME=,tag:VgUaG0Uqmj5JWo+iWujolw==,type:str] + SERVER: ENC[AES256_GCM,data:9+nHYHNrnrHmTw0UqzyvU4CoC0YeLPE4wQb5eTWFMXb0yw==,iv:NdCWF86cPGN180Puc6y37cuhV1H0vsMZgQBEG6zLUTg=,tag:VjWyEn/Bve29esECOBMotg==,type:str] Watcher-MITx-6.0001r: AUTH: - ENC[AES256_GCM,data:GeoKMl2An+NT,iv:4bpjXM5pBTGk43L4aFffe/i7Xuiqqma4x1bjwqr2vZc=,tag:cRJlbd1X9ScKyIMXc56Mng==,type:str] - - ENC[AES256_GCM,data:2IRNwrwVNeQ/N5B3Ka0E8JKa2rMoY+g1h078CWy82O0EYrsfUAEIxw==,iv:sl5rkCOZxVIM2xkgAHvsg9wPl4arEOm7NTq4B67Dz5s=,tag:dpxlezEbjFndERvvvGQSkQ==,type:str] + - ENC[AES256_GCM,data:VKQGDGtNq+FQVFLFO6IHxos/nHs+GHihFZND/mnUngNFGSLj+NSi5w==,iv:Obb9DEHraRwAf/nCaWKyFJh5wK3dTL3gzwkEDVbhV7M=,tag:HXrvSTWBPuH9vPiFKLBQKA==,type:str] CONNECTIONS: ENC[AES256_GCM,data:0g==,iv:6tyg9WvnRhnpLm0vKGefnd70VfpmEFy0ErgwnNiFSAU=,tag:fl01agv/QL6uosiMu0rDzA==,type:int] HANDLERS: - - CODEJAIL: - bin_path: 
ENC[AES256_GCM,data:lsEJDjU3WT6woU5bptLcYR1wXqWdWUfTg95Rsi+KBHbJgB2WCSApOL4NokZErBfF9b2Gp6Oo8lbTnw==,iv:A2I+IE3Sb2sMMt3jzwQahVeHVVSKFegWe4lOYq7d+kA=,tag:fcR1QNOhmBa60VW62XcEwg==,type:str] - lang: ENC[AES256_GCM,data:UkMtJmLV7w==,iv:pbuB6D7FVuwpjJU8Aq/94McxvrADBG6q9NK1dInRaWA=,tag:keZqRH/pnb7hJGK/5IB6Dw==,type:str] - name: ENC[AES256_GCM,data:FNj9NpiNTLA=,iv:/NO4SppIdQ2sk4b4v0yFy5uhUA6xJe3Ln4CcbNOwoYQ=,tag:kPIcKyFzwlXawgYNuEFGQg==,type:str] - user: ENC[AES256_GCM,data:AgacvZvMofI=,iv:M8eB7g7NIieNs+WeQjemUfcyYiPxRSsxdQhKCHcehLg=,tag:kEFkDWkdhqWhrkSVnpPjRg==,type:str] - HANDLER: ENC[AES256_GCM,data:ptWfCuhtmp1XaCxZsxoyFryVbS82cJkYLOgeTfI883Abtal/nUK58g==,iv:Y2vuOaLH94pc9iY0V6qUfmlj9m3ijK/vxnyljFhBC4w=,tag:dBe2y1yG4y/MDqKeUlobow==,type:str] + - HANDLER: ENC[AES256_GCM,data:GxcBCI9x7tzIoP+2Ssgz92doSvmldQtDDx6Jl9duFWcXP/bwRKodTLfYDEwmaw==,iv:AXZJHneW4i1z90TK7SIr6WOG7HLz4EvSP2o0rWALMIM=,tag:5iOqnbcO53qOcgWYRS3O9Q==,type:str] KWARGS: - grader_root: ENC[AES256_GCM,data:6UFR02eSi21NXOU6SQcZyRm3SbFqgMXV4yJiMTW8+lMiVRfRVyz0cFnQjFEAlwflHxgWgXNymHNTkxYFzSUpUw==,iv:AiEGSD68isqsL0Ften6dn+/DTpHfdbUGwBN5vdBd1w8=,tag:lCXfJNrV9hkrSRxFVesPsQ==,type:str] - SERVER: ENC[AES256_GCM,data:FHHp0nXRKUsYrfccjmv0TF1Qb6ms7sCRnvA/V2Pm0snJznW/YA66LRo=,iv:7l7VZyh2wyVT7q1j43fIWdDBsODzK9IGGZt9+D+Z55I=,tag:8zW+XIdqJjuuN2QY3ndSgg==,type:str] -xqwatcher_grader_code_ssh_identity: ENC[AES256_GCM,data:2ogEHew5oYkdHOhn/Td92T0tLF59cC0ts6kQddqEgUu/7/cSadFoGvb9RfS4TUOzhAARnMdMRT5JclXlp4Oi8T0X2nenAM/zZkokd5GMmC0J2mESCm2bbdXcG2qat9Rjga7d+1bLyP1+1heIHPb5uRsGJYkA87d1bvv8vlmwtEQN3VeBrD/0I+jwTUcLamVg4T602RxWMnEm9nXw0NTDUp4TsOMtIrzdBvs9IsNEqdDT7hwSkHpmC69dulUJzJGOSFYmkYFHcDAK8qTRdc8qp5vvMTz4k7X8JF8yp7LfEGuE5w==,iv:c2o8vAGEB8DFq9VtW8Zhm5Xat4s49x+oQf7212ZeFFk=,tag:9gI1u/SBMIjLDfY3b1X8+w==,type:str] + grader_root: ENC[AES256_GCM,data:IOvzEu688/L+NAhVumJegc8hY8Gq0JSR,iv:aBiBcv3PXlV4ZAAE+v44tjCgTNoWblQsvSyXBWu99VY=,tag:/+Tw3yjQyBNc1iDhnvmqQg==,type:str] + image: 
ENC[AES256_GCM,data:xbI2u7mnzujef23MXabbxrTtqFzlM6SoH8koIHvJhsViugeRGYMQTpd7gLaShLLJDPGMoQ/8MGuNuZp1IY1Rn+xoFGE8W5owiYHu,iv:hIn5E/VXuPuunHkxUkFoMhWeUw8Ex+x+pfHN+cTxf/8=,tag:EMY867H0qn83N5v0OpnDrg==,type:str] + backend: ENC[AES256_GCM,data:OaGBiwiRT0m/Pg==,iv:RBUzueYrobo6vv2LXXhTMPEY8q7ZvkDN86x8fBc4lOU=,tag:hqS+ZaFHjs0xhTO5ldc6pw==,type:str] + cpu_limit: ENC[AES256_GCM,data:FjtcqBs=,iv:/4awwScRxzctMgesO+Akjh+xawRxjQYM6FwctaUr6to=,tag:HLUnAYyAhknuBQXTRHs3ng==,type:str] + memory_limit: ENC[AES256_GCM,data:gISNMOg=,iv:y842XxT45ipoWCXrLO5Pjh6WgqtUyZ9qRfkCWfVx/6E=,tag:sdMFIVYXzUj9LeOA2XXnIg==,type:str] + timeout: ENC[AES256_GCM,data:KYY=,iv:YiBTO75tTrTB8N1sjLPA0YTJX48sls7EppkfmsYAbWg=,tag:vHJR93RPrWf+qtt7+aEIOA==,type:int] + image_pull_policy: ENC[AES256_GCM,data:++zPPhK8,iv:0mPD6jGWeGVsEGxmCeiJxyE9wYz0vMqVtLLYCguLg0s=,tag:ounenzLmuxsYg9tYsm86wg==,type:str] + SERVER: ENC[AES256_GCM,data:ScOVWtueEbmPj/cA42MlN58fxYYKjBWGh982ebrLp6qbaQ==,iv:uOWEPN5qBeZ9Yw6xr7hfZ6GWzgYGBYZkBFsvM0sfzyg=,tag:SyNl6UGl6yilFPNVqa1aGg==,type:str] sops: kms: - arn: arn:aws:kms:us-east-1:610119931565:alias/infrastructure-secrets-ci created_at: "2024-04-16T18:44:13Z" enc: AQICAHjnbqe9AmEW1Js10nySybyuAG7Fb5E9EHUgkmqFDv7PxQGybcfmnUvB5N3pkXc+9ch5AAAAfjB8BgkqhkiG9w0BBwagbzBtAgEAMGgGCSqGSIb3DQEHATAeBglghkgBZQMEAS4wEQQM/4CSiwztDCKuyN3oAgEQgDvUtEGftvJN4EzTdnZS00yMzsZhgmq3lCXovEvM6fFJFBZYZjGAeKZYnhW69ITlOIUi8K4iZmlUy9eQFw== aws_profile: "" - gcp_kms: [] - azure_kv: [] hc_vault: - vault_address: https://vault-ci.odl.mit.edu engine_path: infrastructure key_name: sops created_at: "2024-04-16T18:44:13Z" enc: vault:v1:DFw1gsayFWeGxTCrU0HCQzWk4YBPHQdKHpValoIHi4bO/jHn+eZv+Nr2d4FubYiq8jKrKREm/UgsizDS - age: [] - lastmodified: "2024-04-25T18:35:23Z" - mac: 
ENC[AES256_GCM,data:X4CfjxG/ps13sYRd2PzeXl0MLz0pxkSKky23dIN/PmsH1xyhPnVXQ8wMV85vjN55LrGcSHs9GWAlJjqna43apnoggHfdqO/bYUeHhUjUJ8RiqXTFCi9NfRkCq5x1upmiXTR95fKdZUrykFZTGx+8JvTWLjs+2HfgHWL0kDk1HDU=,iv:RVkdd8V9rHgFbMP7ZqormU7TYXF2uwe46QCKHpmhi8E=,tag:xEdvYfirIpM/rwhKDAouHw==,type:str] + lastmodified: "2026-03-20T15:27:59Z" + mac: ENC[AES256_GCM,data:YuUS49U2swqS76nxsRcSxx3/66S4qxedfZeGyBKH3MsZRNCwy4BdP5xa45CwLT6xH/sWIoJPo6Drw7ECekY+wdjwqV2cM7hCtD+kH8CVqkx4CCIUNBvQyYY+d4bdiaSdty75pGgYQzD090AKtcE5acFMF3K/wrmYTPWzh8hBDRw=,iv:quJO4m+SuoiVPTAncLYJUl0jJhBWg64Abwp/oXeN9Hc=,tag:pHFGW/Prf8A2aYf2kCmSIQ==,type:str] pgp: - created_at: "2024-04-16T18:44:13Z" enc: |- @@ -142,4 +128,4 @@ sops: -----END PGP MESSAGE----- fp: 3582AE9F12CE295BDAF545ED17A5F53F11681446 unencrypted_suffix: _unencrypted - version: 3.8.1 + version: 3.12.2 diff --git a/src/ol_concourse/lib/containers.py b/src/ol_concourse/lib/containers.py index ff4bb72653..9e1a82b0a3 100644 --- a/src/ol_concourse/lib/containers.py +++ b/src/ol_concourse/lib/containers.py @@ -1,5 +1,6 @@ from ol_concourse.lib.jobs.infrastructure import Output from ol_concourse.lib.models.pipeline import ( + AnonymousResource, Cache, Command, Identifier, @@ -35,3 +36,44 @@ def container_build_task( outputs=[Output(name=Identifier("image"))], ), ) + + +def ensure_ecr_task(ecr_repo_name: str) -> TaskStep: + """Return a TaskStep that creates an ECR repository if it does not exist. + + Uses the AWS CLI with instance credentials (IRSA / worker IAM role). + Safe to run on every pipeline execution: ``describe-repositories`` is a + no-op when the repo already exists, and ``create-repository`` only runs + when it does not. + + Args: + ecr_repo_name: The ECR repository name *without* the registry host, + e.g. ``"mitodl/graders-mit-600x"``. 
+ """ + return TaskStep( + task=Identifier("ensure-ecr-repository"), + config=TaskConfig( + platform="linux", + image_resource=AnonymousResource( + type="registry-image", + source={"repository": "amazon/aws-cli", "tag": "latest"}, + ), + params={ + "REPO_NAME": ecr_repo_name, + "AWS_DEFAULT_REGION": "us-east-1", + "AWS_PAGER": "cat", + }, + run=Command( + path="sh", + args=[ + "-exc", + ( + "aws ecr describe-repositories" + " --repository-names ${REPO_NAME}" + " || aws ecr create-repository" + " --repository-name ${REPO_NAME}" + ), + ], + ), + ), + ) diff --git a/src/ol_concourse/pipelines/open_edx/grader_images/__init__.py b/src/ol_concourse/pipelines/open_edx/grader_images/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/src/ol_concourse/pipelines/open_edx/grader_images/base_image_pipeline.py b/src/ol_concourse/pipelines/open_edx/grader_images/base_image_pipeline.py new file mode 100644 index 0000000000..d63b325f9f --- /dev/null +++ b/src/ol_concourse/pipelines/open_edx/grader_images/base_image_pipeline.py @@ -0,0 +1,119 @@ +""" +Pipeline that builds the xqueue-watcher grader base image and pushes it to +both DockerHub and ECR. + +The base image (grader_support/Dockerfile.base) is the foundation for all +course-specific grader images. Publishing it to both registries allows: + - DockerHub (mitodl/xqueue-watcher-grader-base): public reference usable + without AWS credentials; used in grader repo Dockerfiles as the default + GRADER_BASE_IMAGE build arg. The per-grader Concourse build pipelines + trigger off this DockerHub image so a base image rebuild automatically + triggers downstream grader image rebuilds. + - ECR (mitodl/xqueue-watcher-grader-base): private mirror for use inside + AWS without DockerHub rate-limit concerns. + +Triggers: + - Push to the xqueue-watcher repo on paths under grader_support/. 
+""" + +import sys + +from ol_concourse.lib.containers import container_build_task, ensure_ecr_task +from ol_concourse.lib.models.fragment import PipelineFragment +from ol_concourse.lib.models.pipeline import ( + GetStep, + Identifier, + Input, + Job, + Pipeline, + PutStep, +) +from ol_concourse.lib.resources import git_repo, registry_image + +_AWS_ACCOUNT_ID = "610119931565" +_AWS_REGION = "us-east-1" +_BASE_IMAGE_REPO = "mitodl/xqueue-watcher-grader-base" + + +def grader_base_image_pipeline() -> Pipeline: + """Return the pipeline that builds and publishes the grader base image.""" + xqwatcher_repo = git_repo( + name=Identifier("xqueue-watcher-code"), + uri="https://github.com/mitodl/xqueue-watcher", + branch="main", + paths=["grader_support/"], + ) + + # DockerHub push target — public, used by grader repo Dockerfiles as default + # GRADER_BASE_IMAGE build arg and accessible without AWS credentials. + dockerhub_base_image = registry_image( + name=Identifier("grader-base-dockerhub"), + image_repository=_BASE_IMAGE_REPO, + image_tag="latest", + username="((dockerhub.username))", + password="((dockerhub.password))", # noqa: S106 + ) + + # ECR push target — private mirror for use inside AWS without DockerHub + # rate-limit concerns. The per-grader Concourse build pipelines trigger + # off the DockerHub base image (grader_base_dockerhub_repo), not ECR. 
+ ecr_base_image = registry_image( + name=Identifier("grader-base-ecr"), + image_repository=_BASE_IMAGE_REPO, + image_tag="latest", + ecr_region=_AWS_REGION, + ) + + build_job = Job( + name=Identifier("build-grader-base-image"), + plan=[ + GetStep(get=xqwatcher_repo.name, trigger=True), + container_build_task( + inputs=[Input(name=xqwatcher_repo.name)], + build_parameters={ + "CONTEXT": str(xqwatcher_repo.name), + "DOCKERFILE": ( + f"{xqwatcher_repo.name}/grader_support/Dockerfile.base" + ), + }, + ), + ensure_ecr_task(_BASE_IMAGE_REPO), + # Push to DockerHub first — fail fast if credentials are wrong + # before consuming the ECR push quota. + PutStep( + put=dockerhub_base_image.name, + params={ + "image": "image/image.tar", + "additional_tags": f"./{xqwatcher_repo.name}/.git/describe_ref", + }, + ), + PutStep( + put=ecr_base_image.name, + params={ + "image": "image/image.tar", + "additional_tags": f"./{xqwatcher_repo.name}/.git/describe_ref", + }, + ), + ], + ) + + fragment = PipelineFragment( + resources=[xqwatcher_repo, dockerhub_base_image, ecr_base_image], + jobs=[build_job], + ) + + return Pipeline( + resource_types=fragment.resource_types, + resources=fragment.resources, + jobs=fragment.jobs, + ) + + +if __name__ == "__main__": + pipeline_json = grader_base_image_pipeline().model_dump_json(indent=2) + with open("definition.json", "w") as definition: # noqa: PTH123 + definition.write(pipeline_json) + sys.stdout.write(pipeline_json) + sys.stdout.write( + "\nfly -t set-pipeline -p build-grader-base-image -c definition.json\n" + ) diff --git a/src/ol_concourse/pipelines/open_edx/grader_images/build_pipeline.py b/src/ol_concourse/pipelines/open_edx/grader_images/build_pipeline.py new file mode 100644 index 0000000000..0c45283426 --- /dev/null +++ b/src/ol_concourse/pipelines/open_edx/grader_images/build_pipeline.py @@ -0,0 +1,237 @@ +""" +Reusable pipeline builder for course-specific grader images. + +Each grader repository (e.g. 
graders-mit-600x) extends the xqueue-watcher +grader base image with course-specific grader scripts and dependencies. +This module provides a ``GraderPipelineConfig`` dataclass and a +``grader_image_pipeline()`` factory that returns a ``Pipeline`` for building +and pushing that course image to a private ECR repository. + +Triggers: + - New commit to the grader repo (grader scripts or Dockerfile changed). + - New digest of the Docker Hub grader base image (base image rebuilt / + security patch applied). + +The base image digest is resolved at build time by reading the ``repository`` +and ``digest`` files that Concourse's ``registry-image`` resource writes for +every fetched image. The resolved ``repo@sha256:…`` reference is injected +into the Docker build as ``GRADER_BASE_IMAGE`` via a shell wrapper around the +``oci-build-task``'s ``build`` script so that the build layer cache is +correctly invalidated and the published image records the exact base used. +""" + +import dataclasses +import sys + +from ol_concourse.lib.containers import ensure_ecr_task +from ol_concourse.lib.models.fragment import PipelineFragment +from ol_concourse.lib.models.pipeline import ( + Cache, + Command, + GetStep, + Identifier, + Input, + Job, + Output, + Pipeline, + Platform, + PutStep, + TaskConfig, + TaskStep, +) +from ol_concourse.lib.resources import registry_image, ssh_git_repo + +_AWS_ACCOUNT_ID = "610119931565" +_AWS_REGION = "us-east-1" + + +@dataclasses.dataclass +class GraderPipelineConfig: + """Parameters for building and publishing one course-specific grader image. + + Attributes: + pipeline_name: Short identifier used in resource/job names and the + Concourse pipeline name, e.g. ``"graders-mit-600x"``. + grader_repo_url: SSH URL of the grader repository, e.g. + ``"git@github.com:mitodl/graders-mit-600x"``. + grader_repo_branch: Branch to track, e.g. ``"main"`` or ``"master"``. + ecr_repo_name: ECR repository name (without the registry host), e.g. 
+ ``"mitodl/graders-mit-600x"``. Passed directly to the + ``registry-image`` resource; ``ecr_region`` causes Concourse to + infer the correct registry host automatically. + grader_base_dockerhub_repo: DockerHub repository name for the grader + base image used as the build trigger, e.g. + ``"mitodl/xqueue-watcher-grader-base"``. + github_private_key: Vault path for the SSH private key used to clone + the (private) grader repository. Defaults to the odlbot SSH key + stored at ``infrastructure/open_api_clients`` in Vault. + aws_account_id: AWS account ID that hosts the ECR registry. + aws_region: AWS region for ECR authentication. + """ + + pipeline_name: str + grader_repo_url: str + grader_repo_branch: str + ecr_repo_name: str + grader_base_dockerhub_repo: str = "mitodl/xqueue-watcher-grader-base" + github_private_key: str = "((open_api_clients.odlbot_private_ssh_key))" + aws_account_id: str = _AWS_ACCOUNT_ID + aws_region: str = _AWS_REGION + + +def grader_image_pipeline(config: GraderPipelineConfig) -> Pipeline: + """Return a Pipeline that builds and pushes a course-specific grader image. + + The pipeline contains a single build job that: + 1. Watches the grader repo for new commits (trigger). + 2. Watches the grader base image on DockerHub for updates (trigger). + 3. Builds the Dockerfile in the root of the grader repo. A shell + wrapper reads the ``repository`` and ``digest`` files written by the + ``registry-image`` resource and sets ``BUILD_ARG_GRADER_BASE_IMAGE`` + to the immutable ``repo@sha256:…`` reference before invoking the + ``oci-build-task``'s ``build`` script. + 4. Pushes the resulting image to private ECR. + + Args: + config: Pipeline configuration for the grader repository. + + Returns: + A ``Pipeline`` object suitable for serialisation to Concourse YAML/JSON. 
+ """ + grader_repo = ssh_git_repo( + name=Identifier(f"{config.pipeline_name}-code"), + uri=config.grader_repo_url, + branch=config.grader_repo_branch, + private_key=config.github_private_key, + ) + + # Grader base image on DockerHub — used as a build trigger so that + # rebuilding the base image automatically causes this pipeline to run. + grader_base_image = registry_image( + name=Identifier("grader-base-image"), + image_repository=config.grader_base_dockerhub_repo, + image_tag="latest", + username="((dockerhub.username))", + password="((dockerhub.password))", # noqa: S106 + ) + + # Private ECR image for this course's grader. + grader_ecr_image = registry_image( + name=Identifier(f"{config.pipeline_name}-image"), + image_repository=config.ecr_repo_name, + image_tag="latest", + ecr_region=config.aws_region, + ) + + # The registry-image resource writes `repository` and `digest` files into + # the fetched directory. We read them inside the task via a shell wrapper + # that sets BUILD_ARG_GRADER_BASE_IMAGE=repo@sha256:… before exec-ing the + # oci-build-task `build` script. This pins the base image to the exact + # digest that triggered the pipeline run, ensuring reproducibility and + # correct Docker layer-cache invalidation. + # + # Note: oci-build-task `params` are env vars injected verbatim — shell + # expressions like $(cat …) are NOT evaluated there. The `run.args` shell + # wrapper is the only way to dynamically set a BUILD_ARG from a file. 
+ base_ref = grader_base_image.name + build_job = Job( + name=Identifier(f"build-{config.pipeline_name}-image"), + plan=[ + GetStep(get=grader_repo.name, trigger=True), + GetStep(get=grader_base_image.name, trigger=True), + TaskStep( + task=Identifier("build-container-image"), + privileged=True, + config=TaskConfig( + platform=Platform.linux, + image_resource={ + "type": "registry-image", + "source": {"repository": "concourse/oci-build-task"}, + }, + params={ + "CONTEXT": str(grader_repo.name), + "DOCKERFILE": f"{grader_repo.name}/Dockerfile", + }, + caches=[Cache(path="cache")], + inputs=[ + Input(name=grader_repo.name), + Input(name=grader_base_image.name), + ], + outputs=[Output(name=Identifier("image"))], + # Read the base image digest file at runtime and export it + # as BUILD_ARG_GRADER_BASE_IMAGE before running `build`. + run=Command( + path="sh", + args=[ + "-euc", + ( + f"export BUILD_ARG_GRADER_BASE_IMAGE=" + f'"$(cat {base_ref}/repository)' + f'@$(cat {base_ref}/digest)"' + " && exec build" + ), + ], + ), + ), + ), + ensure_ecr_task(config.ecr_repo_name), + PutStep( + put=grader_ecr_image.name, + params={ + "image": "image/image.tar", + "additional_tags": (f"./{grader_repo.name}/.git/describe_ref"), + }, + ), + ], + ) + + fragment = PipelineFragment( + resources=[grader_repo, grader_base_image, grader_ecr_image], + jobs=[build_job], + ) + + return Pipeline( + resource_types=fragment.resource_types, + resources=fragment.resources, + jobs=fragment.jobs, + ) + + +# --------------------------------------------------------------------------- +# Configured grader pipelines +# --------------------------------------------------------------------------- + +GRADER_PIPELINES: list[GraderPipelineConfig] = [ + GraderPipelineConfig( + pipeline_name="graders-mit-600x", + grader_repo_url="git@github.com:mitodl/graders-mit-600x", + grader_repo_branch="main", + ecr_repo_name="mitodl/graders-mit-600x", + ), + GraderPipelineConfig( + pipeline_name="graders-mit-686x", + 
grader_repo_url="git@github.com:mitodl/graders-mit-686x", + grader_repo_branch="main", + ecr_repo_name="mitodl/graders-mit-686x", + ), +] + + +if __name__ == "__main__": + pipeline_name = sys.argv[1] + config = next( + (p for p in GRADER_PIPELINES if p.pipeline_name == pipeline_name), None + ) + if config is None: + sys.exit( + f"Unknown pipeline name {pipeline_name!r}. " + f"Available: {[p.pipeline_name for p in GRADER_PIPELINES]}" + ) + pipeline_json = grader_image_pipeline(config).model_dump_json(indent=2) + with open("definition.json", "w") as definition: # noqa: PTH123 + definition.write(pipeline_json) + sys.stdout.write(pipeline_json) + sys.stdout.write( + f"\nfly -t set-pipeline" + f" -p build-{pipeline_name}-image -c definition.json\n" + ) diff --git a/src/ol_concourse/pipelines/open_edx/grader_images/meta.py b/src/ol_concourse/pipelines/open_edx/grader_images/meta.py new file mode 100644 index 0000000000..fbf4d4e149 --- /dev/null +++ b/src/ol_concourse/pipelines/open_edx/grader_images/meta.py @@ -0,0 +1,169 @@ +""" +Meta pipeline for grader image build pipelines. + +Creates and maintains two types of pipelines: + 1. A base image pipeline (build-grader-base-image) that builds + grader_support/Dockerfile.base from the xqueue-watcher repo and pushes + to both DockerHub and ECR. + 2. One build pipeline per entry in GRADER_PIPELINES that builds and pushes + a course-specific grader image to private ECR. + +This meta pipeline is self-updating: the "create-grader-images-meta-pipeline" +job re-sets itself whenever the pipeline code in ol-infrastructure changes. 
+ +Usage: + fly -t set-pipeline -p grader-images-meta -c definition.json +""" + +import sys + +from ol_concourse.lib.models.pipeline import ( + AnonymousResource, + Command, + GetStep, + Identifier, + Input, + Job, + Output, + Pipeline, + Platform, + SetPipelineStep, + TaskConfig, + TaskStep, +) +from ol_concourse.lib.resources import git_repo +from ol_concourse.pipelines.open_edx.grader_images.build_pipeline import ( + GRADER_PIPELINES, +) + +_PIPELINE_CODE_PATHS = [ + "src/ol_concourse/lib/", + "src/ol_concourse/pipelines/open_edx/grader_images/", +] + +pipeline_code = git_repo( + name=Identifier("grader-images-pipeline-code"), + uri="https://github.com/mitodl/ol-infrastructure", + branch="main", + paths=_PIPELINE_CODE_PATHS, +) + +_OL_INFRA_IMAGE = AnonymousResource( + type="registry-image", + source={ + "repository": "mitodl/ol-infrastructure", + "tag": "latest", + }, +) + + +def _generate_pipeline_task( + task_name: str, script_path: str, script_args: list[str] +) -> TaskStep: + """Return a TaskStep that runs a pipeline-definition script. + + The script writes ``definition.json`` to the ``pipeline`` output directory, + which the subsequent ``SetPipelineStep`` reads. 
+ """ + return TaskStep( + task=Identifier(task_name), + config=TaskConfig( + platform=Platform.linux, + image_resource=_OL_INFRA_IMAGE, + inputs=[Input(name=pipeline_code.name)], + outputs=[Output(name=Identifier("pipeline"))], + run=Command( + path="python", + dir="pipeline", + user="root", + args=[f"../{pipeline_code.name}/{script_path}", *script_args], + ), + ), + ) + + +def _build_base_image_meta_job() -> Job: + """Job that creates/updates the grader base image build pipeline.""" + return Job( + name=Identifier("create-grader-base-image-pipeline"), + plan=[ + GetStep(get=pipeline_code.name, trigger=True), + _generate_pipeline_task( + task_name="generate-base-image-pipeline-definition", + script_path=( + "src/ol_concourse/pipelines/open_edx/" + "grader_images/base_image_pipeline.py" + ), + script_args=[], + ), + SetPipelineStep( + team="infrastructure", + set_pipeline=Identifier("build-grader-base-image"), + file="pipeline/definition.json", + ), + ], + ) + + +def _build_grader_meta_job(pipeline_name: str) -> Job: + """Job that creates/updates the build pipeline for one grader repo.""" + return Job( + name=Identifier(f"create-{pipeline_name}-pipeline"), + plan=[ + GetStep(get=pipeline_code.name, trigger=True), + _generate_pipeline_task( + task_name=f"generate-{pipeline_name}-pipeline-definition", + script_path=( + "src/ol_concourse/pipelines/open_edx/" + "grader_images/build_pipeline.py" + ), + script_args=[pipeline_name], + ), + SetPipelineStep( + team="infrastructure", + set_pipeline=Identifier(f"build-{pipeline_name}-image"), + file="pipeline/definition.json", + ), + ], + ) + + +def _build_self_update_job() -> Job: + """Job that keeps the meta pipeline itself in sync with the repo.""" + return Job( + name=Identifier("create-grader-images-meta-pipeline"), + plan=[ + GetStep(get=pipeline_code.name, trigger=True), + _generate_pipeline_task( + task_name="generate-meta-pipeline-definition", + script_path=( + 
"src/ol_concourse/pipelines/open_edx/grader_images/meta.py" + ), + script_args=[], + ), + SetPipelineStep( + team="main", + set_pipeline="self", + file="pipeline/definition.json", + ), + ], + ) + + +meta_jobs = [ + _build_self_update_job(), + _build_base_image_meta_job(), + *[_build_grader_meta_job(config.pipeline_name) for config in GRADER_PIPELINES], +] + +meta_pipeline = Pipeline(resources=[pipeline_code], jobs=meta_jobs) + + +if __name__ == "__main__": + pipeline_json = meta_pipeline.model_dump_json(indent=2) + with open("definition.json", "w") as definition: # noqa: PTH123 + definition.write(pipeline_json) + sys.stdout.write(pipeline_json) + sys.stdout.write( + "\nfly -t set-pipeline -p grader-images-meta -c definition.json\n" + ) diff --git a/src/ol_concourse/pipelines/open_edx/xqwatcher/docker_pulumi_pipeline.py b/src/ol_concourse/pipelines/open_edx/xqwatcher/docker_pulumi_pipeline.py new file mode 100644 index 0000000000..875896420b --- /dev/null +++ b/src/ol_concourse/pipelines/open_edx/xqwatcher/docker_pulumi_pipeline.py @@ -0,0 +1,136 @@ +import sys + +from bridge.settings.openedx.accessors import filter_deployments_by_application +from ol_concourse.lib.containers import container_build_task +from ol_concourse.lib.jobs.infrastructure import pulumi_jobs_chain +from ol_concourse.lib.models.fragment import PipelineFragment +from ol_concourse.lib.models.pipeline import ( + GetStep, + Identifier, + Input, + Job, + Pipeline, + PutStep, +) +from ol_concourse.lib.resources import git_repo, registry_image +from ol_concourse.pipelines.constants import PULUMI_CODE_PATH, PULUMI_WATCHED_PATHS + + +def build_xqwatcher_pipeline(release_name: str): + xqwatcher_repo = git_repo( + name=Identifier("xqueue-watcher-code"), + uri="https://github.com/mitodl/xqueue-watcher", + branch="main", + ) + + xqwatcher_registry_image = registry_image( + name=Identifier("xqueue-watcher-container"), + image_repository="mitodl/xqueue-watcher", + image_tag="latest", + 
username="((dockerhub.username))", + password="((dockerhub.password))", # noqa: S106 + ) + + xqwatcher_pulumi_code = git_repo( + name=Identifier("ol-infrastructure-deploy"), + uri="https://github.com/mitodl/ol-infrastructure", + branch="main", + paths=[ + *PULUMI_WATCHED_PATHS, + PULUMI_CODE_PATH.joinpath("applications/xqwatcher/"), + "src/bridge/settings/openedx/", + ], + ) + + image_build_job = Job( + name=Identifier("build-xqueue-watcher-image"), + plan=[ + GetStep(get=xqwatcher_repo.name, trigger=True), + container_build_task( + inputs=[ + Input(name=xqwatcher_repo.name), + ], + build_parameters={ + "CONTEXT": xqwatcher_repo.name, + "DOCKERFILE": f"{xqwatcher_repo.name}/Dockerfile", + }, + ), + PutStep( + put=xqwatcher_registry_image.name, + params={ + "image": "image/image.tar", + "additional_tags": f"./{xqwatcher_repo.name}/.git/describe_ref", + }, + ), + ], + ) + + container_fragment = PipelineFragment( + resources=[xqwatcher_repo, xqwatcher_registry_image], + jobs=[image_build_job], + ) + + loop_fragments = [] + for deployment in filter_deployments_by_application(release_name, "xqwatcher"): + pulumi_fragment = pulumi_jobs_chain( + xqwatcher_pulumi_code, + stack_names=[ + f"applications.xqwatcher.{deployment.deployment_name}.{stage}" + for stage in deployment.envs_by_release(release_name) + ], + project_name="ol-infrastructure-xqwatcher-server", + project_source_path=PULUMI_CODE_PATH.joinpath("applications/xqwatcher/"), + dependencies=[ + GetStep( + get=container_fragment.resources[-1].name, + trigger=True, + passed=[container_fragment.jobs[-1].name], + ), + ], + env_vars_from_files={ + "XQWATCHER_DOCKER_DIGEST": f"{xqwatcher_registry_image.name}/digest" + }, + ) + loop_fragments.append(pulumi_fragment) + + combined_fragments = PipelineFragment.combine_fragments( + container_fragment, + *loop_fragments, + ) + + return Pipeline( + resource_types=combined_fragments.resource_types, + resources=[ + *combined_fragments.resources, + xqwatcher_pulumi_code, + ], + 
jobs=combined_fragments.jobs, + ) + + +if __name__ == "__main__": + from bridge.settings.openedx.types import OpenEdxSupportedRelease + + if len(sys.argv) < 2: # noqa: PLR2004 + releases = [r.name for r in OpenEdxSupportedRelease] + sys.stderr.write( + f"Usage: {sys.argv[0]} <release_name>\n" + f"Available releases: {', '.join(releases)}\n" + ) + sys.exit(1) + release_name = sys.argv[1] + pipeline_json = build_xqwatcher_pipeline( + release_name, + ).model_dump_json(indent=2) + with open("definition.json", "w") as definition: # noqa: PTH123 + definition.write(pipeline_json) + sys.stdout.write(pipeline_json) + sys.stdout.writelines( + ( + "\n", + ( + "fly -t <target> set-pipeline -p" + f" docker-pulumi-xqwatcher-{release_name} -c definition.json" + ), + ) + ) diff --git a/src/ol_concourse/pipelines/open_edx/xqwatcher/meta.py b/src/ol_concourse/pipelines/open_edx/xqwatcher/meta.py index 6d79c62c4a..6ee836d7fb 100644 --- a/src/ol_concourse/pipelines/open_edx/xqwatcher/meta.py +++ b/src/ol_concourse/pipelines/open_edx/xqwatcher/meta.py @@ -37,10 +37,10 @@ def build_meta_job(release_name): pipeline_id = "self" else: pipeline_definition_path = ( - "src/ol_concourse/pipelines/open_edx/xqwatcher/packer_pulumi_pipeline.py" + "src/ol_concourse/pipelines/open_edx/xqwatcher/docker_pulumi_pipeline.py" ) pipeline_team = "infrastructure" - pipeline_id = f"packer-pulumi-xqwatcher-{release_name}" + pipeline_id = f"docker-pulumi-xqwatcher-{release_name}" return Job( name=Identifier(f"create-xqwatcher-{release_name}-pipeline"), plan=[ diff --git a/src/ol_infrastructure/applications/xqwatcher/Pulumi.applications.xqwatcher.mitx-staging.CI.yaml b/src/ol_infrastructure/applications/xqwatcher/Pulumi.applications.xqwatcher.mitx-staging.CI.yaml index a023e8c5df..5fa3b1e484 100644 --- a/src/ol_infrastructure/applications/xqwatcher/Pulumi.applications.xqwatcher.mitx-staging.CI.yaml +++ b/src/ol_infrastructure/applications/xqwatcher/Pulumi.applications.xqwatcher.mitx-staging.CI.yaml @@ -3,13 +3,35 @@ secretsprovider:
awskms://alias/infrastructure-secrets-ci encryptedkey: AQICAHjnbqe9AmEW1Js10nySybyuAG7Fb5E9EHUgkmqFDv7PxQGTfGgSk9EZ4ZNb/wbJfXd+AAAAfjB8BgkqhkiG9w0BBwagbzBtAgEAMGgGCSqGSIb3DQEHATAeBglghkgBZQMEAS4wEQQMA0HqkgIE4odoJJA1AgEQgDvV0/Ss11eXyMaVbNAyMdRYYZtma1v9dVZa+p4MuzZFJn9xBZU9Fsa1suYQRgBa5jhg3XsmZDnN8st/aw== config: aws:region: us-east-1 - consul:address: https://consul-mitx-staging-ci.odl.mit.edu - xqwatcher:auto_scale: - desired: 1 - max: 2 - min: 1 - xqwatcher:instance_type: t3a.small xqwatcher:business_unit: residential-staging - xqwatcher:target_vpc: residential_mitx_staging_vpc + xqwatcher:cluster: residential + xqwatcher:namespace: mitx-staging-openedx + xqwatcher:min_replicas: 1 vault:address: https://vault-ci.odl.mit.edu vault_server:env_namespace: operations.ci + xqwatcher:xqueue_server_url: https://xqueue.mitx-staging-ci.odl.mit.edu + xqwatcher:queues: + Watcher-MITx-6.00x: + CONNECTIONS: 5 + HANDLERS: + - HANDLER: xqueue_watcher.containergrader.ContainerGrader + KWARGS: + grader_root: /graders/python3graders/ + image: 610119931565.dkr.ecr.us-east-1.amazonaws.com/mitodl/graders-mit-600x:latest + backend: kubernetes + cpu_limit: 1000m + memory_limit: 512Mi + timeout: 60 + image_pull_policy: always + Watcher-MITx-6.0001r: + CONNECTIONS: 5 + HANDLERS: + - HANDLER: xqueue_watcher.containergrader.ContainerGrader + KWARGS: + grader_root: /graders/python3graders/ + image: 610119931565.dkr.ecr.us-east-1.amazonaws.com/mitodl/graders-mit-600x:latest + backend: kubernetes + cpu_limit: 1000m + memory_limit: 512Mi + timeout: 60 + image_pull_policy: always diff --git a/src/ol_infrastructure/applications/xqwatcher/Pulumi.applications.xqwatcher.mitx-staging.Production.yaml b/src/ol_infrastructure/applications/xqwatcher/Pulumi.applications.xqwatcher.mitx-staging.Production.yaml index 396551cb4b..3c55f1b327 100644 --- a/src/ol_infrastructure/applications/xqwatcher/Pulumi.applications.xqwatcher.mitx-staging.Production.yaml +++ 
b/src/ol_infrastructure/applications/xqwatcher/Pulumi.applications.xqwatcher.mitx-staging.Production.yaml @@ -3,13 +3,35 @@ secretsprovider: awskms://alias/infrastructure-secrets-production encryptedkey: AQICAHjmo6C0sCNz3fdkFlhbu0tdBZxnHmPYSnqtmocvGiuNygF8Luz2qeEEBP5Xlrjd6nabAAAAfjB8BgkqhkiG9w0BBwagbzBtAgEAMGgGCSqGSIb3DQEHATAeBglghkgBZQMEAS4wEQQMx2cfVhpQzWg5NpHZAgEQgDtL008GopRA6ADVsjgvqT7eMirUDc8R1jusrAhd7rHx016K9nC2OI23eapgxQyW3fgAomXkVJQir5fHYA== config: aws:region: us-east-1 - consul:address: https://consul-mitx-staging-production.odl.mit.edu - xqwatcher:auto_scale: - desired: 1 - max: 2 - min: 1 - xqwatcher:instance_type: t3a.small xqwatcher:business_unit: residential-staging - xqwatcher:target_vpc: residential_mitx_staging_vpc + xqwatcher:cluster: residential + xqwatcher:namespace: mitx-staging-openedx + xqwatcher:min_replicas: 1 vault:address: https://vault-production.odl.mit.edu vault_server:env_namespace: operations.production + xqwatcher:xqueue_server_url: https://xqueue.mitx-staging-prod.odl.mit.edu + xqwatcher:queues: + Watcher-MITx-staging-6.00x: + CONNECTIONS: 5 + HANDLERS: + - HANDLER: xqueue_watcher.containergrader.ContainerGrader + KWARGS: + grader_root: /graders/python3graders/ + image: 610119931565.dkr.ecr.us-east-1.amazonaws.com/mitodl/graders-mit-600x:latest + backend: kubernetes + cpu_limit: 1000m + memory_limit: 512Mi + timeout: 60 + image_pull_policy: always + Watcher-MITx-staging-6.0001r: + CONNECTIONS: 5 + HANDLERS: + - HANDLER: xqueue_watcher.containergrader.ContainerGrader + KWARGS: + grader_root: /graders/python3graders/ + image: 610119931565.dkr.ecr.us-east-1.amazonaws.com/mitodl/graders-mit-600x:latest + backend: kubernetes + cpu_limit: 1000m + memory_limit: 512Mi + timeout: 60 + image_pull_policy: always diff --git a/src/ol_infrastructure/applications/xqwatcher/Pulumi.applications.xqwatcher.mitx-staging.QA.yaml b/src/ol_infrastructure/applications/xqwatcher/Pulumi.applications.xqwatcher.mitx-staging.QA.yaml index 
9a501d67ea..538de15860 100644 --- a/src/ol_infrastructure/applications/xqwatcher/Pulumi.applications.xqwatcher.mitx-staging.QA.yaml +++ b/src/ol_infrastructure/applications/xqwatcher/Pulumi.applications.xqwatcher.mitx-staging.QA.yaml @@ -3,13 +3,35 @@ secretsprovider: awskms://alias/infrastructure-secrets-qa encryptedkey: AQICAHgQW+3bag/cl2fPG3dPdqAPbfcsZuwI7rETXZsx85HRpgHz947ZuJNR+i0BzvgqRXZMAAAAfjB8BgkqhkiG9w0BBwagbzBtAgEAMGgGCSqGSIb3DQEHATAeBglghkgBZQMEAS4wEQQMSLaA5CcNOfBjGw2fAgEQgDv2PNuHpexnToW8k4+LZa/O4CHA+8dn0qTB9vNd+rPFMlShc4mt37WhEY/KHAmUhkLvkDsaySxcdelxrA== config: aws:region: us-east-1 - consul:address: https://consul-mitx-staging-qa.odl.mit.edu - xqwatcher:auto_scale: - desired: 1 - max: 2 - min: 1 - xqwatcher:instance_type: t3a.small xqwatcher:business_unit: residential-staging - xqwatcher:target_vpc: residential_mitx_staging_vpc + xqwatcher:cluster: residential + xqwatcher:namespace: mitx-staging-openedx + xqwatcher:min_replicas: 1 vault:address: https://vault-qa.odl.mit.edu vault_server:env_namespace: operations.qa + xqwatcher:xqueue_server_url: https://xqueue.mitx-staging-qa.odl.mit.edu + xqwatcher:queues: + Watcher-MITx-staging-6.00x: + CONNECTIONS: 5 + HANDLERS: + - HANDLER: xqueue_watcher.containergrader.ContainerGrader + KWARGS: + grader_root: /graders/python3graders/ + image: 610119931565.dkr.ecr.us-east-1.amazonaws.com/mitodl/graders-mit-600x:latest + backend: kubernetes + cpu_limit: 1000m + memory_limit: 512Mi + timeout: 60 + image_pull_policy: always + Watcher-MITx-staging-6.0001r: + CONNECTIONS: 5 + HANDLERS: + - HANDLER: xqueue_watcher.containergrader.ContainerGrader + KWARGS: + grader_root: /graders/python3graders/ + image: 610119931565.dkr.ecr.us-east-1.amazonaws.com/mitodl/graders-mit-600x:latest + backend: kubernetes + cpu_limit: 1000m + memory_limit: 512Mi + timeout: 60 + image_pull_policy: always diff --git a/src/ol_infrastructure/applications/xqwatcher/Pulumi.applications.xqwatcher.mitx.CI.yaml 
b/src/ol_infrastructure/applications/xqwatcher/Pulumi.applications.xqwatcher.mitx.CI.yaml index 32563028d1..74b8886a89 100644 --- a/src/ol_infrastructure/applications/xqwatcher/Pulumi.applications.xqwatcher.mitx.CI.yaml +++ b/src/ol_infrastructure/applications/xqwatcher/Pulumi.applications.xqwatcher.mitx.CI.yaml @@ -3,13 +3,35 @@ secretsprovider: awskms://alias/infrastructure-secrets-ci encryptedkey: AQICAHjnbqe9AmEW1Js10nySybyuAG7Fb5E9EHUgkmqFDv7PxQHQ1nYxdMdGpUV3lkCYkPCCAAAAfjB8BgkqhkiG9w0BBwagbzBtAgEAMGgGCSqGSIb3DQEHATAeBglghkgBZQMEAS4wEQQMsqc+FPq0+d14aeiEAgEQgDtOLAH8o/ueXOQKwFgUIv0reMzktDtSL+DF4pec6zPtl0qaaP7mXGY9WECr4y4YGaZ6uHtgh1pHncqqIQ== config: aws:region: us-east-1 - consul:address: https://consul-mitx-ci.odl.mit.edu - xqwatcher:auto_scale: - desired: 1 - max: 2 - min: 1 - xqwatcher:instance_type: t3a.small xqwatcher:business_unit: residential - xqwatcher:target_vpc: residential_mitx_vpc + xqwatcher:cluster: residential + xqwatcher:namespace: mitx-openedx + xqwatcher:min_replicas: 1 vault:address: https://vault-ci.odl.mit.edu vault_server:env_namespace: operations.ci + xqwatcher:xqueue_server_url: https://xqueue.mitx.ci.odl.mit.edu + xqwatcher:queues: + Watcher-MITx-6.00x: + CONNECTIONS: 5 + HANDLERS: + - HANDLER: xqueue_watcher.containergrader.ContainerGrader + KWARGS: + grader_root: /graders/python3graders/ + image: 610119931565.dkr.ecr.us-east-1.amazonaws.com/mitodl/graders-mit-600x:latest + backend: kubernetes + cpu_limit: 1000m + memory_limit: 512Mi + timeout: 60 + image_pull_policy: always + Watcher-MITx-6.0001r: + CONNECTIONS: 5 + HANDLERS: + - HANDLER: xqueue_watcher.containergrader.ContainerGrader + KWARGS: + grader_root: /graders/python3graders/ + image: 610119931565.dkr.ecr.us-east-1.amazonaws.com/mitodl/graders-mit-600x:latest + backend: kubernetes + cpu_limit: 1000m + memory_limit: 512Mi + timeout: 60 + image_pull_policy: always diff --git a/src/ol_infrastructure/applications/xqwatcher/Pulumi.applications.xqwatcher.mitx.Production.yaml 
b/src/ol_infrastructure/applications/xqwatcher/Pulumi.applications.xqwatcher.mitx.Production.yaml index 7b34ef48eb..53936aecb3 100644 --- a/src/ol_infrastructure/applications/xqwatcher/Pulumi.applications.xqwatcher.mitx.Production.yaml +++ b/src/ol_infrastructure/applications/xqwatcher/Pulumi.applications.xqwatcher.mitx.Production.yaml @@ -3,13 +3,35 @@ secretsprovider: awskms://alias/infrastructure-secrets-production encryptedkey: AQICAHjmo6C0sCNz3fdkFlhbu0tdBZxnHmPYSnqtmocvGiuNygG2bHWpHlBF4YM4HIMysk4IAAAAfjB8BgkqhkiG9w0BBwagbzBtAgEAMGgGCSqGSIb3DQEHATAeBglghkgBZQMEAS4wEQQMawAEgwsc+Nx69O7TAgEQgDtiAIRJusPXgD/M0b49KX75IkX36QN7kgXzYkq5KijA7xXU9pJkECwS0ZF9eQikfX6Po8sm4e+frmhCWg== config: aws:region: us-east-1 - consul:address: https://consul-mitx-production.odl.mit.edu - xqwatcher:auto_scale: - desired: 2 - max: 3 - min: 1 - xqwatcher:instance_type: r5a.large xqwatcher:business_unit: residential - xqwatcher:target_vpc: residential_mitx_vpc + xqwatcher:cluster: residential + xqwatcher:namespace: mitx-openedx + xqwatcher:min_replicas: 2 vault:address: https://vault-production.odl.mit.edu vault_server:env_namespace: operations.production + xqwatcher:xqueue_server_url: https://xqueue.mitx.odl.mit.edu + xqwatcher:queues: + Watcher-MITx-6.00x: + CONNECTIONS: 5 + HANDLERS: + - HANDLER: xqueue_watcher.containergrader.ContainerGrader + KWARGS: + grader_root: /graders/python3graders/ + image: 610119931565.dkr.ecr.us-east-1.amazonaws.com/mitodl/graders-mit-600x:latest + backend: kubernetes + cpu_limit: 1000m + memory_limit: 512Mi + timeout: 60 + image_pull_policy: always + Watcher-MITx-6.0001r: + CONNECTIONS: 5 + HANDLERS: + - HANDLER: xqueue_watcher.containergrader.ContainerGrader + KWARGS: + grader_root: /graders/python3graders/ + image: 610119931565.dkr.ecr.us-east-1.amazonaws.com/mitodl/graders-mit-600x:latest + backend: kubernetes + cpu_limit: 1000m + memory_limit: 512Mi + timeout: 60 + image_pull_policy: always diff --git 
a/src/ol_infrastructure/applications/xqwatcher/Pulumi.applications.xqwatcher.mitx.QA.yaml b/src/ol_infrastructure/applications/xqwatcher/Pulumi.applications.xqwatcher.mitx.QA.yaml index bcc3bcf6a7..3b6bc1b926 100644 --- a/src/ol_infrastructure/applications/xqwatcher/Pulumi.applications.xqwatcher.mitx.QA.yaml +++ b/src/ol_infrastructure/applications/xqwatcher/Pulumi.applications.xqwatcher.mitx.QA.yaml @@ -3,13 +3,35 @@ secretsprovider: awskms://alias/infrastructure-secrets-qa encryptedkey: AQICAHgQW+3bag/cl2fPG3dPdqAPbfcsZuwI7rETXZsx85HRpgFh6uALQ+g4+ZnTTRntlQCIAAAAfjB8BgkqhkiG9w0BBwagbzBtAgEAMGgGCSqGSIb3DQEHATAeBglghkgBZQMEAS4wEQQMAxL8o+aBjXxOHc8xAgEQgDuxzj+qX9ZtqoBbvEyXA4VRvbWDhFOHIRbGsJ1NCgU+Hmy8R3gsBN45UE7Wu71yWe6oINNqRbsuDw10EQ== config: aws:region: us-east-1 - consul:address: https://consul-mitx-qa.odl.mit.edu - xqwatcher:auto_scale: - desired: 1 - max: 2 - min: 1 - xqwatcher:instance_type: r5a.large xqwatcher:business_unit: residential - xqwatcher:target_vpc: residential_mitx_vpc + xqwatcher:cluster: residential + xqwatcher:namespace: mitx-openedx + xqwatcher:min_replicas: 1 vault:address: https://vault-qa.odl.mit.edu vault_server:env_namespace: operations.qa + xqwatcher:xqueue_server_url: https://xqueue.mitx.qa.odl.mit.edu + xqwatcher:queues: + Watcher-MITx-6.00x: + CONNECTIONS: 5 + HANDLERS: + - HANDLER: xqueue_watcher.containergrader.ContainerGrader + KWARGS: + grader_root: /graders/python3graders/ + image: 610119931565.dkr.ecr.us-east-1.amazonaws.com/mitodl/graders-mit-600x:latest + backend: kubernetes + cpu_limit: 1000m + memory_limit: 512Mi + timeout: 60 + image_pull_policy: always + Watcher-MITx-6.0001r: + CONNECTIONS: 5 + HANDLERS: + - HANDLER: xqueue_watcher.containergrader.ContainerGrader + KWARGS: + grader_root: /graders/python3graders/ + image: 610119931565.dkr.ecr.us-east-1.amazonaws.com/mitodl/graders-mit-600x:latest + backend: kubernetes + cpu_limit: 1000m + memory_limit: 512Mi + timeout: 60 + image_pull_policy: always diff --git 
a/src/ol_infrastructure/applications/xqwatcher/Pulumi.applications.xqwatcher.mitxonline.CI.yaml b/src/ol_infrastructure/applications/xqwatcher/Pulumi.applications.xqwatcher.mitxonline.CI.yaml index 6899ca4608..5c01f5a82f 100644 --- a/src/ol_infrastructure/applications/xqwatcher/Pulumi.applications.xqwatcher.mitxonline.CI.yaml +++ b/src/ol_infrastructure/applications/xqwatcher/Pulumi.applications.xqwatcher.mitxonline.CI.yaml @@ -3,13 +3,23 @@ secretsprovider: awskms://alias/infrastructure-secrets-ci encryptedkey: AQICAHi3MZ/Pjy2dahB1Qm+zKkKDPV1b9MYPGp7k649HPjmOHAG+XE3l7voVbQN9bQ80XZRMAAAAfjB8BgkqhkiG9w0BBwagbzBtAgEAMGgGCSqGSIb3DQEHATAeBglghkgBZQMEAS4wEQQMq5xwinh6H/9mPuyeAgEQgDtUAuLW2HbXCdstaU6dZEtTOQ2SXq67YUzDbnJeE2FNb49KxWEgsXeUzz/r6XiML/cTwq1cTIHq7LDi0w== config: aws:region: us-east-1 - consul:address: https://consul-mitxonline-ci.odl.mit.edu - xqwatcher:auto_scale: - desired: 1 - max: 2 - min: 1 - xqwatcher:instance_type: t3a.small xqwatcher:business_unit: mitxonline - xqwatcher:target_vpc: mitxonline_vpc + xqwatcher:cluster: applications + xqwatcher:namespace: mitxonline-openedx + xqwatcher:min_replicas: 1 vault:address: https://vault-ci.odl.mit.edu vault_server:env_namespace: operations.ci + xqwatcher:xqueue_server_url: https://xqueue.mitxonline.ci.odl.mit.edu + xqwatcher:queues: + Watcher-MITx-6.00x: + CONNECTIONS: 5 + HANDLERS: + - HANDLER: xqueue_watcher.containergrader.ContainerGrader + KWARGS: + grader_root: /graders/python3graders/ + image: 610119931565.dkr.ecr.us-east-1.amazonaws.com/mitodl/graders-mit-600x:latest + backend: kubernetes + cpu_limit: 1000m + memory_limit: 512Mi + timeout: 60 + image_pull_policy: always diff --git a/src/ol_infrastructure/applications/xqwatcher/Pulumi.applications.xqwatcher.mitxonline.Production.yaml b/src/ol_infrastructure/applications/xqwatcher/Pulumi.applications.xqwatcher.mitxonline.Production.yaml index 2247500801..bfc18a6c71 100644 --- 
a/src/ol_infrastructure/applications/xqwatcher/Pulumi.applications.xqwatcher.mitxonline.Production.yaml +++ b/src/ol_infrastructure/applications/xqwatcher/Pulumi.applications.xqwatcher.mitxonline.Production.yaml @@ -3,13 +3,36 @@ secretsprovider: awskms://alias/infrastructure-secrets-production encryptedkey: AQICAHjmo6C0sCNz3fdkFlhbu0tdBZxnHmPYSnqtmocvGiuNygG5AzdO0QY0yXbhDGt3drvfAAAAfjB8BgkqhkiG9w0BBwagbzBtAgEAMGgGCSqGSIb3DQEHATAeBglghkgBZQMEAS4wEQQMevv1o+gghWthYkifAgEQgDtMc3j8K7A1ne9ZjHtpgBo9wlSor6yW7KOQpjGjToqweQ5wvlLlkOQibnSKKxi6Vhsm3gXz7nlzNuliIg== config: aws:region: us-east-1 - consul:address: https://consul-mitxonline-production.odl.mit.edu - xqwatcher:auto_scale: - desired: 3 - max: 8 - min: 1 - xqwatcher:instance_type: r7a.large xqwatcher:business_unit: mitxonline - xqwatcher:target_vpc: mitxonline_vpc + xqwatcher:cluster: applications + xqwatcher:namespace: mitxonline-openedx + xqwatcher:min_replicas: 2 vault:address: https://vault-production.odl.mit.edu vault_server:env_namespace: operations.production + xqwatcher:xqueue_server_url: https://xqueue.mitxonline.odl.mit.edu + xqwatcher:queues: + Watcher-MITx-6.00x: + CONNECTIONS: 5 + HANDLERS: + - HANDLER: xqueue_watcher.containergrader.ContainerGrader + KWARGS: + grader_root: /graders/python3graders/ + image: 610119931565.dkr.ecr.us-east-1.amazonaws.com/mitodl/graders-mit-600x:latest + backend: kubernetes + cpu_limit: 1000m + memory_limit: 512Mi + timeout: 60 + image_pull_policy: always + edxorg-686x: + SERVER_REF: edxorg + CONNECTIONS: 5 + HANDLERS: + - HANDLER: xqueue_watcher.containergrader.ContainerGrader + KWARGS: + grader_root: /graders/ + image: 610119931565.dkr.ecr.us-east-1.amazonaws.com/mitodl/graders-mit-686x:latest + backend: kubernetes + cpu_limit: 1000m + memory_limit: 1Gi + timeout: 60 + image_pull_policy: always diff --git a/src/ol_infrastructure/applications/xqwatcher/Pulumi.applications.xqwatcher.mitxonline.QA.yaml 
b/src/ol_infrastructure/applications/xqwatcher/Pulumi.applications.xqwatcher.mitxonline.QA.yaml index bd41dd3103..5b4317485b 100644 --- a/src/ol_infrastructure/applications/xqwatcher/Pulumi.applications.xqwatcher.mitxonline.QA.yaml +++ b/src/ol_infrastructure/applications/xqwatcher/Pulumi.applications.xqwatcher.mitxonline.QA.yaml @@ -3,13 +3,23 @@ secretsprovider: awskms://alias/infrastructure-secrets-qa encryptedkey: AQICAHgQW+3bag/cl2fPG3dPdqAPbfcsZuwI7rETXZsx85HRpgHzGaCdqKWGOJ49SaKpOTIJAAAAfjB8BgkqhkiG9w0BBwagbzBtAgEAMGgGCSqGSIb3DQEHATAeBglghkgBZQMEAS4wEQQMolKUc2XHdg9utgvHAgEQgDt45yPejgGlntgwIkd0Eu0ButJHb8w1TpovLlcZAowgytdZ2JXjqvmRyncU1GOtcf7/NYjfjqj5WE5iSg== config: aws:region: us-east-1 - consul:address: https://consul-mitxonline-qa.odl.mit.edu - xqwatcher:auto_scale: - desired: 1 - max: 2 - min: 1 - xqwatcher:instance_type: t3a.small xqwatcher:business_unit: mitxonline - xqwatcher:target_vpc: mitxonline_vpc + xqwatcher:cluster: applications + xqwatcher:namespace: mitxonline-openedx + xqwatcher:min_replicas: 1 vault:address: https://vault-qa.odl.mit.edu vault_server:env_namespace: operations.qa + xqwatcher:xqueue_server_url: https://xqueue.mitxonline.qa.odl.mit.edu + xqwatcher:queues: + Watcher-MITx-6.00x: + CONNECTIONS: 5 + HANDLERS: + - HANDLER: xqueue_watcher.containergrader.ContainerGrader + KWARGS: + grader_root: /graders/python3graders/ + image: 610119931565.dkr.ecr.us-east-1.amazonaws.com/mitodl/graders-mit-600x:latest + backend: kubernetes + cpu_limit: 1000m + memory_limit: 512Mi + timeout: 60 + image_pull_policy: always diff --git a/src/ol_infrastructure/applications/xqwatcher/__main__.py b/src/ol_infrastructure/applications/xqwatcher/__main__.py index 378bfc2c35..7b5d992d5b 100644 --- a/src/ol_infrastructure/applications/xqwatcher/__main__.py +++ b/src/ol_infrastructure/applications/xqwatcher/__main__.py @@ -1,265 +1,900 @@ -"""Create the resources needed to run a xqwatcher server. 
# noqa: D200""" +"""Create the Kubernetes resources needed to run xqueue-watcher. # noqa: D200 -# Note: This stack has a silent dependency on an peering connection between the VPC -# that it is installed in and the VPC(s) that contain the xqueue instances. +xqueue-watcher polls an xqueue server for student code submissions and grades +them by spawning an isolated container (ContainerGrader) per submission. This +stack replaces the previous EC2 AMI-based deployment with a Kubernetes +Deployment on the shared applications EKS cluster. -import base64 +Secrets are managed via the Vault Secrets Operator (VaultStaticSecret CRD). +""" + +import copy import json -import textwrap +import os from pathlib import Path +from typing import Any + +import pulumi_kubernetes as kubernetes +from pulumi import Config, ResourceOptions, StackReference, export -import pulumi_vault as vault -import yaml -from pulumi import Config, StackReference, export -from pulumi_aws import ec2, get_caller_identity, iam - -from bridge.secrets.sops import read_yaml_secrets -from bridge.settings.openedx.version_matrix import OpenLearningOpenEdxDeployment -from ol_infrastructure.components.aws.auto_scale_group import ( - BlockDeviceMapping, - OLAutoScaleGroupConfig, - OLAutoScaling, - OLLaunchTemplateConfig, - TagSpecification, +from ol_infrastructure.components.applications.eks import ( + OLEKSAuthBinding, + OLEKSAuthBindingConfig, +) +from ol_infrastructure.components.services.vault import ( + OLVaultK8SSecret, + OLVaultK8SStaticSecretConfig, ) -from ol_infrastructure.lib.aws.ec2_helper import InstanceTypes, default_egress_args -from ol_infrastructure.lib.consul import get_consul_provider -from ol_infrastructure.lib.ol_types import AWSBase +from ol_infrastructure.lib.aws.eks_helper import cached_image_uri, setup_k8s_provider +from ol_infrastructure.lib.ol_types import AWSBase, K8sGlobalLabels, Services from ol_infrastructure.lib.pulumi_helper import parse_stack from ol_infrastructure.lib.vault import 
setup_vault_provider ################################## -## Setup + Config Retrival ## +## Setup + Config Retrieval ## ################################## -if Config("vault_server").get("env_namespace"): +if Config("vault_server").get("env_namespace") or Config("vault").get("address"): setup_vault_provider() + stack_info = parse_stack() xqwatcher_config = Config("xqwatcher") -network_stack = StackReference(f"infrastructure.aws.network.{stack_info.name}") -policy_stack = StackReference("infrastructure.aws.policies") -dns_stack = StackReference("infrastructure.aws.dns") -consul_stack = StackReference( - f"infrastructure.consul.{stack_info.env_prefix}.{stack_info.name}" -) - -env_name = f"{stack_info.env_prefix}-{stack_info.env_suffix}" - -target_vpc_name = xqwatcher_config.get("target_vpc") -target_vpc = network_stack.require_output(target_vpc_name) -vpc_id = target_vpc["id"] - -consul_security_groups = consul_stack.require_output("security_groups") -consul_provider = get_consul_provider(stack_info) -vault_mount_stack = StackReference( - f"substructure.vault.static_mounts.operations.{stack_info.name}" +cluster_name = xqwatcher_config.get("cluster") or "applications" +cluster_stack = StackReference( + f"infrastructure.aws.eks.{cluster_name}.{stack_info.name}" ) -aws_account = get_caller_identity() +env_name = f"{stack_info.env_prefix}-{stack_info.env_suffix}" aws_config = AWSBase( tags={ - "OU": xqwatcher_config.get("business_unit"), + "OU": xqwatcher_config.require("business_unit"), "Environment": env_name, "Application": "open-edx-xqwatcher", "Owner": "platform-engineering", } ) -xqwatcher_server_tag = f"open-edx-xqwatcher-server-{env_name}" -openedx_release = ( - OpenLearningOpenEdxDeployment.get_item(stack_info.env_prefix) - .release_by_env(stack_info.name) - .value +k8s_global_labels = K8sGlobalLabels( + service=Services.xqwatcher, + ou=xqwatcher_config.require("business_unit"), + stack=stack_info, ) -xqwatcher_server_ami = ec2.get_ami( - filters=[ - 
ec2.GetAmiFilterArgs(name="name", values=["open-edx-xqwatcher-server-*"]), - ec2.GetAmiFilterArgs(name="virtualization-type", values=["hvm"]), - ec2.GetAmiFilterArgs(name="root-device-type", values=["ebs"]), - ec2.GetAmiFilterArgs(name="tag:deployment", values=[stack_info.env_prefix]), - ec2.GetAmiFilterArgs(name="tag:openedx_release", values=[openedx_release]), - ], - most_recent=True, - owners=[aws_account.account_id], -) +setup_k8s_provider(kubeconfig=cluster_stack.require_output("kube_config")) -############################### -## General Resources ## -############################### - -# IAM and instance profile -xqwatcher_server_instance_role = iam.Role( - f"xqwatcher-server-instance-role-{env_name}", - assume_role_policy=json.dumps( - { - "Version": "2012-10-17", - "Statement": { - "Effect": "Allow", - "Action": "sts:AssumeRole", - "Principal": {"Service": "ec2.amazonaws.com"}, - }, - } - ), - path="/ol-infrastructure/xqwatcher-server/role/", - tags=aws_config.tags, -) -iam.RolePolicyAttachment( - f"xqwatcher-server-describe-instance-role-policy-{env_name}", - policy_arn=policy_stack.require_output("iam_policies")["describe_instances"], - role=xqwatcher_server_instance_role.name, +namespace = xqwatcher_config.get("namespace") or f"{stack_info.env_prefix}-openedx" + +docker_image_tag = os.environ.get("XQWATCHER_DOCKER_TAG") or xqwatcher_config.get( + "docker_tag" ) -xqwatcher_server_instance_profile = iam.InstanceProfile( - f"xqwatcher-server-instance-profile-{env_name}", - role=xqwatcher_server_instance_role.name, - path="/ol-infrastructure/xqwatcher-server/profile/", +if not docker_image_tag: + msg = ( + "Either XQWATCHER_DOCKER_TAG env var or xqwatcher:docker_tag config must be set" + ) + raise ValueError(msg) +# Digests use @ (e.g. sha256:abc…), tags use : (e.g. 
latest, v1.2.3) +_sep = "@" if docker_image_tag.startswith("sha256:") else ":" +docker_image_ref = f"mitodl/xqueue-watcher{_sep}{docker_image_tag}" + +min_replicas = xqwatcher_config.get_int("min_replicas") or 1 +max_replicas = xqwatcher_config.get_int("max_replicas") or 5 +# When true, a second VaultStaticSecret reads edx.org xqueue credentials from +# secret-/edxorg-xqueue and merges them into xqueue_servers.json at pod +# start. Set this only for stacks that actually watch edx.org queues. +edxorg_xqueue_enabled = xqwatcher_config.get_bool("edxorg_xqueue_enabled") or False + +# Deployment-wide ContainerGrader defaults. These become XQWATCHER_GRADER_* +# environment variables on the xqwatcher pod so operators don't have to repeat +# them in every conf.d queue JSON file. Per-queue KWARGS still override these. +grader_namespace = xqwatcher_config.get("grader_namespace") or namespace +grader_cpu_limit = xqwatcher_config.get("grader_cpu_limit") or "500m" +grader_memory_limit = xqwatcher_config.get("grader_memory_limit") or "256Mi" +grader_timeout = xqwatcher_config.get("grader_timeout") or "20" +verify_tls = xqwatcher_config.get("verify_tls") or "true" +submission_size_limit = xqwatcher_config.get("submission_size_limit") or str( + 1024 * 1024 +) # 1 MB default, matching containergrader + +################################## +## Grader Queue Config ## +################################## + +xqueue_server_url = xqwatcher_config.require("xqueue_server_url") + +# Read the non-secret queue configs from Pulumi stack config and inject +# SERVER_REF so credentials are resolved from xqueue_servers.json at runtime. +_queues_raw: dict[str, Any] = xqwatcher_config.require_object("queues") +queues_config: dict[str, Any] = {} +for queue_name, queue_cfg in _queues_raw.items(): + entry = copy.deepcopy(queue_cfg) + # Rewrite bare DockerHub image refs to use the ECR pull-through cache. 
+ for handler_cfg in entry.get("HANDLERS", []): + if handler_cfg.get("HANDLER", "").endswith( + "ContainerGrader" + ) and "image" in handler_cfg.get("KWARGS", {}): + image_ref = handler_cfg["KWARGS"]["image"] + first_component = image_ref.split("/", maxsplit=1)[0] + if "." not in first_component and ":" not in first_component: + handler_cfg["KWARGS"]["image"] = cached_image_uri(image_ref) + entry.setdefault("SERVER_REF", "default") + queues_config[queue_name] = entry + +# Split by SERVER_REF so each Deployment only ships configs for its own server. +# Queues with no SERVER_REF (or SERVER_REF="default") belong to the MIT-hosted +# server; queues with SERVER_REF="edxorg" belong to the edx.org server. +default_queues: dict[str, Any] = { + name: cfg + for name, cfg in queues_config.items() + if cfg.get("SERVER_REF", "default") == "default" +} +edxorg_queues: dict[str, Any] = ( + { + name: cfg + for name, cfg in queues_config.items() + if cfg.get("SERVER_REF") == "edxorg" + } + if edxorg_xqueue_enabled + else {} ) -# Vault policy definition -xqwatcher_server_vault_policy = vault.Policy( - f"xqwatcher-server-vault-policy-{env_name}", - name=f"xqwatcher-server-{stack_info.env_prefix}", - policy=Path(__file__) - .parent.joinpath("xqwatcher_server_policy.hcl") - .read_text() - .replace("DEPLOYMENT", f"{stack_info.env_prefix}"), +################################## +## Vault Policy + K8s Auth ## +################################## + +vault_policy_template = ( + Path(__file__).parent.joinpath("xqwatcher_server_policy.hcl").read_text() ) -# Register xqwatcher AMI for Vault AWS auth -vault.aws.AuthBackendRole( - f"xqwatcher-server-ami-ec2-vault-auth-{env_name}", - backend=f"aws-{stack_info.env_prefix}", - auth_type="iam", - role="xqwatcher-server", - inferred_entity_type="ec2_instance", - inferred_aws_region=aws_config.region, - bound_iam_instance_profile_arns=[xqwatcher_server_instance_profile.arn], - bound_ami_ids=[xqwatcher_server_ami.id], - 
bound_account_ids=[aws_account.account_id], - bound_vpc_ids=[vpc_id], - token_policies=[xqwatcher_server_vault_policy.name], +vault_policy_text = vault_policy_template.replace("DEPLOYMENT", stack_info.env_prefix) + +xqwatcher_app = OLEKSAuthBinding( + OLEKSAuthBindingConfig( + application_name=f"xqwatcher-{stack_info.env_prefix}", + namespace=namespace, + stack_info=stack_info, + aws_config=aws_config, + iam_policy_document=None, # no direct AWS resource access required + vault_policy_text=vault_policy_text, + cluster_name=cluster_stack.require_output("cluster_name"), + cluster_identities=cluster_stack.require_output("cluster_identities"), + vault_auth_endpoint=cluster_stack.require_output("vault_auth_endpoint"), + irsa_service_account_name="xqwatcher", + vault_sync_service_account_names=f"xqwatcher-{stack_info.env_prefix}-vault", + k8s_labels=k8s_global_labels, + create_irsa_service_account=True, + ) ) +vault_k8s_resources = xqwatcher_app.vault_k8s_resources + ################################## -# Network Access Control # +## Vault Secrets ## ################################## -# Create security group -xqwatcher_server_security_group = ec2.SecurityGroup( - f"xqwatcher-server-security-group-{env_name}", - name=f"xqwatcher-server-operations-{env_name}", - description="Access control for xqwatcher servers", - ingress=[], # no listeners on xqwatcher nodes - egress=default_egress_args, - vpc_id=vpc_id, + +# ── Default (MIT-hosted) xqueue server ────────────────────────────────────── +# Credentials live at secret-/edx-xqueue alongside the xqueue and edxapp +# deployments. Only xqwatcher_password is needed here. 
+xqueue_servers_secret_name = ( + "xqwatcher-xqueue-servers" # pragma: allowlist secret # noqa: S105 +) +xqueue_servers_template = json.dumps( + { + "default": { + "SERVER": xqueue_server_url, + "AUTH": ["xqwatcher", "{{ .Secrets.xqwatcher_password }}"], + }, + } +) +xqueue_servers_secret = OLVaultK8SSecret( + f"xqwatcher-{env_name}-xqueue-servers-secret", + OLVaultK8SStaticSecretConfig( + name=xqueue_servers_secret_name, + namespace=namespace, + dest_secret_name=xqueue_servers_secret_name, + dest_secret_labels=k8s_global_labels.model_dump(), + labels=k8s_global_labels.model_dump(), + mount=f"secret-{stack_info.env_prefix}", + mount_type="kv-v1", + path="edx-xqueue", + refresh_after="1h", + restart_target_kind="Deployment", + restart_target_name="xqwatcher", + templates={ + "xqueue_servers.json": xqueue_servers_template, + }, + vaultauth=vault_k8s_resources.auth_name, + ), + opts=ResourceOptions( + delete_before_replace=True, + depends_on=[vault_k8s_resources], + ), ) -################################### -# Web Node EC2 Deployment # -################################### +# ── edx.org (external) xqueue server ──────────────────────────────────────── +# Credentials are entirely separate from the MIT-hosted instance and live at +# secret-/edxorg-xqueue. Only created for stacks that watch edx.org +# queues (edxorg_xqueue_enabled = true in stack config). 
+edxorg_servers_secret_name = ( + "xqwatcher-edxorg-servers" # pragma: allowlist secret # noqa: S105 +) +edxorg_servers_template = json.dumps( + { + "edxorg": { + "SERVER": "https://xqueue.edx.org", + "AUTH": [ + "{{ .Secrets.edxorg_xqueue_username }}", + "{{ .Secrets.edxorg_xqueue_password }}", + ], + }, + } +) -consul_datacenter = consul_stack.require_output("datacenter") -grafana_credentials = read_yaml_secrets( - Path(f"vector/grafana.{stack_info.env_suffix}.yaml") +edxorg_servers_secret = ( + OLVaultK8SSecret( + f"xqwatcher-{env_name}-edxorg-servers-secret", + OLVaultK8SStaticSecretConfig( + name=edxorg_servers_secret_name, + namespace=namespace, + dest_secret_name=edxorg_servers_secret_name, + dest_secret_labels=k8s_global_labels.model_dump(), + labels=k8s_global_labels.model_dump(), + mount=f"secret-{stack_info.env_prefix}", + mount_type="kv-v1", + path="edxorg-xqueue", + refresh_after="1h", + restart_target_kind="Deployment", + restart_target_name="xqwatcher-edxorg", + templates={ + "edxorg_servers.json": edxorg_servers_template, + }, + vaultauth=vault_k8s_resources.auth_name, + ), + opts=ResourceOptions( + delete_before_replace=True, + depends_on=[vault_k8s_resources], + ), + ) + if edxorg_xqueue_enabled + else None ) -vault_secrets = read_yaml_secrets( - Path(f"xqwatcher/secrets.{stack_info.env_prefix}.{stack_info.env_suffix}.yaml") +################################## +## ConfigMap ## +################################## + +# Base xqueue-watcher config (poll settings, logging) and non-secret grader +# queue configs. The Vault-synced secret provides xqueue_servers.json. 
+xqwatcher_configmap = kubernetes.core.v1.ConfigMap( + f"xqwatcher-{env_name}-configmap", + metadata=kubernetes.meta.v1.ObjectMetaArgs( + name="xqwatcher-config", + namespace=namespace, + labels=k8s_global_labels.model_dump(), + ), + data={ + "xqwatcher.json": json.dumps( + { + "FOLLOW_CLIENT_REDIRECTS": True, + "POLL_INTERVAL": 10, + "POLL_TIME": 10, + "REQUESTS_TIMEOUT": 10, + } + ), + # Emit logs to stdout only; no file rotation needed in containers. + "logging.json": json.dumps( + { + "version": 1, + "disable_existing_loggers": False, + "formatters": { + "default": { + "format": "%(asctime)s - %(filename)s:%(lineno)d -- %(funcName)s [%(levelname)s]: %(message)s", # noqa: E501 + } + }, + "handlers": { + "console": { + "class": "logging.StreamHandler", + "formatter": "default", + "level": "INFO", + } + }, + "loggers": { + "": { + "handlers": ["console"], + "level": "INFO", + } + }, + } + ), + # Non-secret queue configs; SERVER_REF resolves credentials at runtime + # from xqueue_servers.json (mounted from the Vault-synced secret). + # Only queues for the MIT-hosted server (SERVER_REF="default"). + "grader_config.json": json.dumps(default_queues), + }, ) -xqwatcher_vault_mount_name = vault_mount_stack.require_output("xqwatcher_kv")["path"] -vault.kv.SecretV2( - f"xqwatcher-{env_name}-grader-static-secrets", - mount=xqwatcher_vault_mount_name, - name=f"{stack_info.env_prefix}-grader-config", - data_json=json.dumps(vault_secrets), + +# edxorg-specific ConfigMap: only the queues that target the external edx.org +# server. Created only when edxorg_xqueue_enabled is True. 
+xqwatcher_edxorg_configmap = ( + kubernetes.core.v1.ConfigMap( + f"xqwatcher-{env_name}-edxorg-configmap", + metadata=kubernetes.meta.v1.ObjectMetaArgs( + name="xqwatcher-edxorg-config", + namespace=namespace, + labels=k8s_global_labels.model_dump(), + ), + data={ + "xqwatcher.json": xqwatcher_configmap.data["xqwatcher.json"], + "logging.json": xqwatcher_configmap.data["logging.json"], + "grader_config.json": json.dumps(edxorg_queues), + }, + ) + if edxorg_xqueue_enabled + else None ) -block_device_mappings = [BlockDeviceMapping(volume_size=50)] -tag_specs = [ - TagSpecification( - resource_type="instance", - tags=aws_config.merged_tags({"Name": xqwatcher_server_tag}), - ), - TagSpecification( - resource_type="volume", - tags=aws_config.merged_tags({"Name": xqwatcher_server_tag}), +################################## +## RBAC for ContainerGrader ## +################################## + +# xqwatcher uses the ContainerGrader backend which creates a Kubernetes Job +# per submission. The service account running xqwatcher pods needs permission +# to create/delete Jobs and read pod logs in the same namespace. 
+ +xqwatcher_grader_role = kubernetes.rbac.v1.Role( + f"xqwatcher-{env_name}-grader-role", + metadata=kubernetes.meta.v1.ObjectMetaArgs( + name="xqwatcher-grader", + namespace=namespace, + labels=k8s_global_labels.model_dump(), ), -] - -lt_config = OLLaunchTemplateConfig( - block_device_mappings=block_device_mappings, - image_id=xqwatcher_server_ami.id, - instance_type=xqwatcher_config.get("instance_type") - or InstanceTypes.burstable_small, - instance_profile_arn=xqwatcher_server_instance_profile.arn, - security_groups=[ - xqwatcher_server_security_group, - consul_security_groups["consul_agent"], + rules=[ + kubernetes.rbac.v1.PolicyRuleArgs( + api_groups=["batch"], + resources=["jobs"], + verbs=["create", "delete", "get", "list", "watch"], + ), + kubernetes.rbac.v1.PolicyRuleArgs( + api_groups=[""], + resources=["pods", "pods/log"], + verbs=["get", "list", "watch"], + ), ], - tags=aws_config.merged_tags({"Name": xqwatcher_server_tag}), - tag_specifications=tag_specs, - user_data=consul_datacenter.apply( - lambda consul_dc: base64.b64encode( - "#cloud-config\n{}".format( - yaml.dump( - { - "write_files": [ - { - "path": "/etc/consul.d/02-autojoin.json", - "content": json.dumps( - { - "retry_join": [ - "provider=aws tag_key=consul_env " - f"tag_value={consul_dc}" - ], - "datacenter": consul_dc, - } - ), - "owner": "consul:consul", - }, - { - "path": "/etc/default/vector", - "content": textwrap.dedent( - f"""\ - ENVIRONMENT={consul_dc} - APPLICATION=xqwatcher-{stack_info.env_prefix} - VECTOR_CONFIG_DIR=/etc/vector/ - VECTOR_STRICT_ENV_VARS=false - AWS_REGION={aws_config.region} - GRAFANA_CLOUD_API_KEY={grafana_credentials["api_key"]} - GRAFANA_CLOUD_PROMETHEUS_API_USER={grafana_credentials["prometheus_user_id"]} - GRAFANA_CLOUD_LOKI_API_USER={grafana_credentials["loki_user_id"]} - """ - ), - "owner": "root:root", - }, - ] - }, - sort_keys=True, - ) - ).encode("utf8") - ).decode("utf8") +) + +xqwatcher_grader_rolebinding = kubernetes.rbac.v1.RoleBinding( + 
f"xqwatcher-{env_name}-grader-rolebinding", + metadata=kubernetes.meta.v1.ObjectMetaArgs( + name="xqwatcher-grader", + namespace=namespace, + labels=k8s_global_labels.model_dump(), + ), + role_ref=kubernetes.rbac.v1.RoleRefArgs( + api_group="rbac.authorization.k8s.io", + kind="Role", + name=xqwatcher_grader_role.metadata.name, ), + subjects=[ + kubernetes.rbac.v1.SubjectArgs( + kind="ServiceAccount", + name="xqwatcher", + namespace=namespace, + ), + ], ) -auto_scale_config = xqwatcher_config.get_object("auto_scale") or { - "desired": 2, - "min": 1, - "max": 3, -} -asg_config = OLAutoScaleGroupConfig( - asg_name=f"xqwatcher-server-{env_name}", - aws_config=aws_config, - desired_size=auto_scale_config["desired"] or 2, - min_size=auto_scale_config["min"] or 1, - max_size=auto_scale_config["max"] or 3, - vpc_zone_identifiers=target_vpc["subnet_ids"], - tags=aws_config.merged_tags({"Name": xqwatcher_server_tag}), +################################## +## Deployment ## +################################## + +app_labels = {**k8s_global_labels.model_dump(), "app": "xqwatcher"} + +xqwatcher_deployment = kubernetes.apps.v1.Deployment( + f"xqwatcher-{env_name}-deployment", + metadata=kubernetes.meta.v1.ObjectMetaArgs( + name="xqwatcher", + namespace=namespace, + labels=k8s_global_labels.model_dump(), + ), + spec=kubernetes.apps.v1.DeploymentSpecArgs( + replicas=min_replicas, + selector=kubernetes.meta.v1.LabelSelectorArgs( + match_labels={"app": "xqwatcher"}, + ), + strategy=kubernetes.apps.v1.DeploymentStrategyArgs( + type="RollingUpdate", + rolling_update=kubernetes.apps.v1.RollingUpdateDeploymentArgs( + max_surge=1, + max_unavailable=0, + ), + ), + template=kubernetes.core.v1.PodTemplateSpecArgs( + metadata=kubernetes.meta.v1.ObjectMetaArgs( + labels=app_labels, + ), + spec=kubernetes.core.v1.PodSpecArgs( + service_account_name="xqwatcher", + automount_service_account_token=True, + # Apply RuntimeDefault seccomp to the xqwatcher pod itself, + # mirroring the profile applied 
to grading Jobs in + # containergrader.py for defence-in-depth. + security_context=kubernetes.core.v1.PodSecurityContextArgs( + seccomp_profile=kubernetes.core.v1.SeccompProfileArgs( + type="RuntimeDefault", + ), + ), + # Spread replicas across nodes for HA + topology_spread_constraints=[ + kubernetes.core.v1.TopologySpreadConstraintArgs( + max_skew=1, + topology_key="kubernetes.io/hostname", + when_unsatisfiable="ScheduleAnyway", + label_selector=kubernetes.meta.v1.LabelSelectorArgs( + match_labels={"app": "xqwatcher"}, + ), + ) + ], + containers=[ + kubernetes.core.v1.ContainerArgs( + name="xqueue-watcher", + image=cached_image_uri(docker_image_ref), + image_pull_policy="Always", + command=["uv", "run", "--no-sync", "xqueue-watcher"], + args=["-d", "/xqwatcher"], + env=[ + # Non-sensitive manager config values that are not + # already covered by the mounted xqwatcher.json ConfigMap. + kubernetes.core.v1.EnvVarArgs( + name="XQWATCHER_LOGIN_POLL_INTERVAL", value="5" + ), + # ContainerGrader deployment-wide defaults. + # These are used when a queue's KWARGS block does not + # specify the value explicitly. + kubernetes.core.v1.EnvVarArgs( + name="XQWATCHER_GRADER_BACKEND", + value="kubernetes", + ), + # Critical: grading Jobs must land in the same + # namespace as xqwatcher so the RBAC Role binding + # above grants the necessary permissions. + kubernetes.core.v1.EnvVarArgs( + name="XQWATCHER_GRADER_NAMESPACE", + value=grader_namespace, + ), + kubernetes.core.v1.EnvVarArgs( + name="XQWATCHER_GRADER_CPU_LIMIT", + value=grader_cpu_limit, + ), + kubernetes.core.v1.EnvVarArgs( + name="XQWATCHER_GRADER_MEMORY_LIMIT", + value=grader_memory_limit, + ), + kubernetes.core.v1.EnvVarArgs( + name="XQWATCHER_GRADER_TIMEOUT", + value=grader_timeout, + ), + # TLS verification for outbound xqueue HTTPS requests. + # Default "true" (safe for production). Set "false" + # only for dev environments with self-signed certs. 
+ kubernetes.core.v1.EnvVarArgs( + name="XQWATCHER_VERIFY_TLS", + value=verify_tls, + ), + # Hard cap on submission size (bytes) before a grading + # container is launched. Prevents etcd object-size + # overflows and resource-exhaustion attacks. + kubernetes.core.v1.EnvVarArgs( + name="XQWATCHER_SUBMISSION_SIZE_LIMIT", + value=submission_size_limit, + ), + ], + # Liveness: verify the Python runtime is functional. + # The process will crash (and K8s will restart) on + # persistent xqueue connectivity failures, so we rely on + # the restart policy for connectivity-level health. + liveness_probe=kubernetes.core.v1.ProbeArgs( + exec_=kubernetes.core.v1.ExecActionArgs( + command=[ + "uv", + "run", + "--no-sync", + "python", + "-c", + "import xqueue_watcher; import sys; sys.exit(0)", + ] + ), + initial_delay_seconds=30, + period_seconds=60, + failure_threshold=3, + timeout_seconds=10, + ), + resources=kubernetes.core.v1.ResourceRequirementsArgs( + requests={"cpu": "250m", "memory": "256Mi"}, + limits={"memory": "512Mi"}, + ), + security_context=kubernetes.core.v1.SecurityContextArgs( + allow_privilege_escalation=False, + run_as_non_root=True, + run_as_user=1000, + capabilities=kubernetes.core.v1.CapabilitiesArgs( + drop=["ALL"], + ), + ), + volume_mounts=[ + # Manager config and logging config at the root of + # the -d directory; conf.d/ holds queue watcher configs. + kubernetes.core.v1.VolumeMountArgs( + name="xqwatcher-config", + mount_path="/xqwatcher/xqwatcher.json", + sub_path="xqwatcher.json", + read_only=True, + ), + kubernetes.core.v1.VolumeMountArgs( + name="xqwatcher-config", + mount_path="/xqwatcher/logging.json", + sub_path="logging.json", + read_only=True, + ), + # Per-queue grader handler config from the ConfigMap + # (non-secret: no SERVER/AUTH, uses SERVER_REF). 
+ kubernetes.core.v1.VolumeMountArgs( + name="xqwatcher-config", + mount_path="/xqwatcher/conf.d/grader_config.json", + sub_path="grader_config.json", + read_only=True, + ), + # MIT-hosted server definitions only (SERVER_REF="default"). + kubernetes.core.v1.VolumeMountArgs( + name="xqueue-servers", + mount_path="/xqwatcher/xqueue_servers.json", + sub_path="xqueue_servers.json", + read_only=True, + ), + ], + ), + ], + volumes=[ + kubernetes.core.v1.VolumeArgs( + name="xqwatcher-config", + config_map=kubernetes.core.v1.ConfigMapVolumeSourceArgs( + name=xqwatcher_configmap.metadata.name, + ), + ), + kubernetes.core.v1.VolumeArgs( + name="xqueue-servers", + secret=kubernetes.core.v1.SecretVolumeSourceArgs( + secret_name=xqueue_servers_secret_name, + ), + ), + ], + ), + ), + ), + opts=ResourceOptions( + depends_on=[xqueue_servers_secret], + # Allow the HPA to manage replica count without Pulumi reverting it. + ignore_changes=["spec.replicas"], + ), ) -as_setup = OLAutoScaling( - asg_config=asg_config, - lt_config=lt_config, +################################## +## Horizontal Pod Autoscaler ## +################################## + +# Scale on CPU (60 % utilization) and memory (80 % utilization). +# Scale-up is aggressive (double pods per minute) while scale-down is +# conservative (25 % reduction per minute, 5-minute stabilization window) to +# avoid thrashing during bursty submission activity. 
+xqwatcher_hpa = kubernetes.autoscaling.v2.HorizontalPodAutoscaler( + f"xqwatcher-{env_name}-hpa", + metadata=kubernetes.meta.v1.ObjectMetaArgs( + name="xqwatcher", + namespace=namespace, + labels=k8s_global_labels.model_dump(), + ), + spec=kubernetes.autoscaling.v2.HorizontalPodAutoscalerSpecArgs( + scale_target_ref=kubernetes.autoscaling.v2.CrossVersionObjectReferenceArgs( + api_version="apps/v1", + kind="Deployment", + name="xqwatcher", + ), + min_replicas=min_replicas, + max_replicas=max_replicas, + metrics=[ + kubernetes.autoscaling.v2.MetricSpecArgs( + type="Resource", + resource=kubernetes.autoscaling.v2.ResourceMetricSourceArgs( + name="cpu", + target=kubernetes.autoscaling.v2.MetricTargetArgs( + type="Utilization", + average_utilization=60, + ), + ), + ), + kubernetes.autoscaling.v2.MetricSpecArgs( + type="Resource", + resource=kubernetes.autoscaling.v2.ResourceMetricSourceArgs( + name="memory", + target=kubernetes.autoscaling.v2.MetricTargetArgs( + type="Utilization", + average_utilization=80, + ), + ), + ), + ], + behavior=kubernetes.autoscaling.v2.HorizontalPodAutoscalerBehaviorArgs( + scale_up=kubernetes.autoscaling.v2.HPAScalingRulesArgs( + stabilization_window_seconds=60, + select_policy="Max", + policies=[ + kubernetes.autoscaling.v2.HPAScalingPolicyArgs( + type="Percent", + value=100, + period_seconds=60, + ), + ], + ), + scale_down=kubernetes.autoscaling.v2.HPAScalingRulesArgs( + stabilization_window_seconds=300, + select_policy="Min", + policies=[ + kubernetes.autoscaling.v2.HPAScalingPolicyArgs( + type="Percent", + value=25, + period_seconds=60, + ), + ], + ), + ), + ), + opts=ResourceOptions(depends_on=[xqwatcher_deployment]), ) -export("xqwatcher_security_group", xqwatcher_server_security_group.id) +################################## +## edx.org Watcher Deployment ## +################################## + +# A fully independent Deployment for queues that target the external edx.org +# xqueue server. 
It shares the service account and RBAC role with the default +# Deployment (both need identical permissions to manage grading Jobs) but has +# its own ConfigMap and VaultStaticSecret so the edxorg credentials are never +# co-located with the MIT-hosted xqueue credentials. +if edxorg_xqueue_enabled and edxorg_servers_secret and xqwatcher_edxorg_configmap: + edxorg_app_labels = {**k8s_global_labels.model_dump(), "app": "xqwatcher-edxorg"} + + xqwatcher_edxorg_deployment = kubernetes.apps.v1.Deployment( + f"xqwatcher-{env_name}-edxorg-deployment", + metadata=kubernetes.meta.v1.ObjectMetaArgs( + name="xqwatcher-edxorg", + namespace=namespace, + labels=k8s_global_labels.model_dump(), + ), + spec=kubernetes.apps.v1.DeploymentSpecArgs( + replicas=min_replicas, + selector=kubernetes.meta.v1.LabelSelectorArgs( + match_labels={"app": "xqwatcher-edxorg"}, + ), + strategy=kubernetes.apps.v1.DeploymentStrategyArgs( + type="RollingUpdate", + rolling_update=kubernetes.apps.v1.RollingUpdateDeploymentArgs( + max_surge=1, + max_unavailable=0, + ), + ), + template=kubernetes.core.v1.PodTemplateSpecArgs( + metadata=kubernetes.meta.v1.ObjectMetaArgs( + labels=edxorg_app_labels, + ), + spec=kubernetes.core.v1.PodSpecArgs( + service_account_name="xqwatcher", + automount_service_account_token=True, + security_context=kubernetes.core.v1.PodSecurityContextArgs( + seccomp_profile=kubernetes.core.v1.SeccompProfileArgs( + type="RuntimeDefault", + ), + ), + topology_spread_constraints=[ + kubernetes.core.v1.TopologySpreadConstraintArgs( + max_skew=1, + topology_key="kubernetes.io/hostname", + when_unsatisfiable="ScheduleAnyway", + label_selector=kubernetes.meta.v1.LabelSelectorArgs( + match_labels={"app": "xqwatcher-edxorg"}, + ), + ) + ], + containers=[ + kubernetes.core.v1.ContainerArgs( + name="xqueue-watcher", + image=cached_image_uri(docker_image_ref), + image_pull_policy="Always", + command=["uv", "run", "--no-sync", "xqueue-watcher"], + args=["-d", "/xqwatcher"], + env=[ + 
kubernetes.core.v1.EnvVarArgs( + name="XQWATCHER_LOGIN_POLL_INTERVAL", value="5" + ), + kubernetes.core.v1.EnvVarArgs( + name="XQWATCHER_GRADER_BACKEND", + value="kubernetes", + ), + kubernetes.core.v1.EnvVarArgs( + name="XQWATCHER_GRADER_NAMESPACE", + value=grader_namespace, + ), + kubernetes.core.v1.EnvVarArgs( + name="XQWATCHER_GRADER_CPU_LIMIT", + value=grader_cpu_limit, + ), + kubernetes.core.v1.EnvVarArgs( + name="XQWATCHER_GRADER_MEMORY_LIMIT", + value=grader_memory_limit, + ), + kubernetes.core.v1.EnvVarArgs( + name="XQWATCHER_GRADER_TIMEOUT", + value=grader_timeout, + ), + kubernetes.core.v1.EnvVarArgs( + name="XQWATCHER_VERIFY_TLS", + value=verify_tls, + ), + kubernetes.core.v1.EnvVarArgs( + name="XQWATCHER_SUBMISSION_SIZE_LIMIT", + value=submission_size_limit, + ), + ], + liveness_probe=kubernetes.core.v1.ProbeArgs( + exec_=kubernetes.core.v1.ExecActionArgs( + command=[ + "uv", + "run", + "--no-sync", + "python", + "-c", + "import xqueue_watcher; import sys;" + " sys.exit(0)", + ] + ), + initial_delay_seconds=30, + period_seconds=60, + failure_threshold=3, + timeout_seconds=10, + ), + resources=kubernetes.core.v1.ResourceRequirementsArgs( + requests={"cpu": "250m", "memory": "256Mi"}, + limits={"memory": "512Mi"}, + ), + security_context=kubernetes.core.v1.SecurityContextArgs( + allow_privilege_escalation=False, + run_as_non_root=True, + run_as_user=1000, + capabilities=kubernetes.core.v1.CapabilitiesArgs( + drop=["ALL"], + ), + ), + volume_mounts=[ + kubernetes.core.v1.VolumeMountArgs( + name="xqwatcher-edxorg-config", + mount_path="/xqwatcher/xqwatcher.json", + sub_path="xqwatcher.json", + read_only=True, + ), + kubernetes.core.v1.VolumeMountArgs( + name="xqwatcher-edxorg-config", + mount_path="/xqwatcher/logging.json", + sub_path="logging.json", + read_only=True, + ), + kubernetes.core.v1.VolumeMountArgs( + name="xqwatcher-edxorg-config", + mount_path="/xqwatcher/conf.d/grader_config.json", + sub_path="grader_config.json", + read_only=True, + ), + # 
edx.org server defs (SERVER_REF="edxorg"); + # mounted under the filename xqueue-watcher expects. + kubernetes.core.v1.VolumeMountArgs( + name="edxorg-servers", + mount_path="/xqwatcher/xqueue_servers.json", + sub_path="edxorg_servers.json", + read_only=True, + ), + ], + ), + ], + volumes=[ + kubernetes.core.v1.VolumeArgs( + name="xqwatcher-edxorg-config", + config_map=kubernetes.core.v1.ConfigMapVolumeSourceArgs( + name=xqwatcher_edxorg_configmap.metadata.name, + ), + ), + kubernetes.core.v1.VolumeArgs( + name="edxorg-servers", + secret=kubernetes.core.v1.SecretVolumeSourceArgs( + secret_name=edxorg_servers_secret_name, + ), + ), + ], + ), + ), + ), + opts=ResourceOptions( + depends_on=[edxorg_servers_secret], + ignore_changes=["spec.replicas"], + ), + ) + + xqwatcher_edxorg_hpa = kubernetes.autoscaling.v2.HorizontalPodAutoscaler( + f"xqwatcher-{env_name}-edxorg-hpa", + metadata=kubernetes.meta.v1.ObjectMetaArgs( + name="xqwatcher-edxorg", + namespace=namespace, + labels=k8s_global_labels.model_dump(), + ), + spec=kubernetes.autoscaling.v2.HorizontalPodAutoscalerSpecArgs( + scale_target_ref=kubernetes.autoscaling.v2.CrossVersionObjectReferenceArgs( + api_version="apps/v1", + kind="Deployment", + name="xqwatcher-edxorg", + ), + min_replicas=min_replicas, + max_replicas=max_replicas, + metrics=[ + kubernetes.autoscaling.v2.MetricSpecArgs( + type="Resource", + resource=kubernetes.autoscaling.v2.ResourceMetricSourceArgs( + name="cpu", + target=kubernetes.autoscaling.v2.MetricTargetArgs( + type="Utilization", + average_utilization=60, + ), + ), + ), + kubernetes.autoscaling.v2.MetricSpecArgs( + type="Resource", + resource=kubernetes.autoscaling.v2.ResourceMetricSourceArgs( + name="memory", + target=kubernetes.autoscaling.v2.MetricTargetArgs( + type="Utilization", + average_utilization=80, + ), + ), + ), + ], + behavior=kubernetes.autoscaling.v2.HorizontalPodAutoscalerBehaviorArgs( + scale_up=kubernetes.autoscaling.v2.HPAScalingRulesArgs( + 
stabilization_window_seconds=60, + select_policy="Max", + policies=[ + kubernetes.autoscaling.v2.HPAScalingPolicyArgs( + type="Percent", + value=100, + period_seconds=60, + ), + ], + ), + scale_down=kubernetes.autoscaling.v2.HPAScalingRulesArgs( + stabilization_window_seconds=300, + select_policy="Min", + policies=[ + kubernetes.autoscaling.v2.HPAScalingPolicyArgs( + type="Percent", + value=25, + period_seconds=60, + ), + ], + ), + ), + ), + opts=ResourceOptions(depends_on=[xqwatcher_edxorg_deployment]), + ) + +################################## +## Exports ## +################################## + +export("k8s_deployment_name", "xqwatcher") +export("k8s_namespace", namespace) +export("k8s_hpa_name", "xqwatcher") +export("xqueue_servers_secret", xqueue_servers_secret_name) diff --git a/src/ol_infrastructure/applications/xqwatcher/xqwatcher_server_policy.hcl b/src/ol_infrastructure/applications/xqwatcher/xqwatcher_server_policy.hcl index f0ba478bf2..fe49a0b9b0 100644 --- a/src/ol_infrastructure/applications/xqwatcher/xqwatcher_server_policy.hcl +++ b/src/ol_infrastructure/applications/xqwatcher/xqwatcher_server_policy.hcl @@ -2,6 +2,10 @@ path "sys/leases/renew" { capabilities = [ "update" ] } -path "secret-xqwatcher/*" { +path "secret-DEPLOYMENT/edx-xqueue" { + capabilities = [ "read" ] +} + +path "secret-DEPLOYMENT/edxorg-xqueue" { capabilities = [ "read" ] } diff --git a/src/ol_infrastructure/components/applications/eks.py b/src/ol_infrastructure/components/applications/eks.py index bc7ef3e11e..061743bd96 100644 --- a/src/ol_infrastructure/components/applications/eks.py +++ b/src/ol_infrastructure/components/applications/eks.py @@ -3,6 +3,7 @@ from pathlib import Path from typing import Any +import pulumi_kubernetes as kubernetes from pulumi import ComponentResource, Config, Output, ResourceOptions from pulumi_aws import get_caller_identity, iam from pulumi_vault import Policy @@ -49,6 +50,11 @@ class OLEKSAuthBindingConfig(BaseModel): k8s_labels: K8sGlobalLabels 
# Optional parliament config for IAM policy linting parliament_config: dict[str, Any] | None = None + # When True, create the K8s ServiceAccount object(s) for irsa_service_account_name + # with the eks.amazonaws.com/role-arn annotation so pods can reference them. + # Set to False (default) when the ServiceAccount is managed externally (e.g. by + # Helm) or already exists in the cluster. + create_irsa_service_account: bool = False @model_validator(mode="after") def validate_vault_policy(self): @@ -73,6 +79,7 @@ class OLEKSAuthBinding(ComponentResource): irsa_role: iam.Role iam_policy: iam.Policy | None vault_k8s_resources: OLVaultK8SResources + irsa_service_accounts: list[kubernetes.core.v1.ServiceAccount] def __init__( self, @@ -141,6 +148,30 @@ def __init__( ) self.irsa_role = self.trust_role.role + if config.create_irsa_service_account: + sa_names = ( + [config.irsa_service_account_name] + if isinstance(config.irsa_service_account_name, str) + else config.irsa_service_account_name + ) + self.irsa_service_accounts = [ + kubernetes.core.v1.ServiceAccount( + f"{config.application_name}-{sa_name}-irsa-service-account", + metadata=kubernetes.meta.v1.ObjectMetaArgs( + name=sa_name, + namespace=config.namespace, + labels=config.k8s_labels.model_dump(), + annotations={ + "eks.amazonaws.com/role-arn": self.irsa_role.arn, + }, + ), + opts=ResourceOptions(parent=self), + ) + for sa_name in sa_names + ] + else: + self.irsa_service_accounts = [] + # Read Vault policy from file or use provided text vault_policy_text = ( config.vault_policy_path.read_text() @@ -180,7 +211,6 @@ def __init__( or f"https://vault-{stack_info.env_suffix}.odl.mit.edu", vault_auth_endpoint=config.vault_auth_endpoint, vault_auth_role_name=k8s_auth_backend_role.role_name, - service_account_name=service_account_names[0], ) self.vault_k8s_resources = OLVaultK8SResources( resource_config=vault_k8s_resources_config, @@ -195,6 +225,7 @@ def __init__( { "iam_policy": self.iam_policy, "irsa_role": 
self.irsa_role, + "irsa_service_accounts": self.irsa_service_accounts, "vault_policy": vault_policy, "vault_k8s_auth_role": k8s_auth_backend_role, "vault_k8s_resources": self.vault_k8s_resources, diff --git a/src/ol_infrastructure/lib/ol_types.py b/src/ol_infrastructure/lib/ol_types.py index d9cf4272e8..9c45df42af 100644 --- a/src/ol_infrastructure/lib/ol_types.py +++ b/src/ol_infrastructure/lib/ol_types.py @@ -93,6 +93,7 @@ class Services(StrEnum): vector_log_proxy = "vector-log-proxy" xpro = "xpro" xqueue = "xqueue" + xqwatcher = "xqwatcher" @unique @@ -130,6 +131,7 @@ class Application(StrEnum): vector_log_proxy = "vector-log-proxy" xpro = "xpro" xqueue = "xqueue" + xqwatcher = "xqwatcher" @unique