Skip to content

Commit ff52bd3

Browse files
committed
Instructor onboarding
1 parent f875036 commit ff52bd3

File tree

3 files changed

+245
-1
lines changed

3 files changed

+245
-1
lines changed
Lines changed: 192 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,192 @@
1+
"""Assets for managing instructor onboarding data in the access-forge GitHub repository.
2+
3+
This module pulls email addresses from the combined user course roles
4+
dbt model and pushes them to a private GitHub repository for instructor
5+
access management.
6+
"""
7+
8+
from datetime import UTC, datetime
9+
10+
import polars as pl
11+
from dagster import AssetExecutionContext, AssetIn, AssetKey, Output, asset
12+
from github.GithubException import UnknownObjectException
13+
from ol_orchestrate.resources.github import GithubApiClientFactory
14+
15+
16+
@asset(
    name="instructor_onboarding_user_list",
    group_name="instructor_onboarding",
    ins={
        "int__combined__user_course_roles": AssetIn(
            key=AssetKey(["int__combined__user_course_roles"])
        )
    },
    description="Generates CSV file with user emails for access-forge repository",
)
def generate_instructor_onboarding_user_list(
    context: AssetExecutionContext,
    int__combined__user_course_roles: pl.DataFrame,
) -> Output[str]:
    """Build the access-forge user CSV from the combined user course roles model.

    Only the ``user_email`` column of the input frame is read. Null emails are
    dropped, the remaining values are de-duplicated and sorted, and every row is
    given the same fixed ``role`` and ``sent_invite`` values.

    The resulting CSV has three columns, in this order:
        - email: the value of ``user_email``
        - role: the literal ``ol-data-analyst``
        - sent_invite: the literal ``1``

    Args:
        context: Dagster execution context, used here for logging.
        int__combined__user_course_roles: DataFrame loaded from the dbt model of
            the same name; it must contain a ``user_email`` column.

    Returns:
        Output whose value is the CSV text, with the row count (``num_users``)
        and the first 500 characters of the CSV (``preview``) as metadata.
    """
    # Narrow to the email column first so de-duplication only considers emails.
    emails = int__combined__user_course_roles.select(["user_email"])
    emails = emails.filter(pl.col("user_email").is_not_null())
    emails = emails.unique().sort("user_email")

    # Constant columns attached to every row.
    fixed_columns = [
        pl.lit("ol-data-analyst").alias("role"),
        pl.lit(1).alias("sent_invite"),
    ]

    # Attach the constants, rename to the target header, and pin column order.
    roster = (
        emails.with_columns(fixed_columns)
        .rename({"user_email": "email"})
        .select(["email", "role", "sent_invite"])
    )

    csv_text = roster.write_csv()
    user_count = len(roster)

    context.log.info("Generated CSV content with %s unique users", user_count)

    return Output(
        value=csv_text,
        metadata={
            "num_users": user_count,
            "preview": csv_text[:500],
        },
    )
80+
81+
82+
# NOTE: the ``github_api`` resource is requested through the typed function
# parameter below. It is intentionally not repeated in ``required_resource_keys``:
# Dagster raises a parameter check error when an asset declares resources both in
# the decorator and as function arguments.
@asset(
    name="update_access_forge_repo",
    group_name="instructor_onboarding",
    ins={"instructor_onboarding_user_list": AssetIn()},
    description="Updates the access-forge repository with the generated user list",
)
def update_access_forge_repository(
    context: AssetExecutionContext,
    instructor_onboarding_user_list: str,
    github_api: GithubApiClientFactory,
) -> Output[dict]:
    """Open a pull request against access-forge containing the generated user CSV.

    A timestamped branch is created from ``main`` in ``mitodl/access-forge``, the
    CSV is committed to ``users/ci/users.csv`` on that branch (updating the file
    if it exists, creating it otherwise), and a pull request back to ``main`` is
    opened.

    Args:
        context: Dagster execution context, used here for logging.
        instructor_onboarding_user_list: CSV text to upload, with a header row
            followed by one row per user (columns: email, role, sent_invite).
        github_api: GitHub API client factory resource providing the
            authenticated client.

    Returns:
        Output whose value is a dict with the keys ``repo``, ``file_path``,
        ``action`` (``"updated"`` or ``"created"``), ``branch``, ``pr_number``
        and ``pr_url``; the same details except ``branch`` are attached as
        metadata.

    Raises:
        Exception: Any error raised while talking to GitHub is logged and
            re-raised. If the failure happens after the working branch was
            created but before the pull request exists, a best-effort attempt
            is made to delete that branch first.
    """
    repo_name = "mitodl/access-forge"
    file_path = "users/ci/users.csv"
    base_branch = "main"

    # Create unique branch name with timestamp
    timestamp = datetime.now(tz=UTC).strftime("%Y%m%d-%H%M%S")
    new_branch = f"dagster/update-user-list-{timestamp}"

    commit_message = "dagster-pipeline - update user list from ol-data-platform"

    # Number of data rows in the CSV. The first line is the header, so it is
    # excluded; counting raw newlines would over-report by one.
    user_count = max(len(instructor_onboarding_user_list.splitlines()) - 1, 0)

    branch_ref = None  # set once the working branch exists, for cleanup
    pr = None  # set once the pull request exists

    try:
        gh_client = github_api.get_client()
        repo = gh_client.get_repo(repo_name)

        # Branch off the current tip of the base branch.
        base_sha = repo.get_git_ref(f"heads/{base_branch}").object.sha
        branch_ref = repo.create_git_ref(ref=f"refs/heads/{new_branch}", sha=base_sha)
        context.log.info("Created branch %s", new_branch)

        # Look the file up on the branch being committed to, so the blob SHA
        # passed to update_file matches that branch. Only the lookup is guarded:
        # a "not found" from the update itself must not be mistaken for a
        # missing file.
        try:
            contents = repo.get_contents(file_path, ref=new_branch)
        except UnknownObjectException:
            # File doesn't exist, create it
            repo.create_file(
                path=file_path,
                message=commit_message,
                content=instructor_onboarding_user_list,
                branch=new_branch,
            )
            context.log.info("Created file %s in branch %s", file_path, new_branch)
            action = "created"
        else:
            repo.update_file(
                path=file_path,
                message=commit_message,
                content=instructor_onboarding_user_list,
                sha=contents.sha,
                branch=new_branch,
            )
            context.log.info("Updated file %s in branch %s", file_path, new_branch)
            action = "updated"

        # Create pull request
        pr = repo.create_pull(
            title=f"Update user list - {timestamp}",
            body=(
                "Automated update of user list from ol-data-platform Dagster "
                "pipeline.\n\n"
                f"- Action: {action} file\n"
                f"- File: {file_path}\n"
                f"- Users: {user_count} entries"
            ),
            head=new_branch,
            base=base_branch,
        )

        context.log.info("Created PR #%s: %s", pr.number, pr.html_url)

        return Output(
            value={
                "repo": repo_name,
                "file_path": file_path,
                "action": action,
                "branch": new_branch,
                "pr_number": pr.number,
                "pr_url": pr.html_url,
            },
            metadata={
                "repository": repo_name,
                "file_path": file_path,
                "action": action,
                "pr_number": pr.number,
                "pr_url": pr.html_url,
            },
        )

    except Exception:
        context.log.exception("Failed to create PR in GitHub repository")
        # Do not leave an orphaned branch behind when no PR references it.
        # Cleanup is best-effort: a failure here must not mask the original
        # error, which is re-raised below.
        if branch_ref is not None and pr is None:
            try:
                branch_ref.delete()
            except Exception:  # noqa: BLE001
                context.log.warning(
                    "Could not delete orphaned branch %s", new_branch
                )
        raise

dg_projects/edxorg/edxorg/definitions.py

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@
2727
from ol_orchestrate.lib.constants import DAGSTER_ENV, VAULT_ADDRESS
2828
from ol_orchestrate.lib.utils import authenticate_vault
2929
from ol_orchestrate.resources.gcp_gcs import GCSConnection
30+
from ol_orchestrate.resources.github import GithubApiClientFactory
3031
from ol_orchestrate.resources.openedx import OpenEdxApiClientFactory
3132
from ol_orchestrate.resources.outputs import DailyResultsDir, SimpleResultsDir
3233
from ol_orchestrate.resources.secrets.vault import Vault
@@ -46,6 +47,10 @@
4647
flatten_edxorg_course_structure,
4748
normalize_edxorg_tracking_log,
4849
)
50+
from edxorg.assets.instructor_onboarding import (
51+
generate_instructor_onboarding_user_list,
52+
update_access_forge_repository,
53+
)
4954
from edxorg.assets.openedx_course_archives import (
5055
dummy_edxorg_course_xml,
5156
extract_edxorg_courserun_metadata,
@@ -220,7 +225,10 @@ def sync_edxorg_program_reports():
220225
job=define_asset_job(
221226
name="edxorg_api_daily_job",
222227
selection=AssetSelection.assets(
223-
edxorg_program_metadata, edxorg_mitx_course_metadata
228+
edxorg_program_metadata,
229+
edxorg_mitx_course_metadata,
230+
generate_instructor_onboarding_user_list,
231+
update_access_forge_repository,
224232
),
225233
),
226234
cron_schedule="0 5 * * *",
@@ -252,6 +260,7 @@ def sync_edxorg_program_reports():
252260
"gcp_gcs": gcs_connection,
253261
"vault": vault,
254262
"edxorg_api": OpenEdxApiClientFactory(deployment="edxorg", vault=vault),
263+
"github_api": GithubApiClientFactory(vault=vault),
255264
"s3": S3Resource(profile_name="edxorg"),
256265
"s3_download": S3Resource(profile_name="edxorg"),
257266
"s3_upload": S3Resource(),
@@ -273,6 +282,8 @@ def sync_edxorg_program_reports():
273282
dummy_edxorg_course_xml,
274283
edxorg_program_metadata,
275284
edxorg_mitx_course_metadata,
285+
generate_instructor_onboarding_user_list,
286+
update_access_forge_repository,
276287
],
277288
schedules=[edxorg_api_daily_schedule],
278289
)
Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
"""GitHub API client resource for Dagster pipelines."""
2+
3+
from dagster import ConfigurableResource
4+
from github import Github
5+
from pydantic import Field, PrivateAttr
6+
7+
from ol_orchestrate.resources.secrets.vault import Vault
8+
9+
10+
class GithubApiClientFactory(ConfigurableResource):
    """Dagster resource that hands out an authenticated PyGithub client.

    The API token is read from Vault the first time a client is requested; the
    resulting client is kept on the instance and reused by later calls.
    """

    # Vault resource used to look up the token.
    vault: Vault = Field(description="Vault resource for retrieving GitHub API token")
    # Mount point passed to the Vault KV v1 read.
    vault_mount_point: str = Field(
        default="secret-data", description="Vault mount point for secrets"
    )
    # Secret path passed to the Vault KV v1 read.
    vault_secret_path: str = Field(
        default="pipelines/github/api", description="Path to GitHub API token in Vault"
    )

    # Lazily created client; None until get_client() has been called once.
    _client: Github | None = PrivateAttr(default=None)

    def get_client(self) -> Github:
        """Return the GitHub client, creating it on first use.

        On the first call the secret at ``vault_secret_path`` under
        ``vault_mount_point`` is read through the Vault KV v1 API and its
        ``data.token`` value is used to construct the client.

        Returns:
            Github: The PyGithub client stored on this resource instance.
        """
        cached = self._client
        if cached is not None:
            return cached

        kv_v1 = self.vault.client.secrets.kv.v1
        response = kv_v1.read_secret(
            mount_point=self.vault_mount_point, path=self.vault_secret_path
        )
        self._client = Github(response["data"]["token"])
        return self._client

0 commit comments

Comments
 (0)