# transcript-skills-analysis/lambda_function.py (DigiCred-Holdings/transcript-skills-analysis, branch: main)
import json
import boto3
import os

# AWS service clients are created once at module import and shared by the
# functions below.
s3_client = boto3.client('s3')
bedrock_client = boto3.client("bedrock-runtime")
# S3 URI of the skills registry JSON (e.g. s3://<bucket>/<key>). Importing the
# module raises KeyError if the REGISTRY_S3_URI environment variable is unset.
REGISTRY_URI = os.environ['REGISTRY_S3_URI']

def load_skills_dataset():
    """Download and parse the skills registry JSON referenced by ``REGISTRY_URI``.

    ``REGISTRY_URI`` is an S3 URI such as
    ``s3://digicred-credential-analysis/dev/staging_registry.json``: the part
    before the first ``/`` is used as the bucket, the remainder as the key.

    Returns:
        The decoded JSON document stored in the object.

    Raises:
        Exception: If S3 answers with a non-200 HTTP status, or if the object
            body cannot be decoded as UTF-8 / parsed as JSON. A parse failure
            is chained to the underlying error.
    """
    bucket, key = REGISTRY_URI.replace("s3://", "").split("/", 1)
    response = s3_client.get_object(Bucket=bucket, Key=key)
    status_code = response['ResponseMetadata']['HTTPStatusCode']
    if status_code != 200:
        raise Exception(f"Failed to retrieve data from S3: {status_code}")
    try:
        content = response['Body'].read().decode('utf-8')
        return json.loads(content)
    except Exception as e:
        # Put the cause into the message itself and chain it, so both the log
        # line and the traceback show what actually went wrong.
        raise Exception(f"Failed to parse data from s3: {e}") from e

def find_relevant_courses(course_title_code_list, all_courses):
    """Match each (title, code) pair from a transcript to one registry course.

    A registry course is a candidate for a pair when the given code is a
    case-insensitive substring of the course's ``code``. Only pairs with
    exactly one candidate are kept; ambiguous and unmatched pairs are reported
    via ``print`` and dropped.

    Args:
        course_title_code_list: Iterable of ``(title, code)`` pairs.
        all_courses: List of registry course dicts. Entries whose ``code`` is
            missing or empty are ignored.

    Returns:
        The registry course dicts of all unambiguously matched pairs, in the
        order the pairs were given.
    """
    # Normalize every registry code once and keep it next to its course, so
    # the comparison and the collected candidate always refer to the same
    # entry regardless of the casing used in the registry.
    coded_courses = [
        (course["code"].upper(), course)
        for course in all_courses
        if course.get("code")
    ]
    found_student_courses = []
    overloaded_codes = []
    missing_codes = []
    for given_title, given_code in course_title_code_list:
        needle = given_code.upper() if given_code else ""
        if needle:
            candidates = [course for code, course in coded_courses if needle in code]
        else:
            # An empty code is a substring of every registry code; treat it as
            # "not found" instead of matching the whole registry.
            candidates = []

        if len(candidates) == 1:
            found_student_courses.append(candidates[0])
        elif candidates:
            overloaded_codes.append([given_title, given_code])
        else:
            missing_codes.append([given_title, given_code])

    print(f"Warning: {len(overloaded_codes)} courses found multiple matches in the database")
    print(f"Found multiple matches for the following courses in registry {overloaded_codes}")

    print(f"Warning: {len(missing_codes)} courses were not found in the database.")
    print(f"Could not find the following courses in registry: {missing_codes}")
    return found_student_courses

def get_course_data(course_title_code_list):
    """Load the skills registry and return the registry courses matched by the given (title, code) pairs."""
    registry = load_skills_dataset()
    return find_relevant_courses(course_title_code_list, registry)

def package_skills(course_skill_data):
    """Aggregate the curated skills of all courses into one record per skill id.

    Each record holds the skill's name, category and frequency (taken from its
    first occurrence), how many times it occurred, the highest and the summed
    skill level, and the list of ``(course_code, skill_level)`` pairs that
    contributed to it.
    """
    aggregated = {}
    for course in course_skill_data:
        for entry in course["skills_curated"]:
            skill_id = entry["skill_id"]
            record = aggregated.get(skill_id)

            if record is None:
                # First sighting: seed the record from this occurrence.
                aggregated[skill_id] = {
                    "name": entry["skill"],
                    "category": entry["skill_category"],
                    "frequency": entry["frequency"],
                    "count": 1,
                    "max_skill_level": entry["skill_level"],
                    "sum_skill_level": entry["skill_level"],
                    "courses": [(course["code"], entry["skill_level"])],
                }
                continue

            # Repeat sighting: fold this occurrence into the running totals.
            record["count"] += 1
            level = entry["skill_level"]
            record["max_skill_level"] = max(record["max_skill_level"], level)
            record["sum_skill_level"] += level
            record["courses"].append((course["code"], level))
    return aggregated

def get_skills_of_interest(all_skills):
    """Pick the ids of the most frequent, highest-level and rarest skills.

    As a side effect every skill record gains a ``"skill_level_average"``
    entry: its summed level divided by its number of course pairs, formatted
    with two decimals.

    Returns:
        ``[most_counted_id, highest_average_id, lowest_frequency_id]``. Ties go
        to the skill seen first; a slot stays ``None`` when no skill beats the
        starting threshold (count 0, average 0, frequency infinity).
    """
    top_count_id, top_count = None, 0
    top_level_id, top_average = None, 0
    rarest_id, rarest_frequency = None, float("inf")

    for skill_id, info in all_skills.items():
        occurrences = info["count"]
        if occurrences > top_count:
            top_count_id, top_count = skill_id, occurrences

        average = info["sum_skill_level"] / len(info["courses"])
        info["skill_level_average"] = f"{average:.2f}"
        if average > top_average:
            top_level_id, top_average = skill_id, average

        frequency = info["frequency"]
        if frequency < rarest_frequency:
            rarest_id, rarest_frequency = skill_id, frequency

    return [top_count_id, top_level_id, rarest_id]

def get_skill_level_counts(all_skills):
    """Count skills by the highest level at which any course covers them.

    Args:
        all_skills: Mapping of skill id to a record whose ``"courses"`` entry
            is a list of ``(course_code, skill_level)`` pairs.

    Returns:
        A three-element list; index ``i`` holds the number of skills whose
        highest level is ``i + 1``. Skills with no positive level (empty
        course list, or only levels below 1) are not counted.

    Raises:
        IndexError: If a skill's highest level is greater than 3.
    """
    skill_level_counts = [0, 0, 0]
    for skill in all_skills.values():
        highest = max((course[1] for course in skill["courses"]), default=0)
        if highest < 1:
            # Index ``highest - 1`` would be negative here and silently wrap
            # around to the top-level bucket, so skip the skill instead.
            continue
        skill_level_counts[highest - 1] += 1
    return skill_level_counts

def invoke_bedrock(system_prompt, messages):
    """Call the Bedrock Converse API and return the text of the reply's first content block.

    Args:
        system_prompt: Value passed as the ``system`` argument of ``converse``.
        messages: Value passed as the ``messages`` argument of ``converse``.
    """
    inference_config = {
        "maxTokens": 2000,
        "temperature": 0.6
    }
    reply = bedrock_client.converse(
        modelId="amazon.nova-micro-v1:0",
        messages=messages,
        system=system_prompt,
        inferenceConfig=inference_config,
    )
    first_block = reply["output"]["message"]["content"][0]
    return first_block["text"]

def add_future_pathways(skills_of_interest):
    """Attach LLM-generated "future pathways" advice to each skill record.

    For every record, the skill's ``"name"`` is sent to Bedrock as the user
    message and the reply is stored in place under ``"pathways"``. One Bedrock
    call is made per list element.

    Args:
        skills_of_interest: List of skill records (dicts with a ``"name"``
            entry); the records are mutated.
    """
    system_prompt = [{
        "text": '''
            In about 25 words, detail a few ways that graduating high school student could further their
            development of a given skill. Include a variety of majors, professional
            certifications, or careers that value and develop that skill. Write in full sentences in imperative form.
            Use a direct but unimposing tone. Do not include the name of the skill.
        '''
    }]

    for skill in skills_of_interest:
        skill_name = skill["name"]
        user_messages = [{
            "role": "user",
            "content": [{
                "text": skill_name
            }]
        }]
        bedrock_response = invoke_bedrock(system_prompt, user_messages)
        skill["pathways"] = bedrock_response
        # Use a local for the name: reusing the enclosing quote character
        # inside an f-string expression only parses on Python 3.12+.
        print(f"Bedrock pathways response for {skill_name}: {bedrock_response}")

def llm_summary(skills_of_interest):
    """Ask Bedrock for a short closing summary based on the names of the given skills.

    The skill names are joined with ", " and sent as the user message; the
    model's reply is logged and returned unchanged.
    """
    system_prompt = [{
        "text": '''
            Write a summary to go at the end of a transcript skill analysis for a high school student.
            The primary goal of the summary should be to reinforce to the student that their transcript represents
            real skills that are useful and can help them reach their goals. The blurb should be less than 100 words,
            positive in tone, and written in the second person. Don't be too sycophantic or make specific assertions
            about what they are qualified to do. For example, given the skills Writing, Critical Thinking /
            Problem Solving, Woodworking, Research, and Problem Solving a good summary might look like:
            "You can write clearly, question assumptions, and finish a woodworking project without splinters.
            Research papers don't intimidate you, and you've learned to map big assignments into small, doable steps.
            These skills will carry you well into your future and beyond."
        '''
    }]

    joined_names = ", ".join(entry["name"] for entry in skills_of_interest)
    user_turn = {
        "role": "user",
        "content": [{"text": joined_names}],
    }

    summary_text = invoke_bedrock(system_prompt, [user_turn])
    print(f"Bedrock summary response for {joined_names}: {summary_text}")
    return summary_text

def lambda_handler(event, context):
    """AWS Lambda entry point: analyze the skills behind a list of transcript courses.

    Args:
        event: Invocation event. ``event["body"]`` is either a JSON string or
            an already-decoded dict and must contain ``coursesList``, the
            ``(title, code)`` pairs to look up in the registry.
        context: Lambda context object (unused).

    Returns:
        ``{'statusCode': 400, 'body': <message>}`` for a missing, empty,
        unparsable or incomplete body; ``{'statusCode': 404, 'body': <message>}``
        when none of the given courses yields a skill; otherwise
        ``{'status': 200, 'body': {...}}`` holding the number of distinct
        skills, the three skills of interest (highest count, highest average
        level, lowest frequency), the per-level skill counts, the generated
        summary and the codes of the analyzed courses.
    """
    raw_body = event.get("body")
    if isinstance(raw_body, str):
        if not raw_body.strip():
            # json.loads rejects an empty string; report it as an empty body.
            body = None
        else:
            try:
                body = json.loads(raw_body)
            except json.JSONDecodeError:
                return {
                    'statusCode': 400,
                    'body': 'Invalid input: body must be valid JSON.'
                }
    else:
        body = raw_body
    if not body:
        return {
            'statusCode': 400,
            'body': 'Invalid input: body cannot be empty.'
        }

    if not isinstance(body, dict) or "coursesList" not in body:
        return {
            'statusCode': 400,
            'body': 'Invalid input: coursesList is required.'
        }

    print(f"Lambda started with input: {body}")

    course_skills_data = get_course_data(body["coursesList"])

    student_skills = package_skills(course_skills_data)
    if not student_skills:
        # Without any skill there is nothing to rank: the skills-of-interest
        # ids would all be None and the lookups below would raise KeyError.
        print("No skills found for the provided courses.")
        return {
            'statusCode': 404,
            'body': 'No skills found for the provided courses.'
        }

    skills_of_interest = get_skills_of_interest(student_skills)
    full_skills_of_interest = [student_skills[skill_id] for skill_id in skills_of_interest]
    add_future_pathways(full_skills_of_interest)

    print("Highest count skill:", full_skills_of_interest[0])
    print("Highest level skill:", full_skills_of_interest[1])
    print("Most unique skill:", full_skills_of_interest[2])

    skill_level_counts = get_skill_level_counts(student_skills)
    summary = llm_summary(full_skills_of_interest)

    analyzed_course_ids = [course["code"] for course in course_skills_data]
    # NOTE(review): the success response uses the key 'status' while the error
    # responses use 'statusCode'. Kept as-is because callers may rely on it;
    # confirm which key consumers expect.
    response = {
        'status': 200,
        'body': {
            "count": str(len(student_skills)),
            "skills_of_interest": full_skills_of_interest,
            "skill_level_counts": skill_level_counts,
            "summary": summary,
            "course_ids": analyzed_course_ids,
        }
    }
    print(f"Lambda returning output: {response}")
    return response