-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathexample.py
91 lines (82 loc) · 4.89 KB
/
example.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
import pandas as pd
from qualkit.clean import clean, remove_dont_knows, lemmatize
from qualkit.anchored_topic_model import anchored_topic_model, topic_metrics
# Using an anchored topic model
# =============================
# In this example we start by creating a 'naive' topic model to get an idea of the content.
# We then apply domain knowledge to create a set of "anchor" terms for the topics, and re-run the model
# using the anchors as a guide to the algorithm creating the model.
# Finally we create a topic model for the documents that don't match the resulting model, to
# see if we need to modify the anchor terms.
data = pd.read_csv('data/DEI_TS_2021.csv')
# data['Unique Response Number'] = data['Unique Response Number'].astype(str)
# always clean data first! The output is called "cleaned" in the result rather than overwrite the original data
data = clean(data, 'Q11')
data = remove_dont_knows(data, 'cleaned')
data = lemmatize(data, 'cleaned')
topic_metrics(data,'cleaned', number_of_topics=20)
#
# Create a topic model with no anchor topics as a baseline
#
print("\nNon-anchored\n")
df1 = anchored_topic_model(data, 'cleaned', number_of_topics=20, print_topic_details=True)
#
# Create an anchored topic model with suggested topics anchor terms
#
topic_names = [
'Library resources',
'Course materials',
'Live/in-person',
'Help & support',
'Information & guidance',
'Workload',
'Technology',
'Wifi and equipment',
'Communication',
'Interactivity',
'Personal & small group learning',
'Nothing/OK',
'Organisation',
'Assessment',
'Teaching',
'Online learning',
'Accessibility',
'Don\'t knows',
'Digital skills'
]
anchors = [
["resource", "library", "book", 'access', 'textbook', 'ebooks', 'reading', 'list'],
["pre", "prerecord", "recorded", 'recording', 'record', "prerecorded", "video", "content", 'powerpoint', 'material', 'powerpoints'],
["live", "face", "physical", 'campus', 'person'],
["help", "support", 'advice', 'motivate', "supportive", "mental", "health", 'safe', 'safety', 'assistance', 'awareness', 'outreach', 'check in'],
['guidance', 'guide', 'info', 'information', 'show', 'how', 'instruction', 'clear', 'clearer', 'explain', 'clarity'],
["time", 'pressure', "workload", "overload", "work", "deadline", 'spread', 'spaced', 'space', 'pace', 'pacing', 'paced', 'break', 'slower', 'slow', 'shorter', 'short', 'extension'],
["vle", "platform", 'software', 'interface', 'onlinemeetingtool', 'technology', 'system', 'tech', 'navigation'],
['wifi', 'network', 'broadband', "connection", "internet", "data", 'laptop', 'computer', "equipment", 'specialist', 'specialised', 'software', 'camera', 'mic', 'device'],
['talk', 'reply', 'interact', 'response', 'respond', 'contact', 'listen', 'email', "communication", "communicate", "communicating", "feedback", 'questions'],
['participation', 'interactive', 'interactivity', 'interaction', 'discussion', 'quiz', 'activity', 'variety', 'kahoot', 'kahoots'],
['personal', 'individual', 'seminar', 'tutorial', 'group', 'groupwork', 'small', 'workshop', 'smaller', 'onetoone', 'onetoones', 'collaboration'],
['nothing', 'happy', 'keep', 'good', 'same', 'continue', 'great', 'fine', 'satisfied', 'perfect'],
['more organised', 'organised', 'arrange', 'staff', 'organized', 'organisation', 'structure', 'structured', 'planned', 'detailed', 'manage', 'consistent', 'consistency', 'management', 'plan', 'timetable', 'timetabled', 'earlier', 'later', 'time', 'schedule'],
['assessment', 'assignment', 'exam', 'revision', 'results', 'practice', 'test', 'revise', 'track', 'progress'],
['teaching', 'effectively', 'teacher', 'engaged', 'involved', 'involvement', 'engage', 'engagement', 'engaging', 'interesting', 'boring'],
['online', 'learning', 'dont', 'not', 'less', 'stop', 'back', 'rid'],
['accessible', 'accessibility', 'subtitle', 'caption', 'blind', 'deaf', 'dyslexia', 'dyslexic', 'captioning'],
['dont', 'know', 'not', 'sure', 'idea'],
['tip', 'skill', 'basic', 'digital', 'training', 'train', 'online', 'knowledge']
]
# ['long', 'screen', 'time', 'early', 'late']
print("\nAnchored\n")
df2 = anchored_topic_model(data, 'cleaned', topic_names=topic_names, anchors=anchors, print_topic_details=True, return_csv=True)
# manual fix for a model problem - the single word 'nothing' doesn't get allocated to a topic
terms = df2[(df2['Topic name'] == 'Nothing/OK')]['Topic label'].values[0]
df2.loc[df2['cleaned'] == 'nothing', 'Topic label'] = terms
df2.loc[df2['cleaned'] == 'nothing', 'Topic name'] = 'Nothing/OK'
df2.to_csv('output/corex.csv')
#
# Create a topic model for all the terms that weren't matched to see if there are any patterns
#
print("\nUnmatched\n")
unmatched = df2[(df2['Topic label'] == 'No matching topic')].copy()
df2 = anchored_topic_model(unmatched, 'cleaned', number_of_topics=18, print_topic_details=True)
df2.to_csv('output/corex_unmatched.csv')