-
Notifications
You must be signed in to change notification settings - Fork 2
/
ProcessMeetData.py
109 lines (84 loc) · 3.87 KB
/
ProcessMeetData.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
import pandas as pd
import random
import regex
import math
from collections import namedtuple
from datetime import date, datetime, timedelta
timeslot = namedtuple('Interval', ['start', 'end'])
id = 0
INPUT_PATH = "D:\Downloads\drive-download-20210410T013418Z-001\Mukono, Uganda.xlsx" # path to input excel file
DURATION_FILE_PATH = "D:\Downloads\drive-download-20210410T013418Z-001\meet_data\Mukono.csv" # output of file containing duration info
AUGMENTED_MEET_FILE_PATH = "D:\Downloads\drive-download-20210410T013418Z-001\meet_data\Mukono.csv" # output of original file with id tag added
def is_mentor(email):
if pd.isnull(email):
return True
domain = email[email.index('@') + 1 : ]
if (domain == "villagebookbuilders.org" and email != "[email protected]") or email == "[email protected]": # is a mentee; Brett and mpmukono are special cases (a mentor and mentee email, respectively)
return False
else:
return True # default to assuming a user is a mentor
def get_meet_ids(df):
meetings = df["Meeting Code"].unique()
return meetings
def meeting_by_id(id):
meets = []
meets.append(pd.DataFrame(df.loc[df["Meeting Code"] == id]))
return meets
def utc_to_time(utc):
time = utc[utc.index('T') + 1 : ]
time_obj = datetime.strptime(time, "%H:%M:%S")
return time_obj
def merge_times(times): # merge mentors or mentees if more than one of each is present in a meeting at the same time
times.sort()
for i in range(len(times) - 1):
first_end = times[i][1]
next_start = times[i + 1][0]
if first_end.time() > next_start.time():
new_start = min(times[i].start, times[i+1].start)
new_end = max(times[i].end, times[i+1].end)
new_slot = timeslot(new_start, new_end)
first_time = times[i]
next_time = times[i+1]
times.remove(first_time)
times.remove(next_time)
times.append(new_slot)
break
merge_times(times)
return times
def get_overlap(first, second): # returns whatever overlap may exist between meetings
return max(0, (min(first[1], second[1]) - max(first[0], second[0])).total_seconds())
def calculate_overlap(meeting):
mentor_intervals = []
student_intervals = []
names = meeting["Participant Identifier"].unique()
for row in meeting.iterrows(): # for each meeting, split into mentor/mentee arrays and merge overlapping times within each array
start = row[1]['UTC']
start = utc_to_time(start)
end = start + timedelta(0,row[1]['Duration'])
timepair = timeslot(start, end)
name = row[1]['Participant Identifier']
if is_mentor(name):
mentor_intervals.append(timepair)
else:
student_intervals.append(timepair)
mentor_intervals = merge_times(mentor_intervals)
student_intervals = merge_times(student_intervals)
overlap = 0
for mentor_interval in mentor_intervals:
for student_interval in student_intervals:
overlap += get_overlap(mentor_interval, student_interval)
return overlap
df = pd.read_excel(INPUT_PATH)
unique_meets = get_meet_ids(df)
df["id"] = 0
new_df = [["Meeting Code", "Tutoring Time (s)", "ID"]] # columns for new spreadsheet
for meet_id in unique_meets:
meets = meeting_by_id(meet_id)
for meet in meets:
overlap = calculate_overlap(meet)
id = id
new_df.append([meet_id, overlap, id])
df.loc[df['Meeting Code'].eq(meet_id),'id'] = id # Find all values in original DF where meeting code is the same as the current meet; replace filler ID with current ID value corresponding to row in the new table
id += 1
pd.DataFrame(new_df).to_csv(DURATION_FILE_PATH)
df.to_csv(AUGMENTED_MEET_FILE_PATH)