#!/usr/bin/env python
# Copyright 2019 Division of Medical Image Computing, German Cancer Research Center (DKFZ).
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
import os
from multiprocessing import Pool
import pickle
import time
import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score, average_precision_score
from sklearn.metrics import roc_curve, precision_recall_curve
from sklearn.metrics import mean_squared_error, mean_absolute_error, accuracy_score
import torch
import utils.model_utils as mutils
import plotting as plg
import warnings
def get_roi_ap_from_df(inputs):
    '''
    :param inputs: tuple (df, det_thresh, per_patient_ap), packed for use with multiprocessing.Pool.map.
        df: data frame of evaluated detections.
        det_thresh: minimum threshold for filtering out low-confidence predictions.
        per_patient_ap: boolean flag. If True, evaluate average precision per patient id and average over the
        per-pid results, instead of computing one ap over the whole data set.
    :return: average_precision (float)
    '''
df, det_thresh, per_patient_ap = inputs
if per_patient_ap:
pids_list = df.pid.unique()
aps = []
for match_iou in df.match_iou.unique():
iou_df = df[df.match_iou == match_iou]
for pid in pids_list:
pid_df = iou_df[iou_df.pid == pid]
all_p = len(pid_df[pid_df.class_label == 1])
pid_df = pid_df[(pid_df.det_type == 'det_fp') | (pid_df.det_type == 'det_tp')].sort_values('pred_score', ascending=False)
pid_df = pid_df[pid_df.pred_score > det_thresh]
if (len(pid_df) ==0 and all_p == 0):
pass
elif (len(pid_df) > 0 and all_p == 0):
aps.append(0)
else:
aps.append(compute_roi_ap(pid_df, all_p))
return np.mean(aps)
else:
aps = []
for match_iou in df.match_iou.unique():
iou_df = df[df.match_iou == match_iou]
            # important: count all_p before applying the threshold, in order not to lose the fns!
all_p = len(iou_df[(iou_df.det_type == 'det_tp') | (iou_df.det_type == 'det_fn')])
            # discard all entries that are neither fp nor tp, or that have confidence (= pred_score) <= detection_threshold
iou_df = iou_df[(iou_df.det_type == 'det_fp') | (iou_df.det_type == 'det_tp')].sort_values('pred_score', ascending=False)
iou_df = iou_df[iou_df.pred_score > det_thresh]
if all_p>0:
aps.append(compute_roi_ap(iou_df, all_p))
return np.mean(aps)
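# Usage sketch (mirroring the call sites further below in this file): the tuple packing allows both a direct
# call and use with multiprocessing.Pool.map, e.g.
#   ap = get_roi_ap_from_df((spec_df, cf.min_det_thresh, cf.per_patient_ap))
#   aps = pool.map(get_roi_ap_from_df, [(spec_df, t, cf.per_patient_ap) for t in conf_threshs], chunksize=1)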
def compute_roi_ap(df, all_p):
"""
adapted from: https://github.com/cocodataset/cocoapi/blob/master/PythonAPI/pycocotools/cocoeval.py
:param df: dataframe containing class labels of predictions sorted in descending manner by their prediction score.
:param all_p: number of all ground truth objects. (for denominator of recall.)
:return:
"""
tp = df.class_label.values
fp = (tp == 0) * 1
#recall thresholds, where precision will be measured
R = np.linspace(0., 1., np.round((1. - 0.) / .01).astype(int) + 1, endpoint=True)
tp_sum = np.cumsum(tp)
fp_sum = np.cumsum(fp)
n_dets = len(tp)
rc = tp_sum / all_p
pr = tp_sum / (fp_sum + tp_sum)
# initialize precision array over recall steps (q=queries).
q = [0. for _ in range(len(R))]
    # element access on a numpy array is slow without cython optimization;
    # converting to a plain python list gives a significant speed-up in the loop below.
pr = pr.tolist()
for i in range(n_dets - 1, 0, -1):
if pr[i] > pr[i - 1]:
pr[i - 1] = pr[i]
#--> pr[i]<=pr[i-1] for all i since we want to consider the maximum
#precision value for a queried interval
    # discretize empirical recall steps with the given bins.
    assert np.all(rc[:-1] <= rc[1:]), "recall not sorted in ascending order"
inds = np.searchsorted(rc, R, side='left')
try:
for rc_ix, pr_ix in enumerate(inds):
q[rc_ix] = pr[pr_ix]
except IndexError: #now q is filled with pr values up to first non-available index
pass
return np.mean(q)
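# Worked example (illustrative numbers, not from the pipeline): for detections sorted by score with
# class labels [1, 0, 1, 0] and all_p = 2 ground-truth objects,
#   tp_sum = [1, 1, 2, 2], fp_sum = [0, 1, 1, 2]
#   rc = [0.5, 0.5, 1.0, 1.0], pr = [1.0, 0.5, 0.667, 0.5] -> after max-envelope: [1.0, 0.667, 0.667, 0.5]
# The recall query points R then read precision 1.0 for R <= 0.5 and 0.667 for R > 0.5,
# giving an AP of roughly 0.83 (mean over the 101 query values).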
def roi_avp(inputs):
    '''average precision where a detection only counts as tp if its predicted regression bin (rg_bins) also
    hits the bin target; tps with a wrong bin are effectively re-classified as fn.
    :param inputs: tuple (df, det_thresh, per_patient_ap).
        df: data frame of evaluated detections.
        det_thresh: minimum threshold for filtering out low-confidence predictions.
        per_patient_ap: boolean flag. If True, evaluate average precision per patient id and average over the
        per-pid results, instead of computing one ap over the whole data set.
    :return: average_precision (float)
    '''
df, det_thresh, per_patient_ap = inputs
if per_patient_ap:
pids_list = df.pid.unique()
aps = []
for match_iou in df.match_iou.unique():
iou_df = df[df.match_iou == match_iou]
for pid in pids_list:
pid_df = iou_df[iou_df.pid == pid]
all_p = len(pid_df[pid_df.class_label == 1])
mask = ((pid_df.rg_bins == pid_df.rg_bin_target) & (pid_df.det_type == 'det_tp')) | (pid_df.det_type == 'det_fp')
pid_df = pid_df[mask].sort_values('pred_score', ascending=False)
pid_df = pid_df[pid_df.pred_score > det_thresh]
if (len(pid_df) ==0 and all_p == 0):
pass
elif (len(pid_df) > 0 and all_p == 0):
aps.append(0)
else:
aps.append(compute_roi_ap(pid_df, all_p))
return np.mean(aps)
else:
aps = []
for match_iou in df.match_iou.unique():
iou_df = df[df.match_iou == match_iou]
            # important: count all positives before applying the threshold, in order not to lose the fns!
all_p = len(iou_df[(iou_df.det_type == 'det_tp') | (iou_df.det_type == 'det_fn')])
            # filtering out tps that don't match the rg_bin target at this point is the same as reclassifying them as fn.
            # also discard all entries that are neither fp nor (bin-matching) tp, or that have confidence (= pred_score) <= detection_threshold
mask = ((iou_df.rg_bins == iou_df.rg_bin_target) & (iou_df.det_type == 'det_tp')) | (iou_df.det_type == 'det_fp')
iou_df = iou_df[mask].sort_values('pred_score', ascending=False)
iou_df = iou_df[iou_df.pred_score > det_thresh]
if all_p>0:
aps.append(compute_roi_ap(iou_df, all_p))
return np.mean(aps)
def compute_prc(df):
"""compute precision-recall curve with maximum precision per recall interval.
:param df:
:param all_p: # of all positive samples in data.
:return: array: [precisions, recall query values]
"""
assert (df.class_label==1).any(), "cannot compute prc when no positives in data."
all_p = len(df[(df.det_type == 'det_tp') | (df.det_type == 'det_fn')])
df = df[(df.det_type=="det_tp") | (df.det_type=="det_fp")]
df = df.sort_values("pred_score", ascending=False)
# recall thresholds, where precision will be measured
scores = df.pred_score.values
labels = df.class_label.values
n_dets = len(scores)
pr = np.zeros((n_dets,))
rc = pr.copy()
for rank in range(n_dets):
tp = np.count_nonzero(labels[:rank+1]==1)
fp = np.count_nonzero(labels[:rank+1]==0)
pr[rank] = tp/(tp+fp)
rc[rank] = tp/all_p
    # following object-detection convention (coco-dataset template): take the maximum pr within intervals,
    # --> pr[i] <= pr[i-1] for all i, since we want the maximum
    # precision value for a queried interval.
for i in range(n_dets - 1, 0, -1):
if pr[i] > pr[i - 1]:
pr[i - 1] = pr[i]
R = np.linspace(0., 1., np.round((1. - 0.) / .01).astype(int) + 1, endpoint=True)#precision queried at R points
inds = np.searchsorted(rc, R, side='left')
queries = np.zeros((len(R),))
try:
for q_ix, rank in enumerate(inds):
queries[q_ix] = pr[rank]
except IndexError:
pass
return np.array((queries, R))
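# Usage sketch (assumption about downstream use, not taken from this file): the returned array has shape
# (2, 101) and can be unpacked for plotting, e.g.
#   precisions, recall_queries = compute_prc(spec_df)
#   # precisions[i] is the interpolated precision at recall level recall_queries[i]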
def RMSE(y_true, y_pred, weights=None):
if len(y_true)>0:
return np.sqrt(mean_squared_error(y_true, y_pred, sample_weight=weights))
else:
return np.nan
def MAE_w_std(y_true, y_pred, weights=None):
if len(y_true)>0:
y_true, y_pred = np.array(y_true), np.array(y_pred)
deltas = np.abs(y_true-y_pred)
mae = np.average(deltas, weights=weights, axis=0).item()
skmae = mean_absolute_error(y_true, y_pred, sample_weight=weights)
assert np.allclose(mae, skmae, atol=1e-6), "mae {}, sklearn mae {}".format(mae, skmae)
        std = np.std(deltas if weights is None else np.asarray(weights) * deltas)
return mae, std
else:
return np.nan, np.nan
def MAE(y_true, y_pred, weights=None):
if len(y_true)>0:
return mean_absolute_error(y_true, y_pred, sample_weight=weights)
else:
return np.nan
def accuracy(y_true, y_pred, weights=None):
if len(y_true)>0:
return accuracy_score(y_true, y_pred, sample_weight=weights)
else:
return np.nan
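# These thin wrappers return np.nan on empty inputs so that missing scores remain formattable as floats
# downstream (see the corresponding note in Evaluator.return_metrics). Illustrative examples:
#   MAE([], [])                    # -> nan
#   MAE([1., 2.], [1.5, 2.5])      # -> 0.5
#   accuracy([0, 1, 1], [0, 1, 0]) # -> 0.666...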
# noinspection PyCallingNonCallable
class Evaluator():
""" Evaluates given results dicts. Can return results as updated monitor_metrics. Can save test data frames to
file.
"""
def __init__(self, cf, logger, mode='test'):
"""
:param mode: either 'train', 'val_sampling', 'val_patient' or 'test'. handles prediction lists of different forms.
"""
self.cf = cf
self.logger = logger
self.mode = mode
self.regress_flag = any(['regression' in task for task in self.cf.prediction_tasks])
self.plot_dir = self.cf.test_dir if self.mode == "test" else self.cf.plot_dir
if self.cf.plot_prediction_histograms:
self.hist_dir = os.path.join(self.plot_dir, 'histograms')
os.makedirs(self.hist_dir, exist_ok=True)
if self.cf.plot_stat_curves:
self.curves_dir = os.path.join(self.plot_dir, 'stat_curves')
os.makedirs(self.curves_dir, exist_ok=True)
def eval_losses(self, batch_res_dicts):
if hasattr(self.cf, "losses_to_monitor"):
loss_names = self.cf.losses_to_monitor
else:
loss_names = {name for b_res_dict in batch_res_dicts for name in b_res_dict if 'loss' in name}
self.epoch_losses = {l_name: torch.tensor([b_res_dict[l_name] for b_res_dict in batch_res_dicts if l_name
in b_res_dict.keys()]).mean().item() for l_name in loss_names}
def eval_segmentations(self, batch_res_dicts, pid_list):
batch_dices = [b_res_dict['batch_dices'] for b_res_dict in batch_res_dicts if
'batch_dices' in b_res_dict.keys()] # shape (n_batches, n_seg_classes)
if len(batch_dices) > 0:
batch_dices = np.array(batch_dices) # dims n_batches x 1 in sampling / n_test_epochs x n_classes
assert batch_dices.shape[1] == self.cf.num_seg_classes, "bdices shp {}, n seg cl {}, pid lst len {}".format(
batch_dices.shape, self.cf.num_seg_classes, len(pid_list))
self.seg_df = pd.DataFrame()
for seg_id in range(batch_dices.shape[1]):
self.seg_df[self.cf.seg_id2label[seg_id].name + "_dice"] = batch_dices[:,
seg_id] # one row== one batch, one column== one class
# self.seg_df[self.cf.seg_id2label[seg_id].name+"_dice"] = np.concatenate(batch_dices[:,:,seg_id])
self.seg_df['fold'] = self.cf.fold
if self.mode == "val_patient" or self.mode == "test":
                # need to make this more consistent between sampling and patient mode
self.seg_df["pid"] = [pid for pix, pid in enumerate(pid_list)] # for b_inst in batch_inst_boxes[pix]]
else:
self.seg_df["pid"] = np.nan
def eval_boxes(self, batch_res_dicts, pid_list, obj_cl_dict,
obj_cl_identifiers={"gt":'class_targets', "pred":'box_pred_class_id'}):
"""
:param batch_res_dicts:
:param pid_list: [pid_0, pid_1, ...]
:return:
"""
if self.mode == 'train' or self.mode == 'val_sampling':
# one pid per batch element
# batch_size > 1, with varying patients across batch:
# [[[results_0, ...], [pid_0, ...]], [[results_n, ...], [pid_n, ...]], ...]
# -> [results_0, results_1, ..]
            batch_inst_boxes = [b_res_dict['boxes'] for b_res_dict in batch_res_dicts]  # len: nr of batches in epoch
            # each whole batch is expected to contain exactly batch_size instances.
            assert all(len(whole_batch_boxes) == self.cf.batch_size for whole_batch_boxes in batch_inst_boxes)
            batch_inst_boxes = [[b_inst_boxes] for whole_batch_boxes in batch_inst_boxes
                                for b_inst_boxes in whole_batch_boxes]  # len: batch instances of whole epoch
elif self.mode == "val_patient" or self.mode == "test":
# patient processing, one element per batch = one patient.
# [[results_0, pid_0], [results_1, pid_1], ...] -> [results_0, results_1, ..]
# in patientbatchiterator there is only one pid per batch
batch_inst_boxes = [b_res_dict['boxes'] for b_res_dict in batch_res_dicts]
# in patient mode not actually per batch instance, but per whole batch!
if hasattr(self.cf, "eval_test_separately") and self.cf.eval_test_separately:
""" you could write your own routines to add GTs to raw predictions for evaluation.
implemented standard is: cf.eval_test_separately = False or not set --> GTs are saved at same time
and in same file as raw prediction results.
"""
raise NotImplementedError
assert len(batch_inst_boxes) == len(pid_list)
df_list_preds = []
df_list_labels = []
df_list_class_preds = []
df_list_pids = []
df_list_type = []
df_list_match_iou = []
df_list_n_missing = []
df_list_regressions = []
df_list_rg_targets = []
df_list_rg_bins = []
df_list_rg_bin_targets = []
df_list_rg_uncs = []
for match_iou in self.cf.ap_match_ious:
self.logger.info('evaluating with ap_match_iou: {}'.format(match_iou))
for cl in list(obj_cl_dict.keys()):
for pix, pid in enumerate(pid_list):
len_df_list_before_patient = len(df_list_pids)
# input of each batch element is a list of boxes, where each box is a dictionary.
for b_inst_ix, b_boxes_list in enumerate(batch_inst_boxes[pix]):
b_tar_boxes = []
b_cand_boxes, b_cand_scores, b_cand_n_missing = [], [], []
if self.regress_flag:
b_tar_regs, b_tar_rg_bins = [], []
b_cand_regs, b_cand_rg_bins, b_cand_rg_uncs = [], [], []
for box in b_boxes_list:
# each box is either gt or detection or proposal/anchor
# we need all gts in the same order & all dets in same order
if box['box_type'] == 'gt' and box[obj_cl_identifiers["gt"]] == cl:
b_tar_boxes.append(box["box_coords"])
if self.regress_flag:
b_tar_regs.append(np.array(box['regression_targets'], dtype='float32'))
b_tar_rg_bins.append(box['rg_bin_targets'])
if box['box_type'] == 'det' and box[obj_cl_identifiers["pred"]] == cl:
b_cand_boxes.append(box["box_coords"])
b_cand_scores.append(box["box_score"])
b_cand_n_missing.append(box["cluster_n_missing"] if 'cluster_n_missing' in box.keys() else np.nan)
if self.regress_flag:
b_cand_regs.append(box["regression"])
b_cand_rg_bins.append(box["rg_bin"])
b_cand_rg_uncs.append(box["rg_uncertainty"] if 'rg_uncertainty' in box.keys() else np.nan)
b_tar_boxes = np.array(b_tar_boxes)
b_cand_boxes, b_cand_scores, b_cand_n_missing = np.array(b_cand_boxes), np.array(b_cand_scores), np.array(b_cand_n_missing)
if self.regress_flag:
b_tar_regs, b_tar_rg_bins = np.array(b_tar_regs), np.array(b_tar_rg_bins)
b_cand_regs, b_cand_rg_bins, b_cand_rg_uncs = np.array(b_cand_regs), np.array(b_cand_rg_bins), np.array(b_cand_rg_uncs)
# check if predictions and ground truth boxes exist and match them according to match_iou.
if not 0 in b_cand_boxes.shape and not 0 in b_tar_boxes.shape:
assert np.all(np.round(b_cand_scores,6) <= 1.), "there is a box score>1: {}".format(b_cand_scores[~(b_cand_scores<=1.)])
#coords_check = np.array([len(coords)==self.cf.dim*2 for coords in b_cand_boxes])
#assert np.all(coords_check), "cand box with wrong bcoords dim: {}, mode: {}".format(b_cand_boxes[~coords_check], self.mode)
expected_dim = len(b_cand_boxes[0])
assert np.all([len(coords) == expected_dim for coords in b_tar_boxes]), \
"gt/cand box coords mismatch, expected dim: {}.".format(expected_dim)
# overlaps: shape len(cand_boxes) x len(tar_boxes)
overlaps = mutils.compute_overlaps(b_cand_boxes, b_tar_boxes)
# match_cand_ixs: shape (nr_of_matches,)
                            # these indices are indices into b_cand_boxes
match_cand_ixs = np.argwhere(np.max(overlaps, axis=1) > match_iou)[:, 0]
non_match_cand_ixs = np.argwhere(np.max(overlaps, 1) <= match_iou)[:, 0]
# the corresponding gt assigned to the pred boxes by highest iou overlap,
# i.e., match_gt_ixs holds index into b_tar_boxes for each entry in match_cand_ixs,
# i.e., gt_ixs and cand_ixs are paired via their position in their list
# (cand_ixs[j] corresponds to gt_ixs[j])
match_gt_ixs = np.argmax(overlaps[match_cand_ixs, :], axis=1) if \
not 0 in match_cand_ixs.shape else np.array([])
assert len(match_gt_ixs)==len(match_cand_ixs)
#match_gt_ixs: shape (nr_of_matches,) or 0
non_match_gt_ixs = np.array(
[ii for ii in np.arange(b_tar_boxes.shape[0]) if ii not in match_gt_ixs])
unique, counts = np.unique(match_gt_ixs, return_counts=True)
# check for double assignments, i.e. two predictions having been assigned to the same gt.
# according to the COCO-metrics, only one prediction counts as true positive, the rest counts as
# false positive. This case is supposed to be avoided by the model itself by,
# e.g. using a low enough NMS threshold.
if np.any(counts > 1):
double_match_gt_ixs = unique[np.argwhere(counts > 1)[:, 0]]
keep_max = []
double_match_list = []
for dg in double_match_gt_ixs:
double_match_cand_ixs = match_cand_ixs[np.argwhere(match_gt_ixs == dg)]
keep_max.append(double_match_cand_ixs[np.argmax(b_cand_scores[double_match_cand_ixs])])
double_match_list += [ii for ii in double_match_cand_ixs]
fp_ixs = np.array([ii for ii in match_cand_ixs if
(ii in double_match_list and ii not in keep_max)])
                                # count as fp: boxes that match a gt above the match_iou threshold but do not have the highest class confidence score
match_gt_ixs = np.array([gt_ix for ii, gt_ix in enumerate(match_gt_ixs) if match_cand_ixs[ii] not in fp_ixs])
match_cand_ixs = np.array([cand_ix for cand_ix in match_cand_ixs if cand_ix not in fp_ixs])
assert len(match_gt_ixs) == len(match_cand_ixs)
df_list_preds += [ii for ii in b_cand_scores[fp_ixs]]
df_list_labels += [0] * fp_ixs.shape[0] # means label==gt==0==bg for all these fp_ixs
df_list_class_preds += [cl] * fp_ixs.shape[0]
df_list_n_missing += [n for n in b_cand_n_missing[fp_ixs]]
if self.regress_flag:
df_list_regressions += [r for r in b_cand_regs[fp_ixs]]
df_list_rg_bins += [r for r in b_cand_rg_bins[fp_ixs]]
df_list_rg_uncs += [r for r in b_cand_rg_uncs[fp_ixs]]
df_list_rg_targets += [[0.]*self.cf.regression_n_features] * fp_ixs.shape[0]
df_list_rg_bin_targets += [0.] * fp_ixs.shape[0]
df_list_pids += [pid] * fp_ixs.shape[0]
df_list_type += ['det_fp'] * fp_ixs.shape[0]
# matched/tp:
if not 0 in match_cand_ixs.shape:
df_list_preds += list(b_cand_scores[match_cand_ixs])
df_list_labels += [1] * match_cand_ixs.shape[0]
df_list_class_preds += [cl] * match_cand_ixs.shape[0]
df_list_n_missing += list(b_cand_n_missing[match_cand_ixs])
if self.regress_flag:
df_list_regressions += list(b_cand_regs[match_cand_ixs])
df_list_rg_bins += list(b_cand_rg_bins[match_cand_ixs])
df_list_rg_uncs += list(b_cand_rg_uncs[match_cand_ixs])
assert len(match_cand_ixs)==len(match_gt_ixs)
df_list_rg_targets += list(b_tar_regs[match_gt_ixs])
df_list_rg_bin_targets += list(b_tar_rg_bins[match_gt_ixs])
df_list_pids += [pid] * match_cand_ixs.shape[0]
df_list_type += ['det_tp'] * match_cand_ixs.shape[0]
# rest fp:
if not 0 in non_match_cand_ixs.shape:
df_list_preds += list(b_cand_scores[non_match_cand_ixs])
df_list_labels += [0] * non_match_cand_ixs.shape[0]
df_list_class_preds += [cl] * non_match_cand_ixs.shape[0]
df_list_n_missing += list(b_cand_n_missing[non_match_cand_ixs])
if self.regress_flag:
df_list_regressions += list(b_cand_regs[non_match_cand_ixs])
df_list_rg_bins += list(b_cand_rg_bins[non_match_cand_ixs])
df_list_rg_uncs += list(b_cand_rg_uncs[non_match_cand_ixs])
df_list_rg_targets += [[0.]*self.cf.regression_n_features] * non_match_cand_ixs.shape[0]
df_list_rg_bin_targets += [0.] * non_match_cand_ixs.shape[0]
df_list_pids += [pid] * non_match_cand_ixs.shape[0]
df_list_type += ['det_fp'] * non_match_cand_ixs.shape[0]
# fn:
if not 0 in non_match_gt_ixs.shape:
df_list_preds += [0] * non_match_gt_ixs.shape[0]
df_list_labels += [1] * non_match_gt_ixs.shape[0]
df_list_class_preds += [cl] * non_match_gt_ixs.shape[0]
df_list_n_missing += [np.nan] * non_match_gt_ixs.shape[0]
if self.regress_flag:
df_list_regressions += [[0.]*self.cf.regression_n_features] * non_match_gt_ixs.shape[0]
df_list_rg_bins += [0.] * non_match_gt_ixs.shape[0]
df_list_rg_uncs += [np.nan] * non_match_gt_ixs.shape[0]
df_list_rg_targets += list(b_tar_regs[non_match_gt_ixs])
df_list_rg_bin_targets += list(b_tar_rg_bins[non_match_gt_ixs])
df_list_pids += [pid] * non_match_gt_ixs.shape[0]
df_list_type += ['det_fn'] * non_match_gt_ixs.shape[0]
# only fp:
if not 0 in b_cand_boxes.shape and 0 in b_tar_boxes.shape:
                            # means there is no gt in this sample; any preds have to be fp.
df_list_preds += list(b_cand_scores)
df_list_labels += [0] * b_cand_boxes.shape[0]
df_list_class_preds += [cl] * b_cand_boxes.shape[0]
df_list_n_missing += list(b_cand_n_missing)
if self.regress_flag:
df_list_regressions += list(b_cand_regs)
df_list_rg_bins += list(b_cand_rg_bins)
df_list_rg_uncs += list(b_cand_rg_uncs)
df_list_rg_targets += [[0.]*self.cf.regression_n_features] * b_cand_boxes.shape[0]
df_list_rg_bin_targets += [0.] * b_cand_boxes.shape[0]
df_list_pids += [pid] * b_cand_boxes.shape[0]
df_list_type += ['det_fp'] * b_cand_boxes.shape[0]
# only fn:
if 0 in b_cand_boxes.shape and not 0 in b_tar_boxes.shape:
df_list_preds += [0] * b_tar_boxes.shape[0]
df_list_labels += [1] * b_tar_boxes.shape[0]
df_list_class_preds += [cl] * b_tar_boxes.shape[0]
df_list_n_missing += [np.nan] * b_tar_boxes.shape[0]
if self.regress_flag:
df_list_regressions += [[0.]*self.cf.regression_n_features] * b_tar_boxes.shape[0]
df_list_rg_bins += [0.] * b_tar_boxes.shape[0]
df_list_rg_uncs += [np.nan] * b_tar_boxes.shape[0]
df_list_rg_targets += list(b_tar_regs)
df_list_rg_bin_targets += list(b_tar_rg_bins)
df_list_pids += [pid] * b_tar_boxes.shape[0]
df_list_type += ['det_fn'] * b_tar_boxes.shape[0]
# empty patient with 0 detections needs empty patient score, in order to not disappear from stats.
# filtered out for roi-level evaluation later. During training (and val_sampling),
# tn are assigned per sample independently of associated patients.
# i.e., patient_tn is also meant as sample_tn if a list of samples is evaluated instead of whole patient
if len(df_list_pids) == len_df_list_before_patient:
df_list_preds += [0]
df_list_labels += [0]
df_list_class_preds += [cl]
df_list_n_missing += [np.nan]
if self.regress_flag:
df_list_regressions += [[0.]*self.cf.regression_n_features]
df_list_rg_bins += [0.]
df_list_rg_uncs += [np.nan]
df_list_rg_targets += [[0.]*self.cf.regression_n_features]
df_list_rg_bin_targets += [0.]
df_list_pids += [pid]
df_list_type += ['patient_tn'] # true negative: no ground truth boxes, no detections.
df_list_match_iou += [match_iou] * (len(df_list_preds) - len(df_list_match_iou))
self.test_df = pd.DataFrame()
self.test_df['pred_score'] = df_list_preds
self.test_df['class_label'] = df_list_labels
        # class labels are ground truth, remapped from all classes to 0/1; they only indicate neg/pos (bg/fg).
self.test_df['pred_class'] = df_list_class_preds # can be diff than 0,1
self.test_df['pid'] = df_list_pids
self.test_df['det_type'] = df_list_type
self.test_df['fold'] = self.cf.fold
self.test_df['match_iou'] = df_list_match_iou
self.test_df['cluster_n_missing'] = df_list_n_missing
if self.regress_flag:
self.test_df['regressions'] = df_list_regressions
self.test_df['rg_targets'] = df_list_rg_targets
self.test_df['rg_uncertainties'] = df_list_rg_uncs
self.test_df['rg_bins'] = df_list_rg_bins
            # caution: pandas does not properly add the column as an attribute if it is named "rg_bin_targets",
            # hence the singular column name "rg_bin_target".
self.test_df['rg_bin_target'] = df_list_rg_bin_targets
assert hasattr(self.test_df, "rg_bin_target")
        #fn_df = self.test_df[self.test_df["det_type"] == "det_fn"]
def evaluate_predictions(self, results_list, monitor_metrics=None):
"""
Performs the matching of predicted boxes and ground truth boxes. Loops over list of matching IoUs and foreground classes.
Resulting info of each prediction is stored as one line in an internal dataframe, with the keys:
det_type: 'tp' (true positive), 'fp' (false positive), 'fn' (false negative), 'tn' (true negative)
pred_class: foreground class which the object predicts.
pid: corresponding patient-id.
pred_score: confidence score [0, 1]
fold: corresponding fold of CV.
match_iou: utilized IoU for matching.
:param results_list: list of model predictions. Either from train/val_sampling (patch processing) for monitoring with form:
[[[results_0, ...], [pid_0, ...]], [[results_n, ...], [pid_n, ...]], ...]
Or from val_patient/testing (patient processing), with form: [[results_0, pid_0], [results_1, pid_1], ...])
:param monitor_metrics (optional): dict of dicts with all metrics of previous epochs.
:return monitor_metrics: if provided (during training), return monitor_metrics now including results of current epoch.
"""
# gets results_list = [[batch_instances_box_lists], [batch_instances_pids]]*n_batches
# we want to evaluate one batch_instance (= 2D or 3D image) at a time.
self.logger.info('evaluating in mode {}'.format(self.mode))
batch_res_dicts = [batch[0] for batch in results_list] # len: nr of batches in epoch
if self.mode == 'train' or self.mode=='val_sampling':
# one pid per batch element
# [[[results_0, ...], [pid_0, ...]], [[results_n, ...], [pid_n, ...]], ...]
# -> [pid_0, pid_1, ...]
            # additional list flattening to conform with the per-patient batches below, where one pid is linked to more than one batch instance
pid_list = [batch_instance_pid for batch in results_list for batch_instance_pid in batch[1]]
elif self.mode == "val_patient" or self.mode=="test":
# [[results_0, pid_0], [results_1, pid_1], ...] -> [pid_0, pid_1, ...]
# in patientbatchiterator there is only one pid per batch
pid_list = [np.unique(batch[1]) for batch in results_list]
assert np.all([len(pid)==1 for pid in pid_list]), "pid list in patient-eval mode, should only contain a single scalar per patient: {}".format(pid_list)
pid_list = [pid[0] for pid in pid_list]
else:
raise Exception("undefined run mode encountered")
self.eval_losses(batch_res_dicts)
self.eval_segmentations(batch_res_dicts, pid_list)
self.eval_boxes(batch_res_dicts, pid_list, self.cf.class_dict)
if monitor_metrics is not None:
# return all_stats, updated monitor_metrics
return self.return_metrics(self.test_df, self.cf.class_dict, monitor_metrics)
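    # Usage sketch (assumption about the surrounding exec script, not taken from this file): during training,
    # the caller would typically do
    #   stats, monitor_metrics = evaluator.evaluate_predictions(results_list, monitor_metrics)
    # while at test time evaluate_predictions(results_list) is followed by evaluator.score_test_df().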
def return_metrics(self, df, obj_cl_dict, monitor_metrics=None, boxes_only=False):
"""
Calculates metric scores for internal data frame. Called directly from evaluate_predictions during training for
monitoring, or from score_test_df during inference (for single folds or aggregated test set).
Loops over foreground classes and score_levels ('roi' and/or 'patient'), gets scores and stores them.
Optionally creates plots of prediction histograms and ROC/PR curves.
:param df: Data frame that holds evaluated predictions.
:param obj_cl_dict: Dict linking object-class ids to object-class names. E.g., {1: "bikes", 2 : "cars"}. Set in
configs as cf.class_dict.
:param monitor_metrics: dict of dicts with all metrics of previous epochs. This function adds metrics for
current epoch and returns the same object.
:param boxes_only: whether to produce metrics only for the boxes, not the segmentations.
:return: all_stats: list. Contains dicts with resulting scores for each combination of foreground class and
score_level.
:return: monitor_metrics
"""
# -------------- monitoring independent of class, score level ------------
if monitor_metrics is not None:
for l_name in self.epoch_losses:
monitor_metrics[l_name] = [self.epoch_losses[l_name]]
# -------------- metrics calc dependent on class, score level ------------
all_stats = [] # all_stats: one entry per score_level per class
for cl in list(obj_cl_dict.keys()): # bg eval is neglected
cl_name = obj_cl_dict[cl]
cl_df = df[df.pred_class == cl]
if hasattr(self, "seg_df") and not boxes_only:
dice_col = self.cf.seg_id2label[cl].name+"_dice"
seg_cl_df = self.seg_df.loc[:,['pid', dice_col, 'fold']]
for score_level in self.cf.report_score_level:
stats_dict = {}
stats_dict['name'] = 'fold_{} {} {}'.format(self.cf.fold, score_level, cl_name)
# -------------- RoI-based -----------------
if score_level == 'rois':
stats_dict['auc'] = np.nan
stats_dict['roc'] = np.nan
if monitor_metrics is not None:
tn = len(cl_df[cl_df.det_type == "patient_tn"])
tp = len(cl_df[(cl_df.det_type == "det_tp")&(cl_df.pred_score>self.cf.min_det_thresh)])
fp = len(cl_df[(cl_df.det_type == "det_fp")&(cl_df.pred_score>self.cf.min_det_thresh)])
fn = len(cl_df[cl_df.det_type == "det_fn"])
sens = np.divide(tp, (fn + tp))
monitor_metrics.update({"Bin_Stats/" + cl_name + "_fp": [fp], "Bin_Stats/" + cl_name + "_tp": [tp],
"Bin_Stats/" + cl_name + "_fn": [fn], "Bin_Stats/" + cl_name + "_tn": [tn],
"Bin_Stats/" + cl_name + "_sensitivity": [sens]})
                        # list wrapping only needed because other metrics are recorded over all epochs;
spec_df = cl_df[cl_df.det_type != 'patient_tn']
if self.regress_flag:
# filter false negatives out for regression-only eval since regressor didn't predict
                        truncd_df = spec_df[((spec_df.det_type == "det_fp") | (spec_df.det_type == "det_tp"))
                                            & (spec_df.pred_score > self.cf.min_det_thresh)]
truncd_df_tp = truncd_df[truncd_df.det_type == "det_tp"]
weights, weights_tp = truncd_df.pred_score.tolist(), truncd_df_tp.pred_score.tolist()
y_true, y_pred = truncd_df.rg_targets.tolist(), truncd_df.regressions.tolist()
stats_dict["rg_RMSE"] = RMSE(y_true, y_pred)
stats_dict["rg_MAE"] = MAE(y_true, y_pred)
stats_dict["rg_RMSE_weighted"] = RMSE(y_true, y_pred, weights)
stats_dict["rg_MAE_weighted"] = MAE(y_true, y_pred, weights)
y_true, y_pred = truncd_df_tp.rg_targets.tolist(), truncd_df_tp.regressions.tolist()
stats_dict["rg_MAE_weighted_tp"] = MAE(y_true, y_pred, weights_tp)
stats_dict["rg_MAE_w_std_weighted_tp"] = MAE_w_std(y_true, y_pred, weights_tp)
y_true, y_pred = truncd_df.rg_bin_target.tolist(), truncd_df.rg_bins.tolist()
stats_dict["rg_bin_accuracy"] = accuracy(y_true, y_pred)
stats_dict["rg_bin_accuracy_weighted"] = accuracy(y_true, y_pred, weights)
y_true, y_pred = truncd_df_tp.rg_bin_target.tolist(), truncd_df_tp.rg_bins.tolist()
stats_dict["rg_bin_accuracy_weighted_tp"] = accuracy(y_true, y_pred, weights_tp)
if np.any(~truncd_df.rg_uncertainties.isna()):
# det_fn are expected to be NaN so they drop out in means
stats_dict.update({"rg_uncertainty": truncd_df.rg_uncertainties.mean(),
"rg_uncertainty_tp": truncd_df_tp.rg_uncertainties.mean(),
"rg_uncertainty_tp_weighted": (truncd_df_tp.rg_uncertainties * truncd_df_tp.pred_score).sum()
/ truncd_df_tp.pred_score.sum()
})
if (spec_df.class_label==1).any():
stats_dict['ap'] = get_roi_ap_from_df((spec_df, self.cf.min_det_thresh, self.cf.per_patient_ap))
stats_dict['prc'] = precision_recall_curve(spec_df.class_label.tolist(), spec_df.pred_score.tolist())
if self.regress_flag:
stats_dict['avp'] = roi_avp((spec_df, self.cf.min_det_thresh, self.cf.per_patient_ap))
else:
stats_dict['ap'] = np.nan
stats_dict['prc'] = np.nan
stats_dict['avp'] = np.nan
# np.nan is formattable by __format__ as a float, None-type is not
if hasattr(self, "seg_df") and not boxes_only:
stats_dict["dice"] = seg_cl_df.loc[:,dice_col].mean() # mean per all rois in this epoch
stats_dict["dice_std"] = seg_cl_df.loc[:,dice_col].std()
# for the aggregated test set case, additionally get the scores of averaging over fold results.
if self.cf.evaluate_fold_means and len(df.fold.unique()) > 1:
aps = []
for fold in df.fold.unique():
fold_df = spec_df[spec_df.fold == fold]
if (fold_df.class_label==1).any():
aps.append(get_roi_ap_from_df((fold_df, self.cf.min_det_thresh, self.cf.per_patient_ap)))
stats_dict['ap_folds_mean'] = np.mean(aps) if len(aps)>0 else np.nan
stats_dict['ap_folds_std'] = np.std(aps) if len(aps)>0 else np.nan
stats_dict['auc_folds_mean'] = np.nan
stats_dict['auc_folds_std'] = np.nan
if self.regress_flag:
avps, accuracies, MAEs = [], [], []
for fold in df.fold.unique():
fold_df = spec_df[spec_df.fold == fold]
if (fold_df.class_label == 1).any():
avps.append(roi_avp((fold_df, self.cf.min_det_thresh, self.cf.per_patient_ap)))
                                    truncd_df_tp = fold_df[(fold_df.det_type == "det_tp")
                                                           & (fold_df.pred_score > self.cf.min_det_thresh)]
weights_tp = truncd_df_tp.pred_score.tolist()
y_true, y_pred = truncd_df_tp.rg_bin_target.tolist(), truncd_df_tp.rg_bins.tolist()
accuracies.append(accuracy(y_true, y_pred, weights_tp))
y_true, y_pred = truncd_df_tp.rg_targets.tolist(), truncd_df_tp.regressions.tolist()
MAEs.append(MAE_w_std(y_true, y_pred, weights_tp))
stats_dict['avp_folds_mean'] = np.mean(avps) if len(avps) > 0 else np.nan
stats_dict['avp_folds_std'] = np.std(avps) if len(avps) > 0 else np.nan
stats_dict['rg_bin_accuracy_weighted_tp_folds_mean'] = np.mean(accuracies) if len(accuracies) > 0 else np.nan
stats_dict['rg_bin_accuracy_weighted_tp_folds_std'] = np.std(accuracies) if len(accuracies) > 0 else np.nan
stats_dict['rg_MAE_w_std_weighted_tp_folds_mean'] = np.mean(MAEs, axis=0) if len(MAEs) > 0 else np.nan
stats_dict['rg_MAE_w_std_weighted_tp_folds_std'] = np.std(MAEs, axis=0) if len(MAEs) > 0 else np.nan
if hasattr(self, "seg_df") and not boxes_only and self.cf.evaluate_fold_means and len(seg_cl_df.fold.unique()) > 1:
fold_means = seg_cl_df.groupby(['fold'], as_index=True).agg({dice_col:"mean"})
stats_dict["dice_folds_mean"] = float(fold_means.mean())
stats_dict["dice_folds_std"] = float(fold_means.std())
# -------------- patient-based -----------------
# on patient level, aggregate predictions per patient (pid): The patient predicted score is the highest
# confidence prediction for this class. The patient class label is 1 if roi of this class exists in patient, else 0.
if score_level == 'patient':
                    # this is the critical part in patient scoring: only the max gt and max pred score are taken per patient!
                    # --> this may mix up values from separate detections
spec_df = cl_df.groupby(['pid'], as_index=False)
agg_args = {'class_label': 'max', 'pred_score': 'max', 'fold': 'first'}
if self.regress_flag:
                        # pandas throws an error if the aggregated value is an np.array, but not if it is a list.
agg_args.update({'regressions': lambda series: list(series.iloc[np.argmax(series.apply(np.linalg.norm).values)]),
'rg_targets': lambda series: list(series.iloc[np.argmax(series.apply(np.linalg.norm).values)]),
'rg_bins': 'max', 'rg_bin_target': 'max',
'rg_uncertainties': 'max'
})
if hasattr(cl_df, "cluster_n_missing"):
agg_args.update({'cluster_n_missing': 'mean'})
spec_df = spec_df.agg(agg_args)
if len(spec_df.class_label.unique()) > 1:
stats_dict['auc'] = roc_auc_score(spec_df.class_label.tolist(), spec_df.pred_score.tolist())
stats_dict['roc'] = roc_curve(spec_df.class_label.tolist(), spec_df.pred_score.tolist())
else:
stats_dict['auc'] = np.nan
stats_dict['roc'] = np.nan
if (spec_df.class_label == 1).any():
patient_cl_labels = spec_df.class_label.tolist()
stats_dict['ap'] = average_precision_score(patient_cl_labels, spec_df.pred_score.tolist())
stats_dict['prc'] = precision_recall_curve(patient_cl_labels, spec_df.pred_score.tolist())
if self.regress_flag:
avp_scores = spec_df[spec_df.rg_bins == spec_df.rg_bin_target].pred_score.tolist()
avp_scores += [0.] * (len(patient_cl_labels) - len(avp_scores))
stats_dict['avp'] = average_precision_score(patient_cl_labels, avp_scores)
else:
stats_dict['ap'] = np.nan
stats_dict['prc'] = np.nan
stats_dict['avp'] = np.nan
if self.regress_flag:
y_true, y_pred = spec_df.rg_targets.tolist(), spec_df.regressions.tolist()
stats_dict["rg_RMSE"] = RMSE(y_true, y_pred)
stats_dict["rg_MAE"] = MAE(y_true, y_pred)
stats_dict["rg_bin_accuracy"] = accuracy(spec_df.rg_bin_target.tolist(), spec_df.rg_bins.tolist())
stats_dict["rg_uncertainty"] = spec_df.rg_uncertainties.mean()
if hasattr(self, "seg_df") and not boxes_only:
seg_cl_df = seg_cl_df.groupby(['pid'], as_index=False).agg(
{dice_col: "mean", "fold": "first"}) # mean of all rois per patient in this epoch
stats_dict["dice"] = seg_cl_df.loc[:,dice_col].mean() #mean of all patients
stats_dict["dice_std"] = seg_cl_df.loc[:, dice_col].std()
# for the aggregated test set case, additionally get the scores for averaging over fold results.
if self.cf.evaluate_fold_means and len(df.fold.unique()) > 1 and self.mode in ["test", "analysis"]:
aucs = []
aps = []
for fold in df.fold.unique():
fold_df = spec_df[spec_df.fold == fold]
if (fold_df.class_label==1).any():
aps.append(
average_precision_score(fold_df.class_label.tolist(), fold_df.pred_score.tolist()))
if len(fold_df.class_label.unique())>1:
aucs.append(roc_auc_score(fold_df.class_label.tolist(), fold_df.pred_score.tolist()))
stats_dict['auc_folds_mean'] = np.mean(aucs)
stats_dict['auc_folds_std'] = np.std(aucs)
stats_dict['ap_folds_mean'] = np.mean(aps)
stats_dict['ap_folds_std'] = np.std(aps)
if hasattr(self, "seg_df") and not boxes_only and self.cf.evaluate_fold_means and len(seg_cl_df.fold.unique()) > 1:
fold_means = seg_cl_df.groupby(['fold'], as_index=True).agg({dice_col:"mean"})
stats_dict["dice_folds_mean"] = float(fold_means.mean())
stats_dict["dice_folds_std"] = float(fold_means.std())
all_stats.append(stats_dict)
# -------------- monitoring, visualisation -----------------
# fill new results into monitor_metrics dict. for simplicity, only one class (of interest) is monitored on patient level.
patient_interests = [self.cf.class_dict[self.cf.patient_class_of_interest],]
if hasattr(self.cf, "bin_dict"):
patient_interests += [self.cf.bin_dict[self.cf.patient_bin_of_interest]]
if monitor_metrics is not None and (score_level != 'patient' or cl_name in patient_interests):
name = 'patient_'+cl_name if score_level == 'patient' else cl_name
for metric in self.cf.metrics:
if metric in stats_dict.keys():
monitor_metrics[name + '_'+metric].append(stats_dict[metric])
else:
print("WARNING: skipped monitor metric {}_{} since not avail".format(name, metric))
# histograms
if self.cf.plot_prediction_histograms:
out_filename = os.path.join(self.hist_dir, 'pred_hist_{}_{}_{}_{}'.format(
self.cf.fold, self.mode, score_level, cl_name))
plg.plot_prediction_hist(self.cf, spec_df, out_filename)
# analysis of the hyper-parameter cf.min_det_thresh, for optimization on validation set.
if self.cf.scan_det_thresh and "val" in self.mode:
conf_threshs = list(np.arange(0.8, 1, 0.02))
pool = Pool(processes=self.cf.n_workers)
mp_inputs = [[spec_df, ii, self.cf.per_patient_ap] for ii in conf_threshs]
aps = pool.map(get_roi_ap_from_df, mp_inputs, chunksize=1)
pool.close()
pool.join()
self.logger.info('results from scanning over det_threshs: {}'.format([[i, j] for i, j in zip(conf_threshs, aps)]))
class_means = pd.DataFrame(columns=self.cf.report_score_level)
for slevel in self.cf.report_score_level:
level_stats = pd.DataFrame([stats for stats in all_stats if slevel in stats["name"]])[self.cf.metrics]
class_means.loc[:, slevel] = level_stats.mean()
all_stats.extend([{"name": 'fold_{} {} {}'.format(self.cf.fold, slevel, "class_means"), **level_means} for
slevel, level_means in class_means.to_dict().items()])
if self.cf.plot_stat_curves:
out_filename = os.path.join(self.curves_dir, '{}_{}_stat_curves'.format(self.cf.fold, self.mode))
plg.plot_stat_curves(self.cf, all_stats, out_filename)
if self.cf.plot_prediction_histograms and hasattr(df, "cluster_n_missing") and df.cluster_n_missing.notna().any():
out_filename = os.path.join(self.hist_dir, 'n_missing_hist_{}_{}.png'.format(self.cf.fold, self.mode))
plg.plot_wbc_n_missing(self.cf, df, outfile=out_filename)
return all_stats, monitor_metrics
def write_to_results_table(self, stats, metrics_to_score):
"""Write overall results to a common inter-experiment table.
:param metrics_to_score:
:return:
"""
results_table_path = os.path.join(self.cf.test_dir, "../../", 'results_table.csv')
with open(results_table_path, 'a') as handle:
# ---column headers---
handle.write('\n{},'.format("Experiment Name"))
handle.write('{},'.format("Time Stamp"))
handle.write('{},'.format("Samples Seen"))
handle.write('{},'.format("Spatial Dim"))
handle.write('{},'.format("Patch Size"))
handle.write('{},'.format("CV Folds"))
handle.write('{},'.format("{}-clustering IoU".format(self.cf.clustering)))
handle.write('{},'.format("Merge-2D-to-3D IoU"))
if hasattr(self.cf, "test_against_exact_gt"):
handle.write('{},'.format('Exact GT'))
for s in stats:
if self.cf.class_dict[self.cf.patient_class_of_interest] in s['name'] or "mean" in s["name"]:
for metric in metrics_to_score:
if metric in s.keys() and not np.isnan(s[metric]):
if metric == 'ap':
handle.write('{}_{} : {}_{},'.format(*s['name'].split(" ")[1:], metric,
int(np.mean(self.cf.ap_match_ious) * 100)))
elif not "folds_std" in metric:
handle.write('{}_{} : {},'.format(*s['name'].split(" ")[1:], metric))
else:
print("WARNING: skipped metric {} since not avail".format(metric))
handle.write('\n')
# --- columns content---
handle.write('{},'.format(self.cf.exp_dir.split(os.sep)[-1]))
handle.write('{},'.format(time.strftime("%d%b%y %H:%M:%S")))
handle.write('{},'.format(self.cf.num_epochs * self.cf.num_train_batches * self.cf.batch_size))
handle.write('{}D,'.format(self.cf.dim))
handle.write('{},'.format("x".join([str(self.cf.patch_size[i]) for i in range(self.cf.dim)])))
handle.write('{},'.format(str(self.test_df.fold.unique().tolist()).replace(",", "")))
handle.write('{},'.format(self.cf.clustering_iou if self.cf.clustering else str("N/A")))
handle.write('{},'.format(self.cf.merge_3D_iou if self.cf.merge_2D_to_3D_preds else str("N/A")))
if hasattr(self.cf, "test_against_exact_gt"):
handle.write('{},'.format(self.cf.test_against_exact_gt))
for s in stats:
if self.cf.class_dict[self.cf.patient_class_of_interest] in s['name'] or "mean" in s["name"]:
for metric in metrics_to_score:
if metric in s.keys() and not np.isnan(
s[metric]): # needed as long as no dice on patient level possible
if "folds_mean" in metric:
handle.write('{:0.3f}\u00B1{:0.3f}, '.format(s[metric],
s["_".join(
(*metric.split("_")[:-1], "std"))]))
elif not "folds_std" in metric:
handle.write('{:0.3f}, '.format(s[metric]))
handle.write('\n')
def score_test_df(self, max_fold=None, internal_df=True):
"""
Writes out resulting scores to text files: First checks for class-internal-df (typically current) fold,
gets resulting scores, writes them to a text file and pickles data frame. Also checks if data-frame pickles of
all folds of cross-validation exist in exp_dir. If true, loads all dataframes, aggregates test sets over folds,
and calculates and writes out overall metrics.
"""
# this should maybe be extended to auc, ap stds.
metrics_to_score = self.cf.metrics.copy() # + [ m+ext for m in self.cf.metrics if "dice" in m for ext in ["_std"]]
if internal_df:
self.test_df.to_pickle(os.path.join(self.cf.test_dir, '{}_test_df.pkl'.format(self.cf.fold)))
if hasattr(self, "seg_df"):
self.seg_df.to_pickle(os.path.join(self.cf.test_dir, '{}_test_seg_df.pkl'.format(self.cf.fold)))
stats, _ = self.return_metrics(self.test_df, self.cf.class_dict)
with open(os.path.join(self.cf.test_dir, 'results.txt'), 'a') as handle:
handle.write('\n****************************\n')
handle.write('\nresults for fold {}, {} \n'.format(self.cf.fold, time.strftime("%d/%m/%y %H:%M:%S")))
handle.write('\n****************************\n')
handle.write('\nfold df shape {}\n \n'.format(self.test_df.shape))
for s in stats:
for metric in metrics_to_score:
                        if metric in s.keys(): # needed as long as no dice is possible on patient level
if "accuracy" in metric:
handle.write('{} {:0.4f} '.format(metric, s[metric]))
else:
handle.write('{} {:0.3f} '.format(metric, s[metric]))
else:
print("WARNING: skipped metric {} since not avail".format(metric))
handle.write('{} \n'.format(s['name']))
if max_fold is None:
max_fold = self.cf.n_cv_splits-1
if self.cf.fold == max_fold:
print("max fold/overall stats triggered")
self.cf.fold = 'overall'
if self.cf.evaluate_fold_means:
metrics_to_score += [m + ext for m in self.cf.metrics for ext in ("_folds_mean", "_folds_std")]
if not self.cf.hold_out_test_set or not self.cf.ensemble_folds:
fold_df_paths = sorted([ii for ii in os.listdir(self.cf.test_dir)
if 'test_df.pkl' in ii and not "overall" in ii])
fold_seg_df_paths = sorted([ii for ii in os.listdir(self.cf.test_dir)
if 'test_seg_df.pkl' in ii and not "overall" in ii])
for paths in [fold_df_paths, fold_seg_df_paths]:
assert len(paths) <= self.cf.n_cv_splits, "found {} > nr of cv splits results dfs in {}".format(
len(paths), self.cf.test_dir)
with open(os.path.join(self.cf.test_dir, 'results.txt'), 'a') as handle:
dfs_list = [pd.read_pickle(os.path.join(self.cf.test_dir, ii)) for ii in fold_df_paths]
seg_dfs_list = [pd.read_pickle(os.path.join(self.cf.test_dir, ii)) for ii in fold_seg_df_paths]
self.test_df = pd.concat(dfs_list, sort=True)
if len(seg_dfs_list)>0:
self.seg_df = pd.concat(seg_dfs_list, sort=True)
stats, _ = self.return_metrics(self.test_df, self.cf.class_dict)
handle.write('\n****************************\n')
handle.write('\nOVERALL RESULTS \n')
handle.write('\n****************************\n')
                    handle.write('\ndf shape {}\n \n'.format(self.test_df.shape))
for s in stats:
for metric in metrics_to_score:
if metric in s.keys():
handle.write('{} {:0.3f} '.format(metric, s[metric]))
handle.write('{} \n'.format(s['name']))
self.write_to_results_table(stats, metrics_to_score)
with open(os.path.join(self.cf.test_dir, 'results_extr_scores.txt'), 'w') as handle:
handle.write('\n****************************\n')
handle.write('\nextremal scores for fold {} \n'.format(self.cf.fold))
handle.write('\n****************************\n')
# want: pid & fold (&other) of highest scoring tp & fp in test_df
for cl in self.cf.class_dict.keys():
print("\nClass {}".format(self.cf.class_dict[cl]), file=handle)
cl_df = self.test_df[self.test_df.pred_class == cl] #.dropna(axis=1)
for det_type in ['det_tp', 'det_fp']:
filtered_df = cl_df[cl_df.det_type==det_type]
print("\nHighest scoring {} of class {}".format(det_type, self.cf.class_dict[cl]), file=handle)
if len(filtered_df)>0:
print(filtered_df.loc[filtered_df.pred_score.idxmax()], file=handle)
else:
print("No detections of type {} for class {} in this df".format(det_type, self.cf.class_dict[cl]), file=handle)
handle.write('\n****************************\n')
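

if __name__ == '__main__':
    # Minimal smoke-test sketch (illustrative only, not part of the original evaluation pipeline):
    # compute_roi_ap expects detections sorted by descending pred_score plus the number of gt objects (all_p).
    demo_df = pd.DataFrame({"class_label": [1, 0, 1, 0],          # 1 = matched a gt (tp), 0 = fp
                            "pred_score": [0.9, 0.8, 0.7, 0.6]})  # already sorted in descending order
    print("demo AP:", compute_roi_ap(demo_df, all_p=2))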