#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue Jul 10 09:54:47 2018
@author: daniel
File to create a comprehensive data set drawing from the original
images, annotations generated by the tierpsy tracker, and
hand-drawn annotations
"""
import os
import cv2
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from copy import copy
import tables
from utility_functions import masks_from_XML
# define the various directories
DATA_DIR = '/Users/daniel/Documents/UCL/Project/Data/annotation-data'
VIDEOS_DIR = os.path.join(DATA_DIR, 'MaskedVideos')
FEATURES_DIR = os.path.join(DATA_DIR, 'Results')
ANNS_DIR = os.path.join(DATA_DIR, 'annotations_final')
#OUTPUT_DIR = os.path.join(DATA_DIR, 'collated_dataset')
# decide whether to also save XML annotations
# Set this to False to save images for benchmark testing
use_xmls = False
###############################################################
# Define the frames to ignore due to poor quality hand annotations
exclude_file_names = ["CB4856_worms10_food1-10_Set1_Pos4_Ch2_20102017_125044",
"JU2234_worms10_food1-10_Set1_Pos4_Ch3_20102017_125033",
"JU2578_worms10_food1-10_Set1_Pos4_Ch4_20102017_125033",
"N2_worms10_food1-10_Set1_Pos4_Ch5_20102017_125024",
"VC2010_worms10_food1-10_Set1_Pos4_Ch6_20122017_150107",
"CX11271_worms10_food1-10_Set1_Pos4_Ch4_19052017_113042",
"ED3049_worms10_food1-10_Set6_Pos5_Ch4_19052017_151021",
"JU360_worms10_food1-10_Set6_Pos5_Ch6_19052017_151012",
"N2_worms10_CSCD068947_10_Set2_Pos5_Ch1_08082017_212337",
"N2_worms10_CSCD068947_1_Set1_Pos4_Ch4_08082017_210418",
"JU2587_worms10_food1-10_Set1_Pos4_Ch1_20102017_125044",
"NIC199_worms10_food1-10_Set7_Pos4_Ch4_19052017_153012"]
exclude_lists = [[561, 2871, 2941, 3501, 4411, 4551, 5251, 13441, 14001, 16451, 21771, 21911, 22191, 22331],
[1191, 1331, 1611, 3431, 4061, 6021, 6411, 7071, 12321, 2031, 10851, 11061],
[141, 211, 421, 1331, 7981, 8611, 12601, 14001, 14071, 15681, 18551, 981, 1331, 1891, 4481, 13231, 9451, 10711, 15821],
[8621, 13371, 20231],
[9381, 11411, 12181, 14211, 22261, 11411, 21351, 11551, 16521],
[1681, 9451, 19251, 20371, 1821, 4691, 4761, 4831, 4901, 4971, 5041, 1961, 2731, 2801, 2871, 2941, 2871],
[10781, 11761, 13441, 15961, 16031, 17571, 17991, 19111, 19181, 20791, 21141, 18271],
[1471, 17361, 9311, 9381, 10361, 12321, 17641, 21281, 22471],
[1, 5181, 421, 4261, 5111, 5531, 14911, 21001, 21421, 22401, 631, 11201, 12741, 13021, 14001, 14071, 14141, 14281,
14491, 14561, 14701, 14981, 15121, 15261, 15611, 15891, 16031, 16171, 16241, 17501, 18271, 981, 2031, 4061, 5391,
6091, 6371, 7701, 8891, 9731, 10991, 11271, 11411, 11621, 11901, 12321, 13231, 13371, 13791, 13931, 14351, 14841,
15401, 15751, 17291, 17781, 18061, 18411, 18481, 19461, 19601, 19881, 21491, 21631, 22331, 6791, 9311, 12601,
12531, 22121, 12951, 13301, 13441, 14771, 16381, 16451, 13161, 15681, 13651, 13721, 15961, 16101, 16241, 16311,
18551, 19111, 19181, 19811, 21351],
[1, 491, 771, 841, 981, 1051, 421, 6231, 2101, 3151, 3501, 3571, 3781, 3991, 4131, 4971, 5601, 5671, 5811, 6021,
6511, 6581, 6651, 7421, 7491, 7561, 7631, 7701, 7981, 8051, 8121, 8261, 8541, 8611, 8681, 8821, 4271, 4411,
4761, 5041, 5181, 6721, 7281, 7351, 8891, 8961, 9031, 9101, 9171, 9241, 9311, 9381, 9451, 9521, 9591, 9661,
9731, 9801, 9871, 9941, 10011, 10081, 10151, 10221, 10291, 10361, 10431, 10501, 10571, 10641, 10711, 10781,
10851, 10921, 10991, 11061, 11131, 11201, 11271, 11341, 11411, 11481],
[1, 281, 351, 421, 491, 561, 1121, 2591, 3151, 3221, 3291, 3361, 3431, 3781, 3921,
4411, 4621, 6021, 6931, 7071, 7421, 8051, 8401, 12951, 13021, 13161, 13301, 13371,
13441, 13511, 13581, 13651, 13721, 13791, 13681, 13931, 14001, 14071, 14211, 16101,
16101, 16241, 16381, 16451, 16521, 16591, 16661, 16731, 16801, 16871, 16941, 17151,
17221, 17291, 17361, 17431, 17501, 17571, 17641, 17711, 17781, 18341, 18481, 18551,
18621, 18691, 18831, 18901, 18971, 19391, 19461, 19671, 19741, 19811, 19881, 19951,
20091, 20161, 20231, 20301, 20371, 20441, 20511, 20581, 351, 561, 911, 3571, 3921,
4131, 5111, 7351, 8681, 9031, 9591, 9941, 10641, 11061, 13441, 13931, 19601, 1611,
2941, 4831, 22261, 22471, 1611, 2941, 4831, 22261, 22471, 2661, 3991, 5041, 19741,
5111, 6021, 6931, 7841, 12181, 20651, 20721, 20791, 20861, 20931, 21001, 21071],
[14351, 14421, 14491, 14561, 14631, 14701, 14771, 14841, 14911, 14981, 15051, 15121, 15191,
15261, 15331, 15401, 15471, 15541, 15611, 15681, 15751, 15821, 15891, 15961, 16031, 16101,
16171, 16241, 16311, 16381, 16451, 16521, 16591, 16661, 16731, 16801, 16871, 16941, 17011,
17081, 17151, 17221, 17291, 17361, 17431, 17501, 17571, 17641, 17711, 17781, 17851, 17921,
17991, 18061, 18131, 18201, 18271, 18341, 18411, 18481, 18551, 18621, 18691, 18761, 18831,
18901, 18971, 19041, 19111, 19181, 19251, 19321, 19391, 19461, 19531, 19601, 19671, 19741,
19811, 19881, 19951, 20021, 20091, 20161, 20231, 20301]]
exclude_dict = {}
for (fn, li) in zip(exclude_file_names, exclude_lists):
    # de-duplicate and sort the excluded frame numbers
    exclude_dict[fn] = sorted(set(li))
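# For example, the fourth filename above maps to the fourth list
# (values taken directly from the lists above):
# exclude_dict["N2_worms10_food1-10_Set1_Pos4_Ch5_20102017_125024"] == [8621, 13371, 20231]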
###############################################################
corrections_filename_list = ["JU2587_worms10_food1-10_Set1_Pos4_Ch1_20102017_125044",
"N2_worms10_CSCD068947_1_Set1_Pos4_Ch4_08082017_210418",
"N2_worms10_CSCD068947_10_Set2_Pos5_Ch1_08082017_212337",
"NIC199_worms10_food1-10_Set7_Pos4_Ch4_19052017_153012"]
###############################################################
anns_dict = {}
for fName in exclude_file_names:
    ANNS_DICT_DIR = os.path.join(ANNS_DIR, fName)
    # frame numbers are recovered from the XML filenames, e.g. "1234.xml" -> 1234
    anns_list = [int(f[:-4]) for f in os.listdir(ANNS_DICT_DIR) if f.endswith('xml')]
    # keep only the annotated frames that are not in the exclude list
    anns_dict[fName] = sorted(set(anns_list) - set(exclude_dict[fName]))
###############################################################
# get list of file names to process; every annotated video has an exclude
# list above, so exclude_file_names doubles as the full list of video names
# fNames = sorted([f for f in os.listdir(ANNS_DIR) if not f.startswith('.') if not f.endswith('212337')])  # exclude the worst annotation folder
fNames = sorted(exclude_file_names)
print("Filenames: ")
print("\n".join(fNames))
print("")
# Loop through each of the datasets
for filename in fNames:
    # set which dataset we are using
    print("Set being created: {}".format(filename))
    # define the filenames for the images, features, and XML annotations
    images_file = os.path.join(VIDEOS_DIR, filename + ".hdf5")
    features_file = os.path.join(FEATURES_DIR, filename + "_featuresN.hdf5")
    XML_DIR = os.path.join(ANNS_DIR, filename)
    CROPPED_OUTPUT_DIR = os.path.join(DATA_DIR, 'cropped_annotations_only_dataset_tierpsy', filename)
    FULLSIZE_OUTPUT_DIR = os.path.join(DATA_DIR, 'fullsize_annotations_only_dataset_tierpsy', filename)
    # load the tracking results from the features file
    with pd.HDFStore(features_file, 'r') as fid:
        # all the worm coordinates, and how the skeletons matrix relates to a
        # given frame, are stored here
        trajectories_data = fid['/trajectories_data']
    # Get the highest frame number in the file (frames are numbered from 0):
    num_frames = trajectories_data['frame_number'].max()
    # Group the trajectory data by frame once, so the rows for each annotated
    # frame can be looked up quickly inside the loop below
    traj_g = trajectories_data.groupby('frame_number')
    # Load every 70th frame
    # for frame_number in range(num_frames)[1::70]:
    # only process the annotations which weren't excluded
    for frame_number in anns_dict[filename]:
        #############
        # # Ignore frames in the exclude list
        # if frame_number in exclude_dict[filename]:
        #     continue
        # #############
        print(frame_number)
        # read image (full or masked)
        img_field = '/mask'
        # img_field = "/full_data"
        # Select only the data for this frame
        frame_data = traj_g.get_group(frame_number)
        # load existing annotations
        # Select only skeletonised worms:
        # worms that were not successfully skeletonized will have a -1 here
        skel_id = frame_data['skeleton_id'].values
        neg_skel_id = skel_id[skel_id < 0]
        skel_id = skel_id[skel_id >= 0]
        # Open the frame from the hdf5 file
        with tables.File(images_file, 'r') as fid:
            img = fid.get_node(img_field)[frame_number]
            img = img.T
        # get the worm contour coordinates
        with tables.File(features_file, 'r') as fid:
            # Reduce the coordinates by a factor of 10 so that they
            # match the image dimensions
            skel = fid.get_node('/coordinates/skeletons')[skel_id, :, :] / 10
            cnt1 = fid.get_node('/coordinates/dorsal_contours')[skel_id, :, :] / 10
            cnt2 = fid.get_node('/coordinates/ventral_contours')[skel_id, :, :] / 10
        # # Plot the image with Tierpsy annotations
        # # Note we have to transpose the image to match the XML annotations
        # # Suspect this is due to MATLAB vs Numpy / col- vs row-major indexing
        # plt.figure(figsize=(30, 30))
        # plt.imshow(img.T, interpolation='none', cmap='gray')
        #
        # # add all the worms identified
        # for _, row in frame_data.iterrows():
        #     cc = plt.Circle((row['coord_y'], row['coord_x']),
        #                     row['roi_size']/2, lw=2, color='g', fill=False)
        #     plt.gca().add_artist(cc)
        #
        # # add all the skeletonized worms
        # # We also have to transpose the X and Y coordinates of the plots
        # for (ss, cc1, cc2) in zip(skel, cnt1, cnt2):
        #     plt.plot(ss[:, 1], ss[:, 0], 'r')
        #     plt.plot(cc1[:, 1], cc1[:, 0], 'tomato')
        #     plt.plot(cc2[:, 1], cc2[:, 0], color='salmon')
        #
        # plt.show()
        # plt.close()
        # Get annotations as complete masks, starting with existing data
        masks = []
        # Loop through all of the existing worms
        for (cc1, cc2) in zip(cnt1, cnt2):
            # close the outline: the dorsal contour followed by the reversed
            # ventral contour traces the full worm boundary
            cnt_close = np.vstack([cc1, cc2[-1::-1]])
            # convert the outline to a solid mask
            mask = np.zeros(img.shape)
            cv2.fillPoly(mask, pts=[np.int32(cnt_close)], color=(255, 255, 255))
            # append this mask to our list of masks
            # note we have to transpose this mask (matlab vs numpy matrix indexing)
            masks.append(copy(mask.T))
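        # Optional sanity check, commented out in the spirit of the plotting
        # block above (assumes square frames, so the transpose preserves shape):
        # assert all(m.shape == img.shape for m in masks)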
        if use_xmls:
            # load xml annotations (if they exist for this frame)
            # and append them to the list of masks
            annotation_path = os.path.join(XML_DIR, str(frame_number) + ".xml")
            if os.path.exists(annotation_path):
                xml_masks, xml_heads = masks_from_XML(annotation_path, img)
                masks.extend(copy(xml_masks))
        # Save fullsize images and masks
        os.makedirs(FULLSIZE_OUTPUT_DIR + '/{}/image'.format(frame_number), exist_ok=True)
        image_filename = FULLSIZE_OUTPUT_DIR + '/{}/image/image_{}.png'.format(frame_number, frame_number)
        plt.imsave(fname=image_filename, arr=img, format='png', cmap='gray')
        for j, m in enumerate(masks):
            os.makedirs(FULLSIZE_OUTPUT_DIR + '/{}/masks'.format(frame_number), exist_ok=True)
            mask_filename = FULLSIZE_OUTPUT_DIR + '/{}/masks/mask_{}.png'.format(frame_number, j)
            plt.imsave(fname=mask_filename, arr=m, format='png', cmap='gray')
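        # Resulting fullsize layout for this frame, following directly from
        # the paths built above:
        #   <FULLSIZE_OUTPUT_DIR>/<frame_number>/image/image_<frame_number>.png
        #   <FULLSIZE_OUTPUT_DIR>/<frame_number>/masks/mask_<j>.png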
        # Split each image into 16 smaller chunks.
        # Originally the images are 2048x2048, which is far too large to fit
        # into GPU RAM, so each chunk is (h/4) x (w/4) = 512x512 pixels.
        h = img.shape[0]
        w = img.shape[1]
        for x in range(4):
            for y in range(4):
                # We loop through all the chunks and set pos_example = True if
                # the chunk contains at least part of a worm, keeping only
                # those chunks.
                pos_example = False
                # Loop through all of the worms
                for j, m in enumerate(masks):
                    # crop the mask for the chunk being examined
                    cropped_mask = m[int((h/4)*x):int((h/4)*(x+1)), int((w/4)*y):int((w/4)*(y+1))]
                    # if the chunk contains a worm:
                    if np.any(cropped_mask):
                        # Create a subdir for the masks for this crop
                        os.makedirs(CROPPED_OUTPUT_DIR + '/{}_{}{}/masks'.format(frame_number, x, y), exist_ok=True)
                        mask_filename = CROPPED_OUTPUT_DIR + '/{}_{}{}/masks/mask_{}.png'.format(frame_number, x, y, j)
                        plt.imsave(fname=mask_filename, arr=cropped_mask, format='png', cmap='gray')
                        # Flag that this image crop should be saved
                        pos_example = True
                if pos_example:
                    os.makedirs(CROPPED_OUTPUT_DIR + '/{}_{}{}/image'.format(frame_number, x, y), exist_ok=True)
                    cropped_img = img[int((h/4)*x):int((h/4)*(x+1)), int((w/4)*y):int((w/4)*(y+1))]
                    image_filename = CROPPED_OUTPUT_DIR + '/{}_{}{}/image/image_{}_{}{}.png'.format(frame_number, x, y, frame_number, x, y)
                    plt.imsave(fname=image_filename, arr=cropped_img, format='png', cmap='gray')
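        # Resulting cropped layout, one directory per 512x512 chunk that
        # contains at least part of a worm:
        #   <CROPPED_OUTPUT_DIR>/<frame_number>_<x><y>/image/image_<frame_number>_<x><y>.png
        #   <CROPPED_OUTPUT_DIR>/<frame_number>_<x><y>/masks/mask_<j>.png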
#TODO: make this script output tf.record files (sharded by set?)
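# A minimal sketch of the TODO above, commented out because it is untested
# here: it assumes TensorFlow 2.x and writes one tf.train.Example per saved
# image (the feature names 'image/encoded' and 'image/filename' are
# hypothetical, not part of the existing pipeline).
# import tensorflow as tf
#
# def _bytes_feature(value):
#     return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))
#
# with tf.io.TFRecordWriter(os.path.join(DATA_DIR, 'annotations.tfrecord')) as writer:
#     with open(image_filename, 'rb') as f:
#         example = tf.train.Example(features=tf.train.Features(feature={
#             'image/encoded': _bytes_feature(f.read()),
#             'image/filename': _bytes_feature(image_filename.encode('utf-8')),
#         }))
#     writer.write(example.SerializeToString())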