1
1
#!/usr/bin/env python3
2
2
3
3
import ast
import logging
import os
import socket
import threading
from textwrap import dedent

from darc import DARCBase
from darc import util
from darc.control import send_command
from darc.definitions import WORKERS, TSAMP
18
20
19
21
@@ -33,6 +35,9 @@ def __init__(self, *args, **kwargs):
33
35
34
36
self .scavenger = None
35
37
38
+ # reduce logging from status check commands
39
+ logging .getLogger ('darc.control' ).setLevel (logging .ERROR )
40
+
36
41
def run (self ):
37
42
"""
38
43
Main loop. Create thread scavenger, then run parent class run method
@@ -166,6 +171,7 @@ def __init__(self, *args, **kwargs):
166
171
self .warnings_sent = []
167
172
self .status = None
168
173
self .process = None
174
+ self .central_result_dir = None
169
175
170
176
def start_observation (self , obs_config , reload = True ):
171
177
"""
@@ -282,19 +288,57 @@ def _wait_for_workers(self):
282
288
283
289
def _check_node_online (self , node ):
284
290
"""
285
- Check if the processor on a node is still online
291
+ Check if the processor on a node is still online and processing the current observation
286
292
287
293
:param str node: Hostname of node to check
288
294
:return: status (bool): True if node is online, else False
289
295
"""
290
- self .logger .warning ("Node status check not yet implemented, returning True" )
291
- return True
296
+ # check if the processor on the node is online
297
+ try :
298
+ reply = send_command (self .node_timeout , 'processor' , 'status' , host = node )
299
+ if reply is None :
300
+ self .logger .debug (f"No reply received from { node } , assuming it is offline" )
301
+ return False
302
+ status = reply ['message' ]['processor' ]
303
+ except Exception as e :
304
+ self .logger .error (f"Failed to get { node } status: { type (e )} : { e } " )
305
+ status = ''
306
+ if status != 'running' :
307
+ # processor is not running
308
+ self .logger .debug (f"{ node } processor is not running" )
309
+ return False
310
+
311
+ # get list of running observations from node
312
+ self .logger .debug (f"{ node } is online, checking for observations" )
313
+ try :
314
+ output = send_command (self .node_timeout , 'processor' , 'get_attr observations' )['message' ]['processor' ]
315
+ # parse the observation list
316
+ # the list contains reference to processes, which should be put in quotes first
317
+ output = ast .literal_eval (output .replace ('<' , '\' <' ).replace ('>' , '>\' ' ))
318
+ taskids = output ['ProcessorManager.observations' ].keys ()
319
+ except Exception as e :
320
+ self .logger .error (f"Failed to get observation list from { node } : { type (e )} : { e } " )
321
+ return False
322
+ self .logger .debug (f"{ node } taskids: { taskids } " )
323
+
324
+ # check if the node is still processing the current taskid
325
+ try :
326
+ taskid = self .obs_config ['parset' ]['task.taskID' ]
327
+ except (KeyError , TypeError ):
328
+ # KeyError if parset or task.taskID are missing, TypeError if obs_config is None
329
+ self .logger .error (f"Failed to get task ID of current master observation, assuming { node } is online" )
330
+ return True
331
+
332
+ if taskid in taskids :
333
+ return True
334
+ else :
335
+ return False
292
336
293
337
def _send_warning (self , node ):
294
338
"""
295
339
Send a warning email about a node
296
340
"""
297
- self .logger .warning (" Warning email not yet implemented" )
341
+ self .logger .warning (f"Received request to warn about { node } . Warning email not yet implemented" )
298
342
299
343
def _process_results (self , info , coordinates ):
300
344
"""
0 commit comments