@@ -323,6 +323,46 @@ def all_squeue_fields
323
323
}
324
324
end
325
325
326
+ # Job info fields requested from a formatted `sacct` call
327
# Job info fields requested from a formatted `sacct` call.
#
# Keys are the attribute names used by this adapter; values are the
# `sacct` format field requested for each. Insertion order matters:
# `sacct_info` joins the values into the `-o` list and zips the keys
# with the output columns, so both must stay in the same order.
#
# @return [Hash{Symbol => String}] ordered attribute => sacct field map
def sacct_info_fields
  {
    user: 'User',             # user name of the user who ran the job
    group_name: 'Group',      # group name of the user who ran the job
    job_id: 'JobId',          # job id, for reference
    job_name: 'JobName',      # name of the job or job step
    elapsed: 'Elapsed',       # the job's elapsed time
    req_mem: 'ReqMem',        # minimum required memory for the job
    alloc_cpus: 'AllocCPUS',  # count of allocated CPUs
    req_cpus: 'ReqCPUS',      # number of requested CPUs
    time_limit: 'Timelimit',  # what the time limit was/is for the job
    state: 'State',           # the job status, or state
    total_cpu: 'TotalCPU',    # sum of SystemCPU and UserCPU time used by the job or job step
    max_rss: 'MaxRSS',        # maximum resident set size of all tasks in the job
    partition: 'Partition',   # partition on which the job ran
    submit_time: 'Submit',    # time the job was submitted (same format as End)
    start_time: 'Start',      # initiation time of the job (same format as End)
    end: 'End',               # termination time of the job
    gres: 'ReqTRES'           # trackable resources: minimum counts requested at submission time
  }
end
365
+
326
366
def queues
327
367
info_raw = call ( 'scontrol' , 'show' , 'part' , '-o' )
328
368
@@ -357,6 +397,31 @@ def nodes
357
397
end . compact
358
398
end
359
399
400
# Query `sacct` and return one hash per output line.
# See https://slurm.schedmd.com/sacct.html
#
# @param job_ids [Array<#to_s>] job ids to filter by (no filter when empty)
# @param states [Array<#to_s>] state codes to filter by (no filter when empty)
# @param from [String, nil] passed to `-S` when truthy
# @param to [String, nil] passed to `-E` when truthy
# @param show_steps [Boolean] when falsy, `--allocations` is added so only
#   job-level statistics are reported
# @return [Array<Hash>] rows keyed by the keys of `sacct_info_fields`;
#   blank columns are returned as nil
def sacct_info(job_ids, states, from, to, show_steps)
  fields = sacct_info_fields

  # Always-present flags: delimited output, custom delimiter, no header,
  # memory reported in GB.
  args = ['-P', '--delimiter', UNIT_SEPARATOR, '-n', '--units', 'G']
  args << '--allocations' unless show_steps               # job-level statistics only
  args.push('-o', fields.values.join(','))                # requested columns
  args.push('--state', states.join(',')) unless states.empty?
  args.push('-j', job_ids.join(',')) unless job_ids.empty?
  args.push('-S', from) if from                           # filter from this date
  args.push('-E', to) if to                               # filter until this date

  StringIO.new(call('sacct', *args)).each_line.each_with_object([]) do |line, jobs_info|
    # Blank columns become nil so callers can tell "missing" from a value.
    values = line.strip.split(UNIT_SEPARATOR).map { |value| value.to_s.empty? ? nil : value }
    next if values.empty?

    jobs_info << fields.keys.zip(values).to_h
  end
end
424
+
360
425
private
361
426
def str_to_queue_info ( line )
362
427
hsh = line . split ( ' ' ) . map do |token |
@@ -466,8 +531,23 @@ def squeue_attrs_for_info_attrs(attrs)
466
531
'SE' => :completed , # SPECIAL_EXIT
467
532
'ST' => :running , # STOPPED
468
533
'S' => :suspended , # SUSPENDED
469
- 'TO' => :completed , # TIMEOUT
470
- 'OOM' => :completed # OUT_OF_MEMORY
534
+ 'TO' => :completed , # TIMEOUT
535
+ 'OOM' => :completed , # OUT_OF_MEMORY
536
+
537
+ 'BOOT_FAIL' => :completed ,
538
+ 'CANCELED' => :completed ,
539
+ 'COMPLETED' => :completed ,
540
+ 'DEADLINE' => :completed ,
541
+ 'FAILED' => :completed ,
542
+ 'NODE_FAIL' => :completed ,
543
+ 'OUT_OF_MEMORY' => :completed ,
544
+ 'PENDING' => :queued ,
545
+ 'PREEMPTED' => :completed ,
546
+ 'RUNNING' => :running ,
547
+ 'REQUEUED' => :queued ,
548
+ 'REVOKED' => :completed ,
549
+ 'SUSPENDED' => :suspended ,
550
+ 'TIMEOUT' => :completed ,
471
551
}
472
552
473
553
# @api private
@@ -586,6 +666,45 @@ def info_all(attrs: nil)
586
666
raise JobAdapterError , e . message
587
667
end
588
668
669
+ # Retrieve historic info for all completed jobs from the resource manager.
670
+ #
671
+ # Known options:
672
+ # job_ids [Array<#to_s>] optional list of job ids to filter the results.
673
+ # states [Array<#to_s>] optional list of job state codes.
674
+ # Selects jobs based on their state during the time period given.
675
+ # from [#to_s] optional date string to filter jobs in any state after the specified time.
676
+ # If states are provided, filter jobs in these states after this period
677
+ # to [#to_s] optional date string to filter jobs in any state before the specified time.
678
+ # If states are provided, filter jobs in these states before this period.
679
+ # show_steps [#Boolean] optional boolean to filter job steps from the results.
680
+ #
681
+ # @return [Array<Info>] information describing submitted jobs
682
+ # @see Adapter#info_historic
683
def info_historic(opts: {})
  # Positional arguments for Batch#sacct_info, in the order it expects:
  # job_ids, states, from, to, show_steps. Missing options fall back to
  # "no filter" (empty list / nil) and to hiding job steps.
  sacct_args = [
    opts.fetch(:job_ids, []),
    opts.fetch(:states, []),
    opts.fetch(:from, nil),
    opts.fetch(:to, nil),
    opts.fetch(:show_steps, false)
  ]

  @slurm.sacct_info(*sacct_args).map do |record|
    # Translate one sacct row into adapter-level attributes; the raw row
    # is kept under :native.
    attrs = {
      id: record[:job_id],
      status: get_state(record[:state]),
      job_name: record[:job_name],
      job_owner: record[:user],
      procs: record[:alloc_cpus],
      queue_name: record[:partition],
      wallclock_time: duration_in_seconds(record[:elapsed]),
      wallclock_limit: duration_in_seconds(record[:time_limit]),
      cpu_time: duration_in_seconds(record[:total_cpu]),
      submission_time: parse_time(record[:submit_time]),
      dispatch_time: parse_time(record[:start_time]),
      native: record,
      gpus: self.class.gpus_from_gres(record[:gres])
    }
    Info.new(**attrs)
  end
end
707
+
589
708
# Retrieve job info from the resource manager
590
709
# @param id [#to_s] the id of the job
591
710
# @raise [JobAdapterError] if something goes wrong getting job info
@@ -718,6 +837,13 @@ def seconds_to_duration(time)
718
837
"%02d:%02d:%02d" % [ time /3600 , time /60 %60 , time %60 ]
719
838
end
720
839
840
+ # Parse date time string ignoring unknown values returned by Slurm
841
# Parse a date-time string, ignoring the placeholder values Slurm reports
# when a time is not available.
#
# nil is accepted because `sacct_info` converts blank columns to nil;
# calling `empty?` directly on the argument would raise NoMethodError.
#
# @param date_time [#to_s, nil] raw time value
# @return [Time, nil] the parsed time, or nil when the value is nil, blank,
#   or one of N/A, NONE, UNKNOWN (case-insensitive)
# @raise [ArgumentError] if the value is neither a placeholder nor parseable
def parse_time(date_time)
  text = date_time.to_s.strip
  return nil if text.empty? || %w[N/A NONE UNKNOWN].include?(text.upcase)

  Time.parse(text)
end
846
+
721
847
# Convert host list string to individual nodes
722
848
# "em082"
723
849
# "em[014,055-056,161]"
0 commit comments