Skip to content

Commit be9ede9

Browse files
authored
Added support to execute sacct to get job historic data for metrics (#857)
* Added support to execute sacct to get job historic data for metrics * Updated sacct call to return an array of info objects * Added custom parse_time method to sacct call * Added info_historic adapter method * Added group_name to info_historic call
1 parent 9c10c77 commit be9ede9

File tree

5 files changed

+194
-2
lines changed

5 files changed

+194
-2
lines changed

lib/ood_core/job/adapter.rb

+12
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,18 @@ def info_all(attrs: nil)
5757
raise NotImplementedError, "subclass did not define #info_all"
5858
end
5959

60+
# Retrieve historic info for all completed jobs from the resource manager.
# This depends on the data retention configuration of the resource manager.
#
# @abstract Subclass is expected to implement {#info_historic}
# @raise [NotImplementedError] if subclass did not define {#info_historic}
#
# @param opts [#to_h] options to filter jobs in the resource manager.
#
# @return [Array<Info>] information describing the jobs
def info_historic(opts: {})
  # The base adapter has no resource manager to query; it always raises.
  raise NotImplementedError, "subclass did not define #info_historic"
end
71+
6072
# Retrieve info for all jobs for a given owner or owners from the
6173
# resource manager
6274
# @param owner [#to_s, Array<#to_s>] the owner(s) of the jobs

lib/ood_core/job/adapters/slurm.rb

+128-2
Original file line numberDiff line numberDiff line change
@@ -323,6 +323,46 @@ def all_squeue_fields
323323
}
324324
end
325325

326+
# Job info fields requested from a formatted `sacct` call.
#
# The insertion order of this hash is significant: the values are joined to
# build the `-o` format argument and the keys are zipped against the columns
# of each output line, so both must stay in the same order.
#
# @return [Hash{Symbol=>String}] result key => `sacct` format field name
def sacct_info_fields
  {
    # The user name of the user who ran the job.
    user: 'User',
    # The group name of the user who ran the job.
    group_name: 'Group',
    # Job Id for reference
    job_id: 'JobId',
    # The name of the job or job step
    job_name: 'JobName',
    # The job's elapsed time.
    elapsed: 'Elapsed',
    # Minimum required memory for the job
    req_mem: 'ReqMem',
    # Count of allocated CPUs
    alloc_cpus: 'AllocCPUS',
    # Number of requested CPUs.
    req_cpus: 'ReqCPUS',
    # What the timelimit was/is for the job
    time_limit: 'Timelimit',
    # Displays the job status, or state
    state: 'State',
    # The sum of the SystemCPU and UserCPU time used by the job or job step
    total_cpu: 'TotalCPU',
    # Maximum resident set size of all tasks in job.
    max_rss: 'MaxRSS',
    # Identifies the partition on which the job ran.
    partition: 'Partition',
    # The time the job was submitted. In the same format as End.
    submit_time: 'Submit',
    # Initiation time of the job. In the same format as End.
    start_time: 'Start',
    # Termination time of the job.
    end: 'End',
    # Trackable resources. These are the minimum resource counts requested by the job/step at submission time.
    gres: 'ReqTRES'
  }
end
365+
326366
def queues
327367
info_raw = call('scontrol', 'show', 'part', '-o')
328368

@@ -357,6 +397,31 @@ def nodes
357397
end.compact
358398
end
359399

400+
# Run `sacct` and return the accounting records it prints, one hash per
# output line, keyed by the symbols of {#sacct_info_fields}.
#
# Blank columns are returned as nil. Lines with no values are skipped.
#
# @param job_ids [Array<#to_s>, #to_s, nil] job ids to filter by (no filter when empty/nil)
# @param states [Array<#to_s>, #to_s, nil] job states to filter by (no filter when empty/nil)
# @param from [#to_s, nil] passed to `sacct -S` when given
# @param to [#to_s, nil] passed to `sacct -E` when given
# @param show_steps [Boolean] when falsy, `--allocations` is passed so that
#   only job allocations (not individual steps) are reported
# @return [Array<Hash{Symbol=>String,nil}>] one hash per reported record
def sacct_info(job_ids, states, from, to, show_steps)
  # https://slurm.schedmd.com/sacct.html
  fields = sacct_info_fields
  # Accept nil or a single value as well as arrays so callers passing an
  # explicit nil filter do not blow up on `.empty?` / `.join`.
  job_ids = Array(job_ids)
  states = Array(states)

  args = ['-P'] # Output will be delimited
  args.concat ['--delimiter', UNIT_SEPARATOR]
  args.concat ['-n'] # No header
  args.concat ['--units', 'G'] # Memory units in GB
  args.concat ['--allocations'] unless show_steps # Show statistics relevant to the job, not taking steps into consideration
  args.concat ['-o', fields.values.join(',')] # Required data
  args.concat ['--state', states.join(',')] unless states.empty? # Filter by these states
  args.concat ['-j', job_ids.join(',')] unless job_ids.empty? # Filter by these job ids
  args.concat ['-S', from.to_s] if from # Filter from this date
  args.concat ['-E', to.to_s] if to # Filter until this date

  call('sacct', *args).each_line.each_with_object([]) do |line, jobs_info|
    # Replace blank values with nil. `split` drops trailing empty columns, so
    # `zip` pads the missing trailing keys with nil as well.
    values = line.strip.split(UNIT_SEPARATOR).map { |value| value.empty? ? nil : value }
    jobs_info << fields.keys.zip(values).to_h unless values.empty?
  end
end
424+
360425
private
361426
def str_to_queue_info(line)
362427
hsh = line.split(' ').map do |token|
@@ -466,8 +531,23 @@ def squeue_attrs_for_info_attrs(attrs)
466531
'SE' => :completed, # SPECIAL_EXIT
467532
'ST' => :running, # STOPPED
468533
'S' => :suspended, # SUSPENDED
469-
'TO' => :completed, # TIMEOUT
470-
'OOM' => :completed # OUT_OF_MEMORY
534+
'TO' => :completed, # TIMEOUT
535+
'OOM' => :completed, # OUT_OF_MEMORY
536+
537+
'BOOT_FAIL' => :completed,
538+
'CANCELED' => :completed,
539+
'COMPLETED' => :completed,
540+
'DEADLINE' => :completed,
541+
'FAILED' => :completed,
542+
'NODE_FAIL' => :completed,
543+
'OUT_OF_MEMORY' => :completed,
544+
'PENDING' => :queued,
545+
'PREEMPTED' => :completed,
546+
'RUNNING' => :running,
547+
'REQUEUED' => :queued,
548+
'REVOKED' => :completed,
549+
'SUSPENDED' => :suspended,
550+
'TIMEOUT' => :completed,
471551
}
472552

473553
# @api private
@@ -586,6 +666,45 @@ def info_all(attrs: nil)
586666
raise JobAdapterError, e.message
587667
end
588668

669+
# Retrieve historic info for jobs from the resource manager's accounting
# data (via `sacct`). Jobs in any state are returned unless `states` is
# given.
#
# Known options:
#   job_ids [Array<#to_s>] optional list of job ids to filter the results.
#   states [Array<#to_s>] optional list of job state codes.
#     Selects jobs based on their state during the time period given.
#   from [#to_s] optional date string to filter jobs in any state after the specified time.
#     If states are provided, filter jobs in these states after this period.
#   to [#to_s] optional date string to filter jobs in any state before the specified time.
#     If states are provided, filter jobs in these states before this period.
#   show_steps [Boolean] optional flag (default false) forwarded to the
#     `sacct` call to control whether job steps are reported.
#
# @param opts [#to_h] options to filter jobs in the resource manager
# @return [Array<Info>] information describing the matching jobs
# @see Adapter#info_historic
def info_historic(opts: {})
  # Honour the documented `#to_h` contract (this also tolerates `opts: nil`).
  opts = opts.to_h
  job_ids = opts.fetch(:job_ids, [])
  states = opts.fetch(:states, [])
  from = opts.fetch(:from, nil)
  to = opts.fetch(:to, nil)
  show_steps = opts.fetch(:show_steps, false)

  # NOTE(review): errors raised by the sacct call propagate unchanged here;
  # confirm whether they should be re-raised as JobAdapterError.
  @slurm.sacct_info(job_ids, states, from, to, show_steps).map do |v|
    Info.new(
      id: v[:job_id],
      status: get_state(v[:state]),
      job_name: v[:job_name],
      job_owner: v[:user],
      procs: v[:alloc_cpus],
      queue_name: v[:partition],
      wallclock_time: duration_in_seconds(v[:elapsed]),
      wallclock_limit: duration_in_seconds(v[:time_limit]),
      cpu_time: duration_in_seconds(v[:total_cpu]),
      submission_time: parse_time(v[:submit_time]),
      dispatch_time: parse_time(v[:start_time]),
      # Keep the raw sacct record available to callers.
      native: v,
      gpus: self.class.gpus_from_gres(v[:gres])
    )
  end
end
707+
589708
# Retrieve job info from the resource manager
590709
# @param id [#to_s] the id of the job
591710
# @raise [JobAdapterError] if something goes wrong getting job info
@@ -718,6 +837,13 @@ def seconds_to_duration(time)
718837
"%02d:%02d:%02d" % [time/3600, time/60%60, time%60]
719838
end
720839

840+
# Parse a date time string, ignoring blank and unknown values returned by Slurm.
#
# @param date_time [#to_s, nil] timestamp text; nil, blank, "N/A", "None"
#   and "Unknown" (any letter case) are treated as "no value"
# @return [Time, nil] the parsed time, or nil when there is no usable value
# @raise [ArgumentError] if the text is present but is not a parsable time
def parse_time(date_time)
  # Blank columns arrive as nil, so normalise before calling String methods.
  value = date_time.to_s.strip
  return nil if value.empty? || %w[N/A NONE UNKNOWN].include?(value.upcase)

  Time.parse(value)
end
846+
721847
# Convert host list string to individual nodes
722848
# "em082"
723849
# "em[014,055-056,161]"

spec/fixtures/scripts/sacct.rb

+4
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
#!/usr/bin/env ruby

# Fake `sacct` used by the specs: prints two records whose columns are
# separated by the unit separator character (U+001F), with no header line.
#
# Column order, as requested by the Slurm adapter:
#   User, Group, JobId, JobName, Elapsed, ReqMem, AllocCPUS, ReqCPUS,
#   Timelimit, State, TotalCPU, MaxRSS, Partition, Submit, Start, End, ReqTRES

# Pending job requesting one GPU; MaxRSS is blank and Start/End are "Unknown".
puts "ood\u001Food\u001F20251\u001FRDesktop\u001F00:00:00\u001F0.98G\u001F2\u001F2\u001F01:00:00\u001FPENDING\u001F00:00:00\u001F\u001Fnormal\u001F2025-01-03T16:54:57\u001FUnknown\u001FUnknown\u001Fbilling=2,cpu=2,gpu=1,mem=0.98G,node=2"
# Running job without GPUs; MaxRSS is blank and End is "Unknown".
puts "ood\u001Food\u001F20252\u001FRStudio\u001F00:13:47\u001F0.49G\u001F1\u001F1\u001F01:00:00\u001FRUNNING\u001F00:00:00\u001F\u001Fnormal\u001F2025-01-03T16:55:37\u001F2025-01-03T16:55:37\u001FUnknown\u001Fbilling=1,cpu=1,mem=0.49G,node=1"

spec/job/adapter_spec.rb

+7
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66

77
it { is_expected.to respond_to(:submit).with(1).argument.and_keywords(:after, :afterok, :afternotok, :afterany) }
88
it { is_expected.to respond_to(:info_all).with(0).arguments.and_keywords(:attrs) }
9+
it { is_expected.to respond_to(:info_historic).with(0).arguments.and_keywords(:opts) }
910
it { is_expected.to respond_to(:info_where_owner).with(1).argument.and_keywords(:attrs) }
1011
it { is_expected.to respond_to(:info).with(1).argument }
1112
it { is_expected.to respond_to(:status).with(1).argument }
@@ -34,6 +35,12 @@
3435
end
3536
end
3637

38+
# The abstract adapter must refuse to answer and name the missing method.
describe "#info_historic" do
  it "raises NotImplementedError" do
    expect { subject.info_historic }.to raise_error(NotImplementedError, /info_historic/)
  end
end
43+
3744
describe "#info_where_owner" do
3845
let(:bob_job) { double("bob", job_owner: "bob") }
3946
let(:sam_job) { double("sam", job_owner: "sam") }

spec/job/adapters/slurm_spec.rb

+43
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010

1111
it { is_expected.to respond_to(:submit).with(1).argument.and_keywords(:after, :afterok, :afternotok, :afterany) }
1212
it { is_expected.to respond_to(:info_all).with(0).arguments.and_keywords(:attrs) }
13+
it { is_expected.to respond_to(:info_historic).with(0).arguments.and_keywords(:opts) }
1314
it { is_expected.to respond_to(:info_where_owner).with(1).argument.and_keywords(:attrs) }
1415
it { is_expected.to respond_to(:info).with(1).argument }
1516
it { is_expected.to respond_to(:status).with(1).argument }
@@ -358,6 +359,48 @@ def build_script(opts = {})
358359
end
359360
end
360361

362+
# Exercises Slurm#info_historic against a stubbed batch object and against
# the fake `sacct` fixture script.
describe "#info_historic" do
  context "when no jobs" do
    it "returns an empty array" do
      adapter = OodCore::Job::Adapters::Slurm.new(slurm: double(sacct_info: []))
      expect(adapter.info_historic).to eq([])
    end
  end

  context "when jobs" do
    it "returns an array of all the jobs" do
      # Route `sacct` to the fixture script so no real Slurm install is needed.
      batch = OodCore::Job::Adapters::Slurm::Batch.new(
        conf: "/etc/slurm/conf/",
        bin: nil,
        bin_overrides: { "sacct" => "spec/fixtures/scripts/sacct.rb" }
      )
      jobs = OodCore::Job::Adapters::Slurm.new(slurm: batch).info_historic

      expect(jobs.count).to eq(2)

      # First fixture record: pending job that requested one GPU.
      j1 = jobs.first
      expect(j1.id).to eq("20251")
      expect(j1.job_name).to eq("RDesktop")
      expect(j1.queue_name).to eq("normal")
      expect(j1.status).to eq("queued")
      expect(j1.status).to eq(OodCore::Job::Status.new(state: :queued))
      expect(j1.status.to_s).to eq("queued")
      expect(j1.gpus).to eq(1)
      expect(j1.gpu?).to eq(true)

      # Second fixture record: running job without GPUs.
      j2 = jobs.last
      expect(j2.id).to eq("20252")
      expect(j2.job_name).to eq("RStudio")
      expect(j2.queue_name).to eq("normal")
      expect(j2.status).to eq("running")
      expect(j2.status).to eq(OodCore::Job::Status.new(state: :running))
      expect(j2.status.to_s).to eq("running")
      expect(j2.gpus).to eq(0)
      expect(j2.gpu?).to eq(false)
    end
  end
end
403+
361404
describe "#info" do
362405
def job_info(opts = {})
363406
OodCore::Job::Info.new(

0 commit comments

Comments
 (0)