Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions lib/cli/commands/config/update.rb
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,8 @@ def call(**options)
@system_data = Services::SysInfo.all_info
end

@system_data = Services::SysInfo.complete_info(@system_data)

spinner.success('(successful)')

# New data to file
Expand Down
78 changes: 77 additions & 1 deletion lib/services/interactive_wizard.rb
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,83 @@ module Services
class InteractiveWizard
# Resolves the cluster description used by the wizard and stores it in @info.
#
# Resolution order:
# 1. the admin config file named by config['admin_config_file'], when it exists
#    (loaded through SysInfo.load_info);
# 2. live information from SysInfo.actual_info, when SysInfo.slurm_available?
#    accepts it (completed through SysInfo.complete_info);
# 3. values entered interactively via prompt_for_system_info.
#
# Errno::ENOENT or Psych::SyntaxError raised anywhere in the method also
# falls back to the interactive prompt.
#
# @return [Hash] the resolved system information
def system_info
  config = YAML.load_file(File.expand_path('../../config/config.yaml', __dir__))
  sys_info = AlcesJob::Services::SysInfo

  # load_info is only consulted when the admin file is present; calling it
  # unconditionally would gather live data a second time for nothing.
  @info =
    if File.exist?(config['admin_config_file'])
      sys_info.load_info(config)
    else
      candidate = sys_info.actual_info
      sys_info.slurm_available?(candidate) ? sys_info.complete_info(candidate) : prompt_for_system_info
    end
rescue Errno::ENOENT, Psych::SyntaxError
  @info = prompt_for_system_info
end

# Checks that +info+ describes at least one node and one partition.
#
# @param info [Object] candidate system information
# @return [Boolean] true only when info is a Hash whose :nodes and
#   :partitions values are Arrays that each hold at least one truthy element
def valid_system_info?(info)
  return false unless info.is_a?(Hash)

  %i[nodes partitions].all? do |key|
    entries = info[key]
    entries.is_a?(Array) && entries.any?
  end
end

# Interactively collects a fallback cluster description from the user.
#
# Prints an explanatory notice, then asks for partitions, node limits and the
# total GPU count (in that order) on a single TTY::Prompt instance.
#
# @return [Hash{nodes: Array<Hash>, partitions: Array<Hash>, packages: Array, gpu_total: Object}]
#   packages is always an empty array; gpu_total is whatever the prompt returns
def prompt_for_system_info
  tty = TTY::Prompt.new

  puts 'Unable to detect a Slurm environment. Please enter fallback cluster configuration for the system you wish to run the script on.'

  partition_list = prompt_for_partitions(tty)
  node_list = prompt_for_nodes(tty)

  # Digits only, converted by the prompt's :int conversion.
  gpu_count = tty.ask('How many GPUs are available in total?', default: '0') do |question|
    question.validate(/\A\d+\z/)
    question.messages[:valid?] = 'Please enter a whole number.'
    question.convert :int
  end

  { nodes: node_list, partitions: partition_list, packages: [], gpu_total: gpu_count }
end

# Asks for a comma-separated list of partition names, then for each
# partition's maximum run time.
#
# Names are stripped, blank entries are dropped and duplicates are collapsed
# so the same partition is never prompted for twice. If nothing usable
# remains, a single partition named 'default' is used. The first partition in
# the list is flagged as the default one.
#
# @param prompt [#ask] prompt object (a TTY::Prompt in production)
# @return [Array<Hash{partition: String, time_limit: String, default: Boolean}>]
def prompt_for_partitions(prompt)
  partition_input = prompt.ask('Partition names (comma-separated)', default: 'default') do |q|
    q.required true
  end

  # to_s keeps a nil answer from raising; the fallback below then applies.
  partition_names = partition_input.to_s.split(',').map(&:strip).reject(&:empty?).uniq
  partition_names = ['default'] if partition_names.empty?

  partition_names.each_with_index.map do |name, index|
    time_limit = prompt.ask("Max time for partition #{name}", default: '0-07:00:00') do |q|
      # \A and \z anchor the whole answer; ^ and $ would accept any input that
      # merely contains one well-formed line.
      q.validate(/\A\d+-\d{2}:\d{2}:\d{2}\z/)
      q.messages[:valid?] = 'Time must be in format D-HH:MM:SS.'
    end

    { partition: name, time_limit: time_limit, default: index.zero? }
  end
end

# Asks for the per-node CPU and memory limits and wraps them in a single
# node entry named 'local'.
#
# @param prompt [#ask] prompt object (a TTY::Prompt in production)
# @return [Array<Hash{node: String, cpus: Object, memory: Object}>] one-element
#   array; cpus and memory are whatever the prompt returns for each question
def prompt_for_nodes(prompt)
  # Shared question setup: digits only, converted via the prompt's :int conversion.
  whole_number = proc do |question|
    question.validate(/\A\d+\z/)
    question.messages[:valid?] = 'Please enter a whole number.'
    question.convert :int
  end

  cores = prompt.ask('Maximum CPU cores per node', default: '4', &whole_number)
  memory_mb = prompt.ask('Maximum memory per node (MB)', default: '5000', &whole_number)

  [{ node: 'local', cpus: cores, memory: memory_mb }]
end

def valid_slurm_time?(time_string, max_time = '7-00:00:00')
Expand Down
44 changes: 43 additions & 1 deletion lib/services/sysinfo.rb
Original file line number Diff line number Diff line change
Expand Up @@ -9,14 +9,18 @@ module SysInfo
# Loads system information from the admin config file when it exists,
# otherwise gathers it via all_info. File contents are passed through
# complete_info so that keys missing from the file are filled with defaults.
#
# @param config [Hash] must provide the file path under 'admin_config_file'
# @return [Hash{nodes: Array<Hash>, partitions: Array<Hash>, packages: Array<String>, gpu_total: Integer}]
def self.load_info(config)
  admin_config_file = config['admin_config_file']
  return complete_info(YAML.load_file(admin_config_file)) if File.exist?(admin_config_file)

  all_info
end

# Gathers system information via actual_info and passes it through
# complete_info.
# @return [Hash{nodes: Array<Hash>, partitions: Array<Hash>, packages: Array<String>, gpu_total: Integer}]
def self.all_info
  gathered = actual_info
  complete_info(gathered)
end

def self.actual_info
{
nodes: node_info,
partitions: partition_info,
Expand All @@ -25,6 +29,44 @@ def self.all_info
}
end

# Reports whether the given (or freshly gathered) information contains both
# node and partition data.
#
# @param info [Hash, nil] information to inspect; actual_info is used when
#   this is nil or false
# @return [Boolean] true when info[:nodes] and info[:partitions] are both
#   non-empty Arrays
def self.slurm_available?(info = nil)
  info = actual_info unless info

  %i[nodes partitions].all? do |key|
    entries = info[key]
    entries.is_a?(Array) && !entries.empty?
  end
end

# Normalises system information into a symbol-keyed hash, filling any missing
# entry with its default.
#
# Each value is looked up under the symbol key first, then the string key;
# the default applies only when both are nil or false, so an empty array is
# kept as-is.
#
# @param info [Object] gathered or loaded information; anything that is not a
#   Hash yields default_info
# @return [Hash] hash with the keys :nodes, :partitions, :packages and :gpu_total
def self.complete_info(info)
  return default_info unless info.is_a?(Hash)

  lookup = ->(key) { info[key] || info[key.to_s] }

  {
    nodes: lookup.call(:nodes) || default_nodes,
    partitions: lookup.call(:partitions) || default_partitions,
    packages: lookup.call(:packages) || default_packages,
    gpu_total: lookup.call(:gpu_total) || 0
  }
end

# Builds the complete fallback description from the individual defaults.
# @return [Hash] default nodes, partitions and packages with gpu_total 0
def self.default_info
  fallback = {
    nodes: default_nodes,
    partitions: default_partitions,
    packages: default_packages
  }

  fallback.merge(gpu_total: 0)
end

# Fallback node list: a single node named 'local' with 4 CPUs and a memory
# value of 5_000. A new array is built on every call.
# @return [Array<Hash{node: String, cpus: Integer, memory: Integer}>]
def self.default_nodes
  [
    {
      node: 'local',
      cpus: 4,
      memory: 5_000
    }
  ]
end

# Fallback partition list: a single partition named 'default', flagged as the
# default, with a time limit of '0-07:00:00'. A new array is built on every call.
# @return [Array<Hash{partition: String, time_limit: String, default: Boolean}>]
def self.default_partitions
  [
    {
      partition: 'default',
      time_limit: '0-07:00:00',
      default: true
    }
  ]
end

# Fallback package list used when no package information is available.
# A new, empty array is returned on every call.
# @return [Array] always empty
def self.default_packages
[]
end

# Gets the node information
# @return [Array<{node: String, cpus: Integer, memory: Integer}>, nil]
def self.node_info
Expand Down
42 changes: 39 additions & 3 deletions spec/sysinfo_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -136,23 +136,23 @@
describe '.gpu_info' do
it 'returns the GPU count from command output' do
  # Stub the shell pipeline with a successful run that prints "4".
  # Only one argument constraint is kept: the earlier variant ended in a
  # stray backtick.
  allow(Open3).to receive(:capture3)
    .with("scontrol show nodes | grep -o 'gpu:[^:]*:[0-9]*' | cut -d':' -f3 | paste -sd+ | bc")
    .and_return(["4\n", '', success_status])

  expect(described_class.gpu_info).to eq(4)
end

it 'returns zero when the command fails' do
  # Stub the shell pipeline with a failing status and error output.
  # Only one argument constraint is kept: the earlier variant ended in a
  # stray backtick.
  allow(Open3).to receive(:capture3)
    .with("scontrol show nodes | grep -o 'gpu:[^:]*:[0-9]*' | cut -d':' -f3 | paste -sd+ | bc")
    .and_return(['', 'error', failure_status])

  expect(described_class.gpu_info).to eq(0)
end

it 'returns zero when scontrol is not installed' do
allow(Open3).to receive(:capture3)
.with("scontrol show nodes | grep -o 'gpu:[^:]*:[0-9]*' | cut -d':' -f3 | paste -sd+ | bc`")
.with("scontrol show nodes | grep -o 'gpu:[^:]*:[0-9]*' | cut -d':' -f3 | paste -sd+ | bc")
.and_raise(Errno::ENOENT)

expect(described_class.gpu_info).to eq(0)
Expand Down Expand Up @@ -184,5 +184,41 @@
}
)
end

it 'fills missing values with defaults when slurm is unavailable' do
  # Every probe comes back empty: nil for the lists, zero GPUs.
  allow(described_class).to receive_messages(
    node_info: nil,
    partition_info: nil,
    package_info: nil,
    gpu_info: 0
  )

  fallback = {
    nodes: [{ node: 'local', cpus: 4, memory: 5_000 }],
    partitions: [{ partition: 'default', time_limit: '0-07:00:00', default: true }],
    packages: [],
    gpu_total: 0
  }

  expect(described_class.all_info).to eq(fallback)
end
end

# slurm_available? is exercised with explicit hashes so no system probing happens.
describe '.slurm_available?' do
  it 'returns true when node and partition info exist' do
    detected = {
      nodes: [{ node: 'node01', cpus: 4, memory: 8_000 }],
      partitions: [{ partition: 'default', time_limit: '0-07:00:00', default: true }]
    }

    available = described_class.slurm_available?(detected)

    expect(available).to be(true)
  end

  it 'returns false when node or partition info is missing' do
    undetected = { nodes: nil, partitions: nil }

    available = described_class.slurm_available?(undetected)

    expect(available).to be(false)
  end
end
end
Loading