diff --git a/lib/cli/commands/config/update.rb b/lib/cli/commands/config/update.rb index 7649382..e436243 100644 --- a/lib/cli/commands/config/update.rb +++ b/lib/cli/commands/config/update.rb @@ -91,6 +91,8 @@ def call(**options) @system_data = Services::SysInfo.all_info end + @system_data = Services::SysInfo.complete_info(@system_data) + spinner.success('(successful)') # New data to file diff --git a/lib/services/interactive_wizard.rb b/lib/services/interactive_wizard.rb index 6f47008..8561f5b 100644 --- a/lib/services/interactive_wizard.rb +++ b/lib/services/interactive_wizard.rb @@ -13,7 +13,83 @@ module Services class InteractiveWizard def system_info config = YAML.load_file(File.expand_path('../../config/config.yaml', __dir__)) - @info = AlcesJob::Services::SysInfo.load_info(config) + + if File.exist?(config['admin_config_file']) + @info = AlcesJob::Services::SysInfo.load_info(config) + else + candidate = AlcesJob::Services::SysInfo.actual_info + @info = if AlcesJob::Services::SysInfo.slurm_available?(candidate) + AlcesJob::Services::SysInfo.complete_info(candidate) + else + prompt_for_system_info + end + end + rescue Errno::ENOENT, Psych::SyntaxError + @info = prompt_for_system_info + end + + def valid_system_info?(info) + info.is_a?(Hash) && info[:nodes].is_a?(Array) && info[:nodes].any? && + info[:partitions].is_a?(Array) && info[:partitions].any? + end + + def prompt_for_system_info + prompt = TTY::Prompt.new + + puts 'Unable to detect a Slurm environment. Please enter fallback cluster configuration for the system you wish to run the script on.' + + partitions = prompt_for_partitions(prompt) + nodes = prompt_for_nodes(prompt) + gpu_total = prompt.ask('How many GPUs are available in total?', default: '0') do |q| + q.validate(/\A\d+\z/) + q.messages[:valid?] = 'Please enter a whole number.' + q.convert :int + end + + { + nodes: nodes, + partitions: partitions, + packages: [], + gpu_total: gpu_total + } + end + + def prompt_for_partitions(prompt) + partition_input = prompt.ask('Partition names (comma-separated)', default: 'default') do |q| + q.required true + end + + partition_names = partition_input.split(',').map(&:strip).reject(&:empty?) + partition_names = ['default'] if partition_names.empty? + + partition_names.map.with_index do |name, index| + time_limit = prompt.ask("Max time for partition #{name}", default: '0-07:00:00') do |q| + q.validate(/^\d+-\d{2}:\d{2}:\d{2}$/) + q.messages[:valid?] = 'Time must be in format D-HH:MM:SS.' + end + + { + partition: name, + time_limit: time_limit, + default: index.zero? + } + end + end + + def prompt_for_nodes(prompt) + max_cpu_cores = prompt.ask('Maximum CPU cores per node', default: '4') do |q| + q.validate(/\A\d+\z/) + q.messages[:valid?] = 'Please enter a whole number.' + q.convert :int + end + + max_memory = prompt.ask('Maximum memory per node (MB)', default: '5000') do |q| + q.validate(/\A\d+\z/) + q.messages[:valid?] = 'Please enter a whole number.' + q.convert :int + end + + [{ node: 'local', cpus: max_cpu_cores, memory: max_memory }] end def valid_slurm_time?(time_string, max_time = '7-00:00:00') diff --git a/lib/services/sysinfo.rb b/lib/services/sysinfo.rb index da80510..29a253f 100644 --- a/lib/services/sysinfo.rb +++ b/lib/services/sysinfo.rb @@ -9,7 +9,7 @@ module SysInfo # Load system information from file if available or grabs info # @return [Hash{nodes: Array, partitions: Array, packages: Array, gpu_total: Integer}] def self.load_info(config) - return YAML.load_file(config['admin_config_file']) if File.exist?(config['admin_config_file']) + return complete_info(YAML.load_file(config['admin_config_file'])) if File.exist?(config['admin_config_file']) all_info end @@ -17,6 +17,10 @@ def self.load_info(config) # Gets all system information # @return [Hash{nodes: Array, partitions: Array, packages: Array, gpu_total: Integer}] def self.all_info + complete_info(actual_info) + end + + def self.actual_info { nodes: node_info, partitions: partition_info, @@ -25,6 +29,44 @@ def self.all_info } end + def self.slurm_available?(info = nil) + info ||= actual_info + info[:nodes].is_a?(Array) && !info[:nodes].empty? && + info[:partitions].is_a?(Array) && !info[:partitions].empty? + end + + def self.complete_info(info) + return default_info unless info.is_a?(Hash) + + { + nodes: info[:nodes] || info['nodes'] || default_nodes, + partitions: info[:partitions] || info['partitions'] || default_partitions, + packages: info[:packages] || info['packages'] || default_packages, + gpu_total: info[:gpu_total] || info['gpu_total'] || 0 + } + end + + def self.default_info + { + nodes: default_nodes, + partitions: default_partitions, + packages: default_packages, + gpu_total: 0 + } + end + + def self.default_nodes + [{ node: 'local', cpus: 4, memory: 5_000 }] + end + + def self.default_partitions + [{ partition: 'default', time_limit: '0-07:00:00', default: true }] + end + + def self.default_packages + [] + end + # Gets the node information # @return [Array<{node: String, cpus: Integer, memory: Integer}>, nil] def self.node_info diff --git a/spec/sysinfo_spec.rb b/spec/sysinfo_spec.rb index 22b5ad8..b89f56a 100644 --- a/spec/sysinfo_spec.rb +++ b/spec/sysinfo_spec.rb @@ -136,7 +136,7 @@ describe '.gpu_info' do it 'returns the GPU count from command output' do allow(Open3).to receive(:capture3) - .with("scontrol show nodes | grep -o 'gpu:[^:]*:[0-9]*' | cut -d':' -f3 | paste -sd+ | bc`") + .with("scontrol show nodes | grep -o 'gpu:[^:]*:[0-9]*' | cut -d':' -f3 | paste -sd+ | bc") .and_return(["4\n", '', success_status]) expect(described_class.gpu_info).to eq(4) @@ -144,7 +144,7 @@ it 'returns zero when the command fails' do allow(Open3).to receive(:capture3) - .with("scontrol show nodes | grep -o 'gpu:[^:]*:[0-9]*' | cut -d':' -f3 | paste -sd+ | bc`") + .with("scontrol show nodes | grep -o 'gpu:[^:]*:[0-9]*' | cut -d':' -f3 | paste -sd+ | bc") .and_return(['', 'error', failure_status]) expect(described_class.gpu_info).to eq(0) @@ -152,7 +152,7 @@ it 'returns zero when scontrol is not installed' do allow(Open3).to receive(:capture3) - .with("scontrol show nodes | grep -o 'gpu:[^:]*:[0-9]*' | cut -d':' -f3 | paste -sd+ | bc`") + .with("scontrol show nodes | grep -o 'gpu:[^:]*:[0-9]*' | cut -d':' -f3 | paste -sd+ | bc") .and_raise(Errno::ENOENT) expect(described_class.gpu_info).to eq(0) @@ -184,5 +184,41 @@ } ) end + + it 'fills missing values with defaults when slurm is unavailable' do + allow(described_class).to receive(:node_info).and_return(nil) + allow(described_class).to receive(:partition_info).and_return(nil) + allow(described_class).to receive(:package_info).and_return(nil) + allow(described_class).to receive(:gpu_info).and_return(0) + + expect(described_class.all_info).to eq( + { + nodes: [{ node: 'local', cpus: 4, memory: 5_000 }], + partitions: [{ partition: 'default', time_limit: '0-07:00:00', default: true }], + packages: [], + gpu_total: 0 + } + ) + end + end + + describe '.slurm_available?' do + it 'returns true when node and partition info exist' do + info = { + nodes: [{ node: 'node01', cpus: 4, memory: 8_000 }], + partitions: [{ partition: 'default', time_limit: '0-07:00:00', default: true }] + } + + expect(described_class.slurm_available?(info)).to be true + end + + it 'returns false when node or partition info is missing' do + info = { + nodes: nil, + partitions: nil + } + + expect(described_class.slurm_available?(info)).to be false + end end end