diff --git a/src/bellman.jl b/src/bellman.jl index 2141673..a1a81c8 100644 --- a/src/bellman.jl +++ b/src/bellman.jl @@ -1,5 +1,5 @@ """ - bellman(V, model; upper_bound = false, maximize = true) + expectation(V, model; upper_bound = false, maximize = true) Compute robust Bellman update with the value function `V` and the model `model`, e.g. [`IntervalMarkovDecisionProcess`](@ref), that upper or lower bounds the expectation of the value function `V`. @@ -56,7 +56,7 @@ istates = [Int32(1)] model = IntervalMarkovDecisionProcess(transition_probs, istates) Vprev = [1.0, 2.0, 3.0] -Vcur = IntervalMDP.bellman(Vprev, model; upper_bound = false) +Vcur = IntervalMDP.expectation(Vprev, model; upper_bound = false) # output @@ -71,7 +71,7 @@ Vcur = IntervalMDP.bellman(Vprev, model; upper_bound = false) For a hot-loop, it is more efficient to use `bellman!` and pass in pre-allocated objects. """ -function bellman( +function expectation( V, model, alg::BellmanAlgorithm = default_bellman_algorithm(model); @@ -79,7 +79,7 @@ function bellman( maximize = true, prop = nothing, ) - Vres = similar(V, source_shape(model)) + Vres = similar(V, (action_values(model)..., state_values(model)...)) return expectation!( Vres, @@ -451,17 +451,9 @@ Base.@propagate_inbounds function state_expectation!( for jₐ in available(model, jₛ) ambiguity_set = marginal[jₐ, jₛ] budget = workspace.budget[sub2ind(marginal, jₐ, jₛ)] - workspace.actions[jₐ] = + Vres[jₐ, jₛ] = state_action_expectation(workspace, V, ambiguity_set, budget, upper_bound) end - - Vres[jₛ] = extract_strategy!( - strategy_cache, - workspace.actions, - available(model, jₛ), - jₛ, - maximize, - ) end diff --git a/src/robust_value_iteration.jl b/src/robust_value_iteration.jl index 1207bd5..252099c 100644 --- a/src/robust_value_iteration.jl +++ b/src/robust_value_iteration.jl @@ -205,7 +205,35 @@ function _value_iteration!(problem::AbstractIntervalMDPProblem, alg; callback = return value_function.current, k,
value_function.previous, strategy_cache end -function bellman_update!(workspace, strategy_cache, value_function, k, mp, spec) +function bellman_update!(workspace, strategy_cache, value_function::StateValueFunction, k, mp, spec) + + # 1. compute expectation for Q(s, a) + expectation!( + workspace, + select_strategy_cache(strategy_cache, k), + value_function.intermediate_state_action_value, + value_function.previous, + select_model(mp, k); # For time-varying available and labelling functions + upper_bound = isoptimistic(spec), + maximize = ismaximize(spec), + prop = system_property(spec), + ) + + # 2. extract strategy and compute V(s) = max_a Q(s, a) + strategy!( + select_strategy_cache(strategy_cache, k), + value_function.current, + value_function.intermediate_state_action_value, + select_model(mp, k), + ismaximize(spec), + ) + + # 3. post process + step_postprocess_value_function!(value_function, spec) + step_postprocess_strategy_cache!(strategy_cache) +end + +function bellman_update!(workspace, strategy_cache::NonOptimizingStrategyCache, value_function::StateValueFunction, k, mp, spec) expectation!( workspace, select_strategy_cache(strategy_cache, k), diff --git a/src/strategy_cache.jl b/src/strategy_cache.jl index 0da47b1..209dc40 100644 --- a/src/strategy_cache.jl +++ b/src/strategy_cache.jl @@ -159,3 +159,26 @@ function _extract_strategy!(cur_strategy, values, available_actions, neutral, j @inbounds cur_strategy[jₛ] = opt_index return opt_val end + + +function strategy!( + strategy_cache::OptimizingStrategyCache, + Vres::AbstractArray{R}, + Q::AbstractArray{R}, + model, + maximize, +) where {R <: Real} + + #TODO: can be threaded? 
+ for jₛ in CartesianIndices(source_shape(model)) + + Vres[jₛ] = extract_strategy!( + strategy_cache, + view(Q, :, jₛ), + available(model, jₛ), + jₛ, + maximize, + ) + + end +end \ No newline at end of file diff --git a/src/value.jl b/src/value.jl index cd6c18e..d0613b6 100644 --- a/src/value.jl +++ b/src/value.jl @@ -12,8 +12,8 @@ function StateValueFunction(problem::AbstractIntervalMDPProblem) previous .= zero(valuetype(mp)) current = copy(previous) - dim = Tuple(Iterators.flatten(zip(action_values(mp), state_values(mp)))) - # interleaved concat gives shape: (a1, a2) , (s1, s2) => (a1, s1, a2, s2) + dim = (action_values(mp)..., state_values(mp)...) + # concat gives shape: (a1, a2) , (s1, s2) => (a1, a2, s1, s2) # (a, s) to access s more frequently due to column major # TODO: works for IMDP, need to check for fIMDP intermediate_state_action_value = arrayfactory(mp, valuetype(mp), dim) @@ -45,7 +45,7 @@ end function StateActionValueFunction(problem::AbstractIntervalMDPProblem) mp = system(problem) - dim = Tuple(Iterators.flatten(zip(action_values(mp), state_values(mp)))) + dim = (action_values(mp)..., state_values(mp)...)
# TODO: works for IMDP, need to check for fIMDP previous = arrayfactory(mp, valuetype(mp), dim) previous .= zero(valuetype(mp)) diff --git a/src/workspace.jl b/src/workspace.jl index c36ca4c..56a7b4b 100644 --- a/src/workspace.jl +++ b/src/workspace.jl @@ -53,7 +53,6 @@ struct DenseIntervalOMaxWorkspace{T <: Real} budget::Vector{T} scratch::Vector{Int32} permutation::Vector{Int32} - actions::Vector{T} end function DenseIntervalOMaxWorkspace( @@ -63,8 +62,7 @@ function DenseIntervalOMaxWorkspace( budget = 1 .- vec(sum(ambiguity_set.lower; dims = 1)) scratch = Vector{Int32}(undef, num_target(ambiguity_set)) perm = Vector{Int32}(undef, num_target(ambiguity_set)) - actions = Vector{R}(undef, nactions) - return DenseIntervalOMaxWorkspace(budget, scratch, perm, actions) + return DenseIntervalOMaxWorkspace(budget, scratch, perm) end permutation(ws::DenseIntervalOMaxWorkspace) = ws.permutation @@ -83,7 +81,7 @@ function ThreadedDenseIntervalOMaxWorkspace( perm = Vector{Int32}(undef, num_target(ambiguity_set)) workspaces = [ - DenseIntervalOMaxWorkspace(budget, scratch, perm, Vector{R}(undef, nactions)) + DenseIntervalOMaxWorkspace(budget, scratch, perm) for _ in 1:Threads.nthreads() ] return ThreadedDenseIntervalOMaxWorkspace(workspaces) diff --git a/test/base/bellman.jl b/test/base/bellman.jl index 9fb1afd..e981d23 100644 --- a/test/base/bellman.jl +++ b/test/base/bellman.jl @@ -13,41 +13,41 @@ using IntervalMDP @testset "maximization" begin ws = IntervalMDP.construct_workspace(prob) strategy_cache = IntervalMDP.construct_strategy_cache(prob) - Vres = zeros(N, 2) + Vres = zeros(N, 1, 2) IntervalMDP._expectation_helper!(ws, strategy_cache, Vres, V, prob; upper_bound = true) - @test Vres ≈ N[27 // 10, 17 // 10] # [0.3 * 2 + 0.7 * 3, 0.5 * 1 + 0.3 * 2 + 0.2 * 3] + @test Vres ≈ N[27 // 10, 17 // 10]' # [0.3 * 2 + 0.7 * 3, 0.5 * 1 + 0.3 * 2 + 0.2 * 3] ws = IntervalMDP.DenseIntervalOMaxWorkspace(prob, 1) strategy_cache = IntervalMDP.construct_strategy_cache(prob) Vres = 
similar(Vres) IntervalMDP._expectation_helper!(ws, strategy_cache, Vres, V, prob; upper_bound = true) - @test Vres ≈ N[27 // 10, 17 // 10] + @test Vres ≈ N[27 // 10, 17 // 10]' ws = IntervalMDP.ThreadedDenseIntervalOMaxWorkspace(prob, 1) strategy_cache = IntervalMDP.construct_strategy_cache(prob) Vres = similar(Vres) IntervalMDP._expectation_helper!(ws, strategy_cache, Vres, V, prob; upper_bound = true) - @test Vres ≈ N[27 // 10, 17 // 10] + @test Vres ≈ N[27 // 10, 17 // 10]' end #### Minimization @testset "minimization" begin ws = IntervalMDP.construct_workspace(prob) strategy_cache = IntervalMDP.construct_strategy_cache(prob) - Vres = zeros(N, 2) + Vres = zeros(N, 1, 2) IntervalMDP._expectation_helper!(ws, strategy_cache, Vres, V, prob; upper_bound = false) - @test Vres ≈ N[17 // 10, 15 // 10] # [0.5 * 1 + 0.3 * 2 + 0.2 * 3, 0.6 * 1 + 0.3 * 2 + 0.1 * 3] + @test Vres ≈ N[17 // 10, 15 // 10]' # [0.5 * 1 + 0.3 * 2 + 0.2 * 3, 0.6 * 1 + 0.3 * 2 + 0.1 * 3] ws = IntervalMDP.DenseIntervalOMaxWorkspace(prob, 1) strategy_cache = IntervalMDP.construct_strategy_cache(prob) Vres = similar(Vres) IntervalMDP._expectation_helper!(ws, strategy_cache, Vres, V, prob; upper_bound = false) - @test Vres ≈ N[17 // 10, 15 // 10] + @test Vres ≈ N[17 // 10, 15 // 10]' ws = IntervalMDP.ThreadedDenseIntervalOMaxWorkspace(prob, 1) strategy_cache = IntervalMDP.construct_strategy_cache(prob) Vres = similar(Vres) IntervalMDP._expectation_helper!(ws, strategy_cache, Vres, V, prob; upper_bound = false) - @test Vres ≈ N[17 // 10, 15 // 10] + @test Vres ≈ N[17 // 10, 15 // 10]' end end diff --git a/test/base/factored.jl b/test/base/factored.jl index 1fb5264..526e0b3 100644 --- a/test/base/factored.jl +++ b/test/base/factored.jl @@ -424,7 +424,7 @@ end end @testset "maximization" begin - Vexpected = IntervalMDP.bellman(V, imc; upper_bound = true) # Using O-maximization, should be equivalent + Vexpected = IntervalMDP.expectation(V, imc; upper_bound = true) # Using O-maximization, should be 
equivalent ws = IntervalMDP.construct_workspace(imc, LPMcCormickRelaxation()) strategy_cache = IntervalMDP.construct_strategy_cache(imc) @@ -468,7 +468,7 @@ end end @testset "minimization" begin - Vexpected = IntervalMDP.bellman(V, imc; upper_bound = false) # Using O-maximization, should be equivalent + Vexpected = IntervalMDP.expectation(V, imc; upper_bound = false) # Using O-maximization, should be equivalent ws = IntervalMDP.construct_workspace(imc, LPMcCormickRelaxation()) strategy_cache = IntervalMDP.construct_strategy_cache(imc) diff --git a/test/base/imdp.jl b/test/base/imdp.jl index 7480faa..c81723c 100644 --- a/test/base/imdp.jl +++ b/test/base/imdp.jl @@ -49,14 +49,14 @@ using IntervalMDP @testset "bellman" begin V = N[1, 2, 3] - Vres = IntervalMDP.bellman(V, mdp; upper_bound = false, maximize = true) + Vres = IntervalMDP.expectation(V, mdp; upper_bound = false, maximize = true) @test Vres ≈ N[ (1 // 2) * 1 + (3 // 10) * 2 + (1 // 5) * 3, (3 // 10) * 1 + (3 // 10) * 2 + (2 // 5) * 3, 1 * 3, ] - Vres = similar(Vres) + Vres = Array{N}(undef, (3, 3)) IntervalMDP.expectation!(Vres, V, mdp; upper_bound = false, maximize = true) @test Vres ≈ N[ (1 // 2) * 1 + (3 // 10) * 2 + (1 // 5) * 3, @@ -64,14 +64,14 @@ using IntervalMDP 1 * 3, ] - Vres = IntervalMDP.bellman(V, mdp; upper_bound = true, maximize = false) + Vres = IntervalMDP.expectation(V, mdp; upper_bound = true, maximize = false) @test Vres ≈ N[ (1 // 2) * 1 + (3 // 10) * 2 + (1 // 5) * 3, (1 // 5) * 1 + (2 // 5) * 2 + (2 // 5) * 3, 1 * 3, ] - Vres = similar(Vres) + Vres = Array{N}(undef, (3, 3)) IntervalMDP.expectation!(Vres, V, mdp; upper_bound = true, maximize = false) @test Vres ≈ N[ (1 // 2) * 1 + (3 // 10) * 2 + (1 // 5) * 3, diff --git a/test/base/product.jl b/test/base/product.jl index 77edbfd..bd7b62a 100644 --- a/test/base/product.jl +++ b/test/base/product.jl @@ -137,7 +137,7 @@ end 0 5 ] - Vres = IntervalMDP.bellman(V, prod_proc; upper_bound = false) + Vres = IntervalMDP.expectation(V, 
prod_proc; upper_bound = false) @test Vres ≈ N[ 30//10 24//10 @@ -220,7 +220,7 @@ end ] # No Strategy - Vres = IntervalMDP.bellman(V, prod_proc; upper_bound = false) + Vres = IntervalMDP.expectation(V, prod_proc; upper_bound = false) @test Vtar ≈ Vres atol=eps # Non Stationary Strategy (Init iteration) @@ -421,7 +421,7 @@ end 45//10 5 ] - Vres = IntervalMDP.bellman(V, prod_proc; upper_bound = false) + Vres = IntervalMDP.expectation(V, prod_proc; upper_bound = false) @test Vres ≈ Vtar end end @@ -504,7 +504,7 @@ end ] # No Strategy - Vres = IntervalMDP.bellman(V, prod_proc; upper_bound = false) + Vres = IntervalMDP.expectation(V, prod_proc; upper_bound = false) @test Vtar ≈ Vres atol=eps # Non Stationary Strategy diff --git a/test/cuda/dense/imdp.jl b/test/cuda/dense/imdp.jl index d70f748..67bb4e0 100644 --- a/test/cuda/dense/imdp.jl +++ b/test/cuda/dense/imdp.jl @@ -48,7 +48,7 @@ using IntervalMDP, CUDA @testset "bellman" begin V = IntervalMDP.cu(N[1, 2, 3]) - Vres = IntervalMDP.bellman(V, mdp; upper_bound = false, maximize = true) + Vres = IntervalMDP.expectation(V, mdp; upper_bound = false, maximize = true) Vres = IntervalMDP.cpu(Vres) # Convert to CPU for testing @test Vres ≈ N[ (1 // 2) * 1 + (3 // 10) * 2 + (1 // 5) * 3, diff --git a/test/cuda/sparse/imdp.jl b/test/cuda/sparse/imdp.jl index a68301b..39fd616 100644 --- a/test/cuda/sparse/imdp.jl +++ b/test/cuda/sparse/imdp.jl @@ -51,7 +51,7 @@ using IntervalMDP, CUDA, SparseArrays @testset "bellman" begin V = IntervalMDP.cu(N[1, 2, 3]) - Vres = IntervalMDP.bellman(V, mdp; upper_bound = false, maximize = true) + Vres = IntervalMDP.expectation(V, mdp; upper_bound = false, maximize = true) Vres = IntervalMDP.cpu(Vres) # Convert to CPU for testing @test Vres ≈ N[ (1 // 2) * 1 + (3 // 10) * 2 + (1 // 5) * 3, diff --git a/test/sparse/factored.jl b/test/sparse/factored.jl index 7291c6f..4baa401 100644 --- a/test/sparse/factored.jl +++ b/test/sparse/factored.jl @@ -90,7 +90,7 @@ end end @testset "maximization" begin - 
Vexpected = IntervalMDP.bellman(V, imc; upper_bound = true) # Using O-maximization, should be equivalent + Vexpected = IntervalMDP.expectation(V, imc; upper_bound = true) # Using O-maximization, should be equivalent ws = IntervalMDP.construct_workspace(imc, LPMcCormickRelaxation()) strategy_cache = IntervalMDP.construct_strategy_cache(imc) @@ -134,7 +134,7 @@ end end @testset "minimization" begin - Vexpected = IntervalMDP.bellman(V, imc; upper_bound = false) # Using O-maximization, should be equivalent + Vexpected = IntervalMDP.expectation(V, imc; upper_bound = false) # Using O-maximization, should be equivalent ws = IntervalMDP.construct_workspace(imc, LPMcCormickRelaxation()) strategy_cache = IntervalMDP.construct_strategy_cache(imc) diff --git a/test/sparse/imdp.jl b/test/sparse/imdp.jl index 9b5ac4d..63ff30a 100644 --- a/test/sparse/imdp.jl +++ b/test/sparse/imdp.jl @@ -51,7 +51,7 @@ using IntervalMDP, SparseArrays @testset "bellman" begin V = N[1, 2, 3] - Vres = IntervalMDP.bellman(V, mdp; upper_bound = false, maximize = true) + Vres = IntervalMDP.expectation(V, mdp; upper_bound = false, maximize = true) @test Vres ≈ N[ (1 // 2) * 1 + (3 // 10) * 2 + (1 // 5) * 3, (3 // 10) * 1 + (3 // 10) * 2 + (2 // 5) * 3,