 - `rng = Random.GLOBAL_RNG`,
 
 By default, `dist` is set to `Categorical`, which means it will only work
-on environments of discrete actions. To work with environments of
+on environments of discrete actions. To work with environments of continuous
+actions, `dist` should be set to `Normal` and the `actor` in the `approximator`
+should be a `GaussianNetwork`. Using it with a `GaussianNetwork` supports
+multi-dimensional action spaces, though only under the assumption that the
+dimensions are independent, since the `GaussianNetwork` outputs a single
+`μ` and `σ` for each dimension, which is used to simplify the calculations.
 """
 mutable struct PPOPolicy{A<:ActorCritic,D,R} <: AbstractPolicy
     approximator::A
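A note on the independence assumption stated in the docstring: for a diagonal Gaussian, the joint log-probability of a multi-dimensional action is exactly the sum of the per-dimension `Normal` log-pdfs, which is what the summations in the hunks below rely on. A minimal sketch with made-up values (Distributions.jl is used only for the illustration; the policy itself does not depend on it):

```julia
using Distributions, LinearAlgebra

μ = [0.1f0, -0.3f0]   # per-dimension means, as a GaussianNetwork would emit
σ = [0.5f0, 1.2f0]    # per-dimension standard deviations
a = [0.0f0, 0.4f0]    # a sampled 2-dimensional action

# Joint log-pdf of the full diagonal Gaussian...
joint = logpdf(MvNormal(μ, Diagonal(σ .^ 2)), a)
# ...equals the sum of independent per-dimension log-pdfs.
summed = sum(logpdf.(Normal.(μ, σ), a))
joint ≈ summed  # true
```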
 function (agent::Agent{<:PPOPolicy})(env::MultiThreadEnv)
     dist = prob(agent.policy, env)
     action = rand.(agent.policy.rng, dist)
-    EnrichedAction(action; action_log_prob = logpdf.(dist, action))
+    if ndims(action) == 2
+        action_log_prob = sum(logpdf.(dist, action), dims = 1)
+    else
+        action_log_prob = logpdf.(dist, action)
+    end
+    EnrichedAction(action; action_log_prob = vec(action_log_prob))
 end
 
 function RLBase.update!(
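For context on the `ndims(action) == 2` branch above: with a `MultiThreadEnv` and a `GaussianNetwork` actor, `dist` and the sampled `action` are `(action_dim, n_envs)` arrays, so the elementwise log-pdfs have to be summed over the action dimension to get one log-probability per environment. A rough sketch of the assumed shapes:

```julia
using Distributions

μ = randn(Float32, 3, 4)            # 3-dimensional actions, 4 parallel envs
σ = ones(Float32, 3, 4)
dist = Normal.(μ, σ)                # (3, 4) array of per-dimension Normals
action = rand.(dist)                # (3, 4) matrix of sampled actions
log_prob = vec(sum(logpdf.(dist, action), dims = 1))  # length 4, one per env
```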
@@ -227,7 +237,7 @@ function _update!(p::PPOPolicy, t::AbstractTrajectory)
     )
     returns = advantages .+ select_last_dim(states_plus_values, 1:n_rollout)
 
-    actions = select_last_dim(t[:action], 1:n)
+    actions_flatten = flatten_batch(select_last_dim(t[:action], 1:n))
     action_log_probs = select_last_dim(t[:action_log_prob], 1:n)
 
     # TODO: normalize advantage
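The switch from `actions` to `actions_flatten` matters once actions are multi-dimensional: `vec(actions)[inds]` (removed in the next hunk) would index individual scalar components, while flattening only the trailing batch dimensions keeps each action column intact. Assuming `flatten_batch` merges the last two dimensions (its behavior in ReinforcementLearningCore at the time of writing), the idea is:

```julia
# (action_dim = 2, n_envs = 3, n_rollout = 4); values are just for illustration
A = reshape(collect(1:24), 2, 3, 4)
flat = reshape(A, 2, :)     # assumed equivalent of flatten_batch(A): (2, 12)
flat[:, 5]                  # one complete 2-dimensional action
# vec(A)[5], by contrast, picks out a single scalar component
```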
@@ -246,7 +256,7 @@ function _update!(p::PPOPolicy, t::AbstractTrajectory)
                @error "TODO:"
            end
            s = send_to_device(D, select_last_dim(states_flatten, inds)) # !!! performance critical
-           a = vec(actions)[inds]
+           a = send_to_device(D, select_last_dim(actions_flatten, inds))
            r = send_to_device(D, vec(returns)[inds])
            log_p = send_to_device(D, vec(action_log_probs)[inds])
            adv = send_to_device(D, vec(advantages)[inds])
@@ -256,8 +266,12 @@ function _update!(p::PPOPolicy, t::AbstractTrajectory)
                v′ = AC.critic(s) |> vec
                if AC.actor isa GaussianNetwork
                    μ, σ = AC.actor(s)
-                   log_p′ₐ = normlogpdf(μ, σ, a)
-                   entropy_loss = mean((log(2.0f0π) + 1) / 2 .+ log.(σ))
+                   if ndims(a) == 2
+                       log_p′ₐ = sum(normlogpdf(μ, σ, a), dims = 1)
+                   else
+                       log_p′ₐ = normlogpdf(μ, σ, a)
+                   end
+                   entropy_loss = mean((log(2.0f0π) + 1) / 2 .+ sum(log.(σ), dims = 1))
                else
                    # actor is assumed to return discrete logits
                    logit′ = AC.actor(s)
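The updated `entropy_loss` line is the closed-form differential entropy of a diagonal Gaussian, `H = Σᵢ ((log(2π) + 1) / 2 + log(σᵢ))`, now summed over action dimensions before taking the batch mean. A quick sanity check of that identity against Distributions.jl:

```julia
using Distributions

σ = [0.5, 1.2]
H_closed = sum((log(2π) + 1) / 2 .+ log.(σ))
H_dist = sum(entropy.(Normal.(0.0, σ)))  # entropy doesn't depend on the mean
H_closed ≈ H_dist  # true
```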