export MACLearner

using Flux
using Zygote
using Statistics: mean
| 5 | +""" |
| 6 | + MACLearner(;kwargs...) |
| 7 | + Keyword arguments |
| 8 | +- `approximator`::[`ActorCritic`](@ref) |
| 9 | +- `γ::Float32`, reward discount rate |
| 10 | +- `bootstrap::bool`, if false then Q function is approximated using monte carlo returns. |
| 11 | +""" |
| 12 | + |
| 13 | +Base.@kwdef mutable struct MACLearner{A<:ActorCritic} <:AbstractLearner |
    approximator::A
    γ::Float32
    max_grad_norm::Union{Nothing,Float32} = nothing
    # the fields below only cache values for logging
    norm::Float32 = 0.0f0
    actor_loss::Float32 = 0.0f0
    critic_loss::Float32 = 0.0f0
    loss::Float32 = 0.0f0
    bootstrap::Bool = true
end
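
# A minimal construction sketch (commented out; not part of the original
# file). `ns`/`na` stand for the state and action dimensions, and the layer
# sizes and `NeuralNetworkApproximator` wiring are assumptions for
# illustration only:
#
# learner = MACLearner(
#     approximator = ActorCritic(
#         actor = NeuralNetworkApproximator(
#             model = Chain(Dense(ns, 64, relu), Dense(64, na)),
#             optimizer = ADAM(),
#         ),
#         critic = NeuralNetworkApproximator(
#             model = Chain(Dense(ns, 64, relu), Dense(64, na)),
#             optimizer = ADAM(),
#         ),
#     ),
#     γ = 0.99f0,
#     bootstrap = true,
# )
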
function (learner::MACLearner)(env::MultiThreadEnv)
    learner.approximator.actor(send_to_device(
        device(learner.approximator),
        get_state(env),
    )) |> send_to_host
end

function (learner::MACLearner)(env)
    s = get_state(env)
    s = Flux.unsqueeze(s, ndims(s) + 1)  # add a singleton batch dimension
    s = send_to_device(device(learner.approximator), s)
    learner.approximator.actor(s) |> vec |> send_to_host
end

function RLBase.update!(learner::MACLearner, t::AbstractTrajectory)
    isfull(t) || return

    states = t[:state]
    actions = t[:action]
    rewards = t[:reward]
    terminals = t[:terminal]

    AC = learner.approximator
    γ = learner.γ
    D = device(AC)

    states = send_to_device(D, states)
    states_flattened = flatten_batch(states) # (state_size..., n_thread * update_step)

    # turn each action index into a CartesianIndex into the
    # (n_actions, batch) Q-value matrix
    actions = flatten_batch(actions)
    actions = CartesianIndex.(actions, 1:length(actions))

    if learner.bootstrap
        # bootstrapped targets: discount the rewards inside the buffer and
        # seed the tail of each thread with the critic's estimate of the last
        # next state. The critic returns one row per action, so it is reduced
        # to a state value under the current policy first (matching the
        # expected target used in the non-bootstrap branch below).
        next_state = select_last_frame(t[:next_state])
        next_state = send_to_device(D, next_state)
        next_state_values =
            vec(sum(AC.critic(next_state) .* softmax(AC.actor(next_state)), dims = 1))

        gains = discount_rewards(
            rewards,
            γ;
            dims = 2,
            init = send_to_host(next_state_values),
            terminal = terminals,
        )
        gains = send_to_device(D, gains)
    else
        next_state_flattened = flatten_batch(t[:next_state])
        next_state_flattened = send_to_device(D, next_state_flattened)
        rewards_flattened = flatten_batch(rewards)
        rewards_flattened = send_to_device(D, rewards_flattened)
    end
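
    # Illustrative check of the bootstrapped targets (assumed numbers): for a
    # single thread with rewards [1, 1, 1], γ = 0.9, no terminals and a
    # bootstrap value v = 0.5, the gains are filled back to front:
    #   g₃ = 1 + 0.9 * 0.5  = 1.45
    #   g₂ = 1 + 0.9 * 1.45 = 2.305
    #   g₁ = 1 + 0.9 * 2.305 ≈ 3.07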

    # Q-values for every action in the batch; detached from the graph when
    # used in the actor loss below
    action_values = AC.critic(states_flattened)

    ps1 = Flux.params(AC.actor)
    gs1 = gradient(ps1) do
        logits = AC.actor(states_flattened)
        probs = softmax(logits)
        # MAC actor objective: maximize the policy-weighted sum of Q-values
        # over all actions
        actor_loss = -mean(sum(probs .* Zygote.dropgrad(action_values), dims = 1))
        loss = actor_loss
        ignore() do
            learner.actor_loss = actor_loss
        end
        loss
    end
    if !isnothing(learner.max_grad_norm)
        learner.norm = clip_by_global_norm!(gs1, ps1, learner.max_grad_norm)
    end
    update!(AC.actor, gs1)
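
    # Worked example of the actor objective (assumed values): for one state
    # with probs = [0.2, 0.8] and Q-values = [1.0, 2.0],
    #   actor_loss = -(0.2 * 1.0 + 0.8 * 2.0) = -1.8
    # so shifting probability toward high-Q actions decreases the loss.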

    ps2 = Flux.params(AC.critic)
    gs2 = gradient(ps2) do
        # recompute the Q-values inside the gradient context so that the loss
        # is differentiable with respect to the critic's parameters (the
        # `action_values` captured above would be treated as a constant)
        q = AC.critic(states_flattened)
        if learner.bootstrap
            critic_loss = mean((vec(gains) .- vec(q[actions])) .^ 2)
        else
            # one-step expected target: r + γ * Σₐ π(a|s′) Q(s′, a)
            next_state_values = AC.critic(next_state_flattened)
            target_action_values =
                vec(rewards_flattened) .+
                γ .* vec(Zygote.dropgrad(sum(
                    next_state_values .* softmax(AC.actor(next_state_flattened)),
                    dims = 1,
                )))
            critic_loss = mean((vec(target_action_values) .- vec(q[actions])) .^ 2)
        end

        loss = critic_loss
        ignore() do
            learner.critic_loss = critic_loss
        end
        loss
    end
    if !isnothing(learner.max_grad_norm)
        learner.norm = clip_by_global_norm!(gs2, ps2, learner.max_grad_norm)
    end
    update!(AC.critic, gs2)
end

function (agent::Agent{<:QBasedPolicy{<:MACLearner},<:CircularCompactSARTSATrajectory})(
    ::Training{PreActStage},
    env,
)
    action = agent.policy(env)
    state = get_state(env)
    push!(agent.trajectory; state = state, action = action)
    update!(agent.policy, agent.trajectory)

    # the main difference from the default `Agent` behaviour: the buffer is
    # flushed after each update so every batch is on-policy
    if isfull(agent.trajectory)
        empty!(agent.trajectory)
        push!(agent.trajectory; state = state, action = action)
    end

    action
end
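
# Hypothetical wiring into an agent (commented out; the explorer and the
# trajectory parameters are assumptions for illustration only):
#
# agent = Agent(
#     policy = QBasedPolicy(
#         learner = learner,  # a MACLearner as constructed above
#         explorer = BatchExplorer(GumbelSoftmaxExplorer()),  # samples from the actor's logits
#     ),
#     trajectory = CircularCompactSARTSATrajectory(; capacity = ..., state_type = ...),
# )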