export TD3Policy, TD3Critic

using Random
using Flux
using Flux.Losses: mse # on older Flux versions `mse` is exported by Flux itself
using Statistics: mean
using Zygote: ignore

# NeuralNetworkApproximator, device, send_to_device, send_to_host, select_last_dim, etc.
# are assumed to be in scope from the surrounding ReinforcementLearning.jl module.

# Twin critic: two independent Q-networks scored on the same (state, action) input;
# the TD target below uses the minimum of the two estimates (clipped double-Q learning).
struct TD3Critic
    critic_1::Flux.Chain
    critic_2::Flux.Chain
end
Flux.@functor TD3Critic
(c::TD3Critic)(s, a) = (inp = vcat(s, a); (c.critic_1(inp), c.critic_2(inp)))
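
# A minimal usage sketch (the sizes `ns`, `na`, and `batch` below are hypothetical, not
# part of this file); each critic maps stacked (state; action) columns to Q-values:
#
#     ns, na, batch = 3, 1, 32
#     critic = TD3Critic(
#         Chain(Dense(ns + na, 64, relu), Dense(64, 1)),
#         Chain(Dense(ns + na, 64, relu), Dense(64, 1)),
#     )
#     q1, q2 = critic(rand(Float32, ns, batch), rand(Float32, na, batch)) # two 1 × batch matrices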

mutable struct TD3Policy{
    BA<:NeuralNetworkApproximator,
    BC<:NeuralNetworkApproximator,
    TA<:NeuralNetworkApproximator,
    TC<:NeuralNetworkApproximator,
    P,
    R<:AbstractRNG,
} <: AbstractPolicy

    behavior_actor::BA
    behavior_critic::BC
    target_actor::TA
    target_critic::TC
    γ::Float32
    ρ::Float32
    batch_size::Int
    start_steps::Int
    start_policy::P
    update_after::Int
    update_every::Int
    policy_freq::Int
    target_act_limit::Float64
    target_act_noise::Float64
    act_limit::Float64
    act_noise::Float64
    step::Int
    rng::R
    replay_counter::Int
    # for logging
    actor_loss::Float32
    critic_loss::Float32
end

"""
    TD3Policy(;kwargs...)

# Keyword arguments

- `behavior_actor`,
- `behavior_critic`,
- `target_actor`,
- `target_critic`,
- `start_policy`,
- `γ = 0.99f0`,
- `ρ = 0.995f0`,
- `batch_size = 64`,
- `start_steps = 10000`,
- `update_after = 1000`,
- `update_every = 50`,
- `policy_freq = 2`, # the frequency at which the actor takes a gradient step and the target networks are updated
- `target_act_limit = 1.0`, # the noise added to the target actor's action is clipped to ±target_act_limit
- `target_act_noise = 0.1`, # standard deviation of the noise added to the target actor's action
- `act_limit = 1.0`, # the output action (after exploration noise) is clamped to ±act_limit
- `act_noise = 0.1`, # standard deviation of the exploration noise added to the output action
- `step = 0`,
- `rng = Random.GLOBAL_RNG`,
"""
function TD3Policy(;
    behavior_actor,
    behavior_critic,
    target_actor,
    target_critic,
    start_policy,
    γ = 0.99f0,
    ρ = 0.995f0,
    batch_size = 64,
    start_steps = 10000,
    update_after = 1000,
    update_every = 50,
    policy_freq = 2,
    target_act_limit = 1.0,
    target_act_noise = 0.1,
    act_limit = 1.0,
    act_noise = 0.1,
    step = 0,
    rng = Random.GLOBAL_RNG,
)
    copyto!(behavior_actor, target_actor) # force sync
    copyto!(behavior_critic, target_critic) # force sync
    TD3Policy(
        behavior_actor,
        behavior_critic,
        target_actor,
        target_critic,
        γ,
        ρ,
        batch_size,
        start_steps,
        start_policy,
        update_after,
        update_every,
        policy_freq,
        target_act_limit,
        target_act_noise,
        act_limit,
        act_noise,
        step,
        rng,
        1, # replay_counter, counts updates since the last delayed policy update
        0.0f0, # actor_loss
        0.0f0, # critic_loss
    )
end
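
# A hypothetical construction sketch, assuming the `NeuralNetworkApproximator` and
# `RandomPolicy`/`ContinuousSpace` constructors of ReinforcementLearning.jl at the time
# of writing (the network sizes and `ns`/`na` below are illustrative only):
#
#     rng = MersenneTwister(123)
#     ns, na = 3, 1
#     create_actor() = Chain(Dense(ns, 30, relu), Dense(30, 30, relu), Dense(30, na, tanh))
#     create_critic() = TD3Critic(
#         Chain(Dense(ns + na, 30, relu), Dense(30, 1)),
#         Chain(Dense(ns + na, 30, relu), Dense(30, 1)),
#     )
#     policy = TD3Policy(
#         behavior_actor = NeuralNetworkApproximator(model = create_actor(), optimizer = ADAM()),
#         behavior_critic = NeuralNetworkApproximator(model = create_critic(), optimizer = ADAM()),
#         target_actor = NeuralNetworkApproximator(model = create_actor()),
#         target_critic = NeuralNetworkApproximator(model = create_critic()),
#         start_policy = RandomPolicy(ContinuousSpace(-1.0, 1.0); rng = rng),
#         rng = rng,
#     )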

# TODO: handle Training/Testing mode
function (p::TD3Policy)(env)
    p.step += 1

    if p.step <= p.start_steps
        p.start_policy(env)
    else
        D = device(p.behavior_actor)
        s = get_state(env)
        s = Flux.unsqueeze(s, ndims(s) + 1)
        action = p.behavior_actor(send_to_device(D, s)) |> vec |> send_to_host
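        # add Gaussian exploration noise, then clamp to the action bounds
        # (assumes a scalar action; `action[]` extracts it)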
        clamp(action[] + randn(p.rng) * p.act_noise, -p.act_limit, p.act_limit)
    end
end

function RLBase.update!(p::TD3Policy, traj::CircularCompactSARTSATrajectory)
    length(traj[:terminal]) > p.update_after || return
    p.step % p.update_every == 0 || return

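    # sample a random minibatch; presumably the last index is excluded so that every
    # sampled transition has a fully recorded successor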
    inds = rand(p.rng, 1:(length(traj[:terminal])-1), p.batch_size)
    s = select_last_dim(traj[:state], inds)
    a = select_last_dim(traj[:action], inds)
    r = select_last_dim(traj[:reward], inds)
    t = select_last_dim(traj[:terminal], inds)
    s′ = select_last_dim(traj[:next_state], inds)

    actor = p.behavior_actor
    critic = p.behavior_critic

    # !!! several assumptions are made here; revisit when we support more complex environments:
    #     the state is a vector and the action is a scalar
    target_noise = clamp.(
        randn(p.rng, Float32, 1, p.batch_size) .* p.target_act_noise,
        -p.target_act_limit,
        p.target_act_limit,
    )
    # add noise and clip to tanh bounds
    a′ = clamp.(p.target_actor(s′) + target_noise, -1f0, 1f0)

    q_1′, q_2′ = p.target_critic(s′, a′)
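    # TD3 target with clipped double-Q learning:
    #     y = r + γ (1 - t) min(Q′₁(s′, a′), Q′₂(s′, a′))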
    y = r .+ p.γ .* (1 .- t) .* (min.(q_1′, q_2′) |> vec)
    a = Flux.unsqueeze(a, 1) # reshape scalar actions into a 1 × batch_size matrix

    gs1 = gradient(Flux.params(critic)) do
        q1, q2 = critic(s, a)
        loss = mse(q1 |> vec, y) + mse(q2 |> vec, y)
        ignore() do # keep logging out of the gradient
            p.critic_loss = loss
        end
        loss
    end
    update!(critic, gs1)

    if p.replay_counter % p.policy_freq == 0
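        # delayed policy update: improve the actor by maximizing Q₁(s, μ(s)),
        # i.e. by descending the negated mean Q-value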
        gs2 = gradient(Flux.params(actor)) do
            actions = actor(s)
            loss = -mean(critic.model.critic_1(vcat(s, actions)))
            ignore() do
                p.actor_loss = loss
            end
            loss
        end
        update!(actor, gs2)
        # polyak averaging
        for (dest, src) in zip(
            Flux.params([p.target_actor, p.target_critic]),
            Flux.params([actor, critic]),
        )
            dest .= p.ρ .* dest .+ (1 - p.ρ) .* src
        end
        p.replay_counter = 0 # reset to 0 (not 1) so the actor fires again only after `policy_freq` more updates
    end
    p.replay_counter += 1
end
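
# TD3's three mechanisms, all visible in `update!` above:
#   1. clipped double-Q learning: the TD target uses min(Q′₁, Q′₂)
#   2. delayed policy updates: the actor and both targets update once per `policy_freq` critic updates
#   3. target policy smoothing: clipped Gaussian noise is added to the target action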