
Commit d7077e8

Merge pull request #131 from JuliaReinforcementLearning/auto-juliaformatter-pr
Automatic JuliaFormatter.jl run
2 parents eb967ca + 79b6a60 commit d7077e8

36 files changed

Lines changed: 182 additions & 185 deletions
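The source changes below were generated by the formatter rather than written by hand. For context, a run like this is typically produced by calling JuliaFormatter's format function on the repository root; a minimal sketch, assuming JuliaFormatter.jl is installed (the exact CI invocation behind this PR is not shown in the diff):

using JuliaFormatter

# Reformat every .jl file under the current directory in place.
# Returns true if all files were already formatted.
format(".")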

src/algorithms/cfr/best_response_policy.jl

Lines changed: 1 addition & 5 deletions
@@ -66,11 +66,7 @@ function init_cfr_reach_prob!(p, env, reach_prob = 1.0)
        end
    else # opponents
        for a in legal_action_space(env)
-            init_cfr_reach_prob!(
-                p,
-                child(env, a),
-                reach_prob * prob(p.policy, env, a),
-            )
+            init_cfr_reach_prob!(p, child(env, a), reach_prob * prob(p.policy, env, a))
        end
    end
end

src/algorithms/cfr/external_sampling_mccfr.jl

Lines changed: 1 addition & 2 deletions
@@ -18,8 +18,7 @@ end

(p::ExternalSamplingMCCFRPolicy)(env::AbstractEnv) = p.behavior_policy(env)

-RLBase.prob(p::ExternalSamplingMCCFRPolicy, env::AbstractEnv) =
-    prob(p.behavior_policy, env)
+RLBase.prob(p::ExternalSamplingMCCFRPolicy, env::AbstractEnv) = prob(p.behavior_policy, env)

function ExternalSamplingMCCFRPolicy(; state_type = String, rng = Random.GLOBAL_RNG)
    ExternalSamplingMCCFRPolicy(

src/algorithms/cfr/outcome_sampling_mccfr.jl

Lines changed: 1 addition & 2 deletions
@@ -19,8 +19,7 @@ end

(p::OutcomeSamplingMCCFRPolicy)(env::AbstractEnv) = p.behavior_policy(env)

-RLBase.prob(p::OutcomeSamplingMCCFRPolicy, env::AbstractEnv) =
-    prob(p.behavior_policy, env)
+RLBase.prob(p::OutcomeSamplingMCCFRPolicy, env::AbstractEnv) = prob(p.behavior_policy, env)

function OutcomeSamplingMCCFRPolicy(; state_type = String, rng = Random.GLOBAL_RNG, ϵ = 0.6)
    OutcomeSamplingMCCFRPolicy(

src/algorithms/dqns/basic_dqn.jl

Lines changed: 1 addition & 3 deletions
@@ -38,9 +38,7 @@ end
(learner::BasicDQNLearner)(env) =
    env |>
    state |>
-    x -> send_to_device(device(learner), x) |>
-    learner.approximator |>
-    send_to_host
+    x -> send_to_device(device(learner), x) |> learner.approximator |> send_to_host

function BasicDQNLearner(;
    approximator::Q,

src/algorithms/dqns/common.jl

Lines changed: 8 additions & 3 deletions
@@ -4,7 +4,7 @@

const PERLearners = Union{PrioritizedDQNLearner,RainbowLearner,IQNLearner}

-function RLBase.update!(learner::Union{DQNLearner, PERLearners}, t::AbstractTrajectory)
+function RLBase.update!(learner::Union{DQNLearner,PERLearners}, t::AbstractTrajectory)
    length(t[:terminal]) < learner.min_replay_history && return

    learner.update_step += 1
@@ -21,11 +21,16 @@ function RLBase.update!(learner::Union{DQNLearner, PERLearners}, t::AbstractTraj
        priorities = update!(learner, batch)
        t[:priority][inds] .= priorities
    else
-        update!(learner,batch)
+        update!(learner, batch)
    end
end

-function RLBase.update!(trajectory::PrioritizedTrajectory, p::QBasedPolicy{<:PERLearners}, env::AbstractEnv, ::PostActStage)
+function RLBase.update!(
+    trajectory::PrioritizedTrajectory,
+    p::QBasedPolicy{<:PERLearners},
+    env::AbstractEnv,
+    ::PostActStage,
+)
    push!(trajectory[:reward], reward(env))
    push!(trajectory[:terminal], is_terminated(env))
    push!(trajectory[:priority], p.learner.default_priority)

src/algorithms/dqns/dqn.jl

Lines changed: 14 additions & 7 deletions
@@ -55,7 +55,12 @@ function DQNLearner(;
    rng = Random.GLOBAL_RNG,
) where {Tq,Tt,Tf}
    copyto!(approximator, target_approximator)
-    sampler = NStepBatchSampler{traces}(;γ=γ, n=update_horizon,stack_size=stack_size,batch_size=batch_size)
+    sampler = NStepBatchSampler{traces}(;
+        γ = γ,
+        n = update_horizon,
+        stack_size = stack_size,
+        batch_size = batch_size,
+    )
    DQNLearner(
        approximator,
        target_approximator,
@@ -81,11 +86,13 @@ end
function (learner::DQNLearner)(env)
    env |>
    state |>
-    x -> Flux.unsqueeze(x, ndims(x) + 1) |>
-    x -> send_to_device(device(learner), x) |>
-    learner.approximator |>
-    vec |>
-    send_to_host
+    x ->
+        Flux.unsqueeze(x, ndims(x) + 1) |>
+        x ->
+            send_to_device(device(learner), x) |>
+            learner.approximator |>
+            vec |>
+            send_to_host
end

function RLBase.update!(learner::DQNLearner, batch::NamedTuple)
@@ -103,7 +110,7 @@ function RLBase.update!(learner::DQNLearner, batch::NamedTuple)
        target_q = Qₜ(s′)
        if haskey(batch, :next_legal_actions_mask)
            l′ = send_to_device(D, batch[:next_legal_actions_mask])
-            target_q .+= ifelse.(l′, 0.f0, typemin(Float32))
+            target_q .+= ifelse.(l′, 0.0f0, typemin(Float32))
        end

        q′ = dropdims(maximum(target_q; dims = 1), dims = 1)

src/algorithms/dqns/iqn.jl

Lines changed: 11 additions & 3 deletions
@@ -114,7 +114,12 @@ function IQNLearner(;
    if device(approximator) !== device(device_rng)
        throw(ArgumentError("device of `approximator` doesn't match the device of `device_rng`: $(device(approximator)) !== $(device_rng)"))
    end
-    sampler = NStepBatchSampler{traces}(;γ=γ, n=update_horizon,stack_size=stack_size,batch_size=batch_size)
+    sampler = NStepBatchSampler{traces}(;
+        γ = γ,
+        n = update_horizon,
+        stack_size = stack_size,
+        batch_size = batch_size,
+    )
    IQNLearner(
        approximator,
        target_approximator,
@@ -158,7 +163,8 @@ function RLBase.update!(learner::IQNLearner, batch::NamedTuple)
    batch_size = learner.sampler.batch_size

    D = device(Z)
-    s, r, t, s′ = (send_to_device(D, batch[x]) for x in (:state, :reward, :terminal, :next_state))
+    s, r, t, s′ =
+        (send_to_device(D, batch[x]) for x in (:state, :reward, :terminal, :next_state))

    τ′ = rand(learner.device_rng, Float32, N′, batch_size) # TODO: support β distribution
    τₑₘ′ = embed(τ′, Nₑₘ)
@@ -174,7 +180,9 @@ function RLBase.update!(learner::IQNLearner, batch::NamedTuple)
    aₜ = argmax(avg_zₜ, dims = 1)
    aₜ = aₜ .+ typeof(aₜ)(CartesianIndices((0, 0:N′-1, 0)))
    qₜ = reshape(zₜ[aₜ], :, batch_size)
-    target = reshape(r, 1, batch_size) .+ learner.sampler.γ * reshape(1 .- t, 1, batch_size) .* qₜ # reshape to allow broadcast
+    target =
+        reshape(r, 1, batch_size) .+
+        learner.sampler.γ * reshape(1 .- t, 1, batch_size) .* qₜ # reshape to allow broadcast

    τ = rand(learner.device_rng, Float32, N, batch_size)
    τₑₘ = embed(τ, Nₑₘ)

src/algorithms/dqns/prioritized_dqn.jl

Lines changed: 15 additions & 8 deletions
@@ -60,7 +60,12 @@ function PrioritizedDQNLearner(;
    rng = Random.GLOBAL_RNG,
) where {Tq,Tt,Tf}
    copyto!(approximator, target_approximator)
-    sampler = NStepBatchSampler{traces}(;γ=γ, n=update_horizon,stack_size=stack_size,batch_size=batch_size)
+    sampler = NStepBatchSampler{traces}(;
+        γ = γ,
+        n = update_horizon,
+        stack_size = stack_size,
+        batch_size = batch_size,
+    )
    PrioritizedDQNLearner(
        approximator,
        target_approximator,
@@ -94,11 +99,13 @@ end
function (learner::PrioritizedDQNLearner)(env)
    env |>
    state |>
-    x -> Flux.unsqueeze(x, ndims(x) + 1) |>
-    x -> send_to_device(device(learner), x) |>
-    learner.approximator |>
-    vec |>
-    send_to_host
+    x ->
+        Flux.unsqueeze(x, ndims(x) + 1) |>
+        x ->
+            send_to_device(device(learner), x) |>
+            learner.approximator |>
+            vec |>
+            send_to_host
end

function RLBase.update!(learner::PrioritizedDQNLearner, batch::NamedTuple)
@@ -111,7 +118,7 @@ function RLBase.update!(learner::PrioritizedDQNLearner, batch::NamedTuple)
    batch_size = learner.sampler.batch_size

    D = device(Q)
-    s, a, r, t, s′ = (send_to_device(D,batch[x]) for x in SARTS)
+    s, a, r, t, s′ = (send_to_device(D, batch[x]) for x in SARTS)
    a = CartesianIndex.(a, 1:batch_size)

    updated_priorities = Vector{Float32}(undef, batch_size)
@@ -122,7 +129,7 @@ function RLBase.update!(learner::PrioritizedDQNLearner, batch::NamedTuple)
        target_q = Qₜ(s′)
        if haskey(batch, :next_legal_actions_mask)
            l′ = send_to_device(D, batch[:next_legal_actions_mask])
-            target_q .+= ifelse.(l′, 0.f0, typemin(Float32))
+            target_q .+= ifelse.(l′, 0.0f0, typemin(Float32))
        end

        q′ = dropdims(maximum(target_q; dims = 1), dims = 1)

src/algorithms/dqns/rainbow.jl

Lines changed: 7 additions & 2 deletions
@@ -87,7 +87,12 @@ function RainbowLearner(;
    default_priority >= 1.0f0 || error("default value must be >= 1.0f0")
    copyto!(approximator, target_approximator) # force sync
    support = send_to_device(device(approximator), support)
-    sampler = NStepBatchSampler{traces}(;γ=γ, n=update_horizon,stack_size=stack_size,batch_size=batch_size)
+    sampler = NStepBatchSampler{traces}(;
+        γ = γ,
+        n = update_horizon,
+        stack_size = stack_size,
+        batch_size = batch_size,
+    )
    RainbowLearner(
        approximator,
        target_approximator,
@@ -147,7 +152,7 @@ function RLBase.update!(learner::RainbowLearner, batch::NamedTuple)
    next_q = reshape(sum(support .* next_probs, dims = 1), n_actions, :)
    if haskey(batch, :next_legal_actions_mask)
        l′ = send_to_device(D, batch[:next_legal_actions_mask])
-        next_q .+= ifelse.(l′, 0.f0, typemin(Float32))
+        next_q .+= ifelse.(l′, 0.0f0, typemin(Float32))
    end
    next_prob_select = select_best_probs(next_probs, next_q)

src/algorithms/policy_gradient/A2C.jl

Lines changed: 13 additions & 19 deletions
@@ -29,13 +29,10 @@ Base.@kwdef mutable struct A2CLearner{A<:ActorCritic} <: AbstractLearner
    norm::Float32 = 0.0f0
end

-Flux.functor(x::A2CLearner) = (app = x.approximator, ), y -> @set x.approximator = y.app
+Flux.functor(x::A2CLearner) = (app = x.approximator,), y -> @set x.approximator = y.app

function (learner::A2CLearner)(env::MultiThreadEnv)
-    learner.approximator.actor(send_to_device(
-        device(learner),
-        state(env),
-    )) |> send_to_host
+    learner.approximator.actor(send_to_device(device(learner), state(env))) |> send_to_host
end

function (learner::A2CLearner)(env)
@@ -70,20 +67,17 @@ function _update!(learner::A2CLearner, t::CircularArraySARTTrajectory)
    actions = flatten_batch(actions)
    actions = CartesianIndex.(actions, 1:length(actions))

-    next_state_values = t[:state] |>
-        select_last_frame |>
-        Array |>
-        to_device |>
-        AC.critic |>
-        send_to_host
+    next_state_values =
+        t[:state] |> select_last_frame |> Array |> to_device |> AC.critic |> send_to_host

-    gains = discount_rewards(
-        t[:reward],
-        γ;
-        dims = 2,
-        init = send_to_host(next_state_values),
-        terminal = t[:terminal],
-    ) |> to_device
+    gains =
+        discount_rewards(
+            t[:reward],
+            γ;
+            dims = 2,
+            init = send_to_host(next_state_values),
+            terminal = t[:terminal],
+        ) |> to_device

    ps = Flux.params(AC)
    gs = gradient(ps) do
@@ -111,4 +105,4 @@ function _update!(learner::A2CLearner, t::CircularArraySARTTrajectory)
    update!(AC, gs)
end

-RLCore.check(::QBasedPolicy{<:A2CLearner}, ::MultiThreadEnv) = nothing
+RLCore.check(::QBasedPolicy{<:A2CLearner}, ::MultiThreadEnv) = nothing
