This repository was archived by the owner on May 6, 2021. It is now read-only.

Commit a9ad731: Format .jl files (#142)

Co-authored-by: norci <norci@users.noreply.github.com>

1 parent 4cb26f4 commit a9ad731

40 files changed

Lines changed: 397 additions & 297 deletions
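This commit is mechanical reformatting across the package. For context, a pass like this is typically produced with JuliaFormatter.jl; the exact invocation and options used are not recorded in the commit, so the following is only a plausible sketch:

    # Plausible reproduction of a formatting pass like this one
    # (assumption: JuliaFormatter.jl with default options; the actual
    # options used by the commit are not recorded here).
    using JuliaFormatter

    # Rewrite every .jl file under src/ in place.
    format("src")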

src/ReinforcementLearningZoo.jl

Lines changed: 6 additions & 2 deletions
@@ -36,8 +36,12 @@ using Requires
 function __init__()
     @require ReinforcementLearningEnvironments = "25e41dd2-4622-11e9-1641-f1adca772921" begin
         include("experiments/rl_envs/rl_envs.jl")
-        @require ArcadeLearningEnvironment = "b7f77d8d-088d-5e02-8ac0-89aab2acc977" include("experiments/atari/atari.jl")
-        @require OpenSpiel = "ceb70bd2-fe3f-44f0-b81f-41608acaf2f2" include("experiments/open_spiel/open_spiel.jl")
+        @require ArcadeLearningEnvironment = "b7f77d8d-088d-5e02-8ac0-89aab2acc977" include(
+            "experiments/atari/atari.jl",
+        )
+        @require OpenSpiel = "ceb70bd2-fe3f-44f0-b81f-41608acaf2f2" include(
+            "experiments/open_spiel/open_spiel.jl",
+        )
     end
 end
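For readers skimming the diff: `@require` comes from Requires.jl and defers an `include` until the user actually loads the named package, so optional dependencies do not slow down package load. A minimal sketch of the pattern (module name, package name, and UUID below are placeholders, not from this repository):

    module MyZoo

    using Requires

    function __init__()
        # __init__ runs once when MyZoo is loaded; the @require body runs
        # only after the user also loads the package with this UUID.
        @require SomePkg = "00000000-0000-0000-0000-000000000001" include(
            "some_pkg_glue.jl",
        )
    end

    end # module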

src/algorithms/cfr/best_response_policy.jl

Lines changed: 1 addition & 5 deletions
@@ -15,11 +15,7 @@ end
 - `env`, the environment to handle.
 - `best_responder`, the player to choose best response action.
 """
-function BestResponsePolicy(
-    policy,
-    env,
-    best_responder;
-)
+function BestResponsePolicy(policy, env, best_responder;)
     S = eltype(state_space(env))
     A = eltype(action_space(env))
     E = typeof(env)

src/algorithms/cfr/external_sampling_mccfr.jl

Lines changed: 1 addition & 4 deletions
@@ -26,10 +26,7 @@ RLBase.prob(p::ExternalSamplingMCCFRPolicy, env::AbstractEnv, action) =
 function ExternalSamplingMCCFRPolicy(; state_type = String, rng = Random.GLOBAL_RNG)
     ExternalSamplingMCCFRPolicy(
         Dict{state_type,InfoStateNode}(),
-        TabularRandomPolicy(;
-            rng = rng,
-            table = Dict{state_type,Vector{Float64}}(),
-        ),
+        TabularRandomPolicy(; rng = rng, table = Dict{state_type,Vector{Float64}}()),
         rng,
     )
 end

src/algorithms/cfr/nash_conv.jl

Lines changed: 2 additions & 2 deletions
@@ -31,8 +31,8 @@ function nash_conv(π, env; is_reduce = true, kw...)
     RLBase.reset!(e)

     σ′ = [
-        best_response_value(BestResponsePolicy(π, e, i; kw...), e)
-        for i in players(e) if i != chance_player(e)
+        best_response_value(BestResponsePolicy(π, e, i; kw...), e) for
+        i in players(e) if i != chance_player(e)
     ]

     σ = expected_policy_values(π, e)

src/algorithms/cfr/outcome_sampling_mccfr.jl

Lines changed: 1 addition & 4 deletions
@@ -27,10 +27,7 @@ RLBase.prob(p::OutcomeSamplingMCCFRPolicy, env::AbstractEnv, action) =
 function OutcomeSamplingMCCFRPolicy(; state_type = String, rng = Random.GLOBAL_RNG, ϵ = 0.6)
     OutcomeSamplingMCCFRPolicy(
         Dict{state_type,InfoStateNode}(),
-        TabularRandomPolicy(;
-            rng = rng,
-            table = Dict{state_type,Vector{Float64}}(),
-        ),
+        TabularRandomPolicy(; rng = rng, table = Dict{state_type,Vector{Float64}}()),
         ϵ,
         rng,
     )

src/algorithms/cfr/tabular_cfr.jl

Lines changed: 2 additions & 10 deletions
@@ -9,12 +9,7 @@ end

 function InfoStateNode(mask)
     n = sum(mask)
-    InfoStateNode(
-        fill(1 / n, n),
-        zeros(n),
-        zeros(n),
-        mask
-    )
+    InfoStateNode(fill(1 / n, n), zeros(n), zeros(n), mask)
 end

 #####
@@ -67,10 +62,7 @@ function TabularCFRPolicy(;
 )
     TabularCFRPolicy(
         Dict{state_type,InfoStateNode}(),
-        TabularRandomPolicy(;
-            rng = rng,
-            table = Dict{state_type,Vector{Float64}}(),
-        ),
+        TabularRandomPolicy(; rng = rng, table = Dict{state_type,Vector{Float64}}()),
         is_reset_neg_regrets,
         is_linear_averaging,
         weighted_averaging_delay,

src/algorithms/dqns/iqn.jl

Lines changed: 5 additions & 1 deletion
@@ -112,7 +112,11 @@ function IQNLearner(;
     )
     copyto!(approximator, target_approximator) # force sync
     if device(approximator) !== device(device_rng)
-        throw(ArgumentError("device of `approximator` doesn't match the device of `device_rng`: $(device(approximator)) !== $(device_rng)"))
+        throw(
+            ArgumentError(
+                "device of `approximator` doesn't match the device of `device_rng`: $(device(approximator)) !== $(device_rng)",
+            ),
+        )
     end
     sampler = NStepBatchSampler{traces}(;
         γ = γ,
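The widened `throw` above wraps a runtime guard: IQN samples its quantile fractions with `device_rng`, so the RNG must live on the same device as the approximator's parameters. A toy sketch of the same guard shape (the `device_of` helper below is an illustrative stand-in for the package's own `device` function, not its implementation):

    using Random

    # Illustrative stand-ins only.
    device_of(::Array) = :cpu
    device_of(::Random.AbstractRNG) = :cpu   # a CUDA RNG would report :gpu

    function check_devices(approximator, device_rng)
        if device_of(approximator) !== device_of(device_rng)
            throw(
                ArgumentError(
                    "device of `approximator` doesn't match the device of `device_rng`",
                ),
            )
        end
    end

    check_devices(zeros(Float32, 4), Random.GLOBAL_RNG)  # ok: both report :cpu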

src/algorithms/offline_rl/behavior_cloning.jl

Lines changed: 1 addition & 1 deletion
@@ -30,4 +30,4 @@ function RLBase.update!(p::BehaviorCloningPolicy, batch::NamedTuple{(:state, :ac
         logitcrossentropy(ŷ, y)
     end
     update!(m, gs)
-end
+end
Lines changed: 1 addition & 1 deletion
@@ -1 +1 @@
-include("behavior_cloning.jl")
+include("behavior_cloning.jl")

(Both hunks above look unchanged because the difference is whitespace only; the formatter presumably added the missing newline at end of file.)

src/algorithms/policy_gradient/MAC.jl

Lines changed: 14 additions & 5 deletions
@@ -39,7 +39,12 @@ function (learner::MACLearner)(env)
     learner.approximator.actor(s) |> vec |> send_to_host
 end

-function RLBase.update!(learner::MACLearner, t::CircularArraySARTTrajectory, ::AbstractEnv, ::PreActStage)
+function RLBase.update!(
+    learner::MACLearner,
+    t::CircularArraySARTTrajectory,
+    ::AbstractEnv,
+    ::PreActStage,
+)
     length(t) == 0 && return # in the first update, only state & action is inserted into trajectory
     learner.update_step += 1
     if learner.update_step % learner.update_freq == 0
@@ -112,10 +117,14 @@ function _update!(learner::MACLearner, t::CircularArraySARTTrajectory)
         next_state_values = AC.critic(next_state_flattened)
         target_action_values =
             vec(rewards_flattened) .+
-            γ * vec(Zygote.dropgrad(sum(
-                next_state_values .* softmax(AC.actor(next_state_flattened)),
-                dims = 1,
-            )))
+            γ * vec(
+                Zygote.dropgrad(
+                    sum(
+                        next_state_values .* softmax(AC.actor(next_state_flattened)),
+                        dims = 1,
+                    ),
+                ),
+            )
         critic_loss =
             mean((vec(target_action_values) .- vec(action_values[actions])) .^ 2)
     end
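The reformatted expression above is the critic's TD target: the reward plus the discounted next-state value, weighted by the actor's action probabilities, with `Zygote.dropgrad` stopping gradients from flowing through the bootstrap term. A toy sketch of that stop-gradient pattern (the scalar "critic" and all names below are illustrative, not the package's code):

    using Zygote

    critic(w, s) = w * s          # toy one-parameter "critic"

    w, s, s′, r, γ = 2.0, 1.0, 0.5, 1.0, 0.9

    grad = Zygote.gradient(w) do w
        # dropgrad treats the bootstrap target as a constant, so the
        # gradient flows only through the prediction term.
        target = r + γ * Zygote.dropgrad(critic(w, s′))
        (target - critic(w, s))^2
    end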
