This repository was archived by the owner on May 6, 2021. It is now read-only.

Commit 2c0e248

Update dependency (#135)
* moved imports
* removed repeated imports
* move using into RLZoo.jl
* remove other duplications
* update dependency

Co-authored-by: Rishabh Varshney <rishabhvarshney14@gmail.com>
1 parent f55c858 commit 2c0e248

49 files changed

Lines changed: 232 additions & 1167 deletions
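Almost every hunk below follows the same pattern: with ReinforcementLearningBase bumped from 0.8.4 to 0.9, the environment and policy accessors lose their get_ prefix (and a few are renamed outright), while get_players is left unchanged in these hunks. The mapping below is inferred from this diff and is meant only as a reading aid, not an exhaustive migration guide:

    # RLBase 0.8.4 (old)              # RLBase 0.9 (new), as used in this diff
    get_state(env)                    state(env)
    get_actions(env)                  action_space(env)
    get_legal_actions(env)            legal_action_space(env)
    get_legal_actions_mask(env)       legal_action_space_mask(env)
    get_current_player(env)           current_player(env)
    get_chance_player(env)            chance_player(env)
    get_terminal(env)                 is_terminated(env)
    get_reward(env, player)           reward(env, player)
    get_prob(policy, env)             prob(policy, env)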


Project.toml

Lines changed: 6 additions & 2 deletions
@@ -11,13 +11,15 @@ CircularArrayBuffers = "9de3a189-e0c0-4e15-ba3b-b14b9fb0aec1"
 Dates = "ade2ca70-3891-5945-98fb-dc099432e06a"
 Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f"
 Flux = "587475ba-b771-5e3f-ad9e-33799f191a9c"
+IntervalSets = "8197267c-284f-5f27-9208-e0e47529a953"
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
 Logging = "56ddb016-857b-54e1-b83d-db4d58db5568"
 MacroTools = "1914dd2f-81c6-5fcd-8719-6d5c9610ff09"
 Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f"
 Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
 ReinforcementLearningBase = "e575027e-6cd6-5018-9292-cdc6200d2b44"
 ReinforcementLearningCore = "de1b191a-4ae0-4afa-a27b-92d07f46b2d6"
+ReinforcementLearningEnvironments = "25e41dd2-4622-11e9-1641-f1adca772921"
 Requires = "ae029012-a4dd-5104-9daa-d747884805df"
 Setfield = "efcf1570-3423-57d1-acb7-fd33fddbac46"
 StableRNGs = "860ef19b-820b-49d6-a774-d7a799459cd3"
@@ -34,9 +36,11 @@ CUDA = "1, 2.1"
 CircularArrayBuffers = "0.1"
 Distributions = "0.24"
 Flux = "0.11"
+IntervalSets = "0.5"
 MacroTools = "0.5"
-ReinforcementLearningBase = "0.8.4"
-ReinforcementLearningCore = "0.6"
+ReinforcementLearningBase = "0.9"
+ReinforcementLearningCore = "0.6.1"
+ReinforcementLearningEnvironments = "0.4"
 Requires = "1"
 Setfield = "0.6, 0.7"
 StableRNGs = "1.0"
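To reproduce the same dependency bump in a local Julia environment, the Pkg calls below match the [compat] entries above; this is a hedged convenience sketch, not part of the commit:

    using Pkg
    Pkg.add(Pkg.PackageSpec(name = "ReinforcementLearningBase", version = "0.9"))
    Pkg.add(Pkg.PackageSpec(name = "ReinforcementLearningCore", version = "0.6.1"))
    Pkg.add(Pkg.PackageSpec(name = "ReinforcementLearningEnvironments", version = "0.4"))
    Pkg.add(Pkg.PackageSpec(name = "IntervalSets", version = "0.5"))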

src/ReinforcementLearningZoo.jl

Lines changed: 2 additions & 0 deletions
@@ -6,11 +6,13 @@ export RLZoo
 using CircularArrayBuffers
 using ReinforcementLearningBase
 using ReinforcementLearningCore
+using ReinforcementLearningEnvironments
 using Setfield: @set
 using StableRNGs
 using Logging
 using Flux.Losses
 using Dates
+using IntervalSets
 using Random
 using Random: shuffle
 using CUDA
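Loading ReinforcementLearningEnvironments and IntervalSets once at the module level is what lets individual algorithm files drop their own using statements (the "moved imports" and "removed repeated imports" bullets in the commit message). As a small illustration of why IntervalSets is wanted module-wide, interval syntax becomes available everywhere in the package; the values here are made up:

    using IntervalSets

    support = -1.0..1.0        # a ClosedInterval{Float64}
    0.3 in support             # true
    2.0 in support             # false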

src/algorithms/cfr/best_response_policy.jl

Lines changed: 26 additions & 26 deletions
@@ -22,8 +22,8 @@ function BestResponsePolicy(
     state_type = String,
     action_type = Int,
 )
-    # S = typeof(get_state(env)) # TODO: currently it will break the OpenSpielEnv. Can not get information set for chance player
-    # A = eltype(get_actions(env)) # TODO: for chance players it will return ActionProbPair
+    # S = typeof(state(env)) # TODO: currently it will break the OpenSpielEnv. Can not get information set for chance player
+    # A = eltype(action_space(env)) # TODO: for chance players it will return ActionProbPair
     S = state_type
     A = action_type
     E = typeof(env)
@@ -45,31 +45,31 @@ function BestResponsePolicy(
 end

 function (p::BestResponsePolicy)(env::AbstractEnv)
-    if get_current_player(env) == p.best_responder
+    if current_player(env) == p.best_responder
         best_response_action(p, env)
     else
         p.policy(env)
     end
 end

 function init_cfr_reach_prob!(p, env, reach_prob = 1.0)
-    if !get_terminal(env)
-        if get_current_player(env) == p.best_responder
-            push!(get!(p.cfr_reach_prob, get_state(env), []), env => reach_prob)
+    if !is_terminated(env)
+        if current_player(env) == p.best_responder
+            push!(get!(p.cfr_reach_prob, state(env), []), env => reach_prob)

-            for a in get_legal_actions(env)
+            for a in legal_action_space(env)
                 init_cfr_reach_prob!(p, child(env, a), reach_prob)
             end
-        elseif get_current_player(env) == get_chance_player(env)
-            for a::ActionProbPair in get_actions(env)
+        elseif current_player(env) == chance_player(env)
+            for a::ActionProbPair in action_space(env)
                 init_cfr_reach_prob!(p, child(env, a), reach_prob * a.prob)
             end
         else # opponents
-            for a in get_legal_actions(env)
+            for a in legal_action_space(env)
                 init_cfr_reach_prob!(
                     p,
                     child(env, a),
-                    reach_prob * get_prob(p.policy, env, a),
+                    reach_prob * prob(p.policy, env, a),
                 )
             end
         end
@@ -78,34 +78,34 @@ end

 function best_response_value(p, env)
     get!(p.best_response_value_cache, env) do
-        if get_terminal(env)
-            get_reward(env, p.best_responder)
-        elseif get_current_player(env) == p.best_responder
+        if is_terminated(env)
+            reward(env, p.best_responder)
+        elseif current_player(env) == p.best_responder
             a = best_response_action(p, env)
             best_response_value(p, child(env, a))
-        elseif get_current_player(env) == get_chance_player(env)
+        elseif current_player(env) == chance_player(env)
             v = 0.0
-            for a::ActionProbPair in get_actions(env)
+            for a::ActionProbPair in action_space(env)
                 v += a.prob * best_response_value(p, child(env, a))
             end
             v
         else
             v = 0.0
-            for a in get_legal_actions(env)
-                v += get_prob(p.policy, env, a) * best_response_value(p, child(env, a))
+            for a in legal_action_space(env)
+                v += prob(p.policy, env, a) * best_response_value(p, child(env, a))
             end
             v
         end
     end
 end

 function best_response_action(p, env)
-    get!(p.best_response_action_cache, get_state(env)) do
+    get!(p.best_response_action_cache, state(env)) do
         best_action, best_action_value = nothing, typemin(Float64)
-        for a in get_legal_actions(env)
-            # for each information set (`get_state(env)` here), we may have several paths to reach it
+        for a in legal_action_space(env)
+            # for each information set (`state(env)` here), we may have several paths to reach it
             # here we sum the cfr reach prob weighted value to find out the best action
-            v = sum(p.cfr_reach_prob[get_state(env)]) do (e, reach_prob)
+            v = sum(p.cfr_reach_prob[state(env)]) do (e, reach_prob)
                 reach_prob * best_response_value(p, child(e, a))
             end
             if v > best_action_value
@@ -118,10 +118,10 @@ end

 RLBase.update!(p::BestResponsePolicy, args...) = nothing

-function RLBase.get_prob(p::BestResponsePolicy, env::AbstractEnv)
-    if get_current_player(env) == p.best_responder
-        onehot(p(env), get_actions(env))
+function RLBase.prob(p::BestResponsePolicy, env::AbstractEnv)
+    if current_player(env) == p.best_responder
+        onehot(p(env), action_space(env))
     else
-        get_prob(p.policy, env)
+        prob(p.policy, env)
     end
 end
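To see the shape of the recursion that best_response_value implements (terminal nodes return the responder's reward, responder nodes maximize, chance and opponent nodes take probability-weighted averages), here is a self-contained toy version on a hand-rolled game tree. It deliberately ignores the information-set bookkeeping (cfr_reach_prob) that the real code needs when several histories map to the same state; treat it as an illustration, not the package's implementation:

    # Toy best-response value: the responder maximizes, everyone else averages.
    struct Node
        kind::Symbol                 # :terminal, :responder, :chance, or :opponent
        payoff::Float64              # used only at :terminal nodes
        children::Vector{Node}
        probs::Vector{Float64}       # used at :chance and :opponent nodes
    end

    leaf(x) = Node(:terminal, x, Node[], Float64[])

    function br_value(n::Node)
        if n.kind == :terminal
            n.payoff
        elseif n.kind == :responder
            maximum(br_value.(n.children))
        else                         # :chance or :opponent
            sum(p * br_value(c) for (p, c) in zip(n.probs, n.children))
        end
    end

    opp  = Node(:opponent, 0.0, [leaf(1.0), leaf(-1.0)], [0.5, 0.5])  # uniform opponent: value 0.0
    root = Node(:responder, 0.0, [opp, leaf(0.2)], Float64[])
    br_value(root)                   # max(0.0, 0.2) == 0.2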

src/algorithms/cfr/deep_cfr.jl

Lines changed: 17 additions & 17 deletions
@@ -40,21 +40,21 @@ Base.@kwdef mutable struct DeepCFR{TP,TV,TMP,TMV,I,R,P} <: AbstractCFRPolicy
         Dict(k => zeros(Float32, n_training_steps_V) for (k, _) in MV)
     end
 end

-function RLBase.get_prob(π::DeepCFR, env::AbstractEnv)
-    I = send_to_device(device(π.Π), get_state(env))
-    m = send_to_device(device(π.Π), ifelse.(get_legal_actions_mask(env), 0.0f0, -Inf32))
+function RLBase.prob(π::DeepCFR, env::AbstractEnv)
+    I = send_to_device(device(π.Π), state(env))
+    m = send_to_device(device(π.Π), ifelse.(legal_action_space_mask(env), 0.0f0, -Inf32))
     logits = π.Π(Flux.unsqueeze(I, ndims(I) + 1)) |> vec
     σ = softmax(logits .+ m)
     send_to_host(σ)
 end

 (π::DeepCFR)(env::AbstractEnv) =
-    sample(π.rng, get_actions(env), Weights(get_prob(π, env), 1.0))
+    sample(π.rng, action_space(env), Weights(prob(π, env), 1.0))

 "Run one interation"
 function RLBase.update!(π::DeepCFR, env::AbstractEnv)
     for p in get_players(env)
-        if p != get_chance_player(env)
+        if p != chance_player(env)
             for k in 1:π.K
                 external_sampling!(π, copy(env), p)
             end
@@ -135,17 +135,17 @@ end

 "CFR Traversal with External Sampling"
 function external_sampling!(π::DeepCFR, env::AbstractEnv, p)
-    if get_terminal(env)
-        get_reward(env, p)
-    elseif get_current_player(env) == get_chance_player(env)
-        env(rand(π.rng, get_actions(env)))
+    if is_terminated(env)
+        reward(env, p)
+    elseif current_player(env) == chance_player(env)
+        env(rand(π.rng, action_space(env)))
         external_sampling!(π, env, p)
-    elseif get_current_player(env) == p
+    elseif current_player(env) == p
         V = π.V[p]
-        s = get_state(env)
+        s = state(env)
         I = send_to_device(device(V), Flux.unsqueeze(s, ndims(s) + 1))
-        A = get_actions(env)
-        m = get_legal_actions_mask(env)
+        A = action_space(env)
+        m = legal_action_space_mask(env)
         σ = masked_regret_matching(V(I) |> send_to_host |> vec, m)
         v = zeros(length(σ))
         v̄ = 0.0
@@ -158,11 +158,11 @@ function external_sampling!(π::DeepCFR, env::AbstractEnv, p)
         push!(π.MV[p], (I = s, t = π.t, r̃ = (v .- v̄) .* m, m = m))

     else
-        V = π.V[get_current_player(env)]
-        s = get_state(env)
+        V = π.V[current_player(env)]
+        s = state(env)
         I = send_to_device(device(V), Flux.unsqueeze(s, ndims(s) + 1))
-        A = get_actions(env)
-        m = get_legal_actions_mask(env)
+        A = action_space(env)
+        m = legal_action_space_mask(env)
         σ = masked_regret_matching(V(I) |> send_to_host |> vec, m)
         push!(π.MΠ, (I = s, t = π.t, σ = σ, m = m))
         a = sample(π.rng, A, Weights(σ, 1.0))
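One detail in the prob method above that is easy to miss: illegal actions get a logit offset of -Inf32 before the softmax, so they receive exactly zero probability while the legal actions renormalize among themselves. A self-contained sketch of just that masking step; the logits and mask are invented for illustration:

    using Flux: softmax

    logits = Float32[0.2, 1.3, -0.5]
    legal  = Bool[true, false, true]          # cf. legal_action_space_mask(env)
    m = ifelse.(legal, 0.0f0, -Inf32)
    σ = softmax(logits .+ m)                  # σ[2] == 0; the legal entries sum to 1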

src/algorithms/cfr/external_sampling_mccfr.jl

Lines changed: 10 additions & 10 deletions
@@ -18,8 +18,8 @@ end

 (p::ExternalSamplingMCCFRPolicy)(env::AbstractEnv) = p.behavior_policy(env)

-RLBase.get_prob(p::ExternalSamplingMCCFRPolicy, env::AbstractEnv) =
-    get_prob(p.behavior_policy, env)
+RLBase.prob(p::ExternalSamplingMCCFRPolicy, env::AbstractEnv) =
+    prob(p.behavior_policy, env)

 function ExternalSamplingMCCFRPolicy(; state_type = String, rng = Random.GLOBAL_RNG)
     ExternalSamplingMCCFRPolicy(
@@ -48,23 +48,23 @@ end
 "Run one interation"
 function RLBase.update!(p::ExternalSamplingMCCFRPolicy, env::AbstractEnv)
     for x in get_players(env)
-        if x != get_chance_player(env)
+        if x != chance_player(env)
             external_sampling(copy(env), x, p.nodes, p.rng)
         end
     end
 end

 function external_sampling(env, i, nodes, rng)
-    current_player = get_current_player(env)
+    current_player = current_player(env)

-    if get_terminal(env)
-        get_reward(env, i)
-    elseif current_player == get_chance_player(env)
-        env(rand(rng, get_actions(env)))
+    if is_terminated(env)
+        reward(env, i)
+    elseif current_player == chance_player(env)
+        env(rand(rng, action_space(env)))
         external_sampling(env, i, nodes, rng)
     else
-        I = get_state(env)
-        legal_actions = get_legal_actions(env)
+        I = state(env)
+        legal_actions = legal_action_space(env)
         n = length(legal_actions)
         node = get!(nodes, I, InfoStateNode(n))
         regret_matching!(node; is_reset_neg_regrets = false)
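The update!(p, env) method above performs one external-sampling traversal per non-chance player, so training is just a loop over it. A hedged sketch of such a loop: the environment is a hypothetical choice, and any averaging or finalization step the policy may need afterwards is not shown because it falls outside this diff:

    using ReinforcementLearningBase
    using ReinforcementLearningEnvironments
    using ReinforcementLearningZoo

    env = KuhnPokerEnv()                 # hypothetical environment choice
    p = ExternalSamplingMCCFRPolicy()    # constructor defaults shown in the hunk above
    for _ in 1:10_000
        RLBase.update!(p, env)           # one traversal per non-chance player
    end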

src/algorithms/cfr/nash_conv.jl

Lines changed: 9 additions & 9 deletions
@@ -1,18 +1,18 @@
 export expected_policy_values, nash_conv

 function expected_policy_values(π::AbstractPolicy, env::AbstractEnv)
-    if get_terminal(env)
-        [get_reward(env, p) for p in get_players(env) if p != get_chance_player(env)]
-    elseif get_current_player(env) == get_chance_player(env)
-        vals = [0.0 for p in get_players(env) if p != get_chance_player(env)]
-        for a::ActionProbPair in get_legal_actions(env)
+    if is_terminated(env)
+        [reward(env, p) for p in get_players(env) if p != chance_player(env)]
+    elseif current_player(env) == chance_player(env)
+        vals = [0.0 for p in get_players(env) if p != chance_player(env)]
+        for a::ActionProbPair in legal_action_space(env)
             vals .+= a.prob .* expected_policy_values(π, child(env, a))
         end
         vals
     else
-        vals = [0.0 for p in get_players(env) if p != get_chance_player(env)]
-        actions = get_actions(env)
-        probs = get_prob(π, env)
+        vals = [0.0 for p in get_players(env) if p != chance_player(env)]
+        actions = action_space(env)
+        probs = prob(π, env)
         @assert length(actions) == length(probs)

         for (a, p) in zip(actions, probs)
@@ -30,7 +30,7 @@ function nash_conv(π, env; is_reduce = true, kw...)

     σ′ = [
         best_response_value(BestResponsePolicy(π, e, i; kw...), e)
-        for i in get_players(e) if i != get_chance_player(e)
+        for i in get_players(e) if i != chance_player(e)
     ]

     σ = expected_policy_values(π, e)
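nash_conv is the exported entry point here: it compares each player's best-response value (σ′) against the expected value of the joint policy (σ), so it is 0 for a Nash equilibrium and grows with exploitability. A minimal, hedged usage sketch; the environment and policy choices are illustrative and not taken from this commit:

    using ReinforcementLearningCore
    using ReinforcementLearningEnvironments
    using ReinforcementLearningZoo

    env = KuhnPokerEnv()          # hypothetical: a small extensive-form env with a chance player
    π = TabularRandomPolicy()     # hypothetical: the joint policy to evaluate
    nash_conv(π, env)             # 0.0 for a Nash equilibrium; larger means more exploitable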

src/algorithms/cfr/outcome_sampling_mccfr.jl

Lines changed: 11 additions & 11 deletions
@@ -19,8 +19,8 @@ end

 (p::OutcomeSamplingMCCFRPolicy)(env::AbstractEnv) = p.behavior_policy(env)

-RLBase.get_prob(p::OutcomeSamplingMCCFRPolicy, env::AbstractEnv) =
-    get_prob(p.behavior_policy, env)
+RLBase.prob(p::OutcomeSamplingMCCFRPolicy, env::AbstractEnv) =
+    prob(p.behavior_policy, env)

 function OutcomeSamplingMCCFRPolicy(; state_type = String, rng = Random.GLOBAL_RNG, ϵ = 0.6)
     OutcomeSamplingMCCFRPolicy(
@@ -38,7 +38,7 @@ end
 "Run one interation"
 function RLBase.update!(p::OutcomeSamplingMCCFRPolicy, env::AbstractEnv)
     for x in get_players(env)
-        if x != get_chance_player(env)
+        if x != chance_player(env)
             outcome_sampling(copy(env), x, p.nodes, p.ϵ, 1.0, 1.0, 1.0, p.rng)
         end
     end
@@ -57,16 +57,16 @@ function RLBase.update!(p::OutcomeSamplingMCCFRPolicy)
 end

 function outcome_sampling(env, i, nodes, ϵ, πᵢ, π₋ᵢ, s, rng)
-    current_player = get_current_player(env)
+    current_player = current_player(env)

-    if get_terminal(env)
-        get_reward(env, i) / s, 1.0
-    elseif current_player == get_chance_player(env)
-        env(rand(rng, get_actions(env)))
+    if is_terminated(env)
+        reward(env, i) / s, 1.0
+    elseif current_player == chance_player(env)
+        env(rand(rng, action_space(env)))
         outcome_sampling(env, i, nodes, ϵ, πᵢ, π₋ᵢ, s, rng)
     else
-        I = get_state(env)
-        legal_actions = get_legal_actions(env)
+        I = state(env)
+        legal_actions = legal_action_space(env)
         n = length(legal_actions)
         node = get!(nodes, I, InfoStateNode(n))
         regret_matching!(node; is_reset_neg_regrets = false)
@@ -82,7 +82,7 @@ function outcome_sampling(env, i, nodes, ϵ, πᵢ, π₋ᵢ, s, rng)
         πᵢ′, π₋ᵢ′, s′ = πᵢ, π₋ᵢ * pᵢ, s * pᵢ
     end

-    env(get_legal_actions(env)[aᵢ])
+    env(legal_action_space(env)[aᵢ])
     u, πₜₐᵢₗ = outcome_sampling(env, i, nodes, ϵ, πᵢ′, π₋ᵢ′, s′, rng)

     if i == current_player
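The ϵ = 0.6 default in the constructor above controls exploration when the traversing player samples its own action. The exact expression lies outside the visible hunks, but in standard outcome-sampling MCCFR the sampling distribution is an ϵ-mixture of uniform and the regret-matched strategy, roughly:

    # Generic outcome-sampling exploration mix (illustrative; not copied from this file):
    ϵ = 0.6
    σ = [0.7, 0.2, 0.1]                              # regret-matched strategy at the current infoset
    n = length(σ)
    sampling_dist = ϵ .* fill(1 / n, n) .+ (1 - ϵ) .* σ
    # sums to 1 and gives every legal action probability of at least ϵ / n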
