# Imports needed when this file is run standalone; inside
# ReinforcementLearningZoo they are provided by the parent module.
using ReinforcementLearning
using StableRNGs
using Flux

# Hook that records (state, action) pairs during training so they
# can later serve as the dataset for behavior cloning.
Base.@kwdef struct RecordStateAction <: AbstractHook
    records::Any = VectorSATrajectory(; state = Vector{Float32})
end

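# At the PreActStage the hook stores a copy of the current state
# together with the action the policy is about to take.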
function (h::RecordStateAction)(::PreActStage, policy, env, action)
    push!(h.records; state = copy(state(env)), action = action)
end

function RLCore.Experiment(
    ::Val{:JuliaRL},
    ::Val{:BC},
    ::Val{:CartPole},
    ::Nothing;
    seed = 123,
    save_dir = nothing,
)
    rng = StableRNG(seed)

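    # CartPole with Float32 states; ns and na are the input and
    # output dimensions of the networks below.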
    env = CartPoleEnv(; T = Float32, rng = rng)
    ns, na = length(state(env)), length(action_space(env))
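    # A BasicDQN agent acts as the "expert" whose behavior is cloned.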
    agent = Agent(
        policy = QBasedPolicy(
            learner = BasicDQNLearner(
                approximator = NeuralNetworkApproximator(
                    model = Chain(
                        Dense(ns, 128, relu; initW = glorot_uniform(rng)),
                        Dense(128, 128, relu; initW = glorot_uniform(rng)),
                        Dense(128, na; initW = glorot_uniform(rng)),
                    ) |> cpu,
                    optimizer = ADAM(),
                ),
                batch_size = 32,
                min_replay_history = 100,
                loss_func = huber_loss,
                rng = rng,
            ),
            explorer = EpsilonGreedyExplorer(
                kind = :exp,
                ϵ_stable = 0.01,
                decay_steps = 500,
                rng = rng,
            ),
        ),
        trajectory = CircularArraySARTTrajectory(
            capacity = 1000,
            state = Vector{Float32} => (ns,),
        ),
    )

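    # Run the expert for 10_000 steps; the hook fills `records`
    # with the (state, action) pairs seen along the way.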
    stop_condition = StopAfterStep(10_000)
    hook = RecordStateAction()
    run(agent, env, stop_condition, hook)

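    # The behavior-cloning policy mirrors the expert's architecture
    # and is trained purely by supervised learning on the records.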
    bc = BehaviorCloningPolicy(
        approximator = NeuralNetworkApproximator(
            model = Chain(
                Dense(ns, 128, relu; initW = glorot_uniform(rng)),
                Dense(128, 128, relu; initW = glorot_uniform(rng)),
                Dense(128, na; initW = glorot_uniform(rng)),
            ) |> cpu,
            optimizer = ADAM(),
        ),
    )

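    # Draw mini-batches of 32 (state, action) pairs from the records.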
    s = BatchSampler{(:state, :action)}(32)

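    # 300 supervised gradient steps; each update! fits the policy's
    # network to predict the recorded action from the recorded state.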
    for _ in 1:300
        _, batch = s(hook.records)
        RLBase.update!(bc, batch)
    end

    description = """
    # Behavior Cloning with CartPole

    This experiment uses the transitions collected during the
    `JuliaRL_BasicDQN_CartPole` experiment to train a behavior
    cloning policy.
    """

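    # Evaluate the cloned policy for 100 episodes, tracking the total
    # reward per episode and the wall-clock time per step.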
    hook = ComposedHook(
        TotalRewardPerEpisode(),
        TimePerStep(),
    )

    Experiment(bc, env, StopAfterEpisode(100), hook, description)
end
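
# A minimal usage sketch (hypothetical; it assumes this file is
# registered with ReinforcementLearning.jl's experiment machinery,
# whose E`...` string macro maps the name onto the Val arguments above):
#
#     using ReinforcementLearning
#     run(E`JuliaRL_BC_CartPole`)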