Implement ε-decay (a decay of the exploration ratio)
wasowski authored and mohsen-ghaffari1992 committed Jan 6, 2024
1 parent 12bc2c7 commit 247598e
Showing 36 changed files with 113 additions and 58 deletions.
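
Before the file-by-file diff, a note on the idea: ε-decay gradually lowers the exploration ratio of an ε-greedy policy between episodes, so early episodes explore broadly and later ones mostly exploit the learned value function. A minimal, self-contained sketch of the kind of schedule this commit adopts (plain Scala, not the library code; the starting value and episode counts are illustrative):

// Illustrative only: exponential ε-decay with a lower bound, mirroring the
// BoundedEpsilonDecay trait added in this commit.
def decayed (ε: Double, factor: Double = 0.99, floor: Double = 1e-5): Double =
  if ε <= floor then ε else ε * factor

@main def epsilonDecayDemo (): Unit =
  // ε after n episodes, starting from ε0 = 0.2
  val schedule = Iterator.iterate (0.2) (decayed (_)).take (1001).toVector
  for n <- List (0, 100, 500, 1000) do
    println (f"episode $n%4d: ε = ${schedule (n)}%.5f")
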
12 changes: 6 additions & 6 deletions src/main/scala/symsim/Bdl.scala
@@ -46,7 +46,7 @@ trait BdlLearn[State, ObservableState, Action, Reward, Scheduler[_]]
def bdl: Update

override def toString: String =
s"BDL(..., 𝜀=$epsilon, $episodes episodes)"
s"BDL(..., 𝜀=$epsilon0, $episodes episodes)"

/** A single step of the learning algorithm
*
@@ -78,7 +78,7 @@ trait BdlLearn[State, ObservableState, Action, Reward, Scheduler[_]]
(os_t, os_tk) = (agent.observe (s_t), agent.observe (s_tk))
expectation = allActions
.map { a =>
vf.probability (ε) (q_t) (os_tk, a)
vf.probability (ε0) (q_t) (os_tk, a)
* q_t (os_tk, a) }
.arithSum
g_tkk = g_tk + γ_tk * expectation
@@ -118,7 +118,7 @@ trait BdlLearn[State, ObservableState, Action, Reward, Scheduler[_]]
for
sr <- agent.step (s_t) (a_t) // intermediate name needed for stryker which fails with -source:future
(s_tt, r_tt) = sr
a_tt <- vf.chooseAction (ε) (q_t) (agent.observe (s_tt))
a_tt <- vf.chooseAction (ε0) (q_t) (agent.observe (s_tt))
g_tt = g_t + γ_t * r_tt
γ_tt = γ_t * γ
yield (s_tt, a_tt, g_tt, γ_tt)
@@ -128,12 +128,12 @@ trait BdlLearn[State, ObservableState, Action, Reward, Scheduler[_]]
sr <- agent.step (s_t) (a_t) // intermediate name needed for stryker which fails with -source:future
(s_tt, r_tt) = sr
os_tt = agent.observe (s_tt)
a_tt <- vf.chooseAction (ε) (q_t) (os_tt)
a_tt <- vf.chooseAction (ε0) (q_t) (os_tt)
expectation = allActions
.filter { _ != a_tt }
.map { a =>
vf.probability (ε) (q_t) (os_tt, a) * q_t (os_tt, a) }
vf.probability (ε0) (q_t) (os_tt, a) * q_t (os_tt, a) }
.arithSum
g_tt = g_t + γ_t * (r_tt + expectation)
γ_tt = γ_t * γ * vf.probability (ε) (q_t) (os_tt, a_tt)
γ_tt = γ_t * γ * vf.probability (ε0) (q_t) (os_tt, a_tt)
yield (s_tt, a_tt, g_tt, γ_tt)
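
The expectation terms above weight each Q-value by vf.probability (ε0) (q_t) (os, a). That function is not shown in this diff; the sketch below spells out the usual ε-greedy reading of it, purely as an assumption about its semantics:

// Sketch, not the symsim implementation: ε-greedy action probabilities,
// assuming `vf.probability` gives every action ε/|A| plus an extra (1 - ε)
// mass on a greedy (argmax) action.
def epsilonGreedyProbability[A] (ε: Double) (q: A => Double) (actions: Seq[A]) (a: A): Double =
  val greedy = actions.maxBy (q)
  val base   = ε / actions.size
  if a == greedy then base + (1 - ε) else base

// The expectation used in the updates is then Σ_a π(a|s) · Q(s, a):
def expectedQ[A] (ε: Double) (q: A => Double) (actions: Seq[A]): Double =
  actions.map { a => epsilonGreedyProbability (ε) (q) (actions) (a) * q (a) }.sum
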
29 changes: 20 additions & 9 deletions src/main/scala/symsim/ExactRL.scala
@@ -32,9 +32,14 @@ trait ExactRL[State, ObservableState, Action, Reward, Scheduler[_]]
import vf.{VF, chooseAction}

def alpha: Double
def epsilon: Probability
def epsilon0: Probability
def α: Double = this.alpha
def ε: Probability = this.epsilon
def ε0: Probability = this.epsilon0

/** A decay function for ε (the exploration probability). Override to define it.
* If no decay is wanted, use the identity function (mix in NoDecay).
*/
def decay (ε: Probability): Probability


/* Policy Learning */
@@ -57,15 +62,19 @@ trait ExactRL[State, ObservableState, Action, Reward, Scheduler[_]]
/** Execute a full learning episode (until the final state of agent is
* reached).
*/
def learningEpisode(fR: (VF, List[VF]), s_t: State): Scheduler[(VF, List[VF])] =
def learningEpisode(fR: (VF, List[VF], Probability), s_t: State)
: Scheduler[(VF, List[VF], Probability)] =

def done (f: VF, s: State, a: Action): Boolean = agent.isFinal(s)
val f = fR._1
val qL_t = fR._2

val (f, qL_t, ε) = fR

for
a <- chooseAction (ε) (f) (agent.observe (s_t))
fin <- Monad[Scheduler].iterateUntilM (f, s_t, a) (learningEpoch) (done)
qL_tt = fin._1 :: qL_t
yield (fin._1, qL_tt)
yield (fin._1, qL_tt, decay (ε))


/** Executes as many full learning episodes (until the final state of agent is
* reached) as the given state scheduler generates. For this method to work
@@ -75,9 +84,11 @@ trait ExactRL[State, ObservableState, Action, Reward, Scheduler[_]]
* Scheduler is lazy then the evaluation is not really doing more than just
* formulating the thunk of that scheduler.
*/
final def learn (f: VF, q_l: List[VF], ss: => Scheduler[State]):
Scheduler[(VF, List[VF])] =
ss.foldM[Scheduler, (VF, List[VF])] (f, q_l) (learningEpisode)
final def learn (f: VF, q_l: List[VF], ss: => Scheduler[State])
: Scheduler[(VF, List[VF])] =
val result =
ss.foldM[Scheduler, (VF, List[VF], Probability)] (f, q_l, ε0) (learningEpisode)
result.map { (vf, history, ε) => (vf, history) }



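
The change above makes learningEpisode return a (value function, history, ε) triple and makes learn fold that triple over the state scheduler, so decay is applied exactly once per episode. A stripped-down sketch of the same threading, with the Scheduler and agent machinery elided (plain Scala, not the library code):

// Sketch only: a per-episode ε schedule falls out of folding `decay` over
// the episodes, just as `learn` now does via `foldM`.
def runEpisodes[VF] (
    episodes : Int,
    ε0       : Double,
    decay    : Double => Double,
    episode  : (VF, Double) => VF     // stand-in for learningEpisode
  ) (vf0: VF): (VF, List[VF]) =
  val (vf, history, _) =
    (1 to episodes).foldLeft ((vf0, List.empty[VF], ε0)) {
      case ((vf, hist, ε), _) =>
        val vf1 = episode (vf, ε)
        (vf1, vf1 :: hist, decay (ε))
    }
  (vf, history)
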
4 changes: 2 additions & 2 deletions src/main/scala/symsim/ExpectedSarsa.scala
@@ -28,11 +28,11 @@ trait ExpectedSarsa[State, ObservableState, Action, Reward, Scheduler[_]]
(s_tt, r_tt) = sa_tt
// Expected Sarsa (p.133 in Sutton & Barto)
(os_t, os_tt) = (agent.observe (s_t), agent.observe (s_tt))
a_tt <- vf.chooseAction (ε) (q_t) (os_tt)
a_tt <- vf.chooseAction (ε0) (q_t) (os_tt)
q_t_value = q_t (os_t, a_t)
expectation = agent.instances.allActions
.map { a =>
vf.probability (ε) (q_t) (os_tt, a)
vf.probability (ε0) (q_t) (os_tt, a)
* q_t (os_tt, a) }
.arithSum
g_tt = r_tt + γ * expectation
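
As a quick worked case of the target g_tt = r_tt + γ · Σ_a π(a|os_tt) · q_t (os_tt, a), under the ε-greedy reading of probability sketched above: with three actions whose Q-values are 1.0, 2.0 and 4.0, ε0 = 0.3 and γ = 0.9, the greedy action has probability 0.1 + 0.7 = 0.8 and the other two have 0.1 each, so the expectation is 0.1·1.0 + 0.1·2.0 + 0.8·4.0 = 3.5 and the target becomes r_tt + 0.9 · 3.5 = r_tt + 3.15.
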
2 changes: 1 addition & 1 deletion src/main/scala/symsim/QLearning.scala
@@ -36,5 +36,5 @@ trait QLearning[State, ObservableState, Action, Reward, Scheduler[_]]
qval = old_entry + alpha * correction

q1 = q.updated (ds_t, a_t, qval)
a_tt1 <- chooseAction (ε) (q1) (ds_tt)
a_tt1 <- chooseAction (ε0) (q1) (ds_tt)
yield (q1, s_tt, a_tt1)
2 changes: 1 addition & 1 deletion src/main/scala/symsim/Sarsa.scala
@@ -29,7 +29,7 @@ trait Sarsa[State, ObservableState, Action, Reward, Scheduler[_]]
(s_tt, r_tt) = sa_tt
// SARSA: on-policy (p.844 in Russel & Norvig)
(os_t, os_tt) = (agent.observe (s_t), agent.observe (s_tt))
a_tt <- vf.chooseAction (ε) (q_t) (os_tt)
a_tt <- vf.chooseAction (ε0) (q_t) (os_tt)
q_t_value = q_t (os_t, a_t)
g_tt = r_tt + γ * q_t (os_tt, a_tt)
q_tt_value = q_t_value + α * (g_tt - q_t_value)
5 changes: 3 additions & 2 deletions src/main/scala/symsim/concrete/BdlConcreteExpectedSarsa.scala
@@ -11,10 +11,11 @@ case class BdlConcreteExpectedSarsa [
val agent: Agent[State, ObservableState, Action, Double, Randomized],
val alpha: Double,
val gamma: Double,
val epsilon: Probability,
val epsilon0: Probability,
val episodes: Int,
) extends BdlLearn[State, ObservableState, Action, Double, Randomized],
ConcreteExactRL[State, ObservableState, Action]:
ConcreteExactRL[State, ObservableState, Action],
NoDecay:

import Est.*, Upd.*
val bdl = Update (List(Sample (gamma)), alpha, ExpectationU)
5 changes: 3 additions & 2 deletions src/main/scala/symsim/concrete/BdlConcreteSarsa.scala
@@ -11,10 +11,11 @@ case class BdlConcreteSarsa [
val agent: Agent[State, ObservableState, Action, Double, Randomized],
val alpha: Double,
val gamma: Double,
val epsilon: Probability,
val epsilon0: Probability,
val episodes: Int,
) extends BdlLearn[State, ObservableState, Action, Double, Randomized],
ConcreteExactRL[State, ObservableState, Action]:
ConcreteExactRL[State, ObservableState, Action],
NoDecay:

import Est.*, Upd.*
val bdl = Update (List(Sample (gamma)), alpha, SampleU)
10 changes: 10 additions & 0 deletions src/main/scala/symsim/concrete/BoundedEpsilonDecay.scala
@@ -0,0 +1,10 @@
package symsim
package concrete

trait BoundedEpsilonDecay:

def decayFactor: Double = 0.99
def minExploration: Double = 0.00001

def decay (ε: Probability): Probability =
if ε <= minExploration then ε else ε * decayFactor
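
With these defaults, ε shrinks geometrically (εₙ = ε₀ · 0.99ⁿ) until it reaches the 0.00001 floor, after which it stays constant. A small usage sketch of the resulting schedule, assuming Probability unifies with Double:

// Sketch: the first few ε values produced by BoundedEpsilonDecay's defaults,
// assuming symsim.concrete.Probability is (an alias of) Double.
val bounded = new BoundedEpsilonDecay {}
val εs      = Iterator.iterate (0.1) (bounded.decay).take (5).toList
// ≈ List(0.1, 0.099, 0.09801, 0.0970299, 0.0960596)
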
7 changes: 4 additions & 3 deletions src/main/scala/symsim/concrete/ConcreteExpectedSarsa.scala
@@ -5,10 +5,11 @@ case class ConcreteExpectedSarsa[State, ObservableState, Action] (
val agent: Agent[State, ObservableState, Action, Double, Randomized],
val alpha: Double,
val gamma: Double,
val epsilon: Probability,
val epsilon0: Probability,
val episodes: Int,
) extends ExpectedSarsa[State, ObservableState, Action, Double, Randomized],
ConcreteExactRL[State, ObservableState, Action]:
ConcreteExactRL[State, ObservableState, Action],
NoDecay:

override def toString: String =
s"ExpectedSarsa-Learn(α=$alpha, 𝛾=$gamma, 𝜀=$epsilon, $episodes episodes)"
s"ExpectedSarsa-Learn(α=$alpha, 𝛾=$gamma, 𝜀=$epsilon0, $episodes episodes)"
7 changes: 4 additions & 3 deletions src/main/scala/symsim/concrete/ConcreteQLearning.scala
@@ -11,10 +11,11 @@ case class ConcreteQLearning [
val agent: Agent[State, ObservableState, Action, Double, Randomized],
val alpha: Double,
val gamma: Double,
val epsilon: Probability,
val epsilon0: Probability,
val episodes: Int,
) extends QLearning[State, ObservableState, Action, Double, Randomized],
ConcreteExactRL[State, ObservableState, Action]:
ConcreteExactRL[State, ObservableState, Action],
NoDecay:

override def toString: String =
s"Q-Learn(α=$alpha, 𝛾=$gamma, 𝜀=$epsilon, $episodes episodes)"
s"Q-Learn(α=$alpha, 𝛾=$gamma, 𝜀=$epsilon0, $episodes episodes)"
23 changes: 23 additions & 0 deletions src/main/scala/symsim/concrete/ConcreteQLearningWithDecay.scala
@@ -0,0 +1,23 @@
package symsim
package concrete

import cats.kernel.BoundedEnumerable

case class ConcreteQLearningWithDecay [
State,
ObservableState: BoundedEnumerable,
Action: BoundedEnumerable
] (

val agent: Agent[State, ObservableState, Action, Double, Randomized],
val alpha: Double,
val gamma: Double,
val epsilon0: Probability,
val episodes: Int,

) extends QLearning[State, ObservableState, Action, Double, Randomized],
ConcreteExactRL[State, ObservableState, Action],
BoundedEpsilonDecay:

override def toString: String =
s"Q-Learn(α=$alpha, 𝛾=$gamma, 𝜀=$epsilon0, $episodes episodes)"
7 changes: 4 additions & 3 deletions src/main/scala/symsim/concrete/ConcreteSarsa.scala
@@ -11,10 +11,11 @@ case class ConcreteSarsa [
val agent: Agent[State, ObservableState, Action, Double, Randomized],
val alpha: Double,
val gamma: Double,
val epsilon: Probability,
val epsilon0: Probability,
val episodes: Int,
) extends Sarsa[State, ObservableState, Action, Double, Randomized],
ConcreteExactRL[State, ObservableState, Action]:
ConcreteExactRL[State, ObservableState, Action],
NoDecay:

override def toString: String =
s"SARSA(α=$alpha, 𝛾=$gamma, 𝜀=$epsilon, $episodes episodes)"
s"SARSA(α=$alpha, 𝛾=$gamma, 𝜀=$epsilon0, $episodes episodes)"
2 changes: 1 addition & 1 deletion src/main/scala/symsim/concrete/ConcreteVTable.scala
@@ -15,7 +15,7 @@ trait ConcreteVTable[State, ObservableState, Action]
def bestAction (v: V) (s: State): Action = ???

def chooseAction (v: V) (s: State): Scheduler[Action] = for
explore <- Randomized.coin (this.epsilon)
explore <- Randomized.coin (this.epsilon0)
action <- if explore
then Randomized.oneOf (allActions*)
else Randomized.const (bestAction (v) (s))
6 changes: 6 additions & 0 deletions src/main/scala/symsim/concrete/NoDecay.scala
@@ -0,0 +1,6 @@
package symsim
package concrete

trait NoDecay:

def decay (ε: Probability): Probability = ε
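
Because decay is just an overridable function on the learner, other schedules can be mixed in the same way. A hypothetical linear schedule, shown only to illustrate the extension point (LinearEpsilonDecay is not part of this commit; Probability is again assumed to unify with Double):

package symsim
package concrete

// Hypothetical example, not in this commit: a linear ε schedule in the same
// shape as BoundedEpsilonDecay.
trait LinearEpsilonDecay:

  def decayStep: Double = 0.001
  def minExploration: Double = 0.00001

  def decay (ε: Probability): Probability =
    if ε - decayStep <= minExploration then minExploration else ε - decayStep
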
2 changes: 1 addition & 1 deletion src/main/scala/symsim/laws/ConcreteExpectedSarsaLaws.scala
@@ -50,7 +50,7 @@ case class ConcreteExpectedSarsaLaws[State, ObservableState, Action]
*/
val bdl =
symsim.concrete.BdlConcreteExpectedSarsa[State, ObservableState, Action]
(agent, sarsa.α, this.γ, sarsa.ε, -1)
(agent, sarsa.α, this.γ, sarsa.ε0, -1)

given Arbitrary[Q] =
Arbitrary (vf.genVF (using agent.instances.arbitraryReward))
2 changes: 1 addition & 1 deletion src/main/scala/symsim/laws/ConcreteSarsaLaws.scala
@@ -45,7 +45,7 @@ case class ConcreteSarsaLaws[State, ObservableState, Action]
// A shortcut for instantiating the interpreter with the right term for SARSA
val bdl =
symsim.concrete.BdlConcreteSarsa[State, ObservableState, Action]
(agent, sarsa.α, this.γ, sarsa.ε, -1)
(agent, sarsa.α, this.γ, sarsa.ε0, -1)

given Arbitrary[Q] =
Arbitrary (vf.genVF (using agent.instances.arbitraryReward))
2 changes: 1 addition & 1 deletion src/main/scala/symsim/laws/SarsaLaws.scala
@@ -68,7 +68,7 @@ case class SarsaLaws[State, ObservableState, Action, Reward, Scheduler[_]]
forAll (sarsa.vf.genVF) { (q: sarsa.vf.Q) =>
forAll { (s: State) =>
val sa: Scheduler[Action] =
sarsa.vf.chooseAction (sarsa.ε) (q) (sarsa.agent.observe (s))
sarsa.vf.chooseAction (sarsa.ε0) (q) (sarsa.agent.observe (s))
forAll (sa.toGen) { a => allActions.contains (a)
} } },
)
@@ -15,7 +15,7 @@ class BdlConcreteExpectedSarsaIsExpectedSarsaSpec
agent = MountainCar,
alpha = 0.1,
gamma = 0.2,
epsilon = 0.0, // The update distribution test requires low ε for stability
epsilon0 = 0.0, // The update distribution test requires low ε for stability
episodes = -1, // Not used in this test
)

@@ -15,7 +15,7 @@ class BdlConcreteSarsaIsSarsaSpec
agent = MountainCar,
alpha = 0.1,
gamma = 0.2,
epsilon = 0.0, // the update distribution test requires low ε for stability
epsilon0 = 0.0, // the update distribution test requires low ε for stability
episodes = 1000,
)

@@ -12,7 +12,7 @@ class ConcreteExpectedSarsaIsSarsaSpec
agent = MountainCar,
alpha = 0.1,
gamma = 0.2,
epsilon = 0.003, // The update distribution test requires low ε for stability
epsilon0 = 0.003, // The update distribution test requires low ε for stability
episodes = -1, // Not used in this test
)

@@ -17,7 +17,7 @@ class ConcreteQLearningIsSarsaSpec
agent = MountainCar,
alpha = 0.1,
gamma = 0.2,
epsilon = 0.0, // The update distribution test requires low ε for stability
epsilon0 = 0.0, // The update distribution test requires low ε for stability
episodes = -1, // Not used in this test
)

@@ -12,7 +12,7 @@ class ConcreteSarsaIsSarsaSpec
agent = MountainCar,
alpha = 0.1,
gamma = 0.2,
epsilon = 0.003, // The update distribution test requires low ε for stability
epsilon0 = 0.003, // The update distribution test requires low ε for stability
episodes = -1, // Not used in this test
)

4 changes: 2 additions & 2 deletions src/test/scala/symsim/concrete/ConcreteSarsaSpec.scala
@@ -14,7 +14,7 @@ class ConcreteSarsaSpec
agent = UnitAgent,
alpha = 0.1,
gamma = 0.1,
epsilon = 0.2, // explore vs exploit ratio
epsilon0 = 0.2, // explore vs exploit ratio
episodes = 2*C
)

@@ -43,7 +43,7 @@ class ConcreteSarsaSpec
// but at least checks for crash
// also with the immediate final state 'learn' is not really tested here
"learn is tail recursive, no stack overflow (regression)" in {
val result = sarsa.learningEpisode ((sarsa.vf.initialize, List[sarsa.vf.Q]()), ())
val result = sarsa.learningEpisode ((sarsa.vf.initialize, List[sarsa.vf.Q](), sarsa.ε0), ())
result.head
}

4 changes: 2 additions & 2 deletions src/test/scala/symsim/concrete/UnitAgentExperimentsSpec.scala
@@ -11,7 +11,7 @@ class UnitAgentExperiments
agent = UnitAgent,
alpha = 0.1,
gamma = 0.1,
epsilon = 0.05, // explore vs exploit ratio
epsilon0 = 0.05, // explore vs exploit ratio
episodes = 100,
)

@@ -24,7 +24,7 @@ class UnitAgentExperiments
agent = UnitAgent,
alpha = 0.1,
gamma = 0.1,
epsilon = 0.05, // explore vs exploit ratio
epsilon0 = 0.05, // explore vs exploit ratio
episodes = 100,
)

@@ -14,7 +14,7 @@ class Experiments
agent = Car,
alpha = 0.1,
gamma = 0.1,
epsilon = 0.05,
epsilon0 = 0.05,
episodes = 100000,
)

@@ -11,7 +11,7 @@ class Experiments extends
agent = CartPole,
alpha = 0.1,
gamma = 0.1,
epsilon = 0.05,
epsilon0 = 0.05,
episodes = 20000,
)

@@ -13,7 +13,7 @@ class Experiments
agent = CliffWalking,
alpha = 0.1,
gamma = 0.1,
epsilon = 0.1,
epsilon0 = 0.1,
episodes = 100
)

@@ -11,7 +11,7 @@ class Experiments
agent = Golf,
alpha = 0.1,
gamma = 0.1,
epsilon = 0.1,
epsilon0 = 0.1,
episodes = 20000,
)
