Implement ε-decay (a decay of the exploration ratio)
wasowski authored and mohsen-ghaffari1992 committed Jan 6, 2024
1 parent 12bc2c7 commit 247598e
Showing 36 changed files with 113 additions and 58 deletions.
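
Before the file-by-file diff, a note on the idea: ε-decay gradually lowers the exploration ratio of an ε-greedy policy between episodes, so early episodes explore broadly and later ones mostly exploit the learned value function. A minimal, self-contained sketch of the kind of schedule this commit adopts (plain Scala, not the library code; the starting value and episode counts are illustrative):

// Illustrative only: exponential ε-decay with a lower bound, mirroring the
// BoundedEpsilonDecay trait added in this commit.
def decayed (ε: Double, factor: Double = 0.99, floor: Double = 1e-5): Double =
  if ε <= floor then ε else ε * factor

@main def epsilonDecayDemo (): Unit =
  // ε after n episodes, starting from ε0 = 0.2
  val schedule = Iterator.iterate (0.2) (decayed (_)).take (1001).toVector
  for n <- List (0, 100, 500, 1000) do
    println (f"episode $n%4d: ε = ${schedule (n)}%.5f")
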
12 changes: 6 additions & 6 deletions src/main/scala/symsim/Bdl.scala
@@ -46,7 +46,7 @@ trait BdlLearn[State, ObservableState, Action, Reward, Scheduler[_]]
def bdl: Update

override def toString: String =
s"BDL(..., 𝜀=$epsilon, $episodes episodes)"
s"BDL(..., 𝜀=$epsilon0, $episodes episodes)"

/** A single step of the learning algorithm
*
@@ -78,7 +78,7 @@ trait BdlLearn[State, ObservableState, Action, Reward, Scheduler[_]]
(os_t, os_tk) = (agent.observe (s_t), agent.observe (s_tk))
expectation = allActions
.map { a =>
vf.probability (ε) (q_t) (os_tk, a)
vf.probability (ε0) (q_t) (os_tk, a)
* q_t (os_tk, a) }
.arithSum
g_tkk = g_tk + γ_tk * expectation
@@ -118,7 +118,7 @@ trait BdlLearn[State, ObservableState, Action, Reward, Scheduler[_]]
for
sr <- agent.step (s_t) (a_t) // intermediate name needed for stryker which fails with -source:future
(s_tt, r_tt) = sr
a_tt <- vf.chooseAction (ε) (q_t) (agent.observe (s_tt))
a_tt <- vf.chooseAction (ε0) (q_t) (agent.observe (s_tt))
g_tt = g_t + γ_t * r_tt
γ_tt = γ_t * γ
yield (s_tt, a_tt, g_tt, γ_tt)
@@ -128,12 +128,12 @@ trait BdlLearn[State, ObservableState, Action, Reward, Scheduler[_]]
sr <- agent.step (s_t) (a_t) // intermediate name needed for stryker which fails with -source:future
(s_tt, r_tt) = sr
os_tt = agent.observe (s_tt)
a_tt <- vf.chooseAction (ε) (q_t) (os_tt)
a_tt <- vf.chooseAction (ε0) (q_t) (os_tt)
expectation = allActions
.filter { _ != a_tt }
.map { a =>
vf.probability (ε) (q_t) (os_tt, a) * q_t (os_tt, a) }
vf.probability (ε0) (q_t) (os_tt, a) * q_t (os_tt, a) }
.arithSum
g_tt = g_t + γ_t * (r_tt + expectation)
γ_tt = γ_t * γ * vf.probability (ε) (q_t) (os_tt, a_tt)
γ_tt = γ_t * γ * vf.probability (ε0) (q_t) (os_tt, a_tt)
yield (s_tt, a_tt, g_tt, γ_tt)
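
The expectation terms above weight each Q-value by vf.probability (ε0) (q_t) (os, a). That function is not shown in this diff; the sketch below spells out the usual ε-greedy reading of it, purely as an assumption about its semantics:

// Sketch, not the symsim implementation: ε-greedy action probabilities,
// assuming `vf.probability` gives every action ε/|A| plus an extra (1 - ε)
// mass on a greedy (argmax) action.
def epsilonGreedyProbability[A] (ε: Double) (q: A => Double) (actions: Seq[A]) (a: A): Double =
  val greedy = actions.maxBy (q)
  val base   = ε / actions.size
  if a == greedy then base + (1 - ε) else base

// The expectation used in the updates is then Σ_a π(a|s) · Q(s, a):
def expectedQ[A] (ε: Double) (q: A => Double) (actions: Seq[A]): Double =
  actions.map { a => epsilonGreedyProbability (ε) (q) (actions) (a) * q (a) }.sum
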
29 changes: 20 additions & 9 deletions src/main/scala/symsim/ExactRL.scala
@@ -32,9 +32,14 @@ trait ExactRL[State, ObservableState, Action, Reward, Scheduler[_]]
import vf.{VF, chooseAction}

def alpha: Double
def epsilon: Probability
def epsilon0: Probability
def α: Double = this.alpha
def ε: Probability = this.epsilon
def ε0: Probability = this.epsilon0

/** A decay function for ε (the exploration probability). Override to define it.
* If no decay is wanted, use the identity function (mix in NoDecay).
*/
def decay (ε: Probability): Probability


/* Policy Learning */
@@ -57,15 +62,19 @@ trait ExactRL[State, ObservableState, Action, Reward, Scheduler[_]]
/** Execute a full learning episode (until the final state of agent is
* reached).
*/
def learningEpisode(fR: (VF, List[VF]), s_t: State): Scheduler[(VF, List[VF])] =
def learningEpisode(fR: (VF, List[VF], Probability), s_t: State)
: Scheduler[(VF, List[VF], Probability)] =

def done (f: VF, s: State, a: Action): Boolean = agent.isFinal(s)
val f = fR._1
val qL_t = fR._2

val (f, qL_t, ε) = fR

for
a <- chooseAction (ε) (f) (agent.observe (s_t))
fin <- Monad[Scheduler].iterateUntilM (f, s_t, a) (learningEpoch) (done)
qL_tt = fin._1 :: qL_t
yield (fin._1, qL_tt)
yield (fin._1, qL_tt, decay (ε))


/** Executes as many full learning episodes (until the final state of agent is
* reached) as the given state scheduler generates. For this method to work
@@ -75,9 +84,11 @@ trait ExactRL[State, ObservableState, Action, Reward, Scheduler[_]]
* Scheduler is lazy then the evaluation is not really doing more than just
* formulating the thunk of that scheduler.
*/
final def learn (f: VF, q_l: List[VF], ss: => Scheduler[State]):
Scheduler[(VF, List[VF])] =
ss.foldM[Scheduler, (VF, List[VF])] (f, q_l) (learningEpisode)
final def learn (f: VF, q_l: List[VF], ss: => Scheduler[State])
: Scheduler[(VF, List[VF])] =
val result =
ss.foldM[Scheduler, (VF, List[VF], Probability)] (f, q_l, ε0) (learningEpisode)
result.map { (vf, history, ε) => (vf, history) }



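
The change above makes learningEpisode return a (value function, history, ε) triple and makes learn fold that triple over the state scheduler, so decay is applied exactly once per episode. A stripped-down sketch of the same threading, with the Scheduler and agent machinery elided (plain Scala, not the library code):

// Sketch only: a per-episode ε schedule falls out of folding `decay` over
// the episodes, just as `learn` now does via `foldM`.
def runEpisodes[VF] (
    episodes : Int,
    ε0       : Double,
    decay    : Double => Double,
    episode  : (VF, Double) => VF     // stand-in for learningEpisode
  ) (vf0: VF): (VF, List[VF]) =
  val (vf, history, _) =
    (1 to episodes).foldLeft ((vf0, List.empty[VF], ε0)) {
      case ((vf, hist, ε), _) =>
        val vf1 = episode (vf, ε)
        (vf1, vf1 :: hist, decay (ε))
    }
  (vf, history)
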
4 changes: 2 additions & 2 deletions src/main/scala/symsim/ExpectedSarsa.scala
@@ -28,11 +28,11 @@ trait ExpectedSarsa[State, ObservableState, Action, Reward, Scheduler[_]]
(s_tt, r_tt) = sa_tt
// Expected Sarsa (p.133 in Sutton & Barto)
(os_t, os_tt) = (agent.observe (s_t), agent.observe (s_tt))
a_tt <- vf.chooseAction (ε) (q_t) (os_tt)
a_tt <- vf.chooseAction (ε0) (q_t) (os_tt)
q_t_value = q_t (os_t, a_t)
expectation = agent.instances.allActions
.map { a =>
vf.probability (ε) (q_t) (os_tt, a)
vf.probability (ε0) (q_t) (os_tt, a)
* q_t (os_tt, a) }
.arithSum
g_tt = r_tt + γ * expectation
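
As a quick worked case of the target g_tt = r_tt + γ · Σ_a π(a|os_tt) · q_t (os_tt, a), under the ε-greedy reading of probability sketched above: with three actions whose Q-values are 1.0, 2.0 and 4.0, ε0 = 0.3 and γ = 0.9, the greedy action has probability 0.1 + 0.7 = 0.8 and the other two have 0.1 each, so the expectation is 0.1·1.0 + 0.1·2.0 + 0.8·4.0 = 3.5 and the target becomes r_tt + 0.9 · 3.5 = r_tt + 3.15.
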
2 changes: 1 addition & 1 deletion src/main/scala/symsim/QLearning.scala
@@ -36,5 +36,5 @@ trait QLearning[State, ObservableState, Action, Reward, Scheduler[_]]
qval = old_entry + alpha * correction

q1 = q.updated (ds_t, a_t, qval)
a_tt1 <- chooseAction (ε) (q1) (ds_tt)
a_tt1 <- chooseAction (ε0) (q1) (ds_tt)
yield (q1, s_tt, a_tt1)
2 changes: 1 addition & 1 deletion src/main/scala/symsim/Sarsa.scala
@@ -29,7 +29,7 @@ trait Sarsa[State, ObservableState, Action, Reward, Scheduler[_]]
(s_tt, r_tt) = sa_tt
// SARSA: on-policy (p.844 in Russel & Norvig)
(os_t, os_tt) = (agent.observe (s_t), agent.observe (s_tt))
a_tt <- vf.chooseAction (ε) (q_t) (os_tt)
a_tt <- vf.chooseAction (ε0) (q_t) (os_tt)
q_t_value = q_t (os_t, a_t)
g_tt = r_tt + γ * q_t (os_tt, a_tt)
q_tt_value = q_t_value + α * (g_tt - q_t_value)
5 changes: 3 additions & 2 deletions src/main/scala/symsim/concrete/BdlConcreteExpectedSarsa.scala
@@ -11,10 +11,11 @@ case class BdlConcreteExpectedSarsa [
val agent: Agent[State, ObservableState, Action, Double, Randomized],
val alpha: Double,
val gamma: Double,
val epsilon: Probability,
val epsilon0: Probability,
val episodes: Int,
) extends BdlLearn[State, ObservableState, Action, Double, Randomized],
ConcreteExactRL[State, ObservableState, Action]:
ConcreteExactRL[State, ObservableState, Action],
NoDecay:

import Est.*, Upd.*
val bdl = Update (List(Sample (gamma)), alpha, ExpectationU)
5 changes: 3 additions & 2 deletions src/main/scala/symsim/concrete/BdlConcreteSarsa.scala
@@ -11,10 +11,11 @@ case class BdlConcreteSarsa [
val agent: Agent[State, ObservableState, Action, Double, Randomized],
val alpha: Double,
val gamma: Double,
val epsilon: Probability,
val epsilon0: Probability,
val episodes: Int,
) extends BdlLearn[State, ObservableState, Action, Double, Randomized],
ConcreteExactRL[State, ObservableState, Action]:
ConcreteExactRL[State, ObservableState, Action],
NoDecay:

import Est.*, Upd.*
val bdl = Update (List(Sample (gamma)), alpha, SampleU)
10 changes: 10 additions & 0 deletions src/main/scala/symsim/concrete/BoundedEpsilonDecay.scala
@@ -0,0 +1,10 @@
package symsim
package concrete

trait BoundedEpsilonDecay:

def decayFactor: Double = 0.99
def minExploration: Double = 0.00001

def decay (ε: Probability): Probability =
if ε <= minExploration then ε else ε * decayFactor
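
With these defaults, ε shrinks geometrically (εₙ = ε₀ · 0.99ⁿ) until it reaches the 0.00001 floor, after which it stays constant. A small usage sketch of the resulting schedule, assuming Probability unifies with Double:

// Sketch: the first few ε values produced by BoundedEpsilonDecay's defaults,
// assuming symsim.concrete.Probability is (an alias of) Double.
val bounded = new BoundedEpsilonDecay {}
val εs      = Iterator.iterate (0.1) (bounded.decay).take (5).toList
// ≈ List(0.1, 0.099, 0.09801, 0.0970299, 0.0960596)
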
7 changes: 4 additions & 3 deletions src/main/scala/symsim/concrete/ConcreteExpectedSarsa.scala
@@ -5,10 +5,11 @@ case class ConcreteExpectedSarsa[State, ObservableState, Action] (
val agent: Agent[State, ObservableState, Action, Double, Randomized],
val alpha: Double,
val gamma: Double,
val epsilon: Probability,
val epsilon0: Probability,
val episodes: Int,
) extends ExpectedSarsa[State, ObservableState, Action, Double, Randomized],
ConcreteExactRL[State, ObservableState, Action]:
ConcreteExactRL[State, ObservableState, Action],
NoDecay:

override def toString: String =
s"ExpectedSarsa-Learn(α=$alpha, 𝛾=$gamma, 𝜀=$epsilon, $episodes episodes)"
s"ExpectedSarsa-Learn(α=$alpha, 𝛾=$gamma, 𝜀=$epsilon0, $episodes episodes)"
7 changes: 4 additions & 3 deletions src/main/scala/symsim/concrete/ConcreteQLearning.scala
@@ -11,10 +11,11 @@ case class ConcreteQLearning [
val agent: Agent[State, ObservableState, Action, Double, Randomized],
val alpha: Double,
val gamma: Double,
val epsilon: Probability,
val epsilon0: Probability,
val episodes: Int,
) extends QLearning[State, ObservableState, Action, Double, Randomized],
ConcreteExactRL[State, ObservableState, Action]:
ConcreteExactRL[State, ObservableState, Action],
NoDecay:

override def toString: String =
s"Q-Learn(α=$alpha, 𝛾=$gamma, 𝜀=$epsilon, $episodes episodes)"
s"Q-Learn(α=$alpha, 𝛾=$gamma, 𝜀=$epsilon0, $episodes episodes)"
23 changes: 23 additions & 0 deletions src/main/scala/symsim/concrete/ConcreteQLearningWithDecay.scala
@@ -0,0 +1,23 @@
package symsim
package concrete

import cats.kernel.BoundedEnumerable

case class ConcreteQLearningWithDecay [
State,
ObservableState: BoundedEnumerable,
Action: BoundedEnumerable
] (

val agent: Agent[State, ObservableState, Action, Double, Randomized],
val alpha: Double,
val gamma: Double,
val epsilon0: Probability,
val episodes: Int,

) extends QLearning[State, ObservableState, Action, Double, Randomized],
ConcreteExactRL[State, ObservableState, Action],
BoundedEpsilonDecay:

override def toString: String =
s"Q-Learn(α=$alpha, 𝛾=$gamma, 𝜀=$epsilon0, $episodes episodes)"
7 changes: 4 additions & 3 deletions src/main/scala/symsim/concrete/ConcreteSarsa.scala
@@ -11,10 +11,11 @@ case class ConcreteSarsa [
val agent: Agent[State, ObservableState, Action, Double, Randomized],
val alpha: Double,
val gamma: Double,
val epsilon: Probability,
val epsilon0: Probability,
val episodes: Int,
) extends Sarsa[State, ObservableState, Action, Double, Randomized],
ConcreteExactRL[State, ObservableState, Action]:
ConcreteExactRL[State, ObservableState, Action],
NoDecay:

override def toString: String =
s"SARSA(α=$alpha, 𝛾=$gamma, 𝜀=$epsilon, $episodes episodes)"
s"SARSA(α=$alpha, 𝛾=$gamma, 𝜀=$epsilon0, $episodes episodes)"
2 changes: 1 addition & 1 deletion src/main/scala/symsim/concrete/ConcreteVTable.scala
@@ -15,7 +15,7 @@ trait ConcreteVTable[State, ObservableState, Action]
def bestAction (v: V) (s: State): Action = ???

def chooseAction (v: V) (s: State): Scheduler[Action] = for
explore <- Randomized.coin (this.epsilon)
explore <- Randomized.coin (this.epsilon0)
action <- if explore
then Randomized.oneOf (allActions*)
else Randomized.const (bestAction (v) (s))
6 changes: 6 additions & 0 deletions src/main/scala/symsim/concrete/NoDecay.scala
@@ -0,0 +1,6 @@
package symsim
package concrete

trait NoDecay:

def decay (ε: Probability): Probability = ε
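
Because decay is just an overridable function on the learner, other schedules can be mixed in the same way. A hypothetical linear schedule, shown only to illustrate the extension point (LinearEpsilonDecay is not part of this commit; Probability is again assumed to unify with Double):

package symsim
package concrete

// Hypothetical example, not in this commit: a linear ε schedule in the same
// shape as BoundedEpsilonDecay.
trait LinearEpsilonDecay:

  def decayStep: Double = 0.001
  def minExploration: Double = 0.00001

  def decay (ε: Probability): Probability =
    if ε - decayStep <= minExploration then minExploration else ε - decayStep
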
2 changes: 1 addition & 1 deletion src/main/scala/symsim/laws/ConcreteExpectedSarsaLaws.scala
@@ -50,7 +50,7 @@ case class ConcreteExpectedSarsaLaws[State, ObservableState, Action]
*/
val bdl =
symsim.concrete.BdlConcreteExpectedSarsa[State, ObservableState, Action]
(agent, sarsa.α, this.γ, sarsa.ε, -1)
(agent, sarsa.α, this.γ, sarsa.ε0, -1)

given Arbitrary[Q] =
Arbitrary (vf.genVF (using agent.instances.arbitraryReward))
2 changes: 1 addition & 1 deletion src/main/scala/symsim/laws/ConcreteSarsaLaws.scala
@@ -45,7 +45,7 @@ case class ConcreteSarsaLaws[State, ObservableState, Action]
// A shortcut for instantiating the interpreter with the right term for SARSA
val bdl =
symsim.concrete.BdlConcreteSarsa[State, ObservableState, Action]
(agent, sarsa.α, this.γ, sarsa.ε, -1)
(agent, sarsa.α, this.γ, sarsa.ε0, -1)

given Arbitrary[Q] =
Arbitrary (vf.genVF (using agent.instances.arbitraryReward))
2 changes: 1 addition & 1 deletion src/main/scala/symsim/laws/SarsaLaws.scala
@@ -68,7 +68,7 @@ case class SarsaLaws[State, ObservableState, Action, Reward, Scheduler[_]]
forAll (sarsa.vf.genVF) { (q: sarsa.vf.Q) =>
forAll { (s: State) =>
val sa: Scheduler[Action] =
sarsa.vf.chooseAction (sarsa.ε) (q) (sarsa.agent.observe (s))
sarsa.vf.chooseAction (sarsa.ε0) (q) (sarsa.agent.observe (s))
forAll (sa.toGen) { a => allActions.contains (a)
} } },
)
@@ -15,7 +15,7 @@ class BdlConcreteExpectedSarsaIsExpectedSarsaSpec
agent = MountainCar,
alpha = 0.1,
gamma = 0.2,
epsilon = 0.0, // The update distribution test requires low ε for stability
epsilon0 = 0.0, // The update distribution test requires low ε for stability
episodes = -1, // Not used in this test
)

@@ -15,7 +15,7 @@ class BdlConcreteSarsaIsSarsaSpec
agent = MountainCar,
alpha = 0.1,
gamma = 0.2,
epsilon = 0.0, // the update distribution test requires low ε for stability
epsilon0 = 0.0, // the update distribution test requires low ε for stability
episodes = 1000,
)

@@ -12,7 +12,7 @@ class ConcreteExpectedSarsaIsSarsaSpec
agent = MountainCar,
alpha = 0.1,
gamma = 0.2,
epsilon = 0.003, // The update distribution test requires low ε for stability
epsilon0 = 0.003, // The update distribution test requires low ε for stability
episodes = -1, // Not used in this test
)

@@ -17,7 +17,7 @@ class ConcreteQLearningIsSarsaSpec
agent = MountainCar,
alpha = 0.1,
gamma = 0.2,
epsilon = 0.0, // The update distribution test requires low ε for stability
epsilon0 = 0.0, // The update distribution test requires low ε for stability
episodes = -1, // Not used in this test
)

@@ -12,7 +12,7 @@ class ConcreteSarsaIsSarsaSpec
agent = MountainCar,
alpha = 0.1,
gamma = 0.2,
epsilon = 0.003, // The update distribution test requires low ε for stability
epsilon0 = 0.003, // The update distribution test requires low ε for stability
episodes = -1, // Not used in this test
)

4 changes: 2 additions & 2 deletions src/test/scala/symsim/concrete/ConcreteSarsaSpec.scala
@@ -14,7 +14,7 @@ class ConcreteSarsaSpec
agent = UnitAgent,
alpha = 0.1,
gamma = 0.1,
epsilon = 0.2, // explore vs exploit ratio
epsilon0 = 0.2, // explore vs exploit ratio
episodes = 2*C
)

@@ -43,7 +43,7 @@ class ConcreteSarsaSpec
// but at least checks for crash
// also with the immediate final state 'learn' is not really tested here
"learn is tail recursive, no stack overflow (regression)" in {
val result = sarsa.learningEpisode ((sarsa.vf.initialize, List[sarsa.vf.Q]()), ())
val result = sarsa.learningEpisode ((sarsa.vf.initialize, List[sarsa.vf.Q](), sarsa.ε0), ())
result.head
}

4 changes: 2 additions & 2 deletions src/test/scala/symsim/concrete/UnitAgentExperimentsSpec.scala
@@ -11,7 +11,7 @@ class UnitAgentExperiments
agent = UnitAgent,
alpha = 0.1,
gamma = 0.1,
epsilon = 0.05, // explore vs exploit ratio
epsilon0 = 0.05, // explore vs exploit ratio
episodes = 100,
)

@@ -24,7 +24,7 @@ class UnitAgentExperiments
agent = UnitAgent,
alpha = 0.1,
gamma = 0.1,
epsilon = 0.05, // explore vs exploit ratio
epsilon0 = 0.05, // explore vs exploit ratio
episodes = 100,
)

@@ -14,7 +14,7 @@ class Experiments
agent = Car,
alpha = 0.1,
gamma = 0.1,
epsilon = 0.05,
epsilon0 = 0.05,
episodes = 100000,
)

@@ -11,7 +11,7 @@ class Experiments extends
agent = CartPole,
alpha = 0.1,
gamma = 0.1,
epsilon = 0.05,
epsilon0 = 0.05,
episodes = 20000,
)

@@ -13,7 +13,7 @@ class Experiments
agent = CliffWalking,
alpha = 0.1,
gamma = 0.1,
epsilon = 0.1,
epsilon0 = 0.1,
episodes = 100
)

@@ -11,7 +11,7 @@ class Experiments
agent = Golf,
alpha = 0.1,
gamma = 0.1,
epsilon = 0.1,
epsilon0 = 0.1,
episodes = 20000,
)
