valueIteration.py
import numpy as np

from frozenlake import FrozenLake


def value_iteration(env, gamma, theta, max_iterations, value=None):
    if value is None:
        value = np.zeros(env.n_states)
    else:
        value = np.array(value, dtype=float)  # np.float is removed in NumPy >= 1.24
    # Initialize the policy with zeros
    policy = np.zeros(env.n_states, dtype=int)
    n_value_iterations = 0
    delta = abs(theta) + 1
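    # Value iteration repeatedly applies the Bellman optimality update
    #   V(s) <- max_a sum_{s'} p(s' | s, a) * [r(s, a, s') + gamma * V(s')]
    # until the largest per-state change in a sweep (delta) falls below theta.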
    while max_iterations > n_value_iterations and delta > theta:
        delta = 0
        for state in range(env.n_states):
            previous_value = value[state]
            new_value = []
            for action in range(env.n_actions):
                total_expected_return = 0
                for next_state in range(env.n_states):
                    # Probability of transitioning from this state to the next state
                    next_state_probability = env.p(next_state, state, action)
                    # Immediate reward plus the discounted value of the next state
                    discounted_reward = env.r(next_state, state, action) + (gamma * value[next_state])
                    # Accumulate the expected return for this action
                    total_expected_return += next_state_probability * discounted_reward
                new_value.append(total_expected_return)  # Expected return of each action
            # Back up the maximum expected return over actions
            value[state] = max(new_value)
            # Track how much the value estimate for this state changed, keeping the largest change seen this sweep
            delta = max(delta, np.abs(previous_value - value[state]))
        n_value_iterations += 1
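    # Greedy policy extraction:
    #   pi(s) = argmax_a sum_{s'} p(s' | s, a) * [r(s, a, s') + gamma * V(s')]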
    # Recover the best action of the optimal policy
    for state in range(env.n_states):
        action_values = []
        for action in range(env.n_actions):
            total_expected_return = 0
            for next_state in range(env.n_states):
                # Probability of transitioning from this state to the next state
                next_state_probability = env.p(next_state, state, action=action)
                # Immediate reward plus the discounted value of the next state
                discounted_reward = env.r(next_state, state, action=action) + (gamma * value[next_state])
                # Sum the expected return over all possible next states
                total_expected_return += next_state_probability * discounted_reward
            action_values.append(total_expected_return)
        # The action with the maximum expected return is the optimal action
        policy[state] = int(np.argmax(action_values))
print("Number of value iterations :-> ",n_value_iterations)
return policy, value
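
# Usage sketch, not a prescribed entry point: the no-argument FrozenLake()
# constructor call below is an assumption; only env.n_states, env.n_actions,
# env.p, and env.r are relied on, matching how value_iteration uses the
# environment above. The gamma, theta, and max_iterations settings are
# illustrative examples.
if __name__ == "__main__":
    env = FrozenLake()  # assumed default construction; adapt to your environment
    policy, value = value_iteration(env, gamma=0.9, theta=0.001, max_iterations=128)
    print("Optimal policy:", policy)
    print("State values:", value)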