I'm Harima, a first-year master's student in the Graduate School of Science. These are notes summarizing what I have been studying, written up as a memo. I'm sorry if they are hard to read. I would be grateful if someone could explain the parts I did not understand.
Implementation code (GitHub) https://github.com/YutaroOgawa/Deep-Reinforcement-Learning-Book
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Draw the 3x3 maze: red lines are walls, S0-S8 are the states
fig = plt.figure(figsize=(5, 5))
ax = plt.gca()

# Walls
plt.plot([1, 1], [0, 1], color='red', linewidth=2)
plt.plot([1, 2], [2, 2], color='red', linewidth=2)
plt.plot([2, 2], [2, 1], color='red', linewidth=2)
plt.plot([2, 3], [1, 1], color='red', linewidth=2)

# State labels
plt.text(0.5, 2.5, 'S0', size=14, ha='center')
plt.text(1.5, 2.5, 'S1', size=14, ha='center')
plt.text(2.5, 2.5, 'S2', size=14, ha='center')
plt.text(0.5, 1.5, 'S3', size=14, ha='center')
plt.text(1.5, 1.5, 'S4', size=14, ha='center')
plt.text(2.5, 1.5, 'S5', size=14, ha='center')
plt.text(0.5, 0.5, 'S6', size=14, ha='center')
plt.text(1.5, 0.5, 'S7', size=14, ha='center')
plt.text(2.5, 0.5, 'S8', size=14, ha='center')
plt.text(0.5, 2.3, 'START', ha='center')
plt.text(2.5, 0.3, 'GOAL', ha='center')

# Hide the axes and ticks (booleans, not 'off' strings, for current matplotlib)
ax.set_xlim(0, 3)
ax.set_ylim(0, 3)
plt.tick_params(axis='both', which='both', bottom=False, top=False,
                labelbottom=False, right=False, left=False, labelleft=False)

# Green circle marking the agent's current position (it starts at S0)
line, = ax.plot([0.5], [2.5], marker="o", color='g', markersize=60)
This gives an overall view of the maze.
・The rules that define how the agent behaves are called **policies**.
・A policy is written as $\pi_\theta(s, a)$.
・The probability of taking action $a$ in state $s$ follows the policy $\pi$ determined by the parameter $\theta$.
# Rows correspond to states S0-S7 (S8 is the goal, so it needs no policy);
# columns correspond to the actions [up, right, down, left]; np.nan marks a wall
theta_0 = np.array([[np.nan, 1, 1, np.nan],       # S0
                    [np.nan, 1, np.nan, 1],       # S1
                    [np.nan, np.nan, 1, 1],       # S2
                    [1, 1, 1, np.nan],            # S3
                    [np.nan, np.nan, 1, 1],       # S4
                    [1, np.nan, np.nan, np.nan],  # S5
                    [1, np.nan, np.nan, np.nan],  # S6
                    [1, 1, np.nan, np.nan],       # S7
                    ])
・Convert the parameter $\theta_0$ into the policy $\pi_\theta(s, a)$
def simple_convert_into_pi_from_theta(theta):
    '''Convert theta into a policy pi by simple ratio (row-wise normalization)'''
    [m, n] = theta.shape
    pi = np.zeros((m, n))
    for i in range(0, m):
        # Divide each valid entry by the row sum, ignoring NaN (walls)
        pi[i, :] = theta[i, :] / np.nansum(theta[i, :])
    pi = np.nan_to_num(pi)  # walls (NaN) become probability 0
    return pi

pi_0 = simple_convert_into_pi_from_theta(theta_0)
・The probability of moving toward a wall is 0
・The agent moves in the remaining directions with equal probability
pi_0
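As a quick sanity check (my own addition, not part of the book's code), every row of pi_0 should sum to 1 and the directions blocked by a wall should have probability 0:

print(pi_0[0])           # state S0: only "right" and "down" remain, with probability 0.5 each
print(pi_0.sum(axis=1))  # every row sums to 1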
・Now that the initial policy is ready, move the agent according to the policy $\pi_{\theta_0}(s, a)$
・Keep moving the agent until it reaches the goal
def get_next_s(pi, s):
    direction = ["up", "right", "down", "left"]
    # Sample the next direction according to the policy probabilities for state s
    next_direction = np.random.choice(direction, p=pi[s, :])

    # The maze is a 3x3 grid, so moving up/down changes the state number by 3
    if next_direction == "up":
        s_next = s - 3
    elif next_direction == "right":
        s_next = s + 1
    elif next_direction == "down":
        s_next = s + 3
    elif next_direction == "left":
        s_next = s - 1

    return s_next
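For example (an illustrative call, not in the book's code), sampling a single step from the start state with the simple-ratio policy:

# From S0 the policy pi_0 allows only "right" (to S1) or "down" (to S3),
# each with probability 0.5, so this prints 1 or 3
print(get_next_s(pi_0, 0))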
def goal_maze(pi):
    s = 0  # start at S0
    state_history = [0]  # record the trajectory of visited states

    while True:  # loop until the goal (S8) is reached
        next_s = get_next_s(pi, s)
        state_history.append(next_s)

        if next_s == 8:  # goal
            break
        else:
            s = next_s

    return state_history
・Check the trajectory the agent took and how many steps it needed in total to reach the goal
state_history = goal_maze(pi_0)
print(state_history)
print("The number of steps it took to solve the maze" + str(len(state_history) - 1) + "is")
・Visualize the trajectory of the state transitions
from matplotlib import animation
from IPython.display import HTML


def init():
    '''Initialize the background image'''
    line.set_data([], [])
    return (line,)


def animate(i):
    '''Draw the agent's position at frame i'''
    state = state_history[i]
    x = (state % 3) + 0.5      # x coordinate of the state
    y = 2.5 - int(state / 3)   # y coordinate of the state
    line.set_data([x], [y])    # pass sequences, not scalars
    return (line,)


anim = animation.FuncAnimation(fig, animate, init_func=init,
                               frames=len(state_history), interval=200, repeat=False)
HTML(anim.to_jshtml())
・Think about how to learn the policy so that the agent heads straight for the goal (the softmax conversion used below is sketched right after this list). Two broad strategies:
・A strategy that reinforces the actions taken in successful episodes
・A strategy that assigns a value (priority) to the states other than the goal
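For reference, this is the formula the softmax conversion below computes: each row of $\theta$ is turned into a probability distribution using the inverse temperature $\beta$ (the code uses $\beta = 1$).

\pi_\theta(s_i, a_j) = \frac{\exp(\beta \theta_{s_i, a_j})}{\sum_{k} \exp(\beta \theta_{s_i, a_k})}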
def softmax_convert_into_pi_from_theta(theta):
    '''Convert theta into a policy pi using the softmax function'''
    beta = 1.0  # inverse temperature
    [m, n] = theta.shape
    pi = np.zeros((m, n))

    exp_theta = np.exp(beta * theta)
    for i in range(0, m):
        # Normalize each row so the action probabilities sum to 1
        pi[i, :] = exp_theta[i, :] / np.nansum(exp_theta[i, :])

    pi = np.nan_to_num(pi)  # walls (NaN) become probability 0
    return pi
・Compute $\pi_{\theta_0}$
pi_0 = softmax_convert_into_pi_from_theta(theta_0)
print(pi_0)
-Modified the "get_next_s" function handled in 2.2
・ Acquire not only the state but also the adopted action
def get_action_and_next_s(pi, s):
    direction = ["up", "right", "down", "left"]
    # Sample the next direction according to the policy probabilities for state s
    next_direction = np.random.choice(direction, p=pi[s, :])

    # Record the action index as well: 0=up, 1=right, 2=down, 3=left
    if next_direction == "up":
        action = 0
        s_next = s - 3
    elif next_direction == "right":
        action = 1
        s_next = s + 1
    elif next_direction == "down":
        action = 2
        s_next = s + 3
    elif next_direction == "left":
        action = 3
        s_next = s - 1

    return [action, s_next]
-Fixed the "goal_maze" function that moves the agent until it reaches the goal
def goal_maze_ret_s_a(pi):
    s = 0  # start at S0
    s_a_history = [[0, np.nan]]  # list of [state, action] pairs

    while True:  # loop until the goal (S8) is reached
        [action, next_s] = get_action_and_next_s(pi, s)
        s_a_history[-1][1] = action  # record the action taken in the current state
        s_a_history.append([next_s, np.nan])  # the action in the new state is not known yet

        if next_s == 8:  # goal
            break
        else:
            s = next_s

    return s_a_history
s_a_history = goal_maze_ret_s_a(pi_0)
print(s_a_history)
print("The number of steps it took to solve the maze" + str(len(s_a_history) - 1) + "is")
(Output omitted because it is long ...)
・The policy gradient method updates the parameter $\theta$ according to the following formula:

\theta_{s_i, a_j} = \theta_{s_i, a_j} + \eta \cdot \Delta\theta_{s_i, a_j} \\
\Delta\theta_{s_i, a_j} = \{ N(s_i, a_j) - P(s_i, a_j) N(s_i, a) \} / T

Here $\eta$ is the learning rate, $N(s_i, a_j)$ is the number of times action $a_j$ was taken in state $s_i$, $P(s_i, a_j)$ is the probability of taking action $a_j$ in state $s_i$ under the current policy, $N(s_i, a)$ is the total number of times state $s_i$ was visited, and $T$ is the total number of steps it took to reach the goal.
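A small worked example with made-up numbers (mine, not the book's): suppose the agent visited state $s_0$ four times during an episode, took action $a_1$ (right) three of those times, the current policy gives $P(s_0, a_1) = 0.5$, and the episode took $T = 10$ steps. Then

\Delta\theta_{s_0, a_1} = \{ 3 - 0.5 \times 4 \} / 10 = 0.1

so the parameter of an action that was chosen more often than the current policy predicts gets increased.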
def update_theta(theta, pi, s_a_history):
    eta = 0.1  # learning rate
    T = len(s_a_history) - 1  # total number of steps to reach the goal

    [m, n] = theta.shape
    delta_theta = theta.copy()  # NaN entries (walls) stay NaN

    # Compute delta_theta element by element
    for i in range(0, m):
        for j in range(0, n):
            if not (np.isnan(theta[i, j])):  # skip walls
                # History entries where the agent was in state i
                SA_i = [SA for SA in s_a_history if SA[0] == i]
                # History entries where it took action j in state i
                SA_ij = [SA for SA in s_a_history if SA == [i, j]]

                N_i = len(SA_i)    # number of visits to state i
                N_ij = len(SA_ij)  # number of times action j was taken in state i
                delta_theta[i, j] = (N_ij - pi[i, j] * N_i) / T

    new_theta = theta + eta * delta_theta
    return new_theta
・This is the part I really don't understand!!!
new_theta = theta + eta * delta_theta
・Why do we add? Shouldn't actions that were taken many times (and are therefore unlikely to lie on the shortest path) be subtracted instead? Please tell me ...
・Update the parameter $\theta$ and observe how the policy $\pi_\theta$ changes
new_theta = update_theta(theta_0, pi_0, s_a_history)
pi = softmax_convert_into_pi_from_theta(new_theta)
print(pi)
・Repeat the maze search and the update of the parameter $\theta$ until the agent can head straight for the goal.
・Stop when the sum of the absolute values of the changes in $\pi$ becomes smaller than $10^{-4}$.
stop_epsilon = 10**-4  # convergence threshold for the change in the policy

theta = theta_0
pi = pi_0

is_continue = True
count = 1  # iteration counter (not used in the code shown here)
while is_continue:
    s_a_history = goal_maze_ret_s_a(pi)                      # search the maze with the current policy
    new_theta = update_theta(theta, pi, s_a_history)         # update the parameters
    new_pi = softmax_convert_into_pi_from_theta(new_theta)   # recompute the policy

    if np.sum(np.abs(new_pi - pi)) < stop_epsilon:
        is_continue = False
    else:
        theta = new_theta
        pi = new_pi
Actually, there was a "print" in the function as well, but I cut it because the output gets long ...
np.set_printoptions(precision=3, suppress=True)
print(pi)
・Visualize the result
from matplotlib import animation
from IPython.display import HTML


def init():
    '''Initialize the background image'''
    line.set_data([], [])
    return (line,)


def animate(i):
    '''Draw the agent's position at frame i'''
    state = s_a_history[i][0]
    x = (state % 3) + 0.5      # x coordinate of the state
    y = 2.5 - int(state / 3)   # y coordinate of the state
    line.set_data([x], [y])    # pass sequences, not scalars
    return (line,)


anim = animation.FuncAnimation(fig, animate, init_func=init,
                               frames=len(s_a_history), interval=200, repeat=False)
HTML(anim.to_jshtml())
・The softmax function can derive a policy even when the parameter $\theta$ takes negative values.
・Using the policy gradient theorem, the update rule for the parameter $\theta$ in the policy gradient method can be derived.
・REINFORCE is an algorithm that approximately implements the policy gradient theorem.
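For reference, a standard statement of the policy gradient theorem (this exact form is not quoted in the section above, so treat it as supplementary): the gradient of the expected return $J(\theta)$ can be written as

\nabla_\theta J(\theta) = \mathbb{E}_{\pi_\theta} \left[ \nabla_\theta \log \pi_\theta(a \mid s) \, Q^{\pi_\theta}(s, a) \right]

and REINFORCE approximates $Q^{\pi_\theta}(s, a)$ with the return actually sampled in each episode.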
・This time there were parts I did not understand. I would appreciate it if anyone could explain them to me.