# +-----------------------------------------------+
# | RL-ROBOT. Reinforcement Learning for Robotics |
# | Angel Martinez-Tenor |
# | MAPIR. University of Malaga. 2016 |
# +-----------------------------------------------+
""" Learning process """
import time
import numpy as np
import agent
import exp
import show
import task
learning_module = "algorithm_" + exp.ALGORITHM.lower()
learning_algorithm = __import__(learning_module)
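# e.g. exp.ALGORITHM = "SARSA" loads the module file algorithm_sarsa.py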
np.set_printoptions(precision=2)
step_time = 0 # Wall-clock seconds per simulation step
step = -1 # Current learning step
a = -1 # Current action
s = -1 # current state
sp = -1 # state reached (s')
s0 = -1 # initial state
ap = -1 # next action selected (a')
r = 0 # current reward obtained (R)
delta = 0 # Temporal Difference (TD) Error
q = None # Q-matrix
v = None # Value function
policy = None # Current Policy
q_count = None # Update counter per (s, a) pair
alpha = 0 # Learning rate
elapsed_time = 0 # seconds
actual_step_time = 0 # seconds: average wall-clock time per executed step
final_average_reward = 0 # Resulting average R at last step
ave_v_step = None # Average V per step (for plotting)
ave_r_step = None # Average R obtained per step
sasr_step = None # History of (s,a,s',R) per step
q_limit = 0 # Theoretical upper bound on Q values
if exp.USE_T_SAS_AND_R_SAS:
    t_sas = np.zeros((task.n_states, task.n_actions, task.n_states))
    # Number of times that executing a in s resulted in s'
    r_sas = np.zeros((task.n_states, task.n_actions, task.n_states))
    # Sum of rewards obtained for each (s, a, s') transition
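    # A maximum-likelihood model can later be recovered from these counters,
    # e.g. (illustrative sketch; T_hat and R_hat are hypothetical names):
    #   visits = t_sas.sum(axis=2, keepdims=True)
    #   T_hat = np.divide(t_sas, visits, out=np.zeros_like(t_sas),
    #                     where=visits > 0)   # estimate of P(s'|s,a)
    #   R_hat = np.divide(r_sas, t_sas, out=np.zeros_like(r_sas),
    #                     where=t_sas > 0)    # estimate of E[R|s,a,s']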
initiated = False
initial_step_time = step_time # auxiliary
def setup():
"""Create module variables"""
global step_time, step, s, sp, a, ap, r, alpha, delta, q, v, policy, q_count
global t_sas, r_sas, elapsed_time, actual_step_time
global final_average_reward, ave_v_step, ave_r_step, sasr_step, q_limit, s0
global initiated, initial_step_time
agent.setup()
step_time = task.STEP_TIME / exp.SPEED_RATE
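    # e.g. task.STEP_TIME = 1.0 s run at exp.SPEED_RATE = 3 yields a
    # 1/3 s wall-clock step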
initial_step_time = step_time
    # only needed if a task module modifies step_time to speed up
    # the experiment once the robot reaches the goal
step = 0
# Get initial state:
if exp.LEARN_FROM_MODEL:
import model
s = int(model.s0)
else:
s = agent.observe_state()
s0 = s
sp = -1
a = task.INITIAL_POLICY
ap = -1
r = 0
alpha = exp.ALPHA
delta = 0
q = np.zeros((task.n_states, task.n_actions), dtype=np.float32)
v = np.zeros(task.n_states, dtype=np.float64)
policy = np.full(task.n_states, task.INITIAL_POLICY, dtype=np.uint32)
q_count = np.zeros((task.n_states, task.n_actions), dtype=np.uint64)
if exp.USE_T_SAS_AND_R_SAS:
t_sas = np.zeros((task.n_states, task.n_actions, task.n_states))
r_sas = np.zeros((task.n_states, task.n_actions, task.n_states))
elapsed_time = 0
actual_step_time = 0
final_average_reward = 0
ave_v_step = np.zeros(exp.N_STEPS)
ave_r_step = np.zeros(exp.N_STEPS)
sasr_step = np.zeros((exp.N_STEPS, 4))
learning_algorithm.setup()
q_limit = round(max(task.REWARDS) / (1 - exp.GAMMA))
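    # Geometric-series bound: with per-step rewards capped at max(task.REWARDS),
    # the discounted return can never exceed
    # max(R) * (1 + GAMMA + GAMMA^2 + ...) = max(R) / (1 - GAMMA)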
if q_limit != 100:
print(
"q_limit = ",
str(q_limit),
". Softmax regression will be normalized as q_limit = 100",
)
time.sleep(2)
initiated = True
return
def run():
"""Execute the learning Process"""
global step, s, sp, a, ap, r, alpha
global q, v, policy, q_count
global t_sas, r_sas
global ave_r_step, sasr_step
global elapsed_time, actual_step_time
global final_average_reward
    assert initiated, "Learning process not initiated: setup() must be called first"
mark = time.time()
# Start learning process: -----------------------------
for step in range(0, exp.N_STEPS):
if step > 0 and step % exp.DISPLAY_STEP == 0:
print("STEP: ", step)
        if agent.goal_reached:  # pad the remaining steps with the last (goal) reward
            sasr_step[step, 3] = r
            ave_r_step[step] = np.average(sasr_step[0 : step + 1, 3])
            continue
        # Execute one step of the selected learning algorithm; it updates
        # this module's s, a, sp, ap, r, delta, q, v, and policy in place
        # (see _sarsa_step_sketch at the end of this module for an example)
        learning_algorithm.execute()
# update analysis and model arrays
q_count[s, a] += 1
sasr_step[step, 0] = s
sasr_step[step, 1] = a
sasr_step[step, 2] = sp
sasr_step[step, 3] = r
ave_v_step[step] = np.mean(v)
        # Cumulative average reward up to and including the current step;
        # the inclusive slice keeps the step == 0 case well defined
        ave_r_step[step] = np.average(sasr_step[0 : step + 1, 3])
# Display information
if step > 0 and step % exp.DISPLAY_STEP == 0:
print("s:", s, " a:", a, " sp:", sp, " R: %0.2f" % r)
print("Average Reward: %0.2f" % ave_r_step[step], "\n")
if exp.TEACHING_PROCESS:
print("--- Teaching ---")
if exp.USE_T_SAS_AND_R_SAS:
t_sas[s, a, sp] += 1
r_sas[s, a, sp] += r
# Update state
s = sp
a = ap
# End of learning process ----------------------
final_average_reward = ave_r_step[step]
elapsed_time = time.time() - mark
actual_step_time = elapsed_time / (step + 1)
show.process_summary()
return
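

# --- Illustrative appendix (not part of the original learning loop) ---------
# A minimal sketch of the per-step update an algorithm_* module is expected
# to perform. The agent.execute_action() call is a hypothetical stand-in for
# the repository's actual agent API, and greedy selection here replaces the
# repository's action-selection strategy.
def _sarsa_step_sketch():
    """Illustrative only: one SARSA-style update over this module's state"""
    global sp, ap, r, delta
    sp, r = agent.execute_action(a) # hypothetical API: act, observe s' and R
    ap = int(np.argmax(q[sp])) # choose a' (greedy, for illustration)
    delta = r + exp.GAMMA * q[sp, ap] - q[s, a] # TD error
    q[s, a] += alpha * delta # SARSA update of the Q-matrix
    v[s] = np.max(q[s]) # refresh value function and policy for s
    policy[s] = int(np.argmax(q[s]))


if __name__ == "__main__":
    # Assumed driver sequence (normally performed by the experiment launcher
    # after the exp and task modules have been configured):
    setup()
    run()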