#%%
"""
S H H
F F F
H H G
"""
import numpy as np
# Actions: 0 = up, 1 = down, 2 = left, 3 = right
actions = np.arange(4)
# States 0..8, numbered row-major over the grid above
status = np.arange(9)
# Discount factor
gama = 0.8
# Transition model: p[s][a] is a list of [prob, next_state, reward, done]
# entries, one per possible outcome of taking action a in state s.
p = [[0 for _ in range(4)] for _ in range(9)]
p[0][0]=[[0.7,0,0,False],[0.3,3,0,False]]
p[0][1]=[[0.3,0,0,False],[0.7,3,0,False]]
p[0][2]=[[0.7,0,0,False],[0.3,1,0,True]]
p[0][3]=[[0.3,0,0,False],[0.7,1,0,True]]
p[1][0]=[[1,1,-1,True]]
p[1][1]=[[1,1,-1,True]]
p[1][2]=[[1,1,-1,True]]
p[1][3]=[[1,1,-1,True]]
p[2][0]=[[1,2,-1,True]]
p[2][1]=[[1,2,-1,True]]
p[2][2]=[[1,2,-1,True]]
p[2][3]=[[1,2,-1,True]]
p[3][0]=[[0.5,0,0,False],[0.3,3,0,False],[0.2,6,-1,True]]
p[3][1]=[[0.5,6,-1,True],[0.3,3,0,False],[0.2,0,0,False]]
p[3][2]=[[0.7,3,0,False],[0.3,4,0,False]]
p[3][3]=[[0.3,3,0,False],[0.7,4,0,False]]
p[4][0]=[[0.5,1,-1,True],[0.3,4,0,False],[0.2,7,-1,True]]
p[4][1]=[[0.5,7,-1,True],[0.3,4,0,False],[0.2,1,-1,True]]
p[4][2]=[[0.5,3,0,False],[0.3,4,0,False],[0.2,5,0,False]]
p[4][3]=[[0.5,5,0,False],[0.3,4,0,False],[0.2,3,0,False]]
p[5][0]=[[0.5,2,-1,True],[0.3,5,0,False],[0.2,8,5,True]]
p[5][1]=[[0.5,8,5,True],[0.3,5,0,False],[0.2,2,-1,True]]
p[5][2]=[[0.7,4,0,False],[0.3,5,0,False]]
p[5][3]=[[0.7,5,0,False],[0.3,4,0,False]]
p[6][0]=[[1,6,-1,True]]
p[6][1]=[[1,6,-1,True]]
p[6][2]=[[1,6,-1,True]]
p[6][3]=[[1,6,-1,True]]
p[7][0]=[[1,7,-1,True]]
p[7][1]=[[1,7,-1,True]]
p[7][2]=[[1,7,-1,True]]
p[7][3]=[[1,7,-1,True]]
p[8][0]=[[1,8,5,True]]
p[8][1]=[[1,8,5,True]]
p[8][2]=[[1,8,5,True]]
p[8][3]=[[1,8,5,True]]
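#%%
# Optional sanity check (a minimal sketch): each p[s][a] entry is assumed to be
# [prob, next_state, reward, done], so the outcome probabilities of every
# state-action pair should sum to 1.
for s in status:
    for a in actions:
        total = sum(prob for prob, _, _, _ in p[s][a])
        assert abs(total - 1.0) < 1e-9, f"p[{s}][{a}] probabilities sum to {total}"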
#old_policy=[2 for i in range(9)]
#old_policy=np.array(old_policy)
old_policy=np.random.randint(4,size=9)
print("init policy",old_policy)
n_evaluate_policy_iteration=1000
n_policy_iteration=1000
def evaluate_policy(policy):
    # Iterative policy evaluation: apply the Bellman expectation backup for
    # the given policy a fixed number of times.
    value_table = np.zeros(9)
    for i in range(n_evaluate_policy_iteration):
        value_tmp_table = np.copy(value_table)
        for s in status:
            tmp = 0
            for prob, next_st, reward, _ in p[s][policy[s]]:
                tmp += prob * (reward + gama * value_tmp_table[next_st])
            value_table[s] = tmp
    return value_table
def extract_policy(value_table):
    # Greedy improvement: compute Q(s, a) from the current value table and
    # pick the highest-valued action in every state.
    every_status_action_tmp_q = np.zeros((9, 4))
    for s in status:
        for a in actions:
            for prob, next_st, reward, _ in p[s][a]:
                every_status_action_tmp_q[s][a] += prob * (reward + gama * value_table[next_st])
    policy = np.argmax(every_status_action_tmp_q, axis=1)
    return policy
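#%%
# Usage sketch (illustrative only, not part of the original flow): one round of
# evaluation plus greedy improvement on the random initial policy. The full
# policy-iteration loop below repeats this until the policy stops changing.
print("one-step improved policy", extract_policy(evaluate_policy(old_policy)))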
for i in range(n_policy_iteration):
    value_table = evaluate_policy(old_policy)
    new_policy = extract_policy(value_table)
    # Converged once greedy improvement no longer changes the policy.
    if np.array_equal(new_policy, old_policy):
        break
    old_policy = new_policy
print("final policy", new_policy)
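#%%
# Optional inspection (a sketch; final_values is an illustrative name, not in
# the original script): re-evaluate the converged policy and print its state
# values in the 3x3 grid layout.
final_values = evaluate_policy(new_policy)
print("state values under final policy")
print(final_values.reshape(3, 3))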