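# [Hosting-page banner captured with the file; not part of the program] Code pull complete; the page will refresh automatically.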
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.autograd import Variable
import numpy as np
"""实现策略价值网络(PyTorch版)"""
# Helper for overriding the optimizer's learning rate directly.
def set_learning_rate(optimizer, lr):
    """Write ``lr`` into the ``'lr'`` entry of every group in ``optimizer.param_groups``."""
    for group in optimizer.param_groups:
        group.update(lr=lr)
class Net(nn.Module):
    """Policy-value network: a shared convolutional trunk with a policy head and a value head."""

    def __init__(self, board_width, board_height):
        """Create the layers for a board of ``board_width`` x ``board_height`` cells.

        The first convolution takes 4 input channels, so ``forward`` expects
        input with 4 feature planes per sample.
        """
        super().__init__()
        self.board_width = board_width
        self.board_height = board_height
        # common layers (3x3 kernels with padding=1 keep the spatial size)
        self.conv1 = nn.Conv2d(4, 32, kernel_size=3, padding=1)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=3, padding=1)
        self.conv3 = nn.Conv2d(64, 128, kernel_size=3, padding=1)
        # action policy layers
        self.act_conv1 = nn.Conv2d(128, 4, kernel_size=1)
        self.act_fc1 = nn.Linear(4 * board_width * board_height, board_width * board_height)
        # state value layers
        self.val_conv1 = nn.Conv2d(128, 2, kernel_size=1)
        self.val_fc1 = nn.Linear(2 * board_height * board_width, 64)
        self.val_fc2 = nn.Linear(64, 1)

    def forward(self, state_input):
        """Run the forward pass.

        Args:
            state_input: tensor with 4 channels per sample; it is cast to
                float before the first convolution.

        Returns:
            A pair ``(x_act, x_val)``: ``x_act`` holds log-probabilities
            (log-softmax over dim 1) with ``board_width * board_height``
            entries per sample, and ``x_val`` holds one tanh-squashed value
            per sample.

        Raises:
            RuntimeError: if the spatial size of ``state_input`` does not match
                the board size the network was built for.
        """
        # common layers
        x = F.relu(self.conv1(state_input.float()))
        x = F.relu(self.conv2(x))
        x = F.relu(self.conv3(x))
        # action policy layers
        x_act = F.relu(self.act_conv1(x))
        # Flatten per sample rather than view(-1, n): with view(-1, n) a wrongly
        # sized board is silently folded into the batch dimension, whereas here
        # the linear layer rejects it with a shape error.
        x_act = torch.flatten(x_act, start_dim=1)
        x_act = F.log_softmax(self.act_fc1(x_act), dim=1)
        # state value layers
        x_val = F.relu(self.val_conv1(x))
        x_val = torch.flatten(x_val, start_dim=1)
        x_val = F.relu(self.val_fc1(x_val))
        # torch.tanh instead of F.tanh, which warns as deprecated on older PyTorch releases
        x_val = torch.tanh(self.val_fc2(x_val))
        return x_act, x_val
class PolicyValueNet:
    """Wraps ``Net`` together with its Adam optimizer and exposes evaluation,
    training and persistence helpers."""

    def __init__(self, board_width, board_height, model_file=None, use_gpu=True, device='cuda'):
        """Build the network and optimizer, optionally loading saved weights.

        Args:
            board_width: board width, forwarded to ``Net``.
            board_height: board height, forwarded to ``Net``.
            model_file: optional path (or file-like object) of a state dict
                previously written by ``save_model``.
            use_gpu: stored on the instance; device placement itself is
                controlled by ``device``.
            device: device the network and all tensors are moved to. If a CUDA
                device is requested but CUDA is not available, the CPU is used
                instead.
        """
        # Fall back to the CPU instead of crashing when CUDA is requested on a
        # machine without it.
        if (device is not None and torch.device(device).type == 'cuda'
                and not torch.cuda.is_available()):
            device = 'cpu'
        self.device = device
        self.use_gpu = use_gpu
        self.board_width = board_width
        self.board_height = board_height
        self.l2_const = 1e-4  # passed to Adam as weight_decay
        self.policy_value_net = Net(board_width, board_height).to(self.device)
        self.optimizer = optim.Adam(self.policy_value_net.parameters(), weight_decay=self.l2_const)
        if model_file:
            # map_location lets a checkpoint saved on another device load here.
            # NOTE: torch.load unpickles its input; only load trusted files.
            net_params = torch.load(model_file, map_location=self.device)
            self.policy_value_net.load_state_dict(net_params)

    def _to_float_tensor(self, data):
        """Return ``data`` as a float32 tensor on ``self.device``."""
        if isinstance(data, torch.Tensor):
            return data.detach().to(device=self.device, dtype=torch.float32)
        # Go through one NumPy array first: torch.tensor() on a list of
        # ndarrays converts element by element and is very slow.
        return torch.as_tensor(np.asarray(data, dtype=np.float32)).to(self.device)

    def policy_value_fn(self, board):
        """Evaluate a board position with the network.

        Args:
            board: object providing ``available`` (indices of the legal moves)
                and ``current_state()`` (an array that can be reshaped to
                ``(-1, 4, board_width, board_height)``).

        Returns:
            A pair ``(act_probs, value)``. ``act_probs`` is a one-shot ``zip``
            iterator of ``(move, probability)`` for the legal moves; ``value``
            is a 0-dim CPU tensor holding the network's value for the first
            sample.
        """
        # legal move indices for the current position
        legal_positions = board.available
        current_state = np.ascontiguousarray(
            board.current_state().reshape(-1, 4, self.board_width, self.board_height))
        current_state = torch.as_tensor(current_state).to(self.device)
        # Inference only: do not build an autograd graph.
        with torch.no_grad():
            log_act_probs, value = self.policy_value_net(current_state)
        # The network outputs log-probabilities, so a single exp recovers the
        # probabilities. Stay in float32: float16 underflows small values to 0.
        act_probs = np.exp(log_act_probs.detach().cpu().numpy().flatten())
        act_probs = zip(legal_positions, act_probs[legal_positions])
        value = value.detach().cpu()[0][0]
        return act_probs, value

    def train_step(self, state_batch, mcts_probs, winner_batch, lr):
        """Perform one optimisation step on a batch.

        Args:
            state_batch: batch of network inputs.
            mcts_probs: target move probabilities, one row per sample.
            winner_batch: target values, one per sample.
            lr: learning rate applied to the optimizer for this step.

        Returns:
            ``(loss, entropy)`` as 0-dim NumPy arrays. ``loss`` is the value
            MSE plus the policy cross-entropy; ``entropy`` is the mean policy
            entropy, computed for monitoring only.
        """
        state_batch = self._to_float_tensor(state_batch)
        mcts_probs = self._to_float_tensor(mcts_probs)
        winner_batch = self._to_float_tensor(winner_batch)
        # zero the parameter gradients
        self.optimizer.zero_grad()
        # set learning rate
        set_learning_rate(self.optimizer, lr)
        # forward
        log_act_probs, value = self.policy_value_net(state_batch)
        # define the loss
        value_loss = F.mse_loss(value.view(-1), winner_batch)
        policy_loss = -torch.mean(torch.sum(mcts_probs * log_act_probs, 1))
        loss = value_loss + policy_loss
        # backward and optimize
        loss.backward()
        self.optimizer.step()
        # policy entropy, for monitoring only; it needs no gradient
        with torch.no_grad():
            entropy = -torch.mean(torch.sum(torch.exp(log_act_probs) * log_act_probs, 1))
        return loss.detach().cpu().numpy(), entropy.detach().cpu().numpy()

    def get_policy_param(self):
        """Return the state dict of the policy-value network."""
        net_params = self.policy_value_net.state_dict()
        return net_params

    def save_model(self, model_file):
        """Save the network's state dict to ``model_file``."""
        net_params = self.get_policy_param()
        torch.save(net_params, model_file)
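# [Hosting-page notice captured with the file; not part of the program] This section may contain content that is not suitable for display, so the page does not show it. You can review and modify it using the relevant editing functions.
# If you confirm the content involves no inappropriate language, pure advertising, violence, vulgar or pornographic material, infringement, piracy, false or worthless content, or content that violates national laws and regulations, you may click submit to appeal and we will handle it as soon as possible.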