# Source: future_agent/learn_sac.py
# Branch: master (1 branch); tags (2): 1.3.0, 1.2.4


from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.utils import set_random_seed
from stable_baselines3 import SAC
from training_env import TrainingEnv
from save_model import SaveModelCallback
from stable_baselines3.common.vec_env import VecFrameStack,SubprocVecEnv
import torch as th
from stable_baselines3.common.evaluation import evaluate_policy

# Directory handed to the model as its TensorBoard log location.
TB_LOG_PATH = "../tb_log"
# Path used both for the Monitor output of each env and by the save callback.
MODEL_PATH = "./model/sac"

# Total number of timesteps requested from model.learn().
LEARN_TIMES = 2_000_000

# Date strings (YYYY-MM-DD) passed to TrainingEnv.
# NOTE(review): presumably the days on which training episodes may begin —
# confirm against TrainingEnv.
TRAINING_BEGIN_TIME = [
    "2022-08-14",
    "2022-08-15",
    "2022-08-18",
    "2022-08-19",
    "2022-08-20",
    "2022-08-21",
    "2022-08-22",
    "2022-08-25",
    "2022-08-26",
    "2022-08-27",
    "2022-08-28",
    "2022-08-29",
    "2022-09-01",
    "2022-09-02",
    "2022-09-03",
    "2022-09-04",
]
# The algorithms require a vectorized environment to run


def make_env(rank, seed=0):
    """
    Build a factory for one monitored training environment.

    Utility for multiprocessed envs: the returned zero-argument callable
    creates the environment when it is invoked.

    :param rank: (int) index of the subprocess; added to ``seed`` so that
        each environment receives a different seed
    :param seed: (int) the initial seed for RNG
    :return: (callable) function that builds a ``TrainingEnv`` wrapped in
        ``Monitor`` and seeds it with ``seed + rank``
    """
    # Runs once, when make_env itself is called (not inside the factory).
    set_random_seed(seed)

    def _init():
        monitored_env = Monitor(TrainingEnv(TRAINING_BEGIN_TIME), MODEL_PATH)
        monitored_env.seed(seed + rank)
        return monitored_env

    return _init


def optimize_params():
    """
    Return the fixed hyperparameters for the SAC model.

    NOTE(review): the values look like the output of an earlier
    hyperparameter search — confirm their origin before changing them.

    :return: (dict) keyword arguments with the keys ``gamma``,
        ``learning_rate`` and ``policy_kwargs``; the latter holds the
        activation function (ReLU) and the list of layer widths
    """
    layer_widths = [508, 494, 865, 754, 417, 798, 799, 343]
    policy_kwargs = {
        'activation_fn': th.nn.ReLU,
        'net_arch': layer_widths,
    }
    return {
        'gamma': 0.8595542347091839,
        'learning_rate': 1.8359176676795943e-05,
        'policy_kwargs': policy_kwargs,
    }


if __name__ == '__main__':
    # Script entry point: build the vectorized env, train SAC, evaluate the
    # result, print the evaluation statistics and save the final model.

    num_cpu = 128  # Number of processes to use (one environment each)

    # Create the vectorized environment from one make_env factory per rank.
    # NOTE: for single-process debugging, a DummyVecEnv over a single
    # Monitor(TrainingEnv(TRAINING_BEGIN_TIME), MODEL_PATH) can be used instead.
    env = SubprocVecEnv([make_env(i) for i in range(num_cpu)])

    try:
        model_params = optimize_params()

        model = SAC(
            'MlpPolicy',
            env,
            verbose=1,
            tensorboard_log=TB_LOG_PATH,
            **model_params,
        )

        model.learn(
            total_timesteps=LEARN_TIMES,
            callback=SaveModelCallback(check_freq=4096, path=MODEL_PATH, env=env),
        )

        mean_reward, std_reward = evaluate_policy(model, env)

        print(f"{mean_reward} {std_reward}")

        model.save("sac_stock")
    finally:
        # Always shut the worker processes down, even if training or
        # evaluation raised, so no subprocess is left behind.
        env.close()