# -*- encoding: utf-8 -*-
'''
@File : ppo_continous_main.py
@Time : 2024/10/22 21:29:40
@Author : junewluo
'''
import os
import time
import torch
import numpy as np
import datetime
# import gymnasium as gym
import argparse
import wandb
import warnings
warnings.filterwarnings("ignore")
from torch.utils.tensorboard import SummaryWriter
from ppo.trick import (
    state_norm, reward_norm, adv_norm,
    lr_decay, orthogonal_initialization,
    ppo_clip_param_annealing, KeepMeanStd
)
from ppo.relaybuffer import RelayBuffer
from ppo_mp.ppo import PPO
from share_func import clear_folder, build_env, run2gif
from env.flappy_bird import FlappyBirdWrapper
from env.catcher import CatcherWrapper


def build_ppo_param(args, device):
    ppo_params = {
        # ppo algorithm params
        'clip_param': args.epsilon,
        # training params
        'lr_a': args.lr_a,
        'lr_c': args.lr_c,
        'gamma': args.gamma,
        'lamda': args.lamda,
        'batch_size': args.batch_size,
        'mini_batch_size': args.mini_batch_size,
        # trick params
        'use_ppo_clip': args.use_ppo_clip,        # use ppo clip param annealing
        'use_adv_norm': args.use_adv_norm,        # use advantage normalization
        'use_state_norm': args.use_state_norm,    # use state normalization
        'use_reward_norm': args.use_reward_norm,  # use reward normalization
        'use_tanh': args.use_tanh,                # use tanh activation func or ReLU func
        'use_grad_clip': args.use_grad_clip,      # use grad clip on model params
        'use_gae': args.use_gae,
        'grad_clip_params': args.grad_clip_param,
        'use_lr_decay': args.use_lr_decay,
        'entropy_coef': args.entropy_coef,
        'device': device,
    }
    return ppo_params


def main(args, number, seed):
    # Set random seed
    np.random.seed(seed)
    torch.manual_seed(seed)
    eval_env, envs = build_env(env_name = args.env_name, env_num = args.env_num, seed = seed)
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    # device = torch.device('cpu')
    state_dim = envs.single_observation_space.shape[0]
    action_dim = envs.single_action_space.n
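    # Note: despite the "continous" file name, this default setup reads a discrete
    # action space (single_action_space.n), e.g. the default CartPole-v1 env.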
    layer_nums = args.layers
    hidden_dims = args.hidden_dims
    # args.max_episode_steps = env._max_episode_steps  # Maximum number of steps per episode
    print(f'======== run ppo algorithm =========')
    print("env = {}".format(args.env_name))
    print("device = {}".format(device))
    print("state_dim = {}".format(state_dim))
    print("action_dim = {}".format(action_dim))
    print('max_train_steps = {}'.format(args.max_train_steps))
    print('eval_freq = {}'.format(args.evaluate_freq))
    print(f'=====================================')
    evaluate_num = 0       # Record the number of evaluations
    evaluate_rewards = []  # Record the rewards during the evaluation
    total_steps = 0        # Record the total steps during the training

    # init the agent
    ppo_params = build_ppo_param(args = args, device = device)
    agent = PPO(state_dim = state_dim,
                act_dim = action_dim,
                hidden_dims = hidden_dims,
                layer_nums = layer_nums,
                train_params = ppo_params
                )

    # monitor tools init
    if args.wandb == 1:
        now_time = datetime.datetime.now().strftime("%Y-%m-%d")
        name = f'{args.env_name}_{now_time}-{os.getpid()}'
        wandb.init(project = f"ppo_train", name = name)
    if args.tensorboard == 1:
        # clear the log dir or create it
        tensorboard_logdir = 'runs/PPO_mp_{}_number_{}_seed_{}'.format(args.env_name, number, seed)
        clear_folder(folder_path = tensorboard_logdir)
        # Build a tensorboard writer
        writer = SummaryWriter(log_dir=tensorboard_logdir)

    # pre-define, batch_obs shape is [args.per_batch_steps, args.env_num, *envs.single_observation_space.shape]
    batch_obs = torch.zeros((args.per_batch_steps, args.env_num) + envs.single_observation_space.shape)
    batch_actions = torch.zeros((args.per_batch_steps, args.env_num) + envs.single_action_space.shape)
    batch_log_probs = torch.zeros((args.per_batch_steps, args.env_num))
    batch_rewards = torch.zeros((args.per_batch_steps, args.env_num))
    batch_dones = torch.zeros((args.per_batch_steps, args.env_num))
    batch_next_obs = torch.zeros_like(batch_obs)
    # batch_values = torch.zeros((args.per_batch_steps, args.env_num))

    # generate rollout data
    obs, _ = envs.reset()
    done = torch.zeros(args.env_num)

    # training process
    for train_step in range(args.max_train_steps):
        ## Trick 5: Learning Rate Decay ##
        for step in range(args.per_batch_steps):
            # notice: action, a_logprob and value are numpy.ndarrays
            action, a_logprob = agent.select_action(obs)
            # value = agent.get_value(obs)
            # batch_values[step] = torch.tensor(value).flatten()
            next_obs, reward, done, truncation, _ = envs.step(action)
            # write the transition into the batch tensors
            batch_actions[step] = torch.tensor(action)
            batch_log_probs[step] = torch.tensor(a_logprob)
            batch_rewards[step] = torch.tensor(reward).view(-1)
            batch_obs[step] = torch.tensor(obs)
            batch_next_obs[step] = torch.tensor(next_obs)
            batch_dones[step] = torch.tensor(done)
            obs = next_obs

        # start the optimization phase once enough data has been collected
        update_epoch = (args.per_batch_steps * args.env_num) // args.mini_batch_size
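        # e.g. with the defaults (per_batch_steps=500, env_num=50, mini_batch_size=512),
        # this gives 500 * 50 // 512 = 48 agent.learn() calls per collection round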
        for _ in range(update_epoch):
            # print(f"Start Training! Update Epoch is {update_epoch}")
            loss = agent.learn(batch_obs = batch_obs.to(device),
                               batch_actions = batch_actions.to(device),
                               batch_log_probs = batch_log_probs.to(device),
                               batch_rewards = batch_rewards.to(device),
                               batch_dones = batch_dones.to(device),
                               batch_next_obs = batch_next_obs.to(device)
                               )
            total_steps += 1
            a_loss, c_loss = loss
            if args.wandb == 1:
                wandb.log({'actor_loss': a_loss, 'critic_loss': c_loss})
            if args.tensorboard == 1:
                writer.add_scalar(tag = f'train_actor_loss_{args.env_name}', scalar_value = a_loss, global_step = total_steps)
                writer.add_scalar(tag = f'train_critic_loss_{args.env_name}', scalar_value = c_loss, global_step = total_steps)
            if args.use_lr_decay:
                cur_lr_a = agent.ppo_params['lr_a']
                cur_lr_c = agent.ppo_params['lr_c']
                new_lr_a = lr_decay(agent.actor_optim, cur_step = total_steps, max_step = args.max_train_steps, cur_lr = cur_lr_a)
                new_lr_c = lr_decay(agent.critic_optim, cur_step = total_steps, max_step = args.max_train_steps, cur_lr = cur_lr_c)
                agent.ppo_params['lr_a'] = new_lr_a
                agent.ppo_params['lr_c'] = new_lr_c

        if (train_step + 1) % args.evaluate_freq == 0:
            eval_times = 1
            round_count = 0
            val_reward = 0
            for k in range(eval_times):
                done = False
                step = 0
                episode_reward = 0.0
                state, _ = eval_env.reset()
                # eval_env.render()
                while not done:
                    step += 1
                    # We use the deterministic policy during the evaluation
                    action, _ = agent.select_action(state, eval_mode = True)
                    state_, reward, done, trun, _ = eval_env.step(action.item())
                    episode_reward += reward
                    state = state_
                    if step >= 15000:
                        break
                val_reward += episode_reward
                round_count += step
            print(f'step is {train_step}, validation reward is {val_reward / eval_times}, every round count is {round_count / eval_times}')
            if args.wandb == 1:
                wandb.log({'eval_reward': val_reward / eval_times, "eval_steps": (round_count / eval_times)})
            if args.tensorboard == 1:
                writer.add_scalar(tag = f'validation_reward_{args.env_name}', scalar_value = val_reward / eval_times, global_step = evaluate_num)
                writer.add_scalar(tag = f'validation_rounds_{args.env_name}', scalar_value = round_count / eval_times, global_step = evaluate_num)
            evaluate_num += 1

    # save the model (optional)
    agent.save_checkpoint(only_net = False)
    # evaluate the trained model and render the rollout to a GIF
    gif_name = f'{args.env_name}_ppo_{int(time.time())}_{os.getpid()}.gif'
    run2gif(eval_env, agent, gif_name)


if __name__ == '__main__':
    parser = argparse.ArgumentParser("Hyperparameter Setting for PPO")
    # env variable setting
    parser.add_argument("--env_name", type=str, default="CartPole-v1", help="The env name of Gym")
    parser.add_argument("--env_num", type=int, default=50, help="The number of parallel envs")
    parser.add_argument("--layers", type=int, default=3, help="The number of hidden layers")
    parser.add_argument("--hidden_dims", type=int, nargs='+', default=[128, 128], help="The number of neurons in each hidden layer of the neural network")
    # training variable setting
    parser.add_argument("--max_train_steps", type=int, default=2000, help="Maximum number of training steps")
    parser.add_argument("--per_batch_steps", type=int, default=500, help="Max steps collected per round")
    parser.add_argument("--evaluate_freq", type=int, default=20, help="Evaluate the policy every 'evaluate_freq' steps")
    parser.add_argument("--batch_size", type=int, default=4096, help="Batch size")
    parser.add_argument("--mini_batch_size", type=int, default=512, help="Minibatch size")
    parser.add_argument("--hidden_width", type=int, default=64, help="The number of neurons in hidden layers of the neural network")
    parser.add_argument("--lr_a", type=float, default=1e-3, help="Learning rate of actor")
    parser.add_argument("--lr_c", type=float, default=1e-4, help="Learning rate of critic")
    parser.add_argument("--gamma", type=float, default=0.99, help="Discount factor")
    parser.add_argument("--lamda", type=float, default=0.95, help="GAE parameter")
    parser.add_argument("--epsilon", type=float, default=0.2, help="PPO clip parameter")
    # some settings
    parser.add_argument("--use_gae", type=bool, default=True, help="Whether to use GAE to compute advantages")
    parser.add_argument("--grad_clip_param", type=float, default=0.5, help="Parameter for model gradient clipping")
    # training trick settings
    # NOTE: argparse's type=bool converts any non-empty string to True, so these
    # switches are effectively controlled by their defaults.
    parser.add_argument("--use_adv_norm", type=bool, default=True, help="Trick 1: advantage normalization")
    parser.add_argument("--use_state_norm", type=bool, default=True, help="Trick 2: state normalization")
    parser.add_argument("--use_reward_norm", type=bool, default=False, help="Trick 3: reward normalization")
    parser.add_argument("--use_reward_scaling", type=bool, default=True, help="Trick 4: reward scaling")
    parser.add_argument("--entropy_coef", type=float, default=0.01, help="Trick 5: policy entropy")
    parser.add_argument("--use_lr_decay", type=bool, default=True, help="Trick 6: learning rate decay")
    parser.add_argument("--use_grad_clip", type=bool, default=True, help="Trick 7: gradient clip")
    parser.add_argument("--use_orthogonal_init", type=bool, default=True, help="Trick 8: orthogonal initialization")
    parser.add_argument("--set_adam_eps", type=float, default=1e-5, help="Trick 9: set Adam epsilon=1e-5")
    parser.add_argument("--use_tanh", type=bool, default=True, help="Trick 10: tanh activation function")
    parser.add_argument("--use_ppo_clip", type=bool, default=True, help="Trick 11: PPO clip parameter annealing")
    # monitor setting
    parser.add_argument("--wandb", type=int, default=0, help="Use wandb to monitor the training process")
    parser.add_argument("--tensorboard", type=int, default=0, help="Use tensorboard to monitor the training process")
    args = parser.parse_args()

    main(args, number=1, seed=0)
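# Example invocation (assumes the local ppo/, ppo_mp/, env/ and share_func modules are importable):
#   python ppo_continous_main.py --env_name CartPole-v1 --env_num 8 --tensorboard 1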