Change training dataset storage method

Store training data in separate datasets keyed by target type.
When training the NN, run the backward pass on a single-target training set.
This makes training at least 20 times faster than the previous update!
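In outline, the change swaps the single mixed rollout buffer for one buffer per target type, and an update only runs on a single-target batch once that target's buffer is full. Below is a minimal sketch of that idea, not the project's code: the names add_rollout, targets_ready_to_train, OBS_SIZE and the toy sizes are made up for illustration, and only the observation buffer is shown. The actual diff applies the same pattern to all per-target tensors (obs, actions, logprobs, rewards, values, advantages, returns) and tracks full targets in trainQueue.

import torch

TARGET_NUM = 4       # Free, Go, Attack, Defence, mirroring the Targets enum added below
DATASET_SIZE = 3000  # assumed per-target batch size, like MAX_TRAINNING_DATASETS in the diff
OBS_SIZE = 8         # placeholder observation width, only for this sketch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# one empty buffer per target type instead of a single mixed buffer
obs_buffers = [torch.empty((0, OBS_SIZE), device=device) for _ in range(TARGET_NUM)]

def add_rollout(target_type: int, rollout_obs: torch.Tensor) -> None:
    # append a finished round's observations to the buffer of its target type
    obs_buffers[target_type] = torch.cat((obs_buffers[target_type], rollout_obs.to(device)), dim=0)

def targets_ready_to_train():
    # return the target types whose buffers reached the per-target dataset size
    return [t for t in range(TARGET_NUM) if obs_buffers[t].size(0) >= DATASET_SIZE]

# after each finished round, file the data under its target type,
# then update only on the single-target batches that are full
add_rollout(2, torch.randn(500, OBS_SIZE))  # e.g. one "Attack" round
for t in targets_ready_to_train():
    batch = obs_buffers[t]                  # single-target batch for the backward pass
    # ... the PPO update on `batch` would go here ...
    obs_buffers[t] = torch.empty((0, OBS_SIZE), device=device)  # clear after training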
Koha9 2022-12-03 07:54:38 +09:00
parent 895cd5c118
commit cbc385ca10


@@ -10,6 +10,7 @@ import torch.optim as optim
 from AimbotEnv import Aimbot
 from tqdm import tqdm
+from enum import Enum
 from torch.distributions.normal import Normal
 from torch.distributions.categorical import Categorical
 from distutils.util import strtobool
@@ -34,11 +35,11 @@ BASE_PORT = 1001
 # max round steps per agent is 2500/Decision_period, 25 seconds
 # !!!check every parameters before run!!!
-TOTAL_STEPS = 6000000
+TOTAL_STEPS = 6750000
 BATCH_SIZE = 512
-MAX_TRAINNING_DATASETS = 8000
+MAX_TRAINNING_DATASETS = 3000
 DECISION_PERIOD = 1
-LEARNING_RATE = 8e-4
+LEARNING_RATE = 1e-3
 GAMMA = 0.99
 GAE_LAMBDA = 0.95
 EPOCHS = 4
@@ -54,17 +55,27 @@ NORM_ADV = True
 TRAIN = True
 WANDB_TACK = True
-#LOAD_DIR = None
-LOAD_DIR = "../PPO-Model/Aimbot-target-last.pt"
+LOAD_DIR = None
+#LOAD_DIR = "../PPO-Model/Aimbot-target-last.pt"
 # public data
+class Targets(Enum):
+    Free = 0
+    Go = 1
+    Attack = 2
+    Defence = 3
+    Num = 4
 BASE_WINREWARD = 999
 BASE_LOSEREWARD = -999
 TARGETNUM= 4
 ENV_TIMELIMIT = 30
 RESULT_BROADCAST_RATIO = 2/ENV_TIMELIMIT
-TotalRounds = {"Go":0,"Attack":0,"Free":0}
-WinRounds = {"Go":0,"Attack":0,"Free":0}
+TotalRounds = {"Free":0,"Go":0,"Attack":0}
+WinRounds = {"Free":0,"Go":0,"Attack":0}
+# !!!SPECIAL PARAMETERS!!!
+# change it while program is finished
+using_targets_num = 3

 def parse_args():
@@ -164,7 +175,7 @@ class PPOAgent(nn.Module):
 def get_actions_value(self, state: torch.Tensor, actions=None):
     hidden = self.network(state)
-    targets = state[:,0]
+    targets = state[:,0].to(torch.int32)
     # discrete
     # iterate over the targets (i.e. the agents) and use the output network that matches each target to compute its output
@@ -321,8 +332,8 @@ if __name__ == "__main__":
 optimizer = optim.Adam(agent.parameters(), lr=args.lr, eps=1e-5)
 # Tensorboard and WandB Recorder
-game_name = "Aimbot_Target"
-game_type = "OffPolicy_HMNN_EndBC"
+game_name = "Aimbot_Target_Hybrid_Multi_Output"
+game_type = "OffPolicy_EndBC"
 run_name = f"{game_name}_{game_type}_{args.seed}_{int(time.time())}"
 if args.wandb_track:
     wandb.init(
@@ -351,14 +362,24 @@ if __name__ == "__main__":
 dones_bf = [[] for i in range(env.unity_agent_num)]
 values_bf = [[] for i in range(env.unity_agent_num)]
-# TRY NOT TO MODIFY: start the game
-total_update_step = args.total_timesteps // args.datasetSize
-global_step = 0
+# start the game
+total_update_step = using_targets_num * args.total_timesteps // args.datasetSize
+target_steps = [0 for i in range(TARGETNUM)]
 start_time = time.time()
 state, _, done = env.reset()
 # state = torch.Tensor(next_obs).to(device)
 # next_done = torch.zeros(env.unity_agent_num).to(device)
+# initialize empty training datasets
+obs = [torch.tensor([]).to(device) for i in range(TARGETNUM)] # (TARGETNUM,n,env.unity_observation_size)
+actions = [torch.tensor([]).to(device) for i in range(TARGETNUM)] # (TARGETNUM,n,env.unity_action_size)
+dis_logprobs = [torch.tensor([]).to(device) for i in range(TARGETNUM)] # (TARGETNUM,n,1)
+con_logprobs = [torch.tensor([]).to(device) for i in range(TARGETNUM)] # (TARGETNUM,n,1)
+rewards = [torch.tensor([]).to(device) for i in range(TARGETNUM)] # (TARGETNUM,n,1)
+values = [torch.tensor([]).to(device) for i in range(TARGETNUM)] # (TARGETNUM,n,1)
+advantages = [torch.tensor([]).to(device) for i in range(TARGETNUM)] # (TARGETNUM,n,1)
+returns = [torch.tensor([]).to(device) for i in range(TARGETNUM)] # (TARGETNUM,n,1)
 for total_steps in range(total_update_step):
     # discunt learning rate, while step == total_update_step lr will be 0
     print("new episode")
@@ -368,24 +389,15 @@ if __name__ == "__main__":
 lrnow = frac * args.lr
 optimizer.param_groups[0]["lr"] = lrnow
-# initialize empty training datasets
-obs = torch.tensor([]).to(device) # (n,env.unity_observation_size)
-actions = torch.tensor([]).to(device) # (n,env.unity_action_size)
-dis_logprobs = torch.tensor([]).to(device) # (n,1)
-con_logprobs = torch.tensor([]).to(device) # (n,1)
-rewards = torch.tensor([]).to(device) # (n,1)
-values = torch.tensor([]).to(device) # (n,1)
-advantages = torch.tensor([]).to(device) # (n,1)
-returns = torch.tensor([]).to(device) # (n,1)
 # MAIN LOOP: run agent in environment
 i = 0
 training = False
+trainQueue = []
 while True:
     if i % args.decision_period == 0:
         step = round(i / args.decision_period)
         # Choose action by agent
-        global_step += 1 * env.unity_agent_num
         with torch.no_grad():
             # predict actions
@@ -416,7 +428,8 @@ if __name__ == "__main__":
 # finished a round, send finished memories to training datasets
 # compute advantage and discounted reward
 #print(i,"over")
-thisRewardsTensor = broadCastEndReward(rewards_bf[i],state[i,6])
+roundTargetType = int(state[i,0])
+thisRewardsTensor = broadCastEndReward(rewards_bf[i],roundTargetType)
 adv, rt = GAE(
     agent,
     args,
@@ -427,18 +440,18 @@ if __name__ == "__main__":
     torch.Tensor([next_done[i]]).to(device),
 )
 # send memories to training datasets
-obs = torch.cat((obs, torch.tensor(ob_bf[i]).to(device)), 0)
-actions = torch.cat((actions, torch.tensor(act_bf[i]).to(device)), 0)
-dis_logprobs = torch.cat(
-    (dis_logprobs, torch.tensor(dis_logprobs_bf[i]).to(device)), 0
-)
-con_logprobs = torch.cat(
-    (con_logprobs, torch.tensor(con_logprobs_bf[i]).to(device)), 0
-)
-rewards = torch.cat((rewards, thisRewardsTensor), 0)
-values = torch.cat((values, torch.tensor(values_bf[i]).to(device)), 0)
-advantages = torch.cat((advantages, adv), 0)
-returns = torch.cat((returns, rt), 0)
+obs[roundTargetType] = torch.cat((obs[roundTargetType], torch.tensor(ob_bf[i]).to(device)), 0)
+actions[roundTargetType] = torch.cat((actions[roundTargetType], torch.tensor(act_bf[i]).to(device)), 0)
+dis_logprobs[roundTargetType] = torch.cat(
+    (dis_logprobs[roundTargetType], torch.tensor(dis_logprobs_bf[i]).to(device)), 0
+)
+con_logprobs[roundTargetType] = torch.cat(
+    (con_logprobs[roundTargetType], torch.tensor(con_logprobs_bf[i]).to(device)), 0
+)
+rewards[roundTargetType] = torch.cat((rewards[roundTargetType], thisRewardsTensor), 0)
+values[roundTargetType] = torch.cat((values[roundTargetType], torch.tensor(values_bf[i]).to(device)), 0)
+advantages[roundTargetType] = torch.cat((advantages[roundTargetType], adv), 0)
+returns[roundTargetType] = torch.cat((returns[roundTargetType], rt), 0)
 # clear buffers
 ob_bf[i] = []
@@ -448,10 +461,13 @@ if __name__ == "__main__":
 rewards_bf[i] = []
 dones_bf[i] = []
 values_bf[i] = []
-print(f"train dataset added:{obs.size()[0]}/{args.datasetSize}")
-if obs.size()[0] >= args.datasetSize:
+print(f"train dataset {Targets(roundTargetType).name} added:{obs[roundTargetType].size()[0]}/{args.datasetSize}")
+for i in range(TARGETNUM):
+    if obs[i].size()[0] >= args.datasetSize:
         # start train NN
+        trainQueue.append(i)
+if(len(trainQueue)>0):
     break
 state, done = next_state, next_done
 else:
@@ -507,15 +523,19 @@ if __name__ == "__main__":
 i += 1
 if args.train:
+    meanRewardList = [] # for WANDB
+    # loop all tarining queue
+    for thisT in trainQueue:
+        target_steps[thisT]+=1
         # flatten the batch
-    b_obs = obs.reshape((-1,) + env.unity_observation_shape)
-    b_dis_logprobs = dis_logprobs.reshape(-1)
-    b_con_logprobs = con_logprobs.reshape(-1)
-    b_actions = actions.reshape((-1,) + (env.unity_action_size,))
-    b_advantages = advantages.reshape(-1)
-    b_returns = returns.reshape(-1)
-    b_values = values.reshape(-1)
-    b_size = b_obs.size()[0]
+        b_obs = obs[thisT].reshape((-1,) + env.unity_observation_shape)
+        b_dis_logprobs = dis_logprobs[thisT].reshape(-1)
+        b_con_logprobs = con_logprobs[thisT].reshape(-1)
+        b_actions = actions[thisT].reshape((-1,) + (env.unity_action_size,))
+        b_advantages = advantages[thisT].reshape(-1)
+        b_returns = returns[thisT].reshape(-1)
+        b_values = values[thisT].reshape(-1)
+        b_size = b_obs[thisT].size()[0]
         # Optimizing the policy and value network
         b_inds = np.arange(b_size)
         # clipfracs = []
@@ -605,30 +625,41 @@ if __name__ == "__main__":
 if approx_kl > args.target_kl:
     break
 """
+# record mean reward before clear history
+targetRewardMean = np.mean(rewards[thisT].to("cpu").detach().numpy().copy())
+meanRewardList.append(targetRewardMean)
+targetName = Targets(thisT).name
+# clear this target trainning set buffer
+obs[thisT] = torch.tensor([]).to(device)
+actions[thisT] = torch.tensor([]).to(device)
+dis_logprobs[thisT] = torch.tensor([]).to(device)
+con_logprobs[thisT] = torch.tensor([]).to(device)
+rewards[thisT] = torch.tensor([]).to(device)
+values[thisT] = torch.tensor([]).to(device)
+advantages[thisT] = torch.tensor([]).to(device)
+returns[thisT] = torch.tensor([]).to(device)
 # record rewards for plotting purposes
-rewardsMean = np.mean(rewards.to("cpu").detach().numpy().copy())
-writer.add_scalar("charts/learning_rate", optimizer.param_groups[0]["lr"], global_step)
-writer.add_scalar("losses/value_loss", v_loss.item(), global_step)
-writer.add_scalar("losses/dis_policy_loss", dis_pg_loss.item(), global_step)
-writer.add_scalar("losses/con_policy_loss", con_pg_loss.item(), global_step)
-writer.add_scalar("losses/total_loss", loss.item(), global_step)
-writer.add_scalar("losses/entropy_loss", entropy_loss.item(), global_step)
-# writer.add_scalar("losses/old_approx_kl", old_approx_kl.item(), global_step)
-# writer.add_scalar("losses/approx_kl", approx_kl.item(), global_step)
-# writer.add_scalar("losses/clipfrac", np.mean(clipfracs), global_step)
-# print("SPS:", int(global_step / (time.time() - start_time)))
-print("episode over mean reward:", rewardsMean)
-writer.add_scalar(
-    "charts/SPS", int(global_step / (time.time() - start_time)), global_step
-)
-writer.add_scalar("charts/Reward", rewardsMean, global_step)
-writer.add_scalar("charts/GoWinRatio", WinRounds["Go"]/TotalRounds["Go"], global_step)
-writer.add_scalar("charts/AttackWinRatio", WinRounds["Attack"]/TotalRounds["Attack"], global_step)
-writer.add_scalar("charts/FreeWinRatio", WinRounds["Free"]/TotalRounds["Free"], global_step)
-if rewardsMean > bestReward:
-    bestReward = rewardsMean
-    saveDir = "../PPO-Model/Target-700-500-256-hybrid-" + str(rewardsMean) + ".pt"
+writer.add_scalar(f"Target{targetName}/value_loss", v_loss.item(), target_steps[thisT])
+writer.add_scalar(f"Target{targetName}/value_loss", v_loss.item(), target_steps[thisT])
+writer.add_scalar(f"Target{targetName}/dis_policy_loss", dis_pg_loss.item(), target_steps[thisT])
+writer.add_scalar(f"Target{targetName}/con_policy_loss", con_pg_loss.item(), target_steps[thisT])
+writer.add_scalar(f"Target{targetName}/total_loss", loss.item(), target_steps[thisT])
+writer.add_scalar(f"Target{targetName}/entropy_loss", entropy_loss.item(), target_steps[thisT])
+writer.add_scalar(f"Target{targetName}/Reward", targetRewardMean, target_steps[thisT])
+writer.add_scalar(f"Target{targetName}/WinRatio", WinRounds[targetName]/TotalRounds[targetName], target_steps[thisT])
+print(f"episode over Target{targetName} mean reward:", targetRewardMean)
+TotalRewardMean = np.mean(meanRewardList)
+writer.add_scalar("GlobalCharts/TotalRewardMean", TotalRewardMean, total_steps)
+writer.add_scalar("GlobalCharts/learning_rate", optimizer.param_groups[0]["lr"], total_steps)
+# New Record!
+if TotalRewardMean > bestReward:
+    bestReward = targetRewardMean
+    saveDir = "../PPO-Model/Hybrid-MNN-500-300" + str(TotalRewardMean) + ".pt"
     torch.save(agent, saveDir)
+saveDir = "../PPO-Model/Hybrid-MNN-500-300-Last" + ".pt"
+torch.save(agent, saveDir)
 env.close()
 writer.close()