diff --git a/.gitignore b/.gitignore
index 1292001..f29e4da 100644
--- a/.gitignore
+++ b/.gitignore
@@ -81,8 +81,6 @@ crashlytics-build.properties
 /Aimbot-PPO-Python/Pytorch/runs/
 /Aimbot-PPO-Python/Pytorch/wandb/
 /Aimbot-PPO-Python/Backup/
-/Aimbot-PPO-Python/Build-MultiScene-WithLoad/
-/Aimbot-PPO-Python/Build-CloseEnemyCut/
-/Aimbot-PPO-Python/Build-ParallelEnv/
+/Aimbot-PPO-Python/Build/
 /Aimbot-PPO-Python/PPO-Model/
 /Aimbot-PPO-Python/GAIL-Expert-Data/
\ No newline at end of file
diff --git a/Aimbot-PPO-Python/Pytorch/ppo.py b/Aimbot-PPO-Python/Pytorch/ppo.py
index 411b294..1ef7d41 100644
--- a/Aimbot-PPO-Python/Pytorch/ppo.py
+++ b/Aimbot-PPO-Python/Pytorch/ppo.py
@@ -13,30 +13,36 @@ from torch.distributions.categorical import Categorical
 from distutils.util import strtobool
 from torch.utils.tensorboard import SummaryWriter
 
+bestReward = 0
+
 DEFAULT_SEED = 9331
-ENV_PATH = "../Build-ParallelEnv/Aimbot-ParallelEnv"
+ENV_PATH = "../Build/Build-ParallelEnv-BigArea-6Enemy/Aimbot-ParallelEnv"
 WAND_ENTITY = "koha9"
 WORKER_ID = 1
-BASE_PORT = 2002
+BASE_PORT = 1000
+TOTAL_STEPS = 2000000
+STEP_NUM = 314
+DECISION_PERIOD = 2
 LEARNING_RATE = 7e-4
 GAMMA = 0.99
 GAE_LAMBDA = 0.95
-TOTAL_STEPS = 2000000
-STEP_NUM = 256
-MINIBATCH_NUM = 1
+MINIBATCH_NUM = 4
 EPOCHS = 4
 CLIP_COEF = 0.1
+POLICY_COEF = 1.0
 ENTROPY_COEF = 0.01
 CRITIC_COEF = 0.5
 ANNEAL_LEARNING_RATE = True
 CLIP_VLOSS = True
 NORM_ADV = True
+TRAIN = True
 
-WANDB_TACK = True
-LOAD_DIR = "../PPO-Model/SmallArea-256-128-hybrid.pt"
+WANDB_TACK = False
+LOAD_DIR = None
+# LOAD_DIR = "../PPO-Model/SmallArea-256-128-hybrid-2nd-trainning.pt"
 
 
 def parse_args():
@@ -59,6 +65,8 @@ def parse_args():
         help="total timesteps of the experiments")
 
     # model parameters
+    parser.add_argument("--train",type=lambda x: bool(strtobool(x)), default=TRAIN, nargs="?", const=True,
+        help="Train Model or not")
     parser.add_argument("--stepNum", type=int, default=STEP_NUM,
         help="the number of steps to run in each environment per policy rollout")
     parser.add_argument("--minibatchesNum", type=int, default=MINIBATCH_NUM,
@@ -73,8 +81,10 @@ def parse_args():
         help="the entity (team) of wandb's project")
     parser.add_argument("--load-dir", type=str, default=LOAD_DIR,
         help="load model directory")
+    parser.add_argument("--decision-period", type=int, default=DECISION_PERIOD,
+        help="the number of steps to run in each environment per policy rollout")
 
-    # GAE
+    # GAE loss
     parser.add_argument("--gae", type=lambda x: bool(strtobool(x)), default=True, nargs="?", const=True,
         help="Use GAE for advantage computation")
     parser.add_argument("--norm-adv", type=lambda x: bool(strtobool(x)), default=NORM_ADV, nargs="?", const=True,
@@ -85,6 +95,8 @@ def parse_args():
         help="the lambda for the general advantage estimation")
     parser.add_argument("--clip-coef", type=float, default=CLIP_COEF,
         help="the surrogate clipping coefficient")
+    parser.add_argument("--policy-coef", type=float, default=POLICY_COEF,
+        help="coefficient of the policy")
     parser.add_argument("--ent-coef", type=float, default=ENTROPY_COEF,
         help="coefficient of the entropy")
     parser.add_argument("--critic-coef", type=float, default=CRITIC_COEF,
@@ -114,15 +126,15 @@ class PPOAgent(nn.Module):
         self.continuous_size = env.unity_continuous_size
 
         self.network = nn.Sequential(
-            layer_init(nn.Linear(np.array(env.unity_observation_shape).prod(), 256)),
+            layer_init(nn.Linear(np.array(env.unity_observation_shape).prod(), 384)),
             nn.ReLU(),
-            layer_init(nn.Linear(256, 128)),
+            layer_init(nn.Linear(384, 256)),
             nn.ReLU(),
         )
-        self.actor_dis = layer_init(nn.Linear(128, self.discrete_size), std=0.01)
-        self.actor_mean = layer_init(nn.Linear(128, self.continuous_size), std=0.01)
+        self.actor_dis = layer_init(nn.Linear(256, self.discrete_size), std=0.01)
+        self.actor_mean = layer_init(nn.Linear(256, self.continuous_size), std=0.01)
         self.actor_logstd = nn.Parameter(torch.zeros(1, self.continuous_size))
-        self.critic = layer_init(nn.Linear(128, 1), std=1)
+        self.critic = layer_init(nn.Linear(256, 1), std=1)
 
     def get_value(self, state: torch.Tensor):
         return self.critic(self.network(state))
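Note on the hunk above: layer_init is applied to every Linear layer but its definition is outside this diff. A minimal sketch of the helper commonly used in PPO scripts of this style (orthogonal weights scaled by std, constant bias) is shown below; the repository's actual implementation may differ:

    import numpy as np
    import torch

    def layer_init(layer, std=np.sqrt(2), bias_const=0.0):
        # orthogonal weight init with gain std; actor heads typically pass std=0.01, the critic std=1
        torch.nn.init.orthogonal_(layer.weight, std)
        torch.nn.init.constant_(layer.bias, bias_const)
        return layer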
@@ -140,9 +152,16 @@ class PPOAgent(nn.Module):
         con_probs = Normal(actions_mean, action_std)
 
         if actions is None:
-            disAct = torch.stack([ctgr.sample() for ctgr in multi_categoricals])
-            conAct = con_probs.sample()
-            actions = torch.cat([disAct.T, conAct], dim=1)
+            if args.train:
+                # select actions base on probability distribution model
+                disAct = torch.stack([ctgr.sample() for ctgr in multi_categoricals])
+                conAct = con_probs.sample()
+                actions = torch.cat([disAct.T, conAct], dim=1)
+            else:
+                # select actions base on best probability distribution
+                disAct = torch.stack([torch.argmax(logit, dim=1) for logit in split_logits])
+                conAct = actions_mean
+                actions = torch.cat([disAct.T, conAct], dim=1)
         else:
             disAct = actions[:, 0 : env.unity_discrete_type].T
             conAct = actions[:, env.unity_discrete_type :]
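Note on the hunk above: with --train the agent keeps sampling from the Categorical/Normal distributions (exploration), while the new else branch acts greedily by taking the argmax of the discrete logits and the mean of the continuous head, so no exploration noise is added at evaluation time. A self-contained sketch of the two selection modes, using toy tensors rather than the repository's classes:

    import torch
    from torch.distributions import Categorical, Normal

    logits = torch.tensor([[0.1, 2.0, -1.0]])            # one agent, three discrete choices
    mean, std = torch.tensor([0.3]), torch.tensor([0.5])

    # training: sample for exploration
    dis_act = Categorical(logits=logits).sample()
    con_act = Normal(mean, std).sample()

    # evaluation: act deterministically (mode of each distribution)
    dis_act = torch.argmax(logits, dim=1)
    con_act = mean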
@@ -181,7 +200,7 @@ if __name__ == "__main__":
 
     # Tensorboard and WandB Recorder
     game_name = "Aimbot"
-    run_name = f"{game_name}__{args.seed}__{int(time.time())}"
+    run_name = f"{game_name}_{args.seed}_{int(time.time())}"
     if args.wandb_track:
         wandb.init(
             project=run_name,
@@ -227,24 +246,37 @@ if __name__ == "__main__":
             optimizer.param_groups[0]["lr"] = lrnow
 
         # MAIN LOOP: run agent in environment
-        for step in range(args.stepNum):
-            global_step += 1 * env.unity_agent_num
-            obs[step] = next_obs
-            dones[step] = next_done
+        for i in range(args.stepNum * args.decision_period):
+            if i % args.decision_period == 0:
+                step = round(i / args.decision_period)
+                # Choose action by agent
+                global_step += 1 * env.unity_agent_num
+                obs[step] = next_obs
+                dones[step] = next_done
 
-            with torch.no_grad():
-                # predict actions
-                action, dis_logprob, _, con_logprob, _, value = agent.get_actions_value(next_obs)
-                value = value.flatten()
-            next_obs, reward, done = env.step(action.cpu().numpy())
+                with torch.no_grad():
+                    # predict actions
+                    action, dis_logprob, _, con_logprob, _, value = agent.get_actions_value(
+                        next_obs
+                    )
+                    value = value.flatten()
+                next_obs, reward, done = env.step(action.cpu().numpy())
 
-            # save memories
-            actions[step] = action
-            dis_logprobs[step] = dis_logprob
-            con_logprobs[step] = con_logprob
-            values[step] = value
-            rewards[step] = torch.tensor(reward).to(device).view(-1)
-            next_obs, next_done = torch.Tensor(next_obs).to(device), torch.Tensor(done).to(device)
+                # save memories
+                actions[step] = action
+                dis_logprobs[step] = dis_logprob
+                con_logprobs[step] = con_logprob
+                values[step] = value
+                rewards[step] = torch.tensor(reward).to(device).view(-1)
+                next_obs, next_done = torch.Tensor(next_obs).to(device), torch.Tensor(done).to(
+                    device
+                )
+            else:
+                # skip this step use last predict action
+                next_obs, reward, done = env.step(action.cpu().numpy())
+                next_obs, next_done = torch.Tensor(next_obs).to(device), torch.Tensor(done).to(
+                    device
+                )
 
         # GAE
        with torch.no_grad():
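Note on the rollout hunk above: the loop now runs args.stepNum * args.decision_period environment frames, but the policy is only queried on every decision_period-th frame; the buffers are indexed with step = round(i / args.decision_period), which equals i // args.decision_period whenever i is a multiple of the period. On the skipped frames the previous action is replayed and nothing is stored, so as written the rewards returned on those frames are not added to rewards[step]. A toy sketch of the indexing, with hypothetical constants:

    decision_period, step_num = 2, 4
    for i in range(step_num * decision_period):
        if i % decision_period == 0:
            step = i // decision_period  # buffer slot 0 .. step_num - 1
            print(f"frame {i}: query policy, store transition at slot {step}")
        else:
            print(f"frame {i}: repeat previous action, store nothing")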
@@ -276,119 +308,126 @@ if __name__ == "__main__":
                     returns[t] = rewards[t] + args.gamma * nextnonterminal * next_return
             advantages = returns - values
 
-        # flatten the batch
-        b_obs = obs.reshape((-1,) + env.unity_observation_shape)
-        b_dis_logprobs = dis_logprobs.reshape(-1)
-        b_con_logprobs = con_logprobs.reshape(-1)
-        b_actions = actions.reshape((-1,) + (env.unity_action_size,))
-        b_advantages = advantages.reshape(-1)
-        b_returns = returns.reshape(-1)
-        b_values = values.reshape(-1)
+        if args.train:
+            # flatten the batch
+            b_obs = obs.reshape((-1,) + env.unity_observation_shape)
+            b_dis_logprobs = dis_logprobs.reshape(-1)
+            b_con_logprobs = con_logprobs.reshape(-1)
+            b_actions = actions.reshape((-1,) + (env.unity_action_size,))
+            b_advantages = advantages.reshape(-1)
+            b_returns = returns.reshape(-1)
+            b_values = values.reshape(-1)
 
-        # Optimizing the policy and value network
-        b_inds = np.arange(args.batch_size)
-        #clipfracs = []
-        for epoch in range(args.epochs):
-            # shuffle all datasets
-            np.random.shuffle(b_inds)
-            for start in range(0, args.batch_size, args.minibatch_size):
-                end = start + args.minibatch_size
-                mb_inds = b_inds[start:end]
-                mb_advantages = b_advantages[mb_inds]
+            # Optimizing the policy and value network
+            b_inds = np.arange(args.batch_size)
+            # clipfracs = []
+            for epoch in range(args.epochs):
+                # shuffle all datasets
+                np.random.shuffle(b_inds)
+                for start in range(0, args.batch_size, args.minibatch_size):
+                    end = start + args.minibatch_size
+                    mb_inds = b_inds[start:end]
+                    mb_advantages = b_advantages[mb_inds]
 
-                # normalize advantages
-                if args.norm_adv:
-                    mb_advantages = (mb_advantages - mb_advantages.mean()) / (
-                        mb_advantages.std() + 1e-8
+                    # normalize advantages
+                    if args.norm_adv:
+                        mb_advantages = (mb_advantages - mb_advantages.mean()) / (
+                            mb_advantages.std() + 1e-8
+                        )
+
+                    (
+                        _,
+                        new_dis_logprob,
+                        dis_entropy,
+                        new_con_logprob,
+                        con_entropy,
+                        newvalue,
+                    ) = agent.get_actions_value(b_obs[mb_inds], b_actions[mb_inds])
+                    # discrete ratio
+                    dis_logratio = new_dis_logprob - b_dis_logprobs[mb_inds]
+                    dis_ratio = dis_logratio.exp()
+                    # continuous ratio
+                    con_logratio = new_con_logprob - b_con_logprobs[mb_inds]
+                    con_ratio = con_logratio.exp()
+
+                    """
+                    # early stop
+                    with torch.no_grad():
+                        # calculate approx_kl http://joschu.net/blog/kl-approx.html
+                        old_approx_kl = (-logratio).mean()
+                        approx_kl = ((ratio - 1) - logratio).mean()
+                        clipfracs += [((ratio - 1.0).abs() > args.clip_coef).float().mean().item()]
+                    """
+
+                    # discrete Policy loss
+                    dis_pg_loss_orig = -mb_advantages * dis_ratio
+                    dis_pg_loss_clip = -mb_advantages * torch.clamp(
+                        dis_ratio, 1 - args.clip_coef, 1 + args.clip_coef
+                    )
+                    dis_pg_loss = torch.max(dis_pg_loss_orig, dis_pg_loss_clip).mean()
+                    # continuous Policy loss
+                    con_pg_loss_orig = -mb_advantages * con_ratio
+                    con_pg_loss_clip = -mb_advantages * torch.clamp(
+                        con_ratio, 1 - args.clip_coef, 1 + args.clip_coef
+                    )
+                    con_pg_loss = torch.max(con_pg_loss_orig, con_pg_loss_clip).mean()
+
+                    # Value loss
+                    newvalue = newvalue.view(-1)
+                    if args.clip_vloss:
+                        v_loss_unclipped = (newvalue - b_returns[mb_inds]) ** 2
+                        v_clipped = b_values[mb_inds] + torch.clamp(
+                            newvalue - b_values[mb_inds],
+                            -args.clip_coef,
+                            args.clip_coef,
+                        )
+                        v_loss_clipped = (v_clipped - b_returns[mb_inds]) ** 2
+                        v_loss_max = torch.max(v_loss_unclipped, v_loss_clipped)
+                        v_loss = 0.5 * v_loss_max.mean()
+                    else:
+                        v_loss = 0.5 * ((newvalue - b_returns[mb_inds]) ** 2).mean()
+
+                    # total loss
+                    entropy_loss = dis_entropy.mean() + con_entropy.mean()
+                    loss = (
+                        dis_pg_loss * args.policy_coef
+                        + con_pg_loss * args.policy_coef
+                        - entropy_loss * args.ent_coef
+                        + v_loss * args.critic_coef
                     )
 
-                (
-                    _,
-                    new_dis_logprob,
-                    dis_entropy,
-                    new_con_logprob,
-                    con_entropy,
-                    newvalue,
-                ) = agent.get_actions_value(b_obs[mb_inds], b_actions[mb_inds])
-                # discrete ratio
-                dis_logratio = new_dis_logprob - b_dis_logprobs[mb_inds]
-                dis_ratio = dis_logratio.exp()
-                # continuous ratio
-                con_logratio = new_con_logprob - b_con_logprobs[mb_inds]
-                con_ratio = con_logratio.exp()
+                    optimizer.zero_grad()
+                    loss.backward()
+                    # Clips gradient norm of an iterable of parameters.
+                    nn.utils.clip_grad_norm_(agent.parameters(), args.max_grad_norm)
+                    optimizer.step()
 
                     """
-                # early stop
-                with torch.no_grad():
-                    # calculate approx_kl http://joschu.net/blog/kl-approx.html
-                    old_approx_kl = (-logratio).mean()
-                    approx_kl = ((ratio - 1) - logratio).mean()
-                    clipfracs += [((ratio - 1.0).abs() > args.clip_coef).float().mean().item()]
+                    if args.target_kl is not None:
+                        if approx_kl > args.target_kl:
+                            break
                     """
-
-                # discrete Policy loss
-                dis_pg_loss_orig = -mb_advantages * dis_ratio
-                dis_pg_loss_clip = -mb_advantages * torch.clamp(
-                    dis_ratio, 1 - args.clip_coef, 1 + args.clip_coef
-                )
-                dis_pg_loss = torch.max(dis_pg_loss_orig, dis_pg_loss_clip).mean()
-                # continuous Policy loss
-                con_pg_loss_orig = -mb_advantages * con_ratio
-                con_pg_loss_clip = -mb_advantages * torch.clamp(
-                    con_ratio, 1 - args.clip_coef, 1 + args.clip_coef
-                )
-                con_pg_loss = torch.max(con_pg_loss_orig, con_pg_loss_clip).mean()
-
-                # Value loss
-                newvalue = newvalue.view(-1)
-                if args.clip_vloss:
-                    v_loss_unclipped = (newvalue - b_returns[mb_inds]) ** 2
-                    v_clipped = b_values[mb_inds] + torch.clamp(
-                        newvalue - b_values[mb_inds],
-                        -args.clip_coef,
-                        args.clip_coef,
-                    )
-                    v_loss_clipped = (v_clipped - b_returns[mb_inds]) ** 2
-                    v_loss_max = torch.max(v_loss_unclipped, v_loss_clipped)
-                    v_loss = 0.5 * v_loss_max.mean()
-                else:
-                    v_loss = 0.5 * ((newvalue - b_returns[mb_inds]) ** 2).mean()
-
-                # total loss
-                entropy_loss = dis_entropy.mean() + con_entropy.mean()
-                loss = (
-                    dis_pg_loss
-                    + con_pg_loss
-                    - entropy_loss * args.ent_coef
-                    + v_loss * args.critic_coef
-                )
-
-                optimizer.zero_grad()
-                loss.backward()
-                # Clips gradient norm of an iterable of parameters.
-                nn.utils.clip_grad_norm_(agent.parameters(), args.max_grad_norm)
-                optimizer.step()
-
-                """
-                if args.target_kl is not None:
-                    if approx_kl > args.target_kl:
-                        break
-                """
-        # record rewards for plotting purposes
-        writer.add_scalar("charts/learning_rate", optimizer.param_groups[0]["lr"], global_step)
-        writer.add_scalar("losses/value_loss", v_loss.item(), global_step)
-        writer.add_scalar("losses/dis_policy_loss", dis_pg_loss.item(), global_step)
-        writer.add_scalar("losses/con_policy_loss", con_pg_loss.item(), global_step)
-        writer.add_scalar("losses/total_loss", loss.item(), global_step)
-        writer.add_scalar("losses/entropy_loss", entropy_loss.item(), global_step)
-        # writer.add_scalar("losses/old_approx_kl", old_approx_kl.item(), global_step)
-        # writer.add_scalar("losses/approx_kl", approx_kl.item(), global_step)
-        #writer.add_scalar("losses/clipfrac", np.mean(clipfracs), global_step)
-        print("SPS:", int(global_step / (time.time() - start_time)))
-        writer.add_scalar("charts/SPS", int(global_step / (time.time() - start_time)), global_step)
-        writer.add_scalar(
-            "charts/Reward", np.mean(rewards.to("cpu").detach().numpy().copy()), global_step
-        )
+        # record rewards for plotting purposes
+        rewardsMean = np.mean(rewards.to("cpu").detach().numpy().copy())
+        writer.add_scalar("charts/learning_rate", optimizer.param_groups[0]["lr"], global_step)
+        writer.add_scalar("losses/value_loss", v_loss.item(), global_step)
+        writer.add_scalar("losses/dis_policy_loss", dis_pg_loss.item(), global_step)
+        writer.add_scalar("losses/con_policy_loss", con_pg_loss.item(), global_step)
+        writer.add_scalar("losses/total_loss", loss.item(), global_step)
+        writer.add_scalar("losses/entropy_loss", entropy_loss.item(), global_step)
+        # writer.add_scalar("losses/old_approx_kl", old_approx_kl.item(), global_step)
+        # writer.add_scalar("losses/approx_kl", approx_kl.item(), global_step)
+        # writer.add_scalar("losses/clipfrac", np.mean(clipfracs), global_step)
+        # print("SPS:", int(global_step / (time.time() - start_time)))
+        print("episode over mean reward:", rewardsMean)
+        writer.add_scalar(
+            "charts/SPS", int(global_step / (time.time() - start_time)), global_step
+        )
+        writer.add_scalar("charts/Reward", rewardsMean, global_step)
+        if rewardsMean > bestReward:
+            bestReward = rewardsMean
+            saveDir = "../PPO-Model/bigArea-384-128-hybrid-" + str(rewardsMean) + ".pt"
+            torch.save(agent, saveDir)
 
     env.close()
     writer.close()
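Note on the checkpointing added at the end of the hunk above: torch.save(agent, saveDir) pickles the whole module, so loading it later requires the PPOAgent class (and layer_init) to be importable under the same names. A state_dict-based alternative is sketched below; the path is hypothetical and the PPOAgent(env) constructor call is an assumption based on the class shown in this diff:

    torch.save(agent.state_dict(), "../PPO-Model/checkpoint.pt")

    agent = PPOAgent(env).to(device)  # rebuild the same architecture first
    agent.load_state_dict(torch.load("../PPO-Model/checkpoint.pt", map_location=device))
    agent.eval()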
diff --git a/Aimbot-PPO-Python/Pytorch/testarea.ipynb b/Aimbot-PPO-Python/Pytorch/testarea.ipynb
index 8801206..dd9fd35 100644
--- a/Aimbot-PPO-Python/Pytorch/testarea.ipynb
+++ b/Aimbot-PPO-Python/Pytorch/testarea.ipynb
@@ -431,6 +431,45 @@
     "mymodel = torch.load(\"../PPO-Model/SmallArea-256-128-hybrid.pt\")\n",
     "mymodel.eval()"
    ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "x : torch.Size([2, 3, 4])\n",
+      "x : torch.Size([6, 2, 3, 4])\n",
+      "x : torch.Size([6, 2, 3, 4])\n"
+     ]
+    }
+   ],
+   "source": [
+    "import torch\n",
+    "#1\n",
+    "x = torch.randn(2, 1, 1)  # size-1 dims can be expanded, here to 3 and 4\n",
+    "x = x.expand(2, 3, 4)\n",
+    "print('x :', x.size())\n",
+    "\n",
+    "#2\n",
+    "# a new dimension can only be added at the front, otherwise expand raises an error\n",
+    "#x = x.expand(2, 3, 4, 6)\n",
+    "\n",
+    "x = x.expand(6, 2, 3, 4)\n",
+    "print('x :', x.size())\n",
+    "\n",
+    "#3\n",
+    "# a dimension given as -1 keeps its original size\n",
+    "x = x.expand(6, -1, -1, -1)\n",
+    "print('x :', x.size())\n",
+    "\n",
+    "x : torch.Size([2, 3, 4])\n",
+    "x : torch.Size([6, 2, 3, 4])\n",
+    "x : torch.Size([6, 2, 3, 4])"
+   ]
  }
 ],
 "metadata": {
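Note on the notebook cell added above: Tensor.expand returns a view over the same storage and can only broadcast size-1 dimensions (new dimensions may only be prepended at the front), which is why x.expand(2, 3, 4, 6) fails; Tensor.repeat, by contrast, materializes a copy. A short sketch of the difference:

    import torch

    x = torch.randn(2, 1, 1)
    y = x.expand(2, 3, 4)    # view, no data copied; only size-1 dims are broadcast
    z = x.repeat(1, 3, 4)    # real copy with the same final shape
    print(y.shape, z.shape)  # torch.Size([2, 3, 4]) torch.Size([2, 3, 4])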