Add load & save function.

Add train flag to switch between training and testing the model.
Add a new action-selection path for test mode (greedy instead of sampled actions).
Add a decision period to skip steps between agent decisions.
Koha9 2022-11-08 23:14:34 +09:00
parent 474032d1e8
commit a0895c7449
3 changed files with 218 additions and 142 deletions
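
For context, the load & save flow this commit introduces amounts to: start from a fresh PPOAgent unless --load-dir points at an existing checkpoint, and during training save the whole agent whenever the mean reward beats the best seen so far. A minimal sketch of that pattern follows; the loading branch and the make_agent callable are assumptions for illustration, while the save-on-best logic mirrors the block added at the end of the training script (see the diff below).

import torch

bestReward = 0

def load_or_create_agent(load_dir, make_agent):
    # Assumed loading branch: LOAD_DIR defaults to None in this commit and the
    # checkpoint is a full pickled model, so torch.load() restores it directly.
    if load_dir is None:
        return make_agent()  # e.g. lambda: PPOAgent(env).to(device)
    return torch.load(load_dir)

def save_if_best(agent, rewardsMean):
    # Mirrors the save-on-best block added at the end of the training loop.
    global bestReward
    if rewardsMean > bestReward:
        bestReward = rewardsMean
        saveDir = "../PPO-Model/bigArea-384-128-hybrid-" + str(rewardsMean) + ".pt"
        torch.save(agent, saveDir)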

.gitignore

@@ -81,8 +81,6 @@ crashlytics-build.properties
 /Aimbot-PPO-Python/Pytorch/runs/
 /Aimbot-PPO-Python/Pytorch/wandb/
 /Aimbot-PPO-Python/Backup/
-/Aimbot-PPO-Python/Build-MultiScene-WithLoad/
-/Aimbot-PPO-Python/Build-CloseEnemyCut/
-/Aimbot-PPO-Python/Build-ParallelEnv/
+/Aimbot-PPO-Python/Build/
 /Aimbot-PPO-Python/PPO-Model/
 /Aimbot-PPO-Python/GAIL-Expert-Data/


@@ -13,30 +13,36 @@ from torch.distributions.categorical import Categorical
 from distutils.util import strtobool
 from torch.utils.tensorboard import SummaryWriter
 
+bestReward = 0
+
 DEFAULT_SEED = 9331
-ENV_PATH = "../Build-ParallelEnv/Aimbot-ParallelEnv"
+ENV_PATH = "../Build/Build-ParallelEnv-BigArea-6Enemy/Aimbot-ParallelEnv"
 WAND_ENTITY = "koha9"
 WORKER_ID = 1
-BASE_PORT = 2002
+BASE_PORT = 1000
+
+TOTAL_STEPS = 2000000
+STEP_NUM = 314
+DECISION_PERIOD = 2
 LEARNING_RATE = 7e-4
 GAMMA = 0.99
 GAE_LAMBDA = 0.95
-TOTAL_STEPS = 2000000
-STEP_NUM = 256
-MINIBATCH_NUM = 1
+MINIBATCH_NUM = 4
 EPOCHS = 4
 CLIP_COEF = 0.1
+POLICY_COEF = 1.0
 ENTROPY_COEF = 0.01
 CRITIC_COEF = 0.5
 ANNEAL_LEARNING_RATE = True
 CLIP_VLOSS = True
 NORM_ADV = True
-WANDB_TACK = True
-LOAD_DIR = "../PPO-Model/SmallArea-256-128-hybrid.pt"
+TRAIN = True
+WANDB_TACK = False
+LOAD_DIR = None
+# LOAD_DIR = "../PPO-Model/SmallArea-256-128-hybrid-2nd-trainning.pt"
 
 def parse_args():
@@ -59,6 +65,8 @@ def parse_args():
         help="total timesteps of the experiments")
     # model parameters
+    parser.add_argument("--train",type=lambda x: bool(strtobool(x)), default=TRAIN, nargs="?", const=True,
+        help="Train Model or not")
     parser.add_argument("--stepNum", type=int, default=STEP_NUM,
         help="the number of steps to run in each environment per policy rollout")
     parser.add_argument("--minibatchesNum", type=int, default=MINIBATCH_NUM,
@@ -73,8 +81,10 @@ def parse_args():
         help="the entity (team) of wandb's project")
     parser.add_argument("--load-dir", type=str, default=LOAD_DIR,
         help="load model directory")
-    # GAE
+    parser.add_argument("--decision-period", type=int, default=DECISION_PERIOD,
+        help="the number of steps to run in each environment per policy rollout")
+    # GAE loss
     parser.add_argument("--gae", type=lambda x: bool(strtobool(x)), default=True, nargs="?", const=True,
         help="Use GAE for advantage computation")
     parser.add_argument("--norm-adv", type=lambda x: bool(strtobool(x)), default=NORM_ADV, nargs="?", const=True,
@@ -85,6 +95,8 @@ def parse_args():
         help="the lambda for the general advantage estimation")
     parser.add_argument("--clip-coef", type=float, default=CLIP_COEF,
         help="the surrogate clipping coefficient")
+    parser.add_argument("--policy-coef", type=float, default=POLICY_COEF,
+        help="coefficient of the policy")
     parser.add_argument("--ent-coef", type=float, default=ENTROPY_COEF,
         help="coefficient of the entropy")
     parser.add_argument("--critic-coef", type=float, default=CRITIC_COEF,
@@ -114,15 +126,15 @@ class PPOAgent(nn.Module):
         self.continuous_size = env.unity_continuous_size
 
         self.network = nn.Sequential(
-            layer_init(nn.Linear(np.array(env.unity_observation_shape).prod(), 256)),
+            layer_init(nn.Linear(np.array(env.unity_observation_shape).prod(), 384)),
             nn.ReLU(),
-            layer_init(nn.Linear(256, 128)),
+            layer_init(nn.Linear(384, 256)),
             nn.ReLU(),
         )
-        self.actor_dis = layer_init(nn.Linear(128, self.discrete_size), std=0.01)
-        self.actor_mean = layer_init(nn.Linear(128, self.continuous_size), std=0.01)
+        self.actor_dis = layer_init(nn.Linear(256, self.discrete_size), std=0.01)
+        self.actor_mean = layer_init(nn.Linear(256, self.continuous_size), std=0.01)
         self.actor_logstd = nn.Parameter(torch.zeros(1, self.continuous_size))
-        self.critic = layer_init(nn.Linear(128, 1), std=1)
+        self.critic = layer_init(nn.Linear(256, 1), std=1)
 
     def get_value(self, state: torch.Tensor):
         return self.critic(self.network(state))
@@ -140,9 +152,16 @@ class PPOAgent(nn.Module):
         con_probs = Normal(actions_mean, action_std)
 
         if actions is None:
+            if args.train:
+                # select actions base on probability distribution model
                 disAct = torch.stack([ctgr.sample() for ctgr in multi_categoricals])
                 conAct = con_probs.sample()
                 actions = torch.cat([disAct.T, conAct], dim=1)
+            else:
+                # select actions base on best probability distribution
+                disAct = torch.stack([torch.argmax(logit, dim=1) for logit in split_logits])
+                conAct = actions_mean
+                actions = torch.cat([disAct.T, conAct], dim=1)
         else:
             disAct = actions[:, 0 : env.unity_discrete_type].T
             conAct = actions[:, env.unity_discrete_type :]
@@ -181,7 +200,7 @@ if __name__ == "__main__":
     # Tensorboard and WandB Recorder
     game_name = "Aimbot"
-    run_name = f"{game_name}__{args.seed}__{int(time.time())}"
+    run_name = f"{game_name}_{args.seed}_{int(time.time())}"
     if args.wandb_track:
         wandb.init(
             project=run_name,
@@ -227,14 +246,19 @@ if __name__ == "__main__":
             optimizer.param_groups[0]["lr"] = lrnow
 
         # MAIN LOOP: run agent in environment
-        for step in range(args.stepNum):
+        for i in range(args.stepNum * args.decision_period):
+            if i % args.decision_period == 0:
+                step = round(i / args.decision_period)
+                # Choose action by agent
                 global_step += 1 * env.unity_agent_num
                 obs[step] = next_obs
                 dones[step] = next_done
 
                 with torch.no_grad():
                     # predict actions
-                    action, dis_logprob, _, con_logprob, _, value = agent.get_actions_value(next_obs)
+                    action, dis_logprob, _, con_logprob, _, value = agent.get_actions_value(
+                        next_obs
+                    )
                     value = value.flatten()
 
                 next_obs, reward, done = env.step(action.cpu().numpy())
@@ -244,7 +268,15 @@ if __name__ == "__main__":
                 con_logprobs[step] = con_logprob
                 values[step] = value
                 rewards[step] = torch.tensor(reward).to(device).view(-1)
-                next_obs, next_done = torch.Tensor(next_obs).to(device), torch.Tensor(done).to(device)
+                next_obs, next_done = torch.Tensor(next_obs).to(device), torch.Tensor(done).to(
+                    device
+                )
+            else:
+                # skip this step use last predict action
+                next_obs, reward, done = env.step(action.cpu().numpy())
+                next_obs, next_done = torch.Tensor(next_obs).to(device), torch.Tensor(done).to(
+                    device
+                )
 
         # GAE
         with torch.no_grad():
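
Taken together, the two hunks above implement a simple frame-skip: the policy is queried only on every decision_period-th frame, the transition is written to the rollout buffers at the down-sampled index step, and the skipped frames just replay the last predicted action (their rewards are not stored in this version). A condensed sketch of that control flow, with a store() callback standing in for the buffer writes (an assumption for illustration; interfaces otherwise follow the diff above):

import torch

def frame_skip_rollout(env, agent, next_obs, step_num, decision_period, store):
    # store(step, obs, action, value, reward, done) is a stand-in for writing to
    # the obs/actions/values/rewards tensors of the real training script.
    action = value = None
    for i in range(step_num * decision_period):
        decide = i % decision_period == 0
        if decide:
            step = i // decision_period  # equals round(i / decision_period) here
            with torch.no_grad():
                action, *_, value = agent.get_actions_value(next_obs)
        # decision frames and skipped frames both send the latest predicted action
        obs, reward, done = env.step(action.cpu().numpy())
        if decide:
            store(step, next_obs, action, value.flatten(), reward, done)
        next_obs = torch.Tensor(obs)
    return next_obs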
@@ -276,6 +308,7 @@ if __name__ == "__main__":
                     returns[t] = rewards[t] + args.gamma * nextnonterminal * next_return
                 advantages = returns - values
 
+        if args.train:
             # flatten the batch
             b_obs = obs.reshape((-1,) + env.unity_observation_shape)
             b_dis_logprobs = dis_logprobs.reshape(-1)
@@ -287,7 +320,7 @@ if __name__ == "__main__":
 
             # Optimizing the policy and value network
             b_inds = np.arange(args.batch_size)
-            #clipfracs = []
+            # clipfracs = []
             for epoch in range(args.epochs):
                 # shuffle all datasets
                 np.random.shuffle(b_inds)
@@ -357,8 +390,8 @@ if __name__ == "__main__":
                     # total loss
                     entropy_loss = dis_entropy.mean() + con_entropy.mean()
                     loss = (
-                        dis_pg_loss
-                        + con_pg_loss
+                        dis_pg_loss * args.policy_coef
+                        + con_pg_loss * args.policy_coef
                         - entropy_loss * args.ent_coef
                         + v_loss * args.critic_coef
                     )
@@ -375,6 +408,7 @@ if __name__ == "__main__":
                     break
             """
 
         # record rewards for plotting purposes
+        rewardsMean = np.mean(rewards.to("cpu").detach().numpy().copy())
         writer.add_scalar("charts/learning_rate", optimizer.param_groups[0]["lr"], global_step)
         writer.add_scalar("losses/value_loss", v_loss.item(), global_step)
         writer.add_scalar("losses/dis_policy_loss", dis_pg_loss.item(), global_step)
@@ -383,12 +417,17 @@ if __name__ == "__main__":
         writer.add_scalar("losses/entropy_loss", entropy_loss.item(), global_step)
         # writer.add_scalar("losses/old_approx_kl", old_approx_kl.item(), global_step)
         # writer.add_scalar("losses/approx_kl", approx_kl.item(), global_step)
-        #writer.add_scalar("losses/clipfrac", np.mean(clipfracs), global_step)
-        print("SPS:", int(global_step / (time.time() - start_time)))
-        writer.add_scalar("charts/SPS", int(global_step / (time.time() - start_time)), global_step)
-        writer.add_scalar(
-            "charts/Reward", np.mean(rewards.to("cpu").detach().numpy().copy()), global_step
-        )
+        # writer.add_scalar("losses/clipfrac", np.mean(clipfracs), global_step)
+        # print("SPS:", int(global_step / (time.time() - start_time)))
+        print("episode over mean reward:", rewardsMean)
+        writer.add_scalar(
+            "charts/SPS", int(global_step / (time.time() - start_time)), global_step
+        )
+        writer.add_scalar("charts/Reward", rewardsMean, global_step)
+        if rewardsMean > bestReward:
+            bestReward = rewardsMean
+            saveDir = "../PPO-Model/bigArea-384-128-hybrid-" + str(rewardsMean) + ".pt"
+            torch.save(agent, saveDir)
 
     env.close()
     writer.close()


@@ -431,6 +431,45 @@
     "mymodel = torch.load(\"../PPO-Model/SmallArea-256-128-hybrid.pt\")\n",
     "mymodel.eval()"
    ]
-  }
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "x : torch.Size([2, 3, 4])\n",
+      "x : torch.Size([6, 2, 3, 4])\n",
+      "x : torch.Size([6, 2, 3, 4])\n"
+     ]
+    }
+   ],
+   "source": [
+    "import torch\n",
+    "# 1\n",
+    "x = torch.randn(2, 1, 1)  # size-1 dims can be expanded, here to 3 and 4\n",
+    "x = x.expand(2, 3, 4)\n",
+    "print('x :', x.size())\n",
+    "\n",
+    "# 2\n",
+    "# a new dimension can only be added at the front, otherwise expand() raises an error\n",
+    "# x = x.expand(2, 3, 4, 6)\n",
+    "\n",
+    "x = x.expand(6, 2, 3, 4)\n",
+    "print('x :', x.size())\n",
+    "\n",
+    "# 3\n",
+    "# a size of -1 keeps that dimension unchanged\n",
+    "x = x.expand(6, -1, -1, -1)\n",
+    "print('x :', x.size())\n",
+    "\n",
+    "x : torch.Size([2, 3, 4])\n",
+    "x : torch.Size([6, 2, 3, 4])\n",
+    "x : torch.Size([6, 2, 3, 4])"
+   ]
+  }
  ],
 "metadata": {