Fix a small bug in the prediction function and normalize naming

Fix get_actions_value so that in non-training mode it takes the best action instead of sampling from the distribution
Normalize naming
This commit is contained in:
Koha9 2023-08-04 03:49:49 +09:00
parent 15c1edb6c9
commit 52ccce88bc
3 changed files with 69 additions and 77 deletions
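
For context, the behavioural part of this commit is how get_actions_value chooses actions when the agent is not training: previously both branches sampled from the policy distributions, whereas the evaluation branch should take the argmax of each discrete branch's logits and the mean of the continuous Gaussian. Below is a minimal, self-contained sketch of that selection rule; the function name select_actions and the toy shapes are illustrative only, not part of the repository.

import torch
from torch.distributions import Categorical, Normal

def select_actions(split_logits, actions_mean, actions_std, train_agent: bool):
    # one Categorical per discrete action branch, one Normal for the continuous part
    multi_categoricals = [Categorical(logits=logits) for logits in split_logits]
    con_probs = Normal(actions_mean, actions_std)
    if train_agent:
        # training: sample so the agent keeps exploring
        dis_act = torch.stack([ctgr.sample() for ctgr in multi_categoricals])
        con_act = con_probs.sample()
    else:
        # evaluation: deterministic best action (what this commit restores)
        dis_act = torch.stack([torch.argmax(logits, dim=1) for logits in split_logits])
        con_act = actions_mean
    # result shape: (batch, n_discrete_branches + continuous_size)
    return torch.cat([dis_act.T.float(), con_act], dim=1)

# toy usage: two discrete branches (3 and 2 choices), 2 continuous dims, batch of 4
logits = [torch.randn(4, 3), torch.randn(4, 2)]
mean, std = torch.zeros(4, 2), torch.ones(4, 2)
print(select_actions(logits, mean, std, train_agent=False).shape)  # torch.Size([4, 4])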


@@ -2,6 +2,9 @@
   <dictionary name="UCUNI">
     <words>
       <w>aimbot</w>
+      <w>logprobs</w>
+      <w>logstd</w>
+      <w>unclipped</w>
     </words>
   </dictionary>
 </component>


@@ -20,9 +20,6 @@ SIDE_CHANNEL_UUID = uuid.UUID("8bbfb62a-99b4-457c-879d-b78b69066b5e")
 GAME_NAME = "Aimbot_Hybrid_V3"
 GAME_TYPE = "Mix_Verification"
-# !!!SPECIAL PARAMETERS!!!
-using_targets_num = 3
 if __name__ == "__main__":
     args = parse_args()
     random.seed(args.seed)
@@ -61,7 +58,6 @@ if __name__ == "__main__":
     run_name = f"{GAME_TYPE}_{args.seed}_{int(time.time())}"
     wdb_recorder = WandbRecorder(GAME_NAME, GAME_TYPE, run_name, args)
 
     @atexit.register
     def save_model():
         # close env
@@ -72,9 +68,8 @@ if __name__ == "__main__":
         torch.save(agent, save_dir)
         print("save model to " + save_dir)
 
     # start the game
-    total_update_step = using_targets_num * args.total_timesteps // args.datasetSize
+    total_update_step = args.target_num * args.total_timesteps // args.datasetSize
     target_steps = [0 for i in range(args.target_num)]
     start_time = time.time()
     state, _, done = env.reset()


@@ -17,10 +17,10 @@ def layer_init(layer, std=np.sqrt(2), bias_const=0.0):
 
 class PPOAgent(nn.Module):
     def __init__(
         self,
         env: Aimbot,
-        this_args:argparse.Namespace,
+        this_args: argparse.Namespace,
         device: torch.device,
     ):
         super(PPOAgent, self).__init__()
         self.device = device
@@ -38,7 +38,7 @@ class PPOAgent(nn.Module):
         self.ray_state_size = env.unity_observation_shape[0] - self.args.total_target_size
         self.state_size_without_ray = self.args.total_target_size
         self.head_input_size = (
             env.unity_observation_shape[0] - self.target_size - self.time_state_size - self.gun_state_size
         )  # except target state input
         self.unity_discrete_type = env.unity_discrete_type
@@ -65,9 +65,6 @@ class PPOAgent(nn.Module):
         self.actor_mean = nn.ModuleList(
             [layer_init(nn.Linear(200, self.continuous_size), std=0.5) for i in range(self.target_num)]
         )
-        # self.actor_logstd =
-        # nn.ModuleList([layer_init(nn.Linear(200, self.continuous_size), std=1) for i in range(targetNum)])
-        # self.actor_logstd = nn.Parameter(torch.zeros(1, self.continuous_size))
         self.actor_logstd = nn.ParameterList(
             [nn.Parameter(torch.zeros(1, self.continuous_size)) for i in range(self.target_num)]
         )  # nn.Parameter(torch.zeros(1, self.continuous_size))
@@ -78,7 +75,7 @@ class PPOAgent(nn.Module):
     def get_value(self, state: torch.Tensor):
         target = state[:, 0].to(torch.int32)  # int
         this_state_num = target.size()[0]
-        view_input = state[:, -self.ray_state_size :]  # all ray input
+        view_input = state[:, -self.ray_state_size:]  # all ray input
         target_input = state[:, : self.state_size_without_ray]
         view_layer = self.view_network(view_input)
         target_layer = torch.stack(
@@ -96,7 +93,7 @@ class PPOAgent(nn.Module):
     def get_actions_value(self, state: torch.Tensor, actions=None):
         target = state[:, 0].to(torch.int32)  # int
         this_state_num = target.size()[0]
-        view_input = state[:, -self.ray_state_size :]  # all ray input
+        view_input = state[:, -self.ray_state_size:]  # all ray input
         target_input = state[:, : self.state_size_without_ray]
         view_layer = self.view_network(view_input)
         target_layer = torch.stack(
@@ -118,8 +115,6 @@ class PPOAgent(nn.Module):
         actions_mean = torch.stack(
             [self.actor_mean[target[i]](middle_layer[i]) for i in range(this_state_num)]
         )  # self.actor_mean(hidden)
-        # action_logstd = torch.stack([self.actor_logstd[target[i]](middleLayer[i]) for i in range(thisStateNum)]) # self.actor_logstd(hidden)
-        # action_logstd = self.actor_logstd.expand_as(actions_mean) # self.actor_logstd.expand_as(actions_mean)
         action_logstd = torch.stack(
             [torch.squeeze(self.actor_logstd[target[i]], 0) for i in range(this_state_num)]
         )
@@ -134,32 +129,31 @@ class PPOAgent(nn.Module):
         if actions is None:
             if self.train_agent:
                 # select actions base on probability distribution model
-                disAct = torch.stack([ctgr.sample() for ctgr in multi_categoricals])
-                conAct = con_probs.sample()
-                actions = torch.cat([disAct.T, conAct], dim=1)
+                dis_act = torch.stack([ctgr.sample() for ctgr in multi_categoricals])
+                con_act = con_probs.sample()
+                actions = torch.cat([dis_act.T, con_act], dim=1)
             else:
                 # select actions base on best probability distribution
-                # disAct = torch.stack([torch.argmax(logit, dim=1) for logit in split_logits])
-                conAct = actions_mean
-                disAct = torch.stack([ctgr.sample() for ctgr in multi_categoricals])
-                conAct = con_probs.sample()
-                actions = torch.cat([disAct.T, conAct], dim=1)
+                dis_act = torch.stack([torch.argmax(logit, dim=1) for logit in split_logits])
+                con_act = actions_mean
+                actions = torch.cat([dis_act.T, con_act], dim=1)
         else:
-            disAct = actions[:, 0 : self.unity_discrete_type].T
-            conAct = actions[:, self.unity_discrete_type :]
+            dis_act = actions[:, 0: self.unity_discrete_type].T
+            con_act = actions[:, self.unity_discrete_type:]
         dis_log_prob = torch.stack(
-            [ctgr.log_prob(act) for act, ctgr in zip(disAct, multi_categoricals)]
+            [ctgr.log_prob(act) for act, ctgr in zip(dis_act, multi_categoricals)]
         )
         dis_entropy = torch.stack([ctgr.entropy() for ctgr in multi_categoricals])
         return (
             actions,
             dis_log_prob.sum(0),
             dis_entropy.sum(0),
-            con_probs.log_prob(conAct).sum(1),
+            con_probs.log_prob(con_act).sum(1),
             con_probs.entropy().sum(1),
             criticV,
         )
 
-    def train_net(self, this_train_ind:int,ppo_memories,optimizer) -> tuple:
+    def train_net(self, this_train_ind: int, ppo_memories, optimizer) -> tuple:
         start_time = time.time()
         # flatten the batch
         b_obs = ppo_memories.obs[this_train_ind].reshape((-1,) + self.unity_observation_shape)
@@ -171,24 +165,24 @@ class PPOAgent(nn.Module):
         b_values = ppo_memories.values[this_train_ind].reshape(-1)
         b_size = b_obs.size()[0]
 
         # optimizing the policy and value network
-        b_inds = np.arange(b_size)
+        b_index = np.arange(b_size)
         for epoch in range(self.args.epochs):
-            print("epoch:",epoch,end="")
+            print("epoch:", epoch, end="")
             # shuffle all datasets
-            np.random.shuffle(b_inds)
+            np.random.shuffle(b_index)
             for start in range(0, b_size, self.args.minibatchSize):
-                print(".",end="")
+                print(".", end="")
                 end = start + self.args.minibatchSize
-                mb_inds = b_inds[start:end]
-                if(np.size(mb_inds)<=1):
+                mb_index = b_index[start:end]
+                if np.size(mb_index) <= 1:
                     break
-                mb_advantages = b_advantages[mb_inds]
+                mb_advantages = b_advantages[mb_index]
                 # normalize advantages
                 if self.args.norm_adv:
                     mb_advantages = (mb_advantages - mb_advantages.mean()) / (
                         mb_advantages.std() + 1e-8
                     )
 
                 (
@@ -197,14 +191,14 @@ class PPOAgent(nn.Module):
                     dis_entropy,
                     new_con_logprob,
                     con_entropy,
-                    newvalue,
-                ) = self.get_actions_value(b_obs[mb_inds], b_actions[mb_inds])
+                    new_value,
+                ) = self.get_actions_value(b_obs[mb_index], b_actions[mb_index])
                 # discrete ratio
-                dis_logratio = new_dis_logprob - b_dis_logprobs[mb_inds]
-                dis_ratio = dis_logratio.exp()
+                dis_log_ratio = new_dis_logprob - b_dis_logprobs[mb_index]
+                dis_ratio = dis_log_ratio.exp()
                 # continuous ratio
-                con_logratio = new_con_logprob - b_con_logprobs[mb_inds]
-                con_ratio = con_logratio.exp()
+                con_log_ratio = new_con_logprob - b_con_logprobs[mb_index]
+                con_ratio = con_log_ratio.exp()
 
                 """
                 # early stop
@@ -229,38 +223,38 @@ class PPOAgent(nn.Module):
                 con_pg_loss = torch.max(con_pg_loss_orig, con_pg_loss_clip).mean()
 
                 # Value loss
-                newvalue = newvalue.view(-1)
+                new_value = new_value.view(-1)
                 if self.args.clip_vloss:
-                    v_loss_unclipped = (newvalue - b_returns[mb_inds]) ** 2
-                    v_clipped = b_values[mb_inds] + torch.clamp(
-                        newvalue - b_values[mb_inds],
+                    v_loss_unclipped = (new_value - b_returns[mb_index]) ** 2
+                    v_clipped = b_values[mb_index] + torch.clamp(
+                        new_value - b_values[mb_index],
                         -self.args.clip_coef,
                         self.args.clip_coef,
                     )
-                    v_loss_clipped = (v_clipped - b_returns[mb_inds]) ** 2
+                    v_loss_clipped = (v_clipped - b_returns[mb_index]) ** 2
                     v_loss_max = torch.max(v_loss_unclipped, v_loss_clipped)
                     v_loss = 0.5 * v_loss_max.mean()
                 else:
-                    v_loss = 0.5 * ((newvalue - b_returns[mb_inds]) ** 2).mean()
+                    v_loss = 0.5 * ((new_value - b_returns[mb_index]) ** 2).mean()
 
                 # total loss
                 entropy_loss = dis_entropy.mean() + con_entropy.mean()
                 loss = (
                     dis_pg_loss * self.args.policy_coef[this_train_ind]
                     + con_pg_loss * self.args.policy_coef[this_train_ind]
                     + entropy_loss * self.args.entropy_coef[this_train_ind]
                     + v_loss * self.args.critic_coef[this_train_ind]
-                )*self.args.loss_coef[this_train_ind]
+                ) * self.args.loss_coef[this_train_ind]
 
-                if(torch.isnan(loss).any()):
+                if torch.isnan(loss).any():
                     print("LOSS Include NAN!!!")
-                    if(torch.isnan(dis_pg_loss.any())):
+                    if torch.isnan(dis_pg_loss.any()):
                         print("dis_pg_loss include nan")
-                    if(torch.isnan(con_pg_loss.any())):
+                    if torch.isnan(con_pg_loss.any()):
                         print("con_pg_loss include nan")
-                    if(torch.isnan(entropy_loss.any())):
+                    if torch.isnan(entropy_loss.any()):
                         print("entropy_loss include nan")
-                    if(torch.isnan(v_loss.any())):
+                    if torch.isnan(v_loss.any()):
                         print("v_loss include nan")
                     raise
@@ -275,15 +269,15 @@ class PPOAgent(nn.Module):
                 if approx_kl > args.target_kl:
                     break
                 """
-        return (v_loss,dis_pg_loss,con_pg_loss,loss,entropy_loss)
+        return v_loss, dis_pg_loss, con_pg_loss, loss, entropy_loss
 
     def gae(
         self,
         rewards: torch.Tensor,
         dones: torch.Tensor,
         values: torch.tensor,
         next_obs: torch.tensor,
         next_done: torch.Tensor,
     ) -> tuple:
         # GAE
         with torch.no_grad():
@@ -294,25 +288,25 @@ class PPOAgent(nn.Module):
                 last_gae_lam = 0
                 for t in reversed(range(data_size)):
                     if t == data_size - 1:
-                        nextnonterminal = 1.0 - next_done
+                        next_non_terminal = 1.0 - next_done
                         next_values = next_value
                     else:
-                        nextnonterminal = 1.0 - dones[t + 1]
+                        next_non_terminal = 1.0 - dones[t + 1]
                         next_values = values[t + 1]
-                    delta = rewards[t] + self.args.gamma * next_values * nextnonterminal - values[t]
+                    delta = rewards[t] + self.args.gamma * next_values * next_non_terminal - values[t]
                     advantages[t] = last_gae_lam = (
-                        delta + self.args.gamma * self.args.gaeLambda * nextnonterminal * last_gae_lam
+                        delta + self.args.gamma * self.args.gaeLambda * next_non_terminal * last_gae_lam
                     )
                 returns = advantages + values
             else:
                 returns = torch.zeros_like(rewards).to(self.device)
                 for t in reversed(range(data_size)):
                     if t == data_size - 1:
-                        nextnonterminal = 1.0 - next_done
+                        next_non_terminal = 1.0 - next_done
                         next_return = next_value
                     else:
-                        nextnonterminal = 1.0 - dones[t + 1]
+                        next_non_terminal = 1.0 - dones[t + 1]
                         next_return = returns[t + 1]
-                    returns[t] = rewards[t] + self.args.gamma * nextnonterminal * next_return
+                    returns[t] = rewards[t] + self.args.gamma * next_non_terminal * next_return
                 advantages = returns - values
         return advantages, returns