Fix a minor bug in the prediction function; normalize naming.
Fix get_actions_value so that in non-training mode it takes the best (greedy) action instead of still sampling; normalize naming.
parent 15c1edb6c9
commit 52ccce88bc
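For context, the behavioral fix in get_actions_value is that evaluation (non-training) mode now takes the greedy action: argmax over each discrete branch and the Gaussian mean for the continuous part, where it previously kept sampling from the distributions. Below is a minimal, self-contained sketch of that selection logic; the helper name select_actions and the toy shapes are illustrative only, not part of this repository.

import torch
from torch.distributions import Categorical, Normal

def select_actions(split_logits, actions_mean, actions_logstd, train_mode):
    """Pick hybrid (discrete + continuous) actions.

    train_mode=True  -> sample from the distributions (exploration, as in training).
    train_mode=False -> greedy: argmax over logits, Gaussian mean (the fixed behavior).
    """
    multi_categoricals = [Categorical(logits=logits) for logits in split_logits]
    con_probs = Normal(actions_mean, actions_logstd.exp())
    if train_mode:
        dis_act = torch.stack([ctgr.sample() for ctgr in multi_categoricals])
        con_act = con_probs.sample()
    else:
        dis_act = torch.stack([torch.argmax(logits, dim=1) for logits in split_logits])
        con_act = actions_mean
    # cast the integer discrete actions so torch.cat works on every PyTorch version
    return torch.cat([dis_act.T.to(con_act.dtype), con_act], dim=1)

if __name__ == "__main__":
    # toy shapes: batch of 4, two discrete branches (3 and 2 choices), 2 continuous dims
    logits = [torch.randn(4, 3), torch.randn(4, 2)]
    mean, logstd = torch.randn(4, 2), torch.zeros(4, 2)
    print(select_actions(logits, mean, logstd, train_mode=False).shape)  # torch.Size([4, 4])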
@@ -2,6 +2,9 @@
   <dictionary name="UCUNI">
     <words>
       <w>aimbot</w>
+      <w>logprobs</w>
+      <w>logstd</w>
+      <w>unclipped</w>
     </words>
   </dictionary>
 </component>
@@ -20,9 +20,6 @@ SIDE_CHANNEL_UUID = uuid.UUID("8bbfb62a-99b4-457c-879d-b78b69066b5e")
 GAME_NAME = "Aimbot_Hybrid_V3"
 GAME_TYPE = "Mix_Verification"

-# !!!SPECIAL PARAMETERS!!!
-using_targets_num = 3
-
 if __name__ == "__main__":
     args = parse_args()
     random.seed(args.seed)
@@ -61,7 +58,6 @@ if __name__ == "__main__":
     run_name = f"{GAME_TYPE}_{args.seed}_{int(time.time())}"
     wdb_recorder = WandbRecorder(GAME_NAME, GAME_TYPE, run_name, args)

-
     @atexit.register
     def save_model():
         # close env
@@ -72,9 +68,8 @@ if __name__ == "__main__":
         torch.save(agent, save_dir)
         print("save model to " + save_dir)

-
     # start the game
-    total_update_step = using_targets_num * args.total_timesteps // args.datasetSize
+    total_update_step = args.target_num * args.total_timesteps // args.datasetSize
     target_steps = [0 for i in range(args.target_num)]
     start_time = time.time()
     state, _, done = env.reset()
@@ -17,10 +17,10 @@ def layer_init(layer, std=np.sqrt(2), bias_const=0.0):

 class PPOAgent(nn.Module):
     def __init__(
         self,
         env: Aimbot,
-        this_args:argparse.Namespace,
+        this_args: argparse.Namespace,
         device: torch.device,
     ):
         super(PPOAgent, self).__init__()
         self.device = device
@@ -38,7 +38,7 @@ class PPOAgent(nn.Module):
         self.ray_state_size = env.unity_observation_shape[0] - self.args.total_target_size
         self.state_size_without_ray = self.args.total_target_size
         self.head_input_size = (
             env.unity_observation_shape[0] - self.target_size - self.time_state_size - self.gun_state_size
         ) # except target state input

         self.unity_discrete_type = env.unity_discrete_type
@@ -65,9 +65,6 @@ class PPOAgent(nn.Module):
         self.actor_mean = nn.ModuleList(
             [layer_init(nn.Linear(200, self.continuous_size), std=0.5) for i in range(self.target_num)]
         )
-        # self.actor_logstd =
-        # nn.ModuleList([layer_init(nn.Linear(200, self.continuous_size), std=1) for i in range(targetNum)])
-        # self.actor_logstd = nn.Parameter(torch.zeros(1, self.continuous_size))
         self.actor_logstd = nn.ParameterList(
             [nn.Parameter(torch.zeros(1, self.continuous_size)) for i in range(self.target_num)]
         ) # nn.Parameter(torch.zeros(1, self.continuous_size))
@@ -78,7 +75,7 @@ class PPOAgent(nn.Module):
     def get_value(self, state: torch.Tensor):
         target = state[:, 0].to(torch.int32) # int
         this_state_num = target.size()[0]
-        view_input = state[:, -self.ray_state_size :] # all ray input
+        view_input = state[:, -self.ray_state_size:] # all ray input
         target_input = state[:, : self.state_size_without_ray]
         view_layer = self.view_network(view_input)
         target_layer = torch.stack(
@@ -96,7 +93,7 @@ class PPOAgent(nn.Module):
     def get_actions_value(self, state: torch.Tensor, actions=None):
         target = state[:, 0].to(torch.int32) # int
         this_state_num = target.size()[0]
-        view_input = state[:, -self.ray_state_size :] # all ray input
+        view_input = state[:, -self.ray_state_size:] # all ray input
         target_input = state[:, : self.state_size_without_ray]
         view_layer = self.view_network(view_input)
         target_layer = torch.stack(
@@ -118,8 +115,6 @@ class PPOAgent(nn.Module):
         actions_mean = torch.stack(
             [self.actor_mean[target[i]](middle_layer[i]) for i in range(this_state_num)]
         ) # self.actor_mean(hidden)
-        # action_logstd = torch.stack([self.actor_logstd[target[i]](middleLayer[i]) for i in range(thisStateNum)]) # self.actor_logstd(hidden)
-        # action_logstd = self.actor_logstd.expand_as(actions_mean) # self.actor_logstd.expand_as(actions_mean)
         action_logstd = torch.stack(
             [torch.squeeze(self.actor_logstd[target[i]], 0) for i in range(this_state_num)]
         )
@@ -134,32 +129,31 @@ class PPOAgent(nn.Module):
         if actions is None:
             if self.train_agent:
                 # select actions base on probability distribution model
-                disAct = torch.stack([ctgr.sample() for ctgr in multi_categoricals])
-                conAct = con_probs.sample()
-                actions = torch.cat([disAct.T, conAct], dim=1)
+                dis_act = torch.stack([ctgr.sample() for ctgr in multi_categoricals])
+                con_act = con_probs.sample()
+                actions = torch.cat([dis_act.T, con_act], dim=1)
             else:
                 # select actions base on best probability distribution
-                # disAct = torch.stack([torch.argmax(logit, dim=1) for logit in split_logits])
-                conAct = actions_mean
-                disAct = torch.stack([ctgr.sample() for ctgr in multi_categoricals])
-                conAct = con_probs.sample()
-                actions = torch.cat([disAct.T, conAct], dim=1)
+                dis_act = torch.stack([torch.argmax(logit, dim=1) for logit in split_logits])
+                con_act = actions_mean
+                actions = torch.cat([dis_act.T, con_act], dim=1)
         else:
-            disAct = actions[:, 0 : self.unity_discrete_type].T
-            conAct = actions[:, self.unity_discrete_type :]
+            dis_act = actions[:, 0: self.unity_discrete_type].T
+            con_act = actions[:, self.unity_discrete_type:]
         dis_log_prob = torch.stack(
-            [ctgr.log_prob(act) for act, ctgr in zip(disAct, multi_categoricals)]
+            [ctgr.log_prob(act) for act, ctgr in zip(dis_act, multi_categoricals)]
         )
         dis_entropy = torch.stack([ctgr.entropy() for ctgr in multi_categoricals])
         return (
             actions,
             dis_log_prob.sum(0),
             dis_entropy.sum(0),
-            con_probs.log_prob(conAct).sum(1),
+            con_probs.log_prob(con_act).sum(1),
             con_probs.entropy().sum(1),
             criticV,
         )
-    def train_net(self, this_train_ind:int,ppo_memories,optimizer) -> tuple:
+
+    def train_net(self, this_train_ind: int, ppo_memories, optimizer) -> tuple:
         start_time = time.time()
         # flatten the batch
         b_obs = ppo_memories.obs[this_train_ind].reshape((-1,) + self.unity_observation_shape)
@@ -171,24 +165,24 @@ class PPOAgent(nn.Module):
         b_values = ppo_memories.values[this_train_ind].reshape(-1)
         b_size = b_obs.size()[0]
         # optimizing the policy and value network
-        b_inds = np.arange(b_size)
+        b_index = np.arange(b_size)

         for epoch in range(self.args.epochs):
-            print("epoch:",epoch,end="")
+            print("epoch:", epoch, end="")
             # shuffle all datasets
-            np.random.shuffle(b_inds)
+            np.random.shuffle(b_index)
             for start in range(0, b_size, self.args.minibatchSize):
-                print(".",end="")
+                print(".", end="")
                 end = start + self.args.minibatchSize
-                mb_inds = b_inds[start:end]
-                if(np.size(mb_inds)<=1):
+                mb_index = b_index[start:end]
+                if np.size(mb_index) <= 1:
                     break
-                mb_advantages = b_advantages[mb_inds]
+                mb_advantages = b_advantages[mb_index]

                 # normalize advantages
                 if self.args.norm_adv:
                     mb_advantages = (mb_advantages - mb_advantages.mean()) / (
                         mb_advantages.std() + 1e-8
                     )

                 (
@@ -197,14 +191,14 @@ class PPOAgent(nn.Module):
                     dis_entropy,
                     new_con_logprob,
                     con_entropy,
-                    newvalue,
-                ) = self.get_actions_value(b_obs[mb_inds], b_actions[mb_inds])
+                    new_value,
+                ) = self.get_actions_value(b_obs[mb_index], b_actions[mb_index])
                 # discrete ratio
-                dis_logratio = new_dis_logprob - b_dis_logprobs[mb_inds]
-                dis_ratio = dis_logratio.exp()
+                dis_log_ratio = new_dis_logprob - b_dis_logprobs[mb_index]
+                dis_ratio = dis_log_ratio.exp()
                 # continuous ratio
-                con_logratio = new_con_logprob - b_con_logprobs[mb_inds]
-                con_ratio = con_logratio.exp()
+                con_log_ratio = new_con_logprob - b_con_logprobs[mb_index]
+                con_ratio = con_log_ratio.exp()

                 """
                 # early stop
@@ -229,38 +223,38 @@ class PPOAgent(nn.Module):
                 con_pg_loss = torch.max(con_pg_loss_orig, con_pg_loss_clip).mean()

                 # Value loss
-                newvalue = newvalue.view(-1)
+                new_value = new_value.view(-1)
                 if self.args.clip_vloss:
-                    v_loss_unclipped = (newvalue - b_returns[mb_inds]) ** 2
-                    v_clipped = b_values[mb_inds] + torch.clamp(
-                        newvalue - b_values[mb_inds],
+                    v_loss_unclipped = (new_value - b_returns[mb_index]) ** 2
+                    v_clipped = b_values[mb_index] + torch.clamp(
+                        new_value - b_values[mb_index],
                         -self.args.clip_coef,
                         self.args.clip_coef,
                     )
-                    v_loss_clipped = (v_clipped - b_returns[mb_inds]) ** 2
+                    v_loss_clipped = (v_clipped - b_returns[mb_index]) ** 2
                     v_loss_max = torch.max(v_loss_unclipped, v_loss_clipped)
                     v_loss = 0.5 * v_loss_max.mean()
                 else:
-                    v_loss = 0.5 * ((newvalue - b_returns[mb_inds]) ** 2).mean()
+                    v_loss = 0.5 * ((new_value - b_returns[mb_index]) ** 2).mean()

                 # total loss
                 entropy_loss = dis_entropy.mean() + con_entropy.mean()
                 loss = (
                     dis_pg_loss * self.args.policy_coef[this_train_ind]
                     + con_pg_loss * self.args.policy_coef[this_train_ind]
                     + entropy_loss * self.args.entropy_coef[this_train_ind]
                     + v_loss * self.args.critic_coef[this_train_ind]
-                )*self.args.loss_coef[this_train_ind]
+                ) * self.args.loss_coef[this_train_ind]

-                if(torch.isnan(loss).any()):
+                if torch.isnan(loss).any():
                     print("LOSS Include NAN!!!")
-                    if(torch.isnan(dis_pg_loss.any())):
+                    if torch.isnan(dis_pg_loss.any()):
                         print("dis_pg_loss include nan")
-                    if(torch.isnan(con_pg_loss.any())):
+                    if torch.isnan(con_pg_loss.any()):
                         print("con_pg_loss include nan")
-                    if(torch.isnan(entropy_loss.any())):
+                    if torch.isnan(entropy_loss.any()):
                         print("entropy_loss include nan")
-                    if(torch.isnan(v_loss.any())):
+                    if torch.isnan(v_loss.any()):
                         print("v_loss include nan")
                     raise

@@ -275,15 +269,15 @@ class PPOAgent(nn.Module):
                 if approx_kl > args.target_kl:
                     break
                 """
-        return (v_loss,dis_pg_loss,con_pg_loss,loss,entropy_loss)
+        return v_loss, dis_pg_loss, con_pg_loss, loss, entropy_loss

     def gae(
         self,
         rewards: torch.Tensor,
         dones: torch.Tensor,
         values: torch.tensor,
         next_obs: torch.tensor,
         next_done: torch.Tensor,
     ) -> tuple:
         # GAE
         with torch.no_grad():
@@ -294,25 +288,25 @@ class PPOAgent(nn.Module):
                 last_gae_lam = 0
                 for t in reversed(range(data_size)):
                     if t == data_size - 1:
-                        nextnonterminal = 1.0 - next_done
+                        next_non_terminal = 1.0 - next_done
                         next_values = next_value
                     else:
-                        nextnonterminal = 1.0 - dones[t + 1]
+                        next_non_terminal = 1.0 - dones[t + 1]
                         next_values = values[t + 1]
-                    delta = rewards[t] + self.args.gamma * next_values * nextnonterminal - values[t]
+                    delta = rewards[t] + self.args.gamma * next_values * next_non_terminal - values[t]
                     advantages[t] = last_gae_lam = (
-                        delta + self.args.gamma * self.args.gaeLambda * nextnonterminal * last_gae_lam
+                        delta + self.args.gamma * self.args.gaeLambda * next_non_terminal * last_gae_lam
                     )
                 returns = advantages + values
             else:
                 returns = torch.zeros_like(rewards).to(self.device)
                 for t in reversed(range(data_size)):
                     if t == data_size - 1:
-                        nextnonterminal = 1.0 - next_done
+                        next_non_terminal = 1.0 - next_done
                         next_return = next_value
                     else:
-                        nextnonterminal = 1.0 - dones[t + 1]
+                        next_non_terminal = 1.0 - dones[t + 1]
                         next_return = returns[t + 1]
-                    returns[t] = rewards[t] + self.args.gamma * nextnonterminal * next_return
+                    returns[t] = rewards[t] + self.args.gamma * next_non_terminal * next_return
                 advantages = returns - values
         return advantages, returns