1 Commits

Author SHA1 Message Date
Koha9 1e974ada2a Side Channel Added
add side Channel to save target win ratio.
2022-11-30 07:01:05 +09:00
26 changed files with 635 additions and 3628 deletions
-2
View File
@@ -76,8 +76,6 @@ crashlytics-build.properties
/Aimbot-PPO-Python/.vscode/
/Aimbot-PPO-Python/.mypy_cache/
/Aimbot-PPO-Python/__pycache__/
/Aimbot-PPO-Python/wandb/
/Aimbot-PPO-Python/runs/
/Aimbot-PPO-Python/Tensorflow/__pycache__/
/Aimbot-PPO-Python/Pytorch/__pycache__/
/Aimbot-PPO-Python/Pytorch/runs/
-5
View File
@@ -1,5 +0,0 @@
{
"python.linting.enabled": false,
"python.analysis.typeCheckingMode": "off",
"commentTranslate.source": "intellsmi.deepl-translate-deepl"
}
-3
View File
@@ -1,3 +0,0 @@
# Default ignored files
/shelf/
/workspace.xml
-8
View File
@@ -1,8 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<module type="PYTHON_MODULE" version="4">
<component name="NewModuleRootManager">
<content url="file://$MODULE_DIR$" />
<orderEntry type="jdk" jdkName="mlagents39" jdkType="Python SDK" />
<orderEntry type="sourceFolder" forTests="false" />
</component>
</module>
-10
View File
@@ -1,10 +0,0 @@
<component name="ProjectDictionaryState">
<dictionary name="UCUNI">
<words>
<w>aimbot</w>
<w>logprobs</w>
<w>logstd</w>
<w>unclipped</w>
</words>
</dictionary>
</component>
@@ -1,6 +0,0 @@
<component name="InspectionProjectProfileManager">
<settings>
<option name="USE_PROJECT_PROFILE" value="false" />
<version value="1.0" />
</settings>
</component>
-4
View File
@@ -1,4 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectRootManager" version="2" project-jdk-name="mlagents39" project-jdk-type="Python SDK" />
</project>
-8
View File
@@ -1,8 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectModuleManager">
<modules>
<module fileurl="file://$PROJECT_DIR$/.idea/Pytorch.iml" filepath="$PROJECT_DIR$/.idea/Pytorch.iml" />
</modules>
</component>
</project>
-6
View File
@@ -1,6 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="VcsDirectoryMappings">
<mapping directory="$PROJECT_DIR$/../.." vcs="Git" />
</component>
</project>
+40 -118
View File
@@ -1,33 +1,26 @@
import gym
import numpy as np
import uuid
import airecorder
from numpy import ndarray
from mlagents_envs.base_env import ActionTuple
from mlagents_envs.environment import UnityEnvironment
from typing import Tuple, List
from mlagents_envs.side_channel.side_channel import (
SideChannel,
IncomingMessage,
OutgoingMessage,
)
class Aimbot(gym.Env):
def __init__(
self,
env_path: str,
worker_id: int = 1,
base_port: int = 100,
envPath: str,
workerID: int = 1,
basePort: int = 100,
side_channels: list = []
):
super(Aimbot, self).__init__()
self.env = UnityEnvironment(
file_name=env_path,
file_name=envPath,
seed=1,
side_channels=side_channels,
worker_id=worker_id,
base_port=base_port,
worker_id=workerID,
base_port=basePort,
)
self.env.reset()
# all behavior_specs
@@ -41,7 +34,7 @@ class Aimbot(gym.Env):
# environment action specs
self.unity_action_spec = self.unity_specs.action_spec
# environment sample observation
decision_steps, _ = self.env.get_steps(self.unity_beha_name)
decisionSteps, _ = self.env.get_steps(self.unity_beha_name)
# OBSERVATION SPECS
# environment state shape. like tuple:(93,)
@@ -64,34 +57,31 @@ class Aimbot(gym.Env):
# AGENT SPECS
# all agents ID
self.unity_agent_IDS = decision_steps.agent_id
self.unity_agent_IDS = decisionSteps.agent_id
# agents number
self.unity_agent_num = len(self.unity_agent_IDS)
# all zero action
self.all_zero_action = np.zeros((self.unity_agent_num, self.unity_action_size))
def reset(self) -> Tuple[np.ndarray, List, List]:
"""reset environment and get observations
def reset(self):
"""reset enviroment and get observations
Returns:
ndarray: next_state, reward, done, loadDir, saveNow
ndarray: nextState, reward, done, loadDir, saveNow
"""
# reset env
self.env.reset()
next_state, reward, done = self.get_steps()
return next_state, reward, done
nextState, reward, done = self.getSteps()
return nextState, reward, done
# TODO:
# delete all stack state DONE
# get-step State disassembly function DONE
# getstep State disassembly function DONE
# delete agent selection function DONE
# self.step action wrapper function DONE
def step(
self,
actions: ndarray,
) -> Tuple[np.ndarray, List, List]:
"""change actions list to ActionTuple then send it to environment
):
"""change ations list to ActionTuple then send it to enviroment
Args:
actions (ndarray): PPO chooseAction output action list.(agentNum,actionNum)
@@ -99,36 +89,36 @@ class Aimbot(gym.Env):
Returns:
ndarray: nextState, reward, done
"""
# take action to environment
# take action to enviroment
# return mextState,reward,done
# discrete action
if self.unity_dis_act_exist:
# create discrete action from actions list
discrete_actions = actions[:, 0: self.unity_discrete_type]
discreteActions = actions[:, 0 : self.unity_discrete_type]
else:
# create empty discrete action
discrete_actions = np.asarray([[0]])
discreteActions = np.asarray([[0]])
# continuous action
if self.unity_con_act_exist:
# create continuous actions from actions list
continuous_actions = actions[:, self.unity_discrete_type:]
continuousActions = actions[:, self.unity_discrete_type :]
else:
# create empty continuous action
continuous_actions = np.asanyarray([[0.0]])
continuousActions = np.asanyarray([[0.0]])
# Dummy continuous action
# continuousActions = np.asanyarray([[0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0]])
# create actionTuple
this_action_tuple = ActionTuple(continuous=continuous_actions, discrete=discrete_actions)
thisActionTuple = ActionTuple(continuous=continuousActions, discrete=discreteActions)
# take action to env
self.env.set_actions(behavior_name=self.unity_beha_name, action=this_action_tuple)
self.env.set_actions(behavior_name=self.unity_beha_name, action=thisActionTuple)
self.env.step()
# get nextState & reward & done after this action
next_states, rewards, dones = self.get_steps()
return next_states, rewards, dones
nextStates, rewards, dones = self.getSteps()
return nextStates, rewards, dones
def get_steps(self) -> Tuple[np.ndarray, List, List]:
"""get environment now observations.
def getSteps(self):
"""get enviroment now observations.
Include State, Reward, Done
Args:
@@ -137,96 +127,28 @@ class Aimbot(gym.Env):
ndarray: nextState, reward, done
"""
# get nextState & reward & done
decision_steps, terminal_steps = self.env.get_steps(self.unity_beha_name)
next_states = []
decisionSteps, terminalSteps = self.env.get_steps(self.unity_beha_name)
nextStates = []
dones = []
rewards = []
for this_agent_ID in self.unity_agent_IDS:
for thisAgentID in self.unity_agent_IDS:
# while Episode over agentID will both in decisionSteps and terminalSteps.
# avoid redundant state and reward,
# use agentExist toggle to check if agent is already exist.
agent_exist = False
agentExist = False
# game done
if this_agent_ID in terminal_steps:
next_states.append(terminal_steps[this_agent_ID].obs[0])
if thisAgentID in terminalSteps:
nextStates.append(terminalSteps[thisAgentID].obs[0])
dones.append(True)
rewards.append(terminal_steps[this_agent_ID].reward)
agent_exist = True
rewards.append(terminalSteps[thisAgentID].reward)
agentExist = True
# game not over yet and agent not in terminalSteps
if (this_agent_ID in decision_steps) and (not agent_exist):
next_states.append(decision_steps[this_agent_ID].obs[0])
if (thisAgentID in decisionSteps) and (not agentExist):
nextStates.append(decisionSteps[thisAgentID].obs[0])
dones.append(False)
rewards.append(decision_steps[this_agent_ID].reward)
rewards.append(decisionSteps[thisAgentID].reward)
return np.asarray(next_states), rewards, dones
return np.asarray(nextStates), rewards, dones
def close(self):
self.env.close()
class AimbotSideChannel(SideChannel):
    """Side channel that exchanges string/bool/int/float messages with Unity.

    Incoming messages are "|"-separated strings whose first field is the
    message type, e.g. "Warning|Result|<target>|<outcome>" or "Error|...".
    Round results are tallied into ``airecorder.total_rounds`` and
    ``airecorder.win_rounds``.
    """

    def __init__(self, channel_id: uuid.UUID) -> None:
        super().__init__(channel_id)

    def on_message_received(self, msg: IncomingMessage) -> None:
        """Handle one message sent by Unity.

        This method must be implemented to satisfy the SideChannel interface.

        Args:
            msg: incoming message; its payload is read as a single string.
        """
        this_message = msg.read_string()
        this_result = this_message.split("|")
        print(this_result)
        message_type = this_result[0]
        if message_type == "Warning":
            # Only "Result" warnings carry round statistics. Check the field
            # count first so a truncated message cannot raise IndexError in
            # the middle of an environment step.
            if len(this_result) > 1 and this_result[1] == "Result":
                if len(this_result) < 4:
                    print("Malformed result message:", this_message)
                    return
                target_name = this_result[2]
                airecorder.total_rounds[target_name] += 1
                if this_result[3] == "Win":
                    airecorder.win_rounds[target_name] += 1
        elif message_type == "Error":
            print(this_message)

    # Send functions
    def send_string(self, data: str) -> None:
        """Queue a string to be sent to the C# side."""
        msg = OutgoingMessage()
        msg.write_string(data)
        super().queue_message_to_send(msg)

    def send_bool(self, data: bool) -> None:
        """Queue a boolean to be sent to the C# side."""
        msg = OutgoingMessage()
        msg.write_bool(data)
        super().queue_message_to_send(msg)

    def send_int(self, data: int) -> None:
        """Queue a 32-bit integer to be sent to the C# side."""
        msg = OutgoingMessage()
        msg.write_int32(data)
        super().queue_message_to_send(msg)

    def send_float(self, data: float) -> None:
        """Queue a 32-bit float to be sent to the C# side."""
        msg = OutgoingMessage()
        msg.write_float32(data)
        super().queue_message_to_send(msg)

    def send_float_list(self, data: List[float]) -> None:
        """Queue a list of 32-bit floats to be sent to the C# side."""
        msg = OutgoingMessage()
        msg.write_float32_list(data)
        super().queue_message_to_send(msg)
-769
View File
@@ -1,769 +0,0 @@
import argparse
import wandb
import time
import numpy as np
import random
import uuid
import torch
import torch.nn as nn
import torch.optim as optim
import atexit
from torchviz import make_dot, make_dot_from_trace
from AimbotEnv import Aimbot
from tqdm import tqdm
from enum import Enum
from torch.distributions.normal import Normal
from torch.distributions.categorical import Categorical
from distutils.util import strtobool
from torch.utils.tensorboard import SummaryWriter
from mlagents_envs.environment import UnityEnvironment
from mlagents_envs.side_channel.side_channel import (
SideChannel,
IncomingMessage,
OutgoingMessage,
)
from typing import List
# NOTE(review): bestReward is not read in the visible part of the file;
# presumably tracks the best mean reward seen so far - confirm.
bestReward = -1
# Defaults for the command-line options defined in parse_args().
DEFAULT_SEED = 9331  # --seed
ENV_PATH = "../Build/Build-ParallelEnv-Target-OffPolicy-SingleStack-SideChannel-EndReward-Easy-V2.7-FreeOnly-NormalMapSize/Aimbot-ParallelEnv"  # --path
# Channel id passed to AimbotSideChannel when the environment is created.
SIDE_CHANNEL_UUID = uuid.UUID("8bbfb62a-99b4-457c-879d-b78b69066b5e")
WAND_ENTITY = "koha9"  # --wandb-entity
WORKER_ID = 2  # --workerID
BASE_PORT = 1111  # --baseport
# Max round steps per agent is 2500/DECISION_PERIOD, i.e. 25 seconds.
# !!! Check every parameter before running !!!
TOTAL_STEPS = 3150000  # --total-timesteps
BATCH_SIZE = 1024  # --minibatchSize
MAX_TRAINNING_DATASETS = 6000  # --datasetSize: samples per target before training starts
DECISION_PERIOD = 1  # --decision-period
LEARNING_RATE = 5e-4  # --lr
GAMMA = 0.99  # --gamma
GAE_LAMBDA = 0.95  # --gaeLambda
EPOCHS = 3  # --epochs
CLIP_COEF = 0.11  # --clip-coef
# Per-target coefficients, ordered: Free, Go, Attack, Defence.
LOSS_COEF = [1.0, 1.0, 1.0, 1.0] # free go attack defence
POLICY_COEF = [1.0, 1.0, 1.0, 1.0]  # --policy-coef
ENTROPY_COEF = [0.05, 0.05, 0.05, 0.05]  # --ent-coef
CRITIC_COEF = [0.5, 0.5, 0.5, 0.5]  # --critic-coef
# Used by the main loop to compute finalRatio = TARGET_LEARNING_RATE / lr.
TARGET_LEARNING_RATE = 1e-6
FREEZE_VIEW_NETWORK = False  # --freeze-viewnet
ANNEAL_LEARNING_RATE = True  # --annealLR
CLIP_VLOSS = True  # --clip-vloss
NORM_ADV = True  # --norm-adv
TRAIN = True  # --train
SAVE_MODEL = False  # --save-model
WANDB_TACK = False  # --wandb-track
# --load-dir: None builds a fresh PPOAgent; a path is loaded with torch.load.
LOAD_DIR = None
#LOAD_DIR = "../PPO-Model/Aimbot_Target_Hybrid_PMNN_V2_OffPolicy_EndBC_9331_1670986948-freeonly-20/Aimbot_Target_Hybrid_PMNN_V2_OffPolicy_EndBC_9331_1670986948_0.7949778.pt"
# public data
class Targets(Enum):
    """Target (task) types; the value is the index used to select per-target data.

    The main loop converts the first observation element to an int and looks
    it up here (``Targets(roundTargetType).name``) when reporting progress.
    """

    Free = 0
    Go = 1
    Attack = 2
    Defence = 3
    # Not a real target: the number of target types above.
    Num = 4
# Element counts of the non-ray parts of one observation vector.
# PPOAgent treats the first TOTAL_T_SIZE elements as the target-state input
# and the remaining trailing elements as ray input.
TARGET_STATE_SIZE = 6
INAREA_STATE_SIZE = 1
TIME_STATE_SIZE = 1
GUN_STATE_SIZE = 1
MY_STATE_SIZE = 4
TOTAL_T_SIZE = TARGET_STATE_SIZE+INAREA_STATE_SIZE+TIME_STATE_SIZE+GUN_STATE_SIZE+MY_STATE_SIZE
# Offsets removed from an episode's final reward by broadCastEndReward().
BASE_WINREWARD = 999
BASE_LOSEREWARD = -999
# Number of per-target sub-networks and training datasets.
TARGETNUM= 4
ENV_TIMELIMIT = 30
# Default for --result-broadcast-ratio.
RESULT_BROADCAST_RATIO = 1/ENV_TIMELIMIT
# Round counters keyed by target name, updated by AimbotSideChannel.
# NOTE(review): there is no "Defence" key although Targets defines Defence -
# confirm Unity never reports results for that target.
TotalRounds = {"Free":0,"Go":0,"Attack":0}
WinRounds = {"Free":0,"Go":0,"Attack":0}
# !!!SPECIAL PARAMETERS!!!
# Number of targets currently in use; scales the total number of update steps.
# Change it once the program is finished.
using_targets_num = 3
def parse_args():
    """Build the command-line parser for a training run and parse ``sys.argv``.

    Every option defaults to the matching module-level constant, so running
    the script without arguments reproduces the configuration set at the top
    of the file.

    Returns:
        argparse.Namespace holding the parsed options.
    """
    parser = argparse.ArgumentParser()

    # Kept as a lambda on purpose: argparse reports the converter's __name__
    # in its "invalid ... value" error message.
    to_bool = lambda x: bool(strtobool(x))

    def add_value(flag, kind, default, text):
        # Plain option that takes exactly one value of type `kind`.
        parser.add_argument(flag, type=kind, default=default, help=text)

    def add_toggle(flag, default, text):
        # Boolean option: a bare flag means True, or pass an explicit true/false.
        parser.add_argument(flag, type=to_bool, default=default, nargs="?", const=True, help=text)

    # pytorch and environment parameters
    add_value("--seed", int, DEFAULT_SEED, "seed of the experiment")
    add_value("--path", str, ENV_PATH, "enviroment path")
    add_value("--workerID", int, WORKER_ID, "unity worker ID")
    add_value("--baseport", int, BASE_PORT, "port to connect to Unity environment")
    add_value("--lr", float, LEARNING_RATE, "the learning rate of optimizer")
    add_toggle("--cuda", True, "if toggled, cuda will be enabled by default")
    add_value("--total-timesteps", int, TOTAL_STEPS, "total timesteps of the experiments")

    # model parameters
    add_toggle("--train", TRAIN, "Train Model or not")
    add_toggle("--freeze-viewnet", FREEZE_VIEW_NETWORK, "freeze view network or not")
    add_value("--datasetSize", int, MAX_TRAINNING_DATASETS,
              "training dataset size,start training while dataset collect enough data")
    add_value("--minibatchSize", int, BATCH_SIZE, "nimi batch size")
    add_value("--epochs", int, EPOCHS, "the K epochs to update the policy")
    add_toggle("--annealLR", ANNEAL_LEARNING_RATE,
               "Toggle learning rate annealing for policy and value networks")
    add_toggle("--wandb-track", WANDB_TACK, "track on the wandb")
    add_toggle("--save-model", SAVE_MODEL, "save model or not")
    add_value("--wandb-entity", str, WAND_ENTITY, "the entity (team) of wandb's project")
    add_value("--load-dir", str, LOAD_DIR, "load model directory")
    add_value("--decision-period", int, DECISION_PERIOD,
              "the number of steps to run in each environment per policy rollout")
    add_value("--result-broadcast-ratio", float, RESULT_BROADCAST_RATIO,
              "broadcast result when win round is reached,r=result-broadcast-ratio*remainTime")

    # GAE loss
    add_toggle("--gae", True, "Use GAE for advantage computation")
    add_toggle("--norm-adv", NORM_ADV, "Toggles advantages normalization")
    add_value("--gamma", float, GAMMA, "the discount factor gamma")
    add_value("--gaeLambda", float, GAE_LAMBDA, "the lambda for the general advantage estimation")
    add_value("--clip-coef", float, CLIP_COEF, "the surrogate clipping coefficient")
    # NOTE(review): the next three defaults are per-target lists, but a value
    # given on the command line is parsed as a single float - confirm the
    # consumers handle both shapes.
    add_value("--policy-coef", float, POLICY_COEF, "coefficient of the policy")
    add_value("--ent-coef", float, ENTROPY_COEF, "coefficient of the entropy")
    add_value("--critic-coef", float, CRITIC_COEF, "coefficient of the value function")
    add_toggle("--clip-vloss", CLIP_VLOSS,
               "Toggles whether or not to use a clipped loss for the value function, as per the paper.")
    add_value("--max-grad-norm", float, 0.5, "the maximum norm for the gradient clipping")
    add_value("--target-kl", float, None, "the target KL divergence threshold")

    return parser.parse_args()
def layer_init(layer, std=np.sqrt(2), bias_const=0.0):
    """Initialise a layer in place and hand it back.

    Args:
        layer: module exposing ``weight`` and ``bias`` tensors.
        std: gain passed to the orthogonal weight initialiser.
        bias_const: constant every bias element is set to.

    Returns:
        The same ``layer`` object, so the call can wrap a constructor inline.
    """
    init = torch.nn.init
    init.orthogonal_(layer.weight, gain=std)
    init.constant_(layer.bias, val=bias_const)
    return layer
class PPOAgent(nn.Module):
    """Actor-critic network with one shared view network and per-target heads.

    Each observation row carries the target index in its first element. That
    index selects which target network, middle network, actor heads, log-std
    parameter and critic head process the row. The ray part of the
    observation (its trailing ``raySize`` elements) goes through a single
    view network shared by all targets.
    """

    def __init__(self, env: Aimbot,targetNum:int):
        """Create the networks.

        Args:
            env: environment wrapper; only its ``unity_*`` spec attributes
                are read here.
            targetNum: number of target types, i.e. of per-target sub-networks.
        """
        super(PPOAgent, self).__init__()
        self.targetNum = targetNum
        self.stateSize = env.unity_observation_shape[0]
        self.agentNum = env.unity_agent_num
        self.targetSize = TARGET_STATE_SIZE
        self.timeSize = TIME_STATE_SIZE
        self.gunSize = GUN_STATE_SIZE
        self.myStateSize = MY_STATE_SIZE
        self.raySize = env.unity_observation_shape[0] - TOTAL_T_SIZE
        self.nonRaySize = TOTAL_T_SIZE
        # everything except the target, time and gun state inputs
        self.head_input_size = env.unity_observation_shape[0] - self.targetSize-self.timeSize-self.gunSize

        self.discrete_size = env.unity_discrete_size
        self.discrete_shape = list(env.unity_discrete_branches)
        self.continuous_size = env.unity_continuous_size

        # Shared encoder for the ray input.
        self.viewNetwork = nn.Sequential(
            layer_init(nn.Linear(self.raySize, 200)),
            nn.Tanh()
        )
        # One encoder per target for the non-ray input.
        self.targetNetworks = nn.ModuleList([nn.Sequential(
            layer_init(nn.Linear(self.nonRaySize, 100)),
            nn.Tanh()
        )for i in range(targetNum)])
        # One network per target merging view (200) and target (100) features.
        self.middleNetworks = nn.ModuleList([nn.Sequential(
            layer_init(nn.Linear(300,200)),
            nn.Tanh()
        )for i in range(targetNum)])
        self.actor_dis = nn.ModuleList([layer_init(nn.Linear(200, self.discrete_size), std=0.5) for i in range(targetNum)])
        self.actor_mean = nn.ModuleList([layer_init(nn.Linear(200, self.continuous_size), std=0.5) for i in range(targetNum)])
        # State-independent log standard deviation, one parameter per target.
        self.actor_logstd = nn.ParameterList([nn.Parameter(torch.zeros(1,self.continuous_size))for i in range(targetNum)])
        self.critic = nn.ModuleList([layer_init(nn.Linear(200, 1), std=1)for i in range(targetNum)])

    def _middle_features(self, state: torch.Tensor):
        """Run the trunk shared by the value and action paths.

        Args:
            state: batch of observations, one row per sample.

        Returns:
            Tuple ``(target, middleLayer)``: the per-row target indices and
            the per-row output of the selected middle network.
        """
        target = state[:,0].to(torch.int32)  # target index of every row
        thisStateNum = target.size()[0]
        viewInput = state[:,-self.raySize:]  # all ray input
        targetInput = state[:,:self.nonRaySize]
        viewLayer = self.viewNetwork(viewInput)
        # Rows may belong to different targets, so each row is routed through
        # the sub-network selected by its own target index.
        targetLayer = torch.stack([self.targetNetworks[target[i]](targetInput[i]) for i in range(thisStateNum)])
        middleInput = torch.cat([viewLayer,targetLayer],dim = 1)
        middleLayer = torch.stack([self.middleNetworks[target[i]](middleInput[i]) for i in range(thisStateNum)])
        return target, middleLayer

    def get_value(self, state: torch.Tensor):
        """Return the critic value of every row of ``state``."""
        target, middleLayer = self._middle_features(state)
        thisStateNum = target.size()[0]
        criticV = torch.stack([self.critic[target[i]](middleLayer[i]) for i in range(thisStateNum)])
        return criticV

    def get_actions_value(self, state: torch.Tensor, actions=None):
        """Choose (or re-evaluate) actions and compute their statistics.

        Args:
            state: batch of observations, one row per sample.
            actions: optional batch of previously taken actions, laid out as
                one column per discrete branch followed by the continuous
                columns. When omitted, new actions are selected.

        Returns:
            Tuple ``(actions, discrete log-prob, discrete entropy,
            continuous log-prob, continuous entropy, critic value)``; the
            log-probs and entropies are summed over branches / action
            dimensions.

        NOTE: when ``actions`` is None this reads the module-level ``args``
        namespace (``args.train``) to decide between sampling and greedy
        selection.
        """
        target, middleLayer = self._middle_features(state)
        thisStateNum = target.size()[0]

        # discrete
        # Loop over the rows (one per agent) so each row uses the output
        # network that matches its own target.
        dis_logits = torch.stack([self.actor_dis[target[i]](middleLayer[i]) for i in range(thisStateNum)])
        split_logits = torch.split(dis_logits, self.discrete_shape, dim=1)
        multi_categoricals = [Categorical(logits=thisLogits) for thisLogits in split_logits]
        # continuous
        actions_mean = torch.stack([self.actor_mean[target[i]](middleLayer[i]) for i in range(thisStateNum)])
        action_logstd = torch.stack([torch.squeeze(self.actor_logstd[target[i]],0) for i in range(thisStateNum)])
        action_std = torch.exp(action_logstd)
        con_probs = Normal(actions_mean, action_std)
        # critic
        criticV = torch.stack([self.critic[target[i]](middleLayer[i]) for i in range(thisStateNum)])

        if actions is None:
            if args.train:
                # select actions based on the probability distributions
                disAct = torch.stack([ctgr.sample() for ctgr in multi_categoricals])
                conAct = con_probs.sample()
            else:
                # select the most probable discrete actions and the mean
                # continuous action
                disAct = torch.stack([torch.argmax(logit, dim=1) for logit in split_logits])
                conAct = actions_mean
            actions = torch.cat([disAct.T, conAct], dim=1)
        else:
            # Actions built above hold one column per discrete branch, so the
            # split point is derived from this module's own branch list
            # instead of the module-level ``env`` object.
            discrete_columns = len(self.discrete_shape)
            disAct = actions[:, 0:discrete_columns].T
            conAct = actions[:, discrete_columns:]
        dis_log_prob = torch.stack(
            [ctgr.log_prob(act) for act, ctgr in zip(disAct, multi_categoricals)]
        )
        dis_entropy = torch.stack([ctgr.entropy() for ctgr in multi_categoricals])
        return (
            actions,
            dis_log_prob.sum(0),
            dis_entropy.sum(0),
            con_probs.log_prob(conAct).sum(1),
            con_probs.entropy().sum(1),
            criticV,
        )
def GAE(agent, args, rewards, dones, values, next_obs, next_done):
    """Compute advantages and returns for one finished trajectory.

    Args:
        agent: object whose ``get_value(next_obs)`` gives the bootstrap value.
        args: namespace providing ``gae`` (bool), ``gamma`` and ``gaeLambda``.
        rewards: 1-D tensor of per-step rewards.
        dones: 1-D tensor of per-step done flags (1.0 = done).
        values: 1-D tensor of per-step value estimates.
        next_obs: observation following the last step, passed to the critic.
        next_done: done flag of the step following the last one.

    Returns:
        Tuple ``(advantages, returns)``, each shaped like ``rewards``.
    """
    with torch.no_grad():
        next_value = agent.get_value(next_obs).reshape(1, -1)
        data_size = rewards.size()[0]
        if args.gae:
            # zeros_like already allocates on rewards' device, so no
            # module-level ``device`` is needed here.
            advantages = torch.zeros_like(rewards)
            lastgaelam = 0
            for t in reversed(range(data_size)):
                if t == data_size - 1:
                    # the last step bootstraps from the critic's estimate
                    nextnonterminal = 1.0 - next_done
                    nextvalues = next_value
                else:
                    nextnonterminal = 1.0 - dones[t + 1]
                    nextvalues = values[t + 1]
                delta = rewards[t] + args.gamma * nextvalues * nextnonterminal - values[t]
                advantages[t] = lastgaelam = (
                    delta + args.gamma * args.gaeLambda * nextnonterminal * lastgaelam
                )
            returns = advantages + values
        else:
            # plain discounted returns; advantage is return minus value
            returns = torch.zeros_like(rewards)
            for t in reversed(range(data_size)):
                if t == data_size - 1:
                    nextnonterminal = 1.0 - next_done
                    next_return = next_value
                else:
                    nextnonterminal = 1.0 - dones[t + 1]
                    next_return = returns[t + 1]
                returns[t] = rewards[t] + args.gamma * nextnonterminal * next_return
            advantages = returns - values
    return advantages, returns
class AimbotSideChannel(SideChannel):
    """Side channel that exchanges string/bool/int/float messages with Unity.

    Incoming messages are "|"-separated strings. A
    "result|<target>|<outcome>" message updates the module-level
    ``TotalRounds`` / ``WinRounds`` counters; an "Error|..." message is
    printed as received.
    """

    def __init__(self, channel_id: uuid.UUID) -> None:
        super().__init__(channel_id)

    def on_message_received(self, msg: IncomingMessage) -> None:
        """Handle one message sent by Unity.

        This method must be implemented to satisfy the SideChannel interface.

        Args:
            msg: incoming message; its payload is read as a single string.
        """
        thisMessage = msg.read_string()
        thisResult = thisMessage.split("|")
        if thisResult[0] == "result":
            if len(thisResult) < 3:
                # A truncated message must not raise in the middle of an
                # environment step; report it and carry on.
                print("Malformed result message:", thisMessage)
                return
            targetName = thisResult[1]
            # Targets missing from the counters (e.g. "Defence") are added
            # on first sight instead of raising KeyError. Both dicts get the
            # key so they always stay aligned.
            TotalRounds[targetName] = TotalRounds.get(targetName, 0) + 1
            WinRounds.setdefault(targetName, 0)
            if thisResult[2] == "Win":
                WinRounds[targetName] += 1
        elif thisResult[0] == "Error":
            print(thisMessage)

    # Send functions
    def send_string(self, data: str) -> None:
        """Queue a string to be sent to the C# side."""
        msg = OutgoingMessage()
        msg.write_string(data)
        super().queue_message_to_send(msg)

    def send_bool(self, data: bool) -> None:
        """Queue a boolean to be sent to the C# side."""
        msg = OutgoingMessage()
        msg.write_bool(data)
        super().queue_message_to_send(msg)

    def send_int(self, data: int) -> None:
        """Queue a 32-bit integer to be sent to the C# side."""
        msg = OutgoingMessage()
        msg.write_int32(data)
        super().queue_message_to_send(msg)

    def send_float(self, data: float) -> None:
        """Queue a 32-bit float to be sent to the C# side."""
        msg = OutgoingMessage()
        msg.write_float32(data)
        super().queue_message_to_send(msg)

    def send_float_list(self, data: List[float]) -> None:
        """Queue a list of 32-bit floats to be sent to the C# side."""
        msg = OutgoingMessage()
        msg.write_float32_list(data)
        super().queue_message_to_send(msg)
def broadCastEndReward(rewardBF:list,remainTime:float):
    """Strip the win/lose marker from the final reward and spread a win bonus.

    The last entry of ``rewardBF`` encodes the round result: a value of at
    most -500 is treated as a loss and a value of at least 500 as a win.
    The matching base reward is subtracted from that entry. On a win,
    ``remainTime * args.result_broadcast_ratio`` is then added to every step
    of the round.

    Args:
        rewardBF: per-step rewards of one finished round; left unmodified.
        remainTime: value scaled by ``args.result_broadcast_ratio`` to form
            the win bonus.

    Returns:
        Tensor of the adjusted rewards on the module-level ``device``.
    """
    # Work on a copy so the caller's buffer is not changed behind its back.
    thisRewardBF = list(rewardBF)
    finalReward = rewardBF[-1]
    if finalReward <= -500:
        # Lost: remove the marker only, do not broadcast anything.
        thisRewardBF[-1] = finalReward - BASE_LOSEREWARD
    elif finalReward >= 500:
        # Won: remove the marker, then add the bonus to every step.
        thisRewardBF[-1] = finalReward - BASE_WINREWARD
        thisRewardBF = (np.asarray(thisRewardBF)+(remainTime*args.result_broadcast_ratio)).tolist()
    else:
        print("!!!!!DIDNT GET RESULT REWARD!!!!!!",finalReward)
    return torch.Tensor(thisRewardBF).to(device)
if __name__ == "__main__":
args = parse_args()
random.seed(args.seed)
np.random.seed(args.seed)
torch.manual_seed(args.seed)
device = torch.device("cuda" if torch.cuda.is_available() and args.cuda else "cpu")
# Initialize environment anget optimizer
aimBotsideChannel = AimbotSideChannel(SIDE_CHANNEL_UUID);
env = Aimbot(envPath=args.path, workerID=args.workerID, basePort=args.baseport,side_channels=[aimBotsideChannel])
if args.load_dir is None:
agent = PPOAgent(env,TARGETNUM).to(device)
else:
agent = torch.load(args.load_dir)
# freeze
if args.freeze_viewnet:
# freeze the view network
for p in agent.viewNetwork.parameters():
p.requires_grad = False
print("VIEW NETWORK FREEZED")
print("Load Agent", args.load_dir)
print(agent.eval())
optimizer = optim.Adam(agent.parameters(), lr=args.lr, eps=1e-5)
# Tensorboard and WandB Recorder
game_name = "Aimbot_Target_Hybrid_PMNN_V2"
game_type = "OffPolicy_EndBC"
run_name = f"{game_name}_{game_type}_{args.seed}_{int(time.time())}"
if args.wandb_track:
wandb.init(
project=game_name,
entity=args.wandb_entity,
sync_tensorboard=True,
config=vars(args),
name=run_name,
monitor_gym=True,
save_code=True,
)
writer = SummaryWriter(f"runs/{run_name}")
writer.add_text(
"hyperparameters",
"|param|value|\n|-|-|\n%s"
% ("\n".join([f"|{key}|{value}|" for key, value in vars(args).items()])),
)
@atexit.register
def save_model():
# save model while exit
saveDir = "../PPO-Model/"+ run_name + "_last.pt"
torch.save(agent, saveDir)
print("save model to " + saveDir)
# Trajectory Buffer
ob_bf = [[] for i in range(env.unity_agent_num)]
act_bf = [[] for i in range(env.unity_agent_num)]
dis_logprobs_bf = [[] for i in range(env.unity_agent_num)]
con_logprobs_bf = [[] for i in range(env.unity_agent_num)]
rewards_bf = [[] for i in range(env.unity_agent_num)]
dones_bf = [[] for i in range(env.unity_agent_num)]
values_bf = [[] for i in range(env.unity_agent_num)]
# start the game
total_update_step = using_targets_num * args.total_timesteps // args.datasetSize
target_steps = [0 for i in range(TARGETNUM)]
start_time = time.time()
state, _, done = env.reset()
# state = torch.Tensor(next_obs).to(device)
# next_done = torch.zeros(env.unity_agent_num).to(device)
# initialize empty training datasets
obs = [torch.tensor([]).to(device) for i in range(TARGETNUM)] # (TARGETNUM,n,env.unity_observation_size)
actions = [torch.tensor([]).to(device) for i in range(TARGETNUM)] # (TARGETNUM,n,env.unity_action_size)
dis_logprobs = [torch.tensor([]).to(device) for i in range(TARGETNUM)] # (TARGETNUM,n,1)
con_logprobs = [torch.tensor([]).to(device) for i in range(TARGETNUM)] # (TARGETNUM,n,1)
rewards = [torch.tensor([]).to(device) for i in range(TARGETNUM)] # (TARGETNUM,n,1)
values = [torch.tensor([]).to(device) for i in range(TARGETNUM)] # (TARGETNUM,n,1)
advantages = [torch.tensor([]).to(device) for i in range(TARGETNUM)] # (TARGETNUM,n,1)
returns = [torch.tensor([]).to(device) for i in range(TARGETNUM)] # (TARGETNUM,n,1)
vis_graph = make_dot(agent.get_actions_value(
torch.Tensor(state).to(device)
), params=dict(agent.named_parameters()))
vis_graph.view() # 会在当前目录下保存一个“Digraph.gv.pdf”文件,并在默认浏览器中打开
with torch.onnx.set_training(agent, False):
trace, _ = torch.jit.get_trace_graph(agent, args=(torch.Tensor(state).to(device),))
make_dot_from_trace(trace)
raise
for total_steps in range(total_update_step):
# discunt learning rate, while step == total_update_step lr will be 0
if args.annealLR:
finalRatio = TARGET_LEARNING_RATE/args.lr
frac = 1.0 - ((total_steps + 1.0) / total_update_step)
lrnow = frac * args.lr
optimizer.param_groups[0]["lr"] = lrnow
else:
lrnow = args.lr
print("new episode",total_steps,"learning rate = ",lrnow)
# MAIN LOOP: run agent in environment
step = 0
training = False
trainQueue = []
last_reward = [0.for i in range(env.unity_agent_num)]
while True:
if step % args.decision_period == 0:
step += 1
# Choose action by agent
with torch.no_grad():
# predict actions
action, dis_logprob, _, con_logprob, _, value = agent.get_actions_value(
torch.Tensor(state).to(device)
)
value = value.flatten()
# variable from GPU to CPU
action_cpu = action.cpu().numpy()
dis_logprob_cpu = dis_logprob.cpu().numpy()
con_logprob_cpu = con_logprob.cpu().numpy()
value_cpu = value.cpu().numpy()
# Environment step
next_state, reward, next_done = env.step(action_cpu)
# save memories
for i in range(env.unity_agent_num):
# save memories to buffers
ob_bf[i].append(state[i])
act_bf[i].append(action_cpu[i])
dis_logprobs_bf[i].append(dis_logprob_cpu[i])
con_logprobs_bf[i].append(con_logprob_cpu[i])
rewards_bf[i].append(reward[i]+last_reward[i])
dones_bf[i].append(done[i])
values_bf[i].append(value_cpu[i])
remainTime = state[i,TARGET_STATE_SIZE]
if next_done[i] == True:
# finished a round, send finished memories to training datasets
# compute advantage and discounted reward
#print(i,"over")
roundTargetType = int(state[i,0])
thisRewardsTensor = broadCastEndReward(rewards_bf[i],remainTime)
adv, rt = GAE(
agent,
args,
thisRewardsTensor,
torch.Tensor(dones_bf[i]).to(device),
torch.tensor(values_bf[i]).to(device),
torch.tensor(next_state[i]).to(device).unsqueeze(0),
torch.Tensor([next_done[i]]).to(device),
)
# send memories to training datasets
obs[roundTargetType] = torch.cat((obs[roundTargetType], torch.tensor(ob_bf[i]).to(device)), 0)
actions[roundTargetType] = torch.cat((actions[roundTargetType], torch.tensor(act_bf[i]).to(device)), 0)
dis_logprobs[roundTargetType] = torch.cat(
(dis_logprobs[roundTargetType], torch.tensor(dis_logprobs_bf[i]).to(device)), 0
)
con_logprobs[roundTargetType] = torch.cat(
(con_logprobs[roundTargetType], torch.tensor(con_logprobs_bf[i]).to(device)), 0
)
rewards[roundTargetType] = torch.cat((rewards[roundTargetType], thisRewardsTensor), 0)
values[roundTargetType] = torch.cat((values[roundTargetType], torch.tensor(values_bf[i]).to(device)), 0)
advantages[roundTargetType] = torch.cat((advantages[roundTargetType], adv), 0)
returns[roundTargetType] = torch.cat((returns[roundTargetType], rt), 0)
# clear buffers
ob_bf[i] = []
act_bf[i] = []
dis_logprobs_bf[i] = []
con_logprobs_bf[i] = []
rewards_bf[i] = []
dones_bf[i] = []
values_bf[i] = []
print(f"train dataset {Targets(roundTargetType).name} added:{obs[roundTargetType].size()[0]}/{args.datasetSize}")
for i in range(TARGETNUM):
if obs[i].size()[0] >= args.datasetSize:
# start train NN
trainQueue.append(i)
if(len(trainQueue)>0):
break
state, done = next_state, next_done
else:
step += 1
# skip this step use last predict action
next_state, reward, next_done = env.step(action_cpu)
# save memories
for i in range(env.unity_agent_num):
if next_done[i] == True:
#print(i,"over???")
# save memories to buffers
ob_bf[i].append(state[i])
act_bf[i].append(action_cpu[i])
dis_logprobs_bf[i].append(dis_logprob_cpu[i])
con_logprobs_bf[i].append(con_logprob_cpu[i])
rewards_bf[i].append(reward[i])
dones_bf[i].append(done[i])
values_bf[i].append(value_cpu[i])
remainTime = state[i,TARGET_STATE_SIZE]
# finished a round, send finished memories to training datasets
# compute advantage and discounted reward
roundTargetType = int(state[i,0])
thisRewardsTensor = broadCastEndReward(rewards_bf[i],remainTime)
adv, rt = GAE(
agent,
args,
thisRewardsTensor,
torch.Tensor(dones_bf[i]).to(device),
torch.tensor(values_bf[i]).to(device),
torch.Tensor(next_state[i]).to(device).unsqueeze(dim = 0),
torch.Tensor([next_done[i]]).to(device),
)
# send memories to training datasets
obs[roundTargetType] = torch.cat((obs[roundTargetType], torch.tensor(ob_bf[i]).to(device)), 0)
actions[roundTargetType] = torch.cat((actions[roundTargetType], torch.tensor(act_bf[i]).to(device)), 0)
dis_logprobs[roundTargetType] = torch.cat(
(dis_logprobs[roundTargetType], torch.tensor(dis_logprobs_bf[i]).to(device)), 0
)
con_logprobs[roundTargetType] = torch.cat(
(con_logprobs[roundTargetType], torch.tensor(con_logprobs_bf[i]).to(device)), 0
)
rewards[roundTargetType] = torch.cat((rewards[roundTargetType], thisRewardsTensor), 0)
values[roundTargetType] = torch.cat((values[roundTargetType], torch.tensor(values_bf[i]).to(device)), 0)
advantages[roundTargetType] = torch.cat((advantages[roundTargetType], adv), 0)
returns[roundTargetType] = torch.cat((returns[roundTargetType], rt), 0)
# clear buffers
ob_bf[i] = []
act_bf[i] = []
dis_logprobs_bf[i] = []
con_logprobs_bf[i] = []
rewards_bf[i] = []
dones_bf[i] = []
values_bf[i] = []
print(f"train dataset {Targets(roundTargetType).name} added:{obs[roundTargetType].size()[0]}/{args.datasetSize}")
state = next_state
last_reward = reward
i += 1
if args.train:
meanRewardList = [] # for WANDB
# loop all tarining queue
for thisT in trainQueue:
target_steps[thisT]+=1
# flatten the batch
b_obs = obs[thisT].reshape((-1,) + env.unity_observation_shape)
b_dis_logprobs = dis_logprobs[thisT].reshape(-1)
b_con_logprobs = con_logprobs[thisT].reshape(-1)
b_actions = actions[thisT].reshape((-1,) + (env.unity_action_size,))
b_advantages = advantages[thisT].reshape(-1)
b_returns = returns[thisT].reshape(-1)
b_values = values[thisT].reshape(-1)
b_size = b_obs.size()[0]
# Optimizing the policy and value network
b_inds = np.arange(b_size)
# clipfracs = []
for epoch in range(args.epochs):
print(epoch,end="")
# shuffle all datasets
np.random.shuffle(b_inds)
for start in range(0, b_size, args.minibatchSize):
print(".",end="")
end = start + args.minibatchSize
mb_inds = b_inds[start:end]
if(np.size(mb_inds)<=1):
break
mb_advantages = b_advantages[mb_inds]
# normalize advantages
if args.norm_adv:
mb_advantages = (mb_advantages - mb_advantages.mean()) / (
mb_advantages.std() + 1e-8
)
(
_,
new_dis_logprob,
dis_entropy,
new_con_logprob,
con_entropy,
newvalue,
) = agent.get_actions_value(b_obs[mb_inds], b_actions[mb_inds])
# discrete ratio
dis_logratio = new_dis_logprob - b_dis_logprobs[mb_inds]
dis_ratio = dis_logratio.exp()
# continuous ratio
con_logratio = new_con_logprob - b_con_logprobs[mb_inds]
con_ratio = con_logratio.exp()
"""
# early stop
with torch.no_grad():
# calculate approx_kl http://joschu.net/blog/kl-approx.html
old_approx_kl = (-logratio).mean()
approx_kl = ((ratio - 1) - logratio).mean()
clipfracs += [((ratio - 1.0).abs() > args.clip_coef).float().mean().item()]
"""
# discrete Policy loss
dis_pg_loss_orig = -mb_advantages * dis_ratio
dis_pg_loss_clip = -mb_advantages * torch.clamp(
dis_ratio, 1 - args.clip_coef, 1 + args.clip_coef
)
dis_pg_loss = torch.max(dis_pg_loss_orig, dis_pg_loss_clip).mean()
# continuous Policy loss
con_pg_loss_orig = -mb_advantages * con_ratio
con_pg_loss_clip = -mb_advantages * torch.clamp(
con_ratio, 1 - args.clip_coef, 1 + args.clip_coef
)
con_pg_loss = torch.max(con_pg_loss_orig, con_pg_loss_clip).mean()
# Value loss
newvalue = newvalue.view(-1)
if args.clip_vloss:
v_loss_unclipped = (newvalue - b_returns[mb_inds]) ** 2
v_clipped = b_values[mb_inds] + torch.clamp(
newvalue - b_values[mb_inds],
-args.clip_coef,
args.clip_coef,
)
v_loss_clipped = (v_clipped - b_returns[mb_inds]) ** 2
v_loss_max = torch.max(v_loss_unclipped, v_loss_clipped)
v_loss = 0.5 * v_loss_max.mean()
else:
v_loss = 0.5 * ((newvalue - b_returns[mb_inds]) ** 2).mean()
# total loss
entropy_loss = dis_entropy.mean() + con_entropy.mean()
loss = (
dis_pg_loss * POLICY_COEF[thisT]
+ con_pg_loss * POLICY_COEF[thisT]
+ entropy_loss * ENTROPY_COEF[thisT]
+ v_loss * CRITIC_COEF[thisT]
)*LOSS_COEF[thisT]
if(torch.isnan(loss).any()):
print("LOSS Include NAN!!!")
if(torch.isnan(dis_pg_loss.any())):
print("dis_pg_loss include nan")
if(torch.isnan(con_pg_loss.any())):
print("con_pg_loss include nan")
if(torch.isnan(entropy_loss.any())):
print("entropy_loss include nan")
if(torch.isnan(v_loss.any())):
print("v_loss include nan")
raise
optimizer.zero_grad()
loss.backward()
# Clips gradient norm of an iterable of parameters.
nn.utils.clip_grad_norm_(agent.parameters(), args.max_grad_norm)
optimizer.step()
"""
if args.target_kl is not None:
if approx_kl > args.target_kl:
break
"""
# record mean reward before clear history
print("done")
targetRewardMean = np.mean(rewards[thisT].to("cpu").detach().numpy().copy())
meanRewardList.append(targetRewardMean)
targetName = Targets(thisT).name
# clear this target trainning set buffer
obs[thisT] = torch.tensor([]).to(device)
actions[thisT] = torch.tensor([]).to(device)
dis_logprobs[thisT] = torch.tensor([]).to(device)
con_logprobs[thisT] = torch.tensor([]).to(device)
rewards[thisT] = torch.tensor([]).to(device)
values[thisT] = torch.tensor([]).to(device)
advantages[thisT] = torch.tensor([]).to(device)
returns[thisT] = torch.tensor([]).to(device)
# record rewards for plotting purposes
writer.add_scalar(f"Target{targetName}/value_loss", v_loss.item(), target_steps[thisT])
writer.add_scalar(f"Target{targetName}/dis_policy_loss", dis_pg_loss.item(), target_steps[thisT])
writer.add_scalar(f"Target{targetName}/con_policy_loss", con_pg_loss.item(), target_steps[thisT])
writer.add_scalar(f"Target{targetName}/total_loss", loss.item(), target_steps[thisT])
writer.add_scalar(f"Target{targetName}/entropy_loss", entropy_loss.item(), target_steps[thisT])
writer.add_scalar(f"Target{targetName}/Reward", targetRewardMean, target_steps[thisT])
writer.add_scalar(f"Target{targetName}/WinRatio", WinRounds[targetName]/TotalRounds[targetName], target_steps[thisT])
print(f"episode over Target{targetName} mean reward:", targetRewardMean)
TotalRewardMean = np.mean(meanRewardList)
writer.add_scalar("GlobalCharts/TotalRewardMean", TotalRewardMean, total_steps)
writer.add_scalar("GlobalCharts/learning_rate", optimizer.param_groups[0]["lr"], total_steps)
# New Record!
if TotalRewardMean > bestReward and args.save_model:
bestReward = targetRewardMean
saveDir = "../PPO-Model/" + run_name +"_"+ str(TotalRewardMean) + ".pt"
torch.save(agent, saveDir)
saveDir = "../PPO-Model/"+ run_name + "_last.pt"
torch.save(agent, saveDir)
env.close()
writer.close()
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
-256
View File
@@ -1,256 +0,0 @@
import time
import numpy as np
import random
import uuid
import torch
import atexit
from aimbotEnv import Aimbot
from aimbotEnv import AimbotSideChannel
from ppoagent import PPOAgent
from airecorder import WandbRecorder
from aimemory import PPOMem
from aimemory import Targets
from arguments import parse_args
import torch.optim as optim
# Entry-point script for multi-target PPO training against the Unity Aimbot
# environment: build env/agent/optimizer/recorder, then repeatedly collect
# rollouts until a target's dataset is full and train (or just log) on it.
# side channel uuid
SIDE_CHANNEL_UUID = uuid.UUID("8bbfb62a-99b4-457c-879d-b78b69066b5e")
# tensorboard names
GAME_NAME = "Aimbot_Hybrid_V3"
GAME_TYPE = "Mix_Verification"
if __name__ == "__main__":
args = parse_args()
# seed every RNG used by the script so runs are repeatable
random.seed(args.seed)
np.random.seed(args.seed)
torch.manual_seed(args.seed)
device = torch.device("cuda" if torch.cuda.is_available() and args.cuda else "cpu")
# best TotalRewardMean seen so far; compared against before saving a "record" model
best_reward = -1
# Initialize environment agent optimizer
aimbot_side_channel = AimbotSideChannel(SIDE_CHANNEL_UUID)
env = Aimbot(
env_path=args.path,
worker_id=args.workerID,
base_port=args.baseport,
side_channels=[aimbot_side_channel])
if args.load_dir is None:
agent = PPOAgent(
env=env,
this_args=args,
device=device,
).to(device)
else:
# NOTE(review): torch.load unpickles the whole agent object -- only load trusted checkpoints
agent = torch.load(args.load_dir)
# freeze
if args.freeze_viewnet:
# freeze the view network
for p in agent.viewNetwork.parameters():
p.requires_grad = False
print("VIEW NETWORK FREEZE")
print("Load Agent", args.load_dir)
print(agent.eval())
# optimizer
optimizer = optim.Adam(agent.parameters(), lr=args.lr, eps=1e-5)
# Tensorboard and WandB Recorder
run_name = f"{GAME_TYPE}_{args.seed}_{int(time.time())}"
wdb_recorder = WandbRecorder(GAME_NAME, GAME_TYPE, run_name, args)
# registered with atexit, so this runs on interpreter exit
# NOTE(review): env.close() is also called at the end of the script, so the
# environment is closed twice on a normal exit -- confirm Aimbot.close() tolerates that
@atexit.register
def save_model():
# close env
env.close()
if args.save_model:
# save model while exit
save_dir = "../PPO-Model/" + run_name + "_last.pt"
torch.save(agent, save_dir)
print("save model to " + save_dir)
# start the game
# number of outer-loop iterations (one iteration ends when a dataset is full)
total_update_step = args.target_num * args.total_timesteps // args.datasetSize
# per-target counter, used as the x-axis step when logging that target's scalars
target_steps = [0 for i in range(args.target_num)]
start_time = time.time()
state, _, done = env.reset()
# initialize AI memories
ppo_memories = PPOMem(
args=args,
unity_agent_num=env.unity_agent_num,
device=device,
)
# MAIN LOOP: run agent in environment
for total_steps in range(total_update_step):
# discount learning rate, while step == total_update_step lr will be 0
if args.annealLR:
# NOTE(review): final_lr_ratio is computed but never used below, so
# args.target_lr has no effect on the schedule -- confirm intent
final_lr_ratio = args.target_lr / args.lr
frac = 1.0 - ((total_steps + 1.0) / total_update_step)
lr_now = frac * args.lr
optimizer.param_groups[0]["lr"] = lr_now
else:
lr_now = args.lr
# episode start show learning rate
print("new episode", total_steps, "learning rate = ", lr_now)
step = 0
training = False
# indices of targets whose dataset reached args.datasetSize during this iteration
train_queue = []
last_reward = [0. for i in range(env.unity_agent_num)]
# MAIN LOOP: run agent in environment
while True:
# Target Type(state[0][0]) is stay(4),use all zero action
# NOTE(review): only agent 0's target type is inspected here -- presumably all agents share it; confirm
if state[0][0] == 4:
next_state, reward, next_done = env.step(env.all_zero_action)
state, done = next_state, next_done
continue
# On decision point, and Target Type(state[0][0]) is not stay(4) choose action by agent
if step % args.decision_period == 0:
step += 1
# Choose action by agent
with torch.no_grad():
# predict actions
action, dis_logprob, _, con_logprob, _, value = agent.get_actions_value(
torch.tensor(state,dtype=torch.float32).to(device)
)
value = value.flatten()
# variable from GPU to CPU
action_cpu = action.cpu().numpy()
dis_logprob_cpu = dis_logprob.cpu().numpy()
con_logprob_cpu = con_logprob.cpu().numpy()
value_cpu = value.cpu().numpy()
# Environment step
next_state, reward, next_done = env.step(action_cpu)
# save memories
if args.train:
ppo_memories.save_memories(
now_step=step,
agent=agent,
state=state,
action_cpu=action_cpu,
dis_logprob_cpu=dis_logprob_cpu,
con_logprob_cpu=con_logprob_cpu,
reward=reward,
done=done,
value_cpu=value_cpu,
last_reward=last_reward,
next_done=next_done,
next_state=next_state,
)
# check if any training dataset is full and ready to train
for i in range(args.target_num):
if ppo_memories.obs[i].size()[0] >= args.datasetSize:
# start train NN
train_queue.append(i)
if len(train_queue) > 0:
# break while loop and start train
break
# update state
state, done = next_state, next_done
else:
step += 1
# skip this step use last predict action
next_state, reward, next_done = env.step(action_cpu)
# save memories
if args.train:
ppo_memories.save_memories(
now_step=step,
agent=agent,
state=state,
action_cpu=action_cpu,
dis_logprob_cpu=dis_logprob_cpu,
con_logprob_cpu=con_logprob_cpu,
reward=reward,
done=done,
value_cpu=value_cpu,
last_reward=last_reward,
next_done=next_done,
next_state=next_state,
)
# update state
state = next_state
# remembered so save_memories can add it to the reward of the next decision step
last_reward = reward
if args.train:
# train mode on
mean_reward_list = [] # for WANDB
# loop all training queue
for this_train_ind in train_queue:
# start time
start_time = time.time()
target_steps[this_train_ind] += 1
# train agent
(
v_loss,
dis_pg_loss,
con_pg_loss,
loss,
entropy_loss
) = agent.train_net(
this_train_ind=this_train_ind,
ppo_memories=ppo_memories,
optimizer=optimizer
)
# record mean reward before clear history
print("done")
target_reward_mean = np.mean(ppo_memories.rewards[this_train_ind].to("cpu").detach().numpy().copy())
mean_reward_list.append(target_reward_mean)
targetName = Targets(this_train_ind).name
# clear this target training set buffer
ppo_memories.clear_training_datasets(this_train_ind)
# record rewards for plotting purposes
wdb_recorder.add_target_scalar(
targetName,
this_train_ind,
v_loss,
dis_pg_loss,
con_pg_loss,
loss,
entropy_loss,
target_reward_mean,
target_steps,
)
print(f"episode over Target{targetName} mean reward:", target_reward_mean)
# mean over the per-target means of every target trained in this iteration
TotalRewardMean = np.mean(mean_reward_list)
wdb_recorder.add_global_scalar(
TotalRewardMean,
optimizer.param_groups[0]["lr"],
total_steps,
)
# print cost time as seconds
print("cost time:", time.time() - start_time)
# New Record!
# NOTE(review): the comparison uses TotalRewardMean but best_reward is then set to
# target_reward_mean (the last trained target's mean) -- looks like it should store
# TotalRewardMean; confirm
if TotalRewardMean > best_reward and args.save_model:
best_reward = target_reward_mean
saveDir = "../PPO-Model/" + run_name + "_" + str(TotalRewardMean) + ".pt"
torch.save(agent, saveDir)
else:
# train mode off
mean_reward_list = [] # for WANDB
# while not in training mode, clear the buffer
# NOTE(review): save_memories is only called when args.train is set, so in this mode
# train_queue presumably stays empty and the inner loop `while True` never breaks -- verify
for this_train_ind in train_queue:
target_steps[this_train_ind] += 1
targetName = Targets(this_train_ind).name
target_reward_mean = np.mean(ppo_memories.rewards[this_train_ind].to("cpu").detach().numpy().copy())
mean_reward_list.append(target_reward_mean)
print(target_steps[this_train_ind])
# clear this target training set buffer
ppo_memories.clear_training_datasets(this_train_ind)
# record rewards for plotting purposes
wdb_recorder.writer.add_scalar(f"Target{targetName}/Reward", target_reward_mean,
target_steps[this_train_ind])
wdb_recorder.add_win_ratio(targetName, target_steps[this_train_ind])
print(f"episode over Target{targetName} mean reward:", target_reward_mean)
TotalRewardMean = np.mean(mean_reward_list)
wdb_recorder.writer.add_scalar("GlobalCharts/TotalRewardMean", TotalRewardMean, total_steps)
# final checkpoint, written regardless of args.save_model
saveDir = "../PPO-Model/" + run_name + "_last.pt"
torch.save(agent, saveDir)
env.close()
wdb_recorder.writer.close()
-142
View File
@@ -1,142 +0,0 @@
import torch
import numpy as np
import argparse
from ppoagent import PPOAgent
from enum import Enum
# public data
# Target types of a round. PPOMem builds Targets(int(state[i, 0])) to name the
# dataset a finished round is appended to, so the values double as dataset indices.
class Targets(Enum):
Free = 0
Go = 1
Attack = 2
Defence = 3
# NOTE(review): presumably a count/sentinel rather than a playable target -- confirm
Num = 4
class PPOMem:
    """Rollout storage for the multi-target PPO trainer.

    Two layers of storage are kept:

    * ``*_bf`` trajectory buffers -- one Python list per Unity agent, filled
      step by step while that agent's round is still running.
    * training datasets (``obs``, ``actions``, ...) -- one tensor per target
      type, extended with a whole trajectory each time a round finishes.
    """

    # per-agent trajectory buffers, reset by clear_buffers()
    _BUFFER_NAMES = (
        "ob_bf",
        "act_bf",
        "dis_logprobs_bf",
        "con_logprobs_bf",
        "rewards_bf",
        "dones_bf",
        "values_bf",
    )
    # per-target training datasets, reset by clear_training_datasets()
    _DATASET_NAMES = (
        "obs",
        "actions",
        "dis_logprobs",
        "con_logprobs",
        "rewards",
        "values",
        "advantages",
        "returns",
    )

    def __init__(
        self,
        args: argparse.Namespace,
        unity_agent_num: int,
        device: torch.device,
    ) -> None:
        """Copy the needed settings from ``args`` and create empty storage.

        Args:
            args: parsed command-line options; reads ``target_num``,
                ``datasetSize``, ``result_broadcast_ratio``,
                ``decision_period``, ``base_lose_reward``,
                ``base_win_reward`` and ``target_state_size``.
            unity_agent_num: number of parallel agents in the environment.
            device: device on which the training datasets are kept.
        """
        self.target_num = args.target_num
        self.data_set_size = args.datasetSize
        self.result_broadcast_ratio = args.result_broadcast_ratio
        self.decision_period = args.decision_period
        self.unity_agent_num = unity_agent_num
        self.base_lose_reward = args.base_lose_reward
        self.base_win_reward = args.base_win_reward
        self.target_state_size = args.target_state_size
        self.device = device

        # Trajectory Buffer
        self.ob_bf = [[] for _ in range(self.unity_agent_num)]
        self.act_bf = [[] for _ in range(self.unity_agent_num)]
        self.dis_logprobs_bf = [[] for _ in range(self.unity_agent_num)]
        self.con_logprobs_bf = [[] for _ in range(self.unity_agent_num)]
        self.rewards_bf = [[] for _ in range(self.unity_agent_num)]
        self.dones_bf = [[] for _ in range(self.unity_agent_num)]
        self.values_bf = [[] for _ in range(self.unity_agent_num)]

        # initialize empty training datasets
        self.obs = [self._empty_dataset() for _ in range(self.target_num)]  # (TARGETNUM,n,env.unity_observation_size)
        self.actions = [self._empty_dataset() for _ in range(self.target_num)]  # (TARGETNUM,n,env.unity_action_size)
        self.dis_logprobs = [self._empty_dataset() for _ in range(self.target_num)]  # (TARGETNUM,n,1)
        self.con_logprobs = [self._empty_dataset() for _ in range(self.target_num)]  # (TARGETNUM,n,1)
        self.rewards = [self._empty_dataset() for _ in range(self.target_num)]  # (TARGETNUM,n,1)
        self.values = [self._empty_dataset() for _ in range(self.target_num)]  # (TARGETNUM,n,1)
        self.advantages = [self._empty_dataset() for _ in range(self.target_num)]  # (TARGETNUM,n,1)
        self.returns = [self._empty_dataset() for _ in range(self.target_num)]  # (TARGETNUM,n,1)

    def _empty_dataset(self) -> torch.Tensor:
        """Return a fresh empty tensor on the storage device."""
        return torch.tensor([]).to(self.device)

    def broad_cast_end_reward(self, rewardBF: list, remainTime: float) -> torch.Tensor:
        """Turn one round's raw rewards into the rewards used for training.

        The last entry of ``rewardBF`` carries the round result:

        * ``<= -500`` (lost): ``base_lose_reward`` is subtracted from the
          last reward; nothing is broadcast.
        * ``>= 500`` (won): ``base_win_reward`` is subtracted from the last
          reward, then ``remainTime * result_broadcast_ratio`` is added to
          every step of the round.
        * otherwise a warning is printed and the rewards are used as-is.

        Args:
            rewardBF: per-step rewards of the finished round (not modified).
            remainTime: value added, scaled by ``result_broadcast_ratio``,
                to every step of a won round.

        Returns:
            float32 tensor on ``self.device`` with one reward per step.
        """
        thisRewardBF = rewardBF.copy()
        end_reward = rewardBF[-1]
        if end_reward <= -500:
            # lost: strip the base penalty, do not broadcast
            thisRewardBF[-1] = end_reward - self.base_lose_reward
        elif end_reward >= 500:
            # won: strip the base bonus, then share the time bonus with every step
            thisRewardBF[-1] = end_reward - self.base_win_reward
            bonus = remainTime * self.result_broadcast_ratio
            thisRewardBF = (np.asarray(thisRewardBF) + bonus).tolist()
        else:
            print("!!!!!DIDNT GET RESULT REWARD!!!!!!", rewardBF[-1])
        return torch.tensor(thisRewardBF, dtype=torch.float32).to(self.device)

    def save_memories(
        self,
        now_step: int,
        agent: "PPOAgent",
        state: np.ndarray,
        action_cpu: np.ndarray,
        dis_logprob_cpu: np.ndarray,
        con_logprob_cpu: np.ndarray,
        reward: list,
        done: list,
        value_cpu: np.ndarray,
        last_reward: list,
        next_done: list,
        next_state: np.ndarray,
    ):
        """Record one environment step for every agent.

        A step is stored only on a decision step
        (``now_step % decision_period == 0``) or when the agent's round just
        finished.  When a round finishes, its trajectory is moved into the
        training dataset of the round's target type and the agent's
        trajectory buffers are cleared.

        Args:
            now_step: current step counter of the rollout loop.
            agent: object providing ``gae(rewards, dones, values, next_obs,
                next_done)`` returning ``(advantages, returns)``.
            state: per-agent observations before the step; column 0 is read
                as the target type and column ``target_state_size`` as the
                remaining time.
            action_cpu, dis_logprob_cpu, con_logprob_cpu, value_cpu:
                per-agent outputs of the policy for ``state``.
            reward: per-agent rewards of this step.
            done: per-agent done flags belonging to ``state``.
            last_reward: per-agent rewards of the previous (skipped) step,
                added to ``reward`` on decision steps.
            next_done: per-agent done flags after the step.
            next_state: per-agent observations after the step.
        """
        on_decision_step = now_step % self.decision_period == 0
        for i in range(self.unity_agent_num):
            round_over = bool(next_done[i])
            if not (on_decision_step or round_over):
                continue
            # only on decision period or finished a round, save memories to buffer
            self.ob_bf[i].append(state[i])
            self.act_bf[i].append(action_cpu[i])
            self.dis_logprobs_bf[i].append(dis_logprob_cpu[i])
            self.con_logprobs_bf[i].append(con_logprob_cpu[i])
            self.dones_bf[i].append(done[i])
            self.values_bf[i].append(value_cpu[i])
            if on_decision_step:
                # on decision period, add last skipped round's reward
                self.rewards_bf[i].append(reward[i] + last_reward[i])
            else:
                # not on decision period, only add this round's reward
                self.rewards_bf[i].append(reward[i])
            if round_over:
                self._store_finished_round(i, agent, state, next_state, next_done)

    def _store_finished_round(self, i, agent, state, next_state, next_done):
        """Compute advantages for agent ``i``'s finished round and append it to its target's dataset."""
        remain_time = state[i, self.target_state_size]
        target_type = int(state[i, 0])
        rewards_t = self.broad_cast_end_reward(self.rewards_bf[i], remain_time)
        # built once so GAE sees exactly the tensor that is stored afterwards
        values_t = torch.tensor(np.array(self.values_bf[i])).to(self.device)
        adv, rt = agent.gae(
            rewards=rewards_t,
            dones=torch.tensor(self.dones_bf[i], dtype=torch.float32).to(self.device),
            values=values_t,
            # float32 like every other observation fed to the network
            next_obs=torch.tensor(next_state[i], dtype=torch.float32).to(self.device).unsqueeze(0),
            next_done=torch.tensor([next_done[i]], dtype=torch.float32).to(self.device),
        )
        # send memories to training datasets
        finished = {
            "obs": torch.tensor(np.array(self.ob_bf[i])).to(self.device),
            "actions": torch.tensor(np.array(self.act_bf[i])).to(self.device),
            "dis_logprobs": torch.tensor(np.array(self.dis_logprobs_bf[i])).to(self.device),
            "con_logprobs": torch.tensor(np.array(self.con_logprobs_bf[i])).to(self.device),
            "rewards": rewards_t,
            "values": values_t,
            "advantages": adv,
            "returns": rt,
        }
        for name, rows in finished.items():
            dataset = getattr(self, name)
            dataset[target_type] = torch.cat((dataset[target_type], rows), 0)
        # clear buffers
        self.clear_buffers(i)
        print(f"train dataset {Targets(target_type).name} added:{self.obs[target_type].size()[0]}/{self.data_set_size}")

    def clear_buffers(self, ind: int):
        """Empty every trajectory buffer of agent ``ind``."""
        for name in self._BUFFER_NAMES:
            getattr(self, name)[ind] = []

    def clear_training_datasets(self, ind: int):
        """Replace every training dataset of target ``ind`` with an empty tensor."""
        for name in self._DATASET_NAMES:
            getattr(self, name)[ind] = self._empty_dataset()
-81
View File
@@ -1,81 +0,0 @@
from torch.utils.tensorboard import SummaryWriter
import wandb
# round counters per target name; read here when logging the win ratio
total_rounds = {"Free": 0, "Go": 0, "Attack": 0}
win_rounds = {"Free": 0, "Go": 0, "Attack": 0}


# class for wandb recording
class WandbRecorder:
    """Write training scalars to TensorBoard and, optionally, mirror them to Weights & Biases."""

    def __init__(self, game_name: str, game_type: str, run_name: str, _args) -> None:
        """Create the TensorBoard writer and log the hyperparameters.

        Args:
            game_name: used as the wandb project name.
            game_type: stored on the instance; not used by the methods here.
            run_name: wandb run name and sub-directory of ``runs/``.
            _args: parsed options; ``wandb_track`` and ``wandb_entity`` are
                read, and every entry is written as a hyperparameter.
        """
        # init wandb
        self.game_name = game_name
        self.game_type = game_type
        self._args = _args
        self.run_name = run_name
        if self._args.wandb_track:
            wandb.init(
                project=self.game_name,
                entity=self._args.wandb_entity,
                sync_tensorboard=True,
                config=vars(self._args),
                name=self.run_name,
                monitor_gym=True,
                save_code=True,
            )
        self.writer = SummaryWriter(f"runs/{self.run_name}")
        self.writer.add_text(
            "hyperparameters",
            "|param|value|\n|-|-|\n%s"
            % ("\n".join([f"|{key}|{value}|" for key, value in vars(self._args).items()])),
        )

    @staticmethod
    def _win_ratio(target_name) -> float:
        """Return wins / total rounds for ``target_name``.

        Yields 0.0 when the target has no counter entry or no finished round
        yet, instead of raising KeyError / ZeroDivisionError.
        """
        rounds = total_rounds.get(target_name, 0)
        if rounds <= 0:
            return 0.0
        return win_rounds.get(target_name, 0) / rounds

    def add_target_scalar(
        self,
        target_name,
        this_t,
        v_loss,
        dis_pg_loss,
        con_pg_loss,
        loss,
        entropy_loss,
        target_reward_mean,
        target_steps,
    ):
        """Log the losses, mean reward and win ratio of one target's training update.

        Args:
            target_name: name used in the ``Target{name}/...`` tags.
            this_t: index of the target inside ``target_steps``.
            v_loss, dis_pg_loss, con_pg_loss, loss, entropy_loss: objects
                exposing ``.item()`` (the losses of the last minibatch).
            target_reward_mean: mean reward of the dataset just trained on.
            target_steps: per-target step counters; ``target_steps[this_t]``
                is the x-axis value.
        """
        step = target_steps[this_t]
        # insertion order is the order the scalars are written in
        scalars = {
            "value_loss": v_loss.item(),
            "dis_policy_loss": dis_pg_loss.item(),
            "con_policy_loss": con_pg_loss.item(),
            "total_loss": loss.item(),
            "entropy_loss": entropy_loss.item(),
            "Reward": target_reward_mean,
        }
        for tag, value in scalars.items():
            self.writer.add_scalar(f"Target{target_name}/{tag}", value, step)
        self.add_win_ratio(target_name, step)

    def add_global_scalar(
        self,
        total_reward_mean,
        learning_rate,
        total_steps,
    ):
        """Log the reward mean over all trained targets and the current learning rate."""
        self.writer.add_scalar("GlobalCharts/TotalRewardMean", total_reward_mean, total_steps)
        self.writer.add_scalar("GlobalCharts/learning_rate", learning_rate, total_steps)

    def add_win_ratio(self, target_name, target_steps):
        """Log the current win ratio of ``target_name`` at step ``target_steps``."""
        self.writer.add_scalar(
            f"Target{target_name}/WinRatio", self._win_ratio(target_name), target_steps,
        )
-154
View File
@@ -1,154 +0,0 @@
import argparse
import uuid
from distutils.util import strtobool
# Default values for the command-line options defined in parse_args().
DEFAULT_SEED = 9331
ENV_PATH = "../Build/3.1.6/Aimbot-ParallelEnv"
WAND_ENTITY = "koha9"
WORKER_ID = 1
BASE_PORT = 1000
# tensorboard names
GAME_NAME = "Aimbot_Target_Hybrid_PMNN_V3"
GAME_TYPE = "Mix_Verification"
# max round steps per agent is 2500/Decision_period, 25 seconds
TOTAL_STEPS = 3150000
BATCH_SIZE = 512
MAX_TRAINNING_DATASETS = 6000
DECISION_PERIOD = 1
LEARNING_RATE = 6.5e-4
GAMMA = 0.99
GAE_LAMBDA = 0.95
EPOCHS = 3
CLIP_COEF = 0.11
# the four coefficient lists below hold one entry per target, in the order given here
LOSS_COEF = [1.0, 1.0, 1.0, 1.0] # free go attack defence
POLICY_COEF = [1.0, 1.0, 1.0, 1.0]
ENTROPY_COEF = [0.05, 0.05, 0.05, 0.05]
CRITIC_COEF = [0.5, 0.5, 0.5, 0.5]
TARGET_LEARNING_RATE = 1e-6
FREEZE_VIEW_NETWORK = False
BROADCASTREWARD = False
ANNEAL_LEARNING_RATE = True
CLIP_VLOSS = True
NORM_ADV = False
TRAIN = True
SAVE_MODEL = False
# NOTE(review): presumably a typo for WANDB_TRACK; the name is kept because parse_args() reads it
WANDB_TACK = False
# None makes --load-dir default to "no checkpoint"
LOAD_DIR = None
#LOAD_DIR = "../PPO-Model/PList_Go_LeakyReLU_9331_1677965178_bestGoto/PList_Go_LeakyReLU_9331_1677965178_10.709002.pt"
# Unity Environment Parameters
TARGET_STATE_SIZE = 6
INAREA_STATE_SIZE = 1
TIME_STATE_SIZE = 1
GUN_STATE_SIZE = 1
MY_STATE_SIZE = 4
# sum of all the state-segment sizes above
TOTAL_T_SIZE = TARGET_STATE_SIZE+INAREA_STATE_SIZE+TIME_STATE_SIZE+GUN_STATE_SIZE+MY_STATE_SIZE
BASE_WINREWARD = 999
BASE_LOSEREWARD = -999
TARGETNUM= 4
ENV_TIMELIMIT = 30
# default for --result-broadcast-ratio, whose help defines r = ratio * remainTime
RESULT_BROADCAST_RATIO = 1/ENV_TIMELIMIT
def parse_args():
    """Build the command-line parser for the PPO trainer and parse ``sys.argv``.

    Every option defaults to the matching module-level constant.

    The four coefficient options (``--policy-coef``, ``--entropy-coef``,
    ``--critic-coef``, ``--loss-coef``) hold one value per target.  On the
    command line they accept either a single value, which is repeated for
    every target, or at least ``--target-num`` values.

    Returns:
        argparse.Namespace: the parsed options; the four ``*_coef``
        attributes are always lists of floats.
    """

    def str2bool(text):
        # Accepts the same spellings as distutils.util.strtobool, which is
        # no longer available from Python 3.12 on.
        lowered = text.lower()
        if lowered in ("y", "yes", "t", "true", "on", "1"):
            return True
        if lowered in ("n", "no", "f", "false", "off", "0"):
            return False
        raise ValueError(f"invalid truth value {text!r}")

    # fmt: off
    # pytorch and environment parameters
    parser = argparse.ArgumentParser()
    parser.add_argument("--seed", type=int, default=DEFAULT_SEED,
        help="seed of the experiment")
    parser.add_argument("--path", type=str, default=ENV_PATH,
        help="enviroment path")
    parser.add_argument("--workerID", type=int, default=WORKER_ID,
        help="unity worker ID")
    parser.add_argument("--baseport", type=int, default=BASE_PORT,
        help="port to connect to Unity environment")
    parser.add_argument("--lr", type=float, default=LEARNING_RATE,
        help="the default learning rate of optimizer")
    parser.add_argument("--cuda", type=str2bool, default=True, nargs="?", const=True,
        help="if toggled, cuda will be enabled by default")
    parser.add_argument("--total-timesteps", type=int, default=TOTAL_STEPS,
        help="total timesteps of the experiments")

    # model parameters
    parser.add_argument("--train", type=str2bool, default=TRAIN, nargs="?", const=True,
        help="Train Model or not")
    parser.add_argument("--freeze-viewnet", type=str2bool, default=FREEZE_VIEW_NETWORK, nargs="?", const=True,
        help="freeze view network or not")
    parser.add_argument("--datasetSize", type=int, default=MAX_TRAINNING_DATASETS,
        help="training dataset size,start training while dataset collect enough data")
    parser.add_argument("--minibatchSize", type=int, default=BATCH_SIZE,
        help="nimi batch size")
    parser.add_argument("--epochs", type=int, default=EPOCHS,
        help="the K epochs to update the policy")
    parser.add_argument("--annealLR", type=str2bool, default=ANNEAL_LEARNING_RATE, nargs="?", const=True,
        help="Toggle learning rate annealing for policy and value networks")
    parser.add_argument("--wandb-track", type=str2bool, default=WANDB_TACK, nargs="?", const=True,
        help="track on the wandb")
    parser.add_argument("--save-model", type=str2bool, default=SAVE_MODEL, nargs="?", const=True,
        help="save model or not")
    parser.add_argument("--wandb-entity", type=str, default=WAND_ENTITY,
        help="the entity (team) of wandb's project")
    parser.add_argument("--load-dir", type=str, default=LOAD_DIR,
        help="load model directory")
    parser.add_argument("--decision-period", type=int, default=DECISION_PERIOD,
        help="the number of environment steps between two agent decisions")
    parser.add_argument("--result-broadcast-ratio", type=float, default=RESULT_BROADCAST_RATIO,
        help="broadcast result when win round is reached,r=result-broadcast-ratio*remainTime")
    parser.add_argument("--broadCastEndReward", type=str2bool, default=BROADCASTREWARD, nargs="?", const=True,
        help="broadcast the end-of-round reward or not")
    # target_learning_rate
    parser.add_argument("--target-lr", type=float, default=TARGET_LEARNING_RATE,
        help="target value of downscaling the learning rate")

    # POLICY_COEF ENTROPY_COEF CRITIC_COEF LOSS_COEF
    # nargs="+" keeps a command-line override a list, like the defaults
    parser.add_argument("--policy-coef", type=float, nargs="+", default=POLICY_COEF,
        help="coefficient of the policy loss")
    parser.add_argument("--entropy-coef", type=float, nargs="+", default=ENTROPY_COEF,
        help="coefficient of the entropy loss")
    parser.add_argument("--critic-coef", type=float, nargs="+", default=CRITIC_COEF,
        help="coefficient of the critic loss")
    parser.add_argument("--loss-coef", type=float, nargs="+", default=LOSS_COEF,
        help="coefficient of the total loss")

    # GAE loss
    parser.add_argument("--gae", type=str2bool, default=True, nargs="?", const=True,
        help="Use GAE for advantage computation")
    parser.add_argument("--norm-adv", type=str2bool, default=NORM_ADV, nargs="?", const=True,
        help="Toggles advantages normalization")
    parser.add_argument("--gamma", type=float, default=GAMMA,
        help="the discount factor gamma")
    parser.add_argument("--gaeLambda", type=float, default=GAE_LAMBDA,
        help="the lambda for the general advantage estimation")
    parser.add_argument("--clip-coef", type=float, default=CLIP_COEF,
        help="the surrogate clipping coefficient")
    parser.add_argument("--clip-vloss", type=str2bool, default=CLIP_VLOSS, nargs="?", const=True,
        help="Toggles whether or not to use a clipped loss for the value function, as per the paper.")
    parser.add_argument("--max-grad-norm", type=float, default=0.5,
        help="the maximum norm for the gradient clipping")
    parser.add_argument("--target-kl", type=float, default=None,
        help="the target KL divergence threshold")
    # environment parameters
    parser.add_argument("--target-num", type=int, default=TARGETNUM,
        help="the number of targets")
    parser.add_argument("--env-timelimit", type=int, default=ENV_TIMELIMIT,
        help="the time limit of each round")
    parser.add_argument("--base-win-reward", type=int, default=BASE_WINREWARD,
        help="the base reward of win round")
    parser.add_argument("--base-lose-reward", type=int, default=BASE_LOSEREWARD,
        help="the base reward of lose round")
    parser.add_argument("--target-state-size", type=int, default=TARGET_STATE_SIZE,
        help="the size of target state")
    parser.add_argument("--time-state-size", type=int, default=TIME_STATE_SIZE,
        help="the size of time state")
    parser.add_argument("--gun-state-size", type=int, default=GUN_STATE_SIZE,
        help="the size of gun state")
    parser.add_argument("--my-state-size", type=int, default=MY_STATE_SIZE,
        help="the size of my state")
    parser.add_argument("--total-target-size", type=int, default=TOTAL_T_SIZE,
        help="the size of total target state")
    # fmt: on
    args = parser.parse_args()

    # Normalize the per-target coefficients: a single value applies to every
    # target, and a list must cover every target index.
    for name in ("policy_coef", "entropy_coef", "critic_coef", "loss_coef"):
        coefs = list(getattr(args, name))  # copy, so the module defaults are never shared
        if len(coefs) == 1:
            coefs = coefs * args.target_num
        elif len(coefs) < args.target_num:
            parser.error(
                f"--{name.replace('_', '-')} needs 1 or at least "
                f"{args.target_num} values, got {len(coefs)}"
            )
        setattr(args, name, coefs)
    return args
@@ -9,7 +9,6 @@ import torch.nn as nn
import torch.optim as optim
from AimbotEnv import Aimbot
from tqdm import tqdm
from torch.distributions.normal import Normal
from torch.distributions.categorical import Categorical
from distutils.util import strtobool
@@ -25,28 +24,26 @@ from typing import List
bestReward = 0
DEFAULT_SEED = 9331
ENV_PATH = "../Build/Build-ParallelEnv-Target-OffPolicy-SingleStack-SideChannel-ExtremeReward/Aimbot-ParallelEnv"
ENV_PATH = "../Build/Build-ParallelEnv-Target-OffPolicy-SingleStack-SideChannel/Aimbot-ParallelEnv"
SIDE_CHANNEL_UUID = uuid.UUID("8bbfb62a-99b4-457c-879d-b78b69066b5e")
WAND_ENTITY = "koha9"
WORKER_ID = 1
BASE_PORT = 1000
# max round steps per agent is 2500/Decision_period, 25 seconds
# !!!check every parameters before run!!!
TOTAL_STEPS = 6000000
BATCH_SIZE = 512
MAX_TRAINNING_DATASETS = 8000
DECISION_PERIOD = 1
LEARNING_RATE = 1e-3
TOTAL_STEPS = 2000000
STEP_NUM = 314
DECISION_PERIOD = 2
LEARNING_RATE = 7e-4
GAMMA = 0.99
GAE_LAMBDA = 0.95
MINIBATCH_NUM = 4
EPOCHS = 4
CLIP_COEF = 0.1
POLICY_COEF = 1.0
ENTROPY_COEF = 0.01
CRITIC_COEF = 0.5
TARGET_LEARNING_RATE = 5e-5
ANNEAL_LEARNING_RATE = True
CLIP_VLOSS = True
@@ -54,8 +51,8 @@ NORM_ADV = True
TRAIN = True
WANDB_TACK = False
#LOAD_DIR = None
LOAD_DIR = "../PPO-Model/Aimbot-target-last.pt"
LOAD_DIR = None
# LOAD_DIR = "../PPO-Model/SmallArea-256-128-hybrid-2nd-trainning.pt"
# public data
TotalRounds = {"Go":0,"Attack":0,"Free":0}
@@ -84,10 +81,10 @@ def parse_args():
# model parameters
parser.add_argument("--train",type=lambda x: bool(strtobool(x)), default=TRAIN, nargs="?", const=True,
help="Train Model or not")
parser.add_argument("--datasetSize", type=int, default=MAX_TRAINNING_DATASETS,
help="training dataset size,start training while dataset collect enough data")
parser.add_argument("--minibatchSize", type=int, default=BATCH_SIZE,
help="nimi batch size")
parser.add_argument("--stepNum", type=int, default=STEP_NUM,
help="the number of steps to run in each environment per policy rollout")
parser.add_argument("--minibatchesNum", type=int, default=MINIBATCH_NUM,
help="the number of mini-batches")
parser.add_argument("--epochs", type=int, default=EPOCHS,
help="the K epochs to update the policy")
parser.add_argument("--annealLR", type=lambda x: bool(strtobool(x)), default=ANNEAL_LEARNING_RATE, nargs="?", const=True,
@@ -143,11 +140,9 @@ class PPOAgent(nn.Module):
self.continuous_size = env.unity_continuous_size
self.network = nn.Sequential(
layer_init(nn.Linear(np.array(env.unity_observation_shape).prod(), 700)),
layer_init(nn.Linear(np.array(env.unity_observation_shape).prod(), 384)),
nn.ReLU(),
layer_init(nn.Linear(700, 500)),
nn.ReLU(),
layer_init(nn.Linear(500, 256)),
layer_init(nn.Linear(384, 256)),
nn.ReLU(),
)
self.actor_dis = layer_init(nn.Linear(256, self.discrete_size), std=0.01)
@@ -197,40 +192,6 @@ class PPOAgent(nn.Module):
self.critic(hidden),
)
def GAE(agent, args, rewards, dones, values, next_obs, next_done):
    """Compute advantages and returns for one stored trajectory.

    The value of ``next_obs`` (from ``agent.get_value``) is used to bootstrap
    the final step. When ``args.gae`` is truthy the advantages are built with
    the GAE recursion (``args.gamma``, ``args.gaeLambda``) and the returns are
    ``advantages + values``; otherwise discounted returns are accumulated
    first and the advantages are ``returns - values``.

    Returns:
        ``(advantages, returns)``.
    """
    with torch.no_grad():
        bootstrap_value = agent.get_value(next_obs).reshape(1, -1)
        final_index = rewards.size()[0] - 1

        if not args.gae:
            # Plain discounted return, walked backwards from the last step.
            returns = torch.zeros_like(rewards).to(device)
            for t in range(final_index, -1, -1):
                if t != final_index:
                    not_done = 1.0 - dones[t + 1]
                    following_return = returns[t + 1]
                else:
                    not_done = 1.0 - next_done
                    following_return = bootstrap_value
                returns[t] = rewards[t] + args.gamma * not_done * following_return
            advantages = returns - values
        else:
            # Generalised advantage estimation, walked backwards as well.
            advantages = torch.zeros_like(rewards).to(device)
            running_gae = 0
            for t in range(final_index, -1, -1):
                if t != final_index:
                    not_done = 1.0 - dones[t + 1]
                    following_value = values[t + 1]
                else:
                    not_done = 1.0 - next_done
                    following_value = bootstrap_value
                td_error = rewards[t] + args.gamma * following_value * not_done - values[t]
                running_gae = td_error + args.gamma * args.gaeLambda * not_done * running_gae
                advantages[t] = running_gae
            returns = advantages + values
    return advantages, returns
class AimbotSideChannel(SideChannel):
def __init__(self, channel_id: uuid.UUID) -> None:
super().__init__(channel_id)
@@ -240,14 +201,14 @@ class AimbotSideChannel(SideChannel):
receive messages from Unity
"""
thisMessage = msg.read_string()
#print(thisMessage)
print(thisMessage)
thisResult = thisMessage.split("|")
if(thisResult[0] == "result"):
TotalRounds[thisResult[1]]+=1
if(thisResult[2] == "Win"):
WinRounds[thisResult[1]]+=1
#print(TotalRounds)
#print(WinRounds)
print(TotalRounds)
print(WinRounds)
elif(thisResult[0] == "Error"):
print(thisMessage)
# send function
@@ -277,7 +238,6 @@ class AimbotSideChannel(SideChannel):
msg.write_float32_list(data)
super().queue_message_to_send(msg)
if __name__ == "__main__":
args = parse_args()
random.seed(args.seed)
@@ -299,12 +259,11 @@ if __name__ == "__main__":
optimizer = optim.Adam(agent.parameters(), lr=args.lr, eps=1e-5)
# Tensorboard and WandB Recorder
game_name = "Aimbot_Target"
game_type = "OffPolicy"
run_name = f"{game_name}_{game_type}_{args.seed}_{int(time.time())}"
game_name = "Aimbot"
run_name = f"{game_name}_{args.seed}_{int(time.time())}"
if args.wandb_track:
wandb.init(
project=game_name,
project=run_name,
entity=args.wandb_entity,
sync_tensorboard=True,
config=vars(args),
@@ -320,168 +279,94 @@ if __name__ == "__main__":
% ("\n".join([f"|{key}|{value}|" for key, value in vars(args).items()])),
)
# Trajectory Buffer
ob_bf = [[] for i in range(env.unity_agent_num)]
act_bf = [[] for i in range(env.unity_agent_num)]
dis_logprobs_bf = [[] for i in range(env.unity_agent_num)]
con_logprobs_bf = [[] for i in range(env.unity_agent_num)]
rewards_bf = [[] for i in range(env.unity_agent_num)]
dones_bf = [[] for i in range(env.unity_agent_num)]
values_bf = [[] for i in range(env.unity_agent_num)]
# Memory Record
obs = torch.zeros((args.stepNum, env.unity_agent_num) + env.unity_observation_shape).to(device)
actions = torch.zeros((args.stepNum, env.unity_agent_num) + (env.unity_action_size,)).to(device)
dis_logprobs = torch.zeros((args.stepNum, env.unity_agent_num)).to(device)
con_logprobs = torch.zeros((args.stepNum, env.unity_agent_num)).to(device)
rewards = torch.zeros((args.stepNum, env.unity_agent_num)).to(device)
dones = torch.zeros((args.stepNum, env.unity_agent_num)).to(device)
values = torch.zeros((args.stepNum, env.unity_agent_num)).to(device)
# TRY NOT TO MODIFY: start the game
total_update_step = args.total_timesteps // args.datasetSize
args.batch_size = int(env.unity_agent_num * args.stepNum)
args.minibatch_size = int(args.batch_size // args.minibatchesNum)
total_update_step = args.total_timesteps // args.batch_size
global_step = 0
start_time = time.time()
state, _, done = env.reset()
# state = torch.Tensor(next_obs).to(device)
# next_done = torch.zeros(env.unity_agent_num).to(device)
next_obs, _, _ = env.reset()
next_obs = torch.Tensor(next_obs).to(device)
next_done = torch.zeros(env.unity_agent_num).to(device)
for total_steps in range(total_update_step):
# discount learning rate, while step == total_update_step lr will be 0
print("new episode")
if args.annealLR:
finalRatio = TARGET_LEARNING_RATE/args.lr
frac = 1.0 - finalRatio*((total_steps - 1.0) / total_update_step)
frac = 1.0 - (total_steps - 1.0) / total_update_step
lrnow = frac * args.lr
optimizer.param_groups[0]["lr"] = lrnow
# initialize empty training datasets
obs = torch.tensor([]).to(device) # (n,env.unity_observation_size)
actions = torch.tensor([]).to(device) # (n,env.unity_action_size)
dis_logprobs = torch.tensor([]).to(device) # (n,1)
con_logprobs = torch.tensor([]).to(device) # (n,1)
rewards = torch.tensor([]).to(device) # (n,1)
values = torch.tensor([]).to(device) # (n,1)
advantages = torch.tensor([]).to(device) # (n,1)
returns = torch.tensor([]).to(device) # (n,1)
# MAIN LOOP: run agent in environment
i = 0
training = False
while True:
for i in range(args.stepNum * args.decision_period):
if i % args.decision_period == 0:
step = round(i / args.decision_period)
# Choose action by agent
global_step += 1 * env.unity_agent_num
obs[step] = next_obs
dones[step] = next_done
with torch.no_grad():
# predict actions
action, dis_logprob, _, con_logprob, _, value = agent.get_actions_value(
torch.Tensor(state).to(device)
next_obs
)
value = value.flatten()
# variable from GPU to CPU
action_cpu = action.cpu().numpy()
dis_logprob_cpu = dis_logprob.cpu().numpy()
con_logprob_cpu = con_logprob.cpu().numpy()
value_cpu = value.cpu().numpy()
# Environment step
next_state, reward, next_done = env.step(action_cpu)
next_obs, reward, done = env.step(action.cpu().numpy())
# save memories
for i in range(env.unity_agent_num):
# save memories to buffers
ob_bf[i].append(state[i])
act_bf[i].append(action_cpu[i])
dis_logprobs_bf[i].append(dis_logprob_cpu[i])
con_logprobs_bf[i].append(con_logprob_cpu[i])
rewards_bf[i].append(reward[i])
dones_bf[i].append(done[i])
values_bf[i].append(value_cpu[i])
if next_done[i] == True:
# finished a round, send finished memories to training datasets
# compute advantage and discounted reward
#print(i,"over")
adv, rt = GAE(
agent,
args,
torch.tensor(rewards_bf[i]).to(device),
torch.Tensor(dones_bf[i]).to(device),
torch.tensor(values_bf[i]).to(device),
torch.tensor(next_state[i]).to(device),
torch.Tensor([next_done[i]]).to(device),
actions[step] = action
dis_logprobs[step] = dis_logprob
con_logprobs[step] = con_logprob
values[step] = value
rewards[step] = torch.tensor(reward).to(device).view(-1)
next_obs, next_done = torch.Tensor(next_obs).to(device), torch.Tensor(done).to(
device
)
# send memories to training datasets
obs = torch.cat((obs, torch.tensor(ob_bf[i]).to(device)), 0)
actions = torch.cat((actions, torch.tensor(act_bf[i]).to(device)), 0)
dis_logprobs = torch.cat(
(dis_logprobs, torch.tensor(dis_logprobs_bf[i]).to(device)), 0
)
con_logprobs = torch.cat(
(con_logprobs, torch.tensor(con_logprobs_bf[i]).to(device)), 0
)
rewards = torch.cat((rewards, torch.tensor(rewards_bf[i]).to(device)), 0)
values = torch.cat((values, torch.tensor(values_bf[i]).to(device)), 0)
advantages = torch.cat((advantages, adv), 0)
returns = torch.cat((returns, rt), 0)
# clear buffers
ob_bf[i] = []
act_bf[i] = []
dis_logprobs_bf[i] = []
con_logprobs_bf[i] = []
rewards_bf[i] = []
dones_bf[i] = []
values_bf[i] = []
print(f"train dataset added:{obs.size()[0]}/{args.datasetSize}")
if obs.size()[0] >= args.datasetSize:
# start train NN
break
state, done = next_state, next_done
else:
# skip this step use last predict action
next_obs, reward, next_done = env.step(action_cpu)
# save memories
for i in range(env.unity_agent_num):
if next_done[i] == True:
#print(i,"over???")
# save last memories to buffers
ob_bf[i].append(state[i])
act_bf[i].append(action_cpu[i])
dis_logprobs_bf[i].append(dis_logprob_cpu[i])
con_logprobs_bf[i].append(con_logprob_cpu[i])
rewards_bf[i].append(reward[i])
dones_bf[i].append(done[i])
values_bf[i].append(value_cpu[i])
# finished a round, send finished memories to training datasets
# compute advantage and discounted reward
adv, rt = GAE(
agent,
args,
torch.tensor(rewards_bf[i]).to(device),
torch.Tensor(dones_bf[i]).to(device),
torch.tensor(values_bf[i]).to(device),
torch.tensor(next_state[i]).to(device),
torch.Tensor([next_done[i]]).to(device),
next_obs, reward, done = env.step(action.cpu().numpy())
next_obs, next_done = torch.Tensor(next_obs).to(device), torch.Tensor(done).to(
device
)
# send memories to training datasets
obs = torch.cat((obs, torch.tensor(ob_bf[i]).to(device)), 0)
actions = torch.cat((actions, torch.tensor(act_bf[i]).to(device)), 0)
dis_logprobs = torch.cat(
(dis_logprobs, torch.tensor(dis_logprobs_bf[i]).to(device)), 0
)
con_logprobs = torch.cat(
(con_logprobs, torch.tensor(con_logprobs_bf[i]).to(device)), 0
)
rewards = torch.cat((rewards, torch.tensor(rewards_bf[i]).to(device)), 0)
values = torch.cat((values, torch.tensor(values_bf[i]).to(device)), 0)
advantages = torch.cat((advantages, adv), 0)
returns = torch.cat((returns, rt), 0)
# clear buffers
ob_bf[i] = []
act_bf[i] = []
dis_logprobs_bf[i] = []
con_logprobs_bf[i] = []
rewards_bf[i] = []
dones_bf[i] = []
values_bf[i] = []
print(f"train dataset added:{obs.size()[0]}/{args.datasetSize}")
state, done = next_state, next_done
i += 1
# GAE
with torch.no_grad():
next_value = agent.get_value(next_obs).reshape(1, -1)
if args.gae:
advantages = torch.zeros_like(rewards).to(device)
lastgaelam = 0
for t in reversed(range(args.stepNum)):
if t == args.stepNum - 1:
nextnonterminal = 1.0 - next_done
nextvalues = next_value
else:
nextnonterminal = 1.0 - dones[t + 1]
nextvalues = values[t + 1]
delta = rewards[t] + args.gamma * nextvalues * nextnonterminal - values[t]
advantages[t] = lastgaelam = (
delta + args.gamma * args.gaeLambda * nextnonterminal * lastgaelam
)
returns = advantages + values
else:
returns = torch.zeros_like(rewards).to(device)
for t in reversed(range(args.stepNum)):
if t == args.stepNum - 1:
nextnonterminal = 1.0 - next_done
next_return = next_value
else:
nextnonterminal = 1.0 - dones[t + 1]
next_return = returns[t + 1]
returns[t] = rewards[t] + args.gamma * nextnonterminal * next_return
advantages = returns - values
if args.train:
# flatten the batch
@@ -492,15 +377,15 @@ if __name__ == "__main__":
b_advantages = advantages.reshape(-1)
b_returns = returns.reshape(-1)
b_values = values.reshape(-1)
b_size = b_obs.size()[0]
# Optimizing the policy and value network
b_inds = np.arange(b_size)
b_inds = np.arange(args.batch_size)
# clipfracs = []
for epoch in range(args.epochs):
# shuffle all datasets
np.random.shuffle(b_inds)
for start in range(0, b_size, args.minibatchSize):
end = start + args.minibatchSize
for start in range(0, args.batch_size, args.minibatch_size):
end = start + args.minibatch_size
mb_inds = b_inds[start:end]
mb_advantages = b_advantages[mb_inds]
@@ -599,12 +484,12 @@ if __name__ == "__main__":
"charts/SPS", int(global_step / (time.time() - start_time)), global_step
)
writer.add_scalar("charts/Reward", rewardsMean, global_step)
writer.add_scalar("charts/GoWinRatio", WinRounds["Go"]/TotalRounds["Go"], global_step)
writer.add_scalar("charts/AttackWinRatio", WinRounds["Attack"]/TotalRounds["Attack"], global_step)
writer.add_scalar("charts/FreeWinRatio", WinRounds["Free"]/TotalRounds["Free"], global_step)
writer.add_scalar("charts/GoWinRatio", WinRounds["Go"]/TotalRounds["Go"] if TotalRounds["Go"] != 0 else 0, global_step)
writer.add_scalar("charts/AttackWinRatio", WinRounds["Attack"]/TotalRounds["Attack"] if TotalRounds["Attack"] != 0 else 0, global_step)
writer.add_scalar("charts/FreeWinRatio", WinRounds["Free"]/TotalRounds["Free"] if TotalRounds["Free"] != 0 else 0, global_step)
if rewardsMean > bestReward:
bestReward = rewardsMean
saveDir = "../PPO-Model/Target-700-500-256-hybrid-" + str(rewardsMean) + ".pt"
saveDir = "../PPO-Model/bigArea-384-128-hybrid-" + str(rewardsMean) + ".pt"
torch.save(agent, saveDir)
env.close()
-312
View File
@@ -1,312 +0,0 @@
import numpy as np
import torch
import argparse
import time
from torch import nn
from aimbotEnv import Aimbot
from torch.distributions.normal import Normal
from torch.distributions.categorical import Categorical
def layer_init(layer, std=np.sqrt(2), bias_const=0.0):
    """Initialise ``layer`` in place and return the same object.

    The weight is filled by ``nn.init.orthogonal_`` with ``std`` passed as the
    gain, and the bias is filled with the constant ``bias_const``.
    """
    weight, bias = layer.weight, layer.bias
    nn.init.orthogonal_(weight, gain=std)
    nn.init.constant_(bias, val=bias_const)
    return layer
class PPOAgent(nn.Module):
    """PPO actor-critic with discrete + continuous actions and per-target heads.

    The first column of every state row is read as an integer target index.
    That index selects which entry of each ``ModuleList`` / ``ParameterList``
    (target encoder, middle layer, discrete actor, continuous actor mean and
    log-std, critic) processes the row. The ray part of the observation goes
    through a single shared ``view_network``.
    """

    def __init__(
        self,
        env: Aimbot,
        this_args: argparse.Namespace,
        device: torch.device,
    ):
        """Build the networks from the environment spec and the parsed args.

        Args:
            env: environment wrapper; its ``unity_*`` attributes supply the
                observation shape, action sizes and agent count.
            this_args: namespace providing ``train``, ``target_num``, the
                ``*_state_size`` values and ``total_target_size``.
            device: device used for tensors created in :meth:`gae`.
        """
        super().__init__()
        self.device = device
        self.args = this_args
        self.train_agent = self.args.train
        self.target_num = self.args.target_num
        self.unity_observation_shape = env.unity_observation_shape
        self.unity_action_size = env.unity_action_size
        self.state_size = self.unity_observation_shape[0]
        self.agent_num = env.unity_agent_num
        self.target_size = self.args.target_state_size
        self.time_state_size = self.args.time_state_size
        self.gun_state_size = self.args.gun_state_size
        self.my_state_size = self.args.my_state_size
        # The trailing ``ray_state_size`` columns are the ray input; the
        # leading ``total_target_size`` columns are everything else.
        self.ray_state_size = env.unity_observation_shape[0] - self.args.total_target_size
        self.state_size_without_ray = self.args.total_target_size
        self.head_input_size = (
            env.unity_observation_shape[0] - self.target_size - self.time_state_size - self.gun_state_size
        )  # except target state input
        self.unity_discrete_type = env.unity_discrete_type
        self.discrete_size = env.unity_discrete_size
        self.discrete_shape = list(env.unity_discrete_branches)
        self.continuous_size = env.unity_continuous_size

        self.view_network = nn.Sequential(layer_init(nn.Linear(self.ray_state_size, 200)), nn.LeakyReLU())
        self.target_networks = nn.ModuleList(
            [
                nn.Sequential(layer_init(nn.Linear(self.state_size_without_ray, 100)), nn.LeakyReLU())
                for _ in range(self.target_num)
            ]
        )
        # 300 = 200 (view_network output) + 100 (target network output).
        self.middle_networks = nn.ModuleList(
            [
                nn.Sequential(layer_init(nn.Linear(300, 200)), nn.LeakyReLU())
                for _ in range(self.target_num)
            ]
        )
        self.actor_dis = nn.ModuleList(
            [layer_init(nn.Linear(200, self.discrete_size), std=0.5) for _ in range(self.target_num)]
        )
        self.actor_mean = nn.ModuleList(
            [layer_init(nn.Linear(200, self.continuous_size), std=0.5) for _ in range(self.target_num)]
        )
        self.actor_logstd = nn.ParameterList(
            [nn.Parameter(torch.zeros(1, self.continuous_size)) for _ in range(self.target_num)]
        )
        self.critic = nn.ModuleList(
            [layer_init(nn.Linear(200, 1), std=1) for _ in range(self.target_num)]
        )

    @staticmethod
    def _per_target(heads, target: torch.Tensor, rows: torch.Tensor) -> torch.Tensor:
        """Apply ``heads[target[i]]`` to ``rows[i]`` for every row and stack the results."""
        return torch.stack([heads[target[i]](rows[i]) for i in range(target.size()[0])])

    def _encode(self, state: torch.Tensor) -> tuple:
        """Return ``(target, middle_layer)``: the target indices and shared hidden features."""
        target = state[:, 0].to(torch.int32)  # per-row target index
        view_input = state[:, -self.ray_state_size:]  # all ray input
        target_input = state[:, : self.state_size_without_ray]
        view_layer = self.view_network(view_input)
        target_layer = self._per_target(self.target_networks, target, target_input)
        middle_input = torch.cat([view_layer, target_layer], dim=1)
        middle_layer = self._per_target(self.middle_networks, target, middle_input)
        return target, middle_layer

    def get_value(self, state: torch.Tensor):
        """Return the critic output for every row of ``state``."""
        target, middle_layer = self._encode(state)
        return self._per_target(self.critic, target, middle_layer)

    def get_actions_value(self, state: torch.Tensor, actions=None):
        """Evaluate the policy and critic on ``state``.

        Args:
            state: 2-D batch of observations, one row per sample.
            actions: optional actions to score. When ``None`` actions are
                sampled from the distributions if ``self.train_agent`` is
                truthy, otherwise the arg-max discrete action and the
                continuous mean are used.

        Returns:
            ``(actions, discrete log-prob, discrete entropy,
            continuous log-prob, continuous entropy, critic value)``.
            The discrete terms are summed over branches and the continuous
            terms over action dimensions.
        """
        target, middle_layer = self._encode(state)

        # discrete
        # Iterate over the batch so that each sample is routed through the
        # output head that matches its own target.
        dis_logits = self._per_target(self.actor_dis, target, middle_layer)
        split_logits = torch.split(dis_logits, self.discrete_shape, dim=1)
        multi_categoricals = [Categorical(logits=this_logits) for this_logits in split_logits]

        # continuous
        actions_mean = self._per_target(self.actor_mean, target, middle_layer)
        action_logstd = torch.stack(
            [torch.squeeze(self.actor_logstd[target[i]], 0) for i in range(target.size()[0])]
        )
        action_std = torch.exp(action_logstd)
        con_probs = Normal(actions_mean, action_std)

        # critic
        criticV = self._per_target(self.critic, target, middle_layer)

        if actions is None:
            if self.train_agent:
                # select actions base on probability distribution model
                dis_act = torch.stack([ctgr.sample() for ctgr in multi_categoricals])
                con_act = con_probs.sample()
            else:
                # select actions base on best probability distribution
                dis_act = torch.stack([torch.argmax(logit, dim=1) for logit in split_logits])
                con_act = actions_mean
            actions = torch.cat([dis_act.T, con_act], dim=1)
        else:
            # Leading columns hold the discrete branches, the rest is continuous.
            dis_act = actions[:, 0: self.unity_discrete_type].T
            con_act = actions[:, self.unity_discrete_type:]

        dis_log_prob = torch.stack(
            [ctgr.log_prob(act) for act, ctgr in zip(dis_act, multi_categoricals)]
        )
        dis_entropy = torch.stack([ctgr.entropy() for ctgr in multi_categoricals])
        return (
            actions,
            dis_log_prob.sum(0),
            dis_entropy.sum(0),
            con_probs.log_prob(con_act).sum(1),
            con_probs.entropy().sum(1),
            criticV,
        )

    def train_net(self, this_train_ind: int, ppo_memories, optimizer) -> tuple:
        """Run the PPO update on the stored batch for one training index.

        Args:
            this_train_ind: index into the ``ppo_memories`` buffers and into
                the per-index coefficient sequences on ``self.args``.
            ppo_memories: object exposing ``obs``, ``dis_logprobs``,
                ``con_logprobs``, ``actions``, ``advantages``, ``returns`` and
                ``values``, each indexable by ``this_train_ind``.
            optimizer: optimizer stepped once per minibatch.

        Returns:
            ``(v_loss, dis_pg_loss, con_pg_loss, loss, entropy_loss)`` from
            the last processed minibatch.

        Raises:
            RuntimeError: if the total loss contains NaN. The message lists
                the loss terms that are NaN.

        NOTE: if no minibatch is processed at all (zero epochs, or a batch of
        at most one sample) the returned names are never bound and the
        ``return`` fails with ``UnboundLocalError``.
        """
        # flatten the batch
        b_obs = ppo_memories.obs[this_train_ind].reshape((-1,) + self.unity_observation_shape)
        b_dis_logprobs = ppo_memories.dis_logprobs[this_train_ind].reshape(-1)
        b_con_logprobs = ppo_memories.con_logprobs[this_train_ind].reshape(-1)
        b_actions = ppo_memories.actions[this_train_ind].reshape((-1,) + (self.unity_action_size,))
        b_advantages = ppo_memories.advantages[this_train_ind].reshape(-1)
        b_returns = ppo_memories.returns[this_train_ind].reshape(-1)
        b_values = ppo_memories.values[this_train_ind].reshape(-1)
        b_size = b_obs.size()[0]
        clip_coef = self.args.clip_coef
        minibatch_size = self.args.minibatchSize

        # optimizing the policy and value network
        b_index = np.arange(b_size)
        for epoch in range(self.args.epochs):
            print("epoch:", epoch, end="")
            # shuffle all datasets
            np.random.shuffle(b_index)
            for start in range(0, b_size, minibatch_size):
                print(".", end="")
                end = start + minibatch_size
                mb_index = b_index[start:end]
                if np.size(mb_index) <= 1:
                    # A single sample cannot be normalised (std is undefined).
                    break
                mb_advantages = b_advantages[mb_index]

                # normalize advantages
                if self.args.norm_adv:
                    mb_advantages = (mb_advantages - mb_advantages.mean()) / (
                        mb_advantages.std() + 1e-8
                    )

                (
                    _,
                    new_dis_logprob,
                    dis_entropy,
                    new_con_logprob,
                    con_entropy,
                    new_value,
                ) = self.get_actions_value(b_obs[mb_index], b_actions[mb_index])

                # probability ratios new / old for both action parts
                dis_ratio = (new_dis_logprob - b_dis_logprobs[mb_index]).exp()
                con_ratio = (new_con_logprob - b_con_logprobs[mb_index]).exp()

                # Early stopping on approximate KL
                # (http://joschu.net/blog/kl-approx.html) is not implemented.

                # discrete Policy loss
                dis_pg_loss_orig = -mb_advantages * dis_ratio
                dis_pg_loss_clip = -mb_advantages * torch.clamp(
                    dis_ratio, 1 - clip_coef, 1 + clip_coef
                )
                dis_pg_loss = torch.max(dis_pg_loss_orig, dis_pg_loss_clip).mean()
                # continuous Policy loss
                con_pg_loss_orig = -mb_advantages * con_ratio
                con_pg_loss_clip = -mb_advantages * torch.clamp(
                    con_ratio, 1 - clip_coef, 1 + clip_coef
                )
                con_pg_loss = torch.max(con_pg_loss_orig, con_pg_loss_clip).mean()

                # Value loss
                new_value = new_value.view(-1)
                if self.args.clip_vloss:
                    v_loss_unclipped = (new_value - b_returns[mb_index]) ** 2
                    v_clipped = b_values[mb_index] + torch.clamp(
                        new_value - b_values[mb_index],
                        -clip_coef,
                        clip_coef,
                    )
                    v_loss_clipped = (v_clipped - b_returns[mb_index]) ** 2
                    v_loss_max = torch.max(v_loss_unclipped, v_loss_clipped)
                    v_loss = 0.5 * v_loss_max.mean()
                else:
                    v_loss = 0.5 * ((new_value - b_returns[mb_index]) ** 2).mean()

                # total loss
                # NOTE(review): the entropy term is *added*, so whether entropy
                # is encouraged depends on the sign of entropy_coef - confirm.
                entropy_loss = dis_entropy.mean() + con_entropy.mean()
                loss = (
                    dis_pg_loss * self.args.policy_coef[this_train_ind]
                    + con_pg_loss * self.args.policy_coef[this_train_ind]
                    + entropy_loss * self.args.entropy_coef[this_train_ind]
                    + v_loss * self.args.critic_coef[this_train_ind]
                ) * self.args.loss_coef[this_train_ind]

                if torch.isnan(loss).any():
                    print("LOSS Include NAN!!!")
                    # isnan must be applied to the tensor itself; applying it
                    # to the result of .any() can never report a NaN.
                    nan_terms = []
                    for label, term in (
                        ("dis_pg_loss", dis_pg_loss),
                        ("con_pg_loss", con_pg_loss),
                        ("entropy_loss", entropy_loss),
                        ("v_loss", v_loss),
                    ):
                        if torch.isnan(term).any():
                            print(f"{label} include nan")
                            nan_terms.append(label)
                    raise RuntimeError(
                        "NaN detected in PPO loss; NaN terms: "
                        + (", ".join(nan_terms) if nan_terms else "none (coefficients only)")
                    )

                optimizer.zero_grad()
                loss.backward()
                # Clips gradient norm of an iterable of parameters.
                nn.utils.clip_grad_norm_(self.parameters(), self.args.max_grad_norm)
                optimizer.step()

        return v_loss, dis_pg_loss, con_pg_loss, loss, entropy_loss

    def gae(
        self,
        rewards: torch.Tensor,
        dones: torch.Tensor,
        values: torch.Tensor,
        next_obs: torch.Tensor,
        next_done: torch.Tensor,
    ) -> tuple:
        """Compute ``(advantages, returns)`` for one stored trajectory.

        The critic value of ``next_obs`` bootstraps the final step. With
        ``self.args.gae`` truthy the GAE recursion (``gamma``, ``gaeLambda``)
        is used and ``returns = advantages + values``; otherwise discounted
        returns are accumulated and ``advantages = returns - values``.
        """
        with torch.no_grad():
            next_value = self.get_value(next_obs).reshape(1, -1)
            data_size = rewards.size()[0]
            if self.args.gae:
                advantages = torch.zeros_like(rewards).to(self.device)
                last_gae_lam = 0
                for t in reversed(range(data_size)):
                    if t == data_size - 1:
                        next_non_terminal = 1.0 - next_done
                        next_values = next_value
                    else:
                        next_non_terminal = 1.0 - dones[t + 1]
                        next_values = values[t + 1]
                    delta = rewards[t] + self.args.gamma * next_values * next_non_terminal - values[t]
                    advantages[t] = last_gae_lam = (
                        delta + self.args.gamma * self.args.gaeLambda * next_non_terminal * last_gae_lam
                    )
                returns = advantages + values
            else:
                returns = torch.zeros_like(rewards).to(self.device)
                for t in reversed(range(data_size)):
                    if t == data_size - 1:
                        next_non_terminal = 1.0 - next_done
                        next_return = next_value
                    else:
                        next_non_terminal = 1.0 - dones[t + 1]
                        next_return = returns[t + 1]
                    returns[t] = rewards[t] + self.args.gamma * next_non_terminal * next_return
                advantages = returns - values
        return advantages, returns
Binary file not shown.
+502
View File
@@ -0,0 +1,502 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Action, 1 continuous ctrl 2.1\n",
"Action, 0 continuous ctrl -1.1\n"
]
}
],
"source": [
"import gym\n",
"from gym.spaces import Dict, Discrete, Box, Tuple\n",
"import numpy as np\n",
"\n",
"\n",
"class SampleGym(gym.Env):\n",
" def __init__(self, config={}):\n",
" self.config = config\n",
" self.action_space = Tuple((Discrete(2), Box(-10, 10, (2,))))\n",
" self.observation_space = Box(-10, 10, (2, 2))\n",
" self.p_done = config.get(\"p_done\", 0.1)\n",
"\n",
" def reset(self):\n",
" return self.observation_space.sample()\n",
"\n",
" def step(self, action):\n",
" chosen_action = action[0]\n",
" cnt_control = action[1][chosen_action]\n",
"\n",
" if chosen_action == 0:\n",
" reward = cnt_control\n",
" else:\n",
" reward = -cnt_control - 1\n",
"\n",
" print(f\"Action, {chosen_action} continuous ctrl {cnt_control}\")\n",
" return (\n",
" self.observation_space.sample(),\n",
" reward,\n",
" bool(np.random.choice([True, False], p=[self.p_done, 1.0 - self.p_done])),\n",
" {},\n",
" )\n",
"\n",
"\n",
"if __name__ == \"__main__\":\n",
" env = SampleGym()\n",
" env.reset()\n",
" env.step((1, [-1, 2.1])) # should say use action 1 with 2.1\n",
" env.step((0, [-1.1, 2.1])) # should say use action 0 with -1.1"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"from mlagents_envs.environment import UnityEnvironment\n",
"from gym_unity.envs import UnityToGymWrapper\n",
"import numpy as np\n",
"\n",
"ENV_PATH = \"../Build-ParallelEnv/Aimbot-ParallelEnv\"\n",
"WORKER_ID = 1\n",
"BASE_PORT = 2002\n",
"\n",
"env = UnityEnvironment(\n",
" file_name=ENV_PATH,\n",
" seed=1,\n",
" side_channels=[],\n",
" worker_id=WORKER_ID,\n",
" base_port=BASE_PORT,\n",
")\n",
"\n",
"trackedAgent = 0\n",
"env.reset()\n",
"BEHA_SPECS = env.behavior_specs\n",
"BEHA_NAME = list(BEHA_SPECS)[0]\n",
"SPEC = BEHA_SPECS[BEHA_NAME]\n",
"print(SPEC)\n",
"\n",
"decisionSteps, terminalSteps = env.get_steps(BEHA_NAME)\n",
"\n",
"if trackedAgent in decisionSteps: # ゲーム終了していない場合、環境状態がdecision_stepsに保存される\n",
" nextState = decisionSteps[trackedAgent].obs[0]\n",
" reward = decisionSteps[trackedAgent].reward\n",
" done = False\n",
"if trackedAgent in terminalSteps: # ゲーム終了した場合、環境状態がterminal_stepsに保存される\n",
" nextState = terminalSteps[trackedAgent].obs[0]\n",
" reward = terminalSteps[trackedAgent].reward\n",
" done = True\n",
"print(decisionSteps.agent_id)\n",
"print(terminalSteps)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"decisionSteps.agent_id [1 2 5 7]\n",
"decisionSteps.agent_id_to_index {1: 0, 2: 1, 5: 2, 7: 3}\n",
"decisionSteps.reward [0. 0. 0. 0.]\n",
"decisionSteps.action_mask [array([[False, False, False],\n",
" [False, False, False],\n",
" [False, False, False],\n",
" [False, False, False]]), array([[False, False, False],\n",
" [False, False, False],\n",
" [False, False, False],\n",
" [False, False, False]]), array([[False, False],\n",
" [False, False],\n",
" [False, False],\n",
" [False, False]])]\n",
"decisionSteps.obs [ 0. 0. 0. 0. 0. 0.\n",
" 0. 0. 0. 0. 0. 0.\n",
" 0. 0. 0. 0. 0. 0.\n",
" 0. 0. 0. 0. 0. 0.\n",
" 0. 0. 0. 0. 0. 0.\n",
" 0. 0. 0. 0. 0. 0.\n",
" 0. 0. 0. 0. 0. 0.\n",
" 0. 0. 0. 0. 0. 0.\n",
" 0. 0. 0. 0. 0. 0.\n",
" 0. 0. 0. 0. 0. 0.\n",
" 0. 0. -15.994009 1. -26.322788 1.\n",
" 1. 1. 1. 1. 1. 2.\n",
" 1. 1. 1. 1. 1. 1.\n",
" 1. 1.3519633 1.6946528 2.3051548 3.673389 9.067246\n",
" 17.521473 21.727095 22.753294 24.167128 25.905216 18.35725\n",
" 21.02278 21.053417 0. ]\n"
]
},
{
"data": {
"text/plain": [
"'decisionSteps.obs [array([[-15.994009 , 1. , -26.322788 , 1. , 1. ,\\n 1. , 1. , 1. , 1. , 2. ,\\n 1. , 1. , 1. , 1. , 1. ,\\n 1. , 1. , 1.3519633, 1.6946528, 2.3051548,\\n 3.673389 , 9.067246 , 17.521473 , 21.727095 , 22.753294 ,\\n 24.167128 , 25.905216 , 18.35725 , 21.02278 , 21.053417 ,\\n 0. ],\\n [ -1.8809433, 1. , -25.66834 , 1. , 2. ,\\n 1. , 1. , 1. , 1. , 1. ,\\n 1. , 1. , 1. , 1. , 1. ,\\n 1. , 1. , 16.768637 , 23.414627 , 22.04486 ,\\n 21.050663 , 20.486784 , 20.486784 , 21.050665 , 15.049731 ,\\n 11.578419 , 9.695194 , 20.398016 , 20.368341 , 20.398016 ,\\n...\\n 20.551746 , 20.00118 , 20.001116 , 20.551594 , 21.5222 ,\\n 17.707508 , 14.86889 , 19.914494 , 19.885508 , 19.914463 ,\\n 0. ]], dtype=float32)]'"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"print(\"decisionSteps.agent_id\",decisionSteps.agent_id)\n",
"# decisionSteps.agent_id [1 2 5 7]\n",
"print(\"decisionSteps.agent_id_to_index\",decisionSteps.agent_id_to_index)\n",
"# decisionSteps.agent_id_to_index {1: 0, 2: 1, 5: 2, 7: 3}\n",
"print(\"decisionSteps.reward\",decisionSteps.reward)\n",
"# decisionSteps.reward [0. 0. 0. 0.]\n",
"print(\"decisionSteps.action_mask\",decisionSteps.action_mask)\n",
"'''\n",
"decisionSteps.action_mask [array([[False, False, False],\n",
" [False, False, False],\n",
" [False, False, False],\n",
" [False, False, False]]), array([[False, False, False],\n",
" [False, False, False],\n",
" [False, False, False],\n",
" [False, False, False]]), array([[False, False],\n",
" [False, False],\n",
" [False, False],\n",
" [False, False]])]\n",
"'''\n",
"print(\"decisionSteps.obs\", decisionSteps.obs[0][0])\n",
"'''decisionSteps.obs [array([[-15.994009 , 1. , -26.322788 , 1. , 1. ,\n",
" 1. , 1. , 1. , 1. , 2. ,\n",
" 1. , 1. , 1. , 1. , 1. ,\n",
" 1. , 1. , 1.3519633, 1.6946528, 2.3051548,\n",
" 3.673389 , 9.067246 , 17.521473 , 21.727095 , 22.753294 ,\n",
" 24.167128 , 25.905216 , 18.35725 , 21.02278 , 21.053417 ,\n",
" 0. ],\n",
" [ -1.8809433, 1. , -25.66834 , 1. , 2. ,\n",
" 1. , 1. , 1. , 1. , 1. ,\n",
" 1. , 1. , 1. , 1. , 1. ,\n",
" 1. , 1. , 16.768637 , 23.414627 , 22.04486 ,\n",
" 21.050663 , 20.486784 , 20.486784 , 21.050665 , 15.049731 ,\n",
" 11.578419 , 9.695194 , 20.398016 , 20.368341 , 20.398016 ,\n",
"...\n",
" 20.551746 , 20.00118 , 20.001116 , 20.551594 , 21.5222 ,\n",
" 17.707508 , 14.86889 , 19.914494 , 19.885508 , 19.914463 ,\n",
" 0. ]], dtype=float32)]'''\n"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"from AimbotEnv import Aimbot\n",
"\n",
"ENV_PATH = \"../Build-ParallelEnv/Aimbot-ParallelEnv\"\n",
"WORKER_ID = 1\n",
"BASE_PORT = 2002\n",
"\n",
"env = Aimbot(envPath=ENV_PATH,workerID= WORKER_ID,basePort= BASE_PORT)\n"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(array([[ 0. , 0. , 0. , 0. , 0. ,\n",
" 0. , 0. , 0. , 0. , 0. ,\n",
" 0. , 0. , 0. , 0. , 0. ,\n",
" 0. , 0. , 0. , 0. , 0. ,\n",
" 0. , 0. , 0. , 0. , 0. ,\n",
" 0. , 0. , 0. , 0. , 0. ,\n",
" 0. , -15.994009 , 1. , -26.322788 , 1. ,\n",
" 1. , 1. , 1. , 1. , 1. ,\n",
" 2. , 1. , 1. , 1. , 1. ,\n",
" 1. , 1. , 1. , 1.3519633, 1.6946528,\n",
" 2.3051548, 3.673389 , 9.067246 , 17.521473 , 21.727095 ,\n",
" 22.753294 , 24.167128 , 25.905216 , 18.35725 , 21.02278 ,\n",
" 21.053417 , 0. , -15.994003 , 1. , -26.322784 ,\n",
" 1. , 1. , 1. , 1. , 1. ,\n",
" 1. , 1. , 1. , 1. , 1. ,\n",
" 1. , 1. , 1. , 1. , 1.3519667,\n",
" 1.6946585, 2.3051722, 3.6734192, 9.067533 , 21.145092 ,\n",
" 21.727148 , 22.753365 , 24.167217 , 25.905317 , 18.358263 ,\n",
" 21.022812 , 21.053455 , 0. ],\n",
" [ 0. , 0. , 0. , 0. , 0. ,\n",
" 0. , 0. , 0. , 0. , 0. ,\n",
" 0. , 0. , 0. , 0. , 0. ,\n",
" 0. , 0. , 0. , 0. , 0. ,\n",
" 0. , 0. , 0. , 0. , 0. ,\n",
" 0. , 0. , 0. , 0. , 0. ,\n",
" 0. , -1.8809433, 1. , -25.66834 , 1. ,\n",
" 2. , 1. , 1. , 1. , 1. ,\n",
" 1. , 1. , 1. , 1. , 1. ,\n",
" 1. , 1. , 1. , 16.768637 , 23.414627 ,\n",
" 22.04486 , 21.050663 , 20.486784 , 20.486784 , 21.050665 ,\n",
" 15.049731 , 11.578419 , 9.695194 , 20.398016 , 20.368341 ,\n",
" 20.398016 , 0. , -1.8809433, 1. , -25.66834 ,\n",
" 1. , 1. , 2. , 1. , 1. ,\n",
" 1. , 1. , 1. , 1. , 2. ,\n",
" 2. , 1. , 1. , 1. , 25.098585 ,\n",
" 15.749494 , 22.044899 , 21.050697 , 20.486813 , 20.486813 ,\n",
" 21.050694 , 15.049746 , 3.872317 , 3.789325 , 20.398046 ,\n",
" 20.368372 , 20.398046 , 0. ],\n",
" [ 0. , 0. , 0. , 0. , 0. ,\n",
" 0. , 0. , 0. , 0. , 0. ,\n",
" 0. , 0. , 0. , 0. , 0. ,\n",
" 0. , 0. , 0. , 0. , 0. ,\n",
" 0. , 0. , 0. , 0. , 0. ,\n",
" 0. , 0. , 0. , 0. , 0. ,\n",
" 0. , -13.672583 , 1. , -26.479263 , 1. ,\n",
" 1. , 1. , 1. , 1. , 1. ,\n",
" 1. , 1. , 1. , 1. , 1. ,\n",
" 1. , 1. , 1. , 5.3249803, 6.401276 ,\n",
" 8.374101 , 12.8657875, 21.302414 , 21.30242 , 21.888742 ,\n",
" 22.92251 , 24.346794 , 26.09773 , 21.210114 , 21.179258 ,\n",
" 21.210117 , 0. , -13.672583 , 1. , -26.479263 ,\n",
" 1. , 1. , 1. , 1. , 1. ,\n",
" 1. , 1. , 2. , 1. , 1. ,\n",
" 2. , 1. , 1. , 2. , 5.3249855,\n",
" 6.4012837, 8.374114 , 12.865807 , 21.302446 , 21.30245 ,\n",
" 16.168503 , 22.922543 , 24.346823 , 7.1110754, 21.210148 ,\n",
" 21.17929 , 12.495141 , 0. ],\n",
" [ 0. , 0. , 0. , 0. , 0. ,\n",
" 0. , 0. , 0. , 0. , 0. ,\n",
" 0. , 0. , 0. , 0. , 0. ,\n",
" 0. , 0. , 0. , 0. , 0. ,\n",
" 0. , 0. , 0. , 0. , 0. ,\n",
" 0. , 0. , 0. , 0. , 0. ,\n",
" 0. , -4.9038744, 1. , -25.185507 , 1. ,\n",
" 1. , 1. , 1. , 1. , 1. ,\n",
" 1. , 1. , 1. , 1. , 1. ,\n",
" 1. , 1. , 1. , 20.33171 , 22.859762 ,\n",
" 21.522427 , 20.551746 , 20.00118 , 20.001116 , 20.551594 ,\n",
" 21.5222 , 17.707508 , 14.86889 , 19.914494 , 19.885508 ,\n",
" 19.914463 , 0. , -4.9038773, 1. , -25.185507 ,\n",
" 1. , 2. , 1. , 2. , 1. ,\n",
" 1. , 1. , 1. , 2. , 1. ,\n",
" 1. , 1. , 1. , 1. , 15.905993 ,\n",
" 22.85977 , 11.566693 , 20.551773 , 20.00121 , 20.001146 ,\n",
" 20.551619 , 7.135157 , 17.707582 , 14.868943 , 19.914528 ,\n",
" 19.88554 , 19.914494 , 0. ]], dtype=float32),\n",
" [[-0.05], [-0.05], [-0.05], [-0.05]],\n",
" [[False], [False], [False], [False]])"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import numpy as np\n",
"env.unity_observation_shape\n",
"(128, 4) + env.unity_observation_shape\n",
"env.reset()"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"tensor([[1, 2, 3],\n",
" [1, 2, 3],\n",
" [1, 2, 3],\n",
" [1, 2, 3]], device='cuda:0')\n",
"tensor([[1],\n",
" [2],\n",
" [3],\n",
" [4]], device='cuda:0')\n"
]
},
{
"data": {
"text/plain": [
"tensor([[1, 2, 3, 1],\n",
" [1, 2, 3, 2],\n",
" [1, 2, 3, 3],\n",
" [1, 2, 3, 4]], device='cuda:0')"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import torch\n",
"aa = torch.tensor([[1,2,3],[1,2,3],[1,2,3],[1,2,3]]).to(\"cuda:0\")\n",
"bb = torch.tensor([[1],[2],[3],[4]]).to(\"cuda:0\")\n",
"print(aa)\n",
"print(bb)\n",
"torch.cat([aa,bb],axis = 1)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"ename": "AttributeError",
"evalue": "Can't get attribute 'PPOAgent' on <module '__main__'>",
"output_type": "error",
"traceback": [
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[1;31mAttributeError\u001b[0m Traceback (most recent call last)",
"\u001b[1;32m~\\AppData\\Local\\Temp\\ipykernel_31348\\1930153251.py\u001b[0m in \u001b[0;36m<cell line: 2>\u001b[1;34m()\u001b[0m\n\u001b[0;32m 1\u001b[0m \u001b[1;32mimport\u001b[0m \u001b[0mtorch\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 2\u001b[1;33m \u001b[0mmymodel\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mtorch\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mload\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m\"../PPO-Model/SmallArea-256-128-hybrid.pt\"\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 3\u001b[0m \u001b[0mmymodel\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0meval\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
"\u001b[1;32mc:\\Users\\UCUNI\\AppData\\Local\\Programs\\Python\\Python39\\lib\\site-packages\\torch\\serialization.py\u001b[0m in \u001b[0;36mload\u001b[1;34m(f, map_location, pickle_module, **pickle_load_args)\u001b[0m\n\u001b[0;32m 710\u001b[0m \u001b[0mopened_file\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mseek\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0morig_position\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 711\u001b[0m \u001b[1;32mreturn\u001b[0m \u001b[0mtorch\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mjit\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mload\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mopened_file\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 712\u001b[1;33m \u001b[1;32mreturn\u001b[0m \u001b[0m_load\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mopened_zipfile\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mmap_location\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mpickle_module\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mpickle_load_args\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 713\u001b[0m \u001b[1;32mreturn\u001b[0m \u001b[0m_legacy_load\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mopened_file\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mmap_location\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mpickle_module\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mpickle_load_args\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 714\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n",
"\u001b[1;32mc:\\Users\\UCUNI\\AppData\\Local\\Programs\\Python\\Python39\\lib\\site-packages\\torch\\serialization.py\u001b[0m in \u001b[0;36m_load\u001b[1;34m(zip_file, map_location, pickle_module, pickle_file, **pickle_load_args)\u001b[0m\n\u001b[0;32m 1047\u001b[0m \u001b[0munpickler\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mUnpicklerWrapper\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdata_file\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mpickle_load_args\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1048\u001b[0m \u001b[0munpickler\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mpersistent_load\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mpersistent_load\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 1049\u001b[1;33m \u001b[0mresult\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0munpickler\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mload\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 1050\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1051\u001b[0m \u001b[0mtorch\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_utils\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_validate_loaded_sparse_tensors\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
"\u001b[1;32mc:\\Users\\UCUNI\\AppData\\Local\\Programs\\Python\\Python39\\lib\\site-packages\\torch\\serialization.py\u001b[0m in \u001b[0;36mfind_class\u001b[1;34m(self, mod_name, name)\u001b[0m\n\u001b[0;32m 1040\u001b[0m \u001b[1;32mpass\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1041\u001b[0m \u001b[0mmod_name\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mload_module_mapping\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mget\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mmod_name\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mmod_name\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 1042\u001b[1;33m \u001b[1;32mreturn\u001b[0m \u001b[0msuper\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mfind_class\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mmod_name\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mname\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 1043\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1044\u001b[0m \u001b[1;31m# Load the data (which may in turn use `persistent_load` to load tensors)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
"\u001b[1;31mAttributeError\u001b[0m: Can't get attribute 'PPOAgent' on <module '__main__'>"
]
}
],
"source": [
"import torch\n",
"\n",
"def layer_init(layer, std=np.sqrt(2), bias_const=0.0):\n",
" torch.nn.init.orthogonal_(layer.weight, std)\n",
" torch.nn.init.constant_(layer.bias, bias_const)\n",
" return layer\n",
"\n",
"class PPOAgent(nn.Module):\n",
" def __init__(self, env: Aimbot):\n",
" super(PPOAgent, self).__init__()\n",
" self.discrete_size = env.unity_discrete_size\n",
" self.discrete_shape = list(env.unity_discrete_branches)\n",
" self.continuous_size = env.unity_continuous_size\n",
"\n",
" self.network = nn.Sequential(\n",
" layer_init(nn.Linear(np.array(env.unity_observation_shape).prod(), 256)),\n",
" nn.ReLU(),\n",
" layer_init(nn.Linear(256, 128)),\n",
" nn.ReLU(),\n",
" )\n",
" self.actor_dis = layer_init(nn.Linear(128, self.discrete_size), std=0.01)\n",
" self.actor_mean = layer_init(nn.Linear(128, self.continuous_size), std=0.01)\n",
" self.actor_logstd = nn.Parameter(torch.zeros(1, self.continuous_size))\n",
" self.critic = layer_init(nn.Linear(128, 1), std=1)\n",
"\n",
" def get_value(self, state: torch.Tensor):\n",
" return self.critic(self.network(state))\n",
"\n",
" def get_actions_value(self, state: torch.Tensor, actions=None):\n",
" hidden = self.network(state)\n",
" # discrete\n",
" dis_logits = self.actor_dis(hidden)\n",
" split_logits = torch.split(dis_logits, self.discrete_shape, dim=1)\n",
" multi_categoricals = [Categorical(logits=thisLogits) for thisLogits in split_logits]\n",
" # continuous\n",
" actions_mean = self.actor_mean(hidden)\n",
" action_logstd = self.actor_logstd.expand_as(actions_mean)\n",
" action_std = torch.exp(action_logstd)\n",
" con_probs = Normal(actions_mean, action_std)\n",
"\n",
" if actions is None:\n",
" disAct = torch.stack([ctgr.sample() for ctgr in multi_categoricals])\n",
" conAct = con_probs.sample()\n",
" actions = torch.cat([disAct.T, conAct], dim=1)\n",
" else:\n",
" disAct = actions[:, 0 : env.unity_discrete_type].T\n",
" conAct = actions[:, env.unity_discrete_type :]\n",
" dis_log_prob = torch.stack(\n",
" [ctgr.log_prob(act) for act, ctgr in zip(disAct, multi_categoricals)]\n",
" )\n",
" dis_entropy = torch.stack([ctgr.entropy() for ctgr in multi_categoricals])\n",
" return (\n",
" actions,\n",
" dis_log_prob.sum(0),\n",
" dis_entropy.sum(0),\n",
" con_probs.log_prob(conAct).sum(1),\n",
" con_probs.entropy().sum(1),\n",
" self.critic(hidden),\n",
" )\n",
"\n",
"\n",
"mymodel = torch.load(\"../PPO-Model/SmallArea-256-128-hybrid.pt\")\n",
"mymodel.eval()"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"x : torch.Size([2, 3, 4])\n",
"x : torch.Size([6, 2, 3, 4])\n",
"x : torch.Size([6, 2, 3, 4])\n"
]
}
],
"source": [
"import torch\n",
"#1\n",
"x = torch.randn(2, 1, 1)#为1可以扩展为3和4\n",
"x = x.expand(2, 3, 4)\n",
"print('x :', x.size())\n",
"\n",
"#2\n",
"#扩展一个新的维度必须在最前面,否则会报错\n",
"#x = x.expand(2, 3, 4, 6)\n",
"\n",
"x = x.expand(6, 2, 3, 4)\n",
"print('x :', x.size())\n",
"\n",
"#3\n",
"#某一个维度为-1表示不改变该维度的大小\n",
"x = x.expand(6, -1, -1, -1)\n",
"print('x :', x.size())\n",
"\n",
"x : torch.Size([2, 3, 4])\n",
"x : torch.Size([6, 2, 3, 4])\n",
"x : torch.Size([6, 2, 3, 4])"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3.9.7 64-bit",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.7"
},
"orig_nbformat": 4,
"vscode": {
"interpreter": {
"hash": "86e2db13b09bd6be22cb599ea60c1572b9ef36ebeaa27a4c8e961d6df315ac32"
}
}
},
"nbformat": 4,
"nbformat_minor": 2
}
-5
View File
@@ -1,5 +0,0 @@
import numpy as np
aa = np.array([1,2,3,4,5,6,7,8,9,10])
print(aa)