Side Channel Added

Add a side channel to save the target win ratio.
2022-11-30 07:01:05 +09:00
26 changed files with 635 additions and 3628 deletions
@@ -76,8 +76,6 @@ crashlytics-build.properties
 /Aimbot-PPO-Python/.vscode/
 /Aimbot-PPO-Python/.mypy_cache/
 /Aimbot-PPO-Python/__pycache__/
-/Aimbot-PPO-Python/wandb/
-/Aimbot-PPO-Python/runs/
 /Aimbot-PPO-Python/Tensorflow/__pycache__/
 /Aimbot-PPO-Python/Pytorch/__pycache__/
 /Aimbot-PPO-Python/Pytorch/runs/
@@ -1,5 +0,0 @@
-{
-    "python.linting.enabled": false,
-    "python.analysis.typeCheckingMode": "off",
-    "commentTranslate.source": "intellsmi.deepl-translate-deepl"
-}
@@ -1,3 +0,0 @@
-# Default ignored files
-/shelf/
-/workspace.xml
@@ -1,8 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<module type="PYTHON_MODULE" version="4">
-  <component name="NewModuleRootManager">
-    <content url="file://$MODULE_DIR$" />
-    <orderEntry type="jdk" jdkName="mlagents39" jdkType="Python SDK" />
-    <orderEntry type="sourceFolder" forTests="false" />
-  </component>
-</module>
@@ -1,10 +0,0 @@
-<component name="ProjectDictionaryState">
-  <dictionary name="UCUNI">
-    <words>
-      <w>aimbot</w>
-      <w>logprobs</w>
-      <w>logstd</w>
-      <w>unclipped</w>
-    </words>
-  </dictionary>
-</component>
@@ -1,6 +0,0 @@
-<component name="InspectionProjectProfileManager">
-  <settings>
-    <option name="USE_PROJECT_PROFILE" value="false" />
-    <version value="1.0" />
-  </settings>
-</component>
@@ -1,4 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<project version="4">
-  <component name="ProjectRootManager" version="2" project-jdk-name="mlagents39" project-jdk-type="Python SDK" />
-</project>
@@ -1,8 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<project version="4">
-  <component name="ProjectModuleManager">
-    <modules>
-      <module fileurl="file://$PROJECT_DIR$/.idea/Pytorch.iml" filepath="$PROJECT_DIR$/.idea/Pytorch.iml" />
-    </modules>
-  </component>
-</project>
@@ -1,6 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<project version="4">
-  <component name="VcsDirectoryMappings">
-    <mapping directory="$PROJECT_DIR$/../.." vcs="Git" />
-  </component>
-</project>
@@ -1,33 +1,26 @@
 import gym
 import numpy as np
-import uuid
-import airecorder
+
 from numpy import ndarray
 from mlagents_envs.base_env import ActionTuple
 from mlagents_envs.environment import UnityEnvironment
-from typing import Tuple, List
-from mlagents_envs.side_channel.side_channel import (
-    SideChannel,
-    IncomingMessage,
-    OutgoingMessage,
-)


 class Aimbot(gym.Env):
    def __init__(
        self,
-            env_path: str,
-            worker_id: int = 1,
-            base_port: int = 100,
+        envPath: str,
+        workerID: int = 1,
+        basePort: int = 100,
        side_channels: list = []
    ):
        super(Aimbot, self).__init__()
        self.env = UnityEnvironment(
-            file_name=env_path,
+            file_name=envPath,
            seed=1,
            side_channels=side_channels,
-            worker_id=worker_id,
-            base_port=base_port,
+            worker_id=workerID,
+            base_port=basePort,
        )
        self.env.reset()
        # all behavior_specs
@@ -41,7 +34,7 @@ class Aimbot(gym.Env):
        #  environment action specs
        self.unity_action_spec = self.unity_specs.action_spec
        #  environment sample observation
-        decision_steps, _ = self.env.get_steps(self.unity_beha_name)
+        decisionSteps, _ = self.env.get_steps(self.unity_beha_name)

        # OBSERVATION SPECS
        #  environment state shape. like tuple:(93,)
@@ -64,34 +57,31 @@ class Aimbot(gym.Env):

        # AGENT SPECS
        # all agents ID
-        self.unity_agent_IDS = decision_steps.agent_id
+        self.unity_agent_IDS = decisionSteps.agent_id
        # agents number
        self.unity_agent_num = len(self.unity_agent_IDS)

-        # all zero action
-        self.all_zero_action = np.zeros((self.unity_agent_num, self.unity_action_size))
-
-    def reset(self) -> Tuple[np.ndarray, List, List]:
-        """reset environment and get observations
+    def reset(self):
+        """reset environment and get observations

        Returns:
-            ndarray: next_state, reward, done, loadDir, saveNow
+            ndarray: nextState, reward, done
        """
        # reset env
        self.env.reset()
-        next_state, reward, done = self.get_steps()
-        return next_state, reward, done
+        nextState, reward, done = self.getSteps()
+        return nextState, reward, done

    # TODO:
    # delete all stack state DONE
-    # get-step State disassembly function DONE
+    # getstep State disassembly function DONE
    # delete agent selection function DONE
    # self.step action wrapper function DONE
    def step(
        self,
        actions: ndarray,
-    ) -> Tuple[np.ndarray, List, List]:
-        """change actions list to ActionTuple then send it to environment
+    ):
+        """change actions list to ActionTuple then send it to environment

        Args:
            actions (ndarray): PPO chooseAction output action list.(agentNum,actionNum)
@@ -99,36 +89,36 @@ class Aimbot(gym.Env):
        Returns:
            ndarray: nextState, reward, done
        """
-        # take action to environment
+        # take action to environment
        # return mextState,reward,done
        # discrete action
        if self.unity_dis_act_exist:
            # create discrete action from actions list
-            discrete_actions = actions[:, 0: self.unity_discrete_type]
+            discreteActions = actions[:, 0 : self.unity_discrete_type]
        else:
            # create empty discrete action
-            discrete_actions = np.asarray([[0]])
+            discreteActions = np.asarray([[0]])
        # continuous action
        if self.unity_con_act_exist:
            # create continuous actions from actions list
-            continuous_actions = actions[:, self.unity_discrete_type:]
+            continuousActions = actions[:, self.unity_discrete_type :]
        else:
            # create empty continuous action
-            continuous_actions = np.asanyarray([[0.0]])
+            continuousActions = np.asanyarray([[0.0]])

        # Dummy continuous action
        # continuousActions = np.asanyarray([[0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0]])
        # create actionTuple
-        this_action_tuple = ActionTuple(continuous=continuous_actions, discrete=discrete_actions)
+        thisActionTuple = ActionTuple(continuous=continuousActions, discrete=discreteActions)
        # take action to env
-        self.env.set_actions(behavior_name=self.unity_beha_name, action=this_action_tuple)
+        self.env.set_actions(behavior_name=self.unity_beha_name, action=thisActionTuple)
        self.env.step()
        # get nextState & reward & done after this action
-        next_states, rewards, dones = self.get_steps()
-        return next_states, rewards, dones
+        nextStates, rewards, dones = self.getSteps()
+        return nextStates, rewards, dones

-    def get_steps(self) -> Tuple[np.ndarray, List, List]:
-        """get environment now observations.
+    def getSteps(self):
+        """get environment now observations.
        Include State, Reward, Done

        Args:
@@ -137,96 +127,28 @@ class Aimbot(gym.Env):
            ndarray: nextState, reward, done
        """
        # get nextState & reward & done
-        decision_steps, terminal_steps = self.env.get_steps(self.unity_beha_name)
-        next_states = []
+        decisionSteps, terminalSteps = self.env.get_steps(self.unity_beha_name)
+        nextStates = []
        dones = []
        rewards = []
-        for this_agent_ID in self.unity_agent_IDS:
+        for thisAgentID in self.unity_agent_IDS:
            # while Episode over agentID will both in decisionSteps and terminalSteps.
            # avoid redundant state and reward,
            # use agentExist toggle to check if agent is already exist.
-            agent_exist = False
+            agentExist = False
            # game done
-            if this_agent_ID in terminal_steps:
-                next_states.append(terminal_steps[this_agent_ID].obs[0])
+            if thisAgentID in terminalSteps:
+                nextStates.append(terminalSteps[thisAgentID].obs[0])
                dones.append(True)
-                rewards.append(terminal_steps[this_agent_ID].reward)
-                agent_exist = True
+                rewards.append(terminalSteps[thisAgentID].reward)
+                agentExist = True
            # game not over yet and agent not in terminalSteps
-            if (this_agent_ID in decision_steps) and (not agent_exist):
-                next_states.append(decision_steps[this_agent_ID].obs[0])
+            if (thisAgentID in decisionSteps) and (not agentExist):
+                nextStates.append(decisionSteps[thisAgentID].obs[0])
                dones.append(False)
-                rewards.append(decision_steps[this_agent_ID].reward)
+                rewards.append(decisionSteps[thisAgentID].reward)

-        return np.asarray(next_states), rewards, dones
+        return np.asarray(nextStates), rewards, dones

    def close(self):
        self.env.close()
-
-
-class AimbotSideChannel(SideChannel):
-    def __init__(self, channel_id: uuid.UUID) -> None:
-        super().__init__(channel_id)
-
-    def on_message_received(self, msg: IncomingMessage) -> None:
-        """
-        Note: We must implement this method of the SideChannel interface to
-        receive messages from Unity
-        Message will be sent like this:
-        "Warning|Message1|Message2|Message3" or
-        "Error|Message1|Message2|Message3"
-        """
-        this_message = msg.read_string()
-        this_result = this_message.split("|")
-        print(this_result)
-        if this_result[0] == "Warning":
-            if this_result[1] == "Result":
-                airecorder.total_rounds[this_result[2]] += 1
-                if this_result[3] == "Win":
-                    airecorder.win_rounds[this_result[2]] += 1
-                # print(TotalRounds)
-                # print(WinRounds)
-        elif this_result[0] == "Error":
-            print(this_message)
-        # # while Message type is Warning
-        # if(thisResult[0] == "Warning"):
-        #     # while Message1 is result means one game is over
-        #     if (thisResult[1] == "Result"):
-        #         TotalRounds[thisResult[2]]+=1
-        #         # while Message3 is Win means this agent win this game
-        #         if(thisResult[3] == "Win"):
-        #             WinRounds[thisResult[2]]+=1
-        #     # while Message1 is GameState means this game is just start
-        #     # and tell python which game mode is
-        #     elif (thisResult[1] == "GameState"):
-        #         SCrecieved = 1
-        # # while Message type is Error
-        # elif(thisResult[0] == "Error"):
-        #     print(thisMessage)
-
-    # 发送函数
-    def send_string(self, data: str) -> None:
-        # send a string toC#
-        msg = OutgoingMessage()
-        msg.write_string(data)
-        super().queue_message_to_send(msg)
-
-    def send_bool(self, data: bool) -> None:
-        msg = OutgoingMessage()
-        msg.write_bool(data)
-        super().queue_message_to_send(msg)
-
-    def send_int(self, data: int) -> None:
-        msg = OutgoingMessage()
-        msg.write_int32(data)
-        super().queue_message_to_send(msg)
-
-    def send_float(self, data: float) -> None:
-        msg = OutgoingMessage()
-        msg.write_float32(data)
-        super().queue_message_to_send(msg)
-
-    def send_float_list(self, data: List[float]) -> None:
-        msg = OutgoingMessage()
-        msg.write_float32_list(data)
-        super().queue_message_to_send(msg)
@@ -1,769 +0,0 @@
-import argparse
-import wandb
-import time
-import numpy as np
-import random
-import uuid
-import torch
-import torch.nn as nn
-import torch.optim as optim
-import atexit
-
-from torchviz import make_dot, make_dot_from_trace
-from AimbotEnv import Aimbot
-from tqdm import tqdm
-from enum import Enum
-from torch.distributions.normal import Normal
-from torch.distributions.categorical import Categorical
-from distutils.util import strtobool
-from torch.utils.tensorboard import SummaryWriter
-from mlagents_envs.environment import UnityEnvironment
-from mlagents_envs.side_channel.side_channel import (
-    SideChannel,
-    IncomingMessage,
-    OutgoingMessage,
-)
-from typing import List
-
-bestReward = -1
-
-DEFAULT_SEED = 9331
-ENV_PATH = "../Build/Build-ParallelEnv-Target-OffPolicy-SingleStack-SideChannel-EndReward-Easy-V2.7-FreeOnly-NormalMapSize/Aimbot-ParallelEnv"
-SIDE_CHANNEL_UUID = uuid.UUID("8bbfb62a-99b4-457c-879d-b78b69066b5e")
-WAND_ENTITY = "koha9"
-WORKER_ID = 2
-BASE_PORT = 1111
-
-# max round steps per agent is 2500/Decision_period, 25 seconds
-# !!!check every parameters before run!!!
-
-TOTAL_STEPS = 3150000
-BATCH_SIZE = 1024
-MAX_TRAINNING_DATASETS = 6000
-DECISION_PERIOD = 1
-LEARNING_RATE = 5e-4
-GAMMA = 0.99
-GAE_LAMBDA = 0.95
-EPOCHS = 3
-CLIP_COEF = 0.11
-LOSS_COEF = [1.0, 1.0, 1.0, 1.0] # free go attack defence
-POLICY_COEF = [1.0, 1.0, 1.0, 1.0]
-ENTROPY_COEF = [0.05, 0.05, 0.05, 0.05]
-CRITIC_COEF = [0.5, 0.5, 0.5, 0.5]
-TARGET_LEARNING_RATE = 1e-6
-FREEZE_VIEW_NETWORK = False
-
-ANNEAL_LEARNING_RATE = True
-CLIP_VLOSS = True
-NORM_ADV = True
-TRAIN = True
-
-SAVE_MODEL = False
-WANDB_TACK = False
-LOAD_DIR = None
-#LOAD_DIR = "../PPO-Model/Aimbot_Target_Hybrid_PMNN_V2_OffPolicy_EndBC_9331_1670986948-freeonly-20/Aimbot_Target_Hybrid_PMNN_V2_OffPolicy_EndBC_9331_1670986948_0.7949778.pt"
-
-# public data
-class Targets(Enum):
-    Free = 0
-    Go = 1
-    Attack = 2
-    Defence = 3
-    Num = 4
-TARGET_STATE_SIZE = 6
-INAREA_STATE_SIZE = 1
-TIME_STATE_SIZE = 1
-GUN_STATE_SIZE = 1
-MY_STATE_SIZE = 4
-TOTAL_T_SIZE = TARGET_STATE_SIZE+INAREA_STATE_SIZE+TIME_STATE_SIZE+GUN_STATE_SIZE+MY_STATE_SIZE
-BASE_WINREWARD = 999
-BASE_LOSEREWARD = -999
-TARGETNUM= 4
-ENV_TIMELIMIT = 30
-RESULT_BROADCAST_RATIO = 1/ENV_TIMELIMIT
-TotalRounds = {"Free":0,"Go":0,"Attack":0}
-WinRounds = {"Free":0,"Go":0,"Attack":0}
-
-# !!!SPECIAL PARAMETERS!!!
-# change it while program is finished
-using_targets_num = 3
-
-
-def parse_args():
-    # fmt: off
-    # pytorch and environment parameters
-    parser = argparse.ArgumentParser()
-    parser.add_argument("--seed", type=int, default=DEFAULT_SEED,
-                        help="seed of the experiment")
-    parser.add_argument("--path", type=str, default=ENV_PATH,
-                        help="enviroment path")
-    parser.add_argument("--workerID", type=int, default=WORKER_ID,
-                        help="unity worker ID")
-    parser.add_argument("--baseport", type=int, default=BASE_PORT,
-                        help="port to connect to Unity environment")
-    parser.add_argument("--lr", type=float, default=LEARNING_RATE,
-                        help="the learning rate of optimizer")
-    parser.add_argument("--cuda", type=lambda x: bool(strtobool(x)), default=True, nargs="?", const=True,
-                        help="if toggled, cuda will be enabled by default")
-    parser.add_argument("--total-timesteps", type=int, default=TOTAL_STEPS,
-                        help="total timesteps of the experiments")
-
-    # model parameters
-    parser.add_argument("--train",type=lambda x: bool(strtobool(x)), default=TRAIN, nargs="?", const=True,
-                        help="Train Model or not")
-    parser.add_argument("--freeze-viewnet", type=lambda x: bool(strtobool(x)), default=FREEZE_VIEW_NETWORK, nargs="?", const=True,
-                        help="freeze view network or not")
-    parser.add_argument("--datasetSize", type=int, default=MAX_TRAINNING_DATASETS,
-                        help="training dataset size,start training while dataset collect enough data")
-    parser.add_argument("--minibatchSize", type=int, default=BATCH_SIZE,
-                        help="nimi batch size")
-    parser.add_argument("--epochs", type=int, default=EPOCHS,
-                        help="the K epochs to update the policy")
-    parser.add_argument("--annealLR", type=lambda x: bool(strtobool(x)), default=ANNEAL_LEARNING_RATE, nargs="?", const=True,
-                        help="Toggle learning rate annealing for policy and value networks")
-    parser.add_argument("--wandb-track", type=lambda x: bool(strtobool(x)), default=WANDB_TACK, nargs="?", const=True,
-                        help="track on the wandb")
-    parser.add_argument("--save-model", type=lambda x: bool(strtobool(x)), default=SAVE_MODEL, nargs="?", const=True,
-                        help="save model or not")
-    parser.add_argument("--wandb-entity", type=str, default=WAND_ENTITY,
-                        help="the entity (team) of wandb's project")
-    parser.add_argument("--load-dir", type=str, default=LOAD_DIR,
-                        help="load model directory")
-    parser.add_argument("--decision-period", type=int, default=DECISION_PERIOD,
-                        help="the number of steps to run in each environment per policy rollout")
-    parser.add_argument("--result-broadcast-ratio", type=float, default=RESULT_BROADCAST_RATIO,
-                        help="broadcast result when win round is reached,r=result-broadcast-ratio*remainTime")
-
-    # GAE loss
-    parser.add_argument("--gae", type=lambda x: bool(strtobool(x)), default=True, nargs="?", const=True,
-                        help="Use GAE for advantage computation")
-    parser.add_argument("--norm-adv", type=lambda x: bool(strtobool(x)), default=NORM_ADV, nargs="?", const=True,
-                        help="Toggles advantages normalization")
-    parser.add_argument("--gamma", type=float, default=GAMMA,
-                        help="the discount factor gamma")
-    parser.add_argument("--gaeLambda", type=float, default=GAE_LAMBDA,
-                        help="the lambda for the general advantage estimation")
-    parser.add_argument("--clip-coef", type=float, default=CLIP_COEF,
-                        help="the surrogate clipping coefficient")
-    parser.add_argument("--policy-coef", type=float, default=POLICY_COEF,
-                        help="coefficient of the policy")
-    parser.add_argument("--ent-coef", type=float, default=ENTROPY_COEF,
-                        help="coefficient of the entropy")
-    parser.add_argument("--critic-coef", type=float, default=CRITIC_COEF,
-                        help="coefficient of the value function")
-    parser.add_argument("--clip-vloss", type=lambda x: bool(strtobool(x)), default=CLIP_VLOSS, nargs="?", const=True,
-                        help="Toggles whether or not to use a clipped loss for the value function, as per the paper.")
-    parser.add_argument("--max-grad-norm", type=float, default=0.5,
-                        help="the maximum norm for the gradient clipping")
-    parser.add_argument("--target-kl", type=float, default=None,
-                        help="the target KL divergence threshold")
-    # fmt: on
-    args = parser.parse_args()
-    return args
-
-
-def layer_init(layer, std=np.sqrt(2), bias_const=0.0):
-    torch.nn.init.orthogonal_(layer.weight, std)
-    torch.nn.init.constant_(layer.bias, bias_const)
-    return layer
-
-
-class PPOAgent(nn.Module):
-    def __init__(self, env: Aimbot,targetNum:int):
-        super(PPOAgent, self).__init__()
-        self.targetNum = targetNum
-        self.stateSize = env.unity_observation_shape[0]
-        self.agentNum = env.unity_agent_num
-        self.targetSize = TARGET_STATE_SIZE
-        self.timeSize = TIME_STATE_SIZE
-        self.gunSize = GUN_STATE_SIZE
-        self.myStateSize = MY_STATE_SIZE
-        self.raySize = env.unity_observation_shape[0] - TOTAL_T_SIZE
-        self.nonRaySize = TOTAL_T_SIZE
-        self.head_input_size = env.unity_observation_shape[0] - self.targetSize-self.timeSize-self.gunSize# except target state input
-
-        self.discrete_size = env.unity_discrete_size
-        self.discrete_shape = list(env.unity_discrete_branches)
-        self.continuous_size = env.unity_continuous_size
-
-        self.viewNetwork = nn.Sequential(
-            layer_init(nn.Linear(self.raySize, 200)),
-            nn.Tanh()
-        )
-        self.targetNetworks = nn.ModuleList([nn.Sequential(
-            layer_init(nn.Linear(self.nonRaySize, 100)),
-            nn.Tanh()
-            )for i in range(targetNum)])
-        self.middleNetworks = nn.ModuleList([nn.Sequential(
-            layer_init(nn.Linear(300,200)),
-            nn.Tanh()
-            )for i in range(targetNum)])
-        self.actor_dis = nn.ModuleList([layer_init(nn.Linear(200, self.discrete_size), std=0.5) for i in range(targetNum)])
-        self.actor_mean = nn.ModuleList([layer_init(nn.Linear(200, self.continuous_size), std=0.5) for i in range(targetNum)])
-        # self.actor_logstd = nn.ModuleList([layer_init(nn.Linear(200, self.continuous_size), std=1) for i in range(targetNum)])
-        # self.actor_logstd = nn.Parameter(torch.zeros(1, self.continuous_size))
-        self.actor_logstd = nn.ParameterList([nn.Parameter(torch.zeros(1,self.continuous_size))for i in range(targetNum)]) # nn.Parameter(torch.zeros(1, self.continuous_size))
-        self.critic = nn.ModuleList([layer_init(nn.Linear(200, 1), std=1)for i in range(targetNum)])
-
-    def get_value(self, state: torch.Tensor):
-        target = state[:,0].to(torch.int32) # int
-        thisStateNum = target.size()[0]
-        viewInput = state[:,-self.raySize:] # all ray input
-        targetInput = state[:,:self.nonRaySize]
-        viewLayer = self.viewNetwork(viewInput)
-        targetLayer = torch.stack([self.targetNetworks[target[i]](targetInput[i]) for i in range(thisStateNum)])
-        middleInput = torch.cat([viewLayer,targetLayer],dim = 1)
-        middleLayer = torch.stack([self.middleNetworks[target[i]](middleInput[i]) for i in range(thisStateNum)])
-        criticV = torch.stack([self.critic[target[i]](middleLayer[i]) for i in range(thisStateNum)]) # self.critic
-        return criticV
-
-    def get_actions_value(self, state: torch.Tensor, actions=None):
-        target = state[:,0].to(torch.int32) # int
-        thisStateNum = target.size()[0]
-        viewInput = state[:,-self.raySize:] # all ray input
-        targetInput = state[:,:self.nonRaySize]
-        viewLayer = self.viewNetwork(viewInput)
-        targetLayer = torch.stack([self.targetNetworks[target[i]](targetInput[i]) for i in range(thisStateNum)])
-        middleInput = torch.cat([viewLayer,targetLayer],dim = 1)
-        middleLayer = torch.stack([self.middleNetworks[target[i]](middleInput[i]) for i in range(thisStateNum)])
-
-        # discrete
-        # 递归targets的数量,既agent数来实现根据target不同来选用对应的输出网络计算输出
-        dis_logits = torch.stack([self.actor_dis[target[i]](middleLayer[i]) for i in range(thisStateNum)])
-        split_logits = torch.split(dis_logits, self.discrete_shape, dim=1)
-        multi_categoricals = [Categorical(logits=thisLogits) for thisLogits in split_logits]
-        # continuous
-        actions_mean = torch.stack([self.actor_mean[target[i]](middleLayer[i]) for i in range(thisStateNum)]) # self.actor_mean(hidden)
-        # action_logstd = torch.stack([self.actor_logstd[target[i]](middleLayer[i]) for i in range(thisStateNum)]) # self.actor_logstd(hidden)
-        # action_logstd = self.actor_logstd.expand_as(actions_mean) # self.actor_logstd.expand_as(actions_mean)
-        action_logstd = torch.stack([torch.squeeze(self.actor_logstd[target[i]],0) for i in range(thisStateNum)])
-        # print(action_logstd)
-        action_std = torch.exp(action_logstd) # torch.exp(action_logstd)
-        con_probs = Normal(actions_mean, action_std)
-        # critic
-        criticV = torch.stack([self.critic[target[i]](middleLayer[i]) for i in range(thisStateNum)]) # self.critic
-
-        if actions is None:
-            if args.train:
-                # select actions base on probability distribution model
-                disAct = torch.stack([ctgr.sample() for ctgr in multi_categoricals])
-                conAct = con_probs.sample()
-                actions = torch.cat([disAct.T, conAct], dim=1)
-            else:
-                # select actions base on best probability distribution
-                disAct = torch.stack([torch.argmax(logit, dim=1) for logit in split_logits])
-                conAct = actions_mean
-                actions = torch.cat([disAct.T, conAct], dim=1)
-        else:
-            disAct = actions[:, 0 : env.unity_discrete_type].T
-            conAct = actions[:, env.unity_discrete_type :]
-        dis_log_prob = torch.stack(
-            [ctgr.log_prob(act) for act, ctgr in zip(disAct, multi_categoricals)]
-        )
-        dis_entropy = torch.stack([ctgr.entropy() for ctgr in multi_categoricals])
-        return (
-            actions,
-            dis_log_prob.sum(0),
-            dis_entropy.sum(0),
-            con_probs.log_prob(conAct).sum(1),
-            con_probs.entropy().sum(1),
-            criticV,
-        )
-
-
-def GAE(agent, args, rewards, dones, values, next_obs, next_done):
-    # GAE
-    with torch.no_grad():
-        next_value = agent.get_value(next_obs).reshape(1, -1)
-        data_size = rewards.size()[0]
-        if args.gae:
-            advantages = torch.zeros_like(rewards).to(device)
-            lastgaelam = 0
-            for t in reversed(range(data_size)):
-                if t == data_size - 1:
-                    nextnonterminal = 1.0 - next_done
-                    nextvalues = next_value
-                else:
-                    nextnonterminal = 1.0 - dones[t + 1]
-                    nextvalues = values[t + 1]
-                delta = rewards[t] + args.gamma * nextvalues * nextnonterminal - values[t]
-                advantages[t] = lastgaelam = (
-                    delta + args.gamma * args.gaeLambda * nextnonterminal * lastgaelam
-                )
-            returns = advantages + values
-        else:
-            returns = torch.zeros_like(rewards).to(device)
-            for t in reversed(range(data_size)):
-                if t == data_size - 1:
-                    nextnonterminal = 1.0 - next_done
-                    next_return = next_value
-                else:
-                    nextnonterminal = 1.0 - dones[t + 1]
-                    next_return = returns[t + 1]
-                returns[t] = rewards[t] + args.gamma * nextnonterminal * next_return
-            advantages = returns - values
-    return advantages, returns
-
-class AimbotSideChannel(SideChannel):
-    def __init__(self, channel_id: uuid.UUID) -> None:
-        super().__init__(channel_id)
-    def on_message_received(self, msg: IncomingMessage) -> None:
-        """
-        Note: We must implement this method of the SideChannel interface to
-        receive messages from Unity
-        """
-        thisMessage = msg.read_string()
-        # print(thisMessage)
-        thisResult = thisMessage.split("|")
-        if(thisResult[0] == "result"):
-            TotalRounds[thisResult[1]]+=1
-            if(thisResult[2] == "Win"):
-                WinRounds[thisResult[1]]+=1
-            #print(TotalRounds)
-            #print(WinRounds)
-        elif(thisResult[0] == "Error"):
-            print(thisMessage)
-	# 发送函数
-    def send_string(self, data: str) -> None:
-        # send a string toC#
-        msg = OutgoingMessage()
-        msg.write_string(data)
-        super().queue_message_to_send(msg)
-
-    def send_bool(self, data: bool) -> None:
-        msg = OutgoingMessage()
-        msg.write_bool(data)
-        super().queue_message_to_send(msg)
-
-    def send_int(self, data: int) -> None:
-        msg = OutgoingMessage()
-        msg.write_int32(data)
-        super().queue_message_to_send(msg)
-
-    def send_float(self, data: float) -> None:
-        msg = OutgoingMessage()
-        msg.write_float32(data)
-        super().queue_message_to_send(msg)
-
-    def send_float_list(self, data: List[float]) -> None:
-        msg = OutgoingMessage()
-        msg.write_float32_list(data)
-        super().queue_message_to_send(msg)
-
-def broadCastEndReward(rewardBF:list,remainTime:float):
-    thisRewardBF = rewardBF
-    if (rewardBF[-1]<=-500):
-        # print("Lose DO NOT BROAD CAST",rewardBF[-1])
-        thisRewardBF[-1] = rewardBF[-1]-BASE_LOSEREWARD
-        thisRewardBF = thisRewardBF
-    elif (rewardBF[-1]>=500):
-        # print("Win! Broadcast reward!",rewardBF[-1])
-        thisRewardBF[-1] = rewardBF[-1]-BASE_WINREWARD
-        thisRewardBF = (np.asarray(thisRewardBF)+(remainTime*args.result_broadcast_ratio)).tolist()
-    else:
-        print("!!!!!DIDNT GET RESULT REWARD!!!!!!",rewardBF[-1])
-    return torch.Tensor(thisRewardBF).to(device)
-
-
-if __name__ == "__main__":
-    args = parse_args()
-    random.seed(args.seed)
-    np.random.seed(args.seed)
-    torch.manual_seed(args.seed)
-
-    device = torch.device("cuda" if torch.cuda.is_available() and args.cuda else "cpu")
-
-    # Initialize environment anget optimizer
-    aimBotsideChannel = AimbotSideChannel(SIDE_CHANNEL_UUID);
-    env = Aimbot(envPath=args.path, workerID=args.workerID, basePort=args.baseport,side_channels=[aimBotsideChannel])
-    if args.load_dir is None:
-        agent = PPOAgent(env,TARGETNUM).to(device)
-    else:
-        agent = torch.load(args.load_dir)
-        # freeze 
-        if args.freeze_viewnet:
-            # freeze the view network
-            for p in agent.viewNetwork.parameters():
-                p.requires_grad = False
-            print("VIEW NETWORK FREEZED")
-        print("Load Agent", args.load_dir)
-        print(agent.eval())
-
-    optimizer = optim.Adam(agent.parameters(), lr=args.lr, eps=1e-5)
-
-    # Tensorboard and WandB Recorder
-    game_name = "Aimbot_Target_Hybrid_PMNN_V2"
-    game_type = "OffPolicy_EndBC"
-    run_name = f"{game_name}_{game_type}_{args.seed}_{int(time.time())}"
-    if args.wandb_track:
-        wandb.init(
-            project=game_name,
-            entity=args.wandb_entity,
-            sync_tensorboard=True,
-            config=vars(args),
-            name=run_name,
-            monitor_gym=True,
-            save_code=True,
-        )
-
-    writer = SummaryWriter(f"runs/{run_name}")
-    writer.add_text(
-        "hyperparameters",
-        "|param|value|\n|-|-|\n%s"
-        % ("\n".join([f"|{key}|{value}|" for key, value in vars(args).items()])),
-    )
-
-    @atexit.register
-    def save_model():
-        # save model while exit
-        saveDir = "../PPO-Model/"+ run_name + "_last.pt"
-        torch.save(agent, saveDir)
-        print("save model to " + saveDir)
-
-    # Trajectory Buffer
-    ob_bf = [[] for i in range(env.unity_agent_num)]
-    act_bf = [[] for i in range(env.unity_agent_num)]
-    dis_logprobs_bf = [[] for i in range(env.unity_agent_num)]
-    con_logprobs_bf = [[] for i in range(env.unity_agent_num)]
-    rewards_bf = [[] for i in range(env.unity_agent_num)]
-    dones_bf = [[] for i in range(env.unity_agent_num)]
-    values_bf = [[] for i in range(env.unity_agent_num)]
-
-    # start the game
-    total_update_step = using_targets_num * args.total_timesteps // args.datasetSize
-    target_steps = [0 for i in range(TARGETNUM)]
-    start_time = time.time()
-    state, _, done = env.reset()
-    # state = torch.Tensor(next_obs).to(device)
-    # next_done = torch.zeros(env.unity_agent_num).to(device)
-
-    # initialize empty training datasets
-    obs = [torch.tensor([]).to(device) for i in range(TARGETNUM)]  # (TARGETNUM,n,env.unity_observation_size)
-    actions = [torch.tensor([]).to(device) for i in range(TARGETNUM)]  # (TARGETNUM,n,env.unity_action_size)
-    dis_logprobs = [torch.tensor([]).to(device) for i in range(TARGETNUM)]  # (TARGETNUM,n,1)
-    con_logprobs = [torch.tensor([]).to(device) for i in range(TARGETNUM)]  # (TARGETNUM,n,1)
-    rewards = [torch.tensor([]).to(device) for i in range(TARGETNUM)]  # (TARGETNUM,n,1)
-    values = [torch.tensor([]).to(device) for i in range(TARGETNUM)]  # (TARGETNUM,n,1)
-    advantages = [torch.tensor([]).to(device) for i in range(TARGETNUM)]  # (TARGETNUM,n,1)
-    returns = [torch.tensor([]).to(device) for i in range(TARGETNUM)]  # (TARGETNUM,n,1)
-
-    vis_graph = make_dot(agent.get_actions_value(
-                        torch.Tensor(state).to(device)
-                    ), params=dict(agent.named_parameters()))
-    vis_graph.view()  # 会在当前目录下保存一个“Digraph.gv.pdf”文件，并在默认浏览器中打开
-    
-    with torch.onnx.set_training(agent, False):
-        trace, _ = torch.jit.get_trace_graph(agent, args=(torch.Tensor(state).to(device),))
-    make_dot_from_trace(trace)
-    raise
-    
-    for total_steps in range(total_update_step):
-        # discunt learning rate, while step == total_update_step lr will be 0
-
-        if args.annealLR:
-            finalRatio = TARGET_LEARNING_RATE/args.lr
-            frac = 1.0 - ((total_steps + 1.0) / total_update_step)
-            lrnow = frac * args.lr
-            optimizer.param_groups[0]["lr"] = lrnow
-        else:
-            lrnow = args.lr
-        print("new episode",total_steps,"learning rate = ",lrnow)
-
-
-        # MAIN LOOP: run agent in environment
-        step = 0
-        training = False
-        trainQueue = []
-        last_reward = [0.for i in range(env.unity_agent_num)]
-        while True:
-            if step % args.decision_period == 0:
-                step += 1
-                # Choose action by agent
-
-                with torch.no_grad():
-                    # predict actions
-                    action, dis_logprob, _, con_logprob, _, value = agent.get_actions_value(
-                        torch.Tensor(state).to(device)
-                    )
-                    value = value.flatten()
-
-                # variable from GPU to CPU
-                action_cpu = action.cpu().numpy()
-                dis_logprob_cpu = dis_logprob.cpu().numpy()
-                con_logprob_cpu = con_logprob.cpu().numpy()
-                value_cpu = value.cpu().numpy()
-                # Environment step
-                next_state, reward, next_done = env.step(action_cpu)
-
-                # save memories
-                for i in range(env.unity_agent_num):
-                    # save memories to buffers
-                    ob_bf[i].append(state[i])
-                    act_bf[i].append(action_cpu[i])
-                    dis_logprobs_bf[i].append(dis_logprob_cpu[i])
-                    con_logprobs_bf[i].append(con_logprob_cpu[i])
-                    rewards_bf[i].append(reward[i]+last_reward[i])
-                    dones_bf[i].append(done[i])
-                    values_bf[i].append(value_cpu[i])
-                    remainTime = state[i,TARGET_STATE_SIZE]
-                    if next_done[i] == True:
-                        # finished a round, send finished memories to training datasets
-                        # compute advantage and discounted reward
-                        #print(i,"over")
-                        roundTargetType = int(state[i,0])
-                        thisRewardsTensor = broadCastEndReward(rewards_bf[i],remainTime)
-                        adv, rt = GAE(
-                            agent,
-                            args,
-                            thisRewardsTensor,
-                            torch.Tensor(dones_bf[i]).to(device),
-                            torch.tensor(values_bf[i]).to(device),
-                            torch.tensor(next_state[i]).to(device).unsqueeze(0),
-                            torch.Tensor([next_done[i]]).to(device),
-                        )
-                        # send memories to training datasets
-                        obs[roundTargetType] = torch.cat((obs[roundTargetType], torch.tensor(ob_bf[i]).to(device)), 0)
-                        actions[roundTargetType] = torch.cat((actions[roundTargetType], torch.tensor(act_bf[i]).to(device)), 0)
-                        dis_logprobs[roundTargetType] = torch.cat(
-                            (dis_logprobs[roundTargetType], torch.tensor(dis_logprobs_bf[i]).to(device)), 0
-                        )
-                        con_logprobs[roundTargetType] = torch.cat(
-                            (con_logprobs[roundTargetType], torch.tensor(con_logprobs_bf[i]).to(device)), 0
-                        )
-                        rewards[roundTargetType] = torch.cat((rewards[roundTargetType], thisRewardsTensor), 0)
-                        values[roundTargetType] = torch.cat((values[roundTargetType], torch.tensor(values_bf[i]).to(device)), 0)
-                        advantages[roundTargetType] = torch.cat((advantages[roundTargetType], adv), 0)
-                        returns[roundTargetType] = torch.cat((returns[roundTargetType], rt), 0)
-
-                        # clear buffers
-                        ob_bf[i] = []
-                        act_bf[i] = []
-                        dis_logprobs_bf[i] = []
-                        con_logprobs_bf[i] = []
-                        rewards_bf[i] = []
-                        dones_bf[i] = []
-                        values_bf[i] = []
-                        print(f"train dataset {Targets(roundTargetType).name} added:{obs[roundTargetType].size()[0]}/{args.datasetSize}")
-
-                for i in range(TARGETNUM):
-                    if obs[i].size()[0] >= args.datasetSize:
-                        # start train NN
-                        trainQueue.append(i)
-                if(len(trainQueue)>0):
-                    break
-                state, done = next_state, next_done
-            else:
-                step += 1
-                # skip this step use last predict action
-                next_state, reward, next_done = env.step(action_cpu)
-                # save memories
-                for i in range(env.unity_agent_num):
-                    if next_done[i] == True:
-                        #print(i,"over???")
-                        # save memories to buffers
-                        ob_bf[i].append(state[i])
-                        act_bf[i].append(action_cpu[i])
-                        dis_logprobs_bf[i].append(dis_logprob_cpu[i])
-                        con_logprobs_bf[i].append(con_logprob_cpu[i])
-                        rewards_bf[i].append(reward[i])
-                        dones_bf[i].append(done[i])
-                        values_bf[i].append(value_cpu[i])
-                        remainTime = state[i,TARGET_STATE_SIZE]
-                        # finished a round, send finished memories to training datasets
-                        # compute advantage and discounted reward
-                        roundTargetType = int(state[i,0])
-                        thisRewardsTensor = broadCastEndReward(rewards_bf[i],remainTime)
-                        adv, rt = GAE(
-                            agent,
-                            args,
-                            thisRewardsTensor,
-                            torch.Tensor(dones_bf[i]).to(device),
-                            torch.tensor(values_bf[i]).to(device),
-                            torch.Tensor(next_state[i]).to(device).unsqueeze(dim = 0),
-                            torch.Tensor([next_done[i]]).to(device),
-                        )
-                        # send memories to training datasets
-                        obs[roundTargetType] = torch.cat((obs[roundTargetType], torch.tensor(ob_bf[i]).to(device)), 0)
-                        actions[roundTargetType] = torch.cat((actions[roundTargetType], torch.tensor(act_bf[i]).to(device)), 0)
-                        dis_logprobs[roundTargetType] = torch.cat(
-                            (dis_logprobs[roundTargetType], torch.tensor(dis_logprobs_bf[i]).to(device)), 0
-                        )
-                        con_logprobs[roundTargetType] = torch.cat(
-                            (con_logprobs[roundTargetType], torch.tensor(con_logprobs_bf[i]).to(device)), 0
-                        )
-                        rewards[roundTargetType] = torch.cat((rewards[roundTargetType], thisRewardsTensor), 0)
-                        values[roundTargetType] = torch.cat((values[roundTargetType], torch.tensor(values_bf[i]).to(device)), 0)
-                        advantages[roundTargetType] = torch.cat((advantages[roundTargetType], adv), 0)
-                        returns[roundTargetType] = torch.cat((returns[roundTargetType], rt), 0)
-
-                        # clear buffers
-                        ob_bf[i] = []
-                        act_bf[i] = []
-                        dis_logprobs_bf[i] = []
-                        con_logprobs_bf[i] = []
-                        rewards_bf[i] = []
-                        dones_bf[i] = []
-                        values_bf[i] = []
-                        print(f"train dataset {Targets(roundTargetType).name} added:{obs[roundTargetType].size()[0]}/{args.datasetSize}")
-
-                state = next_state
-                last_reward = reward
-            i += 1
-
-        if args.train:
-            meanRewardList = [] # for WANDB
-            # loop all tarining queue
-            for thisT in trainQueue:
-                target_steps[thisT]+=1
-                # flatten the batch
-                b_obs = obs[thisT].reshape((-1,) + env.unity_observation_shape)
-                b_dis_logprobs = dis_logprobs[thisT].reshape(-1)
-                b_con_logprobs = con_logprobs[thisT].reshape(-1)
-                b_actions = actions[thisT].reshape((-1,) + (env.unity_action_size,))
-                b_advantages = advantages[thisT].reshape(-1)
-                b_returns = returns[thisT].reshape(-1)
-                b_values = values[thisT].reshape(-1)
-                b_size = b_obs.size()[0]
-                # Optimizing the policy and value network
-                b_inds = np.arange(b_size)
-                # clipfracs = []
-                for epoch in range(args.epochs):
-                    print(epoch,end="")
-                    # shuffle all datasets
-                    np.random.shuffle(b_inds)
-                    for start in range(0, b_size, args.minibatchSize):
-                        print(".",end="")
-                        end = start + args.minibatchSize
-                        mb_inds = b_inds[start:end]
-                        if(np.size(mb_inds)<=1):
-                            break
-                        mb_advantages = b_advantages[mb_inds]
-
-                        # normalize advantages
-                        if args.norm_adv:
-                            mb_advantages = (mb_advantages - mb_advantages.mean()) / (
-                                mb_advantages.std() + 1e-8
-                            )
-
-                        (
-                            _,
-                            new_dis_logprob,
-                            dis_entropy,
-                            new_con_logprob,
-                            con_entropy,
-                            newvalue,
-                        ) = agent.get_actions_value(b_obs[mb_inds], b_actions[mb_inds])
-                        # discrete ratio
-                        dis_logratio = new_dis_logprob - b_dis_logprobs[mb_inds]
-                        dis_ratio = dis_logratio.exp()
-                        # continuous ratio
-                        con_logratio = new_con_logprob - b_con_logprobs[mb_inds]
-                        con_ratio = con_logratio.exp()
-
-                        """
-                        # early stop
-                        with torch.no_grad():
-                            # calculate approx_kl http://joschu.net/blog/kl-approx.html
-                            old_approx_kl = (-logratio).mean()
-                            approx_kl = ((ratio - 1) - logratio).mean()
-                            clipfracs += [((ratio - 1.0).abs() > args.clip_coef).float().mean().item()]
-                        """
-
-                        # discrete Policy loss
-                        dis_pg_loss_orig = -mb_advantages * dis_ratio
-                        dis_pg_loss_clip = -mb_advantages * torch.clamp(
-                            dis_ratio, 1 - args.clip_coef, 1 + args.clip_coef
-                        )
-                        dis_pg_loss = torch.max(dis_pg_loss_orig, dis_pg_loss_clip).mean()
-                        # continuous Policy loss
-                        con_pg_loss_orig = -mb_advantages * con_ratio
-                        con_pg_loss_clip = -mb_advantages * torch.clamp(
-                            con_ratio, 1 - args.clip_coef, 1 + args.clip_coef
-                        )
-                        con_pg_loss = torch.max(con_pg_loss_orig, con_pg_loss_clip).mean()
-
-                        # Value loss
-                        newvalue = newvalue.view(-1)
-                        if args.clip_vloss:
-                            v_loss_unclipped = (newvalue - b_returns[mb_inds]) ** 2
-                            v_clipped = b_values[mb_inds] + torch.clamp(
-                                newvalue - b_values[mb_inds],
-                                -args.clip_coef,
-                                args.clip_coef,
-                            )
-                            v_loss_clipped = (v_clipped - b_returns[mb_inds]) ** 2
-                            v_loss_max = torch.max(v_loss_unclipped, v_loss_clipped)
-                            v_loss = 0.5 * v_loss_max.mean()
-                        else:
-                            v_loss = 0.5 * ((newvalue - b_returns[mb_inds]) ** 2).mean()
-
-                        # total loss
-                        entropy_loss = dis_entropy.mean() + con_entropy.mean()
-                        loss = (
-                            dis_pg_loss * POLICY_COEF[thisT]
-                            + con_pg_loss * POLICY_COEF[thisT]
-                            + entropy_loss * ENTROPY_COEF[thisT]
-                            + v_loss * CRITIC_COEF[thisT]
-                        )*LOSS_COEF[thisT]
-
-                        if(torch.isnan(loss).any()):
-                            print("LOSS Include NAN!!!")
-                            if(torch.isnan(dis_pg_loss.any())):
-                                print("dis_pg_loss include nan")
-                            if(torch.isnan(con_pg_loss.any())):
-                                print("con_pg_loss include nan")
-                            if(torch.isnan(entropy_loss.any())):
-                                print("entropy_loss include nan")
-                            if(torch.isnan(v_loss.any())):
-                                print("v_loss include nan")
-                            raise
-
-                        optimizer.zero_grad()
-                        loss.backward()
-                        # Clips gradient norm of an iterable of parameters.
-                        nn.utils.clip_grad_norm_(agent.parameters(), args.max_grad_norm)
-                        optimizer.step()
-
-                    """
-                    if args.target_kl is not None:
-                        if approx_kl > args.target_kl:
-                            break
-                    """
-                # record mean reward before clear history
-                print("done")
-                targetRewardMean = np.mean(rewards[thisT].to("cpu").detach().numpy().copy())
-                meanRewardList.append(targetRewardMean)
-                targetName = Targets(thisT).name
-
-                # clear this target trainning set buffer
-                obs[thisT] = torch.tensor([]).to(device)
-                actions[thisT] = torch.tensor([]).to(device)
-                dis_logprobs[thisT] = torch.tensor([]).to(device)
-                con_logprobs[thisT] = torch.tensor([]).to(device)
-                rewards[thisT] = torch.tensor([]).to(device)
-                values[thisT] = torch.tensor([]).to(device)
-                advantages[thisT] = torch.tensor([]).to(device)
-                returns[thisT] = torch.tensor([]).to(device)
-
-                # record rewards for plotting purposes
-                writer.add_scalar(f"Target{targetName}/value_loss", v_loss.item(), target_steps[thisT])
-                writer.add_scalar(f"Target{targetName}/dis_policy_loss", dis_pg_loss.item(), target_steps[thisT])
-                writer.add_scalar(f"Target{targetName}/con_policy_loss", con_pg_loss.item(), target_steps[thisT])
-                writer.add_scalar(f"Target{targetName}/total_loss", loss.item(), target_steps[thisT])
-                writer.add_scalar(f"Target{targetName}/entropy_loss", entropy_loss.item(), target_steps[thisT])
-                writer.add_scalar(f"Target{targetName}/Reward", targetRewardMean, target_steps[thisT])
-                writer.add_scalar(f"Target{targetName}/WinRatio", WinRounds[targetName]/TotalRounds[targetName], target_steps[thisT])
-                print(f"episode over Target{targetName} mean reward:", targetRewardMean)
-            TotalRewardMean = np.mean(meanRewardList)
-            writer.add_scalar("GlobalCharts/TotalRewardMean", TotalRewardMean, total_steps)
-            writer.add_scalar("GlobalCharts/learning_rate", optimizer.param_groups[0]["lr"], total_steps)
-            # New Record!
-            if TotalRewardMean > bestReward and args.save_model:
-                bestReward = targetRewardMean
-                saveDir = "../PPO-Model/" + run_name +"_"+ str(TotalRewardMean) + ".pt"
-                torch.save(agent, saveDir)
-
-    saveDir = "../PPO-Model/"+ run_name + "_last.pt"
-    torch.save(agent, saveDir)
-    env.close()
-    writer.close()
@@ -1,256 +0,0 @@
-import time
-import numpy as np
-import random
-import uuid
-import torch
-import atexit
-
-from aimbotEnv import Aimbot
-from aimbotEnv import AimbotSideChannel
-from ppoagent import PPOAgent
-from airecorder import WandbRecorder
-from aimemory import PPOMem
-from aimemory import Targets
-from arguments import parse_args
-import torch.optim as optim
-
-# side channel uuid
-SIDE_CHANNEL_UUID = uuid.UUID("8bbfb62a-99b4-457c-879d-b78b69066b5e")
-# tensorboard names
-GAME_NAME = "Aimbot_Hybrid_V3"
-GAME_TYPE = "Mix_Verification"
-
-if __name__ == "__main__":
-    args = parse_args()
-    random.seed(args.seed)
-    np.random.seed(args.seed)
-    torch.manual_seed(args.seed)
-
-    device = torch.device("cuda" if torch.cuda.is_available() and args.cuda else "cpu")
-    best_reward = -1
-
-    # Initialize environment agent optimizer
-    aimbot_side_channel = AimbotSideChannel(SIDE_CHANNEL_UUID)
-    env = Aimbot(
-        env_path=args.path,
-        worker_id=args.workerID,
-        base_port=args.baseport,
-        side_channels=[aimbot_side_channel])
-    if args.load_dir is None:
-        agent = PPOAgent(
-            env=env,
-            this_args=args,
-            device=device,
-        ).to(device)
-    else:
-        agent = torch.load(args.load_dir)
-        # freeze
-        if args.freeze_viewnet:
-            # freeze the view network
-            for p in agent.viewNetwork.parameters():
-                p.requires_grad = False
-            print("VIEW NETWORK FREEZE")
-        print("Load Agent", args.load_dir)
-        print(agent.eval())
-    # optimizer
-    optimizer = optim.Adam(agent.parameters(), lr=args.lr, eps=1e-5)
-    # Tensorboard and WandB Recorder
-    run_name = f"{GAME_TYPE}_{args.seed}_{int(time.time())}"
-    wdb_recorder = WandbRecorder(GAME_NAME, GAME_TYPE, run_name, args)
-
-    @atexit.register
-    def save_model():
-        # close env
-        env.close()
-        if args.save_model:
-            # save model while exit
-            save_dir = "../PPO-Model/" + run_name + "_last.pt"
-            torch.save(agent, save_dir)
-            print("save model to " + save_dir)
-
-    # start the game
-    total_update_step = args.target_num * args.total_timesteps // args.datasetSize
-    target_steps = [0 for i in range(args.target_num)]
-    start_time = time.time()
-    state, _, done = env.reset()
-
-    # initialize AI memories
-    ppo_memories = PPOMem(
-        args=args,
-        unity_agent_num=env.unity_agent_num,
-        device=device,
-    )
-
-    # MAIN LOOP: run agent in environment
-    for total_steps in range(total_update_step):
-        # discount learning rate, while step == total_update_step lr will be 0
-        if args.annealLR:
-            final_lr_ratio = args.target_lr / args.lr
-            frac = 1.0 - ((total_steps + 1.0) / total_update_step)
-            lr_now = frac * args.lr
-            optimizer.param_groups[0]["lr"] = lr_now
-        else:
-            lr_now = args.lr
-
-        # episode start show learning rate
-        print("new episode", total_steps, "learning rate = ", lr_now)
-        step = 0
-        training = False
-        train_queue = []
-        last_reward = [0. for i in range(env.unity_agent_num)]
-        # MAIN LOOP: run agent in environment
-        while True:
-            # Target Type(state[0][0]) is stay(4),use all zero action
-            if state[0][0] == 4:
-                next_state, reward, next_done = env.step(env.all_zero_action)
-                state, done = next_state, next_done
-                continue
-            # On decision point, and Target Type(state[0][0]) is not stay(4) choose action by agent
-            if step % args.decision_period == 0:
-                step += 1
-                # Choose action by agent
-                with torch.no_grad():
-                    # predict actions
-                    action, dis_logprob, _, con_logprob, _, value = agent.get_actions_value(
-                        torch.tensor(state,dtype=torch.float32).to(device)
-                    )
-                    value = value.flatten()
-
-                # variable from GPU to CPU
-                action_cpu = action.cpu().numpy()
-                dis_logprob_cpu = dis_logprob.cpu().numpy()
-                con_logprob_cpu = con_logprob.cpu().numpy()
-                value_cpu = value.cpu().numpy()
-                # Environment step
-                next_state, reward, next_done = env.step(action_cpu)
-
-                # save memories
-                if args.train:
-                    ppo_memories.save_memories(
-                        now_step=step,
-                        agent=agent,
-                        state=state,
-                        action_cpu=action_cpu,
-                        dis_logprob_cpu=dis_logprob_cpu,
-                        con_logprob_cpu=con_logprob_cpu,
-                        reward=reward,
-                        done=done,
-                        value_cpu=value_cpu,
-                        last_reward=last_reward,
-                        next_done=next_done,
-                        next_state=next_state,
-                    )
-                    # check if any training dataset is full and ready to train
-                    for i in range(args.target_num):
-                        if ppo_memories.obs[i].size()[0] >= args.datasetSize:
-                            # start train NN
-                            train_queue.append(i)
-                    if len(train_queue) > 0:
-                        # break while loop and start train
-                        break
-                    # update state
-                state, done = next_state, next_done
-            else:
-                step += 1
-                # skip this step use last predict action
-                next_state, reward, next_done = env.step(action_cpu)
-                # save memories
-                if args.train:
-                    ppo_memories.save_memories(
-                        now_step=step,
-                        agent=agent,
-                        state=state,
-                        action_cpu=action_cpu,
-                        dis_logprob_cpu=dis_logprob_cpu,
-                        con_logprob_cpu=con_logprob_cpu,
-                        reward=reward,
-                        done=done,
-                        value_cpu=value_cpu,
-                        last_reward=last_reward,
-                        next_done=next_done,
-                        next_state=next_state,
-                    )
-                    # update state
-                    state = next_state
-                    last_reward = reward
-
-        if args.train:
-            # train mode on
-            mean_reward_list = []  # for WANDB
-            # loop all training queue
-            for this_train_ind in train_queue:
-                # start time
-                start_time = time.time()
-                target_steps[this_train_ind] += 1
-                # train agent
-                (
-                    v_loss,
-                    dis_pg_loss,
-                    con_pg_loss,
-                    loss,
-                    entropy_loss
-                ) = agent.train_net(
-                    this_train_ind=this_train_ind,
-                    ppo_memories=ppo_memories,
-                    optimizer=optimizer
-                )
-                # record mean reward before clear history
-                print("done")
-                target_reward_mean = np.mean(ppo_memories.rewards[this_train_ind].to("cpu").detach().numpy().copy())
-                mean_reward_list.append(target_reward_mean)
-                targetName = Targets(this_train_ind).name
-
-                # clear this target training set buffer
-                ppo_memories.clear_training_datasets(this_train_ind)
-                # record rewards for plotting purposes
-                wdb_recorder.add_target_scalar(
-                    targetName,
-                    this_train_ind,
-                    v_loss,
-                    dis_pg_loss,
-                    con_pg_loss,
-                    loss,
-                    entropy_loss,
-                    target_reward_mean,
-                    target_steps,
-                )
-                print(f"episode over Target{targetName} mean reward:", target_reward_mean)
-            TotalRewardMean = np.mean(mean_reward_list)
-            wdb_recorder.add_global_scalar(
-                TotalRewardMean,
-                optimizer.param_groups[0]["lr"],
-                total_steps,
-            )
-            # print cost time as seconds
-            print("cost time:", time.time() - start_time)
-            # New Record!
-            if TotalRewardMean > best_reward and args.save_model:
-                best_reward = target_reward_mean
-                saveDir = "../PPO-Model/" + run_name + "_" + str(TotalRewardMean) + ".pt"
-                torch.save(agent, saveDir)
-        else:
-            # train mode off
-            mean_reward_list = []  # for WANDB
-            # while not in training mode, clear the buffer
-            for this_train_ind in train_queue:
-                target_steps[this_train_ind] += 1
-                targetName = Targets(this_train_ind).name
-                target_reward_mean = np.mean(ppo_memories.rewards[this_train_ind].to("cpu").detach().numpy().copy())
-                mean_reward_list.append(target_reward_mean)
-                print(target_steps[this_train_ind])
-
-                # clear this target training set buffer
-                ppo_memories.clear_training_datasets(this_train_ind)
-
-                # record rewards for plotting purposes
-                wdb_recorder.writer.add_scalar(f"Target{targetName}/Reward", target_reward_mean,
-                                               target_steps[this_train_ind])
-                wdb_recorder.add_win_ratio(targetName, target_steps[this_train_ind])
-                print(f"episode over Target{targetName} mean reward:", target_reward_mean)
-            TotalRewardMean = np.mean(mean_reward_list)
-            wdb_recorder.writer.add_scalar("GlobalCharts/TotalRewardMean", TotalRewardMean, total_steps)
-
-    saveDir = "../PPO-Model/" + run_name + "_last.pt"
-    torch.save(agent, saveDir)
-    env.close()
-    wdb_recorder.writer.close()
@@ -1,142 +0,0 @@
-import torch
-import numpy as np
-import argparse
-from ppoagent import PPOAgent
-from enum import Enum
-
-# public data
-class Targets(Enum):
-    Free = 0
-    Go = 1
-    Attack = 2
-    Defence = 3
-    Num = 4
-
-class PPOMem:
-    def __init__(
-        self,
-        args: argparse.Namespace,
-        unity_agent_num: int,
-        device: torch.device,
-    ) -> None:
-        self.target_num = args.target_num
-        self.data_set_size = args.datasetSize
-        self.result_broadcast_ratio = args.result_broadcast_ratio
-        self.decision_period = args.decision_period
-        self.unity_agent_num = unity_agent_num
-
-        self.base_lose_reward = args.base_lose_reward
-        self.base_win_reward = args.base_win_reward
-        self.target_state_size = args.target_state_size
-        self.device = device
-
-        # Trajectory Buffer
-        self.ob_bf = [[] for i in range(self.unity_agent_num)]
-        self.act_bf = [[] for i in range(self.unity_agent_num)]
-        self.dis_logprobs_bf = [[] for i in range(self.unity_agent_num)]
-        self.con_logprobs_bf = [[] for i in range(self.unity_agent_num)]
-        self.rewards_bf = [[] for i in range(self.unity_agent_num)]
-        self.dones_bf = [[] for i in range(self.unity_agent_num)]
-        self.values_bf = [[] for i in range(self.unity_agent_num)]
-
-        # initialize empty training datasets
-        self.obs = [torch.tensor([]).to(device) for i in range(self.target_num)]  # (TARGETNUM,n,env.unity_observation_size)
-        self.actions = [torch.tensor([]).to(device) for i in range(self.target_num)]  # (TARGETNUM,n,env.unity_action_size)
-        self.dis_logprobs = [torch.tensor([]).to(device) for i in range(self.target_num)]  # (TARGETNUM,n,1)
-        self.con_logprobs = [torch.tensor([]).to(device) for i in range(self.target_num)]  # (TARGETNUM,n,1)
-        self.rewards = [torch.tensor([]).to(device) for i in range(self.target_num)]  # (TARGETNUM,n,1)
-        self.values = [torch.tensor([]).to(device) for i in range(self.target_num)]  # (TARGETNUM,n,1)
-        self.advantages = [torch.tensor([]).to(device) for i in range(self.target_num)]  # (TARGETNUM,n,1)
-        self.returns = [torch.tensor([]).to(device) for i in range(self.target_num)]  # (TARGETNUM,n,1)
-
-    def broad_cast_end_reward(self, rewardBF: list, remainTime: float) -> torch.Tensor:
-        thisRewardBF = rewardBF.copy()
-        if rewardBF[-1] <= -500:
-            # print("Lose DO NOT BROAD CAST",rewardBF[-1])
-            thisRewardBF[-1] = rewardBF[-1] - self.base_lose_reward
-        elif rewardBF[-1] >= 500:
-            # print("Win! Broadcast reward!",rewardBF[-1])
-            print(sum(thisRewardBF) / len(thisRewardBF))
-            thisRewardBF[-1] = rewardBF[-1] - self.base_win_reward
-            thisRewardBF = (np.asarray(thisRewardBF) + (remainTime * self.result_broadcast_ratio)).tolist()
-        else:
-            print("!!!!!DIDNT GET RESULT REWARD!!!!!!", rewardBF[-1])
-        return torch.tensor(thisRewardBF,dtype=torch.float32).to(self.device)
-
-    def save_memories(
-        self,
-        now_step: int,
-        agent: PPOAgent,
-        state: np.ndarray,
-        action_cpu: np.ndarray,
-        dis_logprob_cpu: np.ndarray,
-        con_logprob_cpu: np.ndarray,
-        reward: list,
-        done: list,
-        value_cpu: np.ndarray,
-        last_reward: list,
-        next_done: list,
-        next_state: np.ndarray,
-    ):
-        for i in range(self.unity_agent_num):
-            if now_step % self.decision_period == 0 or next_done[i] == True:
-                # only on decision period or finished a round, save memories to buffer
-                self.ob_bf[i].append(state[i])
-                self.act_bf[i].append(action_cpu[i])
-                self.dis_logprobs_bf[i].append(dis_logprob_cpu[i])
-                self.con_logprobs_bf[i].append(con_logprob_cpu[i])
-                self.dones_bf[i].append(done[i])
-                self.values_bf[i].append(value_cpu[i])
-                if now_step % self.decision_period == 0:
-                    # on decision period, add last skiped round's reward
-                    self.rewards_bf[i].append(reward[i] + last_reward[i])
-                else:
-                    # not on decision period, only add this round's reward
-                    self.rewards_bf[i].append(reward[i])
-            if next_done[i] == True:
-                # finished a round, send finished memories to training datasets
-                # compute advantage and discounted reward
-                remainTime = state[i, self.target_state_size]
-                roundTargetType = int(state[i, 0])
-                thisRewardsTensor = self.broad_cast_end_reward(self.rewards_bf[i], remainTime)
-                adv, rt = agent.gae(
-                    rewards=thisRewardsTensor,
-                    dones=torch.tensor(self.dones_bf[i],dtype=torch.float32).to(self.device),
-                    values=torch.tensor(self.values_bf[i]).to(self.device),
-                    next_obs=torch.tensor(next_state[i]).to(self.device).unsqueeze(0),
-                    next_done=torch.tensor([next_done[i]],dtype=torch.float32).to(self.device),
-                )
-                # send memories to training datasets
-                self.obs[roundTargetType] = torch.cat((self.obs[roundTargetType], torch.tensor(np.array(self.ob_bf[i])).to(self.device)), 0)
-                self.actions[roundTargetType] = torch.cat((self.actions[roundTargetType], torch.tensor(np.array(self.act_bf[i])).to(self.device)), 0)
-                self.dis_logprobs[roundTargetType] = torch.cat((self.dis_logprobs[roundTargetType], torch.tensor(np.array(self.dis_logprobs_bf[i])).to(self.device)), 0)
-                self.con_logprobs[roundTargetType] = torch.cat((self.con_logprobs[roundTargetType], torch.tensor(np.array(self.con_logprobs_bf[i])).to(self.device)), 0)
-                self.rewards[roundTargetType] = torch.cat((self.rewards[roundTargetType], thisRewardsTensor), 0)
-                self.values[roundTargetType] = torch.cat((self.values[roundTargetType], torch.tensor(np.array(self.values_bf[i])).to(self.device)), 0)
-                self.advantages[roundTargetType] = torch.cat((self.advantages[roundTargetType], adv), 0)
-                self.returns[roundTargetType] = torch.cat((self.returns[roundTargetType], rt), 0)
-
-                # clear buffers
-                self.clear_buffers(i)
-                print(f"train dataset {Targets(roundTargetType).name} added:{self.obs[roundTargetType].size()[0]}/{self.data_set_size}")
-
-    def clear_buffers(self,ind:int):
-        # clear buffers
-        self.ob_bf[ind] = []
-        self.act_bf[ind] = []
-        self.dis_logprobs_bf[ind] = []
-        self.con_logprobs_bf[ind] = []
-        self.rewards_bf[ind] = []
-        self.dones_bf[ind] = []
-        self.values_bf[ind] = []
-
-    def clear_training_datasets(self,ind:int):
-        # clear training datasets
-        self.obs[ind] = torch.tensor([]).to(self.device)
-        self.actions[ind] = torch.tensor([]).to(self.device)
-        self.dis_logprobs[ind] = torch.tensor([]).to(self.device)
-        self.con_logprobs[ind] = torch.tensor([]).to(self.device)
-        self.rewards[ind] = torch.tensor([]).to(self.device)
-        self.values[ind] = torch.tensor([]).to(self.device)
-        self.advantages[ind] = torch.tensor([]).to(self.device)
-        self.returns[ind] = torch.tensor([]).to(self.device)
@@ -1,81 +0,0 @@
-from torch.utils.tensorboard import SummaryWriter
-
-import wandb
-
-total_rounds = {"Free": 0, "Go": 0, "Attack": 0}
-win_rounds = {"Free": 0, "Go": 0, "Attack": 0}
-
-
-# class for wandb recording
-class WandbRecorder:
-    def __init__(self, game_name: str, game_type: str, run_name: str, _args) -> None:
-        # init wandb
-        self.game_name = game_name
-        self.game_type = game_type
-        self._args = _args
-        self.run_name = run_name
-        if self._args.wandb_track:
-            wandb.init(
-                project=self.game_name,
-                entity=self._args.wandb_entity,
-                sync_tensorboard=True,
-                config=vars(self._args),
-                name=self.run_name,
-                monitor_gym=True,
-                save_code=True,
-            )
-        self.writer = SummaryWriter(f"runs/{self.run_name}")
-        self.writer.add_text(
-            "hyperparameters",
-            "|param|value|\n|-|-|\n%s"
-            % ("\n".join([f"|{key}|{value}|" for key, value in vars(self._args).items()])),
-        )
-
-    def add_target_scalar(
-        self,
-        target_name,
-        this_t,
-        v_loss,
-        dis_pg_loss,
-        con_pg_loss,
-        loss,
-        entropy_loss,
-        target_reward_mean,
-        target_steps,
-    ):
-        # fmt:off
-        self.writer.add_scalar(
-            f"Target{target_name}/value_loss", v_loss.item(), target_steps[this_t]
-        )
-        self.writer.add_scalar(
-            f"Target{target_name}/dis_policy_loss", dis_pg_loss.item(), target_steps[this_t]
-        )
-        self.writer.add_scalar(
-            f"Target{target_name}/con_policy_loss", con_pg_loss.item(), target_steps[this_t]
-        )
-        self.writer.add_scalar(
-            f"Target{target_name}/total_loss", loss.item(), target_steps[this_t]
-        )
-        self.writer.add_scalar(
-            f"Target{target_name}/entropy_loss", entropy_loss.item(), target_steps[this_t]
-        )
-        self.writer.add_scalar(
-            f"Target{target_name}/Reward", target_reward_mean, target_steps[this_t]
-        )
-        self.writer.add_scalar(
-            f"Target{target_name}/WinRatio", win_rounds[target_name] / total_rounds[target_name], target_steps[this_t],
-        )
-        # fmt:on
-
-    def add_global_scalar(
-        self,
-        total_reward_mean,
-        learning_rate,
-        total_steps,
-    ):
-        self.writer.add_scalar("GlobalCharts/TotalRewardMean", total_reward_mean, total_steps)
-        self.writer.add_scalar("GlobalCharts/learning_rate", learning_rate, total_steps)
-    def add_win_ratio(self, target_name, target_steps):
-        self.writer.add_scalar(
-            f"Target{target_name}/WinRatio", win_rounds[target_name] / total_rounds[target_name], target_steps,
-        )
@@ -1,154 +0,0 @@
-import argparse
-import uuid
-
-from distutils.util import strtobool
-
-DEFAULT_SEED = 9331
-ENV_PATH = "../Build/3.1.6/Aimbot-ParallelEnv"
-WAND_ENTITY = "koha9"
-WORKER_ID = 1
-BASE_PORT = 1000
-
-# tensorboard names
-GAME_NAME = "Aimbot_Target_Hybrid_PMNN_V3"
-GAME_TYPE = "Mix_Verification"
-
-# max round steps per agent is 2500/Decision_period, 25 seconds
-TOTAL_STEPS = 3150000
-BATCH_SIZE = 512
-MAX_TRAINNING_DATASETS = 6000
-DECISION_PERIOD = 1
-LEARNING_RATE = 6.5e-4
-GAMMA = 0.99
-GAE_LAMBDA = 0.95
-EPOCHS = 3
-CLIP_COEF = 0.11
-LOSS_COEF = [1.0, 1.0, 1.0, 1.0] # free go attack defence
-POLICY_COEF = [1.0, 1.0, 1.0, 1.0]
-ENTROPY_COEF = [0.05, 0.05, 0.05, 0.05]
-CRITIC_COEF = [0.5, 0.5, 0.5, 0.5]
-TARGET_LEARNING_RATE = 1e-6
-
-FREEZE_VIEW_NETWORK = False
-BROADCASTREWARD = False
-ANNEAL_LEARNING_RATE = True
-CLIP_VLOSS = True
-NORM_ADV = False
-TRAIN = True
-SAVE_MODEL = False
-WANDB_TACK = False
-LOAD_DIR = None
-#LOAD_DIR = "../PPO-Model/PList_Go_LeakyReLU_9331_1677965178_bestGoto/PList_Go_LeakyReLU_9331_1677965178_10.709002.pt"
-
-# Unity Environment Parameters
-TARGET_STATE_SIZE = 6
-INAREA_STATE_SIZE = 1
-TIME_STATE_SIZE = 1
-GUN_STATE_SIZE = 1
-MY_STATE_SIZE = 4
-TOTAL_T_SIZE = TARGET_STATE_SIZE+INAREA_STATE_SIZE+TIME_STATE_SIZE+GUN_STATE_SIZE+MY_STATE_SIZE
-BASE_WINREWARD = 999
-BASE_LOSEREWARD = -999
-TARGETNUM= 4
-ENV_TIMELIMIT = 30
-RESULT_BROADCAST_RATIO = 1/ENV_TIMELIMIT
-
-def parse_args():
-    # fmt: off
-    # pytorch and environment parameters
-    parser = argparse.ArgumentParser()
-    parser.add_argument("--seed", type=int, default=DEFAULT_SEED,
-                        help="seed of the experiment")
-    parser.add_argument("--path", type=str, default=ENV_PATH,
-                        help="enviroment path")
-    parser.add_argument("--workerID", type=int, default=WORKER_ID,
-                        help="unity worker ID")
-    parser.add_argument("--baseport", type=int, default=BASE_PORT,
-                        help="port to connect to Unity environment")
-    parser.add_argument("--lr", type=float, default=LEARNING_RATE,
-                        help="the default learning rate of optimizer")
-    parser.add_argument("--cuda", type=lambda x: bool(strtobool(x)), default=True, nargs="?", const=True,
-                        help="if toggled, cuda will be enabled by default")
-    parser.add_argument("--total-timesteps", type=int, default=TOTAL_STEPS,
-                        help="total timesteps of the experiments")
-
-    # model parameters
-    parser.add_argument("--train",type=lambda x: bool(strtobool(x)), default=TRAIN, nargs="?", const=True,
-                        help="Train Model or not")
-    parser.add_argument("--freeze-viewnet", type=lambda x: bool(strtobool(x)), default=FREEZE_VIEW_NETWORK, nargs="?", const=True,
-                        help="freeze view network or not")
-    parser.add_argument("--datasetSize", type=int, default=MAX_TRAINNING_DATASETS,
-                        help="training dataset size,start training while dataset collect enough data")
-    parser.add_argument("--minibatchSize", type=int, default=BATCH_SIZE,
-                        help="nimi batch size")
-    parser.add_argument("--epochs", type=int, default=EPOCHS,
-                        help="the K epochs to update the policy")
-    parser.add_argument("--annealLR", type=lambda x: bool(strtobool(x)), default=ANNEAL_LEARNING_RATE, nargs="?", const=True,
-                        help="Toggle learning rate annealing for policy and value networks")
-    parser.add_argument("--wandb-track", type=lambda x: bool(strtobool(x)), default=WANDB_TACK, nargs="?", const=True,
-                        help="track on the wandb")
-    parser.add_argument("--save-model", type=lambda x: bool(strtobool(x)), default=SAVE_MODEL, nargs="?", const=True,
-                        help="save model or not")
-    parser.add_argument("--wandb-entity", type=str, default=WAND_ENTITY,
-                        help="the entity (team) of wandb's project")
-    parser.add_argument("--load-dir", type=str, default=LOAD_DIR,
-                        help="load model directory")
-    parser.add_argument("--decision-period", type=int, default=DECISION_PERIOD,
-                        help="the number of steps to run in each environment per policy rollout")
-    parser.add_argument("--result-broadcast-ratio", type=float, default=RESULT_BROADCAST_RATIO,
-                        help="broadcast result when win round is reached,r=result-broadcast-ratio*remainTime")
-    parser.add_argument("--broadCastEndReward", type=lambda x: bool(strtobool(x)), default=BROADCASTREWARD, nargs="?", const=True,
-                        help="save model or not")
-    # target_learning_rate
-    parser.add_argument("--target-lr", type=float, default=TARGET_LEARNING_RATE,
-                        help="target value of downscaling the learning rate")
-    
-    # POLICY_COEF ENTROPY_COEF CRITIC_COEF LOSS_COEF
-    parser.add_argument("--policy-coef", type=float, default=POLICY_COEF,
-                        help="coefficient of the policy loss")
-    parser.add_argument("--entropy-coef", type=float, default=ENTROPY_COEF,
-                        help="coefficient of the entropy loss")
-    parser.add_argument("--critic-coef", type=float, default=CRITIC_COEF,
-                        help="coefficient of the critic loss")
-    parser.add_argument("--loss-coef", type=float, default=LOSS_COEF,
-                        help="coefficient of the total loss")
-
-    # GAE loss
-    parser.add_argument("--gae", type=lambda x: bool(strtobool(x)), default=True, nargs="?", const=True,
-                        help="Use GAE for advantage computation")
-    parser.add_argument("--norm-adv", type=lambda x: bool(strtobool(x)), default=NORM_ADV, nargs="?", const=True,
-                        help="Toggles advantages normalization")
-    parser.add_argument("--gamma", type=float, default=GAMMA,
-                        help="the discount factor gamma")
-    parser.add_argument("--gaeLambda", type=float, default=GAE_LAMBDA,
-                        help="the lambda for the general advantage estimation")
-    parser.add_argument("--clip-coef", type=float, default=CLIP_COEF,
-                        help="the surrogate clipping coefficient")
-    parser.add_argument("--clip-vloss", type=lambda x: bool(strtobool(x)), default=CLIP_VLOSS, nargs="?", const=True,
-                        help="Toggles whether or not to use a clipped loss for the value function, as per the paper.")
-    parser.add_argument("--max-grad-norm", type=float, default=0.5,
-                        help="the maximum norm for the gradient clipping")
-    parser.add_argument("--target-kl", type=float, default=None,
-                        help="the target KL divergence threshold")
-    # environment parameters
-    parser.add_argument("--target-num", type=int, default=TARGETNUM,
-                        help="the number of targets")
-    parser.add_argument("--env-timelimit", type=int, default=ENV_TIMELIMIT,
-                        help="the time limit of each round")
-    parser.add_argument("--base-win-reward", type=int, default=BASE_WINREWARD,
-                        help="the base reward of win round")
-    parser.add_argument("--base-lose-reward", type=int, default=BASE_LOSEREWARD,
-                        help="the base reward of lose round")
-    parser.add_argument("--target-state-size", type=int, default=TARGET_STATE_SIZE,
-                        help="the size of target state")
-    parser.add_argument("--time-state-size", type=int, default=TIME_STATE_SIZE,
-                        help="the size of time state")
-    parser.add_argument("--gun-state-size", type=int, default=GUN_STATE_SIZE,
-                        help="the size of gun state")
-    parser.add_argument("--my-state-size", type=int, default=MY_STATE_SIZE,
-                        help="the size of my state")
-    parser.add_argument("--total-target-size", type=int, default=TOTAL_T_SIZE,
-                        help="the size of total target state")
-    # fmt: on
-    args = parser.parse_args()
-    return args
@@ -9,7 +9,6 @@ import torch.nn as nn
 import torch.optim as optim

 from AimbotEnv import Aimbot
-from tqdm import tqdm
 from torch.distributions.normal import Normal
 from torch.distributions.categorical import Categorical
 from distutils.util import strtobool
@@ -25,28 +24,26 @@ from typing import List
 bestReward = 0

 DEFAULT_SEED = 9331
-ENV_PATH = "../Build/Build-ParallelEnv-Target-OffPolicy-SingleStack-SideChannel-ExtremeReward/Aimbot-ParallelEnv"
+ENV_PATH = "../Build/Build-ParallelEnv-Target-OffPolicy-SingleStack-SideChannel/Aimbot-ParallelEnv"
 SIDE_CHANNEL_UUID = uuid.UUID("8bbfb62a-99b4-457c-879d-b78b69066b5e")
 WAND_ENTITY = "koha9"
 WORKER_ID = 1
 BASE_PORT = 1000

-# max round steps per agent is 2500/Decision_period, 25 seconds
 # !!!check every parameters before run!!!

-TOTAL_STEPS = 6000000
-BATCH_SIZE = 512
-MAX_TRAINNING_DATASETS = 8000
-DECISION_PERIOD = 1
-LEARNING_RATE = 1e-3
+TOTAL_STEPS = 2000000
+STEP_NUM = 314
+DECISION_PERIOD = 2
+LEARNING_RATE = 7e-4
 GAMMA = 0.99
 GAE_LAMBDA = 0.95
+MINIBATCH_NUM = 4
 EPOCHS = 4
 CLIP_COEF = 0.1
 POLICY_COEF = 1.0
 ENTROPY_COEF = 0.01
 CRITIC_COEF = 0.5
-TARGET_LEARNING_RATE = 5e-5

 ANNEAL_LEARNING_RATE = True
 CLIP_VLOSS = True
@@ -54,8 +51,8 @@ NORM_ADV = True
 TRAIN = True

 WANDB_TACK = False
-#LOAD_DIR = None
-LOAD_DIR = "../PPO-Model/Aimbot-target-last.pt"
+LOAD_DIR = None
+# LOAD_DIR = "../PPO-Model/SmallArea-256-128-hybrid-2nd-trainning.pt"

 # public data
 TotalRounds = {"Go":0,"Attack":0,"Free":0}
@@ -84,10 +81,10 @@ def parse_args():
    # model parameters
    parser.add_argument("--train",type=lambda x: bool(strtobool(x)), default=TRAIN, nargs="?", const=True,
                        help="Train Model or not")
-    parser.add_argument("--datasetSize", type=int, default=MAX_TRAINNING_DATASETS,
-                        help="training dataset size,start training while dataset collect enough data")
-    parser.add_argument("--minibatchSize", type=int, default=BATCH_SIZE,
-                        help="nimi batch size")
+    parser.add_argument("--stepNum", type=int, default=STEP_NUM,
+                        help="the number of steps to run in each environment per policy rollout")
+    parser.add_argument("--minibatchesNum", type=int, default=MINIBATCH_NUM,
+                        help="the number of mini-batches")
    parser.add_argument("--epochs", type=int, default=EPOCHS,
                        help="the K epochs to update the policy")
    parser.add_argument("--annealLR", type=lambda x: bool(strtobool(x)), default=ANNEAL_LEARNING_RATE, nargs="?", const=True,
@@ -143,11 +140,9 @@ class PPOAgent(nn.Module):
        self.continuous_size = env.unity_continuous_size

        self.network = nn.Sequential(
-            layer_init(nn.Linear(np.array(env.unity_observation_shape).prod(), 700)),
+            layer_init(nn.Linear(np.array(env.unity_observation_shape).prod(), 384)),
            nn.ReLU(),
-            layer_init(nn.Linear(700, 500)),
-            nn.ReLU(),
-            layer_init(nn.Linear(500, 256)),
+            layer_init(nn.Linear(384, 256)),
            nn.ReLU(),
        )
        self.actor_dis = layer_init(nn.Linear(256, self.discrete_size), std=0.01)
@@ -197,40 +192,6 @@ class PPOAgent(nn.Module):
            self.critic(hidden),
        )

-
-def GAE(agent, args, rewards, dones, values, next_obs, next_done):
-    # GAE
-    with torch.no_grad():
-        next_value = agent.get_value(next_obs).reshape(1, -1)
-        data_size = rewards.size()[0]
-        if args.gae:
-            advantages = torch.zeros_like(rewards).to(device)
-            lastgaelam = 0
-            for t in reversed(range(data_size)):
-                if t == data_size - 1:
-                    nextnonterminal = 1.0 - next_done
-                    nextvalues = next_value
-                else:
-                    nextnonterminal = 1.0 - dones[t + 1]
-                    nextvalues = values[t + 1]
-                delta = rewards[t] + args.gamma * nextvalues * nextnonterminal - values[t]
-                advantages[t] = lastgaelam = (
-                    delta + args.gamma * args.gaeLambda * nextnonterminal * lastgaelam
-                )
-            returns = advantages + values
-        else:
-            returns = torch.zeros_like(rewards).to(device)
-            for t in reversed(range(data_size)):
-                if t == data_size - 1:
-                    nextnonterminal = 1.0 - next_done
-                    next_return = next_value
-                else:
-                    nextnonterminal = 1.0 - dones[t + 1]
-                    next_return = returns[t + 1]
-                returns[t] = rewards[t] + args.gamma * nextnonterminal * next_return
-            advantages = returns - values
-    return advantages, returns
-
 class AimbotSideChannel(SideChannel):
    def __init__(self, channel_id: uuid.UUID) -> None:
        super().__init__(channel_id)
@@ -240,14 +201,14 @@ class AimbotSideChannel(SideChannel):
        receive messages from Unity
        """
        thisMessage = msg.read_string()
-        #print(thisMessage)
+        print(thisMessage)
        thisResult = thisMessage.split("|")
        if(thisResult[0] == "result"):
            TotalRounds[thisResult[1]]+=1
            if(thisResult[2] == "Win"):
                WinRounds[thisResult[1]]+=1
-            #print(TotalRounds)
-            #print(WinRounds)
+            print(TotalRounds)
+            print(WinRounds)
        elif(thisResult[0] == "Error"):
            print(thisMessage)
 	# 发送函数
@@ -277,7 +238,6 @@ class AimbotSideChannel(SideChannel):
        msg.write_float32_list(data)
        super().queue_message_to_send(msg)

-
 if __name__ == "__main__":
    args = parse_args()
    random.seed(args.seed)
@@ -299,12 +259,11 @@ if __name__ == "__main__":
    optimizer = optim.Adam(agent.parameters(), lr=args.lr, eps=1e-5)

    # Tensorboard and WandB Recorder
-    game_name = "Aimbot_Target"
-    game_type = "OffPolicy"
-    run_name = f"{game_name}_{game_type}_{args.seed}_{int(time.time())}"
+    game_name = "Aimbot"
+    run_name = f"{game_name}_{args.seed}_{int(time.time())}"
    if args.wandb_track:
        wandb.init(
-            project=game_name,
+            project=run_name,
            entity=args.wandb_entity,
            sync_tensorboard=True,
            config=vars(args),
@@ -320,168 +279,94 @@ if __name__ == "__main__":
        % ("\n".join([f"|{key}|{value}|" for key, value in vars(args).items()])),
    )

-    # Trajectory Buffer
-    ob_bf = [[] for i in range(env.unity_agent_num)]
-    act_bf = [[] for i in range(env.unity_agent_num)]
-    dis_logprobs_bf = [[] for i in range(env.unity_agent_num)]
-    con_logprobs_bf = [[] for i in range(env.unity_agent_num)]
-    rewards_bf = [[] for i in range(env.unity_agent_num)]
-    dones_bf = [[] for i in range(env.unity_agent_num)]
-    values_bf = [[] for i in range(env.unity_agent_num)]
+    # Memory Record
+    obs = torch.zeros((args.stepNum, env.unity_agent_num) + env.unity_observation_shape).to(device)
+    actions = torch.zeros((args.stepNum, env.unity_agent_num) + (env.unity_action_size,)).to(device)
+    dis_logprobs = torch.zeros((args.stepNum, env.unity_agent_num)).to(device)
+    con_logprobs = torch.zeros((args.stepNum, env.unity_agent_num)).to(device)
+    rewards = torch.zeros((args.stepNum, env.unity_agent_num)).to(device)
+    dones = torch.zeros((args.stepNum, env.unity_agent_num)).to(device)
+    values = torch.zeros((args.stepNum, env.unity_agent_num)).to(device)

    # TRY NOT TO MODIFY: start the game
-    total_update_step = args.total_timesteps // args.datasetSize
+    args.batch_size = int(env.unity_agent_num * args.stepNum)
+    args.minibatch_size = int(args.batch_size // args.minibatchesNum)
+    total_update_step = args.total_timesteps // args.batch_size
    global_step = 0
    start_time = time.time()
-    state, _, done = env.reset()
-    # state = torch.Tensor(next_obs).to(device)
-    # next_done = torch.zeros(env.unity_agent_num).to(device)
+    next_obs, _, _ = env.reset()
+    next_obs = torch.Tensor(next_obs).to(device)
+    next_done = torch.zeros(env.unity_agent_num).to(device)

    for total_steps in range(total_update_step):
        # discunt learning rate, while step == total_update_step lr will be 0
-        print("new episode")
        if args.annealLR:
-            finalRatio = TARGET_LEARNING_RATE/args.lr
-            frac = 1.0 - finalRatio*((total_steps - 1.0) / total_update_step)
+            frac = 1.0 - (total_steps - 1.0) / total_update_step
            lrnow = frac * args.lr
            optimizer.param_groups[0]["lr"] = lrnow

-        # initialize empty training datasets
-        obs = torch.tensor([]).to(device)  # (n,env.unity_observation_size)
-        actions = torch.tensor([]).to(device)  # (n,env.unity_action_size)
-        dis_logprobs = torch.tensor([]).to(device)  # (n,1)
-        con_logprobs = torch.tensor([]).to(device)  # (n,1)
-        rewards = torch.tensor([]).to(device)  # (n,1)
-        values = torch.tensor([]).to(device)  # (n,1)
-        advantages = torch.tensor([]).to(device)  # (n,1)
-        returns = torch.tensor([]).to(device)  # (n,1)
-
        # MAIN LOOP: run agent in environment
-        i = 0
-        training = False
-        while True:
+        for i in range(args.stepNum * args.decision_period):
            if i % args.decision_period == 0:
                step = round(i / args.decision_period)
                # Choose action by agent
                global_step += 1 * env.unity_agent_num
+                obs[step] = next_obs
+                dones[step] = next_done

                with torch.no_grad():
                    # predict actions
                    action, dis_logprob, _, con_logprob, _, value = agent.get_actions_value(
-                        torch.Tensor(state).to(device)
+                        next_obs
                    )
                    value = value.flatten()
-
-                # variable from GPU to CPU
-                action_cpu = action.cpu().numpy()
-                dis_logprob_cpu = dis_logprob.cpu().numpy()
-                con_logprob_cpu = con_logprob.cpu().numpy()
-                value_cpu = value.cpu().numpy()
-                # Environment step
-                next_state, reward, next_done = env.step(action_cpu)
+                next_obs, reward, done = env.step(action.cpu().numpy())

                # save memories
-                for i in range(env.unity_agent_num):
-                    # save memories to buffers
-                    ob_bf[i].append(state[i])
-                    act_bf[i].append(action_cpu[i])
-                    dis_logprobs_bf[i].append(dis_logprob_cpu[i])
-                    con_logprobs_bf[i].append(con_logprob_cpu[i])
-                    rewards_bf[i].append(reward[i])
-                    dones_bf[i].append(done[i])
-                    values_bf[i].append(value_cpu[i])
-                    if next_done[i] == True:
-                        # finished a round, send finished memories to training datasets
-                        # compute advantage and discounted reward
-                        #print(i,"over")
-                        adv, rt = GAE(
-                            agent,
-                            args,
-                            torch.tensor(rewards_bf[i]).to(device),
-                            torch.Tensor(dones_bf[i]).to(device),
-                            torch.tensor(values_bf[i]).to(device),
-                            torch.tensor(next_state[i]).to(device),
-                            torch.Tensor([next_done[i]]).to(device),
+                actions[step] = action
+                dis_logprobs[step] = dis_logprob
+                con_logprobs[step] = con_logprob
+                values[step] = value
+                rewards[step] = torch.tensor(reward).to(device).view(-1)
+                next_obs, next_done = torch.Tensor(next_obs).to(device), torch.Tensor(done).to(
+                    device
                )
-                        # send memories to training datasets
-                        obs = torch.cat((obs, torch.tensor(ob_bf[i]).to(device)), 0)
-                        actions = torch.cat((actions, torch.tensor(act_bf[i]).to(device)), 0)
-                        dis_logprobs = torch.cat(
-                            (dis_logprobs, torch.tensor(dis_logprobs_bf[i]).to(device)), 0
-                        )
-                        con_logprobs = torch.cat(
-                            (con_logprobs, torch.tensor(con_logprobs_bf[i]).to(device)), 0
-                        )
-                        rewards = torch.cat((rewards, torch.tensor(rewards_bf[i]).to(device)), 0)
-                        values = torch.cat((values, torch.tensor(values_bf[i]).to(device)), 0)
-                        advantages = torch.cat((advantages, adv), 0)
-                        returns = torch.cat((returns, rt), 0)
-
-                        # clear buffers
-                        ob_bf[i] = []
-                        act_bf[i] = []
-                        dis_logprobs_bf[i] = []
-                        con_logprobs_bf[i] = []
-                        rewards_bf[i] = []
-                        dones_bf[i] = []
-                        values_bf[i] = []
-                        print(f"train dataset added:{obs.size()[0]}/{args.datasetSize}")
-
-                if obs.size()[0] >= args.datasetSize:
-                    # start train NN
-                    break
-                state, done = next_state, next_done
            else:
                # skip this step use last predict action
-                next_obs, reward, next_done = env.step(action_cpu)
-                # save memories
-                for i in range(env.unity_agent_num):
-                    if next_done[i] == True:
-                        #print(i,"over???")
-                        # save last memories to buffers
-                        ob_bf[i].append(state[i])
-                        act_bf[i].append(action_cpu[i])
-                        dis_logprobs_bf[i].append(dis_logprob_cpu[i])
-                        con_logprobs_bf[i].append(con_logprob_cpu[i])
-                        rewards_bf[i].append(reward[i])
-                        dones_bf[i].append(done[i])
-                        values_bf[i].append(value_cpu[i])
-                        # finished a round, send finished memories to training datasets
-                        # compute advantage and discounted reward
-                        adv, rt = GAE(
-                            agent,
-                            args,
-                            torch.tensor(rewards_bf[i]).to(device),
-                            torch.Tensor(dones_bf[i]).to(device),
-                            torch.tensor(values_bf[i]).to(device),
-                            torch.tensor(next_state[i]).to(device),
-                            torch.Tensor([next_done[i]]).to(device),
+                next_obs, reward, done = env.step(action.cpu().numpy())
+                next_obs, next_done = torch.Tensor(next_obs).to(device), torch.Tensor(done).to(
+                    device
                )
-                        # send memories to training datasets
-                        obs = torch.cat((obs, torch.tensor(ob_bf[i]).to(device)), 0)
-                        actions = torch.cat((actions, torch.tensor(act_bf[i]).to(device)), 0)
-                        dis_logprobs = torch.cat(
-                            (dis_logprobs, torch.tensor(dis_logprobs_bf[i]).to(device)), 0
-                        )
-                        con_logprobs = torch.cat(
-                            (con_logprobs, torch.tensor(con_logprobs_bf[i]).to(device)), 0
-                        )
-                        rewards = torch.cat((rewards, torch.tensor(rewards_bf[i]).to(device)), 0)
-                        values = torch.cat((values, torch.tensor(values_bf[i]).to(device)), 0)
-                        advantages = torch.cat((advantages, adv), 0)
-                        returns = torch.cat((returns, rt), 0)

-                        # clear buffers
-                        ob_bf[i] = []
-                        act_bf[i] = []
-                        dis_logprobs_bf[i] = []
-                        con_logprobs_bf[i] = []
-                        rewards_bf[i] = []
-                        dones_bf[i] = []
-                        values_bf[i] = []
-                        print(f"train dataset added:{obs.size()[0]}/{args.datasetSize}")
-                state, done = next_state, next_done
-            i += 1
+        # GAE
+        with torch.no_grad():
+            next_value = agent.get_value(next_obs).reshape(1, -1)
+            if args.gae:
+                advantages = torch.zeros_like(rewards).to(device)
+                lastgaelam = 0
+                for t in reversed(range(args.stepNum)):
+                    if t == args.stepNum - 1:
+                        nextnonterminal = 1.0 - next_done
+                        nextvalues = next_value
+                    else:
+                        nextnonterminal = 1.0 - dones[t + 1]
+                        nextvalues = values[t + 1]
+                    delta = rewards[t] + args.gamma * nextvalues * nextnonterminal - values[t]
+                    advantages[t] = lastgaelam = (
+                        delta + args.gamma * args.gaeLambda * nextnonterminal * lastgaelam
+                    )
+                returns = advantages + values
+            else:
+                returns = torch.zeros_like(rewards).to(device)
+                for t in reversed(range(args.stepNum)):
+                    if t == args.stepNum - 1:
+                        nextnonterminal = 1.0 - next_done
+                        next_return = next_value
+                    else:
+                        nextnonterminal = 1.0 - dones[t + 1]
+                        next_return = returns[t + 1]
+                    returns[t] = rewards[t] + args.gamma * nextnonterminal * next_return
+                advantages = returns - values

        if args.train:
            # flatten the batch
@@ -492,15 +377,15 @@ if __name__ == "__main__":
            b_advantages = advantages.reshape(-1)
            b_returns = returns.reshape(-1)
            b_values = values.reshape(-1)
-            b_size = b_obs.size()[0]
+
            # Optimizing the policy and value network
-            b_inds = np.arange(b_size)
+            b_inds = np.arange(args.batch_size)
            # clipfracs = []
            for epoch in range(args.epochs):
                # shuffle all datasets
                np.random.shuffle(b_inds)
-                for start in range(0, b_size, args.minibatchSize):
-                    end = start + args.minibatchSize
+                for start in range(0, args.batch_size, args.minibatch_size):
+                    end = start + args.minibatch_size
                    mb_inds = b_inds[start:end]
                    mb_advantages = b_advantages[mb_inds]

@@ -599,12 +484,12 @@ if __name__ == "__main__":
                "charts/SPS", int(global_step / (time.time() - start_time)), global_step
            )
            writer.add_scalar("charts/Reward", rewardsMean, global_step)
-            writer.add_scalar("charts/GoWinRatio", WinRounds["Go"]/TotalRounds["Go"], global_step)
-            writer.add_scalar("charts/AttackWinRatio", WinRounds["Attack"]/TotalRounds["Attack"], global_step)
-            writer.add_scalar("charts/FreeWinRatio", WinRounds["Free"]/TotalRounds["Free"], global_step)
+            writer.add_scalar("charts/GoWinRatio", WinRounds["Go"]/TotalRounds["Go"] if TotalRounds["Go"] != 0 else 0, global_step)
+            writer.add_scalar("charts/AttackWinRatio", WinRounds["Attack"]/TotalRounds["Attack"] if TotalRounds["Attack"] != 0 else 0, global_step)
+            writer.add_scalar("charts/FreeWinRatio", WinRounds["Free"]/TotalRounds["Free"] if TotalRounds["Free"] != 0 else 0, global_step)
            if rewardsMean > bestReward:
                bestReward = rewardsMean
-                saveDir = "../PPO-Model/Target-700-500-256-hybrid-" + str(rewardsMean) + ".pt"
+                saveDir = "../PPO-Model/bigArea-384-128-hybrid-" + str(rewardsMean) + ".pt"
                torch.save(agent, saveDir)

    env.close()
@@ -1,312 +0,0 @@
-import numpy as np
-import torch
-import argparse
-import time
-
-from torch import nn
-from aimbotEnv import Aimbot
-from torch.distributions.normal import Normal
-from torch.distributions.categorical import Categorical
-
-
-def layer_init(layer, std=np.sqrt(2), bias_const=0.0):
-    nn.init.orthogonal_(layer.weight, std)
-    nn.init.constant_(layer.bias, bias_const)
-    return layer
-
-
-class PPOAgent(nn.Module):
-    def __init__(
-            self,
-            env: Aimbot,
-            this_args: argparse.Namespace,
-            device: torch.device,
-    ):
-        super(PPOAgent, self).__init__()
-        self.device = device
-        self.args = this_args
-        self.train_agent = self.args.train
-        self.target_num = self.args.target_num
-        self.unity_observation_shape = env.unity_observation_shape
-        self.unity_action_size = env.unity_action_size
-        self.state_size = self.unity_observation_shape[0]
-        self.agent_num = env.unity_agent_num
-        self.target_size = self.args.target_state_size
-        self.time_state_size = self.args.time_state_size
-        self.gun_state_size = self.args.gun_state_size
-        self.my_state_size = self.args.my_state_size
-        self.ray_state_size = env.unity_observation_shape[0] - self.args.total_target_size
-        self.state_size_without_ray = self.args.total_target_size
-        self.head_input_size = (
-                env.unity_observation_shape[0] - self.target_size - self.time_state_size - self.gun_state_size
-        )  # except target state input
-
-        self.unity_discrete_type = env.unity_discrete_type
-        self.discrete_size = env.unity_discrete_size
-        self.discrete_shape = list(env.unity_discrete_branches)
-        self.continuous_size = env.unity_continuous_size
-
-        self.view_network = nn.Sequential(layer_init(nn.Linear(self.ray_state_size, 200)), nn.LeakyReLU())
-        self.target_networks = nn.ModuleList(
-            [
-                nn.Sequential(layer_init(nn.Linear(self.state_size_without_ray, 100)), nn.LeakyReLU())
-                for i in range(self.target_num)
-            ]
-        )
-        self.middle_networks = nn.ModuleList(
-            [
-                nn.Sequential(layer_init(nn.Linear(300, 200)), nn.LeakyReLU())
-                for i in range(self.target_num)
-            ]
-        )
-        self.actor_dis = nn.ModuleList(
-            [layer_init(nn.Linear(200, self.discrete_size), std=0.5) for i in range(self.target_num)]
-        )
-        self.actor_mean = nn.ModuleList(
-            [layer_init(nn.Linear(200, self.continuous_size), std=0.5) for i in range(self.target_num)]
-        )
-        self.actor_logstd = nn.ParameterList(
-            [nn.Parameter(torch.zeros(1, self.continuous_size)) for i in range(self.target_num)]
-        )  # nn.Parameter(torch.zeros(1, self.continuous_size))
-        self.critic = nn.ModuleList(
-            [layer_init(nn.Linear(200, 1), std=1) for i in range(self.target_num)]
-        )
-
-    def get_value(self, state: torch.Tensor):
-        target = state[:, 0].to(torch.int32)  # int
-        this_state_num = target.size()[0]
-        view_input = state[:, -self.ray_state_size:]  # all ray input
-        target_input = state[:, : self.state_size_without_ray]
-        view_layer = self.view_network(view_input)
-        target_layer = torch.stack(
-            [self.target_networks[target[i]](target_input[i]) for i in range(this_state_num)]
-        )
-        middle_input = torch.cat([view_layer, target_layer], dim=1)
-        middle_layer = torch.stack(
-            [self.middle_networks[target[i]](middle_input[i]) for i in range(this_state_num)]
-        )
-        criticV = torch.stack(
-            [self.critic[target[i]](middle_layer[i]) for i in range(this_state_num)]
-        )  # self.critic
-        return criticV
-
-    def get_actions_value(self, state: torch.Tensor, actions=None):
-        target = state[:, 0].to(torch.int32)  # int
-        this_state_num = target.size()[0]
-        view_input = state[:, -self.ray_state_size:]  # all ray input
-        target_input = state[:, : self.state_size_without_ray]
-        view_layer = self.view_network(view_input)
-        target_layer = torch.stack(
-            [self.target_networks[target[i]](target_input[i]) for i in range(this_state_num)]
-        )
-        middle_input = torch.cat([view_layer, target_layer], dim=1)
-        middle_layer = torch.stack(
-            [self.middle_networks[target[i]](middle_input[i]) for i in range(this_state_num)]
-        )
-
-        # discrete
-        # 递归targets的数量,既agent数来实现根据target不同来选用对应的输出网络计算输出
-        dis_logits = torch.stack(
-            [self.actor_dis[target[i]](middle_layer[i]) for i in range(this_state_num)]
-        )
-        split_logits = torch.split(dis_logits, self.discrete_shape, dim=1)
-        multi_categoricals = [Categorical(logits=thisLogits) for thisLogits in split_logits]
-        # continuous
-        actions_mean = torch.stack(
-            [self.actor_mean[target[i]](middle_layer[i]) for i in range(this_state_num)]
-        )  # self.actor_mean(hidden)
-        action_logstd = torch.stack(
-            [torch.squeeze(self.actor_logstd[target[i]], 0) for i in range(this_state_num)]
-        )
-        # print(action_logstd)
-        action_std = torch.exp(action_logstd)  # torch.exp(action_logstd)
-        con_probs = Normal(actions_mean, action_std)
-        # critic
-        criticV = torch.stack(
-            [self.critic[target[i]](middle_layer[i]) for i in range(this_state_num)]
-        )  # self.critic
-
-        if actions is None:
-            if self.train_agent:
-                # select actions base on probability distribution model
-                dis_act = torch.stack([ctgr.sample() for ctgr in multi_categoricals])
-                con_act = con_probs.sample()
-                actions = torch.cat([dis_act.T, con_act], dim=1)
-            else:
-                # select actions base on best probability distribution
-                dis_act = torch.stack([torch.argmax(logit, dim=1) for logit in split_logits])
-                con_act = actions_mean
-                actions = torch.cat([dis_act.T, con_act], dim=1)
-        else:
-            dis_act = actions[:, 0: self.unity_discrete_type].T
-            con_act = actions[:, self.unity_discrete_type:]
-        dis_log_prob = torch.stack(
-            [ctgr.log_prob(act) for act, ctgr in zip(dis_act, multi_categoricals)]
-        )
-        dis_entropy = torch.stack([ctgr.entropy() for ctgr in multi_categoricals])
-        return (
-            actions,
-            dis_log_prob.sum(0),
-            dis_entropy.sum(0),
-            con_probs.log_prob(con_act).sum(1),
-            con_probs.entropy().sum(1),
-            criticV,
-        )
-
-    def train_net(self, this_train_ind: int, ppo_memories, optimizer) -> tuple:
-        start_time = time.time()
-        # flatten the batch
-        b_obs = ppo_memories.obs[this_train_ind].reshape((-1,) + self.unity_observation_shape)
-        b_dis_logprobs = ppo_memories.dis_logprobs[this_train_ind].reshape(-1)
-        b_con_logprobs = ppo_memories.con_logprobs[this_train_ind].reshape(-1)
-        b_actions = ppo_memories.actions[this_train_ind].reshape((-1,) + (self.unity_action_size,))
-        b_advantages = ppo_memories.advantages[this_train_ind].reshape(-1)
-        b_returns = ppo_memories.returns[this_train_ind].reshape(-1)
-        b_values = ppo_memories.values[this_train_ind].reshape(-1)
-        b_size = b_obs.size()[0]
-        # optimizing the policy and value network
-        b_index = np.arange(b_size)
-
-        for epoch in range(self.args.epochs):
-            print("epoch:", epoch, end="")
-            # shuffle all datasets
-            np.random.shuffle(b_index)
-            for start in range(0, b_size, self.args.minibatchSize):
-                print(".", end="")
-                end = start + self.args.minibatchSize
-                mb_index = b_index[start:end]
-                if np.size(mb_index) <= 1:
-                    break
-                mb_advantages = b_advantages[mb_index]
-
-                # normalize advantages
-                if self.args.norm_adv:
-                    mb_advantages = (mb_advantages - mb_advantages.mean()) / (
-                            mb_advantages.std() + 1e-8
-                    )
-
-                (
-                    _,
-                    new_dis_logprob,
-                    dis_entropy,
-                    new_con_logprob,
-                    con_entropy,
-                    new_value,
-                ) = self.get_actions_value(b_obs[mb_index], b_actions[mb_index])
-                # discrete ratio
-                dis_log_ratio = new_dis_logprob - b_dis_logprobs[mb_index]
-                dis_ratio = dis_log_ratio.exp()
-                # continuous ratio
-                con_log_ratio = new_con_logprob - b_con_logprobs[mb_index]
-                con_ratio = con_log_ratio.exp()
-
-                """
-                # early stop
-                with torch.no_grad():
-                    # calculate approx_kl http://joschu.net/blog/kl-approx.html
-                    old_approx_kl = (-logratio).mean()
-                    approx_kl = ((ratio - 1) - logratio).mean()
-                    clipfracs += [((ratio - 1.0).abs() > args.clip_coef).float().mean().item()]
-                """
-
-                # discrete Policy loss
-                dis_pg_loss_orig = -mb_advantages * dis_ratio
-                dis_pg_loss_clip = -mb_advantages * torch.clamp(
-                    dis_ratio, 1 - self.args.clip_coef, 1 + self.args.clip_coef
-                )
-                dis_pg_loss = torch.max(dis_pg_loss_orig, dis_pg_loss_clip).mean()
-                # continuous Policy loss
-                con_pg_loss_orig = -mb_advantages * con_ratio
-                con_pg_loss_clip = -mb_advantages * torch.clamp(
-                    con_ratio, 1 - self.args.clip_coef, 1 + self.args.clip_coef
-                )
-                con_pg_loss = torch.max(con_pg_loss_orig, con_pg_loss_clip).mean()
-
-                # Value loss
-                new_value = new_value.view(-1)
-                if self.args.clip_vloss:
-                    v_loss_unclipped = (new_value - b_returns[mb_index]) ** 2
-                    v_clipped = b_values[mb_index] + torch.clamp(
-                        new_value - b_values[mb_index],
-                        -self.args.clip_coef,
-                        self.args.clip_coef,
-                    )
-                    v_loss_clipped = (v_clipped - b_returns[mb_index]) ** 2
-                    v_loss_max = torch.max(v_loss_unclipped, v_loss_clipped)
-                    v_loss = 0.5 * v_loss_max.mean()
-                else:
-                    v_loss = 0.5 * ((new_value - b_returns[mb_index]) ** 2).mean()
-
-                # total loss
-                entropy_loss = dis_entropy.mean() + con_entropy.mean()
-                loss = (
-                               dis_pg_loss * self.args.policy_coef[this_train_ind]
-                               + con_pg_loss * self.args.policy_coef[this_train_ind]
-                               + entropy_loss * self.args.entropy_coef[this_train_ind]
-                               + v_loss * self.args.critic_coef[this_train_ind]
-                       ) * self.args.loss_coef[this_train_ind]
-
-                if torch.isnan(loss).any():
-                    print("LOSS Include NAN!!!")
-                    if torch.isnan(dis_pg_loss.any()):
-                        print("dis_pg_loss include nan")
-                    if torch.isnan(con_pg_loss.any()):
-                        print("con_pg_loss include nan")
-                    if torch.isnan(entropy_loss.any()):
-                        print("entropy_loss include nan")
-                    if torch.isnan(v_loss.any()):
-                        print("v_loss include nan")
-                    raise
-
-                optimizer.zero_grad()
-                loss.backward()
-                # Clips gradient norm of an iterable of parameters.
-                nn.utils.clip_grad_norm_(self.parameters(), self.args.max_grad_norm)
-                optimizer.step()
-
-            """
-            if args.target_kl is not None:
-                if approx_kl > args.target_kl:
-                    break
-            """
-        return v_loss, dis_pg_loss, con_pg_loss, loss, entropy_loss
-
-    def gae(
-            self,
-            rewards: torch.Tensor,
-            dones: torch.Tensor,
-            values: torch.Tensor,
-            next_obs: torch.Tensor,
-            next_done: torch.Tensor,
-    ) -> tuple:
-        # GAE
-        with torch.no_grad():
-            next_value = self.get_value(next_obs).reshape(1, -1)
-            data_size = rewards.size()[0]
-            if self.args.gae:
-                advantages = torch.zeros_like(rewards).to(self.device)
-                last_gae_lam = 0
-                for t in reversed(range(data_size)):
-                    if t == data_size - 1:
-                        next_non_terminal = 1.0 - next_done
-                        next_values = next_value
-                    else:
-                        next_non_terminal = 1.0 - dones[t + 1]
-                        next_values = values[t + 1]
-                    delta = rewards[t] + self.args.gamma * next_values * next_non_terminal - values[t]
-                    advantages[t] = last_gae_lam = (
-                            delta + self.args.gamma * self.args.gaeLambda * next_non_terminal * last_gae_lam
-                    )
-                returns = advantages + values
-            else:
-                returns = torch.zeros_like(rewards).to(self.device)
-                for t in reversed(range(data_size)):
-                    if t == data_size - 1:
-                        next_non_terminal = 1.0 - next_done
-                        next_return = next_value
-                    else:
-                        next_non_terminal = 1.0 - dones[t + 1]
-                        next_return = returns[t + 1]
-                    returns[t] = rewards[t] + self.args.gamma * next_non_terminal * next_return
-                advantages = returns - values
-        return advantages, returns
@@ -0,0 +1,502 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Action, 1 continuous ctrl 2.1\n",
+      "Action, 0 continuous ctrl -1.1\n"
+     ]
+    }
+   ],
+   "source": [
+    "import gym\n",
+    "from gym.spaces import Dict, Discrete, Box, Tuple\n",
+    "import numpy as np\n",
+    "\n",
+    "\n",
+    "class SampleGym(gym.Env):\n",
+    "    def __init__(self, config={}):\n",
+    "        self.config = config\n",
+    "        self.action_space = Tuple((Discrete(2), Box(-10, 10, (2,))))\n",
+    "        self.observation_space = Box(-10, 10, (2, 2))\n",
+    "        self.p_done = config.get(\"p_done\", 0.1)\n",
+    "\n",
+    "    def reset(self):\n",
+    "        return self.observation_space.sample()\n",
+    "\n",
+    "    def step(self, action):\n",
+    "        chosen_action = action[0]\n",
+    "        cnt_control = action[1][chosen_action]\n",
+    "\n",
+    "        if chosen_action == 0:\n",
+    "            reward = cnt_control\n",
+    "        else:\n",
+    "            reward = -cnt_control - 1\n",
+    "\n",
+    "        print(f\"Action, {chosen_action} continuous ctrl {cnt_control}\")\n",
+    "        return (\n",
+    "            self.observation_space.sample(),\n",
+    "            reward,\n",
+    "            bool(np.random.choice([True, False], p=[self.p_done, 1.0 - self.p_done])),\n",
+    "            {},\n",
+    "        )\n",
+    "\n",
+    "\n",
+    "if __name__ == \"__main__\":\n",
+    "    env = SampleGym()\n",
+    "    env.reset()\n",
+    "    env.step((1, [-1, 2.1]))  # should say use action 1 with 2.1\n",
+    "    env.step((0, [-1.1, 2.1]))  # should say use action 0 with -1.1"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from mlagents_envs.environment import UnityEnvironment\n",
+    "from gym_unity.envs import UnityToGymWrapper\n",
+    "import numpy as np\n",
+    "\n",
+    "ENV_PATH = \"../Build-ParallelEnv/Aimbot-ParallelEnv\"\n",
+    "WORKER_ID = 1\n",
+    "BASE_PORT = 2002\n",
+    "\n",
+    "env = UnityEnvironment(\n",
+    "    file_name=ENV_PATH,\n",
+    "    seed=1,\n",
+    "    side_channels=[],\n",
+    "    worker_id=WORKER_ID,\n",
+    "    base_port=BASE_PORT,\n",
+    ")\n",
+    "\n",
+    "trackedAgent = 0\n",
+    "env.reset()\n",
+    "BEHA_SPECS = env.behavior_specs\n",
+    "BEHA_NAME = list(BEHA_SPECS)[0]\n",
+    "SPEC = BEHA_SPECS[BEHA_NAME]\n",
+    "print(SPEC)\n",
+    "\n",
+    "decisionSteps, terminalSteps = env.get_steps(BEHA_NAME)\n",
+    "\n",
+    "if trackedAgent in decisionSteps:  # ゲーム終了していない場合、環境状態がdecision_stepsに保存される\n",
+    "    nextState = decisionSteps[trackedAgent].obs[0]\n",
+    "    reward = decisionSteps[trackedAgent].reward\n",
+    "    done = False\n",
+    "if trackedAgent in terminalSteps:  # ゲーム終了した場合、環境状態がterminal_stepsに保存される\n",
+    "    nextState = terminalSteps[trackedAgent].obs[0]\n",
+    "    reward = terminalSteps[trackedAgent].reward\n",
+    "    done = True\n",
+    "print(decisionSteps.agent_id)\n",
+    "print(terminalSteps)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "decisionSteps.agent_id [1 2 5 7]\n",
+      "decisionSteps.agent_id_to_index {1: 0, 2: 1, 5: 2, 7: 3}\n",
+      "decisionSteps.reward [0. 0. 0. 0.]\n",
+      "decisionSteps.action_mask [array([[False, False, False],\n",
+      "       [False, False, False],\n",
+      "       [False, False, False],\n",
+      "       [False, False, False]]), array([[False, False, False],\n",
+      "       [False, False, False],\n",
+      "       [False, False, False],\n",
+      "       [False, False, False]]), array([[False, False],\n",
+      "       [False, False],\n",
+      "       [False, False],\n",
+      "       [False, False]])]\n",
+      "decisionSteps.obs [  0.          0.          0.          0.          0.          0.\n",
+      "   0.          0.          0.          0.          0.          0.\n",
+      "   0.          0.          0.          0.          0.          0.\n",
+      "   0.          0.          0.          0.          0.          0.\n",
+      "   0.          0.          0.          0.          0.          0.\n",
+      "   0.          0.          0.          0.          0.          0.\n",
+      "   0.          0.          0.          0.          0.          0.\n",
+      "   0.          0.          0.          0.          0.          0.\n",
+      "   0.          0.          0.          0.          0.          0.\n",
+      "   0.          0.          0.          0.          0.          0.\n",
+      "   0.          0.        -15.994009    1.        -26.322788    1.\n",
+      "   1.          1.          1.          1.          1.          2.\n",
+      "   1.          1.          1.          1.          1.          1.\n",
+      "   1.          1.3519633   1.6946528   2.3051548   3.673389    9.067246\n",
+      "  17.521473   21.727095   22.753294   24.167128   25.905216   18.35725\n",
+      "  21.02278    21.053417    0.       ]\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "'decisionSteps.obs [array([[-15.994009 ,   1.       , -26.322788 ,   1.       ,   1.       ,\\n          1.       ,   1.       ,   1.       ,   1.       ,   2.       ,\\n          1.       ,   1.       ,   1.       ,   1.       ,   1.       ,\\n          1.       ,   1.       ,   1.3519633,   1.6946528,   2.3051548,\\n          3.673389 ,   9.067246 ,  17.521473 ,  21.727095 ,  22.753294 ,\\n         24.167128 ,  25.905216 ,  18.35725  ,  21.02278  ,  21.053417 ,\\n          0.       ],\\n       [ -1.8809433,   1.       , -25.66834  ,   1.       ,   2.       ,\\n          1.       ,   1.       ,   1.       ,   1.       ,   1.       ,\\n          1.       ,   1.       ,   1.       ,   1.       ,   1.       ,\\n          1.       ,   1.       ,  16.768637 ,  23.414627 ,  22.04486  ,\\n         21.050663 ,  20.486784 ,  20.486784 ,  21.050665 ,  15.049731 ,\\n         11.578419 ,   9.695194 ,  20.398016 ,  20.368341 ,  20.398016 ,\\n...\\n         20.551746 ,  20.00118  ,  20.001116 ,  20.551594 ,  21.5222   ,\\n         17.707508 ,  14.86889  ,  19.914494 ,  19.885508 ,  19.914463 ,\\n          0.       ]], dtype=float32)]'"
+      ]
+     },
+     "execution_count": 9,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "print(\"decisionSteps.agent_id\",decisionSteps.agent_id)\n",
+    "# decisionSteps.agent_id [1 2 5 7]\n",
+    "print(\"decisionSteps.agent_id_to_index\",decisionSteps.agent_id_to_index)\n",
+    "# decisionSteps.agent_id_to_index {1: 0, 2: 1, 5: 2, 7: 3}\n",
+    "print(\"decisionSteps.reward\",decisionSteps.reward)\n",
+    "# decisionSteps.reward [0. 0. 0. 0.]\n",
+    "print(\"decisionSteps.action_mask\",decisionSteps.action_mask)\n",
+    "'''\n",
+    "decisionSteps.action_mask [array([[False, False, False],\n",
+    "       [False, False, False],\n",
+    "       [False, False, False],\n",
+    "       [False, False, False]]), array([[False, False, False],\n",
+    "       [False, False, False],\n",
+    "       [False, False, False],\n",
+    "       [False, False, False]]), array([[False, False],\n",
+    "       [False, False],\n",
+    "       [False, False],\n",
+    "       [False, False]])]\n",
+    "'''\n",
+    "print(\"decisionSteps.obs\", decisionSteps.obs[0][0])\n",
+    "'''decisionSteps.obs [array([[-15.994009 ,   1.       , -26.322788 ,   1.       ,   1.       ,\n",
+    "          1.       ,   1.       ,   1.       ,   1.       ,   2.       ,\n",
+    "          1.       ,   1.       ,   1.       ,   1.       ,   1.       ,\n",
+    "          1.       ,   1.       ,   1.3519633,   1.6946528,   2.3051548,\n",
+    "          3.673389 ,   9.067246 ,  17.521473 ,  21.727095 ,  22.753294 ,\n",
+    "         24.167128 ,  25.905216 ,  18.35725  ,  21.02278  ,  21.053417 ,\n",
+    "          0.       ],\n",
+    "       [ -1.8809433,   1.       , -25.66834  ,   1.       ,   2.       ,\n",
+    "          1.       ,   1.       ,   1.       ,   1.       ,   1.       ,\n",
+    "          1.       ,   1.       ,   1.       ,   1.       ,   1.       ,\n",
+    "          1.       ,   1.       ,  16.768637 ,  23.414627 ,  22.04486  ,\n",
+    "         21.050663 ,  20.486784 ,  20.486784 ,  21.050665 ,  15.049731 ,\n",
+    "         11.578419 ,   9.695194 ,  20.398016 ,  20.368341 ,  20.398016 ,\n",
+    "...\n",
+    "         20.551746 ,  20.00118  ,  20.001116 ,  20.551594 ,  21.5222   ,\n",
+    "         17.707508 ,  14.86889  ,  19.914494 ,  19.885508 ,  19.914463 ,\n",
+    "          0.       ]], dtype=float32)]'''\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from AimbotEnv import Aimbot\n",
+    "\n",
+    "ENV_PATH = \"../Build-ParallelEnv/Aimbot-ParallelEnv\"\n",
+    "WORKER_ID = 1\n",
+    "BASE_PORT = 2002\n",
+    "\n",
+    "env = Aimbot(envPath=ENV_PATH,workerID= WORKER_ID,basePort= BASE_PORT)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "(array([[  0.       ,   0.       ,   0.       ,   0.       ,   0.       ,\n",
+       "           0.       ,   0.       ,   0.       ,   0.       ,   0.       ,\n",
+       "           0.       ,   0.       ,   0.       ,   0.       ,   0.       ,\n",
+       "           0.       ,   0.       ,   0.       ,   0.       ,   0.       ,\n",
+       "           0.       ,   0.       ,   0.       ,   0.       ,   0.       ,\n",
+       "           0.       ,   0.       ,   0.       ,   0.       ,   0.       ,\n",
+       "           0.       , -15.994009 ,   1.       , -26.322788 ,   1.       ,\n",
+       "           1.       ,   1.       ,   1.       ,   1.       ,   1.       ,\n",
+       "           2.       ,   1.       ,   1.       ,   1.       ,   1.       ,\n",
+       "           1.       ,   1.       ,   1.       ,   1.3519633,   1.6946528,\n",
+       "           2.3051548,   3.673389 ,   9.067246 ,  17.521473 ,  21.727095 ,\n",
+       "          22.753294 ,  24.167128 ,  25.905216 ,  18.35725  ,  21.02278  ,\n",
+       "          21.053417 ,   0.       , -15.994003 ,   1.       , -26.322784 ,\n",
+       "           1.       ,   1.       ,   1.       ,   1.       ,   1.       ,\n",
+       "           1.       ,   1.       ,   1.       ,   1.       ,   1.       ,\n",
+       "           1.       ,   1.       ,   1.       ,   1.       ,   1.3519667,\n",
+       "           1.6946585,   2.3051722,   3.6734192,   9.067533 ,  21.145092 ,\n",
+       "          21.727148 ,  22.753365 ,  24.167217 ,  25.905317 ,  18.358263 ,\n",
+       "          21.022812 ,  21.053455 ,   0.       ],\n",
+       "        [  0.       ,   0.       ,   0.       ,   0.       ,   0.       ,\n",
+       "           0.       ,   0.       ,   0.       ,   0.       ,   0.       ,\n",
+       "           0.       ,   0.       ,   0.       ,   0.       ,   0.       ,\n",
+       "           0.       ,   0.       ,   0.       ,   0.       ,   0.       ,\n",
+       "           0.       ,   0.       ,   0.       ,   0.       ,   0.       ,\n",
+       "           0.       ,   0.       ,   0.       ,   0.       ,   0.       ,\n",
+       "           0.       ,  -1.8809433,   1.       , -25.66834  ,   1.       ,\n",
+       "           2.       ,   1.       ,   1.       ,   1.       ,   1.       ,\n",
+       "           1.       ,   1.       ,   1.       ,   1.       ,   1.       ,\n",
+       "           1.       ,   1.       ,   1.       ,  16.768637 ,  23.414627 ,\n",
+       "          22.04486  ,  21.050663 ,  20.486784 ,  20.486784 ,  21.050665 ,\n",
+       "          15.049731 ,  11.578419 ,   9.695194 ,  20.398016 ,  20.368341 ,\n",
+       "          20.398016 ,   0.       ,  -1.8809433,   1.       , -25.66834  ,\n",
+       "           1.       ,   1.       ,   2.       ,   1.       ,   1.       ,\n",
+       "           1.       ,   1.       ,   1.       ,   1.       ,   2.       ,\n",
+       "           2.       ,   1.       ,   1.       ,   1.       ,  25.098585 ,\n",
+       "          15.749494 ,  22.044899 ,  21.050697 ,  20.486813 ,  20.486813 ,\n",
+       "          21.050694 ,  15.049746 ,   3.872317 ,   3.789325 ,  20.398046 ,\n",
+       "          20.368372 ,  20.398046 ,   0.       ],\n",
+       "        [  0.       ,   0.       ,   0.       ,   0.       ,   0.       ,\n",
+       "           0.       ,   0.       ,   0.       ,   0.       ,   0.       ,\n",
+       "           0.       ,   0.       ,   0.       ,   0.       ,   0.       ,\n",
+       "           0.       ,   0.       ,   0.       ,   0.       ,   0.       ,\n",
+       "           0.       ,   0.       ,   0.       ,   0.       ,   0.       ,\n",
+       "           0.       ,   0.       ,   0.       ,   0.       ,   0.       ,\n",
+       "           0.       , -13.672583 ,   1.       , -26.479263 ,   1.       ,\n",
+       "           1.       ,   1.       ,   1.       ,   1.       ,   1.       ,\n",
+       "           1.       ,   1.       ,   1.       ,   1.       ,   1.       ,\n",
+       "           1.       ,   1.       ,   1.       ,   5.3249803,   6.401276 ,\n",
+       "           8.374101 ,  12.8657875,  21.302414 ,  21.30242  ,  21.888742 ,\n",
+       "          22.92251  ,  24.346794 ,  26.09773  ,  21.210114 ,  21.179258 ,\n",
+       "          21.210117 ,   0.       , -13.672583 ,   1.       , -26.479263 ,\n",
+       "           1.       ,   1.       ,   1.       ,   1.       ,   1.       ,\n",
+       "           1.       ,   1.       ,   2.       ,   1.       ,   1.       ,\n",
+       "           2.       ,   1.       ,   1.       ,   2.       ,   5.3249855,\n",
+       "           6.4012837,   8.374114 ,  12.865807 ,  21.302446 ,  21.30245  ,\n",
+       "          16.168503 ,  22.922543 ,  24.346823 ,   7.1110754,  21.210148 ,\n",
+       "          21.17929  ,  12.495141 ,   0.       ],\n",
+       "        [  0.       ,   0.       ,   0.       ,   0.       ,   0.       ,\n",
+       "           0.       ,   0.       ,   0.       ,   0.       ,   0.       ,\n",
+       "           0.       ,   0.       ,   0.       ,   0.       ,   0.       ,\n",
+       "           0.       ,   0.       ,   0.       ,   0.       ,   0.       ,\n",
+       "           0.       ,   0.       ,   0.       ,   0.       ,   0.       ,\n",
+       "           0.       ,   0.       ,   0.       ,   0.       ,   0.       ,\n",
+       "           0.       ,  -4.9038744,   1.       , -25.185507 ,   1.       ,\n",
+       "           1.       ,   1.       ,   1.       ,   1.       ,   1.       ,\n",
+       "           1.       ,   1.       ,   1.       ,   1.       ,   1.       ,\n",
+       "           1.       ,   1.       ,   1.       ,  20.33171  ,  22.859762 ,\n",
+       "          21.522427 ,  20.551746 ,  20.00118  ,  20.001116 ,  20.551594 ,\n",
+       "          21.5222   ,  17.707508 ,  14.86889  ,  19.914494 ,  19.885508 ,\n",
+       "          19.914463 ,   0.       ,  -4.9038773,   1.       , -25.185507 ,\n",
+       "           1.       ,   2.       ,   1.       ,   2.       ,   1.       ,\n",
+       "           1.       ,   1.       ,   1.       ,   2.       ,   1.       ,\n",
+       "           1.       ,   1.       ,   1.       ,   1.       ,  15.905993 ,\n",
+       "          22.85977  ,  11.566693 ,  20.551773 ,  20.00121  ,  20.001146 ,\n",
+       "          20.551619 ,   7.135157 ,  17.707582 ,  14.868943 ,  19.914528 ,\n",
+       "          19.88554  ,  19.914494 ,   0.       ]], dtype=float32),\n",
+       " [[-0.05], [-0.05], [-0.05], [-0.05]],\n",
+       " [[False], [False], [False], [False]])"
+      ]
+     },
+     "execution_count": 8,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "import numpy as np\n",
+    "env.unity_observation_shape\n",
+    "(128, 4) + env.unity_observation_shape\n",
+    "env.reset()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "tensor([[1, 2, 3],\n",
+      "        [1, 2, 3],\n",
+      "        [1, 2, 3],\n",
+      "        [1, 2, 3]], device='cuda:0')\n",
+      "tensor([[1],\n",
+      "        [2],\n",
+      "        [3],\n",
+      "        [4]], device='cuda:0')\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "tensor([[1, 2, 3, 1],\n",
+       "        [1, 2, 3, 2],\n",
+       "        [1, 2, 3, 3],\n",
+       "        [1, 2, 3, 4]], device='cuda:0')"
+      ]
+     },
+     "execution_count": 4,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "import torch\n",
+    "aa = torch.tensor([[1,2,3],[1,2,3],[1,2,3],[1,2,3]]).to(\"cuda:0\")\n",
+    "bb = torch.tensor([[1],[2],[3],[4]]).to(\"cuda:0\")\n",
+    "print(aa)\n",
+    "print(bb)\n",
+    "torch.cat([aa,bb],axis = 1)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [
+    {
+     "ename": "AttributeError",
+     "evalue": "Can't get attribute 'PPOAgent' on <module '__main__'>",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
+      "\u001b[1;31mAttributeError\u001b[0m                            Traceback (most recent call last)",
+      "\u001b[1;32m~\\AppData\\Local\\Temp\\ipykernel_31348\\1930153251.py\u001b[0m in \u001b[0;36m<cell line: 2>\u001b[1;34m()\u001b[0m\n\u001b[0;32m      1\u001b[0m \u001b[1;32mimport\u001b[0m \u001b[0mtorch\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 2\u001b[1;33m \u001b[0mmymodel\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mtorch\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mload\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m\"../PPO-Model/SmallArea-256-128-hybrid.pt\"\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m      3\u001b[0m \u001b[0mmymodel\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0meval\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
+      "\u001b[1;32mc:\\Users\\UCUNI\\AppData\\Local\\Programs\\Python\\Python39\\lib\\site-packages\\torch\\serialization.py\u001b[0m in \u001b[0;36mload\u001b[1;34m(f, map_location, pickle_module, **pickle_load_args)\u001b[0m\n\u001b[0;32m    710\u001b[0m                     \u001b[0mopened_file\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mseek\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0morig_position\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    711\u001b[0m                     \u001b[1;32mreturn\u001b[0m \u001b[0mtorch\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mjit\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mload\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mopened_file\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 712\u001b[1;33m                 \u001b[1;32mreturn\u001b[0m \u001b[0m_load\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mopened_zipfile\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mmap_location\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mpickle_module\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mpickle_load_args\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m    713\u001b[0m         \u001b[1;32mreturn\u001b[0m \u001b[0m_legacy_load\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mopened_file\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mmap_location\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mpickle_module\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mpickle_load_args\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    714\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n",
+      "\u001b[1;32mc:\\Users\\UCUNI\\AppData\\Local\\Programs\\Python\\Python39\\lib\\site-packages\\torch\\serialization.py\u001b[0m in \u001b[0;36m_load\u001b[1;34m(zip_file, map_location, pickle_module, pickle_file, **pickle_load_args)\u001b[0m\n\u001b[0;32m   1047\u001b[0m     \u001b[0munpickler\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mUnpicklerWrapper\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdata_file\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mpickle_load_args\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m   1048\u001b[0m     \u001b[0munpickler\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mpersistent_load\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mpersistent_load\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 1049\u001b[1;33m     \u001b[0mresult\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0munpickler\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mload\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m   1050\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m   1051\u001b[0m     \u001b[0mtorch\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_utils\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_validate_loaded_sparse_tensors\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
+      "\u001b[1;32mc:\\Users\\UCUNI\\AppData\\Local\\Programs\\Python\\Python39\\lib\\site-packages\\torch\\serialization.py\u001b[0m in \u001b[0;36mfind_class\u001b[1;34m(self, mod_name, name)\u001b[0m\n\u001b[0;32m   1040\u001b[0m                     \u001b[1;32mpass\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m   1041\u001b[0m             \u001b[0mmod_name\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mload_module_mapping\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mget\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mmod_name\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mmod_name\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 1042\u001b[1;33m             \u001b[1;32mreturn\u001b[0m \u001b[0msuper\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mfind_class\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mmod_name\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mname\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m   1043\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m   1044\u001b[0m     \u001b[1;31m# Load the data (which may in turn use `persistent_load` to load tensors)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
+      "\u001b[1;31mAttributeError\u001b[0m: Can't get attribute 'PPOAgent' on <module '__main__'>"
+     ]
+    }
+   ],
+   "source": [
+    "import torch\n",
+    "\n",
+    "def layer_init(layer, std=np.sqrt(2), bias_const=0.0):\n",
+    "    torch.nn.init.orthogonal_(layer.weight, std)\n",
+    "    torch.nn.init.constant_(layer.bias, bias_const)\n",
+    "    return layer\n",
+    "\n",
+    "class PPOAgent(nn.Module):\n",
+    "    def __init__(self, env: Aimbot):\n",
+    "        super(PPOAgent, self).__init__()\n",
+    "        self.discrete_size = env.unity_discrete_size\n",
+    "        self.discrete_shape = list(env.unity_discrete_branches)\n",
+    "        self.continuous_size = env.unity_continuous_size\n",
+    "\n",
+    "        self.network = nn.Sequential(\n",
+    "            layer_init(nn.Linear(np.array(env.unity_observation_shape).prod(), 256)),\n",
+    "            nn.ReLU(),\n",
+    "            layer_init(nn.Linear(256, 128)),\n",
+    "            nn.ReLU(),\n",
+    "        )\n",
+    "        self.actor_dis = layer_init(nn.Linear(128, self.discrete_size), std=0.01)\n",
+    "        self.actor_mean = layer_init(nn.Linear(128, self.continuous_size), std=0.01)\n",
+    "        self.actor_logstd = nn.Parameter(torch.zeros(1, self.continuous_size))\n",
+    "        self.critic = layer_init(nn.Linear(128, 1), std=1)\n",
+    "\n",
+    "    def get_value(self, state: torch.Tensor):\n",
+    "        return self.critic(self.network(state))\n",
+    "\n",
+    "    def get_actions_value(self, state: torch.Tensor, actions=None):\n",
+    "        hidden = self.network(state)\n",
+    "        # discrete\n",
+    "        dis_logits = self.actor_dis(hidden)\n",
+    "        split_logits = torch.split(dis_logits, self.discrete_shape, dim=1)\n",
+    "        multi_categoricals = [Categorical(logits=thisLogits) for thisLogits in split_logits]\n",
+    "        # continuous\n",
+    "        actions_mean = self.actor_mean(hidden)\n",
+    "        action_logstd = self.actor_logstd.expand_as(actions_mean)\n",
+    "        action_std = torch.exp(action_logstd)\n",
+    "        con_probs = Normal(actions_mean, action_std)\n",
+    "\n",
+    "        if actions is None:\n",
+    "            disAct = torch.stack([ctgr.sample() for ctgr in multi_categoricals])\n",
+    "            conAct = con_probs.sample()\n",
+    "            actions = torch.cat([disAct.T, conAct], dim=1)\n",
+    "        else:\n",
+    "            disAct = actions[:, 0 : env.unity_discrete_type].T\n",
+    "            conAct = actions[:, env.unity_discrete_type :]\n",
+    "        dis_log_prob = torch.stack(\n",
+    "            [ctgr.log_prob(act) for act, ctgr in zip(disAct, multi_categoricals)]\n",
+    "        )\n",
+    "        dis_entropy = torch.stack([ctgr.entropy() for ctgr in multi_categoricals])\n",
+    "        return (\n",
+    "            actions,\n",
+    "            dis_log_prob.sum(0),\n",
+    "            dis_entropy.sum(0),\n",
+    "            con_probs.log_prob(conAct).sum(1),\n",
+    "            con_probs.entropy().sum(1),\n",
+    "            self.critic(hidden),\n",
+    "        )\n",
+    "\n",
+    "\n",
+    "mymodel = torch.load(\"../PPO-Model/SmallArea-256-128-hybrid.pt\")\n",
+    "mymodel.eval()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "x : torch.Size([2, 3, 4])\n",
+      "x : torch.Size([6, 2, 3, 4])\n",
+      "x : torch.Size([6, 2, 3, 4])\n"
+     ]
+    }
+   ],
+   "source": [
+    "import torch\n",
+    "#1\n",
+    "x = torch.randn(2, 1, 1)# dimensions of size 1 can be expanded to 3 and 4\n",
+    "x = x.expand(2, 3, 4)\n",
+    "print('x :', x.size())\n",
+    "\n",
+    "#2\n",
+    "# A new dimension can only be added at the front; otherwise expand raises an error\n",
+    "#x = x.expand(2, 3, 4, 6)\n",
+    "\n",
+    "x = x.expand(6, 2, 3, 4)\n",
+    "print('x :', x.size())\n",
+    "\n",
+    "#3\n",
+    "# Passing -1 for a dimension leaves the size of that dimension unchanged\n",
+    "x = x.expand(6, -1, -1, -1)\n",
+    "print('x :', x.size())\n",
+    "\n",
+    "x : torch.Size([2, 3, 4])\n",
+    "x : torch.Size([6, 2, 3, 4])\n",
+    "x : torch.Size([6, 2, 3, 4])"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3.9.7 64-bit",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.9.7"
+  },
+  "orig_nbformat": 4,
+  "vscode": {
+   "interpreter": {
+    "hash": "86e2db13b09bd6be22cb599ea60c1572b9ef36ebeaa27a4c8e961d6df315ac32"
+   }
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
@@ -1,5 +0,0 @@
-import numpy as np
-
-aa = np.array([1,2,3,4,5,6,7,8,9,10])
-
-print(aa)