Merge branch 'OffP-PartialMNN-review' into OffP-PartialMNN

Koha9 2023-07-22 19:30:29 +09:00
commit ef0ee495f2
12 changed files with 763 additions and 476 deletions

.vscode/settings.json vendored Normal file

@ -0,0 +1,5 @@
{
"python.linting.enabled": false,
"python.analysis.typeCheckingMode": "off",
"commentTranslate.source": "intellsmi.deepl-translate-deepl"
}

aimbotEnv.py

@ -1,9 +1,16 @@
import gym import gym
import numpy as np import numpy as np
import uuid
import airecorder
from numpy import ndarray from numpy import ndarray
from mlagents_envs.base_env import ActionTuple from mlagents_envs.base_env import ActionTuple
from mlagents_envs.environment import UnityEnvironment from mlagents_envs.environment import UnityEnvironment
from typing import Tuple, List
from mlagents_envs.side_channel.side_channel import (
SideChannel,
IncomingMessage,
OutgoingMessage,
)
class Aimbot(gym.Env): class Aimbot(gym.Env):
@ -61,7 +68,7 @@ class Aimbot(gym.Env):
# agents number # agents number
self.unity_agent_num = len(self.unity_agent_IDS) self.unity_agent_num = len(self.unity_agent_IDS)
def reset(self): def reset(self)->Tuple[np.ndarray, List, List]:
"""reset enviroment and get observations """reset enviroment and get observations
Returns: Returns:
@ -69,7 +76,7 @@ class Aimbot(gym.Env):
""" """
# reset env # reset env
self.env.reset() self.env.reset()
nextState, reward, done = self.getSteps() nextState, reward, done = self.get_steps()
return nextState, reward, done return nextState, reward, done
# TODO: # TODO:
@ -80,7 +87,7 @@ class Aimbot(gym.Env):
def step( def step(
self, self,
actions: ndarray, actions: ndarray,
): )->Tuple[np.ndarray, List, List]:
"""change ations list to ActionTuple then send it to enviroment """change ations list to ActionTuple then send it to enviroment
Args: Args:
@ -114,10 +121,10 @@ class Aimbot(gym.Env):
self.env.set_actions(behavior_name=self.unity_beha_name, action=thisActionTuple) self.env.set_actions(behavior_name=self.unity_beha_name, action=thisActionTuple)
self.env.step() self.env.step()
# get nextState & reward & done after this action # get nextState & reward & done after this action
nextStates, rewards, dones = self.getSteps() nextStates, rewards, dones = self.get_steps()
return nextStates, rewards, dones return nextStates, rewards, dones
def getSteps(self): def get_steps(self)->Tuple[np.ndarray, List, List]:
"""get enviroment now observations. """get enviroment now observations.
Include State, Reward, Done Include State, Reward, Done
@ -127,28 +134,92 @@ class Aimbot(gym.Env):
ndarray: nextState, reward, done ndarray: nextState, reward, done
""" """
# get nextState & reward & done # get nextState & reward & done
decisionSteps, terminalSteps = self.env.get_steps(self.unity_beha_name) decision_steps, terminal_steps = self.env.get_steps(self.unity_beha_name)
nextStates = [] next_states = []
dones = [] dones = []
rewards = [] rewards = []
for thisAgentID in self.unity_agent_IDS: for this_agent_ID in self.unity_agent_IDS:
# when an episode is over, the agent ID appears in both decisionSteps and terminalSteps. # when an episode is over, the agent ID appears in both decisionSteps and terminalSteps.
# avoid redundant state and reward, # avoid redundant state and reward,
# use the agentExist flag to check whether this agent has already been handled. # use the agentExist flag to check whether this agent has already been handled.
agentExist = False agent_exist = False
# game done # game done
if thisAgentID in terminalSteps: if this_agent_ID in terminal_steps:
nextStates.append(terminalSteps[thisAgentID].obs[0]) next_states.append(terminal_steps[this_agent_ID].obs[0])
dones.append(True) dones.append(True)
rewards.append(terminalSteps[thisAgentID].reward) rewards.append(terminal_steps[this_agent_ID].reward)
agentExist = True agent_exist = True
# game not over yet and agent not in terminalSteps # game not over yet and agent not in terminalSteps
if (thisAgentID in decisionSteps) and (not agentExist): if (this_agent_ID in decision_steps) and (not agent_exist):
nextStates.append(decisionSteps[thisAgentID].obs[0]) next_states.append(decision_steps[this_agent_ID].obs[0])
dones.append(False) dones.append(False)
rewards.append(decisionSteps[thisAgentID].reward) rewards.append(decision_steps[this_agent_ID].reward)
return np.asarray(nextStates), rewards, dones return np.asarray(next_states), rewards, dones
def close(self): def close(self):
self.env.close() self.env.close()
class AimbotSideChannel(SideChannel):
def __init__(self, channel_id: uuid.UUID) -> None:
super().__init__(channel_id)
def on_message_received(self, msg: IncomingMessage) -> None:
"""
Note: We must implement this method of the SideChannel interface to
receive messages from Unity
Message will be sent like this:
"Warning|Message1|Message2|Message3" or
"Error|Message1|Message2|Message3"
"""
this_message = msg.read_string()
this_result = this_message.split("|")
if(this_result[0] == "result"):
airecorder.total_rounds[this_result[1]]+=1
if(this_result[2] == "Win"):
airecorder.win_rounds[this_result[1]]+=1
#print(TotalRounds)
#print(WinRounds)
elif(this_result[0] == "Error"):
print(this_message)
# # while Message type is Warning
# if(thisResult[0] == "Warning"):
# # while Message1 is result means one game is over
# if (thisResult[1] == "Result"):
# TotalRounds[thisResult[2]]+=1
# # while Message3 is Win means this agent win this game
# if(thisResult[3] == "Win"):
# WinRounds[thisResult[2]]+=1
# # while Message1 is GameState means this game is just start
# # and tell python which game mode is
# elif (thisResult[1] == "GameState"):
# SCrecieved = 1
# # while Message type is Error
# elif(thisResult[0] == "Error"):
# print(thisMessage)
# message-sending helpers
def send_string(self, data: str) -> None:
# send a string to C#
msg = OutgoingMessage()
msg.write_string(data)
super().queue_message_to_send(msg)
def send_bool(self, data: bool) -> None:
msg = OutgoingMessage()
msg.write_bool(data)
super().queue_message_to_send(msg)
def send_int(self, data: int) -> None:
msg = OutgoingMessage()
msg.write_int32(data)
super().queue_message_to_send(msg)
def send_float(self, data: float) -> None:
msg = OutgoingMessage()
msg.write_float32(data)
super().queue_message_to_send(msg)
def send_float_list(self, data: List[float]) -> None:
msg = OutgoingMessage()
msg.write_float32_list(data)
super().queue_message_to_send(msg)
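For reference, a minimal usage sketch (not part of this commit) showing how this AimbotSideChannel might be registered with the Aimbot environment; the UUID value and build path below are placeholders, and the outgoing message payload is only an example:
# Hypothetical wiring of AimbotSideChannel into Aimbot (UUID and path are placeholders).
import uuid
from aimbotEnv import Aimbot, AimbotSideChannel

SIDE_CHANNEL_UUID = uuid.UUID("8bbfb62a-99b4-457c-879d-b78b69066b5e")  # placeholder UUID
side_channel = AimbotSideChannel(SIDE_CHANNEL_UUID)
env = Aimbot(
    envPath="../Build/Aimbot-ParallelEnv",  # placeholder build path
    workerID=1,
    basePort=1000,
    side_channels=[side_channel],
)
state, reward, done = env.reset()
# incoming 'result|<target>|Win' messages increment airecorder.total_rounds / win_rounds
side_channel.send_string("example|message")  # illustrative outgoing payload
env.close()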


@ -107,6 +107,97 @@
")\n", ")\n",
"from typing import List\n" "from typing import List\n"
] ]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"ename": "AttributeError",
"evalue": "'aaa' object has no attribute 'outa'",
"output_type": "error",
"traceback": [
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[1;31mAttributeError\u001b[0m Traceback (most recent call last)",
"Cell \u001b[1;32mIn[5], line 14\u001b[0m\n\u001b[0;32m 12\u001b[0m asd \u001b[39m=\u001b[39m aaa(outa, outb)\n\u001b[0;32m 13\u001b[0m asd\u001b[39m.\u001b[39mfunc()\n\u001b[1;32m---> 14\u001b[0m \u001b[39mprint\u001b[39m(asd\u001b[39m.\u001b[39;49mouta) \u001b[39m# 输出 100\u001b[39;00m\n",
"\u001b[1;31mAttributeError\u001b[0m: 'aaa' object has no attribute 'outa'"
]
}
],
"source": [
"class aaa():\n",
" def __init__(self, a, b):\n",
" self.a = a\n",
" self.b = b\n",
"\n",
" def func(self):\n",
" global outa\n",
" outa = 100\n",
"\n",
"outa = 1\n",
"outb = 2\n",
"asd = aaa(outa, outb)\n",
"asd.func()\n",
"print(asd.outa) # 输出 100"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"usage: ipykernel_launcher.py [-h] [--seed SEED]\n",
"ipykernel_launcher.py: error: unrecognized arguments: --ip=127.0.0.1 --stdin=9003 --control=9001 --hb=9000 --Session.signature_scheme=\"hmac-sha256\" --Session.key=b\"46ef9317-59fb-4ab6-ae4e-6b35744fc423\" --shell=9002 --transport=\"tcp\" --iopub=9004 --f=c:\\Users\\UCUNI\\AppData\\Roaming\\jupyter\\runtime\\kernel-v2-311926K1uko38tdWb.json\n"
]
},
{
"ename": "SystemExit",
"evalue": "2",
"output_type": "error",
"traceback": [
"An exception has occurred, use %tb to see the full traceback.\n",
"\u001b[1;31mSystemExit\u001b[0m\u001b[1;31m:\u001b[0m 2\n"
]
}
],
"source": [
"import argparse\n",
"\n",
"def parse_args():\n",
" parser = argparse.ArgumentParser()\n",
" parser.add_argument(\"--seed\", type=int, default=11,\n",
" help=\"seed of the experiment\")\n",
" args = parser.parse_args()\n",
" return args\n",
"\n",
"arggg = parse_args()\n",
"print(type(arggg))"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(1.2, 3.2)\n",
"1.2\n"
]
}
],
"source": [
"aaa = (1.2,3.2)\n",
"print(aaa)\n",
"print(aaa[0])"
]
} }
], ],
"metadata": { "metadata": {
@ -125,7 +216,7 @@
"name": "python", "name": "python",
"nbconvert_exporter": "python", "nbconvert_exporter": "python",
"pygments_lexer": "ipython3", "pygments_lexer": "ipython3",
"version": "3.11.3" "version": "3.9.17"
}, },
"orig_nbformat": 4 "orig_nbformat": 4
}, },
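The SystemExit captured in the argparse cell above is raised because the Jupyter kernel's own command-line flags reach parse_args(); a common workaround (not part of this commit) is argparse's parse_known_args(), which ignores the unrecognized kernel arguments:
import argparse

def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument("--seed", type=int, default=11,
                        help="seed of the experiment")
    # parse_known_args() returns (namespace, leftover_argv) and does not exit
    # on unknown flags such as --ip / --stdin injected by the kernel.
    args, _ = parser.parse_known_args()
    return args

arggg = parse_args()
print(type(arggg))  # <class 'argparse.Namespace'>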


@ -62,7 +62,6 @@
"outputs": [], "outputs": [],
"source": [ "source": [
"from mlagents_envs.environment import UnityEnvironment\n", "from mlagents_envs.environment import UnityEnvironment\n",
"from gym_unity.envs import UnityToGymWrapper\n",
"import numpy as np\n", "import numpy as np\n",
"\n", "\n",
"ENV_PATH = \"../Build-ParallelEnv/Aimbot-ParallelEnv\"\n", "ENV_PATH = \"../Build-ParallelEnv/Aimbot-ParallelEnv\"\n",
@ -368,6 +367,7 @@
], ],
"source": [ "source": [
"import torch\n", "import torch\n",
"from torch import nn\n",
"\n", "\n",
"def layer_init(layer, std=np.sqrt(2), bias_const=0.0):\n", "def layer_init(layer, std=np.sqrt(2), bias_const=0.0):\n",
" torch.nn.init.orthogonal_(layer.weight, std)\n", " torch.nn.init.orthogonal_(layer.weight, std)\n",
@ -1248,6 +1248,24 @@
"saveDir = \"C:/Users/UCUNI/OneDrive/Unity/ML-Agents/Aimbot-PPO/Aimbot-PPO-Python/PPO-Model/Chimera-1677965178-1678547500.pt\"\n", "saveDir = \"C:/Users/UCUNI/OneDrive/Unity/ML-Agents/Aimbot-PPO/Aimbot-PPO-Python/PPO-Model/Chimera-1677965178-1678547500.pt\"\n",
"torch.save(badGotoAgent,saveDir)" "torch.save(badGotoAgent,saveDir)"
] ]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"True\n"
]
}
],
"source": [
"import torch\n",
"print(torch.cuda.is_available())"
]
} }
], ],
"metadata": { "metadata": {
@ -1266,7 +1284,7 @@
"name": "python", "name": "python",
"nbconvert_exporter": "python", "nbconvert_exporter": "python",
"pygments_lexer": "ipython3", "pygments_lexer": "ipython3",
"version": "3.9.7" "version": "3.9.17"
}, },
"orig_nbformat": 4, "orig_nbformat": 4,
"vscode": { "vscode": {


@ -1,5 +1,4 @@
import argparse import argparse
import wandb
import time import time
import numpy as np import numpy as np
import random import random
@ -9,24 +8,17 @@ import torch.nn as nn
import torch.optim as optim import torch.optim as optim
import atexit import atexit
from AimbotEnv import Aimbot
from tqdm import tqdm from aimbotEnv import Aimbot
from aimbotEnv import AimbotSideChannel
from ppoagent import PPOAgent
from airecorder import WandbRecorder
from aimemory import PPOMem
from aimemory import Targets
from enum import Enum from enum import Enum
from torch.distributions.normal import Normal
from torch.distributions.categorical import Categorical
from distutils.util import strtobool from distutils.util import strtobool
from torch.utils.tensorboard import SummaryWriter
from mlagents_envs.environment import UnityEnvironment
from mlagents_envs.side_channel.side_channel import (
SideChannel,
IncomingMessage,
OutgoingMessage,
)
from typing import List
bestReward = -1 best_reward = -1
SCrecieved = 0
DEFAULT_SEED = 9331 DEFAULT_SEED = 9331
ENV_PATH = "../Build/2.9/Goto-NonNormalization/Aimbot-ParallelEnv" ENV_PATH = "../Build/2.9/Goto-NonNormalization/Aimbot-ParallelEnv"
@ -36,8 +28,8 @@ WORKER_ID = 1
BASE_PORT = 1000 BASE_PORT = 1000
# tensorboard names # tensorboard names
game_name = "Aimbot_Target_Hybrid_PMNN_V3" GAME_NAME = "Aimbot_Target_Hybrid_PMNN_V3"
game_type = "Mix_Verification" GAME_TYPE = "Mix_Verification"
# max round steps per agent is 2500/Decision_period, 25 seconds # max round steps per agent is 2500/Decision_period, 25 seconds
# !!!check every parameters before run!!! # !!!check every parameters before run!!!
@ -62,19 +54,12 @@ BROADCASTREWARD = False
ANNEAL_LEARNING_RATE = True ANNEAL_LEARNING_RATE = True
CLIP_VLOSS = True CLIP_VLOSS = True
NORM_ADV = False NORM_ADV = False
TRAIN = False TRAIN = True
SAVE_MODEL = False SAVE_MODEL = False
WANDB_TACK = True WANDB_TACK = False
LOAD_DIR = None LOAD_DIR = None
LOAD_DIR = "../PPO-Model/PList_Go_LeakyReLU_9331_1677965178_bestGoto/PList_Go_LeakyReLU_9331_1677965178_10.709002.pt" #LOAD_DIR = "../PPO-Model/PList_Go_LeakyReLU_9331_1677965178_bestGoto/PList_Go_LeakyReLU_9331_1677965178_10.709002.pt"
# public data
class Targets(Enum):
Free = 0
Go = 1
Attack = 2
Defence = 3
Num = 4
TARGET_STATE_SIZE = 6 TARGET_STATE_SIZE = 6
INAREA_STATE_SIZE = 1 INAREA_STATE_SIZE = 1
TIME_STATE_SIZE = 1 TIME_STATE_SIZE = 1
@ -86,8 +71,6 @@ BASE_LOSEREWARD = -999
TARGETNUM= 4 TARGETNUM= 4
ENV_TIMELIMIT = 30 ENV_TIMELIMIT = 30
RESULT_BROADCAST_RATIO = 1/ENV_TIMELIMIT RESULT_BROADCAST_RATIO = 1/ENV_TIMELIMIT
TotalRounds = {"Free":0,"Go":0,"Attack":0}
WinRounds = {"Free":0,"Go":0,"Attack":0}
# !!!SPECIAL PARAMETERS!!! # !!!SPECIAL PARAMETERS!!!
# change it while program is finished # change it while program is finished
@ -168,230 +151,6 @@ def parse_args():
return args return args
def layer_init(layer, std=np.sqrt(2), bias_const=0.0):
torch.nn.init.orthogonal_(layer.weight, std)
torch.nn.init.constant_(layer.bias, bias_const)
return layer
class PPOAgent(nn.Module):
def __init__(self, env: Aimbot,targetNum:int):
super(PPOAgent, self).__init__()
self.targetNum = targetNum
self.stateSize = env.unity_observation_shape[0]
self.agentNum = env.unity_agent_num
self.targetSize = TARGET_STATE_SIZE
self.timeSize = TIME_STATE_SIZE
self.gunSize = GUN_STATE_SIZE
self.myStateSize = MY_STATE_SIZE
self.raySize = env.unity_observation_shape[0] - TOTAL_T_SIZE
self.nonRaySize = TOTAL_T_SIZE
self.head_input_size = env.unity_observation_shape[0] - self.targetSize-self.timeSize-self.gunSize# except target state input
self.discrete_size = env.unity_discrete_size
self.discrete_shape = list(env.unity_discrete_branches)
self.continuous_size = env.unity_continuous_size
self.viewNetwork = nn.Sequential(
layer_init(nn.Linear(self.raySize, 200)),
nn.LeakyReLU()
)
self.targetNetworks = nn.ModuleList([nn.Sequential(
layer_init(nn.Linear(self.nonRaySize, 100)),
nn.LeakyReLU()
)for i in range(targetNum)])
self.middleNetworks = nn.ModuleList([nn.Sequential(
layer_init(nn.Linear(300,200)),
nn.LeakyReLU()
)for i in range(targetNum)])
self.actor_dis = nn.ModuleList([layer_init(nn.Linear(200, self.discrete_size), std=0.5) for i in range(targetNum)])
self.actor_mean = nn.ModuleList([layer_init(nn.Linear(200, self.continuous_size), std=0.5) for i in range(targetNum)])
# self.actor_logstd = nn.ModuleList([layer_init(nn.Linear(200, self.continuous_size), std=1) for i in range(targetNum)])
# self.actor_logstd = nn.Parameter(torch.zeros(1, self.continuous_size))
self.actor_logstd = nn.ParameterList([nn.Parameter(torch.zeros(1,self.continuous_size))for i in range(targetNum)]) # nn.Parameter(torch.zeros(1, self.continuous_size))
self.critic = nn.ModuleList([layer_init(nn.Linear(200, 1), std=1)for i in range(targetNum)])
def get_value(self, state: torch.Tensor):
target = state[:,0].to(torch.int32) # int
thisStateNum = target.size()[0]
viewInput = state[:,-self.raySize:] # all ray input
targetInput = state[:,:self.nonRaySize]
viewLayer = self.viewNetwork(viewInput)
targetLayer = torch.stack([self.targetNetworks[target[i]](targetInput[i]) for i in range(thisStateNum)])
middleInput = torch.cat([viewLayer,targetLayer],dim = 1)
middleLayer = torch.stack([self.middleNetworks[target[i]](middleInput[i]) for i in range(thisStateNum)])
criticV = torch.stack([self.critic[target[i]](middleLayer[i]) for i in range(thisStateNum)]) # self.critic
return criticV
def get_actions_value(self, state: torch.Tensor, actions=None):
target = state[:,0].to(torch.int32) # int
thisStateNum = target.size()[0]
viewInput = state[:,-self.raySize:] # all ray input
targetInput = state[:,:self.nonRaySize]
viewLayer = self.viewNetwork(viewInput)
targetLayer = torch.stack([self.targetNetworks[target[i]](targetInput[i]) for i in range(thisStateNum)])
middleInput = torch.cat([viewLayer,targetLayer],dim = 1)
middleLayer = torch.stack([self.middleNetworks[target[i]](middleInput[i]) for i in range(thisStateNum)])
# discrete
# iterate over the targets (i.e. the number of agents in the batch) and route each state through the output head that matches its target
dis_logits = torch.stack([self.actor_dis[target[i]](middleLayer[i]) for i in range(thisStateNum)])
split_logits = torch.split(dis_logits, self.discrete_shape, dim=1)
multi_categoricals = [Categorical(logits=thisLogits) for thisLogits in split_logits]
# continuous
actions_mean = torch.stack([self.actor_mean[target[i]](middleLayer[i]) for i in range(thisStateNum)]) # self.actor_mean(hidden)
# action_logstd = torch.stack([self.actor_logstd[target[i]](middleLayer[i]) for i in range(thisStateNum)]) # self.actor_logstd(hidden)
# action_logstd = self.actor_logstd.expand_as(actions_mean) # self.actor_logstd.expand_as(actions_mean)
action_logstd = torch.stack([torch.squeeze(self.actor_logstd[target[i]],0) for i in range(thisStateNum)])
# print(action_logstd)
action_std = torch.exp(action_logstd) # torch.exp(action_logstd)
con_probs = Normal(actions_mean, action_std)
# critic
criticV = torch.stack([self.critic[target[i]](middleLayer[i]) for i in range(thisStateNum)]) # self.critic
if actions is None:
if args.train:
# select actions base on probability distribution model
disAct = torch.stack([ctgr.sample() for ctgr in multi_categoricals])
conAct = con_probs.sample()
actions = torch.cat([disAct.T, conAct], dim=1)
else:
# select actions base on best probability distribution
# disAct = torch.stack([torch.argmax(logit, dim=1) for logit in split_logits])
conAct = actions_mean
disAct = torch.stack([ctgr.sample() for ctgr in multi_categoricals])
conAct = con_probs.sample()
actions = torch.cat([disAct.T, conAct], dim=1)
else:
disAct = actions[:, 0 : env.unity_discrete_type].T
conAct = actions[:, env.unity_discrete_type :]
dis_log_prob = torch.stack(
[ctgr.log_prob(act) for act, ctgr in zip(disAct, multi_categoricals)]
)
dis_entropy = torch.stack([ctgr.entropy() for ctgr in multi_categoricals])
return (
actions,
dis_log_prob.sum(0),
dis_entropy.sum(0),
con_probs.log_prob(conAct).sum(1),
con_probs.entropy().sum(1),
criticV,
)
def GAE(agent, args, rewards, dones, values, next_obs, next_done):
# GAE
with torch.no_grad():
next_value = agent.get_value(next_obs).reshape(1, -1)
data_size = rewards.size()[0]
if args.gae:
advantages = torch.zeros_like(rewards).to(device)
lastgaelam = 0
for t in reversed(range(data_size)):
if t == data_size - 1:
nextnonterminal = 1.0 - next_done
nextvalues = next_value
else:
nextnonterminal = 1.0 - dones[t + 1]
nextvalues = values[t + 1]
delta = rewards[t] + args.gamma * nextvalues * nextnonterminal - values[t]
advantages[t] = lastgaelam = (
delta + args.gamma * args.gaeLambda * nextnonterminal * lastgaelam
)
returns = advantages + values
else:
returns = torch.zeros_like(rewards).to(device)
for t in reversed(range(data_size)):
if t == data_size - 1:
nextnonterminal = 1.0 - next_done
next_return = next_value
else:
nextnonterminal = 1.0 - dones[t + 1]
next_return = returns[t + 1]
returns[t] = rewards[t] + args.gamma * nextnonterminal * next_return
advantages = returns - values
return advantages, returns
class AimbotSideChannel(SideChannel):
def __init__(self, channel_id: uuid.UUID) -> None:
super().__init__(channel_id)
def on_message_received(self, msg: IncomingMessage) -> None:
global SCrecieved # make sure this variable is global
"""
Note: We must implement this method of the SideChannel interface to
receive messages from Unity
Message will be sent like this:
"Warning|Message1|Message2|Message3" or
"Error|Message1|Message2|Message3"
"""
thisMessage = msg.read_string()
thisResult = thisMessage.split("|")
if(thisResult[0] == "result"):
TotalRounds[thisResult[1]]+=1
if(thisResult[2] == "Win"):
WinRounds[thisResult[1]]+=1
#print(TotalRounds)
#print(WinRounds)
elif(thisResult[0] == "Error"):
print(thisMessage)
# # while Message type is Warning
# if(thisResult[0] == "Warning"):
# # while Message1 is result means one game is over
# if (thisResult[1] == "Result"):
# TotalRounds[thisResult[2]]+=1
# # while Message3 is Win means this agent win this game
# if(thisResult[3] == "Win"):
# WinRounds[thisResult[2]]+=1
# # while Message1 is GameState means this game is just start
# # and tell python which game mode is
# elif (thisResult[1] == "GameState"):
# SCrecieved = 1
# # while Message type is Error
# elif(thisResult[0] == "Error"):
# print(thisMessage)
# message-sending helpers
def send_string(self, data: str) -> None:
# send a string to C#
msg = OutgoingMessage()
msg.write_string(data)
super().queue_message_to_send(msg)
def send_bool(self, data: bool) -> None:
msg = OutgoingMessage()
msg.write_bool(data)
super().queue_message_to_send(msg)
def send_int(self, data: int) -> None:
msg = OutgoingMessage()
msg.write_int32(data)
super().queue_message_to_send(msg)
def send_float(self, data: float) -> None:
msg = OutgoingMessage()
msg.write_float32(data)
super().queue_message_to_send(msg)
def send_float_list(self, data: List[float]) -> None:
msg = OutgoingMessage()
msg.write_float32_list(data)
super().queue_message_to_send(msg)
def broadCastEndReward(rewardBF:list,remainTime:float):
thisRewardBF = rewardBF
if (rewardBF[-1]<=-500):
# print("Lose DO NOT BROAD CAST",rewardBF[-1])
thisRewardBF[-1] = rewardBF[-1]-BASE_LOSEREWARD
elif (rewardBF[-1]>=500):
# print("Win! Broadcast reward!",rewardBF[-1])
print(sum(thisRewardBF)/len(thisRewardBF))
thisRewardBF[-1] = rewardBF[-1]-BASE_WINREWARD
thisRewardBF = (np.asarray(thisRewardBF)+(remainTime*args.result_broadcast_ratio)).tolist()
else:
print("!!!!!DIDNT GET RESULT REWARD!!!!!!",rewardBF[-1])
return torch.Tensor(thisRewardBF).to(device)
if __name__ == "__main__": if __name__ == "__main__":
args = parse_args() args = parse_args()
random.seed(args.seed) random.seed(args.seed)
@ -401,10 +160,21 @@ if __name__ == "__main__":
device = torch.device("cuda" if torch.cuda.is_available() and args.cuda else "cpu") device = torch.device("cuda" if torch.cuda.is_available() and args.cuda else "cpu")
# Initialize environment, agent and optimizer # Initialize environment, agent and optimizer
aimBotsideChannel = AimbotSideChannel(SIDE_CHANNEL_UUID); aimbot_sidechannel = AimbotSideChannel(SIDE_CHANNEL_UUID);
env = Aimbot(envPath=args.path, workerID=args.workerID, basePort=args.baseport,side_channels=[aimBotsideChannel]) env = Aimbot(envPath=args.path, workerID=args.workerID, basePort=args.baseport,side_channels=[aimbot_sidechannel])
if args.load_dir is None: if args.load_dir is None:
agent = PPOAgent(env,TARGETNUM).to(device) agent = PPOAgent(
env = env,
this_args=args,
train_agent=args.train,
target_num=TARGETNUM,
target_state_size= TARGET_STATE_SIZE,
time_state_size=TIME_STATE_SIZE,
gun_state_size=GUN_STATE_SIZE,
my_state_size=MY_STATE_SIZE,
total_t_size=TOTAL_T_SIZE,
device=device,
).to(device)
else: else:
agent = torch.load(args.load_dir) agent = torch.load(args.load_dir)
# freeze # freeze
@ -419,24 +189,8 @@ if __name__ == "__main__":
optimizer = optim.Adam(agent.parameters(), lr=args.lr, eps=1e-5) optimizer = optim.Adam(agent.parameters(), lr=args.lr, eps=1e-5)
# Tensorboard and WandB Recorder # Tensorboard and WandB Recorder
run_name = f"{game_type}_{args.seed}_{int(time.time())}" run_name = f"{GAME_TYPE}_{args.seed}_{int(time.time())}"
if args.wandb_track: wdb_recorder = WandbRecorder(GAME_NAME, GAME_TYPE, run_name, args)
wandb.init(
project=game_name,
entity=args.wandb_entity,
sync_tensorboard=True,
config=vars(args),
name=run_name,
monitor_gym=True,
save_code=True,
)
writer = SummaryWriter(f"runs/{run_name}")
writer.add_text(
"hyperparameters",
"|param|value|\n|-|-|\n%s"
% ("\n".join([f"|{key}|{value}|" for key, value in vars(args).items()])),
)
@atexit.register @atexit.register
def save_model(): def save_model():
@ -444,60 +198,49 @@ if __name__ == "__main__":
env.close() env.close()
if args.save_model: if args.save_model:
# save model while exit # save model while exit
saveDir = "../PPO-Model/"+ run_name + "_last.pt" save_dir = "../PPO-Model/"+ run_name + "_last.pt"
torch.save(agent, saveDir) torch.save(agent, save_dir)
print("save model to " + saveDir) print("save model to " + save_dir)
# Trajectory Buffer
ob_bf = [[] for i in range(env.unity_agent_num)]
act_bf = [[] for i in range(env.unity_agent_num)]
dis_logprobs_bf = [[] for i in range(env.unity_agent_num)]
con_logprobs_bf = [[] for i in range(env.unity_agent_num)]
rewards_bf = [[] for i in range(env.unity_agent_num)]
dones_bf = [[] for i in range(env.unity_agent_num)]
values_bf = [[] for i in range(env.unity_agent_num)]
# start the game # start the game
total_update_step = using_targets_num * args.total_timesteps // args.datasetSize total_update_step = using_targets_num * args.total_timesteps // args.datasetSize
target_steps = [0 for i in range(TARGETNUM)] target_steps = [0 for i in range(TARGETNUM)]
start_time = time.time() start_time = time.time()
state, _, done = env.reset() state, _, done = env.reset()
# state = torch.Tensor(next_obs).to(device)
# next_done = torch.zeros(env.unity_agent_num).to(device)
# initialize empty training datasets # initialize AI memories
obs = [torch.tensor([]).to(device) for i in range(TARGETNUM)] # (TARGETNUM,n,env.unity_observation_size) ppo_memories = PPOMem(
actions = [torch.tensor([]).to(device) for i in range(TARGETNUM)] # (TARGETNUM,n,env.unity_action_size) env = env,
dis_logprobs = [torch.tensor([]).to(device) for i in range(TARGETNUM)] # (TARGETNUM,n,1) device = device,
con_logprobs = [torch.tensor([]).to(device) for i in range(TARGETNUM)] # (TARGETNUM,n,1) args=args,
rewards = [torch.tensor([]).to(device) for i in range(TARGETNUM)] # (TARGETNUM,n,1) target_num = TARGETNUM,
values = [torch.tensor([]).to(device) for i in range(TARGETNUM)] # (TARGETNUM,n,1) target_state_size = TARGET_STATE_SIZE,
advantages = [torch.tensor([]).to(device) for i in range(TARGETNUM)] # (TARGETNUM,n,1) base_lose_reward = BASE_LOSEREWARD,
returns = [torch.tensor([]).to(device) for i in range(TARGETNUM)] # (TARGETNUM,n,1) base_win_reward = BASE_WINREWARD,
)
for total_steps in range(total_update_step): for total_steps in range(total_update_step):
# decay learning rate; when step == total_update_step, lr will be 0 # decay learning rate; when step == total_update_step, lr will be 0
if args.annealLR: if args.annealLR:
finalRatio = TARGET_LEARNING_RATE/args.lr final_lr_ratio = TARGET_LEARNING_RATE/args.lr
frac = 1.0 - ((total_steps + 1.0) / total_update_step) frac = 1.0 - ((total_steps + 1.0) / total_update_step)
lrnow = frac * args.lr lr_now = frac * args.lr
optimizer.param_groups[0]["lr"] = lrnow optimizer.param_groups[0]["lr"] = lr_now
else: else:
lrnow = args.lr lr_now = args.lr
print("new episode",total_steps,"learning rate = ",lrnow) print("new episode",total_steps,"learning rate = ",lr_now)
# MAIN LOOP: run agent in environment # MAIN LOOP: run agent in environment
step = 0 step = 0
training = False training = False
trainQueue = [] train_queue = []
last_reward = [0.for i in range(env.unity_agent_num)] last_reward = [0.for i in range(env.unity_agent_num)]
while True: while True:
if step % args.decision_period == 0: if step % args.decision_period == 0:
step += 1 step += 1
# Choose action by agent # Choose action by agent
with torch.no_grad(): with torch.no_grad():
# predict actions # predict actions
action, dis_logprob, _, con_logprob, _, value = agent.get_actions_value( action, dis_logprob, _, con_logprob, _, value = agent.get_actions_value(
@ -514,60 +257,27 @@ if __name__ == "__main__":
next_state, reward, next_done = env.step(action_cpu) next_state, reward, next_done = env.step(action_cpu)
# save memories # save memories
for i in range(env.unity_agent_num): ppo_memories.save_memories(
# save memories to buffers now_step = step,
ob_bf[i].append(state[i]) agent = agent,
act_bf[i].append(action_cpu[i]) state = state,
dis_logprobs_bf[i].append(dis_logprob_cpu[i]) action_cpu = action_cpu,
con_logprobs_bf[i].append(con_logprob_cpu[i]) dis_logprob_cpu = dis_logprob_cpu,
rewards_bf[i].append(reward[i]+last_reward[i]) con_logprob_cpu = con_logprob_cpu,
dones_bf[i].append(done[i]) reward = reward,
values_bf[i].append(value_cpu[i]) done = done,
remainTime = state[i,TARGET_STATE_SIZE] value_cpu = value_cpu,
if next_done[i] == True: last_reward = last_reward,
# finished a round, send finished memories to training datasets next_done = next_done,
# compute advantage and discounted reward next_state=next_state,
#print(i,"over") )
roundTargetType = int(state[i,0])
thisRewardsTensor = broadCastEndReward(rewards_bf[i],remainTime)
adv, rt = GAE(
agent,
args,
thisRewardsTensor,
torch.Tensor(dones_bf[i]).to(device),
torch.tensor(values_bf[i]).to(device),
torch.tensor(next_state[i]).to(device).unsqueeze(0),
torch.Tensor([next_done[i]]).to(device),
)
# send memories to training datasets
obs[roundTargetType] = torch.cat((obs[roundTargetType], torch.tensor(ob_bf[i]).to(device)), 0)
actions[roundTargetType] = torch.cat((actions[roundTargetType], torch.tensor(act_bf[i]).to(device)), 0)
dis_logprobs[roundTargetType] = torch.cat(
(dis_logprobs[roundTargetType], torch.tensor(dis_logprobs_bf[i]).to(device)), 0
)
con_logprobs[roundTargetType] = torch.cat(
(con_logprobs[roundTargetType], torch.tensor(con_logprobs_bf[i]).to(device)), 0
)
rewards[roundTargetType] = torch.cat((rewards[roundTargetType], thisRewardsTensor), 0)
values[roundTargetType] = torch.cat((values[roundTargetType], torch.tensor(values_bf[i]).to(device)), 0)
advantages[roundTargetType] = torch.cat((advantages[roundTargetType], adv), 0)
returns[roundTargetType] = torch.cat((returns[roundTargetType], rt), 0)
# clear buffers
ob_bf[i] = []
act_bf[i] = []
dis_logprobs_bf[i] = []
con_logprobs_bf[i] = []
rewards_bf[i] = []
dones_bf[i] = []
values_bf[i] = []
print(f"train dataset {Targets(roundTargetType).name} added:{obs[roundTargetType].size()[0]}/{args.datasetSize}")
# check if any training dataset is full and ready to train
for i in range(TARGETNUM): for i in range(TARGETNUM):
if obs[i].size()[0] >= args.datasetSize: if ppo_memories.obs[i].size()[0] >= args.datasetSize:
# start train NN # start train NN
trainQueue.append(i) train_queue.append(i)
if(len(trainQueue)>0): if(len(train_queue)>0):
break break
state, done = next_state, next_done state, done = next_state, next_done
else: else:
@ -575,74 +285,40 @@ if __name__ == "__main__":
# skip this step use last predict action # skip this step use last predict action
next_state, reward, next_done = env.step(action_cpu) next_state, reward, next_done = env.step(action_cpu)
# save memories # save memories
for i in range(env.unity_agent_num): ppo_memories.save_memories(
if next_done[i] == True: now_step = step,
#print(i,"over???") agent = agent,
# save memories to buffers state = state,
ob_bf[i].append(state[i]) action_cpu = action_cpu,
act_bf[i].append(action_cpu[i]) dis_logprob_cpu = dis_logprob_cpu,
dis_logprobs_bf[i].append(dis_logprob_cpu[i]) con_logprob_cpu = con_logprob_cpu,
con_logprobs_bf[i].append(con_logprob_cpu[i]) reward = reward,
rewards_bf[i].append(reward[i]) done = done,
dones_bf[i].append(done[i]) value_cpu = value_cpu,
values_bf[i].append(value_cpu[i]) last_reward = last_reward,
remainTime = state[i,TARGET_STATE_SIZE] next_done = next_done,
# finished a round, send finished memories to training datasets next_state=next_state,
# compute advantage and discounted reward )
roundTargetType = int(state[i,0])
thisRewardsTensor = broadCastEndReward(rewards_bf[i],remainTime)
adv, rt = GAE(
agent,
args,
thisRewardsTensor,
torch.Tensor(dones_bf[i]).to(device),
torch.tensor(values_bf[i]).to(device),
torch.Tensor(next_state[i]).to(device).unsqueeze(dim = 0),
torch.Tensor([next_done[i]]).to(device),
)
# send memories to training datasets
obs[roundTargetType] = torch.cat((obs[roundTargetType], torch.tensor(ob_bf[i]).to(device)), 0)
actions[roundTargetType] = torch.cat((actions[roundTargetType], torch.tensor(act_bf[i]).to(device)), 0)
dis_logprobs[roundTargetType] = torch.cat(
(dis_logprobs[roundTargetType], torch.tensor(dis_logprobs_bf[i]).to(device)), 0
)
con_logprobs[roundTargetType] = torch.cat(
(con_logprobs[roundTargetType], torch.tensor(con_logprobs_bf[i]).to(device)), 0
)
rewards[roundTargetType] = torch.cat((rewards[roundTargetType], thisRewardsTensor), 0)
values[roundTargetType] = torch.cat((values[roundTargetType], torch.tensor(values_bf[i]).to(device)), 0)
advantages[roundTargetType] = torch.cat((advantages[roundTargetType], adv), 0)
returns[roundTargetType] = torch.cat((returns[roundTargetType], rt), 0)
# clear buffers
ob_bf[i] = []
act_bf[i] = []
dis_logprobs_bf[i] = []
con_logprobs_bf[i] = []
rewards_bf[i] = []
dones_bf[i] = []
values_bf[i] = []
print(f"train dataset {Targets(roundTargetType).name} added:{obs[roundTargetType].size()[0]}/{args.datasetSize}")
state = next_state state = next_state
last_reward = reward last_reward = reward
i += 1
if args.train: if args.train:
meanRewardList = [] # for WANDB # train mode on
mean_reward_list = [] # for WANDB
# loop over the training queue # loop over the training queue
for thisT in trainQueue: for thisT in train_queue:
# start time # start time
startTime = time.time() start_time = time.time()
target_steps[thisT]+=1 target_steps[thisT]+=1
# flatten the batch # flatten the batch
b_obs = obs[thisT].reshape((-1,) + env.unity_observation_shape) b_obs = ppo_memories.obs[thisT].reshape((-1,) + env.unity_observation_shape)
b_dis_logprobs = dis_logprobs[thisT].reshape(-1) b_dis_logprobs = ppo_memories.dis_logprobs[thisT].reshape(-1)
b_con_logprobs = con_logprobs[thisT].reshape(-1) b_con_logprobs = ppo_memories.con_logprobs[thisT].reshape(-1)
b_actions = actions[thisT].reshape((-1,) + (env.unity_action_size,)) b_actions = ppo_memories.actions[thisT].reshape((-1,) + (env.unity_action_size,))
b_advantages = advantages[thisT].reshape(-1) b_advantages = ppo_memories.advantages[thisT].reshape(-1)
b_returns = returns[thisT].reshape(-1) b_returns = ppo_memories.returns[thisT].reshape(-1)
b_values = values[thisT].reshape(-1) b_values = ppo_memories.values[thisT].reshape(-1)
b_size = b_obs.size()[0] b_size = b_obs.size()[0]
# Optimizing the policy and value network # Optimizing the policy and value network
b_inds = np.arange(b_size) b_inds = np.arange(b_size)
@ -751,67 +427,61 @@ if __name__ == "__main__":
""" """
# record mean reward before clear history # record mean reward before clear history
print("done") print("done")
targetRewardMean = np.mean(rewards[thisT].to("cpu").detach().numpy().copy()) targetRewardMean = np.mean(ppo_memories.rewards[thisT].to("cpu").detach().numpy().copy())
meanRewardList.append(targetRewardMean) mean_reward_list.append(targetRewardMean)
targetName = Targets(thisT).name targetName = Targets(thisT).name
# clear this target's training set buffer # clear this target's training set buffer
obs[thisT] = torch.tensor([]).to(device) ppo_memories.clear_training_datasets(thisT)
actions[thisT] = torch.tensor([]).to(device)
dis_logprobs[thisT] = torch.tensor([]).to(device)
con_logprobs[thisT] = torch.tensor([]).to(device)
rewards[thisT] = torch.tensor([]).to(device)
values[thisT] = torch.tensor([]).to(device)
advantages[thisT] = torch.tensor([]).to(device)
returns[thisT] = torch.tensor([]).to(device)
# record rewards for plotting purposes # record rewards for plotting purposes
writer.add_scalar(f"Target{targetName}/value_loss", v_loss.item(), target_steps[thisT]) wdb_recorder.add_target_scalar(
writer.add_scalar(f"Target{targetName}/dis_policy_loss", dis_pg_loss.item(), target_steps[thisT]) targetName,
writer.add_scalar(f"Target{targetName}/con_policy_loss", con_pg_loss.item(), target_steps[thisT]) thisT,
writer.add_scalar(f"Target{targetName}/total_loss", loss.item(), target_steps[thisT]) v_loss,
writer.add_scalar(f"Target{targetName}/entropy_loss", entropy_loss.item(), target_steps[thisT]) dis_pg_loss,
writer.add_scalar(f"Target{targetName}/Reward", targetRewardMean, target_steps[thisT]) con_pg_loss,
writer.add_scalar(f"Target{targetName}/WinRatio", WinRounds[targetName]/TotalRounds[targetName], target_steps[thisT]) loss,
entropy_loss,
targetRewardMean,
target_steps,
)
print(f"episode over Target{targetName} mean reward:", targetRewardMean) print(f"episode over Target{targetName} mean reward:", targetRewardMean)
TotalRewardMean = np.mean(meanRewardList) TotalRewardMean = np.mean(mean_reward_list)
writer.add_scalar("GlobalCharts/TotalRewardMean", TotalRewardMean, total_steps) wdb_recorder.add_global_scalar(
writer.add_scalar("GlobalCharts/learning_rate", optimizer.param_groups[0]["lr"], total_steps) TotalRewardMean,
optimizer.param_groups[0]["lr"],
total_steps,
)
# print cost time as seconds # print cost time as seconds
print("cost time:", time.time() - start_time) print("cost time:", time.time() - start_time)
# New Record! # New Record!
if TotalRewardMean > bestReward and args.save_model: if TotalRewardMean > best_reward and args.save_model:
bestReward = targetRewardMean best_reward = targetRewardMean
saveDir = "../PPO-Model/" + run_name +"_"+ str(TotalRewardMean) + ".pt" saveDir = "../PPO-Model/" + run_name +"_"+ str(TotalRewardMean) + ".pt"
torch.save(agent, saveDir) torch.save(agent, saveDir)
else: else:
meanRewardList = [] # for WANDB # train mode off
mean_reward_list = [] # for WANDB
# while not in training mode, clear the buffer # while not in training mode, clear the buffer
for thisT in trainQueue: for thisT in train_queue:
target_steps[thisT]+=1 target_steps[thisT]+=1
targetName = Targets(thisT).name targetName = Targets(thisT).name
targetRewardMean = np.mean(rewards[thisT].to("cpu").detach().numpy().copy()) targetRewardMean = np.mean(ppo_memories.rewards[thisT].to("cpu").detach().numpy().copy())
meanRewardList.append(targetRewardMean) mean_reward_list.append(targetRewardMean)
print(target_steps[thisT]) print(target_steps[thisT])
obs[thisT] = torch.tensor([]).to(device) # clear this target's training set buffer
actions[thisT] = torch.tensor([]).to(device) ppo_memories.clear_training_datasets(thisT)
dis_logprobs[thisT] = torch.tensor([]).to(device)
con_logprobs[thisT] = torch.tensor([]).to(device)
rewards[thisT] = torch.tensor([]).to(device)
values[thisT] = torch.tensor([]).to(device)
advantages[thisT] = torch.tensor([]).to(device)
returns[thisT] = torch.tensor([]).to(device)
# record rewards for plotting purposes # record rewards for plotting purposes
wdb_recorder.writer.add_scalar(f"Target{targetName}/Reward", targetRewardMean, target_steps[thisT])
writer.add_scalar(f"Target{targetName}/Reward", targetRewardMean, target_steps[thisT]) wdb_recorder.add_win_ratio(targetName,target_steps[thisT])
writer.add_scalar(f"Target{targetName}/WinRatio", WinRounds[targetName]/TotalRounds[targetName], target_steps[thisT])
print(f"episode over Target{targetName} mean reward:", targetRewardMean) print(f"episode over Target{targetName} mean reward:", targetRewardMean)
TotalRewardMean = np.mean(meanRewardList) TotalRewardMean = np.mean(mean_reward_list)
writer.add_scalar("GlobalCharts/TotalRewardMean", TotalRewardMean, total_steps) wdb_recorder.writer.add_scalar("GlobalCharts/TotalRewardMean", TotalRewardMean, total_steps)
saveDir = "../PPO-Model/"+ run_name + "_last.pt" saveDir = "../PPO-Model/"+ run_name + "_last.pt"
torch.save(agent, saveDir) torch.save(agent, saveDir)
env.close() env.close()
writer.close() wdb_recorder.writer.close()
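After the refactor, one environment step of the main loop amounts to the wiring below; this is an illustrative condensation of the script above (variable names follow the diff, hyperparameters are placeholders), not the literal code:
# Condensed per-step rollout flow (illustrative sketch, not the literal script).
import torch

def rollout_step(env, agent, ppo_memories, state, done, last_reward, step,
                 dataset_size, target_num, device):
    with torch.no_grad():
        action, dis_logprob, _, con_logprob, _, value = agent.get_actions_value(
            torch.Tensor(state).to(device))
    action_cpu = action.cpu().numpy()
    next_state, reward, next_done = env.step(action_cpu)
    ppo_memories.save_memories(
        now_step=step, agent=agent, state=state, action_cpu=action_cpu,
        dis_logprob_cpu=dis_logprob.cpu().numpy(),
        con_logprob_cpu=con_logprob.cpu().numpy(),
        reward=reward, done=done, value_cpu=value.cpu().numpy(),
        last_reward=last_reward, next_done=next_done, next_state=next_state)
    # any target whose training dataset is full is queued for a PPO update
    train_queue = [t for t in range(target_num)
                   if ppo_memories.obs[t].size()[0] >= dataset_size]
    return next_state, next_done, reward, train_queue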

aimemory.py Normal file

@ -0,0 +1,146 @@
import torch
import numpy as np
import argparse
from aimbotEnv import Aimbot
from ppoagent import PPOAgent
from enum import Enum
# public data
class Targets(Enum):
Free = 0
Go = 1
Attack = 2
Defence = 3
Num = 4
class PPOMem:
def __init__(
self,
env: Aimbot,
args: argparse.Namespace,
device: torch.device,
target_num: int,
target_state_size: int,
base_lose_reward: int,
base_win_reward: int,
) -> None:
self.data_set_size = args.datasetSize
self.result_broadcast_ratio = args.result_broadcast_ratio
self.decision_period = args.decision_period
self.unity_agent_num = env.unity_agent_num
self.base_lose_reward = base_lose_reward
self.base_win_reward = base_win_reward
self.target_state_size = target_state_size
self.device = device
# Trajectory Buffer
self.ob_bf = [[] for i in range(env.unity_agent_num)]
self.act_bf = [[] for i in range(env.unity_agent_num)]
self.dis_logprobs_bf = [[] for i in range(env.unity_agent_num)]
self.con_logprobs_bf = [[] for i in range(env.unity_agent_num)]
self.rewards_bf = [[] for i in range(env.unity_agent_num)]
self.dones_bf = [[] for i in range(env.unity_agent_num)]
self.values_bf = [[] for i in range(env.unity_agent_num)]
# initialize empty training datasets
self.obs = [torch.tensor([]).to(device) for i in range(target_num)] # (TARGETNUM,n,env.unity_observation_size)
self.actions = [torch.tensor([]).to(device) for i in range(target_num)] # (TARGETNUM,n,env.unity_action_size)
self.dis_logprobs = [torch.tensor([]).to(device) for i in range(target_num)] # (TARGETNUM,n,1)
self.con_logprobs = [torch.tensor([]).to(device) for i in range(target_num)] # (TARGETNUM,n,1)
self.rewards = [torch.tensor([]).to(device) for i in range(target_num)] # (TARGETNUM,n,1)
self.values = [torch.tensor([]).to(device) for i in range(target_num)] # (TARGETNUM,n,1)
self.advantages = [torch.tensor([]).to(device) for i in range(target_num)] # (TARGETNUM,n,1)
self.returns = [torch.tensor([]).to(device) for i in range(target_num)] # (TARGETNUM,n,1)
def broad_cast_end_reward(self, rewardBF: list, remainTime: float) -> torch.Tensor:
thisRewardBF = rewardBF.copy()
if rewardBF[-1] <= -500:
# print("Lose DO NOT BROAD CAST",rewardBF[-1])
thisRewardBF[-1] = rewardBF[-1] - self.base_lose_reward
elif rewardBF[-1] >= 500:
# print("Win! Broadcast reward!",rewardBF[-1])
print(sum(thisRewardBF) / len(thisRewardBF))
thisRewardBF[-1] = rewardBF[-1] - self.base_win_reward
thisRewardBF = (np.asarray(thisRewardBF) + (remainTime * self.result_broadcast_ratio)).tolist()
else:
print("!!!!!DIDNT GET RESULT REWARD!!!!!!", rewardBF[-1])
return torch.Tensor(thisRewardBF).to(self.device)
def save_memories(
self,
now_step: int,
agent: PPOAgent,
state: np.ndarray,
action_cpu: np.ndarray,
dis_logprob_cpu: np.ndarray,
con_logprob_cpu: np.ndarray,
reward: list,
done: list,
value_cpu: np.ndarray,
last_reward: list,
next_done: list,
next_state: np.ndarray,
):
for i in range(self.unity_agent_num):
if now_step % self.decision_period == 0 or next_done[i] == True:
# save memories to the buffer only on a decision period or when a round has finished
self.ob_bf[i].append(state[i])
self.act_bf[i].append(action_cpu[i])
self.dis_logprobs_bf[i].append(dis_logprob_cpu[i])
self.con_logprobs_bf[i].append(con_logprob_cpu[i])
self.dones_bf[i].append(done[i])
self.values_bf[i].append(value_cpu[i])
if now_step % self.decision_period == 0:
# on a decision period, add the last skipped step's reward as well
self.rewards_bf[i].append(reward[i] + last_reward[i])
else:
# not on a decision period, only add this step's reward
self.rewards_bf[i].append(reward[i])
if next_done[i] == True:
# finished a round, send finished memories to training datasets
# compute advantage and discounted reward
remainTime = state[i, self.target_state_size]
roundTargetType = int(state[i, 0])
thisRewardsTensor = self.broad_cast_end_reward(self.rewards_bf[i], remainTime)
adv, rt = agent.gae(
rewards=thisRewardsTensor,
dones=torch.Tensor(self.dones_bf[i]).to(self.device),
values=torch.tensor(self.values_bf[i]).to(self.device),
next_obs=torch.tensor(next_state[i]).to(self.device).unsqueeze(0),
next_done=torch.Tensor([next_done[i]]).to(self.device),
)
# send memories to training datasets
self.obs[roundTargetType] = torch.cat((self.obs[roundTargetType], torch.tensor(self.ob_bf[i]).to(self.device)), 0)
self.actions[roundTargetType] = torch.cat((self.actions[roundTargetType], torch.tensor(self.act_bf[i]).to(self.device)), 0)
self.dis_logprobs[roundTargetType] = torch.cat((self.dis_logprobs[roundTargetType], torch.tensor(self.dis_logprobs_bf[i]).to(self.device)), 0)
self.con_logprobs[roundTargetType] = torch.cat((self.con_logprobs[roundTargetType], torch.tensor(self.con_logprobs_bf[i]).to(self.device)), 0)
self.rewards[roundTargetType] = torch.cat((self.rewards[roundTargetType], thisRewardsTensor), 0)
self.values[roundTargetType] = torch.cat((self.values[roundTargetType], torch.tensor(self.values_bf[i]).to(self.device)), 0)
self.advantages[roundTargetType] = torch.cat((self.advantages[roundTargetType], adv), 0)
self.returns[roundTargetType] = torch.cat((self.returns[roundTargetType], rt), 0)
# clear buffers
self.clear_buffers(i)
print(f"train dataset {Targets(roundTargetType).name} added:{self.obs[roundTargetType].size()[0]}/{self.data_set_size}")
def clear_buffers(self,ind:int):
# clear buffers
self.ob_bf[ind] = []
self.act_bf[ind] = []
self.dis_logprobs_bf[ind] = []
self.con_logprobs_bf[ind] = []
self.rewards_bf[ind] = []
self.dones_bf[ind] = []
self.values_bf[ind] = []
def clear_training_datasets(self,ind:int):
# clear training datasets
self.obs[ind] = torch.tensor([]).to(self.device)
self.actions[ind] = torch.tensor([]).to(self.device)
self.dis_logprobs[ind] = torch.tensor([]).to(self.device)
self.con_logprobs[ind] = torch.tensor([]).to(self.device)
self.rewards[ind] = torch.tensor([]).to(self.device)
self.values[ind] = torch.tensor([]).to(self.device)
self.advantages[ind] = torch.tensor([]).to(self.device)
self.returns[ind] = torch.tensor([]).to(self.device)
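As a worked toy example of broad_cast_end_reward above: assuming base_win_reward = 999 (the constant is defined in the training script but not shown in this diff) and result_broadcast_ratio = 1/30, a winning round is first stripped of the base win bonus and then every step's reward is raised by the remaining time times the ratio:
# Toy check of the win branch of broad_cast_end_reward (values are illustrative).
rewards_bf = [0.1, -0.2, 1000.0]   # per-step rewards of one finished, winning round
base_win_reward = 999              # assumed value, defined elsewhere in the script
remain_time = 9.0                  # read from state[i, target_state_size]
broadcast = remain_time * (1 / 30) # result_broadcast_ratio = 1 / ENV_TIMELIMIT

rewards_bf[-1] = rewards_bf[-1] - base_win_reward   # 1000 -> 1
broadcasted = [r + broadcast for r in rewards_bf]   # add 0.3 to every step
print(broadcasted)                                  # approximately [0.4, 0.1, 1.3]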

airecorder.py Normal file

@ -0,0 +1,82 @@
import wandb
import time
from torch.utils.tensorboard import SummaryWriter
total_rounds = {"Free": 0, "Go": 0, "Attack": 0}
win_rounds = {"Free": 0, "Go": 0, "Attack": 0}
# class for wandb recording
class WandbRecorder:
def __init__(self, game_name: str, game_type: str, run_name: str, _args) -> None:
# init wandb
self.game_name = game_name
self.game_type = game_type
self._args = _args
self.run_name = run_name
if self._args.wandb_track:
wandb.init(
project=self.game_name,
entity=self._args.wandb_entity,
sync_tensorboard=True,
config=vars(self._args),
name=self.run_name,
monitor_gym=True,
save_code=True,
)
self.writer = SummaryWriter(f"runs/{self.run_name}")
self.writer.add_text(
"hyperparameters",
"|param|value|\n|-|-|\n%s"
% ("\n".join([f"|{key}|{value}|" for key, value in vars(self._args).items()])),
)
def add_target_scalar(
self,
target_name,
thisT,
v_loss,
dis_pg_loss,
con_pg_loss,
loss,
entropy_loss,
target_reward_mean,
target_steps,
):
# fmt:off
self.writer.add_scalar(
f"Target{target_name}/value_loss", v_loss.item(), target_steps[thisT]
)
self.writer.add_scalar(
f"Target{target_name}/dis_policy_loss", dis_pg_loss.item(), target_steps[thisT]
)
self.writer.add_scalar(
f"Target{target_name}/con_policy_loss", con_pg_loss.item(), target_steps[thisT]
)
self.writer.add_scalar(
f"Target{target_name}/total_loss", loss.item(), target_steps[thisT]
)
self.writer.add_scalar(
f"Target{target_name}/entropy_loss", entropy_loss.item(), target_steps[thisT]
)
self.writer.add_scalar(
f"Target{target_name}/Reward", target_reward_mean, target_steps[thisT]
)
self.writer.add_scalar(
f"Target{target_name}/WinRatio", win_rounds[target_name] / total_rounds[target_name], target_steps[thisT],
)
# fmt:on
def add_global_scalar(
self,
total_reward_mean,
learning_rate,
total_steps,
):
self.writer.add_scalar("GlobalCharts/TotalRewardMean", total_reward_mean, total_steps)
self.writer.add_scalar("GlobalCharts/learning_rate", learning_rate, total_steps)
def add_win_ratio(self, target_name, target_steps):
self.writer.add_scalar(
f"Target{target_name}/WinRatio", win_rounds[target_name] / total_rounds[target_name], target_steps,
)
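A minimal sketch (not part of this commit) of how WandbRecorder might be constructed and fed; the argparse.Namespace fields below mirror only the attributes the class reads, and the counters are the module-level dictionaries that AimbotSideChannel updates:
# Hypothetical standalone use of WandbRecorder with wandb tracking disabled.
import time
import argparse
from airecorder import WandbRecorder, total_rounds, win_rounds

args = argparse.Namespace(wandb_track=False, wandb_entity=None, seed=9331)
run_name = f"Mix_Verification_{args.seed}_{int(time.time())}"
recorder = WandbRecorder("Aimbot_Target_Hybrid_PMNN_V3", "Mix_Verification", run_name, args)

total_rounds["Go"] += 1          # normally incremented by AimbotSideChannel messages
win_rounds["Go"] += 1
recorder.add_win_ratio("Go", 1)  # logs Target Go win ratio at step 1
recorder.writer.close()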

ppoagent.py Normal file

@ -0,0 +1,204 @@
import numpy as np
import torch
import argparse
from torch import nn
from aimbotEnv import Aimbot
from torch.distributions.normal import Normal
from torch.distributions.categorical import Categorical
def layer_init(layer, std=np.sqrt(2), bias_const=0.0):
nn.init.orthogonal_(layer.weight, std)
nn.init.constant_(layer.bias, bias_const)
return layer
class PPOAgent(nn.Module):
def __init__(
self,
env: Aimbot,
this_args:argparse.Namespace,
train_agent: bool,
target_num: int,
target_state_size: int,
time_state_size: int,
gun_state_size: int,
my_state_size: int,
total_t_size: int,
device: torch.device,
):
super(PPOAgent, self).__init__()
self.device = device
self.args = this_args
self.trainAgent = train_agent
self.targetNum = target_num
self.stateSize = env.unity_observation_shape[0]
self.agentNum = env.unity_agent_num
self.targetSize = target_state_size
self.timeSize = time_state_size
self.gunSize = gun_state_size
self.myStateSize = my_state_size
self.raySize = env.unity_observation_shape[0] - total_t_size
self.nonRaySize = total_t_size
self.head_input_size = (
env.unity_observation_shape[0] - self.targetSize - self.timeSize - self.gunSize
) # except target state input
self.unityDiscreteType = env.unity_discrete_type
self.discrete_size = env.unity_discrete_size
self.discrete_shape = list(env.unity_discrete_branches)
self.continuous_size = env.unity_continuous_size
self.viewNetwork = nn.Sequential(layer_init(nn.Linear(self.raySize, 200)), nn.LeakyReLU())
self.targetNetworks = nn.ModuleList(
[
nn.Sequential(layer_init(nn.Linear(self.nonRaySize, 100)), nn.LeakyReLU())
for i in range(target_num)
]
)
self.middleNetworks = nn.ModuleList(
[
nn.Sequential(layer_init(nn.Linear(300, 200)), nn.LeakyReLU())
for i in range(target_num)
]
)
self.actor_dis = nn.ModuleList(
[layer_init(nn.Linear(200, self.discrete_size), std=0.5) for i in range(target_num)]
)
self.actor_mean = nn.ModuleList(
[layer_init(nn.Linear(200, self.continuous_size), std=0.5) for i in range(target_num)]
)
# self.actor_logstd = nn.ModuleList([layer_init(nn.Linear(200, self.continuous_size), std=1) for i in range(targetNum)])
# self.actor_logstd = nn.Parameter(torch.zeros(1, self.continuous_size))
self.actor_logstd = nn.ParameterList(
[nn.Parameter(torch.zeros(1, self.continuous_size)) for i in range(target_num)]
) # nn.Parameter(torch.zeros(1, self.continuous_size))
self.critic = nn.ModuleList(
[layer_init(nn.Linear(200, 1), std=1) for i in range(target_num)]
)
def get_value(self, state: torch.Tensor):
target = state[:, 0].to(torch.int32) # int
thisStateNum = target.size()[0]
viewInput = state[:, -self.raySize :] # all ray input
targetInput = state[:, : self.nonRaySize]
viewLayer = self.viewNetwork(viewInput)
targetLayer = torch.stack(
[self.targetNetworks[target[i]](targetInput[i]) for i in range(thisStateNum)]
)
middleInput = torch.cat([viewLayer, targetLayer], dim=1)
middleLayer = torch.stack(
[self.middleNetworks[target[i]](middleInput[i]) for i in range(thisStateNum)]
)
criticV = torch.stack(
[self.critic[target[i]](middleLayer[i]) for i in range(thisStateNum)]
) # self.critic
return criticV
def get_actions_value(self, state: torch.Tensor, actions=None):
target = state[:, 0].to(torch.int32) # int
thisStateNum = target.size()[0]
viewInput = state[:, -self.raySize :] # all ray input
targetInput = state[:, : self.nonRaySize]
viewLayer = self.viewNetwork(viewInput)
targetLayer = torch.stack(
[self.targetNetworks[target[i]](targetInput[i]) for i in range(thisStateNum)]
)
middleInput = torch.cat([viewLayer, targetLayer], dim=1)
middleLayer = torch.stack(
[self.middleNetworks[target[i]](middleInput[i]) for i in range(thisStateNum)]
)
# discrete
# iterate over the targets (i.e. the number of agents in the batch) and route each state through the output head that matches its target
dis_logits = torch.stack(
[self.actor_dis[target[i]](middleLayer[i]) for i in range(thisStateNum)]
)
split_logits = torch.split(dis_logits, self.discrete_shape, dim=1)
multi_categoricals = [Categorical(logits=thisLogits) for thisLogits in split_logits]
# continuous
actions_mean = torch.stack(
[self.actor_mean[target[i]](middleLayer[i]) for i in range(thisStateNum)]
) # self.actor_mean(hidden)
# action_logstd = torch.stack([self.actor_logstd[target[i]](middleLayer[i]) for i in range(thisStateNum)]) # self.actor_logstd(hidden)
# action_logstd = self.actor_logstd.expand_as(actions_mean) # self.actor_logstd.expand_as(actions_mean)
action_logstd = torch.stack(
[torch.squeeze(self.actor_logstd[target[i]], 0) for i in range(thisStateNum)]
)
# print(action_logstd)
action_std = torch.exp(action_logstd) # torch.exp(action_logstd)
con_probs = Normal(actions_mean, action_std)
# critic
criticV = torch.stack(
[self.critic[target[i]](middleLayer[i]) for i in range(thisStateNum)]
) # self.critic
if actions is None:
if self.trainAgent:
# select actions based on the probability distribution model
disAct = torch.stack([ctgr.sample() for ctgr in multi_categoricals])
conAct = con_probs.sample()
actions = torch.cat([disAct.T, conAct], dim=1)
else:
# select actions based on the best probability distribution
# disAct = torch.stack([torch.argmax(logit, dim=1) for logit in split_logits])
conAct = actions_mean
disAct = torch.stack([ctgr.sample() for ctgr in multi_categoricals])
conAct = con_probs.sample()
actions = torch.cat([disAct.T, conAct], dim=1)
else:
disAct = actions[:, 0 : self.unityDiscreteType].T
conAct = actions[:, self.unityDiscreteType :]
dis_log_prob = torch.stack(
[ctgr.log_prob(act) for act, ctgr in zip(disAct, multi_categoricals)]
)
dis_entropy = torch.stack([ctgr.entropy() for ctgr in multi_categoricals])
return (
actions,
dis_log_prob.sum(0),
dis_entropy.sum(0),
con_probs.log_prob(conAct).sum(1),
con_probs.entropy().sum(1),
criticV,
)
def gae(
self,
rewards: torch.Tensor,
dones: torch.Tensor,
values: torch.tensor,
next_obs: torch.tensor,
next_done: torch.Tensor,
) -> tuple:
# GAE
with torch.no_grad():
next_value = self.get_value(next_obs).reshape(1, -1)
data_size = rewards.size()[0]
if self.args.gae:
advantages = torch.zeros_like(rewards).to(self.device)
last_gae_lam = 0
for t in reversed(range(data_size)):
if t == data_size - 1:
nextnonterminal = 1.0 - next_done
next_values = next_value
else:
nextnonterminal = 1.0 - dones[t + 1]
next_values = values[t + 1]
delta = rewards[t] + self.args.gamma * next_values * nextnonterminal - values[t]
advantages[t] = last_gae_lam = (
delta + self.args.gamma * self.args.gaeLambda * nextnonterminal * last_gae_lam
)
returns = advantages + values
else:
returns = torch.zeros_like(rewards).to(self.device)
for t in reversed(range(data_size)):
if t == data_size - 1:
nextnonterminal = 1.0 - next_done
next_return = next_value
else:
nextnonterminal = 1.0 - dones[t + 1]
next_return = returns[t + 1]
returns[t] = rewards[t] + self.args.gamma * nextnonterminal * next_return
advantages = returns - values
return advantages, returns
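For a quick sanity check of the gae() recursion above, a self-contained reference with plain tensors (illustrative numbers, no environment or agent needed) that mirrors the same math:
# Standalone reference of the GAE recursion used in PPOAgent.gae (illustrative values).
import torch

def gae_reference(rewards, dones, values, next_value, next_done,
                  gamma=0.99, gae_lambda=0.95):
    advantages = torch.zeros_like(rewards)
    last_gae_lam = 0.0
    for t in reversed(range(rewards.size(0))):
        if t == rewards.size(0) - 1:
            nextnonterminal = 1.0 - next_done
            next_values = next_value
        else:
            nextnonterminal = 1.0 - dones[t + 1]
            next_values = values[t + 1]
        delta = rewards[t] + gamma * next_values * nextnonterminal - values[t]
        advantages[t] = last_gae_lam = delta + gamma * gae_lambda * nextnonterminal * last_gae_lam
    return advantages, advantages + values

adv, ret = gae_reference(
    rewards=torch.tensor([1.0, 0.0, 2.0]),
    dones=torch.tensor([0.0, 0.0, 0.0]),
    values=torch.tensor([0.5, 0.4, 0.6]),
    next_value=torch.tensor(0.3),
    next_done=torch.tensor(1.0),  # episode ends after the last step
)
print(adv, ret)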