Merge branch 'OffP-PartialMNN-review' into OffP-PartialMNN
commit ef0ee495f2

.vscode/settings.json (vendored, new file, +5 lines)
@@ -0,0 +1,5 @@
+{
+    "python.linting.enabled": false,
+    "python.analysis.typeCheckingMode": "off",
+    "commentTranslate.source": "intellsmi.deepl-translate-deepl"
+}
@@ -1,9 +1,16 @@
 import gym
 import numpy as np
+import uuid
+import airecorder
 from numpy import ndarray
 from mlagents_envs.base_env import ActionTuple
 from mlagents_envs.environment import UnityEnvironment
+from typing import Tuple, List
+from mlagents_envs.side_channel.side_channel import (
+    SideChannel,
+    IncomingMessage,
+    OutgoingMessage,
+)
 
 
 class Aimbot(gym.Env):
@@ -61,7 +68,7 @@ class Aimbot(gym.Env):
         # agents number
         self.unity_agent_num = len(self.unity_agent_IDS)
 
-    def reset(self):
+    def reset(self)->Tuple[np.ndarray, List, List]:
         """reset enviroment and get observations
 
         Returns:
@@ -69,7 +76,7 @@ class Aimbot(gym.Env):
         """
         # reset env
         self.env.reset()
-        nextState, reward, done = self.getSteps()
+        nextState, reward, done = self.get_steps()
         return nextState, reward, done
 
     # TODO:
@@ -80,7 +87,7 @@ class Aimbot(gym.Env):
     def step(
         self,
        actions: ndarray,
-    ):
+    )->Tuple[np.ndarray, List, List]:
         """change ations list to ActionTuple then send it to enviroment
 
         Args:
@@ -114,10 +121,10 @@ class Aimbot(gym.Env):
         self.env.set_actions(behavior_name=self.unity_beha_name, action=thisActionTuple)
         self.env.step()
         # get nextState & reward & done after this action
-        nextStates, rewards, dones = self.getSteps()
+        nextStates, rewards, dones = self.get_steps()
         return nextStates, rewards, dones
 
-    def getSteps(self):
+    def get_steps(self)->Tuple[np.ndarray, List, List]:
         """get enviroment now observations.
         Include State, Reward, Done
 
@@ -127,28 +134,92 @@ class Aimbot(gym.Env):
             ndarray: nextState, reward, done
         """
         # get nextState & reward & done
-        decisionSteps, terminalSteps = self.env.get_steps(self.unity_beha_name)
-        nextStates = []
+        decision_steps, terminal_steps = self.env.get_steps(self.unity_beha_name)
+        next_states = []
         dones = []
         rewards = []
-        for thisAgentID in self.unity_agent_IDS:
+        for this_agent_ID in self.unity_agent_IDS:
             # while Episode over agentID will both in decisionSteps and terminalSteps.
             # avoid redundant state and reward,
             # use agentExist toggle to check if agent is already exist.
-            agentExist = False
+            agent_exist = False
             # game done
-            if thisAgentID in terminalSteps:
-                nextStates.append(terminalSteps[thisAgentID].obs[0])
+            if this_agent_ID in terminal_steps:
+                next_states.append(terminal_steps[this_agent_ID].obs[0])
                 dones.append(True)
-                rewards.append(terminalSteps[thisAgentID].reward)
-                agentExist = True
+                rewards.append(terminal_steps[this_agent_ID].reward)
+                agent_exist = True
             # game not over yet and agent not in terminalSteps
-            if (thisAgentID in decisionSteps) and (not agentExist):
-                nextStates.append(decisionSteps[thisAgentID].obs[0])
+            if (this_agent_ID in decision_steps) and (not agent_exist):
+                next_states.append(decision_steps[this_agent_ID].obs[0])
                 dones.append(False)
-                rewards.append(decisionSteps[thisAgentID].reward)
+                rewards.append(decision_steps[this_agent_ID].reward)
 
-        return np.asarray(nextStates), rewards, dones
+        return np.asarray(next_states), rewards, dones
 
     def close(self):
         self.env.close()
 
+
+class AimbotSideChannel(SideChannel):
+    def __init__(self, channel_id: uuid.UUID) -> None:
+        super().__init__(channel_id)
+
+    def on_message_received(self, msg: IncomingMessage) -> None:
+        """
+        Note: We must implement this method of the SideChannel interface to
+        receive messages from Unity
+        Message will be sent like this:
+            "Warning|Message1|Message2|Message3" or
+            "Error|Message1|Message2|Message3"
+        """
+        this_message = msg.read_string()
+        this_result = this_message.split("|")
+        if(this_result[0] == "result"):
+            airecorder.total_rounds[this_result[1]]+=1
+            if(this_result[2] == "Win"):
+                airecorder.win_rounds[this_result[1]]+=1
+            #print(TotalRounds)
+            #print(WinRounds)
+        elif(this_result[0] == "Error"):
+            print(this_message)
+        # # while Message type is Warning
+        # if(thisResult[0] == "Warning"):
+        #     # while Message1 is result means one game is over
+        #     if (thisResult[1] == "Result"):
+        #         TotalRounds[thisResult[2]]+=1
+        #         # while Message3 is Win means this agent win this game
+        #         if(thisResult[3] == "Win"):
+        #             WinRounds[thisResult[2]]+=1
+        #     # while Message1 is GameState means this game is just start
+        #     # and tell python which game mode is
+        #     elif (thisResult[1] == "GameState"):
+        #         SCrecieved = 1
+        # # while Message type is Error
+        # elif(thisResult[0] == "Error"):
+        #     print(thisMessage)
+
+    # send function
+    def send_string(self, data: str) -> None:
+        # send a string toC#
+        msg = OutgoingMessage()
+        msg.write_string(data)
+        super().queue_message_to_send(msg)
+
+    def send_bool(self, data: bool) -> None:
+        msg = OutgoingMessage()
+        msg.write_bool(data)
+        super().queue_message_to_send(msg)
+
+    def send_int(self, data: int) -> None:
+        msg = OutgoingMessage()
+        msg.write_int32(data)
+        super().queue_message_to_send(msg)
+
+    def send_float(self, data: float) -> None:
+        msg = OutgoingMessage()
+        msg.write_float32(data)
+        super().queue_message_to_send(msg)
+
+    def send_float_list(self, data: List[float]) -> None:
+        msg = OutgoingMessage()
+        msg.write_float32_list(data)
+        super().queue_message_to_send(msg)
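
Note (not part of the commit): a minimal sketch of how the new AimbotSideChannel is wired into an mlagents environment. The UUID value below is made up; the real one is the SIDE_CHANNEL_UUID used in the training script further down.

import uuid
from mlagents_envs.environment import UnityEnvironment

channel = AimbotSideChannel(uuid.UUID("621f0a70-4f87-11ea-a6bf-784f4387d1f7"))  # hypothetical id
env = UnityEnvironment(file_name="<ENV_PATH>", side_channels=[channel])
# Unity reports finished rounds as "result|<TargetName>|Win"; on_message_received()
# splits the string on "|" and updates airecorder.total_rounds / airecorder.win_rounds.
channel.send_string("hello from python")  # queued, delivered alongside the next env step
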
@@ -107,6 +107,97 @@
     ")\n",
     "from typing import List\n"
    ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [
+    {
+     "ename": "AttributeError",
+     "evalue": "'aaa' object has no attribute 'outa'",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
+      "\u001b[1;31mAttributeError\u001b[0m Traceback (most recent call last)",
+      "Cell \u001b[1;32mIn[5], line 14\u001b[0m\n\u001b[0;32m 12\u001b[0m asd \u001b[39m=\u001b[39m aaa(outa, outb)\n\u001b[0;32m 13\u001b[0m asd\u001b[39m.\u001b[39mfunc()\n\u001b[1;32m---> 14\u001b[0m \u001b[39mprint\u001b[39m(asd\u001b[39m.\u001b[39;49mouta) \u001b[39m# outputs 100\u001b[39;00m\n",
+      "\u001b[1;31mAttributeError\u001b[0m: 'aaa' object has no attribute 'outa'"
+     ]
+    }
+   ],
+   "source": [
+    "class aaa():\n",
+    "    def __init__(self, a, b):\n",
+    "        self.a = a\n",
+    "        self.b = b\n",
+    "\n",
+    "    def func(self):\n",
+    "        global outa\n",
+    "        outa = 100\n",
+    "\n",
+    "outa = 1\n",
+    "outb = 2\n",
+    "asd = aaa(outa, outb)\n",
+    "asd.func()\n",
+    "print(asd.outa) # outputs 100"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "usage: ipykernel_launcher.py [-h] [--seed SEED]\n",
+      "ipykernel_launcher.py: error: unrecognized arguments: --ip=127.0.0.1 --stdin=9003 --control=9001 --hb=9000 --Session.signature_scheme=\"hmac-sha256\" --Session.key=b\"46ef9317-59fb-4ab6-ae4e-6b35744fc423\" --shell=9002 --transport=\"tcp\" --iopub=9004 --f=c:\\Users\\UCUNI\\AppData\\Roaming\\jupyter\\runtime\\kernel-v2-311926K1uko38tdWb.json\n"
+     ]
+    },
+    {
+     "ename": "SystemExit",
+     "evalue": "2",
+     "output_type": "error",
+     "traceback": [
+      "An exception has occurred, use %tb to see the full traceback.\n",
+      "\u001b[1;31mSystemExit\u001b[0m\u001b[1;31m:\u001b[0m 2\n"
+     ]
+    }
+   ],
+   "source": [
+    "import argparse\n",
+    "\n",
+    "def parse_args():\n",
+    "    parser = argparse.ArgumentParser()\n",
+    "    parser.add_argument(\"--seed\", type=int, default=11,\n",
+    "        help=\"seed of the experiment\")\n",
+    "    args = parser.parse_args()\n",
+    "    return args\n",
+    "\n",
+    "arggg = parse_args()\n",
+    "print(type(arggg))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "(1.2, 3.2)\n",
+      "1.2\n"
+     ]
+    }
+   ],
+   "source": [
+    "aaa = (1.2,3.2)\n",
+    "print(aaa)\n",
+    "print(aaa[0])"
+   ]
   }
  ],
 "metadata": {
@@ -125,7 +216,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.11.3"
+   "version": "3.9.17"
   },
  "orig_nbformat": 4
 },
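
Note (not part of the commit): the SystemExit: 2 captured above happens because argparse inside Jupyter sees the kernel's own command line (--ip, --stdin, ...) and rejects it. A common workaround is to hand parse_args an explicit argument list:

import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--seed", type=int, default=11, help="seed of the experiment")
args = parser.parse_args(args=[])   # ignore the kernel's argv; parser.parse_known_args() also works
print(args.seed)  # 11
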
@@ -62,7 +62,6 @@
    "outputs": [],
    "source": [
     "from mlagents_envs.environment import UnityEnvironment\n",
-    "from gym_unity.envs import UnityToGymWrapper\n",
     "import numpy as np\n",
     "\n",
     "ENV_PATH = \"../Build-ParallelEnv/Aimbot-ParallelEnv\"\n",
@@ -368,6 +367,7 @@
    ],
    "source": [
     "import torch\n",
+    "from torch import nn\n",
     "\n",
     "def layer_init(layer, std=np.sqrt(2), bias_const=0.0):\n",
     "    torch.nn.init.orthogonal_(layer.weight, std)\n",
@@ -1248,6 +1248,24 @@
     "saveDir = \"C:/Users/UCUNI/OneDrive/Unity/ML-Agents/Aimbot-PPO/Aimbot-PPO-Python/PPO-Model/Chimera-1677965178-1678547500.pt\"\n",
     "torch.save(badGotoAgent,saveDir)"
    ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "True\n"
+     ]
+    }
+   ],
+   "source": [
+    "import torch\n",
+    "print(torch.cuda.is_available())"
+   ]
   }
  ],
 "metadata": {
@@ -1266,7 +1284,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.9.7"
+   "version": "3.9.17"
   },
  "orig_nbformat": 4,
  "vscode": {
@@ -1,5 +1,4 @@
 import argparse
-import wandb
 import time
 import numpy as np
 import random
@@ -9,24 +8,17 @@ import torch.nn as nn
 import torch.optim as optim
 import atexit
 
-from AimbotEnv import Aimbot
-from tqdm import tqdm
+from aimbotEnv import Aimbot
+from aimbotEnv import AimbotSideChannel
+from ppoagent import PPOAgent
+from airecorder import WandbRecorder
+from aimemory import PPOMem
+from aimemory import Targets
 from enum import Enum
-from torch.distributions.normal import Normal
-from torch.distributions.categorical import Categorical
 from distutils.util import strtobool
-from torch.utils.tensorboard import SummaryWriter
-from mlagents_envs.environment import UnityEnvironment
-from mlagents_envs.side_channel.side_channel import (
-    SideChannel,
-    IncomingMessage,
-    OutgoingMessage,
-)
-from typing import List
 
-bestReward = -1
+best_reward = -1
 
-SCrecieved = 0
 
 DEFAULT_SEED = 9331
 ENV_PATH = "../Build/2.9/Goto-NonNormalization/Aimbot-ParallelEnv"
@@ -36,8 +28,8 @@ WORKER_ID = 1
 BASE_PORT = 1000
 
 # tensorboard names
-game_name = "Aimbot_Target_Hybrid_PMNN_V3"
-game_type = "Mix_Verification"
+GAME_NAME = "Aimbot_Target_Hybrid_PMNN_V3"
+GAME_TYPE = "Mix_Verification"
 
 # max round steps per agent is 2500/Decision_period, 25 seconds
 # !!!check every parameters before run!!!
@@ -62,19 +54,12 @@ BROADCASTREWARD = False
 ANNEAL_LEARNING_RATE = True
 CLIP_VLOSS = True
 NORM_ADV = False
-TRAIN = False
+TRAIN = True
 SAVE_MODEL = False
-WANDB_TACK = True
+WANDB_TACK = False
 LOAD_DIR = None
-LOAD_DIR = "../PPO-Model/PList_Go_LeakyReLU_9331_1677965178_bestGoto/PList_Go_LeakyReLU_9331_1677965178_10.709002.pt"
+#LOAD_DIR = "../PPO-Model/PList_Go_LeakyReLU_9331_1677965178_bestGoto/PList_Go_LeakyReLU_9331_1677965178_10.709002.pt"
 
-# public data
-class Targets(Enum):
-    Free = 0
-    Go = 1
-    Attack = 2
-    Defence = 3
-    Num = 4
 TARGET_STATE_SIZE = 6
 INAREA_STATE_SIZE = 1
 TIME_STATE_SIZE = 1
@@ -86,8 +71,6 @@ BASE_LOSEREWARD = -999
 TARGETNUM= 4
 ENV_TIMELIMIT = 30
 RESULT_BROADCAST_RATIO = 1/ENV_TIMELIMIT
-TotalRounds = {"Free":0,"Go":0,"Attack":0}
-WinRounds = {"Free":0,"Go":0,"Attack":0}
 
 # !!!SPECIAL PARAMETERS!!!
 # change it while program is finished
@@ -168,230 +151,6 @@ def parse_args():
     return args
 
 
-def layer_init(layer, std=np.sqrt(2), bias_const=0.0):
-    torch.nn.init.orthogonal_(layer.weight, std)
-    torch.nn.init.constant_(layer.bias, bias_const)
-    return layer
-
-
-class PPOAgent(nn.Module):
-    def __init__(self, env: Aimbot,targetNum:int):
-        super(PPOAgent, self).__init__()
-        self.targetNum = targetNum
-        self.stateSize = env.unity_observation_shape[0]
-        self.agentNum = env.unity_agent_num
-        self.targetSize = TARGET_STATE_SIZE
-        self.timeSize = TIME_STATE_SIZE
-        self.gunSize = GUN_STATE_SIZE
-        self.myStateSize = MY_STATE_SIZE
-        self.raySize = env.unity_observation_shape[0] - TOTAL_T_SIZE
-        self.nonRaySize = TOTAL_T_SIZE
-        self.head_input_size = env.unity_observation_shape[0] - self.targetSize-self.timeSize-self.gunSize# except target state input
-
-        self.discrete_size = env.unity_discrete_size
-        self.discrete_shape = list(env.unity_discrete_branches)
-        self.continuous_size = env.unity_continuous_size
-
-        self.viewNetwork = nn.Sequential(
-            layer_init(nn.Linear(self.raySize, 200)),
-            nn.LeakyReLU()
-        )
-        self.targetNetworks = nn.ModuleList([nn.Sequential(
-            layer_init(nn.Linear(self.nonRaySize, 100)),
-            nn.LeakyReLU()
-        )for i in range(targetNum)])
-        self.middleNetworks = nn.ModuleList([nn.Sequential(
-            layer_init(nn.Linear(300,200)),
-            nn.LeakyReLU()
-        )for i in range(targetNum)])
-        self.actor_dis = nn.ModuleList([layer_init(nn.Linear(200, self.discrete_size), std=0.5) for i in range(targetNum)])
-        self.actor_mean = nn.ModuleList([layer_init(nn.Linear(200, self.continuous_size), std=0.5) for i in range(targetNum)])
-        # self.actor_logstd = nn.ModuleList([layer_init(nn.Linear(200, self.continuous_size), std=1) for i in range(targetNum)])
-        # self.actor_logstd = nn.Parameter(torch.zeros(1, self.continuous_size))
-        self.actor_logstd = nn.ParameterList([nn.Parameter(torch.zeros(1,self.continuous_size))for i in range(targetNum)]) # nn.Parameter(torch.zeros(1, self.continuous_size))
-        self.critic = nn.ModuleList([layer_init(nn.Linear(200, 1), std=1)for i in range(targetNum)])
-
-    def get_value(self, state: torch.Tensor):
-        target = state[:,0].to(torch.int32) # int
-        thisStateNum = target.size()[0]
-        viewInput = state[:,-self.raySize:] # all ray input
-        targetInput = state[:,:self.nonRaySize]
-        viewLayer = self.viewNetwork(viewInput)
-        targetLayer = torch.stack([self.targetNetworks[target[i]](targetInput[i]) for i in range(thisStateNum)])
-        middleInput = torch.cat([viewLayer,targetLayer],dim = 1)
-        middleLayer = torch.stack([self.middleNetworks[target[i]](middleInput[i]) for i in range(thisStateNum)])
-        criticV = torch.stack([self.critic[target[i]](middleLayer[i]) for i in range(thisStateNum)]) # self.critic
-        return criticV
-
-    def get_actions_value(self, state: torch.Tensor, actions=None):
-        target = state[:,0].to(torch.int32) # int
-        thisStateNum = target.size()[0]
-        viewInput = state[:,-self.raySize:] # all ray input
-        targetInput = state[:,:self.nonRaySize]
-        viewLayer = self.viewNetwork(viewInput)
-        targetLayer = torch.stack([self.targetNetworks[target[i]](targetInput[i]) for i in range(thisStateNum)])
-        middleInput = torch.cat([viewLayer,targetLayer],dim = 1)
-        middleLayer = torch.stack([self.middleNetworks[target[i]](middleInput[i]) for i in range(thisStateNum)])
-
-        # discrete
-        # iterate over the targets (i.e. the agents) so each state uses the output head that matches its target
-        dis_logits = torch.stack([self.actor_dis[target[i]](middleLayer[i]) for i in range(thisStateNum)])
-        split_logits = torch.split(dis_logits, self.discrete_shape, dim=1)
-        multi_categoricals = [Categorical(logits=thisLogits) for thisLogits in split_logits]
-        # continuous
-        actions_mean = torch.stack([self.actor_mean[target[i]](middleLayer[i]) for i in range(thisStateNum)]) # self.actor_mean(hidden)
-        # action_logstd = torch.stack([self.actor_logstd[target[i]](middleLayer[i]) for i in range(thisStateNum)]) # self.actor_logstd(hidden)
-        # action_logstd = self.actor_logstd.expand_as(actions_mean) # self.actor_logstd.expand_as(actions_mean)
-        action_logstd = torch.stack([torch.squeeze(self.actor_logstd[target[i]],0) for i in range(thisStateNum)])
-        # print(action_logstd)
-        action_std = torch.exp(action_logstd) # torch.exp(action_logstd)
-        con_probs = Normal(actions_mean, action_std)
-        # critic
-        criticV = torch.stack([self.critic[target[i]](middleLayer[i]) for i in range(thisStateNum)]) # self.critic
-
-        if actions is None:
-            if args.train:
-                # select actions base on probability distribution model
-                disAct = torch.stack([ctgr.sample() for ctgr in multi_categoricals])
-                conAct = con_probs.sample()
-                actions = torch.cat([disAct.T, conAct], dim=1)
-            else:
-                # select actions base on best probability distribution
-                # disAct = torch.stack([torch.argmax(logit, dim=1) for logit in split_logits])
-                conAct = actions_mean
-                disAct = torch.stack([ctgr.sample() for ctgr in multi_categoricals])
-                conAct = con_probs.sample()
-                actions = torch.cat([disAct.T, conAct], dim=1)
-        else:
-            disAct = actions[:, 0 : env.unity_discrete_type].T
-            conAct = actions[:, env.unity_discrete_type :]
-        dis_log_prob = torch.stack(
-            [ctgr.log_prob(act) for act, ctgr in zip(disAct, multi_categoricals)]
-        )
-        dis_entropy = torch.stack([ctgr.entropy() for ctgr in multi_categoricals])
-        return (
-            actions,
-            dis_log_prob.sum(0),
-            dis_entropy.sum(0),
-            con_probs.log_prob(conAct).sum(1),
-            con_probs.entropy().sum(1),
-            criticV,
-        )
-
-
-def GAE(agent, args, rewards, dones, values, next_obs, next_done):
-    # GAE
-    with torch.no_grad():
-        next_value = agent.get_value(next_obs).reshape(1, -1)
-        data_size = rewards.size()[0]
-        if args.gae:
-            advantages = torch.zeros_like(rewards).to(device)
-            lastgaelam = 0
-            for t in reversed(range(data_size)):
-                if t == data_size - 1:
-                    nextnonterminal = 1.0 - next_done
-                    nextvalues = next_value
-                else:
-                    nextnonterminal = 1.0 - dones[t + 1]
-                    nextvalues = values[t + 1]
-                delta = rewards[t] + args.gamma * nextvalues * nextnonterminal - values[t]
-                advantages[t] = lastgaelam = (
-                    delta + args.gamma * args.gaeLambda * nextnonterminal * lastgaelam
-                )
-            returns = advantages + values
-        else:
-            returns = torch.zeros_like(rewards).to(device)
-            for t in reversed(range(data_size)):
-                if t == data_size - 1:
-                    nextnonterminal = 1.0 - next_done
-                    next_return = next_value
-                else:
-                    nextnonterminal = 1.0 - dones[t + 1]
-                    next_return = returns[t + 1]
-                returns[t] = rewards[t] + args.gamma * nextnonterminal * next_return
-            advantages = returns - values
-    return advantages, returns
-
-
-class AimbotSideChannel(SideChannel):
-    def __init__(self, channel_id: uuid.UUID) -> None:
-        super().__init__(channel_id)
-    def on_message_received(self, msg: IncomingMessage) -> None:
-        global SCrecieved # make sure this variable is global
-        """
-        Note: We must implement this method of the SideChannel interface to
-        receive messages from Unity
-        Message will be sent like this:
-        "Warning|Message1|Message2|Message3" or
-        "Error|Message1|Message2|Message3"
-        """
-        thisMessage = msg.read_string()
-        thisResult = thisMessage.split("|")
-        if(thisResult[0] == "result"):
-            TotalRounds[thisResult[1]]+=1
-            if(thisResult[2] == "Win"):
-                WinRounds[thisResult[1]]+=1
-            #print(TotalRounds)
-            #print(WinRounds)
-        elif(thisResult[0] == "Error"):
-            print(thisMessage)
-
-        # # while Message type is Warning
-        # if(thisResult[0] == "Warning"):
-        #     # while Message1 is result means one game is over
-        #     if (thisResult[1] == "Result"):
-        #         TotalRounds[thisResult[2]]+=1
-        #         # while Message3 is Win means this agent win this game
-        #         if(thisResult[3] == "Win"):
-        #             WinRounds[thisResult[2]]+=1
-        #     # while Message1 is GameState means this game is just start
-        #     # and tell python which game mode is
-        #     elif (thisResult[1] == "GameState"):
-        #         SCrecieved = 1
-        # # while Message type is Error
-        # elif(thisResult[0] == "Error"):
-        #     print(thisMessage)
-    # send function
-    def send_string(self, data: str) -> None:
-        # send a string toC#
-        msg = OutgoingMessage()
-        msg.write_string(data)
-        super().queue_message_to_send(msg)
-
-    def send_bool(self, data: bool) -> None:
-        msg = OutgoingMessage()
-        msg.write_bool(data)
-        super().queue_message_to_send(msg)
-
-    def send_int(self, data: int) -> None:
-        msg = OutgoingMessage()
-        msg.write_int32(data)
-        super().queue_message_to_send(msg)
-
-    def send_float(self, data: float) -> None:
-        msg = OutgoingMessage()
-        msg.write_float32(data)
-        super().queue_message_to_send(msg)
-
-    def send_float_list(self, data: List[float]) -> None:
-        msg = OutgoingMessage()
-        msg.write_float32_list(data)
-        super().queue_message_to_send(msg)
-
-
-def broadCastEndReward(rewardBF:list,remainTime:float):
-    thisRewardBF = rewardBF
-    if (rewardBF[-1]<=-500):
-        # print("Lose DO NOT BROAD CAST",rewardBF[-1])
-        thisRewardBF[-1] = rewardBF[-1]-BASE_LOSEREWARD
-    elif (rewardBF[-1]>=500):
-        # print("Win! Broadcast reward!",rewardBF[-1])
-        print(sum(thisRewardBF)/len(thisRewardBF))
-        thisRewardBF[-1] = rewardBF[-1]-BASE_WINREWARD
-        thisRewardBF = (np.asarray(thisRewardBF)+(remainTime*args.result_broadcast_ratio)).tolist()
-    else:
-        print("!!!!!DIDNT GET RESULT REWARD!!!!!!",rewardBF[-1])
-    return torch.Tensor(thisRewardBF).to(device)
-
-
 if __name__ == "__main__":
     args = parse_args()
     random.seed(args.seed)
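
Note (not part of the commit): the GAE() helper removed in the hunk above implements the standard generalized advantage estimation recursion, iterated backwards over the collected steps:

    delta_t   = r_t + gamma * V(s_{t+1}) * (1 - done_{t+1}) - V(s_t)
    A_t       = delta_t + gamma * lambda * (1 - done_{t+1}) * A_{t+1}
    returns_t = A_t + V(s_t)

with gamma = args.gamma and lambda = args.gaeLambda. The same computation now appears to live behind agent.gae(), which the new PPOMem.save_memories() calls further down.
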
@@ -401,10 +160,21 @@ if __name__ == "__main__":
     device = torch.device("cuda" if torch.cuda.is_available() and args.cuda else "cpu")
 
     # Initialize environment anget optimizer
-    aimBotsideChannel = AimbotSideChannel(SIDE_CHANNEL_UUID);
-    env = Aimbot(envPath=args.path, workerID=args.workerID, basePort=args.baseport,side_channels=[aimBotsideChannel])
+    aimbot_sidechannel = AimbotSideChannel(SIDE_CHANNEL_UUID);
+    env = Aimbot(envPath=args.path, workerID=args.workerID, basePort=args.baseport,side_channels=[aimbot_sidechannel])
     if args.load_dir is None:
-        agent = PPOAgent(env,TARGETNUM).to(device)
+        agent = PPOAgent(
+            env = env,
+            this_args=args,
+            train_agent=args.train,
+            target_num=TARGETNUM,
+            target_state_size= TARGET_STATE_SIZE,
+            time_state_size=TIME_STATE_SIZE,
+            gun_state_size=GUN_STATE_SIZE,
+            my_state_size=MY_STATE_SIZE,
+            total_t_size=TOTAL_T_SIZE,
+            device=device,
+        ).to(device)
     else:
         agent = torch.load(args.load_dir)
     # freeze
@@ -419,24 +189,8 @@ if __name__ == "__main__":
     optimizer = optim.Adam(agent.parameters(), lr=args.lr, eps=1e-5)
 
     # Tensorboard and WandB Recorder
-    run_name = f"{game_type}_{args.seed}_{int(time.time())}"
-    if args.wandb_track:
-        wandb.init(
-            project=game_name,
-            entity=args.wandb_entity,
-            sync_tensorboard=True,
-            config=vars(args),
-            name=run_name,
-            monitor_gym=True,
-            save_code=True,
-        )
-
-    writer = SummaryWriter(f"runs/{run_name}")
-    writer.add_text(
-        "hyperparameters",
-        "|param|value|\n|-|-|\n%s"
-        % ("\n".join([f"|{key}|{value}|" for key, value in vars(args).items()])),
-    )
+    run_name = f"{GAME_TYPE}_{args.seed}_{int(time.time())}"
+    wdb_recorder = WandbRecorder(GAME_NAME, GAME_TYPE, run_name, args)
 
     @atexit.register
     def save_model():
@@ -444,60 +198,49 @@ if __name__ == "__main__":
         env.close()
         if args.save_model:
             # save model while exit
-            saveDir = "../PPO-Model/"+ run_name + "_last.pt"
-            torch.save(agent, saveDir)
-            print("save model to " + saveDir)
+            save_dir = "../PPO-Model/"+ run_name + "_last.pt"
+            torch.save(agent, save_dir)
+            print("save model to " + save_dir)
 
-    # Trajectory Buffer
-    ob_bf = [[] for i in range(env.unity_agent_num)]
-    act_bf = [[] for i in range(env.unity_agent_num)]
-    dis_logprobs_bf = [[] for i in range(env.unity_agent_num)]
-    con_logprobs_bf = [[] for i in range(env.unity_agent_num)]
-    rewards_bf = [[] for i in range(env.unity_agent_num)]
-    dones_bf = [[] for i in range(env.unity_agent_num)]
-    values_bf = [[] for i in range(env.unity_agent_num)]
-
     # start the game
     total_update_step = using_targets_num * args.total_timesteps // args.datasetSize
     target_steps = [0 for i in range(TARGETNUM)]
     start_time = time.time()
     state, _, done = env.reset()
-    # state = torch.Tensor(next_obs).to(device)
-    # next_done = torch.zeros(env.unity_agent_num).to(device)
 
-    # initialize empty training datasets
-    obs = [torch.tensor([]).to(device) for i in range(TARGETNUM)]  # (TARGETNUM,n,env.unity_observation_size)
-    actions = [torch.tensor([]).to(device) for i in range(TARGETNUM)]  # (TARGETNUM,n,env.unity_action_size)
-    dis_logprobs = [torch.tensor([]).to(device) for i in range(TARGETNUM)]  # (TARGETNUM,n,1)
-    con_logprobs = [torch.tensor([]).to(device) for i in range(TARGETNUM)]  # (TARGETNUM,n,1)
-    rewards = [torch.tensor([]).to(device) for i in range(TARGETNUM)]  # (TARGETNUM,n,1)
-    values = [torch.tensor([]).to(device) for i in range(TARGETNUM)]  # (TARGETNUM,n,1)
-    advantages = [torch.tensor([]).to(device) for i in range(TARGETNUM)]  # (TARGETNUM,n,1)
-    returns = [torch.tensor([]).to(device) for i in range(TARGETNUM)]  # (TARGETNUM,n,1)
+    # initialize AI memories
+    ppo_memories = PPOMem(
+        env = env,
+        device = device,
+        args=args,
+        target_num = TARGETNUM,
+        target_state_size = TARGET_STATE_SIZE,
+        base_lose_reward = BASE_LOSEREWARD,
+        base_win_reward = BASE_WINREWARD,
+    )
 
     for total_steps in range(total_update_step):
         # discunt learning rate, while step == total_update_step lr will be 0
-
         if args.annealLR:
-            finalRatio = TARGET_LEARNING_RATE/args.lr
+            final_lr_ratio = TARGET_LEARNING_RATE/args.lr
             frac = 1.0 - ((total_steps + 1.0) / total_update_step)
-            lrnow = frac * args.lr
-            optimizer.param_groups[0]["lr"] = lrnow
+            lr_now = frac * args.lr
+            optimizer.param_groups[0]["lr"] = lr_now
         else:
-            lrnow = args.lr
-        print("new episode",total_steps,"learning rate = ",lrnow)
+            lr_now = args.lr
+        print("new episode",total_steps,"learning rate = ",lr_now)
 
 
         # MAIN LOOP: run agent in environment
         step = 0
         training = False
-        trainQueue = []
+        train_queue = []
         last_reward = [0.for i in range(env.unity_agent_num)]
         while True:
             if step % args.decision_period == 0:
                 step += 1
                 # Choose action by agent
 
                 with torch.no_grad():
                     # predict actions
                     action, dis_logprob, _, con_logprob, _, value = agent.get_actions_value(
@@ -514,60 +257,27 @@ if __name__ == "__main__":
                 next_state, reward, next_done = env.step(action_cpu)
 
                 # save memories
-                for i in range(env.unity_agent_num):
-                    # save memories to buffers
-                    ob_bf[i].append(state[i])
-                    act_bf[i].append(action_cpu[i])
-                    dis_logprobs_bf[i].append(dis_logprob_cpu[i])
-                    con_logprobs_bf[i].append(con_logprob_cpu[i])
-                    rewards_bf[i].append(reward[i]+last_reward[i])
-                    dones_bf[i].append(done[i])
-                    values_bf[i].append(value_cpu[i])
-                    remainTime = state[i,TARGET_STATE_SIZE]
-                    if next_done[i] == True:
-                        # finished a round, send finished memories to training datasets
-                        # compute advantage and discounted reward
-                        #print(i,"over")
-                        roundTargetType = int(state[i,0])
-                        thisRewardsTensor = broadCastEndReward(rewards_bf[i],remainTime)
-                        adv, rt = GAE(
-                            agent,
-                            args,
-                            thisRewardsTensor,
-                            torch.Tensor(dones_bf[i]).to(device),
-                            torch.tensor(values_bf[i]).to(device),
-                            torch.tensor(next_state[i]).to(device).unsqueeze(0),
-                            torch.Tensor([next_done[i]]).to(device),
-                        )
-                        # send memories to training datasets
-                        obs[roundTargetType] = torch.cat((obs[roundTargetType], torch.tensor(ob_bf[i]).to(device)), 0)
-                        actions[roundTargetType] = torch.cat((actions[roundTargetType], torch.tensor(act_bf[i]).to(device)), 0)
-                        dis_logprobs[roundTargetType] = torch.cat(
-                            (dis_logprobs[roundTargetType], torch.tensor(dis_logprobs_bf[i]).to(device)), 0
-                        )
-                        con_logprobs[roundTargetType] = torch.cat(
-                            (con_logprobs[roundTargetType], torch.tensor(con_logprobs_bf[i]).to(device)), 0
-                        )
-                        rewards[roundTargetType] = torch.cat((rewards[roundTargetType], thisRewardsTensor), 0)
-                        values[roundTargetType] = torch.cat((values[roundTargetType], torch.tensor(values_bf[i]).to(device)), 0)
-                        advantages[roundTargetType] = torch.cat((advantages[roundTargetType], adv), 0)
-                        returns[roundTargetType] = torch.cat((returns[roundTargetType], rt), 0)
-
-                        # clear buffers
-                        ob_bf[i] = []
-                        act_bf[i] = []
-                        dis_logprobs_bf[i] = []
-                        con_logprobs_bf[i] = []
-                        rewards_bf[i] = []
-                        dones_bf[i] = []
-                        values_bf[i] = []
-                        print(f"train dataset {Targets(roundTargetType).name} added:{obs[roundTargetType].size()[0]}/{args.datasetSize}")
-
+                ppo_memories.save_memories(
+                    now_step = step,
+                    agent = agent,
+                    state = state,
+                    action_cpu = action_cpu,
+                    dis_logprob_cpu = dis_logprob_cpu,
+                    con_logprob_cpu = con_logprob_cpu,
+                    reward = reward,
+                    done = done,
+                    value_cpu = value_cpu,
+                    last_reward = last_reward,
+                    next_done = next_done,
+                    next_state=next_state,
+                )
+                # check if any training dataset is full and ready to train
                 for i in range(TARGETNUM):
-                    if obs[i].size()[0] >= args.datasetSize:
+                    if ppo_memories.obs[i].size()[0] >= args.datasetSize:
                         # start train NN
-                        trainQueue.append(i)
-                if(len(trainQueue)>0):
+                        train_queue.append(i)
+                if(len(train_queue)>0):
                     break
                 state, done = next_state, next_done
             else:
@@ -575,74 +285,40 @@ if __name__ == "__main__":
                 # skip this step use last predict action
                 next_state, reward, next_done = env.step(action_cpu)
                 # save memories
-                for i in range(env.unity_agent_num):
-                    if next_done[i] == True:
-                        #print(i,"over???")
-                        # save memories to buffers
-                        ob_bf[i].append(state[i])
-                        act_bf[i].append(action_cpu[i])
-                        dis_logprobs_bf[i].append(dis_logprob_cpu[i])
-                        con_logprobs_bf[i].append(con_logprob_cpu[i])
-                        rewards_bf[i].append(reward[i])
-                        dones_bf[i].append(done[i])
-                        values_bf[i].append(value_cpu[i])
-                        remainTime = state[i,TARGET_STATE_SIZE]
-                        # finished a round, send finished memories to training datasets
-                        # compute advantage and discounted reward
-                        roundTargetType = int(state[i,0])
-                        thisRewardsTensor = broadCastEndReward(rewards_bf[i],remainTime)
-                        adv, rt = GAE(
-                            agent,
-                            args,
-                            thisRewardsTensor,
-                            torch.Tensor(dones_bf[i]).to(device),
-                            torch.tensor(values_bf[i]).to(device),
-                            torch.Tensor(next_state[i]).to(device).unsqueeze(dim = 0),
-                            torch.Tensor([next_done[i]]).to(device),
-                        )
-                        # send memories to training datasets
-                        obs[roundTargetType] = torch.cat((obs[roundTargetType], torch.tensor(ob_bf[i]).to(device)), 0)
-                        actions[roundTargetType] = torch.cat((actions[roundTargetType], torch.tensor(act_bf[i]).to(device)), 0)
-                        dis_logprobs[roundTargetType] = torch.cat(
-                            (dis_logprobs[roundTargetType], torch.tensor(dis_logprobs_bf[i]).to(device)), 0
-                        )
-                        con_logprobs[roundTargetType] = torch.cat(
-                            (con_logprobs[roundTargetType], torch.tensor(con_logprobs_bf[i]).to(device)), 0
-                        )
-                        rewards[roundTargetType] = torch.cat((rewards[roundTargetType], thisRewardsTensor), 0)
-                        values[roundTargetType] = torch.cat((values[roundTargetType], torch.tensor(values_bf[i]).to(device)), 0)
-                        advantages[roundTargetType] = torch.cat((advantages[roundTargetType], adv), 0)
-                        returns[roundTargetType] = torch.cat((returns[roundTargetType], rt), 0)
-
-                        # clear buffers
-                        ob_bf[i] = []
-                        act_bf[i] = []
-                        dis_logprobs_bf[i] = []
-                        con_logprobs_bf[i] = []
-                        rewards_bf[i] = []
-                        dones_bf[i] = []
-                        values_bf[i] = []
-                        print(f"train dataset {Targets(roundTargetType).name} added:{obs[roundTargetType].size()[0]}/{args.datasetSize}")
-
+                ppo_memories.save_memories(
+                    now_step = step,
+                    agent = agent,
+                    state = state,
+                    action_cpu = action_cpu,
+                    dis_logprob_cpu = dis_logprob_cpu,
+                    con_logprob_cpu = con_logprob_cpu,
+                    reward = reward,
+                    done = done,
+                    value_cpu = value_cpu,
+                    last_reward = last_reward,
+                    next_done = next_done,
+                    next_state=next_state,
+                )
                 state = next_state
                 last_reward = reward
-                i += 1
 
         if args.train:
-            meanRewardList = []  # for WANDB
+            # train mode on
+            mean_reward_list = []  # for WANDB
             # loop all tarining queue
-            for thisT in trainQueue:
+            for thisT in train_queue:
                 # sart time
-                startTime = time.time()
+                start_time = time.time()
                 target_steps[thisT]+=1
                 # flatten the batch
-                b_obs = obs[thisT].reshape((-1,) + env.unity_observation_shape)
-                b_dis_logprobs = dis_logprobs[thisT].reshape(-1)
-                b_con_logprobs = con_logprobs[thisT].reshape(-1)
-                b_actions = actions[thisT].reshape((-1,) + (env.unity_action_size,))
-                b_advantages = advantages[thisT].reshape(-1)
-                b_returns = returns[thisT].reshape(-1)
-                b_values = values[thisT].reshape(-1)
+                b_obs = ppo_memories.obs[thisT].reshape((-1,) + env.unity_observation_shape)
+                b_dis_logprobs = ppo_memories.dis_logprobs[thisT].reshape(-1)
+                b_con_logprobs = ppo_memories.con_logprobs[thisT].reshape(-1)
+                b_actions = ppo_memories.actions[thisT].reshape((-1,) + (env.unity_action_size,))
+                b_advantages = ppo_memories.advantages[thisT].reshape(-1)
+                b_returns = ppo_memories.returns[thisT].reshape(-1)
+                b_values = ppo_memories.values[thisT].reshape(-1)
                 b_size = b_obs.size()[0]
                 # Optimizing the policy and value network
                 b_inds = np.arange(b_size)
@@ -751,67 +427,61 @@ if __name__ == "__main__":
                 """
                 # record mean reward before clear history
                 print("done")
-                targetRewardMean = np.mean(rewards[thisT].to("cpu").detach().numpy().copy())
-                meanRewardList.append(targetRewardMean)
+                targetRewardMean = np.mean(ppo_memories.rewards[thisT].to("cpu").detach().numpy().copy())
+                mean_reward_list.append(targetRewardMean)
                 targetName = Targets(thisT).name
 
                 # clear this target trainning set buffer
-                obs[thisT] = torch.tensor([]).to(device)
-                actions[thisT] = torch.tensor([]).to(device)
-                dis_logprobs[thisT] = torch.tensor([]).to(device)
-                con_logprobs[thisT] = torch.tensor([]).to(device)
-                rewards[thisT] = torch.tensor([]).to(device)
-                values[thisT] = torch.tensor([]).to(device)
-                advantages[thisT] = torch.tensor([]).to(device)
-                returns[thisT] = torch.tensor([]).to(device)
+                ppo_memories.clear_training_datasets(thisT)
 
                 # record rewards for plotting purposes
-                writer.add_scalar(f"Target{targetName}/value_loss", v_loss.item(), target_steps[thisT])
-                writer.add_scalar(f"Target{targetName}/dis_policy_loss", dis_pg_loss.item(), target_steps[thisT])
-                writer.add_scalar(f"Target{targetName}/con_policy_loss", con_pg_loss.item(), target_steps[thisT])
-                writer.add_scalar(f"Target{targetName}/total_loss", loss.item(), target_steps[thisT])
-                writer.add_scalar(f"Target{targetName}/entropy_loss", entropy_loss.item(), target_steps[thisT])
-                writer.add_scalar(f"Target{targetName}/Reward", targetRewardMean, target_steps[thisT])
-                writer.add_scalar(f"Target{targetName}/WinRatio", WinRounds[targetName]/TotalRounds[targetName], target_steps[thisT])
+                wdb_recorder.add_target_scalar(
+                    targetName,
+                    thisT,
+                    v_loss,
+                    dis_pg_loss,
+                    con_pg_loss,
+                    loss,
+                    entropy_loss,
+                    targetRewardMean,
+                    target_steps,
+                )
                 print(f"episode over Target{targetName} mean reward:", targetRewardMean)
-            TotalRewardMean = np.mean(meanRewardList)
-            writer.add_scalar("GlobalCharts/TotalRewardMean", TotalRewardMean, total_steps)
-            writer.add_scalar("GlobalCharts/learning_rate", optimizer.param_groups[0]["lr"], total_steps)
+            TotalRewardMean = np.mean(mean_reward_list)
+            wdb_recorder.add_global_scalar(
+                TotalRewardMean,
+                optimizer.param_groups[0]["lr"],
+                total_steps,
+            )
             # print cost time as seconds
             print("cost time:", time.time() - start_time)
             # New Record!
-            if TotalRewardMean > bestReward and args.save_model:
-                bestReward = targetRewardMean
+            if TotalRewardMean > best_reward and args.save_model:
+                best_reward = targetRewardMean
                 saveDir = "../PPO-Model/" + run_name +"_"+ str(TotalRewardMean) + ".pt"
                 torch.save(agent, saveDir)
         else:
-            meanRewardList = []  # for WANDB
+            # train mode off
+            mean_reward_list = []  # for WANDB
            # while not in training mode, clear the buffer
-            for thisT in trainQueue:
+            for thisT in train_queue:
                 target_steps[thisT]+=1
                 targetName = Targets(thisT).name
-                targetRewardMean = np.mean(rewards[thisT].to("cpu").detach().numpy().copy())
-                meanRewardList.append(targetRewardMean)
+                targetRewardMean = np.mean(ppo_memories.rewards[thisT].to("cpu").detach().numpy().copy())
+                mean_reward_list.append(targetRewardMean)
                 print(target_steps[thisT])
 
-                obs[thisT] = torch.tensor([]).to(device)
-                actions[thisT] = torch.tensor([]).to(device)
-                dis_logprobs[thisT] = torch.tensor([]).to(device)
-                con_logprobs[thisT] = torch.tensor([]).to(device)
-                rewards[thisT] = torch.tensor([]).to(device)
-                values[thisT] = torch.tensor([]).to(device)
-                advantages[thisT] = torch.tensor([]).to(device)
-                returns[thisT] = torch.tensor([]).to(device)
+                # clear this target trainning set buffer
+                ppo_memories.clear_training_datasets(thisT)
 
                 # record rewards for plotting purposes
-                writer.add_scalar(f"Target{targetName}/Reward", targetRewardMean, target_steps[thisT])
-                writer.add_scalar(f"Target{targetName}/WinRatio", WinRounds[targetName]/TotalRounds[targetName], target_steps[thisT])
+                wdb_recorder.writer.add_scalar(f"Target{targetName}/Reward", targetRewardMean, target_steps[thisT])
+                wdb_recorder.add_win_ratio(targetName,target_steps[thisT])
                 print(f"episode over Target{targetName} mean reward:", targetRewardMean)
-            TotalRewardMean = np.mean(meanRewardList)
-            writer.add_scalar("GlobalCharts/TotalRewardMean", TotalRewardMean, total_steps)
+            TotalRewardMean = np.mean(mean_reward_list)
+            wdb_recorder.writer.add_scalar("GlobalCharts/TotalRewardMean", TotalRewardMean, total_steps)
 
     saveDir = "../PPO-Model/"+ run_name + "_last.pt"
     torch.save(agent, saveDir)
     env.close()
-    writer.close()
+    wdb_recorder.writer.close()
146
Aimbot-PPO-Python/Pytorch/aimemory.py
Normal file
146
Aimbot-PPO-Python/Pytorch/aimemory.py
Normal file
@ -0,0 +1,146 @@
|
|||||||
|
import torch
|
||||||
|
import numpy as np
|
||||||
|
import argparse
|
||||||
|
from aimbotEnv import Aimbot
|
||||||
|
from ppoagent import PPOAgent
|
||||||
|
from enum import Enum
|
||||||
|
|
||||||
|
# public data
|
||||||
|
class Targets(Enum):
|
||||||
|
Free = 0
|
||||||
|
Go = 1
|
||||||
|
Attack = 2
|
||||||
|
Defence = 3
|
||||||
|
Num = 4
|
||||||
|
|
||||||
|
class PPOMem:
|
||||||
|
def __init__(
|
||||||
|
self,
|
||||||
|
env: Aimbot,
|
||||||
|
args: argparse.Namespace,
|
||||||
|
device: torch.device,
|
||||||
|
target_num: int,
|
||||||
|
target_state_size: int,
|
||||||
|
base_lose_reward: int,
|
||||||
|
base_win_reward: int,
|
||||||
|
) -> None:
|
||||||
|
self.data_set_size = args.datasetSize
|
||||||
|
self.result_broadcast_ratio = args.result_broadcast_ratio
|
||||||
|
self.decision_period = args.decision_period
|
||||||
|
self.unity_agent_num = env.unity_agent_num
|
||||||
|
|
||||||
|
self.base_lose_reward = base_lose_reward
|
||||||
|
self.base_win_reward = base_win_reward
|
||||||
|
self.target_state_size = target_state_size
|
||||||
|
self.device = device
|
||||||
|
|
||||||
|
# Trajectory Buffer
|
||||||
|
self.ob_bf = [[] for i in range(env.unity_agent_num)]
|
||||||
|
self.act_bf = [[] for i in range(env.unity_agent_num)]
|
||||||
|
self.dis_logprobs_bf = [[] for i in range(env.unity_agent_num)]
|
||||||
|
self.con_logprobs_bf = [[] for i in range(env.unity_agent_num)]
|
||||||
|
self.rewards_bf = [[] for i in range(env.unity_agent_num)]
|
||||||
|
self.dones_bf = [[] for i in range(env.unity_agent_num)]
|
||||||
|
self.values_bf = [[] for i in range(env.unity_agent_num)]
|
||||||
|
|
||||||
|
# initialize empty training datasets
|
||||||
|
self.obs = [torch.tensor([]).to(device) for i in range(target_num)] # (TARGETNUM,n,env.unity_observation_size)
|
||||||
|
self.actions = [torch.tensor([]).to(device) for i in range(target_num)] # (TARGETNUM,n,env.unity_action_size)
|
||||||
|
self.dis_logprobs = [torch.tensor([]).to(device) for i in range(target_num)] # (TARGETNUM,n,1)
|
||||||
|
self.con_logprobs = [torch.tensor([]).to(device) for i in range(target_num)] # (TARGETNUM,n,1)
|
||||||
|
self.rewards = [torch.tensor([]).to(device) for i in range(target_num)] # (TARGETNUM,n,1)
|
||||||
|
self.values = [torch.tensor([]).to(device) for i in range(target_num)] # (TARGETNUM,n,1)
|
||||||
|
self.advantages = [torch.tensor([]).to(device) for i in range(target_num)] # (TARGETNUM,n,1)
|
||||||
|
self.returns = [torch.tensor([]).to(device) for i in range(target_num)] # (TARGETNUM,n,1)
|
||||||
|
|
||||||
|
def broad_cast_end_reward(self, rewardBF: list, remainTime: float) -> torch.Tensor:
|
||||||
|
thisRewardBF = rewardBF.copy()
|
||||||
|
if rewardBF[-1] <= -500:
|
||||||
|
# print("Lose DO NOT BROAD CAST",rewardBF[-1])
|
||||||
|
thisRewardBF[-1] = rewardBF[-1] - self.base_lose_reward
|
||||||
|
elif rewardBF[-1] >= 500:
|
||||||
|
# print("Win! Broadcast reward!",rewardBF[-1])
|
||||||
|
print(sum(thisRewardBF) / len(thisRewardBF))
|
||||||
|
thisRewardBF[-1] = rewardBF[-1] - self.base_win_reward
|
||||||
|
thisRewardBF = (np.asarray(thisRewardBF) + (remainTime * self.result_broadcast_ratio)).tolist()
|
||||||
|
else:
|
||||||
|
print("!!!!!DIDNT GET RESULT REWARD!!!!!!", rewardBF[-1])
|
||||||
|
return torch.Tensor(thisRewardBF).to(self.device)
|
||||||
|
|
||||||
|
def save_memories(
|
||||||
|
self,
|
||||||
|
now_step: int,
|
||||||
|
agent: PPOAgent,
|
||||||
|
state: np.ndarray,
|
||||||
|
action_cpu: np.ndarray,
|
||||||
|
dis_logprob_cpu: np.ndarray,
|
||||||
|
con_logprob_cpu: np.ndarray,
|
||||||
|
reward: list,
|
||||||
|
done: list,
|
||||||
|
value_cpu: np.ndarray,
|
||||||
|
last_reward: list,
|
||||||
|
next_done: list,
|
||||||
|
next_state: np.ndarray,
|
||||||
|
):
|
||||||
|
for i in range(self.unity_agent_num):
|
||||||
|
if now_step % self.decision_period == 0 or next_done[i] == True:
|
||||||
|
# only on decision period or finished a round, save memories to buffer
|
||||||
|
self.ob_bf[i].append(state[i])
|
||||||
|
self.act_bf[i].append(action_cpu[i])
|
||||||
|
self.dis_logprobs_bf[i].append(dis_logprob_cpu[i])
|
||||||
|
self.con_logprobs_bf[i].append(con_logprob_cpu[i])
|
||||||
|
self.dones_bf[i].append(done[i])
|
||||||
|
self.values_bf[i].append(value_cpu[i])
|
||||||
|
if now_step % self.decision_period == 0:
|
||||||
|
# on decision period, add last skiped round's reward
|
||||||
|
self.rewards_bf[i].append(reward[i] + last_reward[i])
|
||||||
|
else:
|
||||||
|
# not on decision period, only add this round's reward
|
||||||
|
self.rewards_bf[i].append(reward[i])
|
||||||
|
if next_done[i] == True:
|
||||||
|
# finished a round, send finished memories to training datasets
|
||||||
|
# compute advantage and discounted reward
|
||||||
|
remainTime = state[i, self.target_state_size]
|
||||||
|
roundTargetType = int(state[i, 0])
|
||||||
|
thisRewardsTensor = self.broad_cast_end_reward(self.rewards_bf[i], remainTime)
|
||||||
|
adv, rt = agent.gae(
|
||||||
|
rewards=thisRewardsTensor,
|
||||||
|
dones=torch.Tensor(self.dones_bf[i]).to(self.device),
|
||||||
|
values=torch.tensor(self.values_bf[i]).to(self.device),
|
||||||
|
next_obs=torch.tensor(next_state[i]).to(self.device).unsqueeze(0),
|
||||||
|
next_done=torch.Tensor([next_done[i]]).to(self.device),
|
||||||
|
)
|
||||||
|
# send memories to training datasets
|
||||||
|
self.obs[roundTargetType] = torch.cat((self.obs[roundTargetType], torch.tensor(self.ob_bf[i]).to(self.device)), 0)
|
||||||
|
self.actions[roundTargetType] = torch.cat((self.actions[roundTargetType], torch.tensor(self.act_bf[i]).to(self.device)), 0)
|
||||||
|
self.dis_logprobs[roundTargetType] = torch.cat((self.dis_logprobs[roundTargetType], torch.tensor(self.dis_logprobs_bf[i]).to(self.device)), 0)
|
||||||
|
self.con_logprobs[roundTargetType] = torch.cat((self.con_logprobs[roundTargetType], torch.tensor(self.con_logprobs_bf[i]).to(self.device)), 0)
|
||||||
|
self.rewards[roundTargetType] = torch.cat((self.rewards[roundTargetType], thisRewardsTensor), 0)
|
||||||
|
self.values[roundTargetType] = torch.cat((self.values[roundTargetType], torch.tensor(self.values_bf[i]).to(self.device)), 0)
|
||||||
|
self.advantages[roundTargetType] = torch.cat((self.advantages[roundTargetType], adv), 0)
|
||||||
|
self.returns[roundTargetType] = torch.cat((self.returns[roundTargetType], rt), 0)
|
||||||
|
|
||||||
|
# clear buffers
|
||||||
|
self.clear_buffers(i)
|
||||||
|
print(f"train dataset {Targets(roundTargetType).name} added:{self.obs[roundTargetType].size()[0]}/{self.data_set_size}")
|
||||||
|
|
||||||
|
def clear_buffers(self,ind:int):
|
||||||
|
# clear buffers
|
||||||
|
self.ob_bf[ind] = []
|
||||||
|
self.act_bf[ind] = []
|
||||||
|
self.dis_logprobs_bf[ind] = []
|
||||||
|
self.con_logprobs_bf[ind] = []
|
||||||
|
self.rewards_bf[ind] = []
|
||||||
|
self.dones_bf[ind] = []
|
||||||
|
self.values_bf[ind] = []
|
||||||
|
|
||||||
|
def clear_training_datasets(self,ind:int):
|
||||||
|
# clear training datasets
|
||||||
|
self.obs[ind] = torch.tensor([]).to(self.device)
|
||||||
|
self.actions[ind] = torch.tensor([]).to(self.device)
|
||||||
|
self.dis_logprobs[ind] = torch.tensor([]).to(self.device)
|
||||||
|
self.con_logprobs[ind] = torch.tensor([]).to(self.device)
|
||||||
|
self.rewards[ind] = torch.tensor([]).to(self.device)
|
||||||
|
self.values[ind] = torch.tensor([]).to(self.device)
|
||||||
|
self.advantages[ind] = torch.tensor([]).to(self.device)
|
||||||
|
self.returns[ind] = torch.tensor([]).to(self.device)
|
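For orientation, here is a minimal sketch of the call site save_memories expects, assuming the buffer object defined above is named mem and the surrounding collection loop already produced the per-step variables; every name other than save_memories and its keyword arguments is a placeholder, not part of this commit.

# illustrative call site; all names except save_memories' keywords are placeholders
mem.save_memories(
    now_step=step,                            # global environment step counter
    agent=agent,                              # PPOAgent, used for agent.gae() when a round ends
    state=state,                              # observations before the action, shape (agent_num, state_size)
    action_cpu=act.cpu().numpy(),             # actions that were just executed
    dis_logprob_cpu=dis_logprob.cpu().numpy(),
    con_logprob_cpu=con_logprob.cpu().numpy(),
    reward=reward,                            # per-agent reward returned by this step
    done=done,                                # per-agent done flags recorded with this step
    value_cpu=value.flatten().cpu().numpy(),
    last_reward=last_reward,                  # reward accumulated over skipped (non-decision) steps
    next_done=next_done,                      # per-agent done flags after stepping the environment
    next_state=next_state,                    # observations after the action
)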
82
Aimbot-PPO-Python/Pytorch/airecorder.py
Normal file
@ -0,0 +1,82 @@
import wandb
import time
from torch.utils.tensorboard import SummaryWriter


total_rounds = {"Free": 0, "Go": 0, "Attack": 0}
win_rounds = {"Free": 0, "Go": 0, "Attack": 0}


# class for wandb recording
class WandbRecorder:
    def __init__(self, game_name: str, game_type: str, run_name: str, _args) -> None:
        # init wandb
        self.game_name = game_name
        self.game_type = game_type
        self._args = _args
        self.run_name = run_name
        if self._args.wandb_track:
            wandb.init(
                project=self.game_name,
                entity=self._args.wandb_entity,
                sync_tensorboard=True,
                config=vars(self._args),
                name=self.run_name,
                monitor_gym=True,
                save_code=True,
            )
        self.writer = SummaryWriter(f"runs/{self.run_name}")
        self.writer.add_text(
            "hyperparameters",
            "|param|value|\n|-|-|\n%s"
            % ("\n".join([f"|{key}|{value}|" for key, value in vars(self._args).items()])),
        )

    def add_target_scalar(
        self,
        target_name,
        thisT,
        v_loss,
        dis_pg_loss,
        con_pg_loss,
        loss,
        entropy_loss,
        target_reward_mean,
        target_steps,
    ):
        # fmt:off
        self.writer.add_scalar(
            f"Target{target_name}/value_loss", v_loss.item(), target_steps[thisT]
        )
        self.writer.add_scalar(
            f"Target{target_name}/dis_policy_loss", dis_pg_loss.item(), target_steps[thisT]
        )
        self.writer.add_scalar(
            f"Target{target_name}/con_policy_loss", con_pg_loss.item(), target_steps[thisT]
        )
        self.writer.add_scalar(
            f"Target{target_name}/total_loss", loss.item(), target_steps[thisT]
        )
        self.writer.add_scalar(
            f"Target{target_name}/entropy_loss", entropy_loss.item(), target_steps[thisT]
        )
        self.writer.add_scalar(
            f"Target{target_name}/Reward", target_reward_mean, target_steps[thisT]
        )
        self.writer.add_scalar(
            f"Target{target_name}/WinRatio", win_rounds[target_name] / total_rounds[target_name], target_steps[thisT],
        )
        # fmt:on

    def add_global_scalar(
        self,
        total_reward_mean,
        learning_rate,
        total_steps,
    ):
        self.writer.add_scalar("GlobalCharts/TotalRewardMean", total_reward_mean, total_steps)
        self.writer.add_scalar("GlobalCharts/learning_rate", learning_rate, total_steps)

    def add_win_ratio(self, target_name, target_steps):
        self.writer.add_scalar(
            f"Target{target_name}/WinRatio", win_rounds[target_name] / total_rounds[target_name], target_steps,
        )
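A hedged usage sketch for WandbRecorder: the Namespace below carries only the two fields this file actually reads (wandb_track and wandb_entity), and the string arguments are arbitrary examples rather than the project's real configuration.

import argparse

args = argparse.Namespace(wandb_track=False, wandb_entity=None)  # minimal stand-in for the real CLI args
recorder = WandbRecorder(game_name="Aimbot", game_type="OffPolicy", run_name="demo-run", _args=args)
recorder.add_global_scalar(total_reward_mean=1.25, learning_rate=3e-4, total_steps=10_000)
# the per-target win ratio divides win_rounds by total_rounds, so the counters must be non-zero first
total_rounds["Free"] += 1
win_rounds["Free"] += 1
recorder.add_win_ratio("Free", target_steps=10_000)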
204
Aimbot-PPO-Python/Pytorch/ppoagent.py
Normal file
@ -0,0 +1,204 @@
import numpy as np
import torch
import argparse

from torch import nn
from aimbotEnv import Aimbot
from torch.distributions.normal import Normal
from torch.distributions.categorical import Categorical


def layer_init(layer, std=np.sqrt(2), bias_const=0.0):
    nn.init.orthogonal_(layer.weight, std)
    nn.init.constant_(layer.bias, bias_const)
    return layer

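A quick, hedged sanity check of what layer_init produces (illustrative, not part of the file): with std=1.0 the weight of a square linear layer is orthogonal and the bias is the given constant.

# illustrative check of layer_init's effect (assumes the imports above)
lin = layer_init(nn.Linear(4, 4), std=1.0)
w = lin.weight.detach()
print(torch.allclose(w @ w.T, torch.eye(4), atol=1e-5))  # True: orthogonal rows for a square layer
print(torch.all(lin.bias.detach() == 0.0).item())        # True: bias initialised to bias_const (0.0)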
class PPOAgent(nn.Module):
    def __init__(
        self,
        env: Aimbot,
        this_args: argparse.Namespace,
        train_agent: bool,
        target_num: int,
        target_state_size: int,
        time_state_size: int,
        gun_state_size: int,
        my_state_size: int,
        total_t_size: int,
        device: torch.device,
    ):
        super(PPOAgent, self).__init__()
        self.device = device
        self.args = this_args
        self.trainAgent = train_agent
        self.targetNum = target_num
        self.stateSize = env.unity_observation_shape[0]
        self.agentNum = env.unity_agent_num
        self.targetSize = target_state_size
        self.timeSize = time_state_size
        self.gunSize = gun_state_size
        self.myStateSize = my_state_size
        self.raySize = env.unity_observation_shape[0] - total_t_size
        self.nonRaySize = total_t_size
        self.head_input_size = (
            env.unity_observation_shape[0] - self.targetSize - self.timeSize - self.gunSize
        )  # input size excluding the target/time/gun states

        self.unityDiscreteType = env.unity_discrete_type
        self.discrete_size = env.unity_discrete_size
        self.discrete_shape = list(env.unity_discrete_branches)
        self.continuous_size = env.unity_continuous_size

        self.viewNetwork = nn.Sequential(layer_init(nn.Linear(self.raySize, 200)), nn.LeakyReLU())
        self.targetNetworks = nn.ModuleList(
            [
                nn.Sequential(layer_init(nn.Linear(self.nonRaySize, 100)), nn.LeakyReLU())
                for i in range(target_num)
            ]
        )
        self.middleNetworks = nn.ModuleList(
            [
                nn.Sequential(layer_init(nn.Linear(300, 200)), nn.LeakyReLU())
                for i in range(target_num)
            ]
        )
        self.actor_dis = nn.ModuleList(
            [layer_init(nn.Linear(200, self.discrete_size), std=0.5) for i in range(target_num)]
        )
        self.actor_mean = nn.ModuleList(
            [layer_init(nn.Linear(200, self.continuous_size), std=0.5) for i in range(target_num)]
        )
        # self.actor_logstd = nn.ModuleList([layer_init(nn.Linear(200, self.continuous_size), std=1) for i in range(targetNum)])
        # self.actor_logstd = nn.Parameter(torch.zeros(1, self.continuous_size))
        self.actor_logstd = nn.ParameterList(
            [nn.Parameter(torch.zeros(1, self.continuous_size)) for i in range(target_num)]
        )  # nn.Parameter(torch.zeros(1, self.continuous_size))
        self.critic = nn.ModuleList(
            [layer_init(nn.Linear(200, 1), std=1) for i in range(target_num)]
        )

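To make the per-target routing below easier to follow, here is a self-contained shape sketch of the view/target/middle pipeline; ray_size, non_ray_size and the batch size are stand-ins, while the 200/100/300 widths match the layers defined above.

# illustrative shape check with stand-in sizes (not the real observation layout)
import torch
from torch import nn

ray_size, non_ray_size, batch = 180, 12, 4
view_net = nn.Sequential(nn.Linear(ray_size, 200), nn.LeakyReLU())
target_net = nn.Sequential(nn.Linear(non_ray_size, 100), nn.LeakyReLU())
middle_net = nn.Sequential(nn.Linear(300, 200), nn.LeakyReLU())

state = torch.randn(batch, non_ray_size + ray_size)
view_out = view_net(state[:, -ray_size:])                          # (batch, 200)
target_out = target_net(state[:, :non_ray_size])                   # (batch, 100)
middle_out = middle_net(torch.cat([view_out, target_out], dim=1))  # (batch, 200), fed to the actor/critic heads
print(view_out.shape, target_out.shape, middle_out.shape)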
    def get_value(self, state: torch.Tensor):
        # route each state through the sub-networks matching its target type
        target = state[:, 0].to(torch.int32)  # target type index of each state
        thisStateNum = target.size()[0]
        viewInput = state[:, -self.raySize :]  # all ray inputs
        targetInput = state[:, : self.nonRaySize]  # non-ray state inputs
        viewLayer = self.viewNetwork(viewInput)
        targetLayer = torch.stack(
            [self.targetNetworks[target[i]](targetInput[i]) for i in range(thisStateNum)]
        )
        middleInput = torch.cat([viewLayer, targetLayer], dim=1)
        middleLayer = torch.stack(
            [self.middleNetworks[target[i]](middleInput[i]) for i in range(thisStateNum)]
        )
        criticV = torch.stack(
            [self.critic[target[i]](middleLayer[i]) for i in range(thisStateNum)]
        )  # per-target critic head
        return criticV

    def get_actions_value(self, state: torch.Tensor, actions=None):
        target = state[:, 0].to(torch.int32)  # target type index of each state
        thisStateNum = target.size()[0]
        viewInput = state[:, -self.raySize :]  # all ray inputs
        targetInput = state[:, : self.nonRaySize]
        viewLayer = self.viewNetwork(viewInput)
        targetLayer = torch.stack(
            [self.targetNetworks[target[i]](targetInput[i]) for i in range(thisStateNum)]
        )
        middleInput = torch.cat([viewLayer, targetLayer], dim=1)
        middleLayer = torch.stack(
            [self.middleNetworks[target[i]](middleInput[i]) for i in range(thisStateNum)]
        )

        # discrete
        # loop over the targets (i.e. the agent count) so each state is evaluated by the
        # output network corresponding to its target type
        dis_logits = torch.stack(
            [self.actor_dis[target[i]](middleLayer[i]) for i in range(thisStateNum)]
        )
        split_logits = torch.split(dis_logits, self.discrete_shape, dim=1)
        multi_categoricals = [Categorical(logits=thisLogits) for thisLogits in split_logits]
        # continuous
        actions_mean = torch.stack(
            [self.actor_mean[target[i]](middleLayer[i]) for i in range(thisStateNum)]
        )  # self.actor_mean(hidden)
        # action_logstd = torch.stack([self.actor_logstd[target[i]](middleLayer[i]) for i in range(thisStateNum)]) # self.actor_logstd(hidden)
        # action_logstd = self.actor_logstd.expand_as(actions_mean) # self.actor_logstd.expand_as(actions_mean)
        action_logstd = torch.stack(
            [torch.squeeze(self.actor_logstd[target[i]], 0) for i in range(thisStateNum)]
        )
        # print(action_logstd)
        action_std = torch.exp(action_logstd)  # torch.exp(action_logstd)
        con_probs = Normal(actions_mean, action_std)
        # critic
        criticV = torch.stack(
            [self.critic[target[i]](middleLayer[i]) for i in range(thisStateNum)]
        )  # self.critic

        if actions is None:
            if self.trainAgent:
                # select actions based on the probability distribution model
                disAct = torch.stack([ctgr.sample() for ctgr in multi_categoricals])
                conAct = con_probs.sample()
                actions = torch.cat([disAct.T, conAct], dim=1)
            else:
                # select actions based on the best probability distribution
                # disAct = torch.stack([torch.argmax(logit, dim=1) for logit in split_logits])
                conAct = actions_mean
                disAct = torch.stack([ctgr.sample() for ctgr in multi_categoricals])
                conAct = con_probs.sample()
                actions = torch.cat([disAct.T, conAct], dim=1)
        else:
            disAct = actions[:, 0 : self.unityDiscreteType].T
            conAct = actions[:, self.unityDiscreteType :]
        dis_log_prob = torch.stack(
            [ctgr.log_prob(act) for act, ctgr in zip(disAct, multi_categoricals)]
        )
        dis_entropy = torch.stack([ctgr.entropy() for ctgr in multi_categoricals])
        return (
            actions,
            dis_log_prob.sum(0),
            dis_entropy.sum(0),
            con_probs.log_prob(conAct).sum(1),
            con_probs.entropy().sum(1),
            criticV,
        )

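A hedged sketch of the two ways get_actions_value is typically called in PPO; agent and obs_batch are placeholders, and the unpacking order follows the return tuple above.

# rollout: sample fresh actions for a batch of observations (placeholders: agent, obs_batch)
with torch.no_grad():
    act, dis_logprob, _, con_logprob, _, value = agent.get_actions_value(obs_batch)

# update: re-evaluate stored actions to get new log-probs, entropies and values
_, new_dis_lp, dis_ent, new_con_lp, con_ent, new_value = agent.get_actions_value(obs_batch, actions=act)
dis_ratio = torch.exp(new_dis_lp - dis_logprob)  # discrete-branch importance ratio for the clipped objective
con_ratio = torch.exp(new_con_lp - con_logprob)  # continuous-branch importance ratio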
    def gae(
        self,
        rewards: torch.Tensor,
        dones: torch.Tensor,
        values: torch.Tensor,
        next_obs: torch.Tensor,
        next_done: torch.Tensor,
    ) -> tuple:
        # GAE
        with torch.no_grad():
            next_value = self.get_value(next_obs).reshape(1, -1)
            data_size = rewards.size()[0]
            if self.args.gae:
                advantages = torch.zeros_like(rewards).to(self.device)
                last_gae_lam = 0
                for t in reversed(range(data_size)):
                    if t == data_size - 1:
                        nextnonterminal = 1.0 - next_done
                        next_values = next_value
                    else:
                        nextnonterminal = 1.0 - dones[t + 1]
                        next_values = values[t + 1]
                    delta = rewards[t] + self.args.gamma * next_values * nextnonterminal - values[t]
                    advantages[t] = last_gae_lam = (
                        delta + self.args.gamma * self.args.gaeLambda * nextnonterminal * last_gae_lam
                    )
                returns = advantages + values
            else:
                returns = torch.zeros_like(rewards).to(self.device)
                for t in reversed(range(data_size)):
                    if t == data_size - 1:
                        nextnonterminal = 1.0 - next_done
                        next_return = next_value
                    else:
                        nextnonterminal = 1.0 - dones[t + 1]
                        next_return = returns[t + 1]
                    returns[t] = rewards[t] + self.args.gamma * nextnonterminal * next_return
                advantages = returns - values
        return advantages, returns
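For reference, the same advantage recursion written as a standalone sketch over 1-D tensors (equivalent to the args.gae branch above; gamma and lam stand for args.gamma and args.gaeLambda).

import torch

def gae_1d(rewards, dones, values, next_value, next_done, gamma=0.99, lam=0.95):
    # delta_t = r_t + gamma * V(s_{t+1}) * (1 - done_{t+1}) - V(s_t)
    # A_t     = delta_t + gamma * lam * (1 - done_{t+1}) * A_{t+1}
    steps = rewards.size(0)
    advantages = torch.zeros_like(rewards)
    last_gae = 0.0
    for t in reversed(range(steps)):
        nonterminal = 1.0 - (next_done if t == steps - 1 else dones[t + 1])
        bootstrap = next_value if t == steps - 1 else values[t + 1]
        delta = rewards[t] + gamma * bootstrap * nonterminal - values[t]
        last_gae = delta + gamma * lam * nonterminal * last_gae
        advantages[t] = last_gae
    return advantages, advantages + values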