Compare commits
1 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| 10a1663230 |
@@ -11,7 +11,6 @@ from mlagents_envs.side_channel.side_channel import (
|
|||||||
IncomingMessage,
|
IncomingMessage,
|
||||||
OutgoingMessage,
|
OutgoingMessage,
|
||||||
)
|
)
|
||||||
from arguments import set_save_model
|
|
||||||
|
|
||||||
|
|
||||||
class Aimbot(gym.Env):
|
class Aimbot(gym.Env):
|
||||||
@@ -177,21 +176,18 @@ class AimbotSideChannel(SideChannel):
|
|||||||
"Warning|Message1|Message2|Message3" or
|
"Warning|Message1|Message2|Message3" or
|
||||||
"Error|Message1|Message2|Message3"
|
"Error|Message1|Message2|Message3"
|
||||||
"""
|
"""
|
||||||
this_message_Original = msg.read_string()
|
this_message = msg.read_string()
|
||||||
this_message = this_message_Original.split("|")
|
this_result = this_message.split("|")
|
||||||
print(this_message)
|
print(this_result)
|
||||||
if this_message[0] == "Warning":
|
if this_result[0] == "Warning":
|
||||||
if this_message[1] == "Result":
|
if this_result[1] == "Result":
|
||||||
airecorder.total_rounds[this_message[2]] += 1
|
airecorder.total_rounds[this_result[2]] += 1
|
||||||
if this_message[3] == "Win":
|
if this_result[3] == "Win":
|
||||||
airecorder.win_rounds[this_message[2]] += 1
|
airecorder.win_rounds[this_result[2]] += 1
|
||||||
# print(TotalRounds)
|
# print(TotalRounds)
|
||||||
# print(WinRounds)
|
# print(WinRounds)
|
||||||
if this_message[1] == "Command":
|
elif this_result[0] == "Error":
|
||||||
set_save_model(True)
|
print(this_message)
|
||||||
print("Command: " + this_message_Original)
|
|
||||||
elif this_message[0] == "Error":
|
|
||||||
print(this_message_Original)
|
|
||||||
# # while Message type is Warning
|
# # while Message type is Warning
|
||||||
# if(thisResult[0] == "Warning"):
|
# if(thisResult[0] == "Warning"):
|
||||||
# # while Message1 is result means one game is over
|
# # while Message1 is result means one game is over
|
||||||
|
|||||||
@@ -81,43 +81,130 @@
|
|||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 1,
|
"execution_count": 1,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"import argparse\n",
|
||||||
|
"import wandb\n",
|
||||||
|
"import time\n",
|
||||||
|
"import numpy as np\n",
|
||||||
|
"import random\n",
|
||||||
|
"import uuid\n",
|
||||||
|
"import torch\n",
|
||||||
|
"import torch.nn as nn\n",
|
||||||
|
"import torch.optim as optim\n",
|
||||||
|
"\n",
|
||||||
|
"from AimbotEnv import Aimbot\n",
|
||||||
|
"from tqdm import tqdm\n",
|
||||||
|
"from torch.distributions.normal import Normal\n",
|
||||||
|
"from torch.distributions.categorical import Categorical\n",
|
||||||
|
"from distutils.util import strtobool\n",
|
||||||
|
"from torch.utils.tensorboard import SummaryWriter\n",
|
||||||
|
"from mlagents_envs.environment import UnityEnvironment\n",
|
||||||
|
"from mlagents_envs.side_channel.side_channel import (\n",
|
||||||
|
" SideChannel,\n",
|
||||||
|
" IncomingMessage,\n",
|
||||||
|
" OutgoingMessage,\n",
|
||||||
|
")\n",
|
||||||
|
"from typing import List\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 5,
|
||||||
|
"metadata": {},
|
||||||
"outputs": [
|
"outputs": [
|
||||||
{
|
{
|
||||||
"name": "stdout",
|
"ename": "AttributeError",
|
||||||
"output_type": "stream",
|
"evalue": "'aaa' object has no attribute 'outa'",
|
||||||
"text": [
|
"output_type": "error",
|
||||||
"0\n",
|
"traceback": [
|
||||||
"i = 0\n",
|
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
|
||||||
"i = 1\n",
|
"\u001b[1;31mAttributeError\u001b[0m Traceback (most recent call last)",
|
||||||
"i = 2\n",
|
"Cell \u001b[1;32mIn[5], line 14\u001b[0m\n\u001b[0;32m 12\u001b[0m asd \u001b[39m=\u001b[39m aaa(outa, outb)\n\u001b[0;32m 13\u001b[0m asd\u001b[39m.\u001b[39mfunc()\n\u001b[1;32m---> 14\u001b[0m \u001b[39mprint\u001b[39m(asd\u001b[39m.\u001b[39;49mouta) \u001b[39m# 输出 100\u001b[39;00m\n",
|
||||||
"i = 3\n",
|
"\u001b[1;31mAttributeError\u001b[0m: 'aaa' object has no attribute 'outa'"
|
||||||
"i = 4\n",
|
|
||||||
"i = 5\n",
|
|
||||||
"i = 6\n",
|
|
||||||
"i = 7\n",
|
|
||||||
"i = 8\n",
|
|
||||||
"i = 9\n",
|
|
||||||
"10\n"
|
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"source": [
|
"source": [
|
||||||
"import threading\n",
|
"class aaa():\n",
|
||||||
|
" def __init__(self, a, b):\n",
|
||||||
|
" self.a = a\n",
|
||||||
|
" self.b = b\n",
|
||||||
"\n",
|
"\n",
|
||||||
"num = 0\n",
|
" def func(self):\n",
|
||||||
|
" global outa\n",
|
||||||
|
" outa = 100\n",
|
||||||
"\n",
|
"\n",
|
||||||
"def print_numers():\n",
|
"outa = 1\n",
|
||||||
" global num\n",
|
"outb = 2\n",
|
||||||
" for i in range(10):\n",
|
"asd = aaa(outa, outb)\n",
|
||||||
" num +=1\n",
|
"asd.func()\n",
|
||||||
" print(\"i = \",i)\n",
|
"print(asd.outa) # 输出 100"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 2,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stderr",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"usage: ipykernel_launcher.py [-h] [--seed SEED]\n",
|
||||||
|
"ipykernel_launcher.py: error: unrecognized arguments: --ip=127.0.0.1 --stdin=9003 --control=9001 --hb=9000 --Session.signature_scheme=\"hmac-sha256\" --Session.key=b\"46ef9317-59fb-4ab6-ae4e-6b35744fc423\" --shell=9002 --transport=\"tcp\" --iopub=9004 --f=c:\\Users\\UCUNI\\AppData\\Roaming\\jupyter\\runtime\\kernel-v2-311926K1uko38tdWb.json\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"ename": "SystemExit",
|
||||||
|
"evalue": "2",
|
||||||
|
"output_type": "error",
|
||||||
|
"traceback": [
|
||||||
|
"An exception has occurred, use %tb to see the full traceback.\n",
|
||||||
|
"\u001b[1;31mSystemExit\u001b[0m\u001b[1;31m:\u001b[0m 2\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"import argparse\n",
|
||||||
"\n",
|
"\n",
|
||||||
"thread = threading.Thread(target=print_numers)\n",
|
"def parse_args():\n",
|
||||||
|
" parser = argparse.ArgumentParser()\n",
|
||||||
|
" parser.add_argument(\"--seed\", type=int, default=11,\n",
|
||||||
|
" help=\"seed of the experiment\")\n",
|
||||||
|
" args = parser.parse_args()\n",
|
||||||
|
" return args\n",
|
||||||
"\n",
|
"\n",
|
||||||
"print(num)\n",
|
"arggg = parse_args()\n",
|
||||||
"thread.start()\n",
|
"print(type(arggg))"
|
||||||
"thread.join()\n",
|
]
|
||||||
"print(num)"
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 6,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"array([[0., 0., 0., 0.],\n",
|
||||||
|
" [0., 0., 0., 0.],\n",
|
||||||
|
" [0., 0., 0., 0.],\n",
|
||||||
|
" [0., 0., 0., 0.],\n",
|
||||||
|
" [0., 0., 0., 0.],\n",
|
||||||
|
" [0., 0., 0., 0.],\n",
|
||||||
|
" [0., 0., 0., 0.],\n",
|
||||||
|
" [0., 0., 0., 0.]])"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 6,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"import numpy as np\n",
|
||||||
|
"np.zeros((8, 4))"
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
|
|||||||
@@ -4,7 +4,6 @@ import random
|
|||||||
import uuid
|
import uuid
|
||||||
import torch
|
import torch
|
||||||
import atexit
|
import atexit
|
||||||
import os
|
|
||||||
|
|
||||||
from aimbotEnv import Aimbot
|
from aimbotEnv import Aimbot
|
||||||
from aimbotEnv import AimbotSideChannel
|
from aimbotEnv import AimbotSideChannel
|
||||||
@@ -13,14 +12,13 @@ from airecorder import WandbRecorder
|
|||||||
from aimemory import PPOMem
|
from aimemory import PPOMem
|
||||||
from aimemory import Targets
|
from aimemory import Targets
|
||||||
from arguments import parse_args
|
from arguments import parse_args
|
||||||
from arguments import set_save_model, is_save_model
|
|
||||||
import torch.optim as optim
|
import torch.optim as optim
|
||||||
|
|
||||||
# side channel uuid
|
# side channel uuid
|
||||||
SIDE_CHANNEL_UUID = uuid.UUID("8bbfb62a-99b4-457c-879d-b78b69066b5e")
|
SIDE_CHANNEL_UUID = uuid.UUID("8bbfb62a-99b4-457c-879d-b78b69066b5e")
|
||||||
# tensorboard names
|
# tensorboard names
|
||||||
GAME_NAME = "Aimbot_Hybrid_Full_MNN_MultiLevel_V2"
|
GAME_NAME = "Aimbot_Hybrid_V3"
|
||||||
GAME_TYPE = "GotoOnly-Level0123-new512Model"
|
GAME_TYPE = "Mix_Verification"
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
args = parse_args()
|
args = parse_args()
|
||||||
@@ -49,8 +47,9 @@ if __name__ == "__main__":
|
|||||||
# freeze
|
# freeze
|
||||||
if args.freeze_viewnet:
|
if args.freeze_viewnet:
|
||||||
# freeze the view network
|
# freeze the view network
|
||||||
print("FREEZE VIEW NETWORK is not compatible with Full MNN!")
|
for p in agent.viewNetwork.parameters():
|
||||||
raise NotImplementedError
|
p.requires_grad = False
|
||||||
|
print("VIEW NETWORK FREEZE")
|
||||||
print("Load Agent", args.load_dir)
|
print("Load Agent", args.load_dir)
|
||||||
print(agent.eval())
|
print(agent.eval())
|
||||||
# optimizer
|
# optimizer
|
||||||
@@ -59,6 +58,16 @@ if __name__ == "__main__":
|
|||||||
run_name = f"{GAME_TYPE}_{args.seed}_{int(time.time())}"
|
run_name = f"{GAME_TYPE}_{args.seed}_{int(time.time())}"
|
||||||
wdb_recorder = WandbRecorder(GAME_NAME, GAME_TYPE, run_name, args)
|
wdb_recorder = WandbRecorder(GAME_NAME, GAME_TYPE, run_name, args)
|
||||||
|
|
||||||
|
@atexit.register
|
||||||
|
def save_model():
|
||||||
|
# close env
|
||||||
|
env.close()
|
||||||
|
if args.save_model:
|
||||||
|
# save model while exit
|
||||||
|
save_dir = "../PPO-Model/" + run_name + "_last.pt"
|
||||||
|
torch.save(agent, save_dir)
|
||||||
|
print("save model to " + save_dir)
|
||||||
|
|
||||||
# start the game
|
# start the game
|
||||||
total_update_step = args.target_num * args.total_timesteps // args.datasetSize
|
total_update_step = args.target_num * args.total_timesteps // args.datasetSize
|
||||||
target_steps = [0 for i in range(args.target_num)]
|
target_steps = [0 for i in range(args.target_num)]
|
||||||
@@ -214,16 +223,11 @@ if __name__ == "__main__":
|
|||||||
)
|
)
|
||||||
# print cost time as seconds
|
# print cost time as seconds
|
||||||
print("cost time:", time.time() - start_time)
|
print("cost time:", time.time() - start_time)
|
||||||
# New Record! or save model
|
# New Record!
|
||||||
if ((is_save_model() or TotalRewardMean > best_reward) and args.save_model):
|
if TotalRewardMean > best_reward and args.save_model:
|
||||||
# check saveDir is exist
|
best_reward = target_reward_mean
|
||||||
saveDir = "../PPO-Model/" + run_name + "/"
|
saveDir = "../PPO-Model/" + run_name + "_" + str(TotalRewardMean) + ".pt"
|
||||||
if not os.path.isdir(saveDir):
|
torch.save(agent, saveDir)
|
||||||
os.mkdir(saveDir)
|
|
||||||
best_reward = TotalRewardMean
|
|
||||||
torch.save(agent, saveDir + str(TotalRewardMean) + ".pt")
|
|
||||||
print("Model Saved!")
|
|
||||||
set_save_model(False)
|
|
||||||
else:
|
else:
|
||||||
# train mode off
|
# train mode off
|
||||||
mean_reward_list = [] # for WANDB
|
mean_reward_list = [] # for WANDB
|
||||||
@@ -246,10 +250,7 @@ if __name__ == "__main__":
|
|||||||
TotalRewardMean = np.mean(mean_reward_list)
|
TotalRewardMean = np.mean(mean_reward_list)
|
||||||
wdb_recorder.writer.add_scalar("GlobalCharts/TotalRewardMean", TotalRewardMean, total_steps)
|
wdb_recorder.writer.add_scalar("GlobalCharts/TotalRewardMean", TotalRewardMean, total_steps)
|
||||||
|
|
||||||
saveDir = "../PPO-Model/" + run_name + "/"
|
saveDir = "../PPO-Model/" + run_name + "_last.pt"
|
||||||
if not os.path.isdir(saveDir):
|
torch.save(agent, saveDir)
|
||||||
os.mkdir(saveDir)
|
|
||||||
best_reward = target_reward_mean
|
|
||||||
torch.save(agent, saveDir + "_last.pt")
|
|
||||||
env.close()
|
env.close()
|
||||||
wdb_recorder.writer.close()
|
wdb_recorder.writer.close()
|
||||||
|
|||||||
@@ -4,38 +4,41 @@ import uuid
|
|||||||
from distutils.util import strtobool
|
from distutils.util import strtobool
|
||||||
|
|
||||||
DEFAULT_SEED = 9331
|
DEFAULT_SEED = 9331
|
||||||
ENV_PATH = "../Build/3.5/Aimbot-ParallelEnv"
|
ENV_PATH = "../Build/3.1.6/Aimbot-ParallelEnv"
|
||||||
WAND_ENTITY = "koha9"
|
WAND_ENTITY = "koha9"
|
||||||
WORKER_ID = 1
|
WORKER_ID = 1
|
||||||
BASE_PORT = 1000
|
BASE_PORT = 1000
|
||||||
|
|
||||||
# tensorboard names
|
# tensorboard names
|
||||||
|
GAME_NAME = "Aimbot_Target_Hybrid_PMNN_V3"
|
||||||
|
GAME_TYPE = "Mix_Verification"
|
||||||
|
|
||||||
# max round steps per agent is 2500/Decision_period, 25 seconds
|
# max round steps per agent is 2500/Decision_period, 25 seconds
|
||||||
TOTAL_STEPS = 3150000
|
TOTAL_STEPS = 3150000
|
||||||
BATCH_SIZE = 512
|
BATCH_SIZE = 512
|
||||||
MAX_TRAINNING_DATASETS = 6000
|
MAX_TRAINNING_DATASETS = 6000
|
||||||
DECISION_PERIOD = 1
|
DECISION_PERIOD = 1
|
||||||
LEARNING_RATE = 1.5e-4
|
LEARNING_RATE = 6.5e-4
|
||||||
GAMMA = 0.99
|
GAMMA = 0.99
|
||||||
GAE_LAMBDA = 0.95
|
GAE_LAMBDA = 0.95
|
||||||
EPOCHS = 3
|
EPOCHS = 3
|
||||||
CLIP_COEF = 0.11
|
CLIP_COEF = 0.11
|
||||||
LOSS_COEF = [1.0, 1.0, 1.0, 1.0] # free go attack defence
|
LOSS_COEF = [1.0, 1.0, 1.0, 1.0] # free go attack defence
|
||||||
POLICY_COEF = [0.8, 0.8, 0.8, 0.8]
|
POLICY_COEF = [1.0, 1.0, 1.0, 1.0]
|
||||||
ENTROPY_COEF = [0.05, 0.05, 0.05, 0.05]
|
ENTROPY_COEF = [0.05, 0.05, 0.05, 0.05]
|
||||||
CRITIC_COEF = [0.8, 0.8, 0.8, 0.8]
|
CRITIC_COEF = [0.5, 0.5, 0.5, 0.5]
|
||||||
TARGET_LEARNING_RATE = 1e-6
|
TARGET_LEARNING_RATE = 1e-6
|
||||||
|
|
||||||
FREEZE_VIEW_NETWORK = False
|
FREEZE_VIEW_NETWORK = False
|
||||||
|
BROADCASTREWARD = False
|
||||||
ANNEAL_LEARNING_RATE = True
|
ANNEAL_LEARNING_RATE = True
|
||||||
CLIP_VLOSS = True
|
CLIP_VLOSS = True
|
||||||
NORM_ADV = False
|
NORM_ADV = False
|
||||||
TRAIN = True
|
TRAIN = True
|
||||||
SAVE_MODEL = True
|
SAVE_MODEL = False
|
||||||
WANDB_TACK = True
|
WANDB_TACK = False
|
||||||
LOAD_DIR = None
|
LOAD_DIR = None
|
||||||
# LOAD_DIR = "../PPO-Model/GotoOnly-Level0123_9331_1696965321/5.1035867.pt"
|
#LOAD_DIR = "../PPO-Model/PList_Go_LeakyReLU_9331_1677965178_bestGoto/PList_Go_LeakyReLU_9331_1677965178_10.709002.pt"
|
||||||
|
|
||||||
# Unity Environment Parameters
|
# Unity Environment Parameters
|
||||||
TARGET_STATE_SIZE = 6
|
TARGET_STATE_SIZE = 6
|
||||||
@@ -50,16 +53,6 @@ TARGETNUM= 4
|
|||||||
ENV_TIMELIMIT = 30
|
ENV_TIMELIMIT = 30
|
||||||
RESULT_BROADCAST_RATIO = 1/ENV_TIMELIMIT
|
RESULT_BROADCAST_RATIO = 1/ENV_TIMELIMIT
|
||||||
|
|
||||||
save_model_this_episode = False
|
|
||||||
|
|
||||||
def is_save_model():
|
|
||||||
global save_model_this_episode
|
|
||||||
return save_model_this_episode
|
|
||||||
def set_save_model(save_model:bool):
|
|
||||||
print("set save model to ",save_model)
|
|
||||||
global save_model_this_episode
|
|
||||||
save_model_this_episode = save_model
|
|
||||||
|
|
||||||
def parse_args():
|
def parse_args():
|
||||||
# fmt: off
|
# fmt: off
|
||||||
# pytorch and environment parameters
|
# pytorch and environment parameters
|
||||||
@@ -104,10 +97,12 @@ def parse_args():
|
|||||||
help="the number of steps to run in each environment per policy rollout")
|
help="the number of steps to run in each environment per policy rollout")
|
||||||
parser.add_argument("--result-broadcast-ratio", type=float, default=RESULT_BROADCAST_RATIO,
|
parser.add_argument("--result-broadcast-ratio", type=float, default=RESULT_BROADCAST_RATIO,
|
||||||
help="broadcast result when win round is reached,r=result-broadcast-ratio*remainTime")
|
help="broadcast result when win round is reached,r=result-broadcast-ratio*remainTime")
|
||||||
|
parser.add_argument("--broadCastEndReward", type=lambda x: bool(strtobool(x)), default=BROADCASTREWARD, nargs="?", const=True,
|
||||||
|
help="save model or not")
|
||||||
# target_learning_rate
|
# target_learning_rate
|
||||||
parser.add_argument("--target-lr", type=float, default=TARGET_LEARNING_RATE,
|
parser.add_argument("--target-lr", type=float, default=TARGET_LEARNING_RATE,
|
||||||
help="target value of downscaling the learning rate")
|
help="target value of downscaling the learning rate")
|
||||||
|
|
||||||
# POLICY_COEF ENTROPY_COEF CRITIC_COEF LOSS_COEF
|
# POLICY_COEF ENTROPY_COEF CRITIC_COEF LOSS_COEF
|
||||||
parser.add_argument("--policy-coef", type=float, default=POLICY_COEF,
|
parser.add_argument("--policy-coef", type=float, default=POLICY_COEF,
|
||||||
help="coefficient of the policy loss")
|
help="coefficient of the policy loss")
|
||||||
|
|||||||
@@ -1,255 +0,0 @@
|
|||||||
import time
|
|
||||||
import numpy as np
|
|
||||||
import random
|
|
||||||
import uuid
|
|
||||||
import torch
|
|
||||||
import atexit
|
|
||||||
import os
|
|
||||||
|
|
||||||
from aimbotEnv import Aimbot
|
|
||||||
from aimbotEnv import AimbotSideChannel
|
|
||||||
from ppoagent import PPOAgent
|
|
||||||
from airecorder import WandbRecorder
|
|
||||||
from aimemory import PPOMem
|
|
||||||
from aimemory import Targets
|
|
||||||
from arguments import parse_args
|
|
||||||
from arguments import set_save_model, is_save_model
|
|
||||||
import torch.optim as optim
|
|
||||||
|
|
||||||
# side channel uuid
|
|
||||||
SIDE_CHANNEL_UUID = uuid.UUID("8bbfb62a-99b4-457c-879d-b78b69066b5e")
|
|
||||||
# tensorboard names
|
|
||||||
GAME_NAME = "Aimbot_Hybrid_Full_MNN_MultiLevel_V2"
|
|
||||||
GAME_TYPE = "GotoOnly-Level0123-new512Model"
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
|
||||||
args = parse_args()
|
|
||||||
random.seed(args.seed)
|
|
||||||
np.random.seed(args.seed)
|
|
||||||
torch.manual_seed(args.seed)
|
|
||||||
|
|
||||||
device = torch.device("cuda" if torch.cuda.is_available() and args.cuda else "cpu")
|
|
||||||
best_reward = -1
|
|
||||||
|
|
||||||
# Initialize environment agent optimizer
|
|
||||||
aimbot_side_channel = AimbotSideChannel(SIDE_CHANNEL_UUID)
|
|
||||||
env = Aimbot(
|
|
||||||
env_path=args.path,
|
|
||||||
worker_id=args.workerID,
|
|
||||||
base_port=args.baseport,
|
|
||||||
side_channels=[aimbot_side_channel])
|
|
||||||
if args.load_dir is None:
|
|
||||||
agent = PPOAgent(
|
|
||||||
env=env,
|
|
||||||
this_args=args,
|
|
||||||
device=device,
|
|
||||||
).to(device)
|
|
||||||
else:
|
|
||||||
agent = torch.load(args.load_dir)
|
|
||||||
# freeze
|
|
||||||
if args.freeze_viewnet:
|
|
||||||
# freeze the view network
|
|
||||||
print("FREEZE VIEW NETWORK is not compatible with Full MNN!")
|
|
||||||
raise NotImplementedError
|
|
||||||
print("Load Agent", args.load_dir)
|
|
||||||
print(agent.eval())
|
|
||||||
# optimizer
|
|
||||||
optimizer = optim.Adam(agent.parameters(), lr=args.lr, eps=1e-5)
|
|
||||||
# Tensorboard and WandB Recorder
|
|
||||||
run_name = f"{GAME_TYPE}_{args.seed}_{int(time.time())}"
|
|
||||||
wdb_recorder = WandbRecorder(GAME_NAME, GAME_TYPE, run_name, args)
|
|
||||||
|
|
||||||
# start the game
|
|
||||||
total_update_step = args.target_num * args.total_timesteps // args.datasetSize
|
|
||||||
target_steps = [0 for i in range(args.target_num)]
|
|
||||||
start_time = time.time()
|
|
||||||
state, _, done = env.reset()
|
|
||||||
|
|
||||||
# initialize AI memories
|
|
||||||
ppo_memories = PPOMem(
|
|
||||||
args=args,
|
|
||||||
unity_agent_num=env.unity_agent_num,
|
|
||||||
device=device,
|
|
||||||
)
|
|
||||||
|
|
||||||
# MAIN LOOP: run agent in environment
|
|
||||||
for total_steps in range(total_update_step):
|
|
||||||
# discount learning rate, while step == total_update_step lr will be 0
|
|
||||||
if args.annealLR:
|
|
||||||
final_lr_ratio = args.target_lr / args.lr
|
|
||||||
frac = 1.0 - ((total_steps + 1.0) / total_update_step)
|
|
||||||
lr_now = frac * args.lr
|
|
||||||
optimizer.param_groups[0]["lr"] = lr_now
|
|
||||||
else:
|
|
||||||
lr_now = args.lr
|
|
||||||
|
|
||||||
# episode start show learning rate
|
|
||||||
print("new episode", total_steps, "learning rate = ", lr_now)
|
|
||||||
step = 0
|
|
||||||
training = False
|
|
||||||
train_queue = []
|
|
||||||
last_reward = [0. for i in range(env.unity_agent_num)]
|
|
||||||
# MAIN LOOP: run agent in environment
|
|
||||||
while True:
|
|
||||||
# Target Type(state[0][0]) is stay(4),use all zero action
|
|
||||||
if state[0][0] == 4:
|
|
||||||
next_state, reward, next_done = env.step(env.all_zero_action)
|
|
||||||
state, done = next_state, next_done
|
|
||||||
continue
|
|
||||||
# On decision point, and Target Type(state[0][0]) is not stay(4) choose action by agent
|
|
||||||
if step % args.decision_period == 0:
|
|
||||||
step += 1
|
|
||||||
# Choose action by agent
|
|
||||||
with torch.no_grad():
|
|
||||||
# predict actions
|
|
||||||
action, dis_logprob, _, con_logprob, _, value = agent.get_actions_value(
|
|
||||||
torch.tensor(state,dtype=torch.float32).to(device)
|
|
||||||
)
|
|
||||||
value = value.flatten()
|
|
||||||
|
|
||||||
# variable from GPU to CPU
|
|
||||||
action_cpu = action.cpu().numpy()
|
|
||||||
dis_logprob_cpu = dis_logprob.cpu().numpy()
|
|
||||||
con_logprob_cpu = con_logprob.cpu().numpy()
|
|
||||||
value_cpu = value.cpu().numpy()
|
|
||||||
# Environment step
|
|
||||||
next_state, reward, next_done = env.step(action_cpu)
|
|
||||||
|
|
||||||
# save memories
|
|
||||||
if args.train:
|
|
||||||
ppo_memories.save_memories(
|
|
||||||
now_step=step,
|
|
||||||
agent=agent,
|
|
||||||
state=state,
|
|
||||||
action_cpu=action_cpu,
|
|
||||||
dis_logprob_cpu=dis_logprob_cpu,
|
|
||||||
con_logprob_cpu=con_logprob_cpu,
|
|
||||||
reward=reward,
|
|
||||||
done=done,
|
|
||||||
value_cpu=value_cpu,
|
|
||||||
last_reward=last_reward,
|
|
||||||
next_done=next_done,
|
|
||||||
next_state=next_state,
|
|
||||||
)
|
|
||||||
# check if any training dataset is full and ready to train
|
|
||||||
for i in range(args.target_num):
|
|
||||||
if ppo_memories.obs[i].size()[0] >= args.datasetSize:
|
|
||||||
# start train NN
|
|
||||||
train_queue.append(i)
|
|
||||||
if len(train_queue) > 0:
|
|
||||||
# break while loop and start train
|
|
||||||
break
|
|
||||||
# update state
|
|
||||||
state, done = next_state, next_done
|
|
||||||
else:
|
|
||||||
step += 1
|
|
||||||
# skip this step use last predict action
|
|
||||||
next_state, reward, next_done = env.step(action_cpu)
|
|
||||||
# save memories
|
|
||||||
if args.train:
|
|
||||||
ppo_memories.save_memories(
|
|
||||||
now_step=step,
|
|
||||||
agent=agent,
|
|
||||||
state=state,
|
|
||||||
action_cpu=action_cpu,
|
|
||||||
dis_logprob_cpu=dis_logprob_cpu,
|
|
||||||
con_logprob_cpu=con_logprob_cpu,
|
|
||||||
reward=reward,
|
|
||||||
done=done,
|
|
||||||
value_cpu=value_cpu,
|
|
||||||
last_reward=last_reward,
|
|
||||||
next_done=next_done,
|
|
||||||
next_state=next_state,
|
|
||||||
)
|
|
||||||
# update state
|
|
||||||
state = next_state
|
|
||||||
last_reward = reward
|
|
||||||
|
|
||||||
if args.train:
|
|
||||||
# train mode on
|
|
||||||
mean_reward_list = [] # for WANDB
|
|
||||||
# loop all training queue
|
|
||||||
for this_train_ind in train_queue:
|
|
||||||
# start time
|
|
||||||
start_time = time.time()
|
|
||||||
target_steps[this_train_ind] += 1
|
|
||||||
# train agent
|
|
||||||
(
|
|
||||||
v_loss,
|
|
||||||
dis_pg_loss,
|
|
||||||
con_pg_loss,
|
|
||||||
loss,
|
|
||||||
entropy_loss
|
|
||||||
) = agent.train_net(
|
|
||||||
this_train_ind=this_train_ind,
|
|
||||||
ppo_memories=ppo_memories,
|
|
||||||
optimizer=optimizer
|
|
||||||
)
|
|
||||||
# record mean reward before clear history
|
|
||||||
print("done")
|
|
||||||
target_reward_mean = np.mean(ppo_memories.rewards[this_train_ind].to("cpu").detach().numpy().copy())
|
|
||||||
mean_reward_list.append(target_reward_mean)
|
|
||||||
targetName = Targets(this_train_ind).name
|
|
||||||
|
|
||||||
# clear this target training set buffer
|
|
||||||
ppo_memories.clear_training_datasets(this_train_ind)
|
|
||||||
# record rewards for plotting purposes
|
|
||||||
wdb_recorder.add_target_scalar(
|
|
||||||
targetName,
|
|
||||||
this_train_ind,
|
|
||||||
v_loss,
|
|
||||||
dis_pg_loss,
|
|
||||||
con_pg_loss,
|
|
||||||
loss,
|
|
||||||
entropy_loss,
|
|
||||||
target_reward_mean,
|
|
||||||
target_steps,
|
|
||||||
)
|
|
||||||
print(f"episode over Target{targetName} mean reward:", target_reward_mean)
|
|
||||||
TotalRewardMean = np.mean(mean_reward_list)
|
|
||||||
wdb_recorder.add_global_scalar(
|
|
||||||
TotalRewardMean,
|
|
||||||
optimizer.param_groups[0]["lr"],
|
|
||||||
total_steps,
|
|
||||||
)
|
|
||||||
# print cost time as seconds
|
|
||||||
print("cost time:", time.time() - start_time)
|
|
||||||
# New Record! or save model
|
|
||||||
if ((is_save_model() or TotalRewardMean > best_reward) and args.save_model):
|
|
||||||
# check saveDir is exist
|
|
||||||
saveDir = "../PPO-Model/" + run_name + "/"
|
|
||||||
if not os.path.isdir(saveDir):
|
|
||||||
os.mkdir(saveDir)
|
|
||||||
best_reward = TotalRewardMean
|
|
||||||
torch.save(agent, saveDir + str(TotalRewardMean) + ".pt")
|
|
||||||
print("Model Saved!")
|
|
||||||
set_save_model(False)
|
|
||||||
else:
|
|
||||||
# train mode off
|
|
||||||
mean_reward_list = [] # for WANDB
|
|
||||||
# while not in training mode, clear the buffer
|
|
||||||
for this_train_ind in train_queue:
|
|
||||||
target_steps[this_train_ind] += 1
|
|
||||||
targetName = Targets(this_train_ind).name
|
|
||||||
target_reward_mean = np.mean(ppo_memories.rewards[this_train_ind].to("cpu").detach().numpy().copy())
|
|
||||||
mean_reward_list.append(target_reward_mean)
|
|
||||||
print(target_steps[this_train_ind])
|
|
||||||
|
|
||||||
# clear this target training set buffer
|
|
||||||
ppo_memories.clear_training_datasets(this_train_ind)
|
|
||||||
|
|
||||||
# record rewards for plotting purposes
|
|
||||||
wdb_recorder.writer.add_scalar(f"Target{targetName}/Reward", target_reward_mean,
|
|
||||||
target_steps[this_train_ind])
|
|
||||||
wdb_recorder.add_win_ratio(targetName, target_steps[this_train_ind])
|
|
||||||
print(f"episode over Target{targetName} mean reward:", target_reward_mean)
|
|
||||||
TotalRewardMean = np.mean(mean_reward_list)
|
|
||||||
wdb_recorder.writer.add_scalar("GlobalCharts/TotalRewardMean", TotalRewardMean, total_steps)
|
|
||||||
|
|
||||||
saveDir = "../PPO-Model/" + run_name + "/"
|
|
||||||
if not os.path.isdir(saveDir):
|
|
||||||
os.mkdir(saveDir)
|
|
||||||
best_reward = target_reward_mean
|
|
||||||
torch.save(agent, saveDir + "_last.pt")
|
|
||||||
env.close()
|
|
||||||
wdb_recorder.writer.close()
|
|
||||||
@@ -8,8 +8,6 @@ from aimbotEnv import Aimbot
|
|||||||
from torch.distributions.normal import Normal
|
from torch.distributions.normal import Normal
|
||||||
from torch.distributions.categorical import Categorical
|
from torch.distributions.categorical import Categorical
|
||||||
|
|
||||||
firstLayerNum = 512
|
|
||||||
secondLayerNum = 128
|
|
||||||
|
|
||||||
def layer_init(layer, std=np.sqrt(2), bias_const=0.0):
|
def layer_init(layer, std=np.sqrt(2), bias_const=0.0):
|
||||||
nn.init.orthogonal_(layer.weight, std)
|
nn.init.orthogonal_(layer.weight, std)
|
||||||
@@ -48,70 +46,84 @@ class PPOAgent(nn.Module):
|
|||||||
self.discrete_shape = list(env.unity_discrete_branches)
|
self.discrete_shape = list(env.unity_discrete_branches)
|
||||||
self.continuous_size = env.unity_continuous_size
|
self.continuous_size = env.unity_continuous_size
|
||||||
|
|
||||||
self.hidden_networks = nn.ModuleList(
|
self.view_network = nn.Sequential(layer_init(nn.Linear(self.ray_state_size, 200)), nn.LeakyReLU())
|
||||||
|
self.target_networks = nn.ModuleList(
|
||||||
[
|
[
|
||||||
nn.Sequential(
|
nn.Sequential(layer_init(nn.Linear(self.state_size_without_ray, 100)), nn.LeakyReLU())
|
||||||
layer_init(nn.Linear(self.state_size, firstLayerNum)),
|
for i in range(self.target_num)
|
||||||
nn.LeakyReLU(),
|
]
|
||||||
layer_init(nn.Linear(firstLayerNum, secondLayerNum)),
|
)
|
||||||
nn.LeakyReLU(),
|
self.middle_networks = nn.ModuleList(
|
||||||
)
|
[
|
||||||
|
nn.Sequential(layer_init(nn.Linear(300, 200)), nn.LeakyReLU())
|
||||||
for i in range(self.target_num)
|
for i in range(self.target_num)
|
||||||
]
|
]
|
||||||
)
|
)
|
||||||
|
|
||||||
self.actor_dis = nn.ModuleList(
|
self.actor_dis = nn.ModuleList(
|
||||||
[layer_init(nn.Linear(secondLayerNum, self.discrete_size), std=0.5) for i in range(self.target_num)]
|
[layer_init(nn.Linear(200, self.discrete_size), std=0.5) for i in range(self.target_num)]
|
||||||
)
|
)
|
||||||
self.actor_mean = nn.ModuleList(
|
self.actor_mean = nn.ModuleList(
|
||||||
[layer_init(nn.Linear(secondLayerNum, self.continuous_size), std=0) for i in range(self.target_num)]
|
[layer_init(nn.Linear(200, self.continuous_size), std=0.5) for i in range(self.target_num)]
|
||||||
)
|
)
|
||||||
self.actor_logstd = nn.ParameterList(
|
self.actor_logstd = nn.ParameterList(
|
||||||
[nn.Parameter(torch.zeros(1, self.continuous_size)) for i in range(self.target_num)]
|
[nn.Parameter(torch.zeros(1, self.continuous_size)) for i in range(self.target_num)]
|
||||||
)
|
) # nn.Parameter(torch.zeros(1, self.continuous_size))
|
||||||
self.critic = nn.ModuleList(
|
self.critic = nn.ModuleList(
|
||||||
[layer_init(nn.Linear(secondLayerNum, 1), std=0) for i in range(self.target_num)]
|
[layer_init(nn.Linear(200, 1), std=1) for i in range(self.target_num)]
|
||||||
)
|
)
|
||||||
|
|
||||||
def get_value(self, state: torch.Tensor):
|
def get_value(self, state: torch.Tensor):
|
||||||
# get critic value
|
|
||||||
# state.size()[0] is batch_size
|
|
||||||
target = state[:, 0].to(torch.int32) # int
|
target = state[:, 0].to(torch.int32) # int
|
||||||
hidden_output = torch.stack(
|
this_state_num = target.size()[0]
|
||||||
[self.hidden_networks[target[i]](state[i]) for i in range(state.size()[0])]
|
view_input = state[:, -self.ray_state_size:] # all ray input
|
||||||
|
target_input = state[:, : self.state_size_without_ray]
|
||||||
|
view_layer = self.view_network(view_input)
|
||||||
|
target_layer = torch.stack(
|
||||||
|
[self.target_networks[target[i]](target_input[i]) for i in range(this_state_num)]
|
||||||
|
)
|
||||||
|
middle_input = torch.cat([view_layer, target_layer], dim=1)
|
||||||
|
middle_layer = torch.stack(
|
||||||
|
[self.middle_networks[target[i]](middle_input[i]) for i in range(this_state_num)]
|
||||||
)
|
)
|
||||||
criticV = torch.stack(
|
criticV = torch.stack(
|
||||||
[self.critic[target[i]](hidden_output[i]) for i in range(state.size()[0])]
|
[self.critic[target[i]](middle_layer[i]) for i in range(this_state_num)]
|
||||||
)
|
) # self.critic
|
||||||
return criticV
|
return criticV
|
||||||
|
|
||||||
def get_actions_value(self, state: torch.Tensor, actions=None):
|
def get_actions_value(self, state: torch.Tensor, actions=None):
|
||||||
# get actions and value
|
|
||||||
target = state[:, 0].to(torch.int32) # int
|
target = state[:, 0].to(torch.int32) # int
|
||||||
hidden_output = torch.stack(
|
this_state_num = target.size()[0]
|
||||||
[self.hidden_networks[target[i]](state[i]) for i in range(target.size()[0])]
|
view_input = state[:, -self.ray_state_size:] # all ray input
|
||||||
|
target_input = state[:, : self.state_size_without_ray]
|
||||||
|
view_layer = self.view_network(view_input)
|
||||||
|
target_layer = torch.stack(
|
||||||
|
[self.target_networks[target[i]](target_input[i]) for i in range(this_state_num)]
|
||||||
|
)
|
||||||
|
middle_input = torch.cat([view_layer, target_layer], dim=1)
|
||||||
|
middle_layer = torch.stack(
|
||||||
|
[self.middle_networks[target[i]](middle_input[i]) for i in range(this_state_num)]
|
||||||
)
|
)
|
||||||
|
|
||||||
# discrete
|
# discrete
|
||||||
# 递归targets的数量,既agent数来实现根据target不同来选用对应的输出网络计算输出
|
# 递归targets的数量,既agent数来实现根据target不同来选用对应的输出网络计算输出
|
||||||
dis_logits = torch.stack(
|
dis_logits = torch.stack(
|
||||||
[self.actor_dis[target[i]](hidden_output[i]) for i in range(target.size()[0])]
|
[self.actor_dis[target[i]](middle_layer[i]) for i in range(this_state_num)]
|
||||||
)
|
)
|
||||||
split_logits = torch.split(dis_logits, self.discrete_shape, dim=1)
|
split_logits = torch.split(dis_logits, self.discrete_shape, dim=1)
|
||||||
multi_categoricals = [Categorical(logits=thisLogits) for thisLogits in split_logits]
|
multi_categoricals = [Categorical(logits=thisLogits) for thisLogits in split_logits]
|
||||||
# continuous
|
# continuous
|
||||||
actions_mean = torch.stack(
|
actions_mean = torch.stack(
|
||||||
[self.actor_mean[target[i]](hidden_output[i]) for i in range(target.size()[0])]
|
[self.actor_mean[target[i]](middle_layer[i]) for i in range(this_state_num)]
|
||||||
) # self.actor_mean(hidden)
|
) # self.actor_mean(hidden)
|
||||||
action_logstd = torch.stack(
|
action_logstd = torch.stack(
|
||||||
[torch.squeeze(self.actor_logstd[target[i]], 0) for i in range(target.size()[0])]
|
[torch.squeeze(self.actor_logstd[target[i]], 0) for i in range(this_state_num)]
|
||||||
)
|
)
|
||||||
# print(action_logstd)
|
# print(action_logstd)
|
||||||
action_std = torch.exp(action_logstd) # torch.exp(action_logstd)
|
action_std = torch.exp(action_logstd) # torch.exp(action_logstd)
|
||||||
con_probs = Normal(actions_mean, action_std)
|
con_probs = Normal(actions_mean, action_std)
|
||||||
# critic
|
# critic
|
||||||
criticV = torch.stack(
|
criticV = torch.stack(
|
||||||
[self.critic[target[i]](hidden_output[i]) for i in range(target.size()[0])]
|
[self.critic[target[i]](middle_layer[i]) for i in range(this_state_num)]
|
||||||
) # self.critic
|
) # self.critic
|
||||||
|
|
||||||
if actions is None:
|
if actions is None:
|
||||||
|
|||||||
Reference in New Issue
Block a user