Compare commits
8 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| 34206b95c5 | |||
| 1787872e82 | |||
| ad9817e7a4 | |||
| cbc385ca10 | |||
| 895cd5c118 | |||
| 3930bcd953 | |||
| 5631569b31 | |||
| 32d398dbef |
@@ -84,3 +84,4 @@ crashlytics-build.properties
|
|||||||
/Aimbot-PPO-Python/Build/
|
/Aimbot-PPO-Python/Build/
|
||||||
/Aimbot-PPO-Python/PPO-Model/
|
/Aimbot-PPO-Python/PPO-Model/
|
||||||
/Aimbot-PPO-Python/GAIL-Expert-Data/
|
/Aimbot-PPO-Python/GAIL-Expert-Data/
|
||||||
|
/Aimbot-PPO-Python/runs/
|
||||||
@@ -0,0 +1,709 @@
|
|||||||
|
import argparse
|
||||||
|
import wandb
|
||||||
|
import time
|
||||||
|
import numpy as np
|
||||||
|
import random
|
||||||
|
import uuid
|
||||||
|
import torch
|
||||||
|
import torch.nn as nn
|
||||||
|
import torch.optim as optim
|
||||||
|
|
||||||
|
from AimbotEnv import Aimbot
|
||||||
|
from tqdm import tqdm
|
||||||
|
from enum import Enum
|
||||||
|
from torch.distributions.normal import Normal
|
||||||
|
from torch.distributions.categorical import Categorical
|
||||||
|
from distutils.util import strtobool
|
||||||
|
from torch.utils.tensorboard import SummaryWriter
|
||||||
|
from mlagents_envs.environment import UnityEnvironment
|
||||||
|
from mlagents_envs.side_channel.side_channel import (
|
||||||
|
SideChannel,
|
||||||
|
IncomingMessage,
|
||||||
|
OutgoingMessage,
|
||||||
|
)
|
||||||
|
from typing import List
|
||||||
|
|
||||||
|
# Best mean reward seen so far; the training loop saves a checkpoint whenever
# a new mean reward exceeds it.
bestReward = -1

DEFAULT_SEED = 9331
ENV_PATH = "../Build/Build-ParallelEnv-Target-OffPolicy-SingleStack-SideChannel-EndReward-Easy-V2.7-FreeOnly-NormalMapSize/Aimbot-ParallelEnv"
SIDE_CHANNEL_UUID = uuid.UUID("8bbfb62a-99b4-457c-879d-b78b69066b5e")
WAND_ENTITY = "koha9"
WORKER_ID = 3
BASE_PORT = 1002

# max round steps per agent is 2500/Decision_period, 25 seconds
# !!! check every parameter before running !!!

TOTAL_STEPS = 3150000
BATCH_SIZE = 1024
MAX_TRAINNING_DATASETS = 6000
DECISION_PERIOD = 1
LEARNING_RATE = 5e-4
GAMMA = 0.99
GAE_LAMBDA = 0.95
EPOCHS = 3
CLIP_COEF = 0.11
# Per-target coefficients, indexed by target type: free, go, attack, defence.
LOSS_COEF = [1.0, 1.0, 1.0, 1.0]
POLICY_COEF = [1.0, 1.0, 1.0, 1.0]
ENTROPY_COEF = [0.1, 0.1, 0.1, 0.1]
CRITIC_COEF = [0.5, 0.5, 0.5, 0.5]
TARGET_LEARNING_RATE = 1e-6

ANNEAL_LEARNING_RATE = True
CLIP_VLOSS = True
NORM_ADV = True
TRAIN = True

WANDB_TACK = False
LOAD_DIR = None
#LOAD_DIR = "../PPO-Model/Aimbot_Target_Hybrid_PMNN_V2_OffPolicy_EndBC_9331_1670522099-freeonly-12/Aimbot-target-last.pt"


# public data
class Targets(Enum):
    """Target (task) types; the value is the index stored in observation[0]."""

    Free = 0
    Go = 1
    Attack = 2
    Defence = 3
    Num = 4  # number of real target types, not a target itself


# Sizes of the leading observation segments.
TARGET_STATE_SIZE = 6
INAREA_STATE_SIZE = 1
TIME_STATE_SIZE = 1
GUN_STATE_SIZE = 1
MY_STATE_SIZE = 4
TOTAL_T_SIZE = TARGET_STATE_SIZE+INAREA_STATE_SIZE+TIME_STATE_SIZE+GUN_STATE_SIZE+MY_STATE_SIZE
# Offsets the environment adds to the final reward of a round to mark a
# win / loss; broadCastEndReward() strips them again.
BASE_WINREWARD = 999
BASE_LOSEREWARD = -999
TARGETNUM= 4
ENV_TIMELIMIT = 30
RESULT_BROADCAST_RATIO = 1/ENV_TIMELIMIT
# Round statistics per target name, updated by AimbotSideChannel. Every real
# target type gets an entry so that a "Defence" result cannot raise KeyError.
TotalRounds = {t.name: 0 for t in Targets if t is not Targets.Num}
WinRounds = {t.name: 0 for t in Targets if t is not Targets.Num}

# !!!SPECIAL PARAMETERS!!!
# change this once the program is finished
using_targets_num = 3
|
||||||
|
|
||||||
|
|
||||||
|
def parse_args():
    """Parse the command line options of the training script.

    Every option defaults to the corresponding module-level constant.
    Boolean options accept an explicit value (``--train false``) or act as a
    plain switch (``--train``).

    Returns:
        argparse.Namespace: the parsed options.
    """
    def to_bool(text):
        return bool(strtobool(text))

    # fmt: off
    # pytorch and environment parameters
    parser = argparse.ArgumentParser()
    parser.add_argument("--seed", type=int, default=DEFAULT_SEED,
        help="seed of the experiment")
    parser.add_argument("--path", type=str, default=ENV_PATH,
        help="environment path")
    parser.add_argument("--workerID", type=int, default=WORKER_ID,
        help="unity worker ID")
    parser.add_argument("--baseport", type=int, default=BASE_PORT,
        help="port to connect to Unity environment")
    parser.add_argument("--lr", type=float, default=LEARNING_RATE,
        help="the learning rate of optimizer")
    parser.add_argument("--cuda", type=to_bool, default=True, nargs="?", const=True,
        help="if toggled, cuda will be enabled by default")
    parser.add_argument("--total-timesteps", type=int, default=TOTAL_STEPS,
        help="total timesteps of the experiments")

    # model parameters
    parser.add_argument("--train", type=to_bool, default=TRAIN, nargs="?", const=True,
        help="Train Model or not")
    parser.add_argument("--datasetSize", type=int, default=MAX_TRAINNING_DATASETS,
        help="training dataset size,start training while dataset collect enough data")
    parser.add_argument("--minibatchSize", type=int, default=BATCH_SIZE,
        help="mini batch size")
    parser.add_argument("--epochs", type=int, default=EPOCHS,
        help="the K epochs to update the policy")
    parser.add_argument("--annealLR", type=to_bool, default=ANNEAL_LEARNING_RATE, nargs="?", const=True,
        help="Toggle learning rate annealing for policy and value networks")
    parser.add_argument("--wandb-track", type=to_bool, default=WANDB_TACK, nargs="?", const=True,
        help="track on the wandb")
    parser.add_argument("--wandb-entity", type=str, default=WAND_ENTITY,
        help="the entity (team) of wandb's project")
    parser.add_argument("--load-dir", type=str, default=LOAD_DIR,
        help="load model directory")
    parser.add_argument("--decision-period", type=int, default=DECISION_PERIOD,
        help="the number of steps to run in each environment per policy rollout")
    parser.add_argument("--result-broadcast-ratio", type=float, default=RESULT_BROADCAST_RATIO,
        help="broadcast result when win round is reached,r=result-broadcast-ratio*remainTime")

    # GAE loss
    parser.add_argument("--gae", type=to_bool, default=True, nargs="?", const=True,
        help="Use GAE for advantage computation")
    parser.add_argument("--norm-adv", type=to_bool, default=NORM_ADV, nargs="?", const=True,
        help="Toggles advantages normalization")
    parser.add_argument("--gamma", type=float, default=GAMMA,
        help="the discount factor gamma")
    parser.add_argument("--gaeLambda", type=float, default=GAE_LAMBDA,
        help="the lambda for the general advantage estimation")
    parser.add_argument("--clip-coef", type=float, default=CLIP_COEF,
        help="the surrogate clipping coefficient")
    # The three coefficient defaults are per-target lists, so the flags take
    # one or more floats and always yield a list, matching the default's type.
    parser.add_argument("--policy-coef", type=float, nargs="+", default=POLICY_COEF,
        help="coefficient of the policy (one value per target)")
    parser.add_argument("--ent-coef", type=float, nargs="+", default=ENTROPY_COEF,
        help="coefficient of the entropy (one value per target)")
    parser.add_argument("--critic-coef", type=float, nargs="+", default=CRITIC_COEF,
        help="coefficient of the value function (one value per target)")
    parser.add_argument("--clip-vloss", type=to_bool, default=CLIP_VLOSS, nargs="?", const=True,
        help="Toggles whether or not to use a clipped loss for the value function, as per the paper.")
    parser.add_argument("--max-grad-norm", type=float, default=0.5,
        help="the maximum norm for the gradient clipping")
    parser.add_argument("--target-kl", type=float, default=None,
        help="the target KL divergence threshold")
    # fmt: on
    args = parser.parse_args()
    return args
|
||||||
|
|
||||||
|
|
||||||
|
def layer_init(layer, std=np.sqrt(2), bias_const=0.0):
    """Initialise a layer in place and return it.

    The weight is filled with an orthogonal matrix scaled by ``std``; the
    bias, when the layer has one, is filled with ``bias_const``.

    Args:
        layer: module exposing ``weight`` (and optionally ``bias``).
        std: gain passed to ``torch.nn.init.orthogonal_``.
        bias_const: constant written into the bias.

    Returns:
        The same ``layer`` object, so the call can be nested in a constructor.
    """
    torch.nn.init.orthogonal_(layer.weight, std)
    # Layers created with bias=False carry bias=None; skip them instead of
    # crashing inside constant_().
    if getattr(layer, "bias", None) is not None:
        torch.nn.init.constant_(layer.bias, bias_const)
    return layer
|
||||||
|
|
||||||
|
|
||||||
|
class PPOAgent(nn.Module):
    """Actor-critic network with a hybrid (multi-discrete + continuous) policy.

    A shared two-layer Tanh trunk feeds three heads: discrete logits (split
    into one categorical per action branch), the mean of a Gaussian over the
    continuous actions (with a state-independent learned log-std), and a
    scalar state value.
    """

    def __init__(self, env: Aimbot, targetNum: int):
        """Build the network from the shapes reported by ``env``.

        Args:
            env: environment wrapper; its ``unity_observation_shape``,
                ``unity_discrete_size``, ``unity_discrete_branches`` and
                ``unity_continuous_size`` attributes are read.
            targetNum: number of target types (stored only).
        """
        super().__init__()
        self.targetNum = targetNum
        self.stateSize = env.unity_observation_shape[0]
        self.targetSize = TARGET_STATE_SIZE
        self.timeSize = TIME_STATE_SIZE
        self.gunSize = GUN_STATE_SIZE
        self.myStateSize = MY_STATE_SIZE
        self.totalMiddleSize = TOTAL_T_SIZE
        # observation size without the target / time / gun segments
        self.head_input_size = env.unity_observation_shape[0] - self.targetSize - self.timeSize - self.gunSize

        self.discrete_size = env.unity_discrete_size
        self.discrete_shape = list(env.unity_discrete_branches)
        self.continuous_size = env.unity_continuous_size

        self.network = nn.Sequential(
            layer_init(nn.Linear(env.unity_observation_shape[0], 300)),
            nn.Tanh(),
            layer_init(nn.Linear(300, 200)),
            nn.Tanh(),
        )
        self.actor_dis = layer_init(nn.Linear(200, self.discrete_size), std=0.5)
        self.actor_mean = layer_init(nn.Linear(200, self.continuous_size), std=0.5)
        self.actor_logstd = nn.Parameter(torch.zeros(1, self.continuous_size))
        self.critic = layer_init(nn.Linear(200, 1), std=1)

    def get_value(self, state: torch.Tensor):
        """Return the critic's value estimate for a batch of states."""
        return self.critic(self.network(state))

    def get_actions_value(self, state: torch.Tensor, actions=None):
        """Select (or evaluate) actions for a batch of states.

        Args:
            state: batch of observations, one row per sample.
            actions: optional batch of previously taken actions, laid out as
                the discrete branch choices followed by the continuous
                values. When ``None`` new actions are chosen: sampled if the
                module-level ``args.train`` is true, otherwise the argmax of
                each branch and the Gaussian mean.

        Returns:
            Tuple ``(actions, dis_log_prob, dis_entropy, con_log_prob,
            con_entropy, value)``; log-probabilities and entropies are summed
            over the discrete branches / continuous dimensions.
        """
        hidden = self.network(state)

        # discrete
        dis_logits = self.actor_dis(hidden)
        split_logits = torch.split(dis_logits, self.discrete_shape, dim=1)
        multi_categoricals = [Categorical(logits=thisLogits) for thisLogits in split_logits]
        # continuous
        actions_mean = self.actor_mean(hidden)
        action_logstd = self.actor_logstd.expand_as(actions_mean)
        action_std = torch.exp(action_logstd)
        con_probs = Normal(actions_mean, action_std)

        if actions is None:
            if args.train:
                # select actions based on the probability distribution
                disAct = torch.stack([ctgr.sample() for ctgr in multi_categoricals])
                conAct = con_probs.sample()
            else:
                # select the most probable actions
                disAct = torch.stack([torch.argmax(logit, dim=1) for logit in split_logits])
                conAct = actions_mean
            actions = torch.cat([disAct.T, conAct], dim=1)
        else:
            # One column per discrete branch, then the continuous columns.
            # Using the agent's own branch list avoids depending on a global
            # `env` that only exists when the file runs as a script.
            discrete_type = len(self.discrete_shape)
            disAct = actions[:, 0:discrete_type].T
            conAct = actions[:, discrete_type:]
        dis_log_prob = torch.stack(
            [ctgr.log_prob(act) for act, ctgr in zip(disAct, multi_categoricals)]
        )
        dis_entropy = torch.stack([ctgr.entropy() for ctgr in multi_categoricals])
        return (
            actions,
            dis_log_prob.sum(0),
            dis_entropy.sum(0),
            con_probs.log_prob(conAct).sum(1),
            con_probs.entropy().sum(1),
            self.critic(hidden),
        )
|
||||||
|
|
||||||
|
|
||||||
|
def GAE(agent, args, rewards, dones, values, next_obs, next_done):
    """Compute advantages and returns for one finished trajectory.

    Args:
        agent: object with ``get_value(obs)``; used to bootstrap from
            ``next_obs``.
        args: namespace providing ``gae``, ``gamma`` and ``gaeLambda``.
        rewards: per-step rewards of the trajectory.
        dones: per-step done flags (``dones[t + 1]`` gates step ``t``).
        values: per-step value estimates recorded during the rollout.
        next_obs: observation following the last step.
        next_done: done flag following the last step.

    Returns:
        Tuple ``(advantages, returns)``. With ``args.gae`` the advantages are
        generalized advantage estimates and ``returns = advantages + values``;
        otherwise returns are discounted rewards and
        ``advantages = returns - values``.
    """
    with torch.no_grad():
        next_value = agent.get_value(next_obs).reshape(1, -1)
        data_size = rewards.size()[0]
        if args.gae:
            # zeros_like already allocates on the device of `rewards`, so no
            # global `device` is needed here.
            advantages = torch.zeros_like(rewards)
            lastgaelam = 0
            for t in reversed(range(data_size)):
                if t == data_size - 1:
                    nextnonterminal = 1.0 - next_done
                    nextvalues = next_value
                else:
                    nextnonterminal = 1.0 - dones[t + 1]
                    nextvalues = values[t + 1]
                delta = rewards[t] + args.gamma * nextvalues * nextnonterminal - values[t]
                advantages[t] = lastgaelam = (
                    delta + args.gamma * args.gaeLambda * nextnonterminal * lastgaelam
                )
            returns = advantages + values
        else:
            returns = torch.zeros_like(rewards)
            for t in reversed(range(data_size)):
                if t == data_size - 1:
                    nextnonterminal = 1.0 - next_done
                    next_return = next_value
                else:
                    nextnonterminal = 1.0 - dones[t + 1]
                    next_return = returns[t + 1]
                returns[t] = rewards[t] + args.gamma * nextnonterminal * next_return
            advantages = returns - values
    return advantages, returns
|
||||||
|
|
||||||
|
class AimbotSideChannel(SideChannel):
    """Side channel exchanging messages with the Unity environment.

    Incoming ``result|<target>|<outcome>`` strings update the module-level
    ``TotalRounds`` / ``WinRounds`` counters; ``Error|...`` strings are
    printed. The ``send_*`` helpers queue typed values for Unity.
    """

    def __init__(self, channel_id: uuid.UUID) -> None:
        super().__init__(channel_id)

    def on_message_received(self, msg: IncomingMessage) -> None:
        """
        Note: We must implement this method of the SideChannel interface to
        receive messages from Unity
        """
        thisMessage = msg.read_string()
        # print(thisMessage)
        thisResult = thisMessage.split("|")
        if thisResult[0] == "result":
            if len(thisResult) < 3:
                # A truncated message must not kill the training run.
                print("Malformed result message:", thisMessage)
                return
            target, outcome = thisResult[1], thisResult[2]
            # .get() keeps counting even for a target name that has no
            # pre-declared counter.
            TotalRounds[target] = TotalRounds.get(target, 0) + 1
            if outcome == "Win":
                WinRounds[target] = WinRounds.get(target, 0) + 1
        elif thisResult[0] == "Error":
            print(thisMessage)

    # send functions
    def send_string(self, data: str) -> None:
        """Queue a string to be sent to the C# side."""
        msg = OutgoingMessage()
        msg.write_string(data)
        super().queue_message_to_send(msg)

    def send_bool(self, data: bool) -> None:
        """Queue a boolean to be sent to the C# side."""
        msg = OutgoingMessage()
        msg.write_bool(data)
        super().queue_message_to_send(msg)

    def send_int(self, data: int) -> None:
        """Queue a 32-bit integer to be sent to the C# side."""
        msg = OutgoingMessage()
        msg.write_int32(data)
        super().queue_message_to_send(msg)

    def send_float(self, data: float) -> None:
        """Queue a 32-bit float to be sent to the C# side."""
        msg = OutgoingMessage()
        msg.write_float32(data)
        super().queue_message_to_send(msg)

    def send_float_list(self, data: List[float]) -> None:
        """Queue a list of 32-bit floats to be sent to the C# side."""
        msg = OutgoingMessage()
        msg.write_float32_list(data)
        super().queue_message_to_send(msg)
|
||||||
|
|
||||||
|
def broadCastEndReward(rewardBF: list, remainTime: float):
    """Turn a round's raw rewards into the tensor used for training.

    The last reward carries a win/lose marker offset. On a loss
    (last reward <= -500) ``BASE_LOSEREWARD`` is subtracted from it. On a win
    (last reward >= 500) ``BASE_WINREWARD`` is subtracted from it and
    ``remainTime * args.result_broadcast_ratio`` is added to every step of the
    round. Otherwise a warning is printed and the rewards are used unchanged.

    Args:
        rewardBF: per-step rewards of the finished round (not modified).
        remainTime: remaining time read from the round's last state.

    Returns:
        torch.Tensor of the adjusted rewards on the module-level ``device``.
    """
    # Work on a copy: the original aliased the argument and silently rewrote
    # the caller's buffer.
    thisRewardBF = list(rewardBF)
    if not thisRewardBF:
        return torch.Tensor(thisRewardBF).to(device)
    if thisRewardBF[-1] <= -500:
        # lost the round: strip the marker, do not broadcast
        thisRewardBF[-1] = thisRewardBF[-1] - BASE_LOSEREWARD
    elif thisRewardBF[-1] >= 500:
        # won the round: strip the marker and broadcast the time bonus
        thisRewardBF[-1] = thisRewardBF[-1] - BASE_WINREWARD
        thisRewardBF = (np.asarray(thisRewardBF) + (remainTime * args.result_broadcast_ratio)).tolist()
    else:
        print("!!!!!DIDNT GET RESULT REWARD!!!!!!", rewardBF[-1])
    return torch.Tensor(thisRewardBF).to(device)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Entry point: collect rounds from the Unity environment with one PPO
    # agent per target type, and train an agent whenever its dataset is full.
    args = parse_args()
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)

    device = torch.device("cuda" if torch.cuda.is_available() and args.cuda else "cpu")

    # Initialize environment, agents and optimizers
    aimBotsideChannel = AimbotSideChannel(SIDE_CHANNEL_UUID)
    env = Aimbot(envPath=args.path, workerID=args.workerID, basePort=args.baseport, side_channels=[aimBotsideChannel])
    if args.load_dir is not None:
        # Loading a saved agent was never finished; previously the script
        # continued with an empty agent list and died later with IndexError.
        env.close()
        raise NotImplementedError("loading agents from --load-dir is not implemented")
    agentList = [PPOAgent(env, TARGETNUM).to(device) for _ in range(using_targets_num)]
    optimizers = [optim.Adam(agent.parameters(), lr=args.lr, eps=1e-5) for agent in agentList]

    # Tensorboard and WandB Recorder
    game_name = "Aimbot_Target_Hybrid_PMNN_V2"
    game_type = "OffPolicy_EndBC"
    run_name = f"{game_name}_{game_type}_{args.seed}_{int(time.time())}"
    if args.wandb_track:
        wandb.init(
            project=game_name,
            entity=args.wandb_entity,
            sync_tensorboard=True,
            config=vars(args),
            name=run_name,
            monitor_gym=True,
            save_code=True,
        )

    writer = SummaryWriter(f"runs/{run_name}")
    writer.add_text(
        "hyperparameters",
        "|param|value|\n|-|-|\n%s"
        % ("\n".join([f"|{key}|{value}|" for key, value in vars(args).items()])),
    )

    # Trajectory buffers, one list per environment agent
    ob_bf = [[] for i in range(env.unity_agent_num)]
    act_bf = [[] for i in range(env.unity_agent_num)]
    dis_logprobs_bf = [[] for i in range(env.unity_agent_num)]
    con_logprobs_bf = [[] for i in range(env.unity_agent_num)]
    rewards_bf = [[] for i in range(env.unity_agent_num)]
    dones_bf = [[] for i in range(env.unity_agent_num)]
    values_bf = [[] for i in range(env.unity_agent_num)]

    # start the game
    total_update_step = using_targets_num * args.total_timesteps // args.datasetSize
    target_steps = [0 for i in range(TARGETNUM)]
    start_time = time.time()
    state, _, done = env.reset()

    # initialize empty training datasets, one per target type
    obs = [torch.tensor([]).to(device) for i in range(TARGETNUM)]  # (TARGETNUM,n,env.unity_observation_size)
    actions = [torch.tensor([]).to(device) for i in range(TARGETNUM)]  # (TARGETNUM,n,env.unity_action_size)
    dis_logprobs = [torch.tensor([]).to(device) for i in range(TARGETNUM)]  # (TARGETNUM,n,1)
    con_logprobs = [torch.tensor([]).to(device) for i in range(TARGETNUM)]  # (TARGETNUM,n,1)
    rewards = [torch.tensor([]).to(device) for i in range(TARGETNUM)]  # (TARGETNUM,n,1)
    values = [torch.tensor([]).to(device) for i in range(TARGETNUM)]  # (TARGETNUM,n,1)
    advantages = [torch.tensor([]).to(device) for i in range(TARGETNUM)]  # (TARGETNUM,n,1)
    returns = [torch.tensor([]).to(device) for i in range(TARGETNUM)]  # (TARGETNUM,n,1)

    def _finish_round(agent_idx, value_agent_idx):
        """Move the finished round of one env agent into the training set.

        Reads the current loop variables ``state``, ``next_state`` and
        ``next_done``, computes advantages/returns with
        ``agentList[value_agent_idx]``, appends everything to the dataset of
        the round's target type and clears the agent's trajectory buffers.
        """
        remainTime = state[agent_idx, TARGET_STATE_SIZE]
        roundTargetType = int(state[agent_idx, 0])
        thisRewardsTensor = broadCastEndReward(rewards_bf[agent_idx], remainTime)
        adv, rt = GAE(
            agentList[value_agent_idx],
            args,
            thisRewardsTensor,
            torch.Tensor(dones_bf[agent_idx]).to(device),
            torch.tensor(values_bf[agent_idx]).to(device),
            torch.Tensor(next_state[agent_idx]).to(device).unsqueeze(0),
            torch.Tensor([next_done[agent_idx]]).to(device),
        )
        # send memories to training datasets
        obs[roundTargetType] = torch.cat((obs[roundTargetType], torch.tensor(ob_bf[agent_idx]).to(device)), 0)
        actions[roundTargetType] = torch.cat((actions[roundTargetType], torch.tensor(act_bf[agent_idx]).to(device)), 0)
        dis_logprobs[roundTargetType] = torch.cat(
            (dis_logprobs[roundTargetType], torch.tensor(dis_logprobs_bf[agent_idx]).to(device)), 0
        )
        con_logprobs[roundTargetType] = torch.cat(
            (con_logprobs[roundTargetType], torch.tensor(con_logprobs_bf[agent_idx]).to(device)), 0
        )
        rewards[roundTargetType] = torch.cat((rewards[roundTargetType], thisRewardsTensor), 0)
        values[roundTargetType] = torch.cat((values[roundTargetType], torch.tensor(values_bf[agent_idx]).to(device)), 0)
        advantages[roundTargetType] = torch.cat((advantages[roundTargetType], adv), 0)
        returns[roundTargetType] = torch.cat((returns[roundTargetType], rt), 0)

        # clear buffers
        ob_bf[agent_idx] = []
        act_bf[agent_idx] = []
        dis_logprobs_bf[agent_idx] = []
        con_logprobs_bf[agent_idx] = []
        rewards_bf[agent_idx] = []
        dones_bf[agent_idx] = []
        values_bf[agent_idx] = []
        print(f"train dataset {Targets(roundTargetType).name} added:{obs[roundTargetType].size()[0]}/{args.datasetSize}")

    for total_steps in range(total_update_step):
        # linearly decay the learning rate; it reaches 0 on the last update
        if args.annealLR:
            frac = 1.0 - ((total_steps + 1.0) / total_update_step)
            lrnow = frac * args.lr
            for optimizer in optimizers:
                optimizer.param_groups[0]["lr"] = lrnow
        else:
            lrnow = args.lr
        print("new episode", total_steps, "learning rate = ", lrnow)

        # MAIN LOOP: run agent in environment
        step = 0
        trainQueue = []
        last_reward = [0.0 for _ in range(env.unity_agent_num)]
        action = torch.zeros((env.unity_agent_num, env.unity_discrete_type + env.unity_continuous_size))
        # get_actions_value() returns ONE summed log-prob per sample for the
        # discrete and the continuous part, so these buffers are one column
        # wide. Wider buffers would be flattened to n*width entries during
        # training and then indexed per sample, misaligning old log-probs.
        dis_logprob = torch.zeros((env.unity_agent_num, 1))
        con_logprob = torch.zeros((env.unity_agent_num, 1))
        value = torch.zeros((env.unity_agent_num, 1))
        while True:
            if step % args.decision_period == 0:
                step += 1
                # Choose action by agent
                with torch.no_grad():
                    # predict actions with the agent matching each env agent's target
                    for i in range(env.unity_agent_num):
                        actTarget = int(state[i][0])
                        act, dis_lgprb, _, con_lgprb, _, vl = agentList[actTarget].get_actions_value(
                            torch.Tensor([state[i]]).to(device)
                        )
                        action[i] = act
                        dis_logprob[i] = dis_lgprb.squeeze(0)
                        con_logprob[i] = con_lgprb.squeeze(0)
                        value[i] = vl.squeeze(0)

                # variable from GPU to CPU
                action_cpu = action.cpu().numpy()
                dis_logprob_cpu = dis_logprob.cpu().numpy()
                con_logprob_cpu = con_logprob.cpu().numpy()
                value_cpu = value.flatten().cpu().numpy()
                # Environment step
                next_state, reward, next_done = env.step(action_cpu)
                # save memories
                for i in range(env.unity_agent_num):
                    ob_bf[i].append(state[i])
                    act_bf[i].append(action_cpu[i])
                    dis_logprobs_bf[i].append(dis_logprob_cpu[i])
                    con_logprobs_bf[i].append(con_logprob_cpu[i])
                    rewards_bf[i].append(reward[i] + last_reward[i])
                    dones_bf[i].append(done[i])
                    values_bf[i].append(value_cpu[i])
                    if next_done[i]:
                        # finished a round: the value bootstrap uses the agent
                        # of the target recorded at the round's first step
                        endTarget = int(ob_bf[i][0][0])
                        _finish_round(i, endTarget)

                for i in range(TARGETNUM):
                    if obs[i].size()[0] >= args.datasetSize:
                        # dataset is full, queue this target for training
                        trainQueue.append(i)
                # Advance before leaving the loop: the environment has already
                # stepped, so resuming from the old state would replay it.
                state, done = next_state, next_done
                if len(trainQueue) > 0:
                    break
            else:
                step += 1
                # skip this step, reuse the last predicted action
                next_state, reward, next_done = env.step(action_cpu)
                # save memories only for agents whose round just ended
                for i in range(env.unity_agent_num):
                    if next_done[i]:
                        ob_bf[i].append(state[i])
                        act_bf[i].append(action_cpu[i])
                        dis_logprobs_bf[i].append(dis_logprob_cpu[i])
                        con_logprobs_bf[i].append(con_logprob_cpu[i])
                        rewards_bf[i].append(reward[i])
                        dones_bf[i].append(done[i])
                        values_bf[i].append(value_cpu[i])
                        _finish_round(i, int(state[i, 0]))
                state = next_state
                last_reward = reward

        if args.train:
            meanRewardList = []  # for WANDB
            # loop over all queued targets
            for thisT in trainQueue:
                target_steps[thisT] += 1
                # flatten the batch
                b_obs = obs[thisT].reshape((-1,) + env.unity_observation_shape)
                b_dis_logprobs = dis_logprobs[thisT].reshape(-1)
                b_con_logprobs = con_logprobs[thisT].reshape(-1)
                b_actions = actions[thisT].reshape((-1,) + (env.unity_action_size,))
                b_advantages = advantages[thisT].reshape(-1)
                b_returns = returns[thisT].reshape(-1)
                b_values = values[thisT].reshape(-1)
                b_size = b_obs.size()[0]
                # Optimizing the policy and value network
                b_inds = np.arange(b_size)
                for epoch in range(args.epochs):
                    print(epoch, end="")
                    # shuffle all datasets
                    np.random.shuffle(b_inds)
                    for start in range(0, b_size, args.minibatchSize):
                        print(".", end="")
                        end = start + args.minibatchSize
                        mb_inds = b_inds[start:end]
                        mb_advantages = b_advantages[mb_inds]

                        # normalize advantages
                        if args.norm_adv:
                            mb_advantages = (mb_advantages - mb_advantages.mean()) / (
                                mb_advantages.std() + 1e-8
                            )

                        (
                            _,
                            new_dis_logprob,
                            dis_entropy,
                            new_con_logprob,
                            con_entropy,
                            newvalue,
                        ) = agentList[thisT].get_actions_value(b_obs[mb_inds], b_actions[mb_inds])
                        # discrete ratio
                        dis_logratio = new_dis_logprob - b_dis_logprobs[mb_inds]
                        dis_ratio = dis_logratio.exp()
                        # continuous ratio
                        con_logratio = new_con_logprob - b_con_logprobs[mb_inds]
                        con_ratio = con_logratio.exp()

                        # KL-based early stopping is not implemented;
                        # args.target_kl is currently unused.

                        # discrete Policy loss
                        dis_pg_loss_orig = -mb_advantages * dis_ratio
                        dis_pg_loss_clip = -mb_advantages * torch.clamp(
                            dis_ratio, 1 - args.clip_coef, 1 + args.clip_coef
                        )
                        dis_pg_loss = torch.max(dis_pg_loss_orig, dis_pg_loss_clip).mean()
                        # continuous Policy loss
                        con_pg_loss_orig = -mb_advantages * con_ratio
                        con_pg_loss_clip = -mb_advantages * torch.clamp(
                            con_ratio, 1 - args.clip_coef, 1 + args.clip_coef
                        )
                        con_pg_loss = torch.max(con_pg_loss_orig, con_pg_loss_clip).mean()

                        # Value loss
                        newvalue = newvalue.view(-1)
                        if args.clip_vloss:
                            v_loss_unclipped = (newvalue - b_returns[mb_inds]) ** 2
                            v_clipped = b_values[mb_inds] + torch.clamp(
                                newvalue - b_values[mb_inds],
                                -args.clip_coef,
                                args.clip_coef,
                            )
                            v_loss_clipped = (v_clipped - b_returns[mb_inds]) ** 2
                            v_loss_max = torch.max(v_loss_unclipped, v_loss_clipped)
                            v_loss = 0.5 * v_loss_max.mean()
                        else:
                            v_loss = 0.5 * ((newvalue - b_returns[mb_inds]) ** 2).mean()

                        # total loss
                        # NOTE(review): the entropy term is ADDED, so minimising
                        # the loss lowers entropy. PPO usually subtracts it as
                        # an exploration bonus - confirm this is intended.
                        entropy_loss = dis_entropy.mean() + con_entropy.mean()
                        loss = (
                            dis_pg_loss * POLICY_COEF[thisT]
                            + con_pg_loss * POLICY_COEF[thisT]
                            + entropy_loss * ENTROPY_COEF[thisT]
                            + v_loss * CRITIC_COEF[thisT]
                        ) * LOSS_COEF[thisT]

                        optimizers[thisT].zero_grad()
                        loss.backward()
                        # Clips gradient norm of an iterable of parameters.
                        nn.utils.clip_grad_norm_(agentList[thisT].parameters(), args.max_grad_norm)
                        optimizers[thisT].step()

                # record mean reward before clearing the history
                print("done")
                targetRewardMean = np.mean(rewards[thisT].to("cpu").detach().numpy().copy())
                meanRewardList.append(targetRewardMean)
                targetName = Targets(thisT).name

                # clear this target's training set
                obs[thisT] = torch.tensor([]).to(device)
                actions[thisT] = torch.tensor([]).to(device)
                dis_logprobs[thisT] = torch.tensor([]).to(device)
                con_logprobs[thisT] = torch.tensor([]).to(device)
                rewards[thisT] = torch.tensor([]).to(device)
                values[thisT] = torch.tensor([]).to(device)
                advantages[thisT] = torch.tensor([]).to(device)
                returns[thisT] = torch.tensor([]).to(device)

                # record the last minibatch's losses and the reward for plotting
                writer.add_scalar(f"Target{targetName}/value_loss", v_loss.item(), target_steps[thisT])
                writer.add_scalar(f"Target{targetName}/dis_policy_loss", dis_pg_loss.item(), target_steps[thisT])
                writer.add_scalar(f"Target{targetName}/con_policy_loss", con_pg_loss.item(), target_steps[thisT])
                writer.add_scalar(f"Target{targetName}/total_loss", loss.item(), target_steps[thisT])
                writer.add_scalar(f"Target{targetName}/entropy_loss", entropy_loss.item(), target_steps[thisT])
                writer.add_scalar(f"Target{targetName}/Reward", targetRewardMean, target_steps[thisT])
                # The counters are filled asynchronously by the side channel;
                # skip the ratio until at least one result has arrived.
                roundsPlayed = TotalRounds.get(targetName, 0)
                if roundsPlayed > 0:
                    writer.add_scalar(
                        f"Target{targetName}/WinRatio",
                        WinRounds.get(targetName, 0) / roundsPlayed,
                        target_steps[thisT],
                    )
                print(f"episode over Target{targetName} mean reward:", targetRewardMean)
            TotalRewardMean = np.mean(meanRewardList)
            writer.add_scalar("GlobalCharts/TotalRewardMean", TotalRewardMean, total_steps)
            # all optimizers share the same learning rate
            writer.add_scalar("GlobalCharts/learning_rate", optimizers[0].param_groups[0]["lr"], total_steps)
            # New Record!
            if TotalRewardMean > bestReward:
                # remember the value that was compared, not the last target's mean
                bestReward = TotalRewardMean
                for i in range(using_targets_num):
                    saveDir = "../PPO-Model/" + run_name + "_" + str(TotalRewardMean) + "_" + str(i) + ".pt"
                    torch.save(agentList[i], saveDir)

    for i in range(using_targets_num):
        saveDir = "../PPO-Model/" + run_name + "_last_" + str(i) + ".pt"
        torch.save(agentList[i], saveDir)
    env.close()
    writer.close()
|
||||||
@@ -9,6 +9,7 @@ import torch.nn as nn
|
|||||||
import torch.optim as optim
|
import torch.optim as optim
|
||||||
|
|
||||||
from AimbotEnv import Aimbot
|
from AimbotEnv import Aimbot
|
||||||
|
from tqdm import tqdm
|
||||||
from torch.distributions.normal import Normal
|
from torch.distributions.normal import Normal
|
||||||
from torch.distributions.categorical import Categorical
|
from torch.distributions.categorical import Categorical
|
||||||
from distutils.util import strtobool
|
from distutils.util import strtobool
|
||||||
@@ -24,26 +25,28 @@ from typing import List
|
|||||||
bestReward = 0
|
bestReward = 0
|
||||||
|
|
||||||
DEFAULT_SEED = 9331
|
DEFAULT_SEED = 9331
|
||||||
ENV_PATH = "../Build/Build-ParallelEnv-Target-OffPolicy-SingleStack-SideChannel/Aimbot-ParallelEnv"
|
ENV_PATH = "../Build/Build-ParallelEnv-Target-OffPolicy-SingleStack-SideChannel-ExtremeReward/Aimbot-ParallelEnv"
|
||||||
SIDE_CHANNEL_UUID = uuid.UUID("8bbfb62a-99b4-457c-879d-b78b69066b5e")
|
SIDE_CHANNEL_UUID = uuid.UUID("8bbfb62a-99b4-457c-879d-b78b69066b5e")
|
||||||
WAND_ENTITY = "koha9"
|
WAND_ENTITY = "koha9"
|
||||||
WORKER_ID = 1
|
WORKER_ID = 1
|
||||||
BASE_PORT = 1000
|
BASE_PORT = 1000
|
||||||
|
|
||||||
|
# max round steps per agent is 2500/Decision_period, 25 seconds
|
||||||
# !!!check every parameters before run!!!
|
# !!!check every parameters before run!!!
|
||||||
|
|
||||||
TOTAL_STEPS = 2000000
|
TOTAL_STEPS = 6000000
|
||||||
STEP_NUM = 314
|
BATCH_SIZE = 512
|
||||||
DECISION_PERIOD = 2
|
MAX_TRAINNING_DATASETS = 8000
|
||||||
LEARNING_RATE = 7e-4
|
DECISION_PERIOD = 1
|
||||||
|
LEARNING_RATE = 1e-3
|
||||||
GAMMA = 0.99
|
GAMMA = 0.99
|
||||||
GAE_LAMBDA = 0.95
|
GAE_LAMBDA = 0.95
|
||||||
MINIBATCH_NUM = 4
|
|
||||||
EPOCHS = 4
|
EPOCHS = 4
|
||||||
CLIP_COEF = 0.1
|
CLIP_COEF = 0.1
|
||||||
POLICY_COEF = 1.0
|
POLICY_COEF = 1.0
|
||||||
ENTROPY_COEF = 0.01
|
ENTROPY_COEF = 0.01
|
||||||
CRITIC_COEF = 0.5
|
CRITIC_COEF = 0.5
|
||||||
|
TARGET_LEARNING_RATE = 5e-5
|
||||||
|
|
||||||
ANNEAL_LEARNING_RATE = True
|
ANNEAL_LEARNING_RATE = True
|
||||||
CLIP_VLOSS = True
|
CLIP_VLOSS = True
|
||||||
@@ -51,8 +54,8 @@ NORM_ADV = True
|
|||||||
TRAIN = True
|
TRAIN = True
|
||||||
|
|
||||||
WANDB_TACK = False
|
WANDB_TACK = False
|
||||||
LOAD_DIR = None
|
#LOAD_DIR = None
|
||||||
# LOAD_DIR = "../PPO-Model/SmallArea-256-128-hybrid-2nd-trainning.pt"
|
LOAD_DIR = "../PPO-Model/Aimbot-target-last.pt"
|
||||||
|
|
||||||
# public data
|
# public data
|
||||||
TotalRounds = {"Go":0,"Attack":0,"Free":0}
|
TotalRounds = {"Go":0,"Attack":0,"Free":0}
|
||||||
@@ -81,10 +84,10 @@ def parse_args():
|
|||||||
# model parameters
|
# model parameters
|
||||||
parser.add_argument("--train",type=lambda x: bool(strtobool(x)), default=TRAIN, nargs="?", const=True,
|
parser.add_argument("--train",type=lambda x: bool(strtobool(x)), default=TRAIN, nargs="?", const=True,
|
||||||
help="Train Model or not")
|
help="Train Model or not")
|
||||||
parser.add_argument("--stepNum", type=int, default=STEP_NUM,
|
parser.add_argument("--datasetSize", type=int, default=MAX_TRAINNING_DATASETS,
|
||||||
help="the number of steps to run in each environment per policy rollout")
|
help="training dataset size,start training while dataset collect enough data")
|
||||||
parser.add_argument("--minibatchesNum", type=int, default=MINIBATCH_NUM,
|
parser.add_argument("--minibatchSize", type=int, default=BATCH_SIZE,
|
||||||
help="the number of mini-batches")
|
help="nimi batch size")
|
||||||
parser.add_argument("--epochs", type=int, default=EPOCHS,
|
parser.add_argument("--epochs", type=int, default=EPOCHS,
|
||||||
help="the K epochs to update the policy")
|
help="the K epochs to update the policy")
|
||||||
parser.add_argument("--annealLR", type=lambda x: bool(strtobool(x)), default=ANNEAL_LEARNING_RATE, nargs="?", const=True,
|
parser.add_argument("--annealLR", type=lambda x: bool(strtobool(x)), default=ANNEAL_LEARNING_RATE, nargs="?", const=True,
|
||||||
@@ -140,9 +143,11 @@ class PPOAgent(nn.Module):
|
|||||||
self.continuous_size = env.unity_continuous_size
|
self.continuous_size = env.unity_continuous_size
|
||||||
|
|
||||||
self.network = nn.Sequential(
|
self.network = nn.Sequential(
|
||||||
layer_init(nn.Linear(np.array(env.unity_observation_shape).prod(), 384)),
|
layer_init(nn.Linear(np.array(env.unity_observation_shape).prod(), 700)),
|
||||||
nn.ReLU(),
|
nn.ReLU(),
|
||||||
layer_init(nn.Linear(384, 256)),
|
layer_init(nn.Linear(700, 500)),
|
||||||
|
nn.ReLU(),
|
||||||
|
layer_init(nn.Linear(500, 256)),
|
||||||
nn.ReLU(),
|
nn.ReLU(),
|
||||||
)
|
)
|
||||||
self.actor_dis = layer_init(nn.Linear(256, self.discrete_size), std=0.01)
|
self.actor_dis = layer_init(nn.Linear(256, self.discrete_size), std=0.01)
|
||||||
@@ -192,6 +197,40 @@ class PPOAgent(nn.Module):
|
|||||||
self.critic(hidden),
|
self.critic(hidden),
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def GAE(agent, args, rewards, dones, values, next_obs, next_done):
    """Compute advantages and returns for one collected trajectory.

    When ``args.gae`` is true, Generalized Advantage Estimation is used and
    ``returns = advantages + values``. Otherwise plain discounted returns are
    computed and ``advantages = returns - values``.

    Args:
        agent: object exposing ``get_value(obs)``; used to bootstrap the value
            of the observation that follows the last recorded step.
        args: namespace providing ``gae``, ``gamma`` and ``gaeLambda``.
        rewards: per-step rewards; its first dimension is the trajectory
            length (assumed 1-D -- TODO confirm against callers).
        dones: per-step done flags, indexed like ``rewards``.
        values: per-step value estimates, indexed like ``rewards``.
        next_obs: observation following the last recorded step.
        next_done: done flag belonging to ``next_obs``.

    Returns:
        Tuple ``(advantages, returns)`` shaped like ``rewards``.
    """
    # Everything here is a training target, so no gradients are tracked.
    with torch.no_grad():
        next_value = agent.get_value(next_obs).reshape(1, -1)
        data_size = rewards.size()[0]
        last_step = data_size - 1
        if args.gae:
            # zeros_like already allocates on the device of `rewards`, so no
            # module-level `device` is needed and the result stays on the
            # same device as the other inputs.
            advantages = torch.zeros_like(rewards)
            lastgaelam = 0
            for t in reversed(range(data_size)):
                if t == last_step:
                    # Last recorded step bootstraps from the following state.
                    nextnonterminal = 1.0 - next_done
                    nextvalues = next_value
                else:
                    nextnonterminal = 1.0 - dones[t + 1]
                    nextvalues = values[t + 1]
                delta = rewards[t] + args.gamma * nextvalues * nextnonterminal - values[t]
                lastgaelam = (
                    delta + args.gamma * args.gaeLambda * nextnonterminal * lastgaelam
                )
                advantages[t] = lastgaelam
            returns = advantages + values
        else:
            returns = torch.zeros_like(rewards)
            for t in reversed(range(data_size)):
                if t == last_step:
                    nextnonterminal = 1.0 - next_done
                    next_return = next_value
                else:
                    nextnonterminal = 1.0 - dones[t + 1]
                    next_return = returns[t + 1]
                returns[t] = rewards[t] + args.gamma * nextnonterminal * next_return
            advantages = returns - values
    return advantages, returns
|
||||||
|
|
||||||
class AimbotSideChannel(SideChannel):
|
class AimbotSideChannel(SideChannel):
|
||||||
def __init__(self, channel_id: uuid.UUID) -> None:
|
def __init__(self, channel_id: uuid.UUID) -> None:
|
||||||
super().__init__(channel_id)
|
super().__init__(channel_id)
|
||||||
@@ -201,14 +240,14 @@ class AimbotSideChannel(SideChannel):
|
|||||||
receive messages from Unity
|
receive messages from Unity
|
||||||
"""
|
"""
|
||||||
thisMessage = msg.read_string()
|
thisMessage = msg.read_string()
|
||||||
print(thisMessage)
|
#print(thisMessage)
|
||||||
thisResult = thisMessage.split("|")
|
thisResult = thisMessage.split("|")
|
||||||
if(thisResult[0] == "result"):
|
if(thisResult[0] == "result"):
|
||||||
TotalRounds[thisResult[1]]+=1
|
TotalRounds[thisResult[1]]+=1
|
||||||
if(thisResult[2] == "Win"):
|
if(thisResult[2] == "Win"):
|
||||||
WinRounds[thisResult[1]]+=1
|
WinRounds[thisResult[1]]+=1
|
||||||
print(TotalRounds)
|
#print(TotalRounds)
|
||||||
print(WinRounds)
|
#print(WinRounds)
|
||||||
elif(thisResult[0] == "Error"):
|
elif(thisResult[0] == "Error"):
|
||||||
print(thisMessage)
|
print(thisMessage)
|
||||||
# 发送函数
|
# 发送函数
|
||||||
@@ -238,6 +277,7 @@ class AimbotSideChannel(SideChannel):
|
|||||||
msg.write_float32_list(data)
|
msg.write_float32_list(data)
|
||||||
super().queue_message_to_send(msg)
|
super().queue_message_to_send(msg)
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
args = parse_args()
|
args = parse_args()
|
||||||
random.seed(args.seed)
|
random.seed(args.seed)
|
||||||
@@ -259,11 +299,12 @@ if __name__ == "__main__":
|
|||||||
optimizer = optim.Adam(agent.parameters(), lr=args.lr, eps=1e-5)
|
optimizer = optim.Adam(agent.parameters(), lr=args.lr, eps=1e-5)
|
||||||
|
|
||||||
# Tensorboard and WandB Recorder
|
# Tensorboard and WandB Recorder
|
||||||
game_name = "Aimbot"
|
game_name = "Aimbot_Target"
|
||||||
run_name = f"{game_name}_{args.seed}_{int(time.time())}"
|
game_type = "OffPolicy"
|
||||||
|
run_name = f"{game_name}_{game_type}_{args.seed}_{int(time.time())}"
|
||||||
if args.wandb_track:
|
if args.wandb_track:
|
||||||
wandb.init(
|
wandb.init(
|
||||||
project=run_name,
|
project=game_name,
|
||||||
entity=args.wandb_entity,
|
entity=args.wandb_entity,
|
||||||
sync_tensorboard=True,
|
sync_tensorboard=True,
|
||||||
config=vars(args),
|
config=vars(args),
|
||||||
@@ -279,94 +320,168 @@ if __name__ == "__main__":
|
|||||||
% ("\n".join([f"|{key}|{value}|" for key, value in vars(args).items()])),
|
% ("\n".join([f"|{key}|{value}|" for key, value in vars(args).items()])),
|
||||||
)
|
)
|
||||||
|
|
||||||
# Memory Record
|
# Trajectory Buffer
|
||||||
obs = torch.zeros((args.stepNum, env.unity_agent_num) + env.unity_observation_shape).to(device)
|
ob_bf = [[] for i in range(env.unity_agent_num)]
|
||||||
actions = torch.zeros((args.stepNum, env.unity_agent_num) + (env.unity_action_size,)).to(device)
|
act_bf = [[] for i in range(env.unity_agent_num)]
|
||||||
dis_logprobs = torch.zeros((args.stepNum, env.unity_agent_num)).to(device)
|
dis_logprobs_bf = [[] for i in range(env.unity_agent_num)]
|
||||||
con_logprobs = torch.zeros((args.stepNum, env.unity_agent_num)).to(device)
|
con_logprobs_bf = [[] for i in range(env.unity_agent_num)]
|
||||||
rewards = torch.zeros((args.stepNum, env.unity_agent_num)).to(device)
|
rewards_bf = [[] for i in range(env.unity_agent_num)]
|
||||||
dones = torch.zeros((args.stepNum, env.unity_agent_num)).to(device)
|
dones_bf = [[] for i in range(env.unity_agent_num)]
|
||||||
values = torch.zeros((args.stepNum, env.unity_agent_num)).to(device)
|
values_bf = [[] for i in range(env.unity_agent_num)]
|
||||||
|
|
||||||
# TRY NOT TO MODIFY: start the game
|
# TRY NOT TO MODIFY: start the game
|
||||||
args.batch_size = int(env.unity_agent_num * args.stepNum)
|
total_update_step = args.total_timesteps // args.datasetSize
|
||||||
args.minibatch_size = int(args.batch_size // args.minibatchesNum)
|
|
||||||
total_update_step = args.total_timesteps // args.batch_size
|
|
||||||
global_step = 0
|
global_step = 0
|
||||||
start_time = time.time()
|
start_time = time.time()
|
||||||
next_obs, _, _ = env.reset()
|
state, _, done = env.reset()
|
||||||
next_obs = torch.Tensor(next_obs).to(device)
|
# state = torch.Tensor(next_obs).to(device)
|
||||||
next_done = torch.zeros(env.unity_agent_num).to(device)
|
# next_done = torch.zeros(env.unity_agent_num).to(device)
|
||||||
|
|
||||||
for total_steps in range(total_update_step):
|
for total_steps in range(total_update_step):
|
||||||
# discount learning rate, while step == total_update_step lr will be 0
|
# discount learning rate, while step == total_update_step lr will be 0
|
||||||
|
print("new episode")
|
||||||
if args.annealLR:
|
if args.annealLR:
|
||||||
frac = 1.0 - (total_steps - 1.0) / total_update_step
|
finalRatio = TARGET_LEARNING_RATE/args.lr
|
||||||
|
frac = 1.0 - finalRatio*((total_steps - 1.0) / total_update_step)
|
||||||
lrnow = frac * args.lr
|
lrnow = frac * args.lr
|
||||||
optimizer.param_groups[0]["lr"] = lrnow
|
optimizer.param_groups[0]["lr"] = lrnow
|
||||||
|
|
||||||
|
# initialize empty training datasets
|
||||||
|
obs = torch.tensor([]).to(device) # (n,env.unity_observation_size)
|
||||||
|
actions = torch.tensor([]).to(device) # (n,env.unity_action_size)
|
||||||
|
dis_logprobs = torch.tensor([]).to(device) # (n,1)
|
||||||
|
con_logprobs = torch.tensor([]).to(device) # (n,1)
|
||||||
|
rewards = torch.tensor([]).to(device) # (n,1)
|
||||||
|
values = torch.tensor([]).to(device) # (n,1)
|
||||||
|
advantages = torch.tensor([]).to(device) # (n,1)
|
||||||
|
returns = torch.tensor([]).to(device) # (n,1)
|
||||||
|
|
||||||
# MAIN LOOP: run agent in environment
|
# MAIN LOOP: run agent in environment
|
||||||
for i in range(args.stepNum * args.decision_period):
|
i = 0
|
||||||
|
training = False
|
||||||
|
while True:
|
||||||
if i % args.decision_period == 0:
|
if i % args.decision_period == 0:
|
||||||
step = round(i / args.decision_period)
|
step = round(i / args.decision_period)
|
||||||
# Choose action by agent
|
# Choose action by agent
|
||||||
global_step += 1 * env.unity_agent_num
|
global_step += 1 * env.unity_agent_num
|
||||||
obs[step] = next_obs
|
|
||||||
dones[step] = next_done
|
|
||||||
|
|
||||||
with torch.no_grad():
|
with torch.no_grad():
|
||||||
# predict actions
|
# predict actions
|
||||||
action, dis_logprob, _, con_logprob, _, value = agent.get_actions_value(
|
action, dis_logprob, _, con_logprob, _, value = agent.get_actions_value(
|
||||||
next_obs
|
torch.Tensor(state).to(device)
|
||||||
)
|
)
|
||||||
value = value.flatten()
|
value = value.flatten()
|
||||||
next_obs, reward, done = env.step(action.cpu().numpy())
|
|
||||||
|
# variable from GPU to CPU
|
||||||
|
action_cpu = action.cpu().numpy()
|
||||||
|
dis_logprob_cpu = dis_logprob.cpu().numpy()
|
||||||
|
con_logprob_cpu = con_logprob.cpu().numpy()
|
||||||
|
value_cpu = value.cpu().numpy()
|
||||||
|
# Environment step
|
||||||
|
next_state, reward, next_done = env.step(action_cpu)
|
||||||
|
|
||||||
# save memories
|
# save memories
|
||||||
actions[step] = action
|
for i in range(env.unity_agent_num):
|
||||||
dis_logprobs[step] = dis_logprob
|
# save memories to buffers
|
||||||
con_logprobs[step] = con_logprob
|
ob_bf[i].append(state[i])
|
||||||
values[step] = value
|
act_bf[i].append(action_cpu[i])
|
||||||
rewards[step] = torch.tensor(reward).to(device).view(-1)
|
dis_logprobs_bf[i].append(dis_logprob_cpu[i])
|
||||||
next_obs, next_done = torch.Tensor(next_obs).to(device), torch.Tensor(done).to(
|
con_logprobs_bf[i].append(con_logprob_cpu[i])
|
||||||
device
|
rewards_bf[i].append(reward[i])
|
||||||
)
|
dones_bf[i].append(done[i])
|
||||||
|
values_bf[i].append(value_cpu[i])
|
||||||
|
if next_done[i] == True:
|
||||||
|
# finished a round, send finished memories to training datasets
|
||||||
|
# compute advantage and discounted reward
|
||||||
|
#print(i,"over")
|
||||||
|
adv, rt = GAE(
|
||||||
|
agent,
|
||||||
|
args,
|
||||||
|
torch.tensor(rewards_bf[i]).to(device),
|
||||||
|
torch.Tensor(dones_bf[i]).to(device),
|
||||||
|
torch.tensor(values_bf[i]).to(device),
|
||||||
|
torch.tensor(next_state[i]).to(device),
|
||||||
|
torch.Tensor([next_done[i]]).to(device),
|
||||||
|
)
|
||||||
|
# send memories to training datasets
|
||||||
|
obs = torch.cat((obs, torch.tensor(ob_bf[i]).to(device)), 0)
|
||||||
|
actions = torch.cat((actions, torch.tensor(act_bf[i]).to(device)), 0)
|
||||||
|
dis_logprobs = torch.cat(
|
||||||
|
(dis_logprobs, torch.tensor(dis_logprobs_bf[i]).to(device)), 0
|
||||||
|
)
|
||||||
|
con_logprobs = torch.cat(
|
||||||
|
(con_logprobs, torch.tensor(con_logprobs_bf[i]).to(device)), 0
|
||||||
|
)
|
||||||
|
rewards = torch.cat((rewards, torch.tensor(rewards_bf[i]).to(device)), 0)
|
||||||
|
values = torch.cat((values, torch.tensor(values_bf[i]).to(device)), 0)
|
||||||
|
advantages = torch.cat((advantages, adv), 0)
|
||||||
|
returns = torch.cat((returns, rt), 0)
|
||||||
|
|
||||||
|
# clear buffers
|
||||||
|
ob_bf[i] = []
|
||||||
|
act_bf[i] = []
|
||||||
|
dis_logprobs_bf[i] = []
|
||||||
|
con_logprobs_bf[i] = []
|
||||||
|
rewards_bf[i] = []
|
||||||
|
dones_bf[i] = []
|
||||||
|
values_bf[i] = []
|
||||||
|
print(f"train dataset added:{obs.size()[0]}/{args.datasetSize}")
|
||||||
|
|
||||||
|
if obs.size()[0] >= args.datasetSize:
|
||||||
|
# start train NN
|
||||||
|
break
|
||||||
|
state, done = next_state, next_done
|
||||||
else:
|
else:
|
||||||
# skip this step use last predict action
|
# skip this step use last predict action
|
||||||
next_obs, reward, done = env.step(action.cpu().numpy())
|
next_obs, reward, next_done = env.step(action_cpu)
|
||||||
next_obs, next_done = torch.Tensor(next_obs).to(device), torch.Tensor(done).to(
|
# save memories
|
||||||
device
|
for i in range(env.unity_agent_num):
|
||||||
)
|
if next_done[i] == True:
|
||||||
|
#print(i,"over???")
|
||||||
|
# save last memories to buffers
|
||||||
|
ob_bf[i].append(state[i])
|
||||||
|
act_bf[i].append(action_cpu[i])
|
||||||
|
dis_logprobs_bf[i].append(dis_logprob_cpu[i])
|
||||||
|
con_logprobs_bf[i].append(con_logprob_cpu[i])
|
||||||
|
rewards_bf[i].append(reward[i])
|
||||||
|
dones_bf[i].append(done[i])
|
||||||
|
values_bf[i].append(value_cpu[i])
|
||||||
|
# finished a round, send finished memories to training datasets
|
||||||
|
# compute advantage and discounted reward
|
||||||
|
adv, rt = GAE(
|
||||||
|
agent,
|
||||||
|
args,
|
||||||
|
torch.tensor(rewards_bf[i]).to(device),
|
||||||
|
torch.Tensor(dones_bf[i]).to(device),
|
||||||
|
torch.tensor(values_bf[i]).to(device),
|
||||||
|
torch.tensor(next_state[i]).to(device),
|
||||||
|
torch.Tensor([next_done[i]]).to(device),
|
||||||
|
)
|
||||||
|
# send memories to training datasets
|
||||||
|
obs = torch.cat((obs, torch.tensor(ob_bf[i]).to(device)), 0)
|
||||||
|
actions = torch.cat((actions, torch.tensor(act_bf[i]).to(device)), 0)
|
||||||
|
dis_logprobs = torch.cat(
|
||||||
|
(dis_logprobs, torch.tensor(dis_logprobs_bf[i]).to(device)), 0
|
||||||
|
)
|
||||||
|
con_logprobs = torch.cat(
|
||||||
|
(con_logprobs, torch.tensor(con_logprobs_bf[i]).to(device)), 0
|
||||||
|
)
|
||||||
|
rewards = torch.cat((rewards, torch.tensor(rewards_bf[i]).to(device)), 0)
|
||||||
|
values = torch.cat((values, torch.tensor(values_bf[i]).to(device)), 0)
|
||||||
|
advantages = torch.cat((advantages, adv), 0)
|
||||||
|
returns = torch.cat((returns, rt), 0)
|
||||||
|
|
||||||
# GAE
|
# clear buffers
|
||||||
with torch.no_grad():
|
ob_bf[i] = []
|
||||||
next_value = agent.get_value(next_obs).reshape(1, -1)
|
act_bf[i] = []
|
||||||
if args.gae:
|
dis_logprobs_bf[i] = []
|
||||||
advantages = torch.zeros_like(rewards).to(device)
|
con_logprobs_bf[i] = []
|
||||||
lastgaelam = 0
|
rewards_bf[i] = []
|
||||||
for t in reversed(range(args.stepNum)):
|
dones_bf[i] = []
|
||||||
if t == args.stepNum - 1:
|
values_bf[i] = []
|
||||||
nextnonterminal = 1.0 - next_done
|
print(f"train dataset added:{obs.size()[0]}/{args.datasetSize}")
|
||||||
nextvalues = next_value
|
state, done = next_state, next_done
|
||||||
else:
|
i += 1
|
||||||
nextnonterminal = 1.0 - dones[t + 1]
|
|
||||||
nextvalues = values[t + 1]
|
|
||||||
delta = rewards[t] + args.gamma * nextvalues * nextnonterminal - values[t]
|
|
||||||
advantages[t] = lastgaelam = (
|
|
||||||
delta + args.gamma * args.gaeLambda * nextnonterminal * lastgaelam
|
|
||||||
)
|
|
||||||
returns = advantages + values
|
|
||||||
else:
|
|
||||||
returns = torch.zeros_like(rewards).to(device)
|
|
||||||
for t in reversed(range(args.stepNum)):
|
|
||||||
if t == args.stepNum - 1:
|
|
||||||
nextnonterminal = 1.0 - next_done
|
|
||||||
next_return = next_value
|
|
||||||
else:
|
|
||||||
nextnonterminal = 1.0 - dones[t + 1]
|
|
||||||
next_return = returns[t + 1]
|
|
||||||
returns[t] = rewards[t] + args.gamma * nextnonterminal * next_return
|
|
||||||
advantages = returns - values
|
|
||||||
|
|
||||||
if args.train:
|
if args.train:
|
||||||
# flatten the batch
|
# flatten the batch
|
||||||
@@ -377,15 +492,15 @@ if __name__ == "__main__":
|
|||||||
b_advantages = advantages.reshape(-1)
|
b_advantages = advantages.reshape(-1)
|
||||||
b_returns = returns.reshape(-1)
|
b_returns = returns.reshape(-1)
|
||||||
b_values = values.reshape(-1)
|
b_values = values.reshape(-1)
|
||||||
|
b_size = b_obs.size()[0]
|
||||||
# Optimizing the policy and value network
|
# Optimizing the policy and value network
|
||||||
b_inds = np.arange(args.batch_size)
|
b_inds = np.arange(b_size)
|
||||||
# clipfracs = []
|
# clipfracs = []
|
||||||
for epoch in range(args.epochs):
|
for epoch in range(args.epochs):
|
||||||
# shuffle all datasets
|
# shuffle all datasets
|
||||||
np.random.shuffle(b_inds)
|
np.random.shuffle(b_inds)
|
||||||
for start in range(0, args.batch_size, args.minibatch_size):
|
for start in range(0, b_size, args.minibatchSize):
|
||||||
end = start + args.minibatch_size
|
end = start + args.minibatchSize
|
||||||
mb_inds = b_inds[start:end]
|
mb_inds = b_inds[start:end]
|
||||||
mb_advantages = b_advantages[mb_inds]
|
mb_advantages = b_advantages[mb_inds]
|
||||||
|
|
||||||
@@ -484,12 +599,12 @@ if __name__ == "__main__":
|
|||||||
"charts/SPS", int(global_step / (time.time() - start_time)), global_step
|
"charts/SPS", int(global_step / (time.time() - start_time)), global_step
|
||||||
)
|
)
|
||||||
writer.add_scalar("charts/Reward", rewardsMean, global_step)
|
writer.add_scalar("charts/Reward", rewardsMean, global_step)
|
||||||
writer.add_scalar("charts/GoWinRatio", WinRounds["Go"]/TotalRounds["Go"] if TotalRounds["Go"] != 0 else 0, global_step)
|
writer.add_scalar("charts/GoWinRatio", WinRounds["Go"]/TotalRounds["Go"], global_step)
|
||||||
writer.add_scalar("charts/AttackWinRatio", WinRounds["Attack"]/TotalRounds["Attack"] if TotalRounds["Attack"] != 0 else 0, global_step)
|
writer.add_scalar("charts/AttackWinRatio", WinRounds["Attack"]/TotalRounds["Attack"], global_step)
|
||||||
writer.add_scalar("charts/FreeWinRatio", WinRounds["Free"]/TotalRounds["Free"] if TotalRounds["Free"] != 0 else 0, global_step)
|
writer.add_scalar("charts/FreeWinRatio", WinRounds["Free"]/TotalRounds["Free"], global_step)
|
||||||
if rewardsMean > bestReward:
|
if rewardsMean > bestReward:
|
||||||
bestReward = rewardsMean
|
bestReward = rewardsMean
|
||||||
saveDir = "../PPO-Model/bigArea-384-128-hybrid-" + str(rewardsMean) + ".pt"
|
saveDir = "../PPO-Model/Target-700-500-256-hybrid-" + str(rewardsMean) + ".pt"
|
||||||
torch.save(agent, saveDir)
|
torch.save(agent, saveDir)
|
||||||
|
|
||||||
env.close()
|
env.close()
|
||||||
|
|||||||
@@ -434,41 +434,292 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 6,
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"import torch\n",
|
||||||
|
"import numpy as np\n",
|
||||||
|
"\n",
|
||||||
|
"x = torch.randn(2, 3).to(\"cuda\")\n",
|
||||||
|
"print(x)\n",
|
||||||
|
"print(torch.cat((x, x, x), 0))\n",
|
||||||
|
"print(torch.cat((x, x, x), 1))\n",
|
||||||
|
"\n",
|
||||||
|
"aa = torch.empty(0).to(\"cuda\")\n",
|
||||||
|
"torch.cat([aa,x])\n",
|
||||||
|
"bb = [[]]*2\n",
|
||||||
|
"print(bb)\n",
|
||||||
|
"bb.append(x.to(\"cpu\").tolist())\n",
|
||||||
|
"bb.append(x.to(\"cpu\").tolist())\n",
|
||||||
|
"print(bb)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 64,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [
|
"outputs": [
|
||||||
{
|
{
|
||||||
"name": "stdout",
|
"name": "stdout",
|
||||||
"output_type": "stream",
|
"output_type": "stream",
|
||||||
"text": [
|
"text": [
|
||||||
"x : torch.Size([2, 3, 4])\n",
|
"tensor([[-1.1090, 0.4686, 0.6883],\n",
|
||||||
"x : torch.Size([6, 2, 3, 4])\n",
|
" [-0.1862, -0.3943, -0.0202],\n",
|
||||||
"x : torch.Size([6, 2, 3, 4])\n"
|
" [ 0.1436, -0.9444, -1.2079],\n",
|
||||||
|
" [-2.9434, -2.5989, -0.6653],\n",
|
||||||
|
" [ 0.4668, 0.8548, -0.4641],\n",
|
||||||
|
" [-0.3956, -0.2832, -0.1889],\n",
|
||||||
|
" [-0.2801, -0.2092, 1.7254],\n",
|
||||||
|
" [ 2.7938, -0.7742, 0.7053]], device='cuda:0')\n",
|
||||||
|
"(8, 0)\n",
|
||||||
|
"---\n",
|
||||||
|
"[[array([-1.1090169, 0.4685607, 0.6883437], dtype=float32)], [array([-0.1861974 , -0.39429024, -0.02016036], dtype=float32)], [array([ 0.14360362, -0.9443668 , -1.2079065 ], dtype=float32)], [array([-2.9433894 , -2.598913 , -0.66532046], dtype=float32)], [array([ 0.46684313, 0.8547877 , -0.46408093], dtype=float32)], [array([-0.39563984, -0.2831819 , -0.18891 ], dtype=float32)], [array([-0.28008553, -0.20918302, 1.7253567 ], dtype=float32)], [array([ 2.7938051, -0.7742478, 0.705279 ], dtype=float32)]]\n",
|
||||||
|
"[[array([-1.1090169, 0.4685607, 0.6883437], dtype=float32)], [], [array([ 0.14360362, -0.9443668 , -1.2079065 ], dtype=float32)], [array([-2.9433894 , -2.598913 , -0.66532046], dtype=float32)], [array([ 0.46684313, 0.8547877 , -0.46408093], dtype=float32)], [array([-0.39563984, -0.2831819 , -0.18891 ], dtype=float32)], [array([-0.28008553, -0.20918302, 1.7253567 ], dtype=float32)], [array([ 2.7938051, -0.7742478, 0.705279 ], dtype=float32)]]\n",
|
||||||
|
"---\n",
|
||||||
|
"[array([-1.1090169, 0.4685607, 0.6883437], dtype=float32), array([-1.1090169, 0.4685607, 0.6883437], dtype=float32)]\n",
|
||||||
|
"vvv tensor([[-1.1090, 0.4686, 0.6883],\n",
|
||||||
|
" [-1.1090, 0.4686, 0.6883]], device='cuda:0')\n",
|
||||||
|
"tensor([[-1.1090, 0.4686, 0.6883],\n",
|
||||||
|
" [-1.1090, 0.4686, 0.6883]], device='cuda:0')\n"
|
||||||
]
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"True"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 64,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"import numpy as np\n",
|
||||||
|
"import torch\n",
|
||||||
|
"\n",
|
||||||
|
"agent_num = 8\n",
|
||||||
|
"ob_buffer = [[]for i in range(agent_num)]\n",
|
||||||
|
"obs = torch.randn(8, 3).to(\"cuda\")\n",
|
||||||
|
"print(obs)\n",
|
||||||
|
"print(np.shape(np.array(ob_buffer)))\n",
|
||||||
|
"print('---')\n",
|
||||||
|
"obs_cpu = obs.to(\"cpu\").numpy()\n",
|
||||||
|
"for i in range(agent_num):\n",
|
||||||
|
" ob_buffer[i].append(obs_cpu[i])\n",
|
||||||
|
"print(ob_buffer)\n",
|
||||||
|
"ob_buffer[1] = []\n",
|
||||||
|
"print(ob_buffer)\n",
|
||||||
|
"print('---')\n",
|
||||||
|
"for i in range(agent_num):\n",
|
||||||
|
" ob_buffer[i].append(obs_cpu[i])\n",
|
||||||
|
"print(ob_buffer[0])\n",
|
||||||
|
"vvv = torch.tensor(ob_buffer[0]).to(\"cuda\")\n",
|
||||||
|
"print(\"vvv\",vvv)\n",
|
||||||
|
"empt = torch.tensor([]).to(\"cuda\")\n",
|
||||||
|
"vvvv = torch.cat((empt,vvv),0)\n",
|
||||||
|
"print(vvvv)\n",
|
||||||
|
"vvvv.size()[0]>0"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 6,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"from AimbotEnv import Aimbot\n",
|
||||||
|
"from enum import Enum\n",
|
||||||
|
"import uuid\n",
|
||||||
|
"from mlagents_envs.side_channel.side_channel import (\n",
|
||||||
|
" SideChannel,\n",
|
||||||
|
" IncomingMessage,\n",
|
||||||
|
" OutgoingMessage,\n",
|
||||||
|
")\n",
|
||||||
|
"from typing import List\n",
|
||||||
|
"\n",
|
||||||
|
"class Targets(Enum):\n",
|
||||||
|
" Free = 0\n",
|
||||||
|
" Go = 1\n",
|
||||||
|
" Attack = 2\n",
|
||||||
|
" Num = 3\n",
|
||||||
|
"TotalRounds = {\"Go\":0,\"Attack\":0,\"Free\":0}\n",
|
||||||
|
"WinRounds = {\"Go\":0,\"Attack\":0,\"Free\":0}\n",
|
||||||
|
"\n",
|
||||||
|
"class AimbotSideChannel(SideChannel):\n",
|
||||||
|
" def __init__(self, channel_id: uuid.UUID) -> None:\n",
|
||||||
|
" super().__init__(channel_id)\n",
|
||||||
|
" def on_message_received(self, msg: IncomingMessage) -> None:\n",
|
||||||
|
" \"\"\"\n",
|
||||||
|
" Note: We must implement this method of the SideChannel interface to\n",
|
||||||
|
" receive messages from Unity\n",
|
||||||
|
" \"\"\"\n",
|
||||||
|
" thisMessage = msg.read_string()\n",
|
||||||
|
" #print(thisMessage)\n",
|
||||||
|
" thisResult = thisMessage.split(\"|\")\n",
|
||||||
|
" if(thisResult[0] == \"result\"):\n",
|
||||||
|
" TotalRounds[thisResult[1]]+=1\n",
|
||||||
|
" if(thisResult[2] == \"Win\"):\n",
|
||||||
|
" WinRounds[thisResult[1]]+=1\n",
|
||||||
|
" #print(TotalRounds)\n",
|
||||||
|
" #print(WinRounds)\n",
|
||||||
|
" elif(thisResult[0] == \"Error\"):\n",
|
||||||
|
" print(thisMessage)\n",
|
||||||
|
"\t# 发送函数\n",
|
||||||
|
" def send_string(self, data: str) -> None:\n",
|
||||||
|
" \"\"\"发送一个字符串给C#\"\"\"\n",
|
||||||
|
" msg = OutgoingMessage()\n",
|
||||||
|
" msg.write_string(data)\n",
|
||||||
|
" super().queue_message_to_send(msg)\n",
|
||||||
|
"\n",
|
||||||
|
" def send_bool(self, data: bool) -> None:\n",
|
||||||
|
" msg = OutgoingMessage()\n",
|
||||||
|
" msg.write_bool(data)\n",
|
||||||
|
" super().queue_message_to_send(msg)\n",
|
||||||
|
"\n",
|
||||||
|
" def send_int(self, data: int) -> None:\n",
|
||||||
|
" msg = OutgoingMessage()\n",
|
||||||
|
" msg.write_int32(data)\n",
|
||||||
|
" super().queue_message_to_send(msg)\n",
|
||||||
|
"\n",
|
||||||
|
" def send_float(self, data: float) -> None:\n",
|
||||||
|
" msg = OutgoingMessage()\n",
|
||||||
|
" msg.write_float32(data)\n",
|
||||||
|
" super().queue_message_to_send(msg)\n",
|
||||||
|
"\n",
|
||||||
|
" def send_float_list(self, data: List[float]) -> None:\n",
|
||||||
|
" msg = OutgoingMessage()\n",
|
||||||
|
" msg.write_float32_list(data)\n",
|
||||||
|
" super().queue_message_to_send(msg)\n",
|
||||||
|
" \n",
|
||||||
|
"SIDE_CHANNEL_UUID = uuid.UUID(\"8bbfb62a-99b4-457c-879d-b78b69066b5e\")\n",
|
||||||
|
"ENV_PATH = \"../Build/Build-ParallelEnv-Target-OffPolicy-SingleStack-SideChannel-EndReward/Aimbot-ParallelEnv\"\n",
|
||||||
|
"aimBotsideChannel = AimbotSideChannel(SIDE_CHANNEL_UUID)\n",
|
||||||
|
"env = Aimbot(envPath=ENV_PATH, workerID=123, basePort=999,side_channels=[aimBotsideChannel])"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 2,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"import numpy as np\n",
|
||||||
|
"import torch\n",
|
||||||
|
"import torch.nn as nn\n",
|
||||||
|
"import torch.optim as optim\n",
|
||||||
|
"from AimbotEnv import Aimbot\n",
|
||||||
|
"from torch.distributions.normal import Normal\n",
|
||||||
|
"from torch.distributions.categorical import Categorical\n",
|
||||||
|
"device = torch.device(\"cuda\" if torch.cuda.is_available() and True else \"cpu\")\n",
|
||||||
|
"\n",
|
||||||
|
"def layer_init(layer, std=np.sqrt(2), bias_const=0.0):\n",
|
||||||
|
" torch.nn.init.orthogonal_(layer.weight, std)\n",
|
||||||
|
" torch.nn.init.constant_(layer.bias, bias_const)\n",
|
||||||
|
" return layer\n",
|
||||||
|
"\n",
|
||||||
|
"class PPOAgent(nn.Module):\n",
|
||||||
|
" def __init__(self, env: Aimbot,targetNum:int):\n",
|
||||||
|
" super(PPOAgent, self).__init__()\n",
|
||||||
|
" self.stateSize = env.unity_observation_shape[0]\n",
|
||||||
|
"\n",
|
||||||
|
" self.discrete_size = env.unity_discrete_size\n",
|
||||||
|
" self.discrete_shape = list(env.unity_discrete_branches)\n",
|
||||||
|
" self.continuous_size = env.unity_continuous_size\n",
|
||||||
|
"\n",
|
||||||
|
" self.network = nn.Sequential(\n",
|
||||||
|
" layer_init(nn.Linear(env.unity_observation_shape[0], 300)),\n",
|
||||||
|
" nn.Tanh(),\n",
|
||||||
|
" layer_init(nn.Linear(300, 200)),\n",
|
||||||
|
" nn.Tanh(),\n",
|
||||||
|
" )\n",
|
||||||
|
" self.actor_dis = layer_init(nn.Linear(200, self.discrete_size), std=0.5)\n",
|
||||||
|
" self.actor_mean = layer_init(nn.Linear(200, self.continuous_size), std=0.5)\n",
|
||||||
|
" self.actor_logstd = nn.Parameter(torch.zeros(1, self.continuous_size))\n",
|
||||||
|
" self.critic = layer_init(nn.Linear(200, 1), std=1)\n",
|
||||||
|
"\n",
|
||||||
|
" def get_value(self, state: torch.Tensor):\n",
|
||||||
|
" return self.critic(self.network(state))\n",
|
||||||
|
"\n",
|
||||||
|
" def get_actions_value(self, state: torch.Tensor, actions=None):\n",
|
||||||
|
" hidden = self.network(state)\n",
|
||||||
|
"\n",
|
||||||
|
" # discrete\n",
|
||||||
|
" dis_logits = self.actor_dis(hidden)\n",
|
||||||
|
" split_logits = torch.split(dis_logits, self.discrete_shape, dim=1)\n",
|
||||||
|
" multi_categoricals = [Categorical(logits=thisLogits) for thisLogits in split_logits]\n",
|
||||||
|
" # continuous\n",
|
||||||
|
" actions_mean = self.actor_mean(hidden)\n",
|
||||||
|
" action_logstd = self.actor_logstd.expand_as(actions_mean)\n",
|
||||||
|
" action_std = torch.exp(action_logstd)\n",
|
||||||
|
" con_probs = Normal(actions_mean, action_std)\n",
|
||||||
|
"\n",
|
||||||
|
" if actions is None:\n",
|
||||||
|
" if True:\n",
|
||||||
|
" # select actions base on probability distribution model\n",
|
||||||
|
" disAct = torch.stack([ctgr.sample() for ctgr in multi_categoricals])\n",
|
||||||
|
" conAct = con_probs.sample()\n",
|
||||||
|
" actions = torch.cat([disAct.T, conAct], dim=1)\n",
|
||||||
|
" else:\n",
|
||||||
|
" # select actions base on best probability distribution\n",
|
||||||
|
" disAct = torch.stack([torch.argmax(logit, dim=1) for logit in split_logits])\n",
|
||||||
|
" conAct = actions_mean\n",
|
||||||
|
" actions = torch.cat([disAct.T, conAct], dim=1)\n",
|
||||||
|
" else:\n",
|
||||||
|
" disAct = actions[:, 0 : env.unity_discrete_type].T\n",
|
||||||
|
" conAct = actions[:, env.unity_discrete_type :]\n",
|
||||||
|
" dis_log_prob = torch.stack(\n",
|
||||||
|
" [ctgr.log_prob(act) for act, ctgr in zip(disAct, multi_categoricals)]\n",
|
||||||
|
" )\n",
|
||||||
|
" dis_entropy = torch.stack([ctgr.entropy() for ctgr in multi_categoricals])\n",
|
||||||
|
" return (\n",
|
||||||
|
" actions,\n",
|
||||||
|
" dis_log_prob.sum(0),\n",
|
||||||
|
" dis_entropy.sum(0),\n",
|
||||||
|
" con_probs.log_prob(conAct).sum(1),\n",
|
||||||
|
" con_probs.entropy().sum(1),\n",
|
||||||
|
" self.critic(hidden),\n",
|
||||||
|
" )"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 3,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"ppp = \"../Build/Build-ParallelEnv-Target-OffPolicy-SingleStack-SideChannel-EndReward-Easy-V2.7-FreeOnly-NormalMapSize/Aimbot-ParallelEnv\"\n",
|
||||||
|
"env = Aimbot(envPath=ppp, workerID=1, basePort=1000,side_channels=[])\n",
|
||||||
|
"agent_list = []\n",
|
||||||
|
"optimizers = []\n",
|
||||||
|
"for i in range(3):\n",
|
||||||
|
" agent_list.append(PPOAgent(env=env,targetNum=3).to('cuda'))\n",
|
||||||
|
" optimizers.append(optim.Adam(agent_list[i].parameters(),lr=1e-4))"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 5,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"tensor([1., 2., 3., 4., 5.])"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 5,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"source": [
|
"source": [
|
||||||
"import torch\n",
|
"import torch\n",
|
||||||
"#1\n",
|
|
||||||
"x = torch.randn(2, 1, 1)#为1可以扩展为3和4\n",
|
|
||||||
"x = x.expand(2, 3, 4)\n",
|
|
||||||
"print('x :', x.size())\n",
|
|
||||||
"\n",
|
"\n",
|
||||||
"#2\n",
|
"aaa = torch.zeros((8,5))\n",
|
||||||
"#扩展一个新的维度必须在最前面,否则会报错\n",
|
"aaa[0] = torch.Tensor([1,2,3,4,5])\n",
|
||||||
"#x = x.expand(2, 3, 4, 6)\n",
|
"aaa[0]"
|
||||||
"\n",
|
|
||||||
"x = x.expand(6, 2, 3, 4)\n",
|
|
||||||
"print('x :', x.size())\n",
|
|
||||||
"\n",
|
|
||||||
"#3\n",
|
|
||||||
"#某一个维度为-1表示不改变该维度的大小\n",
|
|
||||||
"x = x.expand(6, -1, -1, -1)\n",
|
|
||||||
"print('x :', x.size())\n",
|
|
||||||
"\n",
|
|
||||||
"x : torch.Size([2, 3, 4])\n",
|
|
||||||
"x : torch.Size([6, 2, 3, 4])\n",
|
|
||||||
"x : torch.Size([6, 2, 3, 4])"
|
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
|
|||||||
Reference in New Issue
Block a user