8 Commits

Author SHA1 Message Date
Koha9 34206b95c5 Multi Agent Each Type Action Select Style
Multi Agent Each Type Action Select Style.
waste too much time
2022-12-14 09:01:29 +09:00
Koha9 1787872e82 wrong remain Time Fix
wrong remain Time Fix, what a stupid mistake...
and fix doubled WANDB writer
2022-12-04 09:20:05 +09:00
Koha9 ad9817e7a4 Totally disparate NN by target
Totally disparate NN by target.
2022-12-03 21:35:33 +09:00
Koha9 cbc385ca10 Change training dataset storage method
save training dataset by it target type.
while training NN use single target training set to backward NN.
this improve at least 20 times faster than last update!
2022-12-03 07:54:38 +09:00
Koha9 895cd5c118 Add EndReward Broadcast function
while game over add remaintime/15 to every step's rewards. to improve this round's training weight.
fix get target from states still using onehot decoder bug.
2022-12-03 03:58:19 +09:00
Koha9 3930bcd953 Add Multi-NN agent
Add Multi neural network in output layer
use different nn while facing to different target.
2022-12-01 19:55:51 +09:00
Koha9 5631569b31 Side Channel added
add side Channel to save target win ratio. 
Fix some Bug
2022-11-30 06:45:07 +09:00
Koha9 32d398dbef Change Learning timing
change learning timing to each episode end.
2022-11-16 19:40:57 +09:00
4 changed files with 1189 additions and 113 deletions
+2 -1
View File
@@ -83,4 +83,5 @@ crashlytics-build.properties
/Aimbot-PPO-Python/Backup/
/Aimbot-PPO-Python/Build/
/Aimbot-PPO-Python/PPO-Model/
/Aimbot-PPO-Python/GAIL-Expert-Data/
/Aimbot-PPO-Python/GAIL-Expert-Data/
/Aimbot-PPO-Python/runs/
+709
View File
@@ -0,0 +1,709 @@
import argparse
import wandb
import time
import numpy as np
import random
import uuid
import torch
import torch.nn as nn
import torch.optim as optim
from AimbotEnv import Aimbot
from tqdm import tqdm
from enum import Enum
from torch.distributions.normal import Normal
from torch.distributions.categorical import Categorical
from distutils.util import strtobool
from torch.utils.tensorboard import SummaryWriter
from mlagents_envs.environment import UnityEnvironment
from mlagents_envs.side_channel.side_channel import (
SideChannel,
IncomingMessage,
OutgoingMessage,
)
from typing import List
bestReward = -1
DEFAULT_SEED = 9331
ENV_PATH = "../Build/Build-ParallelEnv-Target-OffPolicy-SingleStack-SideChannel-EndReward-Easy-V2.7-FreeOnly-NormalMapSize/Aimbot-ParallelEnv"
SIDE_CHANNEL_UUID = uuid.UUID("8bbfb62a-99b4-457c-879d-b78b69066b5e")
WAND_ENTITY = "koha9"
WORKER_ID = 3
BASE_PORT = 1002
# max round steps per agent is 2500/Decision_period, 25 seconds
# !!!check every parameters before run!!!
TOTAL_STEPS = 3150000
BATCH_SIZE = 1024
MAX_TRAINNING_DATASETS = 6000
DECISION_PERIOD = 1
LEARNING_RATE = 5e-4
GAMMA = 0.99
GAE_LAMBDA = 0.95
EPOCHS = 3
CLIP_COEF = 0.11
LOSS_COEF = [1.0, 1.0, 1.0, 1.0] # free go attack defence
POLICY_COEF = [1.0, 1.0, 1.0, 1.0]
ENTROPY_COEF = [0.1, 0.1, 0.1, 0.1]
CRITIC_COEF = [0.5, 0.5, 0.5, 0.5]
TARGET_LEARNING_RATE = 1e-6
ANNEAL_LEARNING_RATE = True
CLIP_VLOSS = True
NORM_ADV = True
TRAIN = True
WANDB_TACK = False
LOAD_DIR = None
#LOAD_DIR = "../PPO-Model/Aimbot_Target_Hybrid_PMNN_V2_OffPolicy_EndBC_9331_1670522099-freeonly-12/Aimbot-target-last.pt"
# public data
class Targets(Enum):
Free = 0
Go = 1
Attack = 2
Defence = 3
Num = 4
TARGET_STATE_SIZE = 6
INAREA_STATE_SIZE = 1
TIME_STATE_SIZE = 1
GUN_STATE_SIZE = 1
MY_STATE_SIZE = 4
TOTAL_T_SIZE = TARGET_STATE_SIZE+INAREA_STATE_SIZE+TIME_STATE_SIZE+GUN_STATE_SIZE+MY_STATE_SIZE
BASE_WINREWARD = 999
BASE_LOSEREWARD = -999
TARGETNUM= 4
ENV_TIMELIMIT = 30
RESULT_BROADCAST_RATIO = 1/ENV_TIMELIMIT
TotalRounds = {"Free":0,"Go":0,"Attack":0}
WinRounds = {"Free":0,"Go":0,"Attack":0}
# !!!SPECIAL PARAMETERS!!!
# change it while program is finished
using_targets_num = 3
def parse_args():
# fmt: off
# pytorch and environment parameters
parser = argparse.ArgumentParser()
parser.add_argument("--seed", type=int, default=DEFAULT_SEED,
help="seed of the experiment")
parser.add_argument("--path", type=str, default=ENV_PATH,
help="enviroment path")
parser.add_argument("--workerID", type=int, default=WORKER_ID,
help="unity worker ID")
parser.add_argument("--baseport", type=int, default=BASE_PORT,
help="port to connect to Unity environment")
parser.add_argument("--lr", type=float, default=LEARNING_RATE,
help="the learning rate of optimizer")
parser.add_argument("--cuda", type=lambda x: bool(strtobool(x)), default=True, nargs="?", const=True,
help="if toggled, cuda will be enabled by default")
parser.add_argument("--total-timesteps", type=int, default=TOTAL_STEPS,
help="total timesteps of the experiments")
# model parameters
parser.add_argument("--train",type=lambda x: bool(strtobool(x)), default=TRAIN, nargs="?", const=True,
help="Train Model or not")
parser.add_argument("--datasetSize", type=int, default=MAX_TRAINNING_DATASETS,
help="training dataset size,start training while dataset collect enough data")
parser.add_argument("--minibatchSize", type=int, default=BATCH_SIZE,
help="nimi batch size")
parser.add_argument("--epochs", type=int, default=EPOCHS,
help="the K epochs to update the policy")
parser.add_argument("--annealLR", type=lambda x: bool(strtobool(x)), default=ANNEAL_LEARNING_RATE, nargs="?", const=True,
help="Toggle learning rate annealing for policy and value networks")
parser.add_argument("--wandb-track", type=lambda x: bool(strtobool(x)), default=WANDB_TACK, nargs="?", const=True,
help="track on the wandb")
parser.add_argument("--wandb-entity", type=str, default=WAND_ENTITY,
help="the entity (team) of wandb's project")
parser.add_argument("--load-dir", type=str, default=LOAD_DIR,
help="load model directory")
parser.add_argument("--decision-period", type=int, default=DECISION_PERIOD,
help="the number of steps to run in each environment per policy rollout")
parser.add_argument("--result-broadcast-ratio", type=float, default=RESULT_BROADCAST_RATIO,
help="broadcast result when win round is reached,r=result-broadcast-ratio*remainTime")
# GAE loss
parser.add_argument("--gae", type=lambda x: bool(strtobool(x)), default=True, nargs="?", const=True,
help="Use GAE for advantage computation")
parser.add_argument("--norm-adv", type=lambda x: bool(strtobool(x)), default=NORM_ADV, nargs="?", const=True,
help="Toggles advantages normalization")
parser.add_argument("--gamma", type=float, default=GAMMA,
help="the discount factor gamma")
parser.add_argument("--gaeLambda", type=float, default=GAE_LAMBDA,
help="the lambda for the general advantage estimation")
parser.add_argument("--clip-coef", type=float, default=CLIP_COEF,
help="the surrogate clipping coefficient")
parser.add_argument("--policy-coef", type=float, default=POLICY_COEF,
help="coefficient of the policy")
parser.add_argument("--ent-coef", type=float, default=ENTROPY_COEF,
help="coefficient of the entropy")
parser.add_argument("--critic-coef", type=float, default=CRITIC_COEF,
help="coefficient of the value function")
parser.add_argument("--clip-vloss", type=lambda x: bool(strtobool(x)), default=CLIP_VLOSS, nargs="?", const=True,
help="Toggles whether or not to use a clipped loss for the value function, as per the paper.")
parser.add_argument("--max-grad-norm", type=float, default=0.5,
help="the maximum norm for the gradient clipping")
parser.add_argument("--target-kl", type=float, default=None,
help="the target KL divergence threshold")
# fmt: on
args = parser.parse_args()
return args
def layer_init(layer, std=np.sqrt(2), bias_const=0.0):
torch.nn.init.orthogonal_(layer.weight, std)
torch.nn.init.constant_(layer.bias, bias_const)
return layer
class PPOAgent(nn.Module):
def __init__(self, env: Aimbot,targetNum:int):
super(PPOAgent, self).__init__()
self.targetNum = targetNum
self.stateSize = env.unity_observation_shape[0]
self.targetSize = TARGET_STATE_SIZE
self.timeSize = TIME_STATE_SIZE
self.gunSize = GUN_STATE_SIZE
self.myStateSize = MY_STATE_SIZE
self.totalMiddleSize = TOTAL_T_SIZE
self.head_input_size = env.unity_observation_shape[0] - self.targetSize-self.timeSize-self.gunSize# except target state input
self.discrete_size = env.unity_discrete_size
self.discrete_shape = list(env.unity_discrete_branches)
self.continuous_size = env.unity_continuous_size
self.network = nn.Sequential(
layer_init(nn.Linear(env.unity_observation_shape[0], 300)),
nn.Tanh(),
layer_init(nn.Linear(300, 200)),
nn.Tanh(),
)
self.actor_dis = layer_init(nn.Linear(200, self.discrete_size), std=0.5)
self.actor_mean = layer_init(nn.Linear(200, self.continuous_size), std=0.5)
self.actor_logstd = nn.Parameter(torch.zeros(1, self.continuous_size))
self.critic = layer_init(nn.Linear(200, 1), std=1)
def get_value(self, state: torch.Tensor):
return self.critic(self.network(state))
def get_actions_value(self, state: torch.Tensor, actions=None):
hidden = self.network(state)
# discrete
dis_logits = self.actor_dis(hidden)
split_logits = torch.split(dis_logits, self.discrete_shape, dim=1)
multi_categoricals = [Categorical(logits=thisLogits) for thisLogits in split_logits]
# continuous
actions_mean = self.actor_mean(hidden)
action_logstd = self.actor_logstd.expand_as(actions_mean)
action_std = torch.exp(action_logstd)
con_probs = Normal(actions_mean, action_std)
if actions is None:
if args.train:
# select actions base on probability distribution model
disAct = torch.stack([ctgr.sample() for ctgr in multi_categoricals])
conAct = con_probs.sample()
actions = torch.cat([disAct.T, conAct], dim=1)
else:
# select actions base on best probability distribution
disAct = torch.stack([torch.argmax(logit, dim=1) for logit in split_logits])
conAct = actions_mean
actions = torch.cat([disAct.T, conAct], dim=1)
else:
disAct = actions[:, 0 : env.unity_discrete_type].T
conAct = actions[:, env.unity_discrete_type :]
dis_log_prob = torch.stack(
[ctgr.log_prob(act) for act, ctgr in zip(disAct, multi_categoricals)]
)
dis_entropy = torch.stack([ctgr.entropy() for ctgr in multi_categoricals])
return (
actions,
dis_log_prob.sum(0),
dis_entropy.sum(0),
con_probs.log_prob(conAct).sum(1),
con_probs.entropy().sum(1),
self.critic(hidden),
)
def GAE(agent, args, rewards, dones, values, next_obs, next_done):
# GAE
with torch.no_grad():
next_value = agent.get_value(next_obs).reshape(1, -1)
data_size = rewards.size()[0]
if args.gae:
advantages = torch.zeros_like(rewards).to(device)
lastgaelam = 0
for t in reversed(range(data_size)):
if t == data_size - 1:
nextnonterminal = 1.0 - next_done
nextvalues = next_value
else:
nextnonterminal = 1.0 - dones[t + 1]
nextvalues = values[t + 1]
delta = rewards[t] + args.gamma * nextvalues * nextnonterminal - values[t]
advantages[t] = lastgaelam = (
delta + args.gamma * args.gaeLambda * nextnonterminal * lastgaelam
)
returns = advantages + values
else:
returns = torch.zeros_like(rewards).to(device)
for t in reversed(range(data_size)):
if t == data_size - 1:
nextnonterminal = 1.0 - next_done
next_return = next_value
else:
nextnonterminal = 1.0 - dones[t + 1]
next_return = returns[t + 1]
returns[t] = rewards[t] + args.gamma * nextnonterminal * next_return
advantages = returns - values
return advantages, returns
class AimbotSideChannel(SideChannel):
def __init__(self, channel_id: uuid.UUID) -> None:
super().__init__(channel_id)
def on_message_received(self, msg: IncomingMessage) -> None:
"""
Note: We must implement this method of the SideChannel interface to
receive messages from Unity
"""
thisMessage = msg.read_string()
# print(thisMessage)
thisResult = thisMessage.split("|")
if(thisResult[0] == "result"):
TotalRounds[thisResult[1]]+=1
if(thisResult[2] == "Win"):
WinRounds[thisResult[1]]+=1
#print(TotalRounds)
#print(WinRounds)
elif(thisResult[0] == "Error"):
print(thisMessage)
# 发送函数
def send_string(self, data: str) -> None:
# send a string toC#
msg = OutgoingMessage()
msg.write_string(data)
super().queue_message_to_send(msg)
def send_bool(self, data: bool) -> None:
msg = OutgoingMessage()
msg.write_bool(data)
super().queue_message_to_send(msg)
def send_int(self, data: int) -> None:
msg = OutgoingMessage()
msg.write_int32(data)
super().queue_message_to_send(msg)
def send_float(self, data: float) -> None:
msg = OutgoingMessage()
msg.write_float32(data)
super().queue_message_to_send(msg)
def send_float_list(self, data: List[float]) -> None:
msg = OutgoingMessage()
msg.write_float32_list(data)
super().queue_message_to_send(msg)
def broadCastEndReward(rewardBF:list,remainTime:float):
thisRewardBF = rewardBF
if (rewardBF[-1]<=-500):
# print("Lose DO NOT BROAD CAST",rewardBF[-1])
thisRewardBF[-1] = rewardBF[-1]-BASE_LOSEREWARD
thisRewardBF = thisRewardBF
elif (rewardBF[-1]>=500):
# print("Win! Broadcast reward!",rewardBF[-1])
thisRewardBF[-1] = rewardBF[-1]-BASE_WINREWARD
thisRewardBF = (np.asarray(thisRewardBF)+(remainTime*args.result_broadcast_ratio)).tolist()
else:
print("!!!!!DIDNT GET RESULT REWARD!!!!!!",rewardBF[-1])
return torch.Tensor(thisRewardBF).to(device)
if __name__ == "__main__":
args = parse_args()
random.seed(args.seed)
np.random.seed(args.seed)
torch.manual_seed(args.seed)
device = torch.device("cuda" if torch.cuda.is_available() and args.cuda else "cpu")
# Initialize environment anget optimizer
aimBotsideChannel = AimbotSideChannel(SIDE_CHANNEL_UUID);
env = Aimbot(envPath=args.path, workerID=args.workerID, basePort=args.baseport,side_channels=[aimBotsideChannel])
agentList = []
optimizers = []
if args.load_dir is None:
for i in range(using_targets_num):
agentList.append(PPOAgent(env,TARGETNUM).to(device))
optimizers.append(optim.Adam(agentList[i].parameters(), lr=args.lr, eps=1e-5))
else:
print("NAH")
# !!!not finished
# agent = torch.load(args.load_dir)
# print("Load Agent", args.load_dir)
# print(agent.eval())
# Tensorboard and WandB Recorder
game_name = "Aimbot_Target_Hybrid_PMNN_V2"
game_type = "OffPolicy_EndBC"
run_name = f"{game_name}_{game_type}_{args.seed}_{int(time.time())}"
if args.wandb_track:
wandb.init(
project=game_name,
entity=args.wandb_entity,
sync_tensorboard=True,
config=vars(args),
name=run_name,
monitor_gym=True,
save_code=True,
)
writer = SummaryWriter(f"runs/{run_name}")
writer.add_text(
"hyperparameters",
"|param|value|\n|-|-|\n%s"
% ("\n".join([f"|{key}|{value}|" for key, value in vars(args).items()])),
)
# Trajectory Buffer
ob_bf = [[] for i in range(env.unity_agent_num)]
act_bf = [[] for i in range(env.unity_agent_num)]
dis_logprobs_bf = [[] for i in range(env.unity_agent_num)]
con_logprobs_bf = [[] for i in range(env.unity_agent_num)]
rewards_bf = [[] for i in range(env.unity_agent_num)]
dones_bf = [[] for i in range(env.unity_agent_num)]
values_bf = [[] for i in range(env.unity_agent_num)]
# start the game
total_update_step = using_targets_num * args.total_timesteps // args.datasetSize
target_steps = [0 for i in range(TARGETNUM)]
start_time = time.time()
state, _, done = env.reset()
# state = torch.Tensor(next_obs).to(device)
# next_done = torch.zeros(env.unity_agent_num).to(device)
# initialize empty training datasets
obs = [torch.tensor([]).to(device) for i in range(TARGETNUM)] # (TARGETNUM,n,env.unity_observation_size)
actions = [torch.tensor([]).to(device) for i in range(TARGETNUM)] # (TARGETNUM,n,env.unity_action_size)
dis_logprobs = [torch.tensor([]).to(device) for i in range(TARGETNUM)] # (TARGETNUM,n,1)
con_logprobs = [torch.tensor([]).to(device) for i in range(TARGETNUM)] # (TARGETNUM,n,1)
rewards = [torch.tensor([]).to(device) for i in range(TARGETNUM)] # (TARGETNUM,n,1)
values = [torch.tensor([]).to(device) for i in range(TARGETNUM)] # (TARGETNUM,n,1)
advantages = [torch.tensor([]).to(device) for i in range(TARGETNUM)] # (TARGETNUM,n,1)
returns = [torch.tensor([]).to(device) for i in range(TARGETNUM)] # (TARGETNUM,n,1)
for total_steps in range(total_update_step):
# discunt learning rate, while step == total_update_step lr will be 0
if args.annealLR:
finalRatio = TARGET_LEARNING_RATE/args.lr
frac = 1.0 - ((total_steps + 1.0) / total_update_step)
lrnow = frac * args.lr
for optimizer in optimizers:
optimizer.param_groups[0]["lr"] = lrnow
else:
lrnow = args.lr
print("new episode",total_steps,"learning rate = ",lrnow)
# MAIN LOOP: run agent in environment
step = 0
training = False
trainQueue = []
last_reward = [0.for i in range(env.unity_agent_num)]
action = torch.zeros((env.unity_agent_num,env.unity_discrete_type+env.unity_continuous_size))
dis_logprob = torch.zeros((env.unity_agent_num,env.unity_discrete_size))
con_logprob = torch.zeros((env.unity_agent_num,env.unity_continuous_size))
value = torch.zeros((env.unity_agent_num,1))
while True:
if step % args.decision_period == 0:
step += 1
# Choose action by agent
with torch.no_grad():
# predict actions
for i in range(env.unity_agent_num):
actTarget = int(state[i][0])
act, dis_lgprb, _, con_lgprb, _, vl = agentList[actTarget].get_actions_value(
torch.Tensor([state[i]]).to(device)
)
action[i] = act
dis_logprob[i] = dis_lgprb.squeeze(0)
con_logprob[i] = con_lgprb.squeeze(0)
value[i] = vl.squeeze(0)
# variable from GPU to CPU
action_cpu = action.cpu().numpy()
dis_logprob_cpu = dis_logprob.cpu().numpy()
con_logprob_cpu = con_logprob.cpu().numpy()
value_cpu = value.flatten().cpu().numpy()
# Environment step
next_state, reward, next_done = env.step(action_cpu)
# save memories
for i in range(env.unity_agent_num):
# save memories to buffers
ob_bf[i].append(state[i])
act_bf[i].append(action_cpu[i])
dis_logprobs_bf[i].append(dis_logprob_cpu[i])
con_logprobs_bf[i].append(con_logprob_cpu[i])
rewards_bf[i].append(reward[i]+last_reward[i])
dones_bf[i].append(done[i])
values_bf[i].append(value_cpu[i])
remainTime = state[i,TARGET_STATE_SIZE]
if next_done[i] == True:
# finished a round, send finished memories to training datasets
# compute advantage and discounted reward
#print(i,"over")
endTarget = int(ob_bf[i][0][0])
roundTargetType = int(state[i,0])
thisRewardsTensor = broadCastEndReward(rewards_bf[i],remainTime)
adv, rt = GAE(
agentList[endTarget],
args,
thisRewardsTensor,
torch.Tensor(dones_bf[i]).to(device),
torch.tensor(values_bf[i]).to(device),
torch.tensor(next_state[i]).to(device).unsqueeze(0),
torch.Tensor([next_done[i]]).to(device),
)
# send memories to training datasets
obs[roundTargetType] = torch.cat((obs[roundTargetType], torch.tensor(ob_bf[i]).to(device)), 0)
actions[roundTargetType] = torch.cat((actions[roundTargetType], torch.tensor(act_bf[i]).to(device)), 0)
dis_logprobs[roundTargetType] = torch.cat(
(dis_logprobs[roundTargetType], torch.tensor(dis_logprobs_bf[i]).to(device)), 0
)
con_logprobs[roundTargetType] = torch.cat(
(con_logprobs[roundTargetType], torch.tensor(con_logprobs_bf[i]).to(device)), 0
)
rewards[roundTargetType] = torch.cat((rewards[roundTargetType], thisRewardsTensor), 0)
values[roundTargetType] = torch.cat((values[roundTargetType], torch.tensor(values_bf[i]).to(device)), 0)
advantages[roundTargetType] = torch.cat((advantages[roundTargetType], adv), 0)
returns[roundTargetType] = torch.cat((returns[roundTargetType], rt), 0)
# clear buffers
ob_bf[i] = []
act_bf[i] = []
dis_logprobs_bf[i] = []
con_logprobs_bf[i] = []
rewards_bf[i] = []
dones_bf[i] = []
values_bf[i] = []
print(f"train dataset {Targets(roundTargetType).name} added:{obs[roundTargetType].size()[0]}/{args.datasetSize}")
for i in range(TARGETNUM):
if obs[i].size()[0] >= args.datasetSize:
# start train NN
trainQueue.append(i)
if(len(trainQueue)>0):
break
state, done = next_state, next_done
else:
step += 1
# skip this step use last predict action
next_state, reward, next_done = env.step(action_cpu)
# save memories
for i in range(env.unity_agent_num):
if next_done[i] == True:
#print(i,"over???")
# save memories to buffers
ob_bf[i].append(state[i])
act_bf[i].append(action_cpu[i])
dis_logprobs_bf[i].append(dis_logprob_cpu[i])
con_logprobs_bf[i].append(con_logprob_cpu[i])
rewards_bf[i].append(reward[i])
dones_bf[i].append(done[i])
values_bf[i].append(value_cpu[i])
remainTime = state[i,TARGET_STATE_SIZE]
# finished a round, send finished memories to training datasets
# compute advantage and discounted reward
roundTargetType = int(state[i,0])
thisRewardsTensor = broadCastEndReward(rewards_bf[i],remainTime)
adv, rt = GAE(
agentList[roundTargetType],
args,
thisRewardsTensor,
torch.Tensor(dones_bf[i]).to(device),
torch.tensor(values_bf[i]).to(device),
torch.Tensor(next_state[i]).to(device).unsqueeze(dim = 0),
torch.Tensor([next_done[i]]).to(device),
)
# send memories to training datasets
obs[roundTargetType] = torch.cat((obs[roundTargetType], torch.tensor(ob_bf[i]).to(device)), 0)
actions[roundTargetType] = torch.cat((actions[roundTargetType], torch.tensor(act_bf[i]).to(device)), 0)
dis_logprobs[roundTargetType] = torch.cat(
(dis_logprobs[roundTargetType], torch.tensor(dis_logprobs_bf[i]).to(device)), 0
)
con_logprobs[roundTargetType] = torch.cat(
(con_logprobs[roundTargetType], torch.tensor(con_logprobs_bf[i]).to(device)), 0
)
rewards[roundTargetType] = torch.cat((rewards[roundTargetType], thisRewardsTensor), 0)
values[roundTargetType] = torch.cat((values[roundTargetType], torch.tensor(values_bf[i]).to(device)), 0)
advantages[roundTargetType] = torch.cat((advantages[roundTargetType], adv), 0)
returns[roundTargetType] = torch.cat((returns[roundTargetType], rt), 0)
# clear buffers
ob_bf[i] = []
act_bf[i] = []
dis_logprobs_bf[i] = []
con_logprobs_bf[i] = []
rewards_bf[i] = []
dones_bf[i] = []
values_bf[i] = []
print(f"train dataset {Targets(roundTargetType).name} added:{obs[roundTargetType].size()[0]}/{args.datasetSize}")
state = next_state
last_reward = reward
i += 1
if args.train:
meanRewardList = [] # for WANDB
# loop all tarining queue
for thisT in trainQueue:
target_steps[thisT]+=1
# flatten the batch
b_obs = obs[thisT].reshape((-1,) + env.unity_observation_shape)
b_dis_logprobs = dis_logprobs[thisT].reshape(-1)
b_con_logprobs = con_logprobs[thisT].reshape(-1)
b_actions = actions[thisT].reshape((-1,) + (env.unity_action_size,))
b_advantages = advantages[thisT].reshape(-1)
b_returns = returns[thisT].reshape(-1)
b_values = values[thisT].reshape(-1)
b_size = b_obs.size()[0]
# Optimizing the policy and value network
b_inds = np.arange(b_size)
# clipfracs = []
for epoch in range(args.epochs):
print(epoch,end="")
# shuffle all datasets
np.random.shuffle(b_inds)
for start in range(0, b_size, args.minibatchSize):
print(".",end="")
end = start + args.minibatchSize
mb_inds = b_inds[start:end]
mb_advantages = b_advantages[mb_inds]
# normalize advantages
if args.norm_adv:
mb_advantages = (mb_advantages - mb_advantages.mean()) / (
mb_advantages.std() + 1e-8
)
(
_,
new_dis_logprob,
dis_entropy,
new_con_logprob,
con_entropy,
newvalue,
) = agentList[thisT].get_actions_value(b_obs[mb_inds], b_actions[mb_inds])
# discrete ratio
dis_logratio = new_dis_logprob - b_dis_logprobs[mb_inds]
dis_ratio = dis_logratio.exp()
# continuous ratio
con_logratio = new_con_logprob - b_con_logprobs[mb_inds]
con_ratio = con_logratio.exp()
"""
# early stop
with torch.no_grad():
# calculate approx_kl http://joschu.net/blog/kl-approx.html
old_approx_kl = (-logratio).mean()
approx_kl = ((ratio - 1) - logratio).mean()
clipfracs += [((ratio - 1.0).abs() > args.clip_coef).float().mean().item()]
"""
# discrete Policy loss
dis_pg_loss_orig = -mb_advantages * dis_ratio
dis_pg_loss_clip = -mb_advantages * torch.clamp(
dis_ratio, 1 - args.clip_coef, 1 + args.clip_coef
)
dis_pg_loss = torch.max(dis_pg_loss_orig, dis_pg_loss_clip).mean()
# continuous Policy loss
con_pg_loss_orig = -mb_advantages * con_ratio
con_pg_loss_clip = -mb_advantages * torch.clamp(
con_ratio, 1 - args.clip_coef, 1 + args.clip_coef
)
con_pg_loss = torch.max(con_pg_loss_orig, con_pg_loss_clip).mean()
# Value loss
newvalue = newvalue.view(-1)
if args.clip_vloss:
v_loss_unclipped = (newvalue - b_returns[mb_inds]) ** 2
v_clipped = b_values[mb_inds] + torch.clamp(
newvalue - b_values[mb_inds],
-args.clip_coef,
args.clip_coef,
)
v_loss_clipped = (v_clipped - b_returns[mb_inds]) ** 2
v_loss_max = torch.max(v_loss_unclipped, v_loss_clipped)
v_loss = 0.5 * v_loss_max.mean()
else:
v_loss = 0.5 * ((newvalue - b_returns[mb_inds]) ** 2).mean()
# total loss
entropy_loss = dis_entropy.mean() + con_entropy.mean()
loss = (
dis_pg_loss * POLICY_COEF[thisT]
+ con_pg_loss * POLICY_COEF[thisT]
+ entropy_loss * ENTROPY_COEF[thisT]
+ v_loss * CRITIC_COEF[thisT]
)*LOSS_COEF[thisT]
optimizers[thisT].zero_grad()
loss.backward()
# Clips gradient norm of an iterable of parameters.
nn.utils.clip_grad_norm_(agentList[thisT].parameters(), args.max_grad_norm)
optimizers[thisT].step()
"""
if args.target_kl is not None:
if approx_kl > args.target_kl:
break
"""
# record mean reward before clear history
print("done")
targetRewardMean = np.mean(rewards[thisT].to("cpu").detach().numpy().copy())
meanRewardList.append(targetRewardMean)
targetName = Targets(thisT).name
# clear this target trainning set buffer
obs[thisT] = torch.tensor([]).to(device)
actions[thisT] = torch.tensor([]).to(device)
dis_logprobs[thisT] = torch.tensor([]).to(device)
con_logprobs[thisT] = torch.tensor([]).to(device)
rewards[thisT] = torch.tensor([]).to(device)
values[thisT] = torch.tensor([]).to(device)
advantages[thisT] = torch.tensor([]).to(device)
returns[thisT] = torch.tensor([]).to(device)
# record rewards for plotting purposes
writer.add_scalar(f"Target{targetName}/value_loss", v_loss.item(), target_steps[thisT])
writer.add_scalar(f"Target{targetName}/dis_policy_loss", dis_pg_loss.item(), target_steps[thisT])
writer.add_scalar(f"Target{targetName}/con_policy_loss", con_pg_loss.item(), target_steps[thisT])
writer.add_scalar(f"Target{targetName}/total_loss", loss.item(), target_steps[thisT])
writer.add_scalar(f"Target{targetName}/entropy_loss", entropy_loss.item(), target_steps[thisT])
writer.add_scalar(f"Target{targetName}/Reward", targetRewardMean, target_steps[thisT])
writer.add_scalar(f"Target{targetName}/WinRatio", WinRounds[targetName]/TotalRounds[targetName], target_steps[thisT])
print(f"episode over Target{targetName} mean reward:", targetRewardMean)
TotalRewardMean = np.mean(meanRewardList)
writer.add_scalar("GlobalCharts/TotalRewardMean", TotalRewardMean, total_steps)
writer.add_scalar("GlobalCharts/learning_rate", optimizer.param_groups[0]["lr"], total_steps)
# New Record!
if TotalRewardMean > bestReward:
bestReward = targetRewardMean
for i in range(using_targets_num):
saveDir = "../PPO-Model/" + run_name +"_"+ str(TotalRewardMean) +"_"+ str(i)+".pt"
torch.save(agentList[i], saveDir)
for i in range(using_targets_num):
saveDir = "../PPO-Model/"+ run_name +"_last_"+ str(i) + ".pt"
torch.save(agentList[i], saveDir)
env.close()
writer.close()
+204 -89
View File
@@ -9,6 +9,7 @@ import torch.nn as nn
import torch.optim as optim
from AimbotEnv import Aimbot
from tqdm import tqdm
from torch.distributions.normal import Normal
from torch.distributions.categorical import Categorical
from distutils.util import strtobool
@@ -24,26 +25,28 @@ from typing import List
bestReward = 0
DEFAULT_SEED = 9331
ENV_PATH = "../Build/Build-ParallelEnv-Target-OffPolicy-SingleStack-SideChannel/Aimbot-ParallelEnv"
ENV_PATH = "../Build/Build-ParallelEnv-Target-OffPolicy-SingleStack-SideChannel-ExtremeReward/Aimbot-ParallelEnv"
SIDE_CHANNEL_UUID = uuid.UUID("8bbfb62a-99b4-457c-879d-b78b69066b5e")
WAND_ENTITY = "koha9"
WORKER_ID = 1
BASE_PORT = 1000
# max round steps per agent is 2500/Decision_period, 25 seconds
# !!!check every parameters before run!!!
TOTAL_STEPS = 2000000
STEP_NUM = 314
DECISION_PERIOD = 2
LEARNING_RATE = 7e-4
TOTAL_STEPS = 6000000
BATCH_SIZE = 512
MAX_TRAINNING_DATASETS = 8000
DECISION_PERIOD = 1
LEARNING_RATE = 1e-3
GAMMA = 0.99
GAE_LAMBDA = 0.95
MINIBATCH_NUM = 4
EPOCHS = 4
CLIP_COEF = 0.1
POLICY_COEF = 1.0
ENTROPY_COEF = 0.01
CRITIC_COEF = 0.5
TARGET_LEARNING_RATE = 5e-5
ANNEAL_LEARNING_RATE = True
CLIP_VLOSS = True
@@ -51,8 +54,8 @@ NORM_ADV = True
TRAIN = True
WANDB_TACK = False
LOAD_DIR = None
# LOAD_DIR = "../PPO-Model/SmallArea-256-128-hybrid-2nd-trainning.pt"
#LOAD_DIR = None
LOAD_DIR = "../PPO-Model/Aimbot-target-last.pt"
# public data
TotalRounds = {"Go":0,"Attack":0,"Free":0}
@@ -81,10 +84,10 @@ def parse_args():
# model parameters
parser.add_argument("--train",type=lambda x: bool(strtobool(x)), default=TRAIN, nargs="?", const=True,
help="Train Model or not")
parser.add_argument("--stepNum", type=int, default=STEP_NUM,
help="the number of steps to run in each environment per policy rollout")
parser.add_argument("--minibatchesNum", type=int, default=MINIBATCH_NUM,
help="the number of mini-batches")
parser.add_argument("--datasetSize", type=int, default=MAX_TRAINNING_DATASETS,
help="training dataset size,start training while dataset collect enough data")
parser.add_argument("--minibatchSize", type=int, default=BATCH_SIZE,
help="nimi batch size")
parser.add_argument("--epochs", type=int, default=EPOCHS,
help="the K epochs to update the policy")
parser.add_argument("--annealLR", type=lambda x: bool(strtobool(x)), default=ANNEAL_LEARNING_RATE, nargs="?", const=True,
@@ -140,9 +143,11 @@ class PPOAgent(nn.Module):
self.continuous_size = env.unity_continuous_size
self.network = nn.Sequential(
layer_init(nn.Linear(np.array(env.unity_observation_shape).prod(), 384)),
layer_init(nn.Linear(np.array(env.unity_observation_shape).prod(), 700)),
nn.ReLU(),
layer_init(nn.Linear(384, 256)),
layer_init(nn.Linear(700, 500)),
nn.ReLU(),
layer_init(nn.Linear(500, 256)),
nn.ReLU(),
)
self.actor_dis = layer_init(nn.Linear(256, self.discrete_size), std=0.01)
@@ -192,6 +197,40 @@ class PPOAgent(nn.Module):
self.critic(hidden),
)
def GAE(agent, args, rewards, dones, values, next_obs, next_done):
# GAE
with torch.no_grad():
next_value = agent.get_value(next_obs).reshape(1, -1)
data_size = rewards.size()[0]
if args.gae:
advantages = torch.zeros_like(rewards).to(device)
lastgaelam = 0
for t in reversed(range(data_size)):
if t == data_size - 1:
nextnonterminal = 1.0 - next_done
nextvalues = next_value
else:
nextnonterminal = 1.0 - dones[t + 1]
nextvalues = values[t + 1]
delta = rewards[t] + args.gamma * nextvalues * nextnonterminal - values[t]
advantages[t] = lastgaelam = (
delta + args.gamma * args.gaeLambda * nextnonterminal * lastgaelam
)
returns = advantages + values
else:
returns = torch.zeros_like(rewards).to(device)
for t in reversed(range(data_size)):
if t == data_size - 1:
nextnonterminal = 1.0 - next_done
next_return = next_value
else:
nextnonterminal = 1.0 - dones[t + 1]
next_return = returns[t + 1]
returns[t] = rewards[t] + args.gamma * nextnonterminal * next_return
advantages = returns - values
return advantages, returns
class AimbotSideChannel(SideChannel):
def __init__(self, channel_id: uuid.UUID) -> None:
super().__init__(channel_id)
@@ -201,14 +240,14 @@ class AimbotSideChannel(SideChannel):
receive messages from Unity
"""
thisMessage = msg.read_string()
print(thisMessage)
#print(thisMessage)
thisResult = thisMessage.split("|")
if(thisResult[0] == "result"):
TotalRounds[thisResult[1]]+=1
if(thisResult[2] == "Win"):
WinRounds[thisResult[1]]+=1
print(TotalRounds)
print(WinRounds)
#print(TotalRounds)
#print(WinRounds)
elif(thisResult[0] == "Error"):
print(thisMessage)
# 发送函数
@@ -238,6 +277,7 @@ class AimbotSideChannel(SideChannel):
msg.write_float32_list(data)
super().queue_message_to_send(msg)
if __name__ == "__main__":
args = parse_args()
random.seed(args.seed)
@@ -259,11 +299,12 @@ if __name__ == "__main__":
optimizer = optim.Adam(agent.parameters(), lr=args.lr, eps=1e-5)
# Tensorboard and WandB Recorder
game_name = "Aimbot"
run_name = f"{game_name}_{args.seed}_{int(time.time())}"
game_name = "Aimbot_Target"
game_type = "OffPolicy"
run_name = f"{game_name}_{game_type}_{args.seed}_{int(time.time())}"
if args.wandb_track:
wandb.init(
project=run_name,
project=game_name,
entity=args.wandb_entity,
sync_tensorboard=True,
config=vars(args),
@@ -279,94 +320,168 @@ if __name__ == "__main__":
% ("\n".join([f"|{key}|{value}|" for key, value in vars(args).items()])),
)
# Memory Record
obs = torch.zeros((args.stepNum, env.unity_agent_num) + env.unity_observation_shape).to(device)
actions = torch.zeros((args.stepNum, env.unity_agent_num) + (env.unity_action_size,)).to(device)
dis_logprobs = torch.zeros((args.stepNum, env.unity_agent_num)).to(device)
con_logprobs = torch.zeros((args.stepNum, env.unity_agent_num)).to(device)
rewards = torch.zeros((args.stepNum, env.unity_agent_num)).to(device)
dones = torch.zeros((args.stepNum, env.unity_agent_num)).to(device)
values = torch.zeros((args.stepNum, env.unity_agent_num)).to(device)
# Trajectory Buffer
ob_bf = [[] for i in range(env.unity_agent_num)]
act_bf = [[] for i in range(env.unity_agent_num)]
dis_logprobs_bf = [[] for i in range(env.unity_agent_num)]
con_logprobs_bf = [[] for i in range(env.unity_agent_num)]
rewards_bf = [[] for i in range(env.unity_agent_num)]
dones_bf = [[] for i in range(env.unity_agent_num)]
values_bf = [[] for i in range(env.unity_agent_num)]
# TRY NOT TO MODIFY: start the game
args.batch_size = int(env.unity_agent_num * args.stepNum)
args.minibatch_size = int(args.batch_size // args.minibatchesNum)
total_update_step = args.total_timesteps // args.batch_size
total_update_step = args.total_timesteps // args.datasetSize
global_step = 0
start_time = time.time()
next_obs, _, _ = env.reset()
next_obs = torch.Tensor(next_obs).to(device)
next_done = torch.zeros(env.unity_agent_num).to(device)
state, _, done = env.reset()
# state = torch.Tensor(next_obs).to(device)
# next_done = torch.zeros(env.unity_agent_num).to(device)
for total_steps in range(total_update_step):
# discunt learning rate, while step == total_update_step lr will be 0
print("new episode")
if args.annealLR:
frac = 1.0 - (total_steps - 1.0) / total_update_step
finalRatio = TARGET_LEARNING_RATE/args.lr
frac = 1.0 - finalRatio*((total_steps - 1.0) / total_update_step)
lrnow = frac * args.lr
optimizer.param_groups[0]["lr"] = lrnow
# initialize empty training datasets
obs = torch.tensor([]).to(device) # (n,env.unity_observation_size)
actions = torch.tensor([]).to(device) # (n,env.unity_action_size)
dis_logprobs = torch.tensor([]).to(device) # (n,1)
con_logprobs = torch.tensor([]).to(device) # (n,1)
rewards = torch.tensor([]).to(device) # (n,1)
values = torch.tensor([]).to(device) # (n,1)
advantages = torch.tensor([]).to(device) # (n,1)
returns = torch.tensor([]).to(device) # (n,1)
# MAIN LOOP: run agent in environment
for i in range(args.stepNum * args.decision_period):
i = 0
training = False
while True:
if i % args.decision_period == 0:
step = round(i / args.decision_period)
# Choose action by agent
global_step += 1 * env.unity_agent_num
obs[step] = next_obs
dones[step] = next_done
with torch.no_grad():
# predict actions
action, dis_logprob, _, con_logprob, _, value = agent.get_actions_value(
next_obs
torch.Tensor(state).to(device)
)
value = value.flatten()
next_obs, reward, done = env.step(action.cpu().numpy())
# variable from GPU to CPU
action_cpu = action.cpu().numpy()
dis_logprob_cpu = dis_logprob.cpu().numpy()
con_logprob_cpu = con_logprob.cpu().numpy()
value_cpu = value.cpu().numpy()
# Environment step
next_state, reward, next_done = env.step(action_cpu)
# save memories
actions[step] = action
dis_logprobs[step] = dis_logprob
con_logprobs[step] = con_logprob
values[step] = value
rewards[step] = torch.tensor(reward).to(device).view(-1)
next_obs, next_done = torch.Tensor(next_obs).to(device), torch.Tensor(done).to(
device
)
for i in range(env.unity_agent_num):
# save memories to buffers
ob_bf[i].append(state[i])
act_bf[i].append(action_cpu[i])
dis_logprobs_bf[i].append(dis_logprob_cpu[i])
con_logprobs_bf[i].append(con_logprob_cpu[i])
rewards_bf[i].append(reward[i])
dones_bf[i].append(done[i])
values_bf[i].append(value_cpu[i])
if next_done[i] == True:
# finished a round, send finished memories to training datasets
# compute advantage and discounted reward
#print(i,"over")
adv, rt = GAE(
agent,
args,
torch.tensor(rewards_bf[i]).to(device),
torch.Tensor(dones_bf[i]).to(device),
torch.tensor(values_bf[i]).to(device),
torch.tensor(next_state[i]).to(device),
torch.Tensor([next_done[i]]).to(device),
)
# send memories to training datasets
obs = torch.cat((obs, torch.tensor(ob_bf[i]).to(device)), 0)
actions = torch.cat((actions, torch.tensor(act_bf[i]).to(device)), 0)
dis_logprobs = torch.cat(
(dis_logprobs, torch.tensor(dis_logprobs_bf[i]).to(device)), 0
)
con_logprobs = torch.cat(
(con_logprobs, torch.tensor(con_logprobs_bf[i]).to(device)), 0
)
rewards = torch.cat((rewards, torch.tensor(rewards_bf[i]).to(device)), 0)
values = torch.cat((values, torch.tensor(values_bf[i]).to(device)), 0)
advantages = torch.cat((advantages, adv), 0)
returns = torch.cat((returns, rt), 0)
# clear buffers
ob_bf[i] = []
act_bf[i] = []
dis_logprobs_bf[i] = []
con_logprobs_bf[i] = []
rewards_bf[i] = []
dones_bf[i] = []
values_bf[i] = []
print(f"train dataset added:{obs.size()[0]}/{args.datasetSize}")
if obs.size()[0] >= args.datasetSize:
# start train NN
break
state, done = next_state, next_done
else:
# skip this step use last predict action
next_obs, reward, done = env.step(action.cpu().numpy())
next_obs, next_done = torch.Tensor(next_obs).to(device), torch.Tensor(done).to(
device
)
next_obs, reward, next_done = env.step(action_cpu)
# save memories
for i in range(env.unity_agent_num):
if next_done[i] == True:
#print(i,"over???")
# save last memories to buffers
ob_bf[i].append(state[i])
act_bf[i].append(action_cpu[i])
dis_logprobs_bf[i].append(dis_logprob_cpu[i])
con_logprobs_bf[i].append(con_logprob_cpu[i])
rewards_bf[i].append(reward[i])
dones_bf[i].append(done[i])
values_bf[i].append(value_cpu[i])
# finished a round, send finished memories to training datasets
# compute advantage and discounted reward
adv, rt = GAE(
agent,
args,
torch.tensor(rewards_bf[i]).to(device),
torch.Tensor(dones_bf[i]).to(device),
torch.tensor(values_bf[i]).to(device),
torch.tensor(next_state[i]).to(device),
torch.Tensor([next_done[i]]).to(device),
)
# send memories to training datasets
obs = torch.cat((obs, torch.tensor(ob_bf[i]).to(device)), 0)
actions = torch.cat((actions, torch.tensor(act_bf[i]).to(device)), 0)
dis_logprobs = torch.cat(
(dis_logprobs, torch.tensor(dis_logprobs_bf[i]).to(device)), 0
)
con_logprobs = torch.cat(
(con_logprobs, torch.tensor(con_logprobs_bf[i]).to(device)), 0
)
rewards = torch.cat((rewards, torch.tensor(rewards_bf[i]).to(device)), 0)
values = torch.cat((values, torch.tensor(values_bf[i]).to(device)), 0)
advantages = torch.cat((advantages, adv), 0)
returns = torch.cat((returns, rt), 0)
# GAE
with torch.no_grad():
next_value = agent.get_value(next_obs).reshape(1, -1)
if args.gae:
advantages = torch.zeros_like(rewards).to(device)
lastgaelam = 0
for t in reversed(range(args.stepNum)):
if t == args.stepNum - 1:
nextnonterminal = 1.0 - next_done
nextvalues = next_value
else:
nextnonterminal = 1.0 - dones[t + 1]
nextvalues = values[t + 1]
delta = rewards[t] + args.gamma * nextvalues * nextnonterminal - values[t]
advantages[t] = lastgaelam = (
delta + args.gamma * args.gaeLambda * nextnonterminal * lastgaelam
)
returns = advantages + values
else:
returns = torch.zeros_like(rewards).to(device)
for t in reversed(range(args.stepNum)):
if t == args.stepNum - 1:
nextnonterminal = 1.0 - next_done
next_return = next_value
else:
nextnonterminal = 1.0 - dones[t + 1]
next_return = returns[t + 1]
returns[t] = rewards[t] + args.gamma * nextnonterminal * next_return
advantages = returns - values
# clear buffers
ob_bf[i] = []
act_bf[i] = []
dis_logprobs_bf[i] = []
con_logprobs_bf[i] = []
rewards_bf[i] = []
dones_bf[i] = []
values_bf[i] = []
print(f"train dataset added:{obs.size()[0]}/{args.datasetSize}")
state, done = next_state, next_done
i += 1
if args.train:
# flatten the batch
@@ -377,15 +492,15 @@ if __name__ == "__main__":
b_advantages = advantages.reshape(-1)
b_returns = returns.reshape(-1)
b_values = values.reshape(-1)
b_size = b_obs.size()[0]
# Optimizing the policy and value network
b_inds = np.arange(args.batch_size)
b_inds = np.arange(b_size)
# clipfracs = []
for epoch in range(args.epochs):
# shuffle all datasets
np.random.shuffle(b_inds)
for start in range(0, args.batch_size, args.minibatch_size):
end = start + args.minibatch_size
for start in range(0, b_size, args.minibatchSize):
end = start + args.minibatchSize
mb_inds = b_inds[start:end]
mb_advantages = b_advantages[mb_inds]
@@ -484,12 +599,12 @@ if __name__ == "__main__":
"charts/SPS", int(global_step / (time.time() - start_time)), global_step
)
writer.add_scalar("charts/Reward", rewardsMean, global_step)
writer.add_scalar("charts/GoWinRatio", WinRounds["Go"]/TotalRounds["Go"] if TotalRounds["Go"] != 0 else 0, global_step)
writer.add_scalar("charts/AttackWinRatio", WinRounds["Attack"]/TotalRounds["Attack"] if TotalRounds["Attack"] != 0 else 0, global_step)
writer.add_scalar("charts/FreeWinRatio", WinRounds["Free"]/TotalRounds["Free"] if TotalRounds["Free"] != 0 else 0, global_step)
writer.add_scalar("charts/GoWinRatio", WinRounds["Go"]/TotalRounds["Go"], global_step)
writer.add_scalar("charts/AttackWinRatio", WinRounds["Attack"]/TotalRounds["Attack"], global_step)
writer.add_scalar("charts/FreeWinRatio", WinRounds["Free"]/TotalRounds["Free"], global_step)
if rewardsMean > bestReward:
bestReward = rewardsMean
saveDir = "../PPO-Model/bigArea-384-128-hybrid-" + str(rewardsMean) + ".pt"
saveDir = "../PPO-Model/Target-700-500-256-hybrid-" + str(rewardsMean) + ".pt"
torch.save(agent, saveDir)
env.close()
+274 -23
View File
@@ -434,41 +434,292 @@
},
{
"cell_type": "code",
"execution_count": 6,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import torch\n",
"import numpy as np\n",
"\n",
"x = torch.randn(2, 3).to(\"cuda\")\n",
"print(x)\n",
"print(torch.cat((x, x, x), 0))\n",
"print(torch.cat((x, x, x), 1))\n",
"\n",
"aa = torch.empty(0).to(\"cuda\")\n",
"torch.cat([aa,x])\n",
"bb = [[]]*2\n",
"print(bb)\n",
"bb.append(x.to(\"cpu\").tolist())\n",
"bb.append(x.to(\"cpu\").tolist())\n",
"print(bb)"
]
},
{
"cell_type": "code",
"execution_count": 64,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"x : torch.Size([2, 3, 4])\n",
"x : torch.Size([6, 2, 3, 4])\n",
"x : torch.Size([6, 2, 3, 4])\n"
"tensor([[-1.1090, 0.4686, 0.6883],\n",
" [-0.1862, -0.3943, -0.0202],\n",
" [ 0.1436, -0.9444, -1.2079],\n",
" [-2.9434, -2.5989, -0.6653],\n",
" [ 0.4668, 0.8548, -0.4641],\n",
" [-0.3956, -0.2832, -0.1889],\n",
" [-0.2801, -0.2092, 1.7254],\n",
" [ 2.7938, -0.7742, 0.7053]], device='cuda:0')\n",
"(8, 0)\n",
"---\n",
"[[array([-1.1090169, 0.4685607, 0.6883437], dtype=float32)], [array([-0.1861974 , -0.39429024, -0.02016036], dtype=float32)], [array([ 0.14360362, -0.9443668 , -1.2079065 ], dtype=float32)], [array([-2.9433894 , -2.598913 , -0.66532046], dtype=float32)], [array([ 0.46684313, 0.8547877 , -0.46408093], dtype=float32)], [array([-0.39563984, -0.2831819 , -0.18891 ], dtype=float32)], [array([-0.28008553, -0.20918302, 1.7253567 ], dtype=float32)], [array([ 2.7938051, -0.7742478, 0.705279 ], dtype=float32)]]\n",
"[[array([-1.1090169, 0.4685607, 0.6883437], dtype=float32)], [], [array([ 0.14360362, -0.9443668 , -1.2079065 ], dtype=float32)], [array([-2.9433894 , -2.598913 , -0.66532046], dtype=float32)], [array([ 0.46684313, 0.8547877 , -0.46408093], dtype=float32)], [array([-0.39563984, -0.2831819 , -0.18891 ], dtype=float32)], [array([-0.28008553, -0.20918302, 1.7253567 ], dtype=float32)], [array([ 2.7938051, -0.7742478, 0.705279 ], dtype=float32)]]\n",
"---\n",
"[array([-1.1090169, 0.4685607, 0.6883437], dtype=float32), array([-1.1090169, 0.4685607, 0.6883437], dtype=float32)]\n",
"vvv tensor([[-1.1090, 0.4686, 0.6883],\n",
" [-1.1090, 0.4686, 0.6883]], device='cuda:0')\n",
"tensor([[-1.1090, 0.4686, 0.6883],\n",
" [-1.1090, 0.4686, 0.6883]], device='cuda:0')\n"
]
},
{
"data": {
"text/plain": [
"True"
]
},
"execution_count": 64,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import numpy as np\n",
"import torch\n",
"\n",
"agent_num = 8\n",
"ob_buffer = [[]for i in range(agent_num)]\n",
"obs = torch.randn(8, 3).to(\"cuda\")\n",
"print(obs)\n",
"print(np.shape(np.array(ob_buffer)))\n",
"print('---')\n",
"obs_cpu = obs.to(\"cpu\").numpy()\n",
"for i in range(agent_num):\n",
" ob_buffer[i].append(obs_cpu[i])\n",
"print(ob_buffer)\n",
"ob_buffer[1] = []\n",
"print(ob_buffer)\n",
"print('---')\n",
"for i in range(agent_num):\n",
" ob_buffer[i].append(obs_cpu[i])\n",
"print(ob_buffer[0])\n",
"vvv = torch.tensor(ob_buffer[0]).to(\"cuda\")\n",
"print(\"vvv\",vvv)\n",
"empt = torch.tensor([]).to(\"cuda\")\n",
"vvvv = torch.cat((empt,vvv),0)\n",
"print(vvvv)\n",
"vvvv.size()[0]>0"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"from AimbotEnv import Aimbot\n",
"from enum import Enum\n",
"import uuid\n",
"from mlagents_envs.side_channel.side_channel import (\n",
" SideChannel,\n",
" IncomingMessage,\n",
" OutgoingMessage,\n",
")\n",
"from typing import List\n",
"\n",
"class Targets(Enum):\n",
" Free = 0\n",
" Go = 1\n",
" Attack = 2\n",
" Num = 3\n",
"TotalRounds = {\"Go\":0,\"Attack\":0,\"Free\":0}\n",
"WinRounds = {\"Go\":0,\"Attack\":0,\"Free\":0}\n",
"\n",
"class AimbotSideChannel(SideChannel):\n",
" def __init__(self, channel_id: uuid.UUID) -> None:\n",
" super().__init__(channel_id)\n",
" def on_message_received(self, msg: IncomingMessage) -> None:\n",
" \"\"\"\n",
" Note: We must implement this method of the SideChannel interface to\n",
" receive messages from Unity\n",
" \"\"\"\n",
" thisMessage = msg.read_string()\n",
" #print(thisMessage)\n",
" thisResult = thisMessage.split(\"|\")\n",
" if(thisResult[0] == \"result\"):\n",
" TotalRounds[thisResult[1]]+=1\n",
" if(thisResult[2] == \"Win\"):\n",
" WinRounds[thisResult[1]]+=1\n",
" #print(TotalRounds)\n",
" #print(WinRounds)\n",
" elif(thisResult[0] == \"Error\"):\n",
" print(thisMessage)\n",
"\t# 发送函数\n",
" def send_string(self, data: str) -> None:\n",
" \"\"\"发送一个字符串给C#\"\"\"\n",
" msg = OutgoingMessage()\n",
" msg.write_string(data)\n",
" super().queue_message_to_send(msg)\n",
"\n",
" def send_bool(self, data: bool) -> None:\n",
" msg = OutgoingMessage()\n",
" msg.write_bool(data)\n",
" super().queue_message_to_send(msg)\n",
"\n",
" def send_int(self, data: int) -> None:\n",
" msg = OutgoingMessage()\n",
" msg.write_int32(data)\n",
" super().queue_message_to_send(msg)\n",
"\n",
" def send_float(self, data: float) -> None:\n",
" msg = OutgoingMessage()\n",
" msg.write_float32(data)\n",
" super().queue_message_to_send(msg)\n",
"\n",
" def send_float_list(self, data: List[float]) -> None:\n",
" msg = OutgoingMessage()\n",
" msg.write_float32_list(data)\n",
" super().queue_message_to_send(msg)\n",
" \n",
"SIDE_CHANNEL_UUID = uuid.UUID(\"8bbfb62a-99b4-457c-879d-b78b69066b5e\")\n",
"ENV_PATH = \"../Build/Build-ParallelEnv-Target-OffPolicy-SingleStack-SideChannel-EndReward/Aimbot-ParallelEnv\"\n",
"aimBotsideChannel = AimbotSideChannel(SIDE_CHANNEL_UUID)\n",
"env = Aimbot(envPath=ENV_PATH, workerID=123, basePort=999,side_channels=[aimBotsideChannel])"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"import torch\n",
"import torch.nn as nn\n",
"import torch.optim as optim\n",
"from AimbotEnv import Aimbot\n",
"from torch.distributions.normal import Normal\n",
"from torch.distributions.categorical import Categorical\n",
"device = torch.device(\"cuda\" if torch.cuda.is_available() and True else \"cpu\")\n",
"\n",
"def layer_init(layer, std=np.sqrt(2), bias_const=0.0):\n",
" torch.nn.init.orthogonal_(layer.weight, std)\n",
" torch.nn.init.constant_(layer.bias, bias_const)\n",
" return layer\n",
"\n",
"class PPOAgent(nn.Module):\n",
" def __init__(self, env: Aimbot,targetNum:int):\n",
" super(PPOAgent, self).__init__()\n",
" self.stateSize = env.unity_observation_shape[0]\n",
"\n",
" self.discrete_size = env.unity_discrete_size\n",
" self.discrete_shape = list(env.unity_discrete_branches)\n",
" self.continuous_size = env.unity_continuous_size\n",
"\n",
" self.network = nn.Sequential(\n",
" layer_init(nn.Linear(env.unity_observation_shape[0], 300)),\n",
" nn.Tanh(),\n",
" layer_init(nn.Linear(300, 200)),\n",
" nn.Tanh(),\n",
" )\n",
" self.actor_dis = layer_init(nn.Linear(200, self.discrete_size), std=0.5)\n",
" self.actor_mean = layer_init(nn.Linear(200, self.continuous_size), std=0.5)\n",
" self.actor_logstd = nn.Parameter(torch.zeros(1, self.continuous_size))\n",
" self.critic = layer_init(nn.Linear(200, 1), std=1)\n",
"\n",
" def get_value(self, state: torch.Tensor):\n",
" return self.critic(self.network(state))\n",
"\n",
" def get_actions_value(self, state: torch.Tensor, actions=None):\n",
" hidden = self.network(state)\n",
"\n",
" # discrete\n",
" dis_logits = self.actor_dis(hidden)\n",
" split_logits = torch.split(dis_logits, self.discrete_shape, dim=1)\n",
" multi_categoricals = [Categorical(logits=thisLogits) for thisLogits in split_logits]\n",
" # continuous\n",
" actions_mean = self.actor_mean(hidden)\n",
" action_logstd = self.actor_logstd.expand_as(actions_mean)\n",
" action_std = torch.exp(action_logstd)\n",
" con_probs = Normal(actions_mean, action_std)\n",
"\n",
" if actions is None:\n",
" if True:\n",
" # select actions base on probability distribution model\n",
" disAct = torch.stack([ctgr.sample() for ctgr in multi_categoricals])\n",
" conAct = con_probs.sample()\n",
" actions = torch.cat([disAct.T, conAct], dim=1)\n",
" else:\n",
" # select actions base on best probability distribution\n",
" disAct = torch.stack([torch.argmax(logit, dim=1) for logit in split_logits])\n",
" conAct = actions_mean\n",
" actions = torch.cat([disAct.T, conAct], dim=1)\n",
" else:\n",
" disAct = actions[:, 0 : env.unity_discrete_type].T\n",
" conAct = actions[:, env.unity_discrete_type :]\n",
" dis_log_prob = torch.stack(\n",
" [ctgr.log_prob(act) for act, ctgr in zip(disAct, multi_categoricals)]\n",
" )\n",
" dis_entropy = torch.stack([ctgr.entropy() for ctgr in multi_categoricals])\n",
" return (\n",
" actions,\n",
" dis_log_prob.sum(0),\n",
" dis_entropy.sum(0),\n",
" con_probs.log_prob(conAct).sum(1),\n",
" con_probs.entropy().sum(1),\n",
" self.critic(hidden),\n",
" )"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"ppp = \"../Build/Build-ParallelEnv-Target-OffPolicy-SingleStack-SideChannel-EndReward-Easy-V2.7-FreeOnly-NormalMapSize/Aimbot-ParallelEnv\"\n",
"env = Aimbot(envPath=ppp, workerID=1, basePort=1000,side_channels=[])\n",
"agent_list = []\n",
"optimizers = []\n",
"for i in range(3):\n",
" agent_list.append(PPOAgent(env=env,targetNum=3).to('cuda'))\n",
" optimizers.append(optim.Adam(agent_list[i].parameters(),lr=1e-4))"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"tensor([1., 2., 3., 4., 5.])"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import torch\n",
"#1\n",
"x = torch.randn(2, 1, 1)#为1可以扩展为3和4\n",
"x = x.expand(2, 3, 4)\n",
"print('x :', x.size())\n",
"\n",
"#2\n",
"#扩展一个新的维度必须在最前面,否则会报错\n",
"#x = x.expand(2, 3, 4, 6)\n",
"\n",
"x = x.expand(6, 2, 3, 4)\n",
"print('x :', x.size())\n",
"\n",
"#3\n",
"#某一个维度为-1表示不改变该维度的大小\n",
"x = x.expand(6, -1, -1, -1)\n",
"print('x :', x.size())\n",
"\n",
"x : torch.Size([2, 3, 4])\n",
"x : torch.Size([6, 2, 3, 4])\n",
"x : torch.Size([6, 2, 3, 4])"
"aaa = torch.zeros((8,5))\n",
"aaa[0] = torch.Tensor([1,2,3,4,5])\n",
"aaa[0]"
]
}
],