From 895cd5c1186a0db191e7f3890795d17d4d10b2ab Mon Sep 17 00:00:00 2001
From: Koha9
Date: Sat, 3 Dec 2022 03:58:19 +0900
Subject: [PATCH] Add EndReward broadcast function on game over

When a round ends in a win, broadcast the result by adding remainTime/15
(remainTime * RESULT_BROADCAST_RATIO) to every step's reward of that round,
so the finished round is weighted more heavily in training.
Fix a bug where the target was still decoded from the state with the one-hot
decoder; the target index is now read directly from state[:,0].
---
 Aimbot-PPO-Python/Pytorch/MultiNN-PPO.py |  41 +++-
 Aimbot-PPO-Python/Pytorch/testarea.ipynb | 252 +++++++++++++++++------
 2 files changed, 220 insertions(+), 73 deletions(-)

diff --git a/Aimbot-PPO-Python/Pytorch/MultiNN-PPO.py b/Aimbot-PPO-Python/Pytorch/MultiNN-PPO.py
index 48668d8..d64a08f 100644
--- a/Aimbot-PPO-Python/Pytorch/MultiNN-PPO.py
+++ b/Aimbot-PPO-Python/Pytorch/MultiNN-PPO.py
@@ -24,12 +24,12 @@ from typing import List
 bestReward = 0
 
-DEFAULT_SEED = 9331
-ENV_PATH = "../Build/Build-ParallelEnv-Target-OffPolicy-SingleStack-SideChannel-ExtremeReward/Aimbot-ParallelEnv"
+DEFAULT_SEED = 933139
+ENV_PATH = "../Build/Build-ParallelEnv-Target-OffPolicy-SingleStack-SideChannel-EndReward-Easy/Aimbot-ParallelEnv"
 SIDE_CHANNEL_UUID = uuid.UUID("8bbfb62a-99b4-457c-879d-b78b69066b5e")
 WAND_ENTITY = "koha9"
-WORKER_ID = 1
-BASE_PORT = 1000
+WORKER_ID = 2
+BASE_PORT = 1001
 
 # max round steps per agent is 2500/Decision_period, 25 seconds
 # !!!check every parameters before run!!!
@@ -38,7 +38,7 @@ TOTAL_STEPS = 6000000
 BATCH_SIZE = 512
 MAX_TRAINNING_DATASETS = 8000
 DECISION_PERIOD = 1
-LEARNING_RATE = 1e-3
+LEARNING_RATE = 8e-4
 GAMMA = 0.99
 GAE_LAMBDA = 0.95
 EPOCHS = 4
@@ -58,7 +58,11 @@ WANDB_TACK = True
 LOAD_DIR = "../PPO-Model/Aimbot-target-last.pt"
 
 # public data
+BASE_WINREWARD = 999
+BASE_LOSEREWARD = -999
 TARGETNUM= 4
+ENV_TIMELIMIT = 30
+RESULT_BROADCAST_RATIO = 2/ENV_TIMELIMIT
 TotalRounds = {"Go":0,"Attack":0,"Free":0}
 WinRounds = {"Go":0,"Attack":0,"Free":0}
 
@@ -160,7 +164,7 @@ class PPOAgent(nn.Module):
 
     def get_actions_value(self, state: torch.Tensor, actions=None):
         hidden = self.network(state)
-        targets = torch.argmax(state[:,0:self.targetNum],dim=1)
+        targets = state[:,0]
 
         # discrete
         # 递归targets的数量,既agent数来实现根据target不同来选用对应的输出网络计算输出
@@ -244,7 +248,7 @@ class AimbotSideChannel(SideChannel):
         receive messages from Unity
         """
         thisMessage = msg.read_string()
-        #print(thisMessage)
+        # print(thisMessage)
         thisResult = thisMessage.split("|")
         if(thisResult[0] == "result"):
             TotalRounds[thisResult[1]]+=1
@@ -256,7 +260,7 @@ class AimbotSideChannel(SideChannel):
             print(thisMessage)
     # 发送函数
     def send_string(self, data: str) -> None:
-        """发送一个字符串给C#"""
+        # send a string to C#
         msg = OutgoingMessage()
         msg.write_string(data)
         super().queue_message_to_send(msg)
@@ -281,6 +285,20 @@ class AimbotSideChannel(SideChannel):
         msg.write_float32_list(data)
         super().queue_message_to_send(msg)
 
+def broadCastEndReward(rewardBF:list,remainTime:float):
+    thisRewardBF = rewardBF
+    if (rewardBF[-1]<=-500):
+        # print("Lose DO NOT BROAD CAST",rewardBF[-1])
+        thisRewardBF[-1] = rewardBF[-1]-BASE_LOSEREWARD
+        thisRewardBF = (np.asarray(thisRewardBF)).tolist()
+    elif (rewardBF[-1]>=500):
+        # print("Win! 
Broadcast reward!",rewardBF[-1]) + thisRewardBF[-1] = rewardBF[-1]-BASE_WINREWARD + thisRewardBF = (np.asarray(thisRewardBF)+(remainTime*RESULT_BROADCAST_RATIO)).tolist() + else: + print("!!!!!DIDNT GET RESULT REWARD!!!!!!",rewardBF[-1]) + return torch.Tensor(thisRewardBF).to(device) + if __name__ == "__main__": args = parse_args() @@ -304,7 +322,7 @@ if __name__ == "__main__": # Tensorboard and WandB Recorder game_name = "Aimbot_Target" - game_type = "OffPolicy" + game_type = "OffPolicy_HMNN_EndBC" run_name = f"{game_name}_{game_type}_{args.seed}_{int(time.time())}" if args.wandb_track: wandb.init( @@ -398,10 +416,11 @@ if __name__ == "__main__": # finished a round, send finished memories to training datasets # compute advantage and discounted reward #print(i,"over") + thisRewardsTensor = broadCastEndReward(rewards_bf[i],state[i,6]) adv, rt = GAE( agent, args, - torch.tensor(rewards_bf[i]).to(device), + thisRewardsTensor, torch.Tensor(dones_bf[i]).to(device), torch.tensor(values_bf[i]).to(device), torch.tensor(next_state[i]).to(device), @@ -416,7 +435,7 @@ if __name__ == "__main__": con_logprobs = torch.cat( (con_logprobs, torch.tensor(con_logprobs_bf[i]).to(device)), 0 ) - rewards = torch.cat((rewards, torch.tensor(rewards_bf[i]).to(device)), 0) + rewards = torch.cat((rewards, thisRewardsTensor), 0) values = torch.cat((values, torch.tensor(values_bf[i]).to(device)), 0) advantages = torch.cat((advantages, adv), 0) returns = torch.cat((returns, rt), 0) diff --git a/Aimbot-PPO-Python/Pytorch/testarea.ipynb b/Aimbot-PPO-Python/Pytorch/testarea.ipynb index 7e5c504..3efc30e 100644 --- a/Aimbot-PPO-Python/Pytorch/testarea.ipynb +++ b/Aimbot-PPO-Python/Pytorch/testarea.ipynb @@ -525,7 +525,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 6, "metadata": {}, "outputs": [], "source": [ @@ -594,75 +594,203 @@ " super().queue_message_to_send(msg)\n", " \n", "SIDE_CHANNEL_UUID = uuid.UUID(\"8bbfb62a-99b4-457c-879d-b78b69066b5e\")\n", - "ENV_PATH = \"../Build/Build-ParallelEnv-Target-OffPolicy-SingleStack-SideChannel-ExtremeReward/Aimbot-ParallelEnv\"\n", + "ENV_PATH = \"../Build/Build-ParallelEnv-Target-OffPolicy-SingleStack-SideChannel-EndReward/Aimbot-ParallelEnv\"\n", "aimBotsideChannel = AimbotSideChannel(SIDE_CHANNEL_UUID)\n", - "env = Aimbot(envPath=ENV_PATH, workerID=1, basePort=100,side_channels=[aimBotsideChannel])" + "env = Aimbot(envPath=ENV_PATH, workerID=123, basePort=999,side_channels=[aimBotsideChannel])" ] }, { "cell_type": "code", - "execution_count": 22, - "metadata": {}, - "outputs": [ - { - "ename": "NameError", - "evalue": "name 'env' is not defined", - "output_type": "error", - "traceback": [ - "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[1;31mNameError\u001b[0m Traceback (most recent call last)", - "\u001b[1;32m~\\AppData\\Local\\Temp\\ipykernel_34852\\4061840787.py\u001b[0m in \u001b[0;36m\u001b[1;34m()\u001b[0m\n\u001b[0;32m 1\u001b[0m \u001b[1;32mimport\u001b[0m \u001b[0mnumpy\u001b[0m \u001b[1;32mas\u001b[0m \u001b[0mnp\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 2\u001b[1;33m \u001b[0mstate\u001b[0m\u001b[1;33m,\u001b[0m\u001b[0m_\u001b[0m\u001b[1;33m,\u001b[0m\u001b[0m_\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0menv\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mgetSteps\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 3\u001b[0m 
\u001b[0mprint\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mstate\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;36m0\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;36m4\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 4\u001b[0m \u001b[0mprint\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mnp\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0margmax\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mstate\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;36m0\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;36m4\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0maxis\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;36m0\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 5\u001b[0m \u001b[1;32mfor\u001b[0m \u001b[0mi\u001b[0m \u001b[1;32min\u001b[0m \u001b[0mrange\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;36m4\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", - "\u001b[1;31mNameError\u001b[0m: name 'env' is not defined" - ] - } - ], - "source": [ - "import numpy as np\n", - "state,_,_ = env.getSteps()\n", - "print(state[:,0:4])\n", - "print(np.argmax(state[:,0:4], axis=0))\n" - ] - }, - { - "cell_type": "code", - "execution_count": 23, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "tensor([[1, 2, 3],\n", - " [1, 2, 3],\n", - " [1, 2, 3]])\n", - "3\n", - "0\n", - "1\n", - "2\n", - "3\n" - ] - } - ], - "source": [ - "import torch\n", - "one_hot = torch.tensor([])\n", - "aaa = torch.tensor([1,2,3])\n", - "bbb = torch.tensor([1,2,3])\n", - "print(torch.stack([aaa,aaa,aaa]))\n", - "print(aaa.size()[0])\n", - "\n", - "for i in range(4):\n", - " print(i)" - ] - }, - { - "cell_type": "code", - "execution_count": null, + "execution_count": 7, "metadata": {}, "outputs": [], "source": [ - "[Categorical(logits: torch.Size([8, 3])), Categorical(logits: torch.Size([8, 3])), Categorical(logits: torch.Size([8, 2]))]\n", - "Normal(loc: torch.Size([8, 1]), scale: torch.Size([8, 1]))" + "import torch\n", + "import torch.nn as nn\n", + "import torch.optim as optim\n", + "from torch.distributions.normal import Normal\n", + "from torch.distributions.categorical import Categorical\n", + "device = torch.device(\"cuda\" if torch.cuda.is_available() and True else \"cpu\")\n", + "\n", + "def layer_init(layer, std=np.sqrt(2), bias_const=0.0):\n", + " torch.nn.init.orthogonal_(layer.weight, std)\n", + " torch.nn.init.constant_(layer.bias, bias_const)\n", + " return layer\n", + "\n", + "class PPOAgent(nn.Module):\n", + " def __init__(self, env: Aimbot,targetNum:int):\n", + " super(PPOAgent, self).__init__()\n", + " self.targetNum = targetNum\n", + " self.discrete_size = env.unity_discrete_size\n", + " self.discrete_shape = list(env.unity_discrete_branches)\n", + " self.continuous_size = env.unity_continuous_size\n", + "\n", + " self.network = nn.Sequential(\n", + " layer_init(nn.Linear(np.array(env.unity_observation_shape).prod(), 500)),\n", + " nn.ReLU(),\n", + " layer_init(nn.Linear(500, 300)),\n", + " nn.ReLU(),\n", + " )\n", + " self.actor_dis = nn.ModuleList([layer_init(nn.Linear(300, self.discrete_size), std=0.01) for i in range(targetNum)])\n", + " self.actor_mean = nn.ModuleList([layer_init(nn.Linear(300, self.continuous_size), std=0.01) for i in range(targetNum)])\n", + " self.actor_logstd = nn.ParameterList([nn.Parameter(torch.zeros(1, 
self.continuous_size)) for i in range(targetNum)])\n", + " self.critic = layer_init(nn.Linear(300, 1), std=1)\n", + "\n", + " def get_value(self, state: torch.Tensor):\n", + " return self.critic(self.network(state))\n", + "\n", + " def get_actions_value(self, state: torch.Tensor, actions=None):\n", + " hidden = self.network(state)\n", + " targets = torch.argmax(state[:,0:self.targetNum],dim=1)\n", + "\n", + " # discrete\n", + " # 递归targets的数量,既agent数来实现根据target不同来选用对应的输出网络计算输出\n", + " dis_logits = torch.stack([self.actor_dis[targets[i]](hidden[i]) for i in range(targets.size()[0])])\n", + " split_logits = torch.split(dis_logits, self.discrete_shape, dim=1)\n", + " multi_categoricals = [Categorical(logits=thisLogits) for thisLogits in split_logits]\n", + " # continuous\n", + " actions_mean = torch.stack([self.actor_mean[targets[i]](hidden[i]) for i in range(targets.size()[0])]) # self.actor_mean(hidden)\n", + " # action_logstd = torch.stack([self.actor_logstd[targets[i]].expand_as(actions_mean) for i in range(targets.size()[0])]) # self.actor_logstd.expand_as(actions_mean)\n", + " # print(action_logstd)\n", + " action_std = torch.squeeze(torch.stack([torch.exp(self.actor_logstd[targets[i]]) for i in range(targets.size()[0])]),dim = -1) # torch.exp(action_logstd)\n", + " con_probs = Normal(actions_mean, action_std)\n", + "\n", + " if actions is None:\n", + " if True:\n", + " # select actions base on probability distribution model\n", + " disAct = torch.stack([ctgr.sample() for ctgr in multi_categoricals])\n", + " conAct = con_probs.sample()\n", + " actions = torch.cat([disAct.T, conAct], dim=1)\n", + " else:\n", + " # select actions base on best probability distribution\n", + " disAct = torch.stack([torch.argmax(logit, dim=1) for logit in split_logits])\n", + " conAct = actions_mean\n", + " actions = torch.cat([disAct.T, conAct], dim=1)\n", + " else:\n", + " disAct = actions[:, 0 : env.unity_discrete_type].T\n", + " conAct = actions[:, env.unity_discrete_type :]\n", + " dis_log_prob = torch.stack(\n", + " [ctgr.log_prob(act) for act, ctgr in zip(disAct, multi_categoricals)]\n", + " )\n", + " dis_entropy = torch.stack([ctgr.entropy() for ctgr in multi_categoricals])\n", + " return (\n", + " actions,\n", + " dis_log_prob.sum(0),\n", + " dis_entropy.sum(0),\n", + " con_probs.log_prob(conAct).sum(1),\n", + " con_probs.entropy().sum(1),\n", + " self.critic(hidden),\n", + " )\n", + "agent = PPOAgent(env,4).to(device)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([[ 1. , -10.343613 , 0. , -7.367299 ,\n", + " 0. , 0. , 30. , -10.343662 ,\n", + " 1. , -33.708736 , 1. , 1. ,\n", + " 1. , 1. , 2. , 1. ,\n", + " 1. , 1. , 2. , 2. ,\n", + " 2. , 1. , 1. , 1. ,\n", + " 33.270493 , 39.50663 , 49.146526 , 32.595673 ,\n", + " 30.21616 , 21.163797 , 46.9299 , 1.3264331 ,\n", + " 1.2435672 , 1.2541904 , 30.08522 , 30.041445 ,\n", + " 21.072094 , 0. ],\n", + " [ 0. , 0. , 0. , 0. ,\n", + " 0. , 0. , 30. , -5.5892515 ,\n", + " 1. , -29.907726 , 1. , 1. ,\n", + " 1. , 1. , 2. , 1. ,\n", + " 1. , 1. , 1. , 1. ,\n", + " 1. , 1. , 1. , 1. ,\n", + " 41.408752 , 47.830173 , 45.03225 , 31.905174 ,\n", + " 41.849663 , 41.849648 , 43.001434 , 45.0322 ,\n", + " 47.48242 , 40.00285 , 41.668346 , 41.607723 ,\n", + " 41.668335 , 0. ],\n", + " [ 1. , 2.9582403 , 0. , -4.699738 ,\n", + " 0. , 0. , 30. , -5.412487 ,\n", + " 1. , -32.79967 , 1. , 2. ,\n", + " 1. , 1. , 1. , 1. ,\n", + " 1. , 1. , 1. , 2. ,\n", + " 1. , 1. , 1. , 1. 
,\n", + " 20.17488 , 49.507687 , 48.162056 , 45.98998 ,\n", + " 44.75835 , 31.08564 , 32.865173 , 24.676666 ,\n", + " 12.952409 , 39.69923 , 44.564423 , 44.49966 ,\n", + " 44.564495 , 0. ],\n", + " [ 2. , -0.20171738, 0. , -10.340863 ,\n", + " 0. , 0. , 30. , -22.987915 ,\n", + " 1. , -34.37514 , 1. , 1. ,\n", + " 1. , 1. , 1. , 1. ,\n", + " 1. , 2. , 1. , 1. ,\n", + " 1. , 1. , 1. , 1. ,\n", + " 11.631058 , 13.872022 , 18.006863 , 27.457632 ,\n", + " 46.343067 , 46.343094 , 20.155125 , 49.867714 ,\n", + " 52.965984 , 56.775608 , 46.14223 , 46.075138 ,\n", + " 46.142246 , 0. ],\n", + " [ 2. , -14.687862 , 0. , -12.615574 ,\n", + " 0. , 0. , 30. , 15.125373 ,\n", + " 1. , -30.849268 , 1. , 1. ,\n", + " 1. , 1. , 1. , 1. ,\n", + " 1. , 1. , 1. , 1. ,\n", + " 1. , 1. , 1. , 2. ,\n", + " 52.430542 , 48.912865 , 46.05145 , 43.974594 ,\n", + " 42.796673 , 26.467875 , 11.072432 , 7.190229 ,\n", + " 5.483198 , 4.5500183 , 42.611244 , 42.549267 ,\n", + " 18.856438 , 0. ],\n", + " [ 0. , 0. , 0. , 0. ,\n", + " 0. , 0. , 30. , -4.0314903 ,\n", + " 1. , -29.164669 , 1. , 1. ,\n", + " 1. , 1. , 1. , 1. ,\n", + " 1. , 1. , 1. , 1. ,\n", + " 1. , 1. , 1. , 1. ,\n", + " 44.074184 , 46.9762 , 44.228096 , 42.2335 ,\n", + " 41.102253 , 41.102367 , 42.233757 , 44.22849 ,\n", + " 44.321827 , 37.335304 , 40.924183 , 40.86467 ,\n", + " 40.924236 , 0. ],\n", + " [ 0. , 0. , 0. , 0. ,\n", + " 0. , 0. , 30. , -18.603981 ,\n", + " 1. , -29.797592 , 1. , 1. ,\n", + " 1. , 1. , 1. , 1. ,\n", + " 1. , 1. , 1. , 1. ,\n", + " 1. , 2. , 2. , 2. ,\n", + " 19.134174 , 22.76088 , 29.468704 , 42.88739 ,\n", + " 41.738823 , 41.739002 , 42.88781 , 44.913647 ,\n", + " 47.704174 , 51.135338 , 20.418388 , 12.470214 ,\n", + " 12.670923 , 0. ],\n", + " [ 0. , 0. , 0. , 0. ,\n", + " 0. , 0. , 30. , -19.07032 ,\n", + " 1. , -30.246218 , 1. , 1. ,\n", + " 1. , 1. , 1. , 1. ,\n", + " 1. , 1. , 1. , 1. ,\n", + " 1. , 1. , 1. , 1. ,\n", + " 18.336487 , 21.81617 , 28.251017 , 42.977867 ,\n", + " 42.18994 , 42.19034 , 43.351707 , 45.399582 ,\n", + " 48.22037 , 51.68873 , 42.00719 , 41.94621 ,\n", + " 42.00739 , 0. ]], dtype=float32)" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "state,_,_ = env.getSteps()\n", + "state" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "env.close()" ] } ],