Commit 43e80e2

Vector Env1
1 parent f4b7fdf commit 43e80e2

File tree

6 files changed: +240 -70 lines changed


ElegantRL/agent.py

Lines changed: 99 additions & 30 deletions
@@ -530,6 +530,11 @@ def select_action(self, state):
         actions, noises = self.act.get_action(states)  # plan to be get_action_a_noise
         return actions[0].detach().cpu().numpy(), noises[0].detach().cpu().numpy()

+    def select_actions(self, states):
+        # states = torch.as_tensor((state,), dtype=torch.float32, device=self.device)
+        actions, noises = self.act.get_action(states)  # plan to be get_action_a_noise
+        return actions, noises
+
     def explore_env(self, env, target_step, reward_scale, gamma):
         trajectory_list = list()

@@ -544,20 +549,97 @@ def explore_env(self, env, target_step, reward_scale, gamma):
         self.state = state
         return trajectory_list

+    def explore_envs(self, env, target_step, reward_scale, gamma):
+        state = self.state
+        env_num = env.env_num
+
+        buf_step = target_step // env_num
+        states = torch.empty((buf_step, env_num, env.state_dim), dtype=torch.float32, device=self.device)
+        actions = torch.empty((buf_step, env_num, env.action_dim), dtype=torch.float32, device=self.device)
+        noises = torch.empty((buf_step, env_num, env.action_dim), dtype=torch.float32, device=self.device)
+        rewards = torch.empty((buf_step, env_num), dtype=torch.float32, device=self.device)
+        dones = torch.empty((buf_step, env_num), dtype=torch.float32, device=self.device)
+        for i in range(buf_step):
+            action, noise = self.select_actions(state)
+            next_s, reward, done, _ = env.step(action.tanh())
+            # other = (reward * reward_scale, 0.0 if done else gamma, *action, *noise)
+            # trajectory_list.append((state, other))
+
+            states[i] = state
+            actions[i] = action
+            noises[i] = noise
+            rewards[i] = reward
+            dones[i] = done
+
+            # state = env.reset() if done else next_s
+            state = next_s
+        self.state = state
+        rewards = rewards * reward_scale
+        masks = (1 - dones) * gamma
+        return states, rewards, masks, actions, noises
+
+    def prepare_buffer(self, buffer):
+        buffer.update_now_len()
+        buf_len = buffer.now_len
+        with torch.no_grad():  # compute reverse reward
+            reward, mask, action, a_noise, state = buffer.sample_all()
+
+            # print(';', [t.shape for t in (reward, mask, action, a_noise, state)])
+            bs = 2 ** 10  # set a smaller 'BatchSize' when out of GPU memory.
+            value = torch.cat([self.cri_target(state[i:i + bs]) for i in range(0, state.size(0), bs)], dim=0).squeeze(1)
+            logprob = self.act.get_old_logprob(action, a_noise)
+
+            pre_state = torch.as_tensor((self.state,), dtype=torch.float32, device=self.device)
+            pre_r_sum = self.cri_target(pre_state).detach()
+            r_sum, advantage = self.get_reward_sum(buf_len, reward, mask, value, pre_r_sum)
+        buffer.empty_buffer()
+        return state, action, r_sum, logprob, advantage
+
+    def prepare_buffers(self, buffer):
+        with torch.no_grad():  # compute reverse reward
+            states, rewards, masks, actions, noises = buffer
+            buf_len = states.size(0)
+            env_num = states.size(1)
+
+            values = torch.empty_like(rewards)
+            logprobs = torch.empty_like(rewards)
+            bs = 2 ** 10  # set a smaller 'BatchSize' when out of GPU memory.
+            for j in range(env_num):
+                for i in range(0, buf_len, bs):
+                    values[i:i + bs, j] = self.cri_target(states[i:i + bs, j]).squeeze(1)
+                logprobs[:, j] = self.act.get_old_logprob(actions[:, j], noises[:, j]).squeeze(1)
+
+            pre_states = torch.as_tensor(self.state, dtype=torch.float32, device=self.device)
+            pre_r_sums = self.cri_target(pre_states).detach().squeeze(1)
+
+            r_sums, advantages = self.get_reward_sum((buf_len, env_num), rewards, masks, values, pre_r_sums)
+
+        buf_len_vec = buf_len * env_num
+
+        states = states.view((buf_len_vec, -1))
+        actions = actions.view((buf_len_vec, -1))
+        r_sums = r_sums.view(buf_len_vec)
+        logprobs = logprobs.view(buf_len_vec)
+        advantages = advantages.view(buf_len_vec)
+        return states, actions, r_sums, logprobs, advantages
+
     def update_net(self, buffer, batch_size, repeat_times, soft_update_tau):
         if isinstance(buffer, list):
             buffer_tuple = list(map(list, zip(*buffer)))  # 2D-list transpose
             (buf_state, buf_action, buf_r_sum, buf_logprob, buf_advantage
              ) = [torch.cat(tensor_list, dim=0).to(self.device)
                   for tensor_list in buffer_tuple]
-
+        elif isinstance(buffer, tuple):
+            (buf_state, buf_action, buf_r_sum, buf_logprob, buf_advantage
+             ) = buffer
         else:
             (buf_state, buf_action, buf_r_sum, buf_logprob, buf_advantage
-             ) = self.prepare_buffer(buffer, self.state)
+             ) = self.prepare_buffer(buffer)
         buf_len = buf_state.size(0)

         '''PPO: Surrogate objective of Trust Region'''
         obj_critic = obj_actor = old_logprob = None
+        r_sum_std = 1  # todo buf_r_sum.std() + 1e-6
         for _ in range(int(buf_len / batch_size * repeat_times)):
             indices = torch.randint(buf_len, size=(batch_size,), requires_grad=False, device=self.device)

@@ -576,60 +658,47 @@ def update_net(self, buffer, batch_size, repeat_times, soft_update_tau):
             self.optim_update(self.act_optim, obj_actor)

             value = self.cri(state).squeeze(1)  # critic network predicts the reward_sum (Q value) of state
-            obj_critic = self.criterion(value, r_sum)  # / (r_sum.std() + 1e-6)
+            obj_critic = self.criterion(value, r_sum) / r_sum_std
             self.optim_update(self.cri_optim, obj_critic)
             self.soft_update(self.cri_target, self.cri, soft_update_tau) if self.cri_target is not self.cri else None

         return obj_critic.item(), obj_actor.item(), old_logprob.mean().item()  # logging_tuple

-    def prepare_buffer(self, buffer, state_ary):
-        buffer.update_now_len()
-        buf_len = buffer.now_len
-        with torch.no_grad():  # compute reverse reward
-            reward, mask, action, a_noise, state = buffer.sample_all()
-
-            # print(';', [t.shape for t in (reward, mask, action, a_noise, state)])
-            bs = 2 ** 10  # set a smaller 'BatchSize' when out of GPU memory.
-            value = torch.cat([self.cri_target(state[i:i + bs]) for i in range(0, state.size(0), bs)], dim=0)
-            logprob = self.act.get_old_logprob(action, a_noise)
-
-            pre_state = torch.as_tensor((state_ary,), dtype=torch.float32, device=self.device)
-            pre_r_sum = self.cri(pre_state).detach()
-            r_sum, advantage = self.get_reward_sum(buf_len, reward, mask, value, pre_r_sum)
-        buffer.empty_buffer()
-        return state, action, r_sum, logprob, advantage
-
     def get_reward_sum_raw(self, buf_len, buf_reward, buf_mask, buf_value, pre_r_sum) -> (torch.Tensor, torch.Tensor):
         """compute the excepted discounted episode return

         :int buf_len: the length of ReplayBuffer
-        :torch.Tensor buf_reward: buf_reward.shape==(buf_len, 1)
-        :torch.Tensor buf_mask: buf_mask.shape ==(buf_len, 1)
-        :torch.Tensor buf_value: buf_value.shape ==(buf_len, 1)
-        :return torch.Tensor buf_r_sum: buf_r_sum.shape ==(buf_len, 1)
+        :torch.Tensor buf_reward: buf_reward.shape==(buf_len, )
+        :torch.Tensor buf_mask: buf_mask.shape ==(buf_len, )
+        :torch.Tensor buf_value: buf_value.shape ==(buf_len, )
+        :torch.Tensor pre_r_sum: pre_r_sum.shape ==(1, 1)
+        :return torch.Tensor buf_r_sum: buf_r_sum.shape ==(buf_len, 1)
         :return torch.Tensor buf_advantage: buf_advantage.shape ==(buf_len, 1)
         """
         buf_r_sum = torch.empty(buf_len, dtype=torch.float32, device=self.device)  # reward sum
-        for i in range(buf_len - 1, -1, -1):
+        the_len = buf_len[0] if isinstance(buf_len, tuple) else buf_len
+        for i in range(the_len - 1, -1, -1):
             buf_r_sum[i] = buf_reward[i] + buf_mask[i] * pre_r_sum
             pre_r_sum = buf_r_sum[i]
-        buf_advantage = buf_r_sum - (buf_mask * buf_value.squeeze(1))
-        buf_advantage = (buf_advantage - buf_advantage.mean()) / (buf_advantage.std() + 1e-5)
+        buf_advantage = buf_r_sum - buf_mask * buf_value
+        buf_advantage = (buf_advantage - buf_advantage.mean())  # todo / (buf_advantage.std() + 1e-5)
         return buf_r_sum, buf_advantage

     def get_reward_sum_gae(self, buf_len, buf_reward, buf_mask, buf_value, pre_r_sum) -> (torch.Tensor, torch.Tensor):
         buf_r_sum = torch.empty(buf_len, dtype=torch.float32, device=self.device)  # old policy value
         buf_advantage = torch.empty(buf_len, dtype=torch.float32, device=self.device)  # advantage value

-        pre_advantage = pre_r_sum * (np.exp(self.lambda_gae_adv - 0.4) - 1)  # advantage value of previous step
-        for i in range(buf_len - 1, -1, -1):
+        pre_advantage = pre_r_sum * (np.exp(self.lambda_gae_adv - 0.5) - 1)  # advantage value of previous step
+
+        the_len = buf_len[0] if isinstance(buf_len, tuple) else buf_len
+        for i in range(the_len - 1, -1, -1):
             buf_r_sum[i] = buf_reward[i] + buf_mask[i] * pre_r_sum
             pre_r_sum = buf_r_sum[i]

             buf_advantage[i] = buf_reward[i] + buf_mask[i] * (pre_advantage - buf_value[i])  # fix a bug here
             pre_advantage = buf_value[i] + buf_advantage[i] * self.lambda_gae_adv

-        buf_advantage = (buf_advantage - buf_advantage.mean()) / (buf_advantage.std() + 1e-5)
+        buf_advantage = (buf_advantage - buf_advantage.mean())  # todo / (buf_advantage.std() + 1e-5)
         return buf_r_sum, buf_advantage
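
The `the_len = buf_len[0] if isinstance(buf_len, tuple) else buf_len` guard lets the same reverse loop serve both the single-env path (`buf_len` is an int) and the vectorized path (`buf_len` is a `(buf_step, env_num)` tuple), because indexing the 2-D reward tensor by time step broadcasts across the environment dimension. Below is a small, self-contained illustration of that broadcast with made-up shapes; it is a sketch of the idea, not code from this commit.

import torch

buf_step, env_num, gamma = 5, 3, 0.99
rewards = torch.rand(buf_step, env_num)         # one reward per step and per env
masks = torch.full((buf_step, env_num), gamma)  # (1 - done) * gamma, as produced by explore_envs

r_sums = torch.empty_like(rewards)
pre_r_sum = torch.zeros(env_num)                # bootstrap value of the post-rollout state, per env
for i in range(buf_step - 1, -1, -1):           # same reverse recursion as get_reward_sum_raw
    r_sums[i] = rewards[i] + masks[i] * pre_r_sum   # broadcasts over the env dimension
    pre_r_sum = r_sums[i]
print(r_sums.shape)  # torch.Size([5, 3]); prepare_buffers later flattens this to buf_step * env_num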
635704

ElegantRL/demo.py

Lines changed: 1 addition & 1 deletion
@@ -1,7 +1,7 @@
 import sys

 import gym
-from elegantrl2.env import PreprocessEnv
+from elegantrl2.env import PreprocessEnv, PreprocessVecEnv
 from elegantrl2.run import Arguments, train_and_evaluate, train_and_evaluate_mp

 gym.logger.set_level(40)  # Block warning: 'WARN: Box bound precision lowered by casting to float32'

ElegantRL/env.py

Lines changed: 71 additions & 2 deletions
@@ -1,5 +1,6 @@
 import os
 import gym
+import torch
 import numpy as np
 # import numpy.random as rd
 from copy import deepcopy
@@ -12,11 +13,11 @@ def __init__(self, env, if_print=True, data_type=np.float32):
         """Preprocess a standard OpenAI gym environment for training.

         `object env` a standard OpenAI gym environment, it has env.reset() and env.step()
-        `object if_print` print the information of environment. Such as env_name, state_dim ...
+        `bool if_print` print the information of environment. Such as env_name, state_dim ...
         `object data_type` convert state (sometimes float64) to data_type (float32).
         """
         self.env = gym.make(env) if isinstance(env, str) else env
-        super(PreprocessEnv, self).__init__(self.env)
+        super().__init__(self.env)

         (self.env_name, self.state_dim, self.action_dim, self.action_max, self.max_step,
          self.if_discrete, self.target_return) = get_gym_env_info(self.env, if_print)
@@ -86,6 +87,74 @@ def step_norm(self, action) -> (np.ndarray, float, bool, dict):
         return state.astype(self.data_type), reward, done, info


+class PreprocessVecEnv(gym.Wrapper):
+    def __init__(self, env, env_num, device=torch.device('cuda'),
+                 if_print=True, data_type=torch.float32):
+        """Preprocess a standard OpenAI gym environment for training.
+
+        `object env` a standard OpenAI gym environment, it has env.reset() and env.step()
+        `int env_num` environment number
+        `object device` torch.device('cpu'), torch.device('cuda')
+        `bool if_print` print the information of environment. Such as env_name, state_dim ...
+        `object data_type` convert state (sometimes float64) to data_type (float32).
+        """
+        if isinstance(env, str):
+            env_name = env
+            self.env_list = [gym.make(env_name) for _ in range(env_num)]
+        elif getattr(env, 'env_name', False):
+            env_name = env.env_name
+            self.env_list = [gym.make(env_name) for _ in range(env_num)]
+        else:
+            self.env_list = [deepcopy(env) for _ in range(env_num)]
+        env = self.env_list[0]
+        super().__init__(env)
+
+        (self.env_name, self.state_dim, self.action_dim, self.action_max, self.max_step,
+         self.if_discrete, self.target_return) = get_gym_env_info(env, if_print)
+        self.data_type = data_type
+
+        self.env_num = env_num
+        self.data_type = data_type
+        self.device = device
+
+    def reset(self) -> torch.Tensor:
+        """state = env.reset()
+
+        convert the data type of state from float64 to float32
+
+        return `array state` state.shape==(state_dim, )
+        """
+        state = torch.as_tensor([env.reset() for env in self.env_list],
+                                dtype=self.data_type, device=self.device)
+        return state
+
+    def step(self, actions) -> (torch.Tensor, torch.Tensor, torch.Tensor, dict):
+        """ next_state, reward, done = env.step(action)
+
+        convert the data type of state from float64 to float32,
+        adjust action range to (-action_max, +action_max)
+
+        return `array state` state.shape==(state_dim, )
+        return `float reward` reward of one step
+        return `bool done` the terminal of an training episode
+        return `dict info` the information save in a dict. OpenAI gym standard. Send a `None` is OK
+        """
+
+        actions = actions.detach().cpu().numpy() * self.action_max
+
+        states = list()
+        rewards = list()
+        dones = list()
+        for i in range(self.env_num):
+            state, reward, done, _ = self.env_list[i].step(actions[i])
+            states.append(self.env_list[i].reset() if done else state)
+            rewards.append(reward)
+            dones.append(done)
+        states, rewards, dones = [torch.as_tensor(t, dtype=self.data_type, device=self.device)
+                                  for t in (states, rewards, dones)]
+        return states, rewards, dones, {}
+
+
 def deepcopy_or_rebuild_env(env):
     try:
         env_eval = deepcopy(env)
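
A short usage sketch for the new wrapper, mirroring the import added to demo.py; the environment id, `env_num`, the CPU device and the random placeholder actions are arbitrary examples, not part of this commit.

import torch
from elegantrl2.env import PreprocessVecEnv

# build 4 copies of one gym env; reset()/step() return torch tensors on the chosen device
vec_env = PreprocessVecEnv('Pendulum-v0', env_num=4, device=torch.device('cpu'))

states = vec_env.reset()                                # float32, shape (4, state_dim)
actions = torch.rand(4, vec_env.action_dim) * 2 - 1     # placeholder actions in (-1, 1)
next_states, rewards, dones, _ = vec_env.step(actions)  # sub-envs that finish are reset in place
print(next_states.shape, rewards.shape, dones.shape)    # (4, state_dim), (4,), (4,)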

ElegantRL/net.py

Lines changed: 2 additions & 2 deletions
@@ -215,7 +215,7 @@ def get_new_logprob_entropy(self, state, action):
         return logprob, dist_entropy

     def get_old_logprob(self, _action, noise):  # noise = action - a_noise
-        return -(self.a_logstd + self.sqrt_2pi_log + noise.pow(2) * 0.5).sum(1)  # old_logprob
+        return -(self.a_logstd + self.sqrt_2pi_log + noise.pow(2) * 0.5).sum(1, keepdims=True)  # old_logprob


 class ActorDiscretePPO(nn.Module):
@@ -448,7 +448,7 @@ def get_a_logprob(self, state):  # actor

         a_noise_tanh = a_noise.tanh()
         fix_term = (-a_noise_tanh.pow(2) + 1.00001).log()
-        logprob = (noise.pow(2) / 2 + a_std_log + fix_term).sum(1, keepdim=True) + self.log_sqrt_2pi_sum
+        logprob = (noise.pow(2) / 2 + a_std_log + fix_term).sum(1) + self.log_sqrt_2pi_sum
         return a_noise_tanh, logprob

     def get_q_logprob(self, state):
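
These two one-line changes swap which log-probability keeps its trailing dimension: `get_old_logprob` now returns shape `(batch, 1)` instead of `(batch,)`, which is why `prepare_buffers` in agent.py applies `.squeeze(1)` to it, while `get_a_logprob` now returns `(batch,)`. A toy shape check with arbitrary values (the `keepdims` spelling in the diff is the numpy-style alias of `keepdim`):

import torch

noise = torch.rand(8, 2)            # (batch, action_dim)
a_logstd = torch.zeros(1, 2)        # stands in for self.a_logstd
sqrt_2pi_log = 0.9189385332046727   # assumed value of self.sqrt_2pi_log, i.e. log(sqrt(2 * pi))

old = -(a_logstd + sqrt_2pi_log + noise.pow(2) * 0.5).sum(1)                # shape (8,)
new = -(a_logstd + sqrt_2pi_log + noise.pow(2) * 0.5).sum(1, keepdim=True)  # shape (8, 1)
print(old.shape, new.shape)  # torch.Size([8]) torch.Size([8, 1])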

ElegantRL/replay.py

Lines changed: 1 addition & 1 deletion
@@ -223,7 +223,7 @@ def sample_batch(self, batch_size) -> list:
         # list_items of reward, mask, action, state, next_state, is_weights (PER)

         # return [torch.cat([item[i] for item in list_items], dim=0)
-        #         for i in range(len(list_items[0]))]  # todo need to check
+        #         for i in range(len(list_items[0]))]  # need to check
         list_items = list(map(list, zip(*list_items)))  # 2D-list transpose
         return [torch.cat(item, dim=0) for item in list_items]
