#######################################################################
# Copyright (C)                                                       #
# 2016 - 2019 Pinard Liu([email protected])                           #
# https://www.cnblogs.com/pinard                                      #
# Permission given to modify the code as long as you keep this        #
# declaration at the top                                              #
#######################################################################
####
####

import gym
import tensorflow as tf
import numpy as np
import random
from collections import deque
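
# Note: this script targets TensorFlow 1.x (tf.placeholder, tf.variable_scope,
# tf.InteractiveSession) and the classic Gym API in which env.reset() returns only the
# observation and env.step() returns a (next_state, reward, done, info) 4-tuple.
# On TensorFlow 2.x it would likely need to be run through tf.compat.v1 with v2
# behaviour disabled.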

# Hyper Parameters for DQN
GAMMA = 0.9  # discount factor for target Q
INITIAL_EPSILON = 0.5  # starting value of epsilon
FINAL_EPSILON = 0.01  # final value of epsilon
REPLAY_SIZE = 10000  # experience replay buffer size
BATCH_SIZE = 128  # size of minibatch
REPLACE_TARGET_FREQ = 10  # frequency to update target Q network

class DQN():
    # DQN Agent
    def __init__(self, env):
        # init experience replay
        self.replay_buffer = deque()
        # init some parameters
        self.time_step = 0
        self.epsilon = INITIAL_EPSILON
        self.state_dim = env.observation_space.shape[0]
        self.action_dim = env.action_space.n

        self.create_Q_network()
        self.create_training_method()

        # Init session
        self.session = tf.InteractiveSession()
        self.session.run(tf.global_variables_initializer())

    def create_Q_network(self):
        # input layer
        self.state_input = tf.placeholder("float", [None, self.state_dim])
        # network weights
        with tf.variable_scope('current_net'):
            W1 = self.weight_variable([self.state_dim, 20])
            b1 = self.bias_variable([20])

            # hidden layer 1
            h_layer_1 = tf.nn.relu(tf.matmul(self.state_input, W1) + b1)

            # hidden layer for state value
            with tf.variable_scope('Value'):
                W21 = self.weight_variable([20, 1])
                b21 = self.bias_variable([1])
                self.V = tf.matmul(h_layer_1, W21) + b21

            # hidden layer for action advantage
            with tf.variable_scope('Advantage'):
                W22 = self.weight_variable([20, self.action_dim])
                b22 = self.bias_variable([self.action_dim])
                self.A = tf.matmul(h_layer_1, W22) + b22

            # Q Value layer
            self.Q_value = self.V + (self.A - tf.reduce_mean(self.A, axis=1, keep_dims=True))
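            # The line above is the dueling aggregation Q(s,a) = V(s) + (A(s,a) - mean_a A(s,a)).
            # Subtracting the mean advantage keeps V and A identifiable: without it, a constant
            # could be added to A and subtracted from V without changing Q.
            # (keep_dims is the TF 1.x spelling; later releases rename the argument to keepdims.)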

        with tf.variable_scope('target_net'):
            W1t = self.weight_variable([self.state_dim, 20])
            b1t = self.bias_variable([20])

            # hidden layer 1
            h_layer_1t = tf.nn.relu(tf.matmul(self.state_input, W1t) + b1t)

            # hidden layer for state value
            with tf.variable_scope('Value'):
                W2v = self.weight_variable([20, 1])
                b2v = self.bias_variable([1])
                self.VT = tf.matmul(h_layer_1t, W2v) + b2v

            # hidden layer for action advantage
            with tf.variable_scope('Advantage'):
                W2a = self.weight_variable([20, self.action_dim])
                b2a = self.bias_variable([self.action_dim])
                self.AT = tf.matmul(h_layer_1t, W2a) + b2a

            # Q Value layer
            self.target_Q_value = self.VT + (self.AT - tf.reduce_mean(self.AT, axis=1, keep_dims=True))

        t_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='target_net')
        e_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='current_net')

        with tf.variable_scope('soft_replacement'):
            self.target_replace_op = [tf.assign(t, e) for t, e in zip(t_params, e_params)]
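
        # Despite the scope name 'soft_replacement', target_replace_op is a hard copy: each
        # current_net variable is assigned verbatim to its target_net counterpart. It is only
        # run every REPLACE_TARGET_FREQ episodes (see update_target_q_network below), which
        # keeps the TD targets fixed between copies and stabilises training.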

    def create_training_method(self):
        self.action_input = tf.placeholder("float", [None, self.action_dim])  # one-hot representation
        self.y_input = tf.placeholder("float", [None])
        Q_action = tf.reduce_sum(tf.multiply(self.Q_value, self.action_input), reduction_indices=1)
        self.cost = tf.reduce_mean(tf.square(self.y_input - Q_action))
        self.optimizer = tf.train.AdamOptimizer(0.0001).minimize(self.cost)
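        # Multiplying Q_value by the one-hot action mask and summing over the action axis
        # selects Q(s, a) for the action actually taken; the cost is the mean squared error
        # between that prediction and the TD target fed in through y_input.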

    def perceive(self, state, action, reward, next_state, done):
        one_hot_action = np.zeros(self.action_dim)
        one_hot_action[action] = 1
        self.replay_buffer.append((state, one_hot_action, reward, next_state, done))
        if len(self.replay_buffer) > REPLAY_SIZE:
            self.replay_buffer.popleft()

        if len(self.replay_buffer) > BATCH_SIZE:
            self.train_Q_network()
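
    # perceive() implements experience replay: each transition is pushed into a FIFO buffer
    # capped at REPLAY_SIZE (the oldest transition is evicted once the cap is exceeded), and
    # a training step on a random minibatch is triggered as soon as the buffer holds more
    # than BATCH_SIZE transitions.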

    def train_Q_network(self):
        self.time_step += 1
        # Step 1: obtain random minibatch from replay memory
        minibatch = random.sample(self.replay_buffer, BATCH_SIZE)
        state_batch = [data[0] for data in minibatch]
        action_batch = [data[1] for data in minibatch]
        reward_batch = [data[2] for data in minibatch]
        next_state_batch = [data[3] for data in minibatch]

        # Step 2: calculate y
        y_batch = []
        Q_value_batch = self.target_Q_value.eval(feed_dict={self.state_input: next_state_batch})
        for i in range(0, BATCH_SIZE):
            done = minibatch[i][4]
            if done:
                y_batch.append(reward_batch[i])
            else:
                y_batch.append(reward_batch[i] + GAMMA * np.max(Q_value_batch[i]))
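
        # The TD target follows Nature DQN: y_i = r_i for terminal transitions, otherwise
        # r_i + GAMMA * max_a' Q_target(s'_i, a'), with the maximum taken over the frozen
        # target network's outputs. (This is not Double DQN: the same target network both
        # selects and evaluates the greedy next action.)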

        # Step 3: train on the minibatch
        self.optimizer.run(feed_dict={
            self.y_input: y_batch,
            self.action_input: action_batch,
            self.state_input: state_batch
        })

    def egreedy_action(self, state):
        Q_value = self.Q_value.eval(feed_dict={
            self.state_input: [state]
        })[0]
        if random.random() <= self.epsilon:
            action = random.randint(0, self.action_dim - 1)
        else:
            action = np.argmax(Q_value)
        # anneal epsilon linearly, clamped at FINAL_EPSILON so exploration never vanishes
        self.epsilon = max(FINAL_EPSILON, self.epsilon - (INITIAL_EPSILON - FINAL_EPSILON) / 10000)
        return action
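
    # With (INITIAL_EPSILON - FINAL_EPSILON) / 10000 subtracted per call, epsilon anneals
    # linearly from 0.5 down to the 0.01 floor over roughly 10,000 action selections, after
    # which the agent still explores about 1% of the time.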

    def action(self, state):
        return np.argmax(self.Q_value.eval(feed_dict={
            self.state_input: [state]
        })[0])

    def update_target_q_network(self, episode):
        # update target Q network
        if episode % REPLACE_TARGET_FREQ == 0:
            self.session.run(self.target_replace_op)
            # print('episode ' + str(episode) + ', target Q network params replaced!')

    def weight_variable(self, shape):
        initial = tf.truncated_normal(shape)
        return tf.Variable(initial)

    def bias_variable(self, shape):
        initial = tf.constant(0.01, shape=shape)
        return tf.Variable(initial)
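
    # tf.truncated_normal defaults to stddev=1.0, which is a fairly large initial scale for
    # layers this small; a smaller value such as stddev=0.1 is a common alternative choice.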

# ---------------------------------------------------------
# Hyper Parameters
ENV_NAME = 'CartPole-v0'
EPISODE = 3000  # Episode limitation
STEP = 300  # Step limitation in an episode
TEST = 5  # Number of evaluation episodes to run every 100 training episodes

def main():
    # initialize OpenAI Gym env and dqn agent
    env = gym.make(ENV_NAME)
    agent = DQN(env)

    for episode in range(EPISODE):
        # initialize task
        state = env.reset()
        # Train
        for step in range(STEP):
            action = agent.egreedy_action(state)  # e-greedy action for train
            next_state, reward, done, _ = env.step(action)
            # Define reward for agent
            reward = -1 if done else 0.1
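            # Reward shaping: the raw CartPole reward (+1 per step) is replaced by +0.1 for
            # every step the pole stays up and -1 when the episode ends, so failure is
            # penalised explicitly; the test loop below still accumulates the raw env reward.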
            agent.perceive(state, action, reward, next_state, done)
            state = next_state
            if done:
                break
        # Test every 100 episodes
        if episode % 100 == 0:
            total_reward = 0
            for i in range(TEST):
                state = env.reset()
                for j in range(STEP):
                    env.render()
                    action = agent.action(state)  # direct action for test
                    state, reward, done, _ = env.step(action)
                    total_reward += reward
                    if done:
                        break
            ave_reward = total_reward / TEST
            print('episode: ', episode, 'Evaluation Average Reward:', ave_reward)
        agent.update_target_q_network(episode)

if __name__ == '__main__':
    main()