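"""
DQN.py

A minimal Deep Q-Network (DQN) agent for the CartPole-v0 gym environment.
The agent uses a small fully connected network as a Q-function approximator,
an epsilon-greedy policy to balance exploration and exploitation, and
experience replay over a fixed-size memory for training.
"""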
import gym
import random
import numpy as np
from keras import Sequential
from keras import optimizers
from keras.layers import Dense
from collections import deque
## Hyperparameters:
# episodes: number of training episodes
episodes = 10000
# time: maximum number of steps per episode
time = 1000
# gamma: discount rate
gamma = 0.95
batch_size = 32
# learning rate: determines how much the network weights are adjusted on each update
learning_rate = 0.001
memory_len = 10000
# use epsilon-greedy to deal with the trade-off between exploration and exploitation
# epsilon_min: guarantees a minimum level of exploration
epsilon_min = 0.01
# epsilon_decay: decay rate
epsilon_decay = 0.995
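# Note: epsilon is multiplied by epsilon_decay once per training step (after each
# replay call below), so it falls from 1.0 to epsilon_min after roughly
# ln(0.01) / ln(0.995) ≈ 919 decay steps.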
class Agent:
    def __init__(self, state_size, action_size):
        self.state_size = state_size
        self.action_size = action_size
        # initialize epsilon: exploration rate
        self.epsilon = 1.0
        # use a deque to collect experiences in a fixed-size replay memory
        self.memory = deque(maxlen=memory_len)
        self.model = self.build_model()
    def build_model(self):
        # create the layer stack
        model = Sequential()
        # input layer with dimension of the state size
        model.add(Dense(24, input_dim=self.state_size, activation='relu'))
        # hidden layer
        model.add(Dense(24, activation='relu'))
        # output layer: one linear Q-value per action
        model.add(Dense(self.action_size, activation='linear'))
        # compile the model (use lr=learning_rate instead on older Keras versions)
        model.compile(loss='mse', optimizer=optimizers.RMSprop(learning_rate=learning_rate))
        return model
    def remember(self, state, action, reward, next_state, done):
        # collect the experience in replay memory
        self.memory.append((state, action, reward, next_state, done))
    def act(self, state):
        # use epsilon-greedy to deal with the trade-off between exploration and exploitation
        if random.random() <= self.epsilon:
            # exploration: act randomly
            return random.randrange(self.action_size)
        else:
            # exploitation: pick the action with the highest predicted Q-value
            return np.argmax(self.model.predict(state)[0])
    def replay(self):
        # experience replay: randomly sample a minibatch from memory
        mini_batch = random.sample(self.memory, batch_size)
        # process each experience from the minibatch
        for state, action, reward, next_state, done in mini_batch:
            if done:
                # the episode terminated at this transition, so the target is just the reward
                target = reward
            else:
                # Q-learning target: r + gamma * max_a' Q(next_state, a')
                target = reward + gamma * np.max(self.model.predict(next_state)[0])
            # predict the Q-values of the current state
            prediction = self.model.predict(state)
            # update the Q-value of the chosen action toward the target
            prediction[0][action] = target
            self.model.fit(state, prediction, epochs=1, verbose=0)
if __name__ == "__main__":
    # initialize gym environment (classic gym API: reset() returns the state,
    # step() returns (next_state, reward, done, info))
    env = gym.make('CartPole-v0')
    state_size = env.observation_space.shape[0]
    action_size = env.action_space.n
    # initialize agent
    agent = Agent(state_size, action_size)
    for e in range(episodes):
        state = env.reset()
        state = np.reshape(state, [1, state_size])
        for t in range(time):
            # env.render()
            action = agent.act(state)
            next_state, reward, done, info = env.step(action)
            next_state = np.reshape(next_state, [1, state_size])
            # reshape the reward: penalize the transition that ends the episode
            reward = 1 if not done else -10
            agent.remember(state, action, reward, next_state, done)
            state = next_state
            if done:
                print("episode: {}/{}, score: {}, e: {:.2}".format(e, episodes, t, agent.epsilon))
                break
            if len(agent.memory) > batch_size:
                agent.replay()
                # epsilon decay: reduce exploration as the agent learns
                agent.epsilon = max(agent.epsilon * epsilon_decay, epsilon_min)