TechTogetWorld

This post is about implementing artificial intelligence (Deep Reinforcement Learning).


The post is organized as follows.


================================================

Summary

 - Implement Q-Learning with a neural network (Q-network); a Q-table needs exponentially more memory as the state space grows.

  ==> The Q-table approach is impractical for real-world problems, hence the neural-network approach (see the sketch after this outline).


1. q_net_frozenlake

 - Replace the Q-table with a network


2. 07_3_dqn_2015_cartpole

  - Q-network issues

   1) Too little data makes the fit unreliable: train on just two samples and you can get a completely different line.

      . Go deep

      . Experience replay: after each action, store the state, action, reward, etc. in a buffer, then sample from it at random (evenly) and train on the minibatch.

   2) The target moves (the same network produces both the prediction and the target, so updating the prediction also shifts the target) => like moving the target the moment the arrow is shot.

      . Build a second (target) network (each is updated separately; the main network's weights are copied over before training resumes).


3. Next Step

  ==> Implement Q-Learning with a neural network (Neural Network)


4. References

=================================================
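To make the memory argument in the summary concrete, here is a minimal sketch (illustrative sizes, not part of the original code) contrasting a tabular Q-function with a linear Q-network over one-hot states:

import numpy as np

# A Q-table stores one value per (state, action) pair, so memory grows
# with the state space: fine for FrozenLake's 4x4 grid, hopeless for
# large or continuous state spaces.
n_states, n_actions = 16, 4                      # FrozenLake-v0
q_table = np.zeros((n_states, n_actions))        # 64 entries

# A Q-network replaces the lookup with a parameterized function Q(s; W).
# With one-hot states and a single linear layer the parameter count equals
# the table size, but the same machinery scales: swap the one-hot input
# for features and add layers, and states share parameters instead of
# each getting its own table row.
W = np.random.uniform(0, 0.01, size=(n_states, n_actions))

def q_network(state):
    # one matrix multiply: Q(s) = one_hot(s) @ W
    return np.identity(n_states)[state:state + 1] @ W

print(q_table[3])     # table lookup for state 3
print(q_network(3))   # network prediction for state 3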




[ 06_q_net_frozenlake ]



'''

06_q_net_frozenlake


This code is based on

https://github.com/hunkim/DeepRL-Agents

'''

import gym

import numpy as np

import matplotlib.pyplot as plt

import time

import os

os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'    # default value = 0  From http://stackoverflow.com/questions/35911252/disable-tensorflow-debugging-information


import tensorflow as tf

env = gym.make('FrozenLake-v0')


# Input and output size based on the Env

input_size = env.observation_space.n

output_size = env.action_space.n

learning_rate = 0.1


# These lines establish the feed-forward part of the network used to choose actions

X = tf.placeholder(shape=[1, input_size], dtype=tf.float32)              # state input

W = tf.Variable(tf.random_uniform([input_size, output_size], 0, 0.01))   # weight


Qpred = tf.matmul(X, W)     # Out Q prediction

Y = tf.placeholder(shape=[1, output_size], dtype=tf.float32)    # Y label


loss = tf.reduce_sum(tf.square(Y-Qpred))

train = tf.train.GradientDescentOptimizer(learning_rate=learning_rate).minimize(loss)


# Set Q-learning parameters

dis = .99

num_episodes = 2000


# create lists to contain total rewards and steps per episode

rList = []


def one_hot(x):

    # Encode state index x as a 1 x 16 one-hot row vector (FrozenLake has 16 states)

    return np.identity(16)[x:x+1]


start_time = time.time()


init = tf.global_variables_initializer()

with tf.Session() as sess:

    sess.run(init)

    for i in range(num_episodes):

        # Reset environment and get first new observation

        s = env.reset()

        e = 1. / ((i / 50) + 10)

        rAll = 0

        done = False

        local_loss = []


        # The Q-Table learning algorithm

        while not done:

            # Choose an action greedily (with a chance of random action)

            # from the Q-network

            Qs = sess.run(Qpred, feed_dict={X: one_hot(s)})

            if np.random.rand(1) < e:

                a = env.action_space.sample()

            else:

                a = np.argmax(Qs)


            # Get new state and reward from environment

            s1, reward, done, _ = env.step(a)

            if done:

                # Update Q; there is no Q(s1) term since it's a terminal state

                Qs[0, a] = reward

            else:

                # Obtain the Q(s1) values by feeding the new state through our network

                Qs1 = sess.run(Qpred, feed_dict={X: one_hot(s1)})

                # Update Q

                Qs[0, a] = reward + dis*np.max(Qs1)


            # Train our network using target (Y) and predicted Q (Qpred) values

            sess.run(train, feed_dict={X: one_hot(s), Y: Qs})


            rAll += reward

            s = s1


        rList.append(rAll)


print("--- %s seconds ---" % (time.time() - start_time))


print("Success rate: " + str(sum(rList) / num_episodes))


plt.bar(range(len(rList)), rList, color='b', alpha=0.4)

plt.show()
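A quick worked example of the non-terminal label computed in the loop above, Qs[0, a] = reward + dis * max(Qs1); the Qs1 values here are made up for illustration:

import numpy as np

reward, dis = 0.0, 0.99
Qs1 = np.array([[0.01, 0.05, 0.02, 0.03]])   # hypothetical next-state prediction

target = reward + dis * np.max(Qs1)          # 0.0 + 0.99 * 0.05
print(target)                                # 0.0495, written into Qs[0, a]
# On a terminal step the bootstrap term is dropped and the label is just the
# reward: 1.0 at the goal, 0.0 in a hole.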



[ 07_3_dqn_2015_cartpole ]


"""

07_3_dqn_2015_cartpole


This code is based on

https://github.com/hunkim/DeepRL-Agents


CF https://github.com/golbin/TensorFlow-Tutorials

https://github.com/dennybritz/reinforcement-learning/blob/master/DQN/dqn.py


Q-network issues

 1. Too little data makes the fit unreliable: train on just two samples and you can get a completely different line.

   - Go deep

   - Experience replay: after each action, store the state, action, reward, etc. in a buffer, then sample from it at random (evenly) and train on the minibatch.

 2. The target moves (the same network produces both the prediction and the target, so updating the prediction also shifts the target) => like moving the target the moment the arrow is shot.

    - Build a second (target) network.

"""


import numpy as np

import tensorflow as tf

import random

from collections import deque

import dqn    # dqn.py from hunkim/DeepRL-Agents provides the DQN class used below


import gym

from gym import wrappers


env = gym.make('CartPole-v0')


# Constants defining our neural network

input_size = env.observation_space.shape[0]

output_size = env.action_space.n


dis = 0.9

REPLAY_MEMORY = 50000


def replay_train(mainDQN, targetDQN, train_batch):

    x_stack = np.empty(0).reshape(0, input_size)

    y_stack = np.empty(0).reshape(0, output_size)


    # Get stored information from the buffer

    for state, action, reward, next_state, done in train_batch:

        Q = mainDQN.predict(state)


        # terminal?

        if done:

            Q[0, action] = reward

        else:

            # get target from target DQN (Q')

            Q[0, action] = reward + dis * np.max(targetDQN.predict(next_state))


        y_stack = np.vstack([y_stack, Q])

        x_stack = np.vstack([x_stack, state])


    # Train our network using target and predicted Q values on each episode

    return mainDQN.update(x_stack, y_stack)


def ddqn_replay_train(mainDQN, targetDQN, train_batch):

    """Double DQN implementation.

    param mainDQN: main DQN

    param targetDQN: target DQN

    param train_batch: minibatch for train

    return: loss
    """

    x_stack = np.empty(0).reshape(0, mainDQN.input_size)

    y_stack = np.empty(0).reshape(0, mainDQN.output_size)


    # Get stored information from the buffer

    for state, action, reward, next_state, done in train_batch:

        Q = mainDQN.predict(state)


        # terminal?

        if done:

            Q[0, action] = reward

        else:

            # Double DQN: y = r + gamma * targetDQN(s')[a] where

            # a = argmax(mainDQN(s'))

            Q[0, action] = reward + dis * targetDQN.predict(next_state)[0, np.argmax(mainDQN.predict(next_state))]


        y_stack = np.vstack([y_stack, Q])

        x_stack = np.vstack([x_stack, state])


    # Train our network using target and predicted Q values on each episode

    return mainDQN.update(x_stack, y_stack)
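# Side-by-side, the two training targets computed above:
#   DQN  (replay_train):       y = r + dis * max(targetDQN.predict(s'))
#                              (target net both selects and evaluates the action)
#   DDQN (ddqn_replay_train):  y = r + dis * targetDQN.predict(s')[0, argmax(mainDQN.predict(s'))]
#                              (main net selects the action, target net evaluates it)
# Decoupling selection from evaluation reduces the over-estimation bias that a
# max over noisy Q estimates introduces.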


def get_copy_var_ops(*, dest_scope_name="target", src_scope_name="main"):


    # Copy variables src_scope to dest_scope

    op_holder = []


    src_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=src_scope_name)

    dest_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=dest_scope_name)


    for src_var, dest_var in zip(src_vars, dest_vars):

        op_holder.append(dest_var.assign(src_var.value()))


    return op_holder
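# Note: these assign ops are the periodic hard update of the 2015 DQN,
# target weights <- main weights. main() runs them via sess.run(copy_ops)
# once at start-up and again after every training round, so the target
# network stays frozen between copies and the regression target no longer
# chases the network being trained.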


def bot_play(mainDQN, env=env):

    # See our trained network in action

    state = env.reset()

    reward_sum = 0

    while True:

        env.render()

        action = np.argmax(mainDQN.predict(state))

        state, reward, done, _ = env.step(action)

        reward_sum += reward

        if done:

            print("Total score: {}".format(reward_sum))

            break


def main():

    max_episodes = 5000

    # store the previous observations in replay memory

    replay_buffer = deque()


    with tf.Session() as sess:

        mainDQN = dqn.DQN(sess, input_size, output_size, name="main")

        targetDQN = dqn.DQN(sess, input_size, output_size, name="target")

        tf.global_variables_initializer().run()


        #initial copy q_net -> target_net

        copy_ops = get_copy_var_ops(dest_scope_name="target", src_scope_name="main")

        sess.run(copy_ops)


        for episode in range(max_episodes):

            e = 1. / ((episode / 10) + 1)

            done = False

            step_count = 0

            state = env.reset()


            while not done:

                if np.random.rand(1) < e:

                    action = env.action_space.sample()

                else:

                    # Choose an action greedily from the Q-network

                    action = np.argmax(mainDQN.predict(state))


                # Get new state and reward from environment

                next_state, reward, done, _ = env.step(action)

                if done: # Penalty

                    reward = -100


                # Save the experience to our buffer

                replay_buffer.append((state, action, reward, next_state, done))

                if len(replay_buffer) > REPLAY_MEMORY:

                    replay_buffer.popleft()


                state = next_state

                step_count += 1

                if step_count > 10000:   # Good enough. Let's move on

                    break


            print("Episode: {} steps: {}".format(episode, step_count))

            if step_count > 10000:

                pass

                # stop at 10,000 steps to avoid an infinite loop

                # break


            if episode % 10 == 1: # train every 10 episodes

                # Get a random batch of experiences

                for _ in range(50):

                    minibatch = random.sample(replay_buffer, 10)

                    loss, _ = ddqn_replay_train(mainDQN, targetDQN, minibatch)


                print("Loss: ", loss)

                # copy q_net -> target_net

                sess.run(copy_ops)


        # See our trained bot in action

        env2 = wrappers.Monitor(env, 'gym-results', force=True)


        for i in range(200):

            bot_play(mainDQN, env=env2)


        env2.close()



if __name__ == "__main__":

    main()
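The script above imports a DQN helper class from dqn.py in hunkim/DeepRL-Agents rather than defining it inline. For reference, a minimal sketch of the interface the code relies on (constructor taking sess, input_size, output_size, name, plus predict and update) might look like the following; the hidden-layer size, activation, and optimizer settings are assumptions, not necessarily the repository's exact values:

import numpy as np
import tensorflow as tf

class DQN:
    def __init__(self, session, input_size, output_size, name="main"):
        self.session = session
        self.input_size = input_size      # read by ddqn_replay_train
        self.output_size = output_size
        self.net_name = name
        self._build_network()

    def _build_network(self, h_size=10, l_rate=1e-1):
        # Build inside a variable scope so get_copy_var_ops can find the
        # trainable variables by scope name ("main" / "target").
        with tf.variable_scope(self.net_name):
            self._X = tf.placeholder(tf.float32, [None, self.input_size], name="input_x")
            W1 = tf.get_variable("W1", shape=[self.input_size, h_size],
                                 initializer=tf.contrib.layers.xavier_initializer())
            layer1 = tf.nn.tanh(tf.matmul(self._X, W1))
            W2 = tf.get_variable("W2", shape=[h_size, self.output_size],
                                 initializer=tf.contrib.layers.xavier_initializer())
            self._Qpred = tf.matmul(layer1, W2)

        self._Y = tf.placeholder(tf.float32, [None, self.output_size])
        self._loss = tf.reduce_mean(tf.square(self._Y - self._Qpred))
        self._train = tf.train.AdamOptimizer(learning_rate=l_rate).minimize(self._loss)

    def predict(self, state):
        # Reshape a single observation into a (1, input_size) batch.
        x = np.reshape(state, [1, self.input_size])
        return self.session.run(self._Qpred, feed_dict={self._X: x})

    def update(self, x_stack, y_stack):
        # Returns (loss, train_op result), matching "loss, _ = ..." in main().
        return self.session.run([self._loss, self._train],
                                feed_dict={self._X: x_stack, self._Y: y_stack})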



[ References ]


  https://www.inflearn.com/course/기본적인-머신러닝-딥러닝-강좌

  https://github.com/hunkim/deeplearningzerotoall

  https://www.tensorflow.org/api_docs/python/tf/layers

  https://www.inflearn.com/course/reinforcement-learning/