[Chatting About TensorLayer]
Source code: .py
Below, I annotate the code with reference to the book 《深度学习:一起玩转TensorLayer》 (Deep Learning: Playing with TensorLayer). Experts need not read on; this is aimed at newcomers to deep reinforcement learning and to TensorLayer, a high-level deep-learning AI library. Additions will follow.
#! /usr/bin/python
# -*- coding: utf-8 -*-
"""
Monte-Carlo Policy Network pi(a|s) (REINFORCE).

To understand reinforcement learning, we let the computer learn to play
Pong from the raw screen inputs. Before starting, we highly recommend the
famous blog post "Deep Reinforcement Learning: Pong from Pixels", a
minimalistic implementation of deep reinforcement learning using
python-numpy and the OpenAI Gym environment.

The code here reimplements Karpathy's blog post using TensorLayer. Compared
with Karpathy's code, we store observations for a whole batch, while he
stores observations for a single episode only and stores gradients instead
(so we use more memory if the observations are very large).

Link
-----
/
"""
import time

import gym
import numpy as np
import tensorflow as tf
import tensorlayer as tl
from tensorlayer.layers import DenseLayer, InputLayer

tf.logging.set_verbosity(tf.logging.DEBUG)
tl.logging.set_verbosity(tl.logging.DEBUG)

# hyper-parameters
image_size = 80
D = image_size * image_size
H = 200
batch_size = 10
learning_rate = 1e-4
gamma = 0.99
decay_rate = 0.99
render = True  # whether to render the game screen
# resume = True  # load an existing policy network
model_file_name = "model_pong"
np.set_printoptions(threshold=np.nan)


def prepro(I):
    """Preprocess a 210x160x3 uint8 frame into a 6400 (80x80) 1-D float vector."""
    I = I[35:195]        # crop the playing field
    I = I[::2, ::2, 0]   # downsample by a factor of 2 and keep one channel
    I[I == 144] = 0      # erase background type 1
    I[I == 109] = 0      # erase background type 2
    I[I != 0] = 1        # set paddles and ball to 1
    return I.astype(np.float).ravel()


env = gym.make("Pong-v0")
observation = env.reset()
prev_x = None
running_reward = None
reward_sum = 0
episode_number = 0

# buffers holding all states, actions and rewards for batch_size episodes
xs, ys, rs = [], [], []
# observations for training and inference
t_states = tf.placeholder(tf.float32, shape=[None, D])
# policy network definition
network = InputLayer(t_states, name='input')
network = DenseLayer(network, n_units=H, act=tf.nn.relu, name='hidden')
network = DenseLayer(network, n_units=3, name='output')
probs = network.outputs
sampling_prob = tf.nn.softmax(probs)

# the objective is the product of the discounted cumulative reward and the
# cross-entropy, implemented by cross_entropy_reward_loss
t_actions = tf.placeholder(tf.int32, shape=[None])
t_discount_rewards = tf.placeholder(tf.float32, shape=[None])
loss = tl.rein.cross_entropy_reward_loss(probs, t_actions, t_discount_rewards)
train_op = tf.train.RMSPropOptimizer(learning_rate, decay_rate).minimize(loss)
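Before the training loop, it is worth unpacking the loss. tl.rein.cross_entropy_reward_loss treats the sampled actions as "fake labels" and weights the per-step cross-entropy by the discounted return, so steps taken on the way to a win are reinforced and losing steps are suppressed. Roughly, it amounts to the following standalone sketch, not part of the program (the helper name reinforce_loss is mine, not TensorLayer's):

import tensorflow as tf

def reinforce_loss(logits, actions, discounted_rewards):
    # negative log-probability of each sampled action under the current policy
    neg_log_prob = tf.nn.sparse_softmax_cross_entropy_with_logits(
        labels=actions, logits=logits)
    # weight every step by its (normalized) discounted return and sum
    return tf.reduce_sum(neg_log_prob * discounted_rewards)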
with tf.Session() as sess:
    tl.layers.initialize_global_variables(sess)
    # if resume:
    #     load_params = tl.files.load_npz(name=model_file_name + '.npz')
    #     tl.files.assign_params(sess, load_params, network)
    tl.files.load_and_assign_npz(sess, model_file_name + '.npz', network)
    network.print_params()
    network.print_layers()

    start_time = time.time()
    game_number = 0
    while True:
        # render the game screen if requested
        if render:
            env.render()

        # the state is the difference between the current frame and the previous one
        cur_x = prepro(observation)
        x = cur_x - prev_x if prev_x is not None else np.zeros(D)
        x = x.reshape(1, D)
        prev_x = cur_x

        # the policy network outputs a probability for each action
        prob = sess.run(sampling_prob, feed_dict={t_states: x})

        # sample an action from the output probabilities. 1: STOP  2: UP  3: DOWN
        # action = np.random.choice([1, 2, 3], p=prob.flatten())
        action = tl.rein.choice_action_by_probs(prob.flatten(), [1, 2, 3])

        # take the action; receive the new observation and the reward
        observation, reward, done, _ = env.step(action)
        # accumulate the episode reward (for logging only)
        reward_sum += reward
        # store this step's state, action and reward
        xs.append(x)           # all observations in an episode
        ys.append(action - 1)  # all fake labels in an episode (actions start from 1, so minus 1)
        rs.append(reward)      # all rewards in an episode

        # when an episode ends
        if done:
            episode_number += 1
            game_number = 0

            # update the network once every batch_size episodes
            if episode_number % batch_size == 0:
                print('batch over...... updating parameters......')
                epx = np.vstack(xs)
                epy = np.asarray(ys)
                epr = np.asarray(rs)
                # compute discounted cumulative rewards and normalize them,
                # which makes training more stable
                disR = tl.rein.discount_episode_rewards(epr, gamma)
                disR -= np.mean(disR)
                disR /= np.std(disR)
                # clear the buffers for the next batch
                xs, ys, rs = [], [], []
                # update the policy network
                sess.run(train_op, feed_dict={t_states: epx, t_actions: epy, t_discount_rewards: disR})

            # save the model periodically for later testing
            if episode_number % (batch_size * 100) == 0:
                tl.files.save_npz(network.all_params, name=model_file_name + '.npz')

            # at the end of each episode, print the running reward
            running_reward = reward_sum if running_reward is None else running_reward * 0.99 + reward_sum * 0.01
            print('resetting env. episode reward total was %f. running mean: %f' % (reward_sum, running_reward))
            # reset the environment
            reward_sum = 0
            observation = env.reset()
            prev_x = None

        # when a single game (one point) ends, print info; an episode contains many games
        if reward != 0:
            print(
                ('episode %d: game %d took %.5fs, reward: %f' %
                 (episode_number, game_number, time.time() - start_time, reward)),
                ('' if reward == -1 else ' !!!!!!!!'))
            start_time = time.time()
            game_number += 1
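The discounting performed by tl.rein.discount_episode_rewards can be sketched in a few lines of numpy. Note the Pong-specific detail: a nonzero reward means a point was just scored, so the running sum is reset at game boundaries, mirroring Karpathy's discount_rewards (the function name below is illustrative):

import numpy as np

def discount_rewards(r, gamma=0.99):
    """Roughly what tl.rein.discount_episode_rewards does for Pong."""
    discounted = np.zeros_like(r, dtype=np.float64)
    running_add = 0.0
    for t in reversed(range(len(r))):
        if r[t] != 0:
            running_add = 0.0  # a point was scored: reset the sum (Pong-specific)
        running_add = running_add * gamma + r[t]
        discounted[t] = running_add
    return discounted

# e.g. rewards [0, 0, 1] become [gamma^2, gamma, 1]
print(discount_rewards(np.array([0., 0., 1.])))  # [0.9801 0.99 1.]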
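The model saved as model_pong.npz every batch_size * 100 episodes can then be loaded in a separate test script. A minimal sketch, assuming the graph-building code above (prepro, env, t_states, sampling_prob, network, D) has already run; here the action is chosen greedily with argmax instead of sampling:

import numpy as np
import tensorflow as tf
import tensorlayer as tl

with tf.Session() as sess:
    tl.layers.initialize_global_variables(sess)
    tl.files.load_and_assign_npz(sess, model_file_name + '.npz', network)
    observation = env.reset()
    prev_x = None
    while True:
        env.render()
        cur_x = prepro(observation)
        x = cur_x - prev_x if prev_x is not None else np.zeros(D)
        prev_x = cur_x
        prob = sess.run(sampling_prob, feed_dict={t_states: x.reshape(1, D)})
        action = np.argmax(prob.flatten()) + 1  # greedy: 1 STOP, 2 UP, 3 DOWN
        observation, reward, done, _ = env.step(action)
        if done:
            observation, prev_x = env.reset(), None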