Chapter 1

Chapter 1 - 9: Solve OpenAI Gym’s Taxi-v2 Task

Copy each of the three code snippets below into a Python file named after its subheading, place the three files in the same folder, and then run

python main.py

to start training.
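For reference, the resulting folder should look roughly like this (the folder name itself is arbitrary):

taxi/
├── agent.py
├── main.py
└── monitor.py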

agent.py

import numpy as np
from collections import defaultdict


class Agent:

    def __init__(self, nA=6):
        """ Initialize agent.

        Params
        ======
        - nA: number of actions available to the agent
        - Q: state-action values matrix
        - epsilon: exploration rate of the epsilon-greedy strategy
        """
        self.nA = nA
        self.Q = defaultdict(lambda: np.zeros(self.nA))
        self.epsilon = 1  # start from an equiprobable random policy

    def update_Q(self, Qsa, Qsa_next, reward, alpha, gamma):
        """ Updates the action-value function estimate using the most recent time step. """
        return Qsa + (alpha * (reward + (gamma * Qsa_next) - Qsa))

    def epsilon_greedy_probs(self, Q_s, i_episode, eps=None):
        """ Obtains the action probabilities corresponding to an epsilon-greedy policy. """
        epsilon = 1.0 / i_episode
        if eps is not None:
            epsilon = eps
        policy_s = np.ones(self.nA) * epsilon / self.nA
        policy_s[np.argmax(Q_s)] = 1 - epsilon + (epsilon / self.nA)
        return policy_s

    def select_action(self, state, i_episode, eps=None):
        """ Given the state, select an action.

        Params
        ======
        - state: the current state of the environment
        - i_episode: number of the current episode

        Returns
        =======
        - action: an integer, compatible with the task's action space
        """
        # get epsilon-greedy action probabilities
        policy_s = self.epsilon_greedy_probs(self.Q[state], i_episode, eps)
        return np.random.choice(np.arange(self.nA), p=policy_s)

    def step(self, state, action, reward, next_state, done, i_episode, alpha=0.01, gamma=1):
        """ Update the agent's knowledge, using the most recently sampled tuple.

        Params
        ======
        - state: the previous state of the environment
        - action: the agent's previous choice of action
        - reward: last reward received
        - next_state: the current state of the environment
        - done: whether the episode is complete (True or False)
        """
        # Alternative (commented out in the original): a plain Q-learning update
        # self.Q[state][action] += alpha * (reward + (gamma * np.max(self.Q[next_state])) - self.Q[state][action])

        # Expected SARSA update: bootstrap from the expectation of Q at the
        # next state under the current epsilon-greedy policy.
        Qsa = self.Q[state][action]
        policy_s = self.epsilon_greedy_probs(self.Q[next_state], i_episode)
        Qsa_next = np.dot(self.Q[next_state], policy_s)
        self.Q[state][action] = self.update_Q(Qsa, Qsa_next, reward, alpha, gamma)
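The step method here amounts to an Expected SARSA update: the bootstrap target is the expectation of Q over the epsilon-greedy policy at the next state, rather than the greedy maximum used by plain Q-learning. If you want to check the agent in isolation before training, a minimal sketch along these lines should work; the file name, state indices, and reward below are made up purely for illustration, and it assumes agent.py sits in the same folder:

# quick_check.py -- illustrative only, not one of the three project files
from agent import Agent

agent = Agent(nA=6)

# pick an action for a made-up state in episode 1
action = agent.select_action(state=0, i_episode=1, eps=0.005)
print("chosen action:", action)

# feed the agent one made-up transition and inspect the updated Q-row
agent.step(state=0, action=action, reward=-1, next_state=42, done=False, i_episode=1)
print("updated Q[0]:", agent.Q[0])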

main.py

from agent import Agent
from monitor import interact
import gym
import numpy as np

env = gym.make('Taxi-v3')
agent = Agent()
avg_rewards, best_avg_reward = interact(env, agent)
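interact returns the running 100-episode average rewards, so after training you can inspect the learning curve. A minimal sketch, assuming matplotlib is installed (this plotting snippet is not part of the original three files and can simply be appended to main.py):

# plot the moving-average reward collected by monitor.interact
import matplotlib.pyplot as plt

plt.plot(list(avg_rewards))
plt.xlabel('Episode (starting from episode 100)')
plt.ylabel('Average reward over the last 100 episodes')
plt.show()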

monitor.py

from collections import deque
import sys
import math
import numpy as np


def interact(env, agent, num_episodes=1000000, window=100):
    """ Monitor agent's performance.

    Params
    ======
    - env: instance of OpenAI Gym's Taxi environment
    - agent: instance of class Agent (see agent.py for details)
    - num_episodes: number of episodes of agent-environment interaction
    - window: number of episodes to consider when calculating average rewards

    Returns
    =======
    - avg_rewards: deque containing average rewards
    - best_avg_reward: largest value in the avg_rewards deque
    """
    # initialize average rewards
    avg_rewards = deque(maxlen=num_episodes)
    # initialize best average reward
    best_avg_reward = -math.inf
    # initialize monitor for most recent rewards
    samp_rewards = deque(maxlen=window)
    # for each episode
    for i_episode in range(1, num_episodes + 1):
        # begin the episode
        state = env.reset()
        # initialize the sampled reward
        samp_reward = 0
        while True:
            # agent selects an action
            action = agent.select_action(state, i_episode, eps=0.005)
            # agent performs the selected action
            next_state, reward, done, _ = env.step(action)
            # agent performs internal updates based on sampled experience
            agent.step(state, action, reward, next_state, done, i_episode)
            # update the sampled reward
            samp_reward += reward
            # update the state (s <- s') to the next time step
            state = next_state
            if done:
                # save final sampled reward
                samp_rewards.append(samp_reward)
                break
        if i_episode >= 100:
            # get average reward from last 100 episodes
            avg_reward = np.mean(samp_rewards)
            # append to deque
            avg_rewards.append(avg_reward)
            # update best average reward
            if avg_reward > best_avg_reward:
                best_avg_reward = avg_reward
        # monitor progress
        print("\rEpisode {}/{} || Best average reward {}".format(i_episode, num_episodes, best_avg_reward), end="")
        sys.stdout.flush()
        # check if task is solved (according to OpenAI Gym)
        if best_avg_reward >= 9.7:
            print('\nEnvironment solved in {} episodes.'.format(i_episode), end="")
            break
        if i_episode == num_episodes:
            print('\n')
    return avg_rewards, best_avg_reward
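Note that monitor.py targets the older gym API, where env.reset() returns only the initial state and env.step() returns four values. On recent gym (0.26+) or gymnasium releases the inner loop would need small adjustments roughly along these lines; this is a sketch under that assumption, not tested against any particular version:

# adaptation sketch for the newer gym / gymnasium API
state, _ = env.reset()                      # reset() now returns (state, info)
samp_reward = 0
while True:
    action = agent.select_action(state, i_episode, eps=0.005)
    # step() now returns five values; the episode ends when either flag is True
    next_state, reward, terminated, truncated, _ = env.step(action)
    done = terminated or truncated
    agent.step(state, action, reward, next_state, done, i_episode)
    samp_reward += reward
    state = next_state
    if done:
        samp_rewards.append(samp_reward)
        break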
