Commit 2dd7d307 authored by John Schulman

Add ACER, PPO2, and results_plotter.py

parent 6a3cbb4b
@@ -15,16 +15,18 @@ pip install -e .
```
- [A2C](baselines/a2c)
+- [ACER](baselines/acer)
- [ACKTR](baselines/acktr)
- [DDPG](baselines/ddpg)
- [DQN](baselines/deepq)
-- [PPO](baselines/ppo1)
+- [PPO1](baselines/ppo1) (Multi-CPU using MPI)
+- [PPO2](baselines/ppo2) (Optimized for GPU)
- [TRPO](baselines/trpo_mpi)
To cite this repository in publications:
@misc{baselines,
-author = {Hesse, Christopher and Plappert, Matthias and Radford, Alec and Schulman, John and Sidor, Szymon and Wu, Yuhuai},
+author = {Dhariwal, Prafulla and Hesse, Christopher and Plappert, Matthias and Radford, Alec and Schulman, John and Sidor, Szymon and Wu, Yuhuai},
title = {OpenAI Baselines},
year = {2017},
publisher = {GitHub},
@@ -238,7 +238,7 @@ def check_shape(ts,shapes):
def avg_norm(t):
return tf.reduce_mean(tf.sqrt(tf.reduce_sum(tf.square(t), axis=-1)))
-def myadd(g1, g2, param):
+def gradient_add(g1, g2, param):
print([g1, g2, param.name])
assert (not (g1 is None and g2 is None)), param.name
if g1 is None:
@@ -248,7 +248,7 @@ def myadd(g1, g2, param):
else:
return g1 + g2
-def my_explained_variance(qpred, q):
+def q_explained_variance(qpred, q):
_, vary = tf.nn.moments(q, axes=[0, 1])
_, varpred = tf.nn.moments(q - qpred, axes=[0, 1])
check_shape([vary, varpred], [[]] * 2)
# ACER
- Original paper: https://arxiv.org/abs/1611.01224
- `python -m baselines.acer.run_atari` runs the algorithm for 40M frames = 10M timesteps on an Atari game. See help (`-h`) for more options.
\ No newline at end of file
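
For illustration, here is a minimal sketch of calling the `learn` entry point from Python. The environment construction below (a `SubprocVecEnv` of DeepMind-wrapped Atari envs, and the exact import paths) is an assumption for this sketch, not the documented interface; `run_atari.py` remains the supported entry point.

```python
# Hypothetical usage sketch -- `python -m baselines.acer.run_atari` is the supported entry point.
import gym
from baselines.acer.acer_simple import learn
from baselines.acer.policies import AcerCnnPolicy
from baselines.common.vec_env.subproc_vec_env import SubprocVecEnv  # assumed import path
from baselines.common.atari_wrappers import wrap_deepmind           # assumed import path

def make_env(rank, env_id='BreakoutNoFrameskip-v4', seed=0):
    def _thunk():
        env = gym.make(env_id)
        env.seed(seed + rank)
        return wrap_deepmind(env)  # standard DeepMind Atari preprocessing
    return _thunk

env = SubprocVecEnv([make_env(i) for i in range(16)])  # 16 parallel actors
learn(AcerCnnPolicy, env, seed=0, nsteps=20, nstack=4, total_timesteps=int(10e6))
env.close()
```
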
import time
import joblib
import numpy as np
import tensorflow as tf
from baselines import logger
from baselines.common import set_global_seeds
from baselines.a2c.utils import batch_to_seq, seq_to_batch
from baselines.a2c.utils import Scheduler, make_path, find_trainable_variables
from baselines.a2c.utils import cat_entropy_softmax
from baselines.a2c.utils import EpisodeStats
from baselines.a2c.utils import get_by_index, check_shape, avg_norm, gradient_add, q_explained_variance
from baselines.acer.buffer import Buffer
# remove last step
def strip(var, nenvs, nsteps, flat = False):
vars = batch_to_seq(var, nenvs, nsteps + 1, flat)
return seq_to_batch(vars[:-1], flat)
def q_retrace(R, D, q_i, v, rho_i, nenvs, nsteps, gamma):
"""
Calculates q_retrace targets
:param R: Rewards
:param D: Dones
:param q_i: Q values for actions taken
:param v: V values
:param rho_i: Importance weight for each action
:return: Q_retrace values
"""
rho_bar = batch_to_seq(tf.minimum(1.0, rho_i), nenvs, nsteps, True) # list of len steps, shape [nenvs]
rs = batch_to_seq(R, nenvs, nsteps, True) # list of len steps, shape [nenvs]
ds = batch_to_seq(D, nenvs, nsteps, True) # list of len steps, shape [nenvs]
q_is = batch_to_seq(q_i, nenvs, nsteps, True)
vs = batch_to_seq(v, nenvs, nsteps + 1, True)
v_final = vs[-1]
qret = v_final
qrets = []
for i in range(nsteps - 1, -1, -1):
check_shape([qret, ds[i], rs[i], rho_bar[i], q_is[i], vs[i]], [[nenvs]] * 6)
qret = rs[i] + gamma * qret * (1.0 - ds[i])
qrets.append(qret)
qret = (rho_bar[i] * (qret - q_is[i])) + vs[i]
qrets = qrets[::-1]
qret = seq_to_batch(qrets, flat=True)
return qret
# For ACER with PPO clipping instead of trust region
# def clip(ratio, eps_clip):
# # assume 0 <= eps_clip <= 1
# return tf.minimum(1 + eps_clip, tf.maximum(1 - eps_clip, ratio))
class Model(object):
def __init__(self, policy, ob_space, ac_space, nenvs, nsteps, nstack, num_procs,
ent_coef, q_coef, gamma, max_grad_norm, lr,
rprop_alpha, rprop_epsilon, total_timesteps, lrschedule,
c, trust_region, alpha, delta):
config = tf.ConfigProto(allow_soft_placement=True,
intra_op_parallelism_threads=num_procs,
inter_op_parallelism_threads=num_procs)
sess = tf.Session(config=config)
nact = ac_space.n
nbatch = nenvs * nsteps
A = tf.placeholder(tf.int32, [nbatch]) # actions
D = tf.placeholder(tf.float32, [nbatch]) # dones
R = tf.placeholder(tf.float32, [nbatch]) # rewards, not returns
MU = tf.placeholder(tf.float32, [nbatch, nact]) # mu's
LR = tf.placeholder(tf.float32, [])
eps = 1e-6
step_model = policy(sess, ob_space, ac_space, nenvs, 1, nstack, reuse=False)
train_model = policy(sess, ob_space, ac_space, nenvs, nsteps + 1, nstack, reuse=True)
params = find_trainable_variables("model")
print("Params {}".format(len(params)))
for var in params:
print(var)
# create polyak averaged model
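# `ema` keeps shadow copies of the parameters, updated as avg <- alpha * avg + (1 - alpha) * param
# each time ema_apply_op runs; the custom getter below makes variable lookups in the reused scope
# return these averaged copies, so polyak_model is the same network evaluated with Polyak-averaged weights.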
ema = tf.train.ExponentialMovingAverage(alpha)
ema_apply_op = ema.apply(params)
def custom_getter(getter, *args, **kwargs):
v = ema.average(getter(*args, **kwargs))
print(v.name)
return v
with tf.variable_scope("", custom_getter=custom_getter, reuse=True):
polyak_model = policy(sess, ob_space, ac_space, nenvs, nsteps + 1, nstack, reuse=True)
# Notation: (var) = batch variable, (var)s = sequence variable, (var)_i = variable indexed by action at step i
v = tf.reduce_sum(train_model.pi * train_model.q, axis = -1) # shape is [nenvs * (nsteps + 1)]
# strip off last step
f, f_pol, q = map(lambda var: strip(var, nenvs, nsteps), [train_model.pi, polyak_model.pi, train_model.q])
# Get pi and q values for actions taken
f_i = get_by_index(f, A)
q_i = get_by_index(q, A)
# Compute ratios for importance truncation
rho = f / (MU + eps)
rho_i = get_by_index(rho, A)
# Calculate Q_retrace targets
qret = q_retrace(R, D, q_i, v, rho_i, nenvs, nsteps, gamma)
# Calculate losses
# Entropy
entropy = tf.reduce_mean(cat_entropy_softmax(f))
# Policy gradient loss, with truncated importance sampling & bias correction
v = strip(v, nenvs, nsteps, True)
check_shape([qret, v, rho_i, f_i], [[nenvs * nsteps]] * 4)
check_shape([rho, f, q], [[nenvs * nsteps, nact]] * 3)
# Truncated importance sampling
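# ACER policy gradient = truncated importance sampling term + bias correction term (cf. the ACER paper):
#   g = E_t[ min(c, rho_t) * grad log f(a_t|x_t) * (Q_ret - V) ]
#     + E_{a~f}[ relu(1 - c / rho(a)) * grad log f(a|x_t) * (Q(x_t, a) - V) ]
# The second expectation is computed below as a sum over actions weighted by f.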
adv = qret - v
logf = tf.log(f_i + eps)
gain_f = logf * tf.stop_gradient(adv * tf.minimum(c, rho_i)) # [nenvs * nsteps]
loss_f = -tf.reduce_mean(gain_f)
# Bias correction for the truncation
adv_bc = (q - tf.reshape(v, [nenvs * nsteps, 1])) # [nenvs * nsteps, nact]
logf_bc = tf.log(f + eps) # / (f_old + eps)
check_shape([adv_bc, logf_bc], [[nenvs * nsteps, nact]]*2)
gain_bc = tf.reduce_sum(logf_bc * tf.stop_gradient(adv_bc * tf.nn.relu(1.0 - (c / (rho + eps))) * f), axis=1)  # IMP: this is a sum, since it is an expectation w.r.t. f
loss_bc = -tf.reduce_mean(gain_bc)
loss_policy = loss_f + loss_bc
# Value/Q function loss, and explained variance
check_shape([qret, q_i], [[nenvs * nsteps]]*2)
ev = q_explained_variance(tf.reshape(q_i, [nenvs, nsteps]), tf.reshape(qret, [nenvs, nsteps]))
loss_q = tf.reduce_mean(tf.square(tf.stop_gradient(qret) - q_i)*0.5)
# Net loss
check_shape([loss_policy, loss_q, entropy], [[]] * 3)
loss = loss_policy + q_coef * loss_q - ent_coef * entropy
if trust_region:
g = tf.gradients(- (loss_policy - ent_coef * entropy) * nsteps * nenvs, f) #[nenvs * nsteps, nact]
# k = tf.gradients(KL(f_pol || f), f)
k = - f_pol / (f + eps) #[nenvs * nsteps, nact] # Directly computed gradient of KL divergence wrt f
k_dot_g = tf.reduce_sum(k * g, axis=-1)
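# Trust-region step from the ACER paper: project g onto the half-space {z : k^T z <= delta}
# (linearized KL constraint around the Polyak-averaged policy), i.e.
#   z* = g - max(0, (k^T g - delta) / ||k||^2) * k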
adj = tf.maximum(0.0, (tf.reduce_sum(k * g, axis=-1) - delta) / (tf.reduce_sum(tf.square(k), axis=-1) + eps)) #[nenvs * nsteps]
# Calculate stats (before doing adjustment) for logging.
avg_norm_k = avg_norm(k)
avg_norm_g = avg_norm(g)
avg_norm_k_dot_g = tf.reduce_mean(tf.abs(k_dot_g))
avg_norm_adj = tf.reduce_mean(tf.abs(adj))
g = g - tf.reshape(adj, [nenvs * nsteps, 1]) * k
grads_f = -g / (nenvs * nsteps)  # trust-region-adjusted gradients w.r.t. f, i.e. the statistics of policy pi
grads_policy = tf.gradients(f, params, grads_f)
grads_q = tf.gradients(loss_q * q_coef, params)
grads = [gradient_add(g1, g2, param) for (g1, g2, param) in zip(grads_policy, grads_q, params)]
avg_norm_grads_f = avg_norm(grads_f) * (nsteps * nenvs)
norm_grads_q = tf.global_norm(grads_q)
norm_grads_policy = tf.global_norm(grads_policy)
else:
grads = tf.gradients(loss, params)
if max_grad_norm is not None:
grads, norm_grads = tf.clip_by_global_norm(grads, max_grad_norm)
else:
norm_grads = tf.global_norm(grads)  # still logged below even when clipping is disabled
grads = list(zip(grads, params))
trainer = tf.train.RMSPropOptimizer(learning_rate=LR, decay=rprop_alpha, epsilon=rprop_epsilon)
_opt_op = trainer.apply_gradients(grads)
# so when you call _train, you first do the gradient step, then you apply ema
with tf.control_dependencies([_opt_op]):
_train = tf.group(ema_apply_op)
lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule)
# Ops/Summaries to run, and their names for logging
run_ops = [_train, loss, loss_q, entropy, loss_policy, loss_f, loss_bc, ev, norm_grads]
names_ops = ['loss', 'loss_q', 'entropy', 'loss_policy', 'loss_f', 'loss_bc', 'explained_variance',
'norm_grads']
if trust_region:
run_ops = run_ops + [norm_grads_q, norm_grads_policy, avg_norm_grads_f, avg_norm_k, avg_norm_g, avg_norm_k_dot_g,
avg_norm_adj]
names_ops = names_ops + ['norm_grads_q', 'norm_grads_policy', 'avg_norm_grads_f', 'avg_norm_k', 'avg_norm_g',
'avg_norm_k_dot_g', 'avg_norm_adj']
def train(obs, actions, rewards, dones, mus, states, masks, steps):
cur_lr = lr.value_steps(steps)
td_map = {train_model.X: obs, polyak_model.X: obs, A: actions, R: rewards, D: dones, MU: mus, LR: cur_lr}
if states != []:
td_map[train_model.S] = states
td_map[train_model.M] = masks
td_map[polyak_model.S] = states
td_map[polyak_model.M] = masks
return names_ops, sess.run(run_ops, td_map)[1:] # strip off _train
def save(save_path):
ps = sess.run(params)
make_path(save_path)
joblib.dump(ps, save_path)
self.train = train
self.save = save
self.train_model = train_model
self.step_model = step_model
self.step = step_model.step
self.initial_state = step_model.initial_state
tf.global_variables_initializer().run(session=sess)
class Runner(object):
def __init__(self, env, model, nsteps, nstack):
self.env = env
self.nstack = nstack
self.model = model
nh, nw, nc = env.observation_space.shape
self.nc = nc # nc = 1 for atari, but just in case
self.nenv = nenv = env.num_envs
self.nact = env.action_space.n
self.nbatch = nenv * nsteps
self.batch_ob_shape = (nenv*(nsteps+1), nh, nw, nc*nstack)
self.obs = np.zeros((nenv, nh, nw, nc * nstack), dtype=np.uint8)
obs = env.reset()
self.update_obs(obs)
self.nsteps = nsteps
self.states = model.initial_state
self.dones = [False for _ in range(nenv)]
def update_obs(self, obs, dones=None):
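# Keep the last `nstack` frames stacked along the channel axis: zero the whole stack for
# envs that just finished an episode, shift the oldest frame out, and write the new frame
# into the last `nc` channels.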
if dones is not None:
self.obs *= (1 - dones.astype(np.uint8))[:, None, None, None]
self.obs = np.roll(self.obs, shift=-self.nc, axis=3)
self.obs[:, :, :, -self.nc:] = obs[:, :, :, :]
def run(self):
enc_obs = np.split(self.obs, self.nstack, axis=3) # so now list of obs steps
mb_obs, mb_actions, mb_mus, mb_dones, mb_rewards = [], [], [], [], []
for _ in range(self.nsteps):
actions, mus, states = self.model.step(self.obs, state=self.states, mask=self.dones)
mb_obs.append(np.copy(self.obs))
mb_actions.append(actions)
mb_mus.append(mus)
mb_dones.append(self.dones)
obs, rewards, dones, _ = self.env.step(actions)
# state information for stateful models like LSTMs
self.states = states
self.dones = dones
self.update_obs(obs, dones)
mb_rewards.append(rewards)
enc_obs.append(obs)
mb_obs.append(np.copy(self.obs))
mb_dones.append(self.dones)
enc_obs = np.asarray(enc_obs, dtype=np.uint8).swapaxes(1, 0)
mb_obs = np.asarray(mb_obs, dtype=np.uint8).swapaxes(1, 0)
mb_actions = np.asarray(mb_actions, dtype=np.int32).swapaxes(1, 0)
mb_rewards = np.asarray(mb_rewards, dtype=np.float32).swapaxes(1, 0)
mb_mus = np.asarray(mb_mus, dtype=np.float32).swapaxes(1, 0)
mb_dones = np.asarray(mb_dones, dtype=np.bool).swapaxes(1, 0)
mb_masks = mb_dones # Used for stateful models like LSTMs to mask the state when done
mb_dones = mb_dones[:, 1:] # Used for calculating returns. The dones array is now aligned with rewards
# shapes are now [nenv, nsteps, []]
# When pulling from buffer, arrays will now be reshaped in place, preventing a deep copy.
return enc_obs, mb_obs, mb_actions, mb_rewards, mb_mus, mb_dones, mb_masks
class Acer():
def __init__(self, runner, model, buffer, log_interval):
self.runner = runner
self.model = model
self.buffer = buffer
self.log_interval = log_interval
self.tstart = None
self.episode_stats = EpisodeStats(runner.nsteps, runner.nenv)
self.steps = None
def call(self, on_policy):
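# On-policy: collect a fresh rollout with the current policy and (optionally) store it in the
# replay buffer. Off-policy: sample a stored rollout and train on it, using the stored behaviour
# probabilities `mus` for the importance weights.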
runner, model, buffer, steps = self.runner, self.model, self.buffer, self.steps
if on_policy:
enc_obs, obs, actions, rewards, mus, dones, masks = runner.run()
self.episode_stats.feed(rewards, dones)
if buffer is not None:
buffer.put(enc_obs, actions, rewards, mus, dones, masks)
else:
# get obs, actions, rewards, mus, dones from buffer.
obs, actions, rewards, mus, dones, masks = buffer.get()
# reshape stuff correctly
obs = obs.reshape(runner.batch_ob_shape)
actions = actions.reshape([runner.nbatch])
rewards = rewards.reshape([runner.nbatch])
mus = mus.reshape([runner.nbatch, runner.nact])
dones = dones.reshape([runner.nbatch])
masks = masks.reshape([runner.batch_ob_shape[0]])
names_ops, values_ops = model.train(obs, actions, rewards, dones, mus, model.initial_state, masks, steps)
if on_policy and (int(steps/runner.nbatch) % self.log_interval == 0):
logger.record_tabular("total_timesteps", steps)
logger.record_tabular("fps", int(steps/(time.time() - self.tstart)))
# IMP: In EpisodicLife env, during training, we get done=True at each loss of life, not just at the terminal state.
# Thus, these are means until end of life, not end of episode.
# For true episode rewards, see the monitor files in the log folder.
logger.record_tabular("mean_episode_length", self.episode_stats.mean_length())
logger.record_tabular("mean_episode_reward", self.episode_stats.mean_reward())
for name, val in zip(names_ops, values_ops):
logger.record_tabular(name, float(val))
logger.dump_tabular()
def learn(policy, env, seed, nsteps=20, nstack=4, total_timesteps=int(80e6), q_coef=0.5, ent_coef=0.01,
max_grad_norm=10, lr=7e-4, lrschedule='linear', rprop_epsilon=1e-5, rprop_alpha=0.99, gamma=0.99,
log_interval=100, buffer_size=50000, replay_ratio=4, replay_start=10000, c=10.0,
trust_region=True, alpha=0.99, delta=1):
print("Running Acer Simple")
print(locals())
tf.reset_default_graph()
set_global_seeds(seed)
nenvs = env.num_envs
ob_space = env.observation_space
ac_space = env.action_space
num_procs = len(env.remotes) # HACK
model = Model(policy=policy, ob_space=ob_space, ac_space=ac_space, nenvs=nenvs, nsteps=nsteps, nstack=nstack,
num_procs=num_procs, ent_coef=ent_coef, q_coef=q_coef, gamma=gamma,
max_grad_norm=max_grad_norm, lr=lr, rprop_alpha=rprop_alpha, rprop_epsilon=rprop_epsilon,
total_timesteps=total_timesteps, lrschedule=lrschedule, c=c,
trust_region=trust_region, alpha=alpha, delta=delta)
runner = Runner(env=env, model=model, nsteps=nsteps, nstack=nstack)
if replay_ratio > 0:
buffer = Buffer(env=env, nsteps=nsteps, nstack=nstack, size=buffer_size)
else:
buffer = None
nbatch = nenvs*nsteps
acer = Acer(runner, model, buffer, log_interval)
acer.tstart = time.time()
for acer.steps in range(0, total_timesteps, nbatch): # nbatch samples per iteration: one on-policy call plus (possibly) several off-policy calls
acer.call(on_policy=True)
if replay_ratio > 0 and buffer.has_atleast(replay_start):
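# Number of off-policy (replay) updates this iteration, drawn from Poisson(replay_ratio)
# as in the ACER paper, so replay_ratio is the expected replay-to-on-policy update ratio.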
n = np.random.poisson(replay_ratio)
for _ in range(n):
acer.call(on_policy=False) # no env simulation steps here; trains purely from the replay buffer
env.close()
import numpy as np
class Buffer(object):
# gets obs, actions, rewards, mu's, (states, masks), dones
def __init__(self, env, nsteps, nstack, size=50000):
self.nenv = env.num_envs
self.nsteps = nsteps
self.nh, self.nw, self.nc = env.observation_space.shape
self.nstack = nstack
self.nbatch = self.nenv * self.nsteps
self.size = size // (self.nsteps) # Each buffer slot holds one rollout of nenv * nsteps frames, so the buffer stores roughly nenv * size frames in total
# Memory
self.enc_obs = None
self.actions = None
self.rewards = None
self.mus = None
self.dones = None
self.masks = None
# Size indexes
self.next_idx = 0
self.num_in_buffer = 0
def has_atleast(self, frames):
# `frames` is per env, so (nenv * frames) frames are needed in total
# Each buffer loc has nenv * nsteps frames
return self.num_in_buffer >= (frames // self.nsteps)
def can_sample(self):
return self.num_in_buffer > 0
# Generate stacked frames
def decode(self, enc_obs, dones):
# enc_obs has shape [nenvs, nsteps + nstack, nh, nw, nc]
# dones has shape [nenv, nsteps]
# returns stacked obs of shape [nenv, (nsteps + 1), nh, nw, nstack*nc]
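# The loop below builds `nstack` progressively shifted copies of the frame sequence and masks
# frames with (1 - dones) so a stacked observation never mixes frames from two different
# episodes; the copies are then merged along the channel axis.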
nstack, nenv, nsteps, nh, nw, nc = self.nstack, self.nenv, self.nsteps, self.nh, self.nw, self.nc
y = np.empty([nsteps + nstack - 1, nenv, 1, 1, 1], dtype=np.float32)
obs = np.zeros([nstack, nsteps + nstack, nenv, nh, nw, nc], dtype=np.uint8)
x = np.reshape(enc_obs, [nenv, nsteps + nstack, nh, nw, nc]).swapaxes(1, 0) # [nsteps + nstack, nenv, nh, nw, nc]
y[nstack - 1:] = np.reshape(1.0 - dones, [nenv, nsteps, 1, 1, 1]).swapaxes(1, 0) # keep frames within an episode
y[:nstack - 1] = 1.0 # the first (nstack - 1) padding frames are always kept
# y = np.reshape(1 - dones, [nenvs, nsteps, 1, 1, 1])
for i in range(nstack):
obs[-(i + 1), i:] = x
# obs[:,i:,:,:,-(i+1),:] = x
x = x[:-1] * y
y = y[1:]
return np.reshape(obs[:, 3:].transpose((2, 1, 3, 4, 0, 5)), [nenv, (nsteps + 1), nh, nw, nstack * nc])
def put(self, enc_obs, actions, rewards, mus, dones, masks):
# enc_obs [nenv, (nsteps + nstack), nh, nw, nc]
# actions, rewards, dones [nenv, nsteps]
# mus [nenv, nsteps, nact]
if self.enc_obs is None:
self.enc_obs = np.empty([self.size] + list(enc_obs.shape), dtype=np.uint8)
self.actions = np.empty([self.size] + list(actions.shape), dtype=np.int32)
self.rewards = np.empty([self.size] + list(rewards.shape), dtype=np.float32)
self.mus = np.empty([self.size] + list(mus.shape), dtype=np.float32)
self.dones = np.empty([self.size] + list(dones.shape), dtype=np.bool)
self.masks = np.empty([self.size] + list(masks.shape), dtype=np.bool)
self.enc_obs[self.next_idx] = enc_obs
self.actions[self.next_idx] = actions
self.rewards[self.next_idx] = rewards
self.mus[self.next_idx] = mus
self.dones[self.next_idx] = dones
self.masks[self.next_idx] = masks
self.next_idx = (self.next_idx + 1) % self.size
self.num_in_buffer = min(self.size, self.num_in_buffer + 1)
def take(self, x, idx, envx):
nenv = self.nenv
out = np.empty([nenv] + list(x.shape[2:]), dtype=x.dtype)
for i in range(nenv):
out[i] = x[idx[i], envx[i]]
return out
def get(self):
# returns
# obs [nenv, (nsteps + 1), nh, nw, nstack*nc]
# actions, rewards, dones [nenv, nsteps]
# mus [nenv, nsteps, nact]
nenv = self.nenv
assert self.can_sample()
# Sample exactly one buffer index per env; sampling several rollouts from the same env would increase correlation within the batch.
idx = np.random.randint(0, self.num_in_buffer, nenv)
envx = np.arange(nenv)
take = lambda x: self.take(x, idx, envx)
dones = take(self.dones)
enc_obs = take(self.enc_obs)
obs = self.decode(enc_obs, dones)
actions = take(self.actions)
rewards = take(self.rewards)
mus = take(self.mus)
masks = take(self.masks)
return obs, actions, rewards, mus, dones, masks
import numpy as np
import tensorflow as tf
from baselines.a2c.utils import conv, fc, conv_to_fc, batch_to_seq, seq_to_batch, lstm, lnlstm, sample, check_shape
class AcerCnnPolicy(object):
def __init__(self, sess, ob_space, ac_space, nenv, nsteps, nstack, reuse=False):
nbatch = nenv * nsteps
nh, nw, nc = ob_space.shape
ob_shape = (nbatch, nh, nw, nc * nstack)
nact = ac_space.n
X = tf.placeholder(tf.uint8, ob_shape) # obs
with tf.variable_scope("model", reuse=reuse):
h = conv(tf.cast(X, tf.float32) / 255., 'c1', nf=32, rf=8, stride=4, init_scale=np.sqrt(2))
h2 = conv(h, 'c2', nf=64, rf=4, stride=2, init_scale=np.sqrt(2))
h3 = conv(h2, 'c3', nf=64, rf=3, stride=1, init_scale=np.sqrt(2))
h3 = conv_to_fc(h3)
h4 = fc(h3, 'fc1', nh=512, init_scale=np.sqrt(2))
pi_logits = fc(h4, 'pi', nact, act=lambda x: x, init_scale=0.01)
pi = tf.nn.softmax(pi_logits)
q = fc(h4, 'q', nact, act=lambda x: x)
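# Two heads on the shared CNN trunk: `pi` is the softmax policy used for acting and for the
# importance weights, `q` gives per-action Q-values used for the Retrace targets.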
a = sample(pi_logits) # could change this to use self.pi instead
self.initial_state = [] # not stateful
self.X = X
self.pi = pi # actual policy params now
self.q = q
def step(ob, *args, **kwargs):
# returns actions, mus, states
a0, pi0 = sess.run([a, pi], {X: ob})
return a0, pi0, [] # dummy state
def out(ob, *args, **kwargs):
pi0, q0 = sess.run([pi, q], {X: ob})
return pi0, q0
def act(ob, *args, **kwargs):
return sess.run(a, {X: ob})
self.step = step
self.out = out
self.act = act
class AcerLstmPolicy(object):
def __init__(self, sess, ob_space, ac_space, nenv, nsteps, nstack, reuse=False, nlstm=256):
nbatch = nenv * nsteps
nh, nw, nc = ob_space.shape
ob_shape = (nbatch, nh, nw, nc * nstack)
nact = ac_space.n
X = tf.placeholder(tf.uint8, ob_shape) # obs
M = tf.placeholder(tf.float32, [nbatch]) #mask (done t-1)
S = tf.placeholder(tf.float32, [nenv, nlstm*2]) #states
with tf.variable_scope("model", reuse=reuse):
h = conv(tf.cast(X, tf.float32) / 255., 'c1', nf=32, rf=8, stride=4, init_scale=np.sqrt(2))
h2 = conv(h, 'c2', nf=64, rf=4, stride=2, init_scale=np.sqrt(2))
h3 = conv(h2, 'c3', nf=64, rf=3, stride=1, init_scale=np.sqrt(2))
h3 = conv_to_fc(h3)
h4 = fc(h3, 'fc1', nh=512, init_scale=np.sqrt(2))
# lstm
xs = batch_to_seq(h4, nenv, nsteps)
ms = batch_to_seq(M, nenv, nsteps)
h5, snew = lstm(xs, ms, S, 'lstm1', nh=nlstm)
h5 = seq_to_batch(h5)
pi_logits = fc(h5, 'pi', nact, act=lambda x: x, init_scale=0.01)