diff --git a/docker/Dockerfile b/docker/Dockerfile
index 53a11ba25..d442dc3fb 100644
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -42,7 +42,7 @@ USER edward
 ARG python_version=3.5.3-0
 ARG python_qt_version=4
 RUN conda install -y python=${python_version} && \
-    pip install observations numpy six tensorflow keras prettytensor && \
+    pip install observations numpy six tensorflow keras && \
     pip install ipdb pytest pytest-cov python-coveralls coverage==3.7.1 pytest-xdist pep8 pytest-pep8 pydot_ng && \
     conda install Pillow scikit-learn matplotlib notebook pandas seaborn pyyaml h5py && \
     conda install -y pyqt=${python_qt_version} && \
diff --git a/docker/Dockerfile-gpu b/docker/Dockerfile-gpu
index a32bed4e2..90d74c9c5 100644
--- a/docker/Dockerfile-gpu
+++ b/docker/Dockerfile-gpu
@@ -42,7 +42,7 @@ USER edward
 # Python
 ARG python_version=3.5.3-0
 RUN conda install -y python=${python_version} && \
-    pip install observations numpy six tensorflow-gpu keras prettytensor && \
+    pip install observations numpy six tensorflow-gpu keras && \
     pip install ipdb pytest pytest-cov python-coveralls coverage==3.7.1 pytest-xdist pep8 pytest-pep8 pydot_ng && \
     conda install Pillow scikit-learn matplotlib notebook pandas seaborn pyyaml h5py && \
     pip install edward && \
diff --git a/docs/tex/bib.bib b/docs/tex/bib.bib
index 50521ce66..b5ba0e0e8 100644
--- a/docs/tex/bib.bib
+++ b/docs/tex/bib.bib
@@ -484,6 +484,13 @@ @inproceedings{welling2011bayesian
 year = {2011}
 }
 
+@inproceedings{wingate2011lightweight,
+  title={Lightweight implementations of probabilistic programming languages via transformational compilation},
+  author={Wingate, David and Stuhlmueller, Andreas and Goodman, Noah},
+  booktitle={Artificial Intelligence and Statistics},
+  year={2011}
+}
+
 @inproceedings{goodman2012church,
 author = {Goodman, Noah and Mansinghka, Vikash and Roy, Daniel M and Bonawitz, Keith and Tenenbaum, Joshua B},
 title = {{Church: a language for generative models}},
@@ -720,6 +727,13 @@ @article{johnson2016composing
   year = {2016},
 }
 
+@inproceedings{li2016preconditioned,
+  title={Preconditioned Stochastic Gradient Langevin Dynamics for Deep Neural Networks},
+  author={Li, Chunyuan and Chen, Changyou and Carlson, David E and Carin, Lawrence},
+  booktitle={Association for the Advancement of Artificial Intelligence},
+  year={2016}
+}
+
 @article{mohamed2016learning,
 author = {Mohamed, Shakir and Lakshminarayanan, Balaji},
 title = {{Learning in Implicit Generative Models}},
diff --git a/edward/__init__.py b/edward/__init__.py
index 5892f3b65..16e10475b 100644
--- a/edward/__init__.py
+++ b/edward/__init__.py
@@ -2,88 +2,74 @@
 from __future__ import division
 from __future__ import print_function
 
-from edward import criticisms
 from edward import inferences
 from edward import models
-from edward import util
 
 # Direct imports for convenience
-from edward.criticisms import (
-    evaluate, ppc, ppc_density_plot, ppc_stat_hist_plot)
 from edward.inferences import (
-    Inference, MonteCarlo, VariationalInference,
-    HMC, MetropolisHastings, SGLD, SGHMC,
-    KLpq, KLqp, ReparameterizationKLqp, ReparameterizationKLKLqp,
-    ReparameterizationEntropyKLqp, ScoreKLqp, ScoreKLKLqp, ScoreEntropyKLqp,
-    ScoreRBKLqp, WakeSleep, GANInference, BiGANInference, WGANInference,
-    ImplicitKLqp, MAP, Laplace, complete_conditional, Gibbs)
-from edward.models import RandomVariable
-from edward.util import (
-    check_data, check_latent_vars, copy, dot,
-    get_ancestors, get_blanket, get_children, get_control_variate_coef,
-    get_descendants, get_parents, get_session, get_siblings, get_variables,
-    is_independent, Progbar, random_variables, rbf, set_seed,
-    to_simplex, transform)
+    bigan_inference,
+    complete_conditional,
+    gan_inference,
+    hmc,
+    klpq,
+    klqp,
+    klqp_implicit,
+    klqp_reparameterization,
+    klqp_reparameterization_kl,
+    klqp_score,
+    laplace,
+    map,
+    metropolis_hastings,
+    sghmc,
+    sgld,
+    wake_sleep,
+    wgan_inference)
+from edward.models import (
+    call_with_manipulate,
+    get_ancestors,
+    get_blanket,
+    get_children,
+    get_descendants,
+    get_parents,
+    get_siblings,
+    get_variables,
+    is_independent,
+    random_variables)
 from edward.version import __version__, VERSION
 
 from tensorflow.python.util.all_util import remove_undocumented
 
 # Export modules and constants.
 _allowed_symbols = [
-    'criticisms',
     'inferences',
     'models',
-    'util',
-    'evaluate',
-    'ppc',
-    'ppc_density_plot',
-    'ppc_stat_hist_plot',
-    'Inference',
-    'MonteCarlo',
-    'VariationalInference',
-    'HMC',
-    'MetropolisHastings',
-    'SGLD',
-    'SGHMC',
-    'KLpq',
-    'KLqp',
-    'ReparameterizationKLqp',
-    'ReparameterizationKLKLqp',
-    'ReparameterizationEntropyKLqp',
-    'ScoreKLqp',
-    'ScoreKLKLqp',
-    'ScoreEntropyKLqp',
-    'ScoreRBKLqp',
-    'WakeSleep',
-    'GANInference',
-    'BiGANInference',
-    'WGANInference',
-    'ImplicitKLqp',
-    'MAP',
-    'Laplace',
+    'bigan_inference',
+    'call_with_manipulate',
     'complete_conditional',
-    'Gibbs',
-    'RandomVariable',
-    'check_data',
-    'check_latent_vars',
-    'copy',
-    'dot',
+    'gan_inference',
+    'hmc',
+    'klpq',
+    'klqp',
+    'klqp_implicit',
+    'klqp_reparameterization',
+    'klqp_reparameterization_kl',
+    'klqp_score',
+    'laplace',
+    'map',
+    'metropolis_hastings',
+    'sghmc',
+    'sgld',
+    'wake_sleep',
+    'wgan_inference',
     'get_ancestors',
     'get_blanket',
     'get_children',
-    'get_control_variate_coef',
     'get_descendants',
     'get_parents',
-    'get_session',
     'get_siblings',
     'get_variables',
     'is_independent',
-    'Progbar',
     'random_variables',
-    'rbf',
-    'set_seed',
-    'to_simplex',
-    'transform',
     '__version__',
     'VERSION',
 ]
@@ -91,5 +77,5 @@
 # Remove all extra symbols that don't have a docstring or are not explicitly
 # referenced in the whitelist.
 remove_undocumented(__name__, _allowed_symbols, [
-    criticisms, inferences, models, util
+    inferences, models
 ])
diff --git a/edward/criticisms/__init__.py b/edward/criticisms/__init__.py
deleted file mode 100644
index 5a9aff3d6..000000000
--- a/edward/criticisms/__init__.py
+++ /dev/null
@@ -1,20 +0,0 @@
-"""
-"""
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from edward.criticisms.evaluate import *
-from edward.criticisms.ppc import *
-from edward.criticisms.ppc_plots import *
-
-from tensorflow.python.util.all_util import remove_undocumented
-
-_allowed_symbols = [
-    'evaluate',
-    'ppc',
-    'ppc_density_plot',
-    'ppc_stat_hist_plot',
-]
-
-remove_undocumented(__name__, allowed_exception_list=_allowed_symbols)
diff --git a/edward/criticisms/evaluate.py b/edward/criticisms/evaluate.py
deleted file mode 100644
index 44074aa59..000000000
--- a/edward/criticisms/evaluate.py
+++ /dev/null
@@ -1,476 +0,0 @@
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import numpy as np
-import six
-import tensorflow as tf
-
-from edward.models import RandomVariable
-from edward.util import check_data, get_session, compute_multinomial_mode, \
-    with_binary_averaging
-
-try:
-  from edward.models import Bernoulli, Binomial, Categorical, \
-      Multinomial, OneHotCategorical
-except Exception as e:
-  raise ImportError("{0}. Your TensorFlow version is not supported.".format(e))
-
-
-def evaluate(metrics, data, n_samples=500, output_key=None, seed=None):
-  """Evaluate fitted model using a set of metrics.
-
-  A metric, or scoring rule [@winkler1994evaluating], is a function of
-  observed data under the posterior predictive distribution. For
-  example in supervised metrics such as classification accuracy, the
-  observed data (true output) is compared to the posterior
-  predictive's mean (predicted output). In unsupervised metrics such
-  as log-likelihood, the probability of observing the data is
-  calculated under the posterior predictive's log-density.
-
-  Args:
-    metrics: list of str and/or (str, params: dict) tuples, str,
-    or (str, params: dict) tuple.
-      List of metrics or a single metric:
-      `'binary_accuracy'`,
-      `'categorical_accuracy'`,
-      `'sparse_categorical_accuracy'`,
-      `'log_loss'` or `'binary_crossentropy'`,
-      `'categorical_crossentropy'`,
-      `'sparse_categorical_crossentropy'`,
-      `'hinge'`,
-      `'squared_hinge'`,
-      `'mse'` or `'MSE'` or `'mean_squared_error'`,
-      `'mae'` or `'MAE'` or `'mean_absolute_error'`,
-      `'mape'` or `'MAPE'` or `'mean_absolute_percentage_error'`,
-      `'msle'` or `'MSLE'` or `'mean_squared_logarithmic_error'`,
-      `'poisson'`,
-      `'cosine'` or `'cosine_proximity'`,
-      `'log_lik'` or `'log_likelihood'`.
-      In lieu of a metric string, this method also accepts (str, params: dict)
-      tuples; the first element of this tuple is the metric string, and
-      the second is a dict of associated params. At present, this dict only
-      expects one key, `'average'`, which stipulates the type of averaging to
-      perform on those metrics that permit binary averaging. Permissible
-      options include: `None`, `'macro'` and `'micro'`.
-    data: dict.
-      Data to evaluate model with. It binds observed variables (of type
-      `RandomVariable` or `tf.Tensor`) to their realizations (of
-      type `tf.Tensor`). It can also bind placeholders (of type
-      `tf.Tensor`) used in the model to their realizations.
-    n_samples: int.
-      Number of posterior samples for making predictions, using the
-      posterior predictive distribution.
-    output_key: RandomVariable or tf.Tensor.
-      It is the key in `data` which corresponds to the model's output.
-    seed: a Python integer. Used to create a random seed for the
-      distribution
-
-  Returns:
-    list of float or float.
-    A list of evaluations or a single evaluation.
-
-  Raises:
-    NotImplementedError.
-    If an input metric does not match an implemented metric in Edward.
-
-  #### Examples
-
-  ```python
-  # build posterior predictive after inference: it is
-  # parameterized by a posterior sample
-  x_post = ed.copy(x, {z: qz, beta: qbeta})
-
-  # log-likelihood performance
-  ed.evaluate('log_likelihood', data={x_post: x_train})
-
-  # classification accuracy
-  # here, `x_ph` is any features the model is defined with respect to,
-  # and `y_post` is the posterior predictive distribution
-  ed.evaluate('binary_accuracy', data={y_post: y_train, x_ph: x_train})
-
-  # mean squared error
-  ed.evaluate('mean_squared_error', data={y: y_data, x: x_data})
-  ```
-
-  # mean squared logarithmic error with `'micro'` averaging
-  ed.evaluate(('mean_squared_logarithmic_error', {'average': 'micro'}),
-              data={y: y_data, x: x_data})
-  """
-  sess = get_session()
-  if isinstance(metrics, str):
-    metrics = [metrics]
-  elif callable(metrics):
-    metrics = [metrics]
-  elif not isinstance(metrics, list):
-    raise TypeError("metrics must have type str or list, or be callable.")
-
-  check_data(data)
-  if not isinstance(n_samples, int):
-    raise TypeError("n_samples must have type int.")
-
-  if output_key is None:
-    # Default output_key to the only data key that isn't a placeholder.
-    keys = [key for key in six.iterkeys(data) if not
-            isinstance(key, tf.Tensor) or "Placeholder" not in key.op.type]
-    if len(keys) == 1:
-      output_key = keys[0]
-    else:
-      raise KeyError("User must specify output_key.")
-  elif not isinstance(output_key, RandomVariable):
-    raise TypeError("output_key must have type RandomVariable.")
-
-  # Create feed_dict for data placeholders that the model conditions
-  # on; it is necessary for all session runs.
-  feed_dict = {key: value for key, value in six.iteritems(data)
-               if isinstance(key, tf.Tensor) and "Placeholder" in key.op.type}
-
-  # Form true data.
-  y_true = data[output_key]
-  # Make predictions (if there are any supervised metrics).
-  if metrics != ['log_lik'] and metrics != ['log_likelihood']:
-    binary_discrete = (Bernoulli, Binomial)
-    categorical_discrete = (Categorical, Multinomial, OneHotCategorical)
-    total_count = sess.run(getattr(output_key, 'total_count', tf.constant(1.)))
-    if isinstance(output_key, binary_discrete + categorical_discrete):
-      # Average over realizations of their probabilities, then predict
-      # via argmax over probabilities.
-      probs = [sess.run(output_key.probs, feed_dict) for _ in range(n_samples)]
-      probs = np.sum(probs, axis=0) / n_samples
-      if isinstance(output_key, binary_discrete):
-        # make random prediction whenever probs is exactly 0.5
-        random = tf.random_uniform(shape=tf.shape(probs))
-        y_pred = tf.round(tf.where(tf.equal(0.5, probs), random, probs))
-      else:
-        if total_count > 1:
-          mode = compute_multinomial_mode(probs, total_count, seed)
-          if len(output_key.sample_shape):
-            y_pred = tf.reshape(tf.tile(mode, output_key.sample_shape),
-                                [-1, len(probs)])
-          else:
-            y_pred = mode
-        else:
-          y_pred = tf.argmax(probs, len(probs.shape) - 1)
-      probs = tf.constant(probs)
-    else:
-      # Monte Carlo estimate the mean of the posterior predictive.
-      y_pred = [sess.run(output_key, feed_dict) for _ in range(n_samples)]
-      y_pred = tf.cast(tf.add_n(y_pred), y_pred[0].dtype) / \
-          tf.cast(n_samples, y_pred[0].dtype)
-    if len(y_true.shape) == 0:
-      y_true = tf.expand_dims(y_true, 0)
-      y_pred = tf.expand_dims(y_pred, 0)
-
-  # Evaluate y_true (according to y_pred if supervised) for all metrics.
-  evaluations = []
-  for metric in metrics:
-    if isinstance(metric, tuple):
-      metric, params = metric
-    else:
-      params = {}
-    if metric == 'accuracy' or metric == 'crossentropy':
-      # automate binary or sparse cat depending on its support
-      support = sess.run(tf.reduce_max(y_true), feed_dict)
-      if support <= 1:
-        metric = 'binary_' + metric
-      else:
-        metric = 'sparse_categorical_' + metric
-
-    if metric == 'binary_accuracy':
-      evaluations += [binary_accuracy(y_true, y_pred, **params)]
-    elif metric == 'categorical_accuracy':
-      evaluations += [categorical_accuracy(y_true, y_pred, **params)]
-    elif metric == 'sparse_categorical_accuracy':
-      evaluations += [sparse_categorical_accuracy(y_true, y_pred, **params)]
-    elif metric == 'log_loss' or metric == 'binary_crossentropy':
-      evaluations += [binary_crossentropy(y_true, y_pred, **params)]
-    elif metric == 'categorical_crossentropy':
-      evaluations += [categorical_crossentropy(y_true, y_pred, **params)]
-    elif metric == 'sparse_categorical_crossentropy':
-      evaluations += [sparse_categorical_crossentropy(y_true, y_pred, **params)]
-    elif metric == 'multinomial_accuracy':
-      evaluations += [multinomial_accuracy(y_true, y_pred, **params)]
-    elif metric == 'kl_divergence':
-      y_true_ = y_true / total_count
-      y_pred_ = probs
-      evaluations += [kl_divergence(y_true_, y_pred_, **params)]
-    elif metric == 'hinge':
-      evaluations += [hinge(y_true, y_pred, **params)]
-    elif metric == 'squared_hinge':
-      evaluations += [squared_hinge(y_true, y_pred, **params)]
-    elif (metric == 'mse' or metric == 'MSE' or
-          metric == 'mean_squared_error'):
-      evaluations += [mean_squared_error(y_true, y_pred, **params)]
-    elif (metric == 'mae' or metric == 'MAE' or
-          metric == 'mean_absolute_error'):
-      evaluations += [mean_absolute_error(y_true, y_pred, **params)]
-    elif (metric == 'mape' or metric == 'MAPE' or
-          metric == 'mean_absolute_percentage_error'):
-      evaluations += [mean_absolute_percentage_error(y_true, y_pred, **params)]
-    elif (metric == 'msle' or metric == 'MSLE' or
-          metric == 'mean_squared_logarithmic_error'):
-      evaluations += [mean_squared_logarithmic_error(y_true, y_pred, **params)]
-    elif metric == 'poisson':
-      evaluations += [poisson(y_true, y_pred, **params)]
-    elif metric == 'cosine' or metric == 'cosine_proximity':
-      evaluations += [cosine_proximity(y_true, y_pred, **params)]
-    elif metric == 'log_lik' or metric == 'log_likelihood':
-      # Monte Carlo estimate the log-density of the posterior predictive.
-      tensor = tf.reduce_mean(output_key.log_prob(y_true))
-      log_pred = [sess.run(tensor, feed_dict) for _ in range(n_samples)]
-      log_pred = tf.add_n(log_pred) / tf.cast(n_samples, tensor.dtype)
-      evaluations += [log_pred]
-    elif callable(metric):
-      evaluations += [metric(y_true, y_pred, **params)]
-    else:
-      raise NotImplementedError("Metric is not implemented: {}".format(metric))
-
-  if len(evaluations) == 1:
-    return sess.run(evaluations[0], feed_dict)
-  else:
-    return sess.run(evaluations, feed_dict)
-
-
-# Classification metrics
-
-
-def binary_accuracy(y_true, y_pred):
-  """Binary prediction accuracy, also known as 0/1-loss.
-
-  Args:
-    y_true: tf.Tensor.
-      Tensor of 0s and 1s (most generally, any real values a and b).
-    y_pred: tf.Tensor.
-      Tensor of predictions, with same shape as `y_true`.
-  """
-  y_true = tf.cast(y_true, tf.float32)
-  y_pred = tf.cast(y_pred, tf.float32)
-  return tf.reduce_mean(tf.cast(tf.equal(y_true, y_pred), tf.float32))
-
-
-def categorical_accuracy(y_true, y_pred):
-  """Multi-class prediction accuracy. One-hot representation for `y_true`.
-
-  Args:
-    y_true: tf.Tensor.
-      Tensor of 0s and 1s, where the outermost dimension of size `K`
-      has only one 1 per row.
-    y_pred: tf.Tensor.
-      Tensor of predictions, with shape `y_true.shape[:-1]`. Each
-      entry is an integer {0, 1, ..., K-1}.
-  """
-  y_true = tf.cast(tf.argmax(y_true, len(y_true.shape) - 1), tf.float32)
-  y_pred = tf.cast(y_pred, tf.float32)
-  return tf.reduce_mean(tf.cast(tf.equal(y_true, y_pred), tf.float32))
-
-
-def sparse_categorical_accuracy(y_true, y_pred):
-  """Multi-class prediction accuracy. Label {0, 1, .., K-1}
-  representation for `y_true`.
-
-  Args:
-    y_true: tf.Tensor.
-      Tensor of integers {0, 1, ..., K-1}.
-    y_pred: tf.Tensor.
-      Tensor of predictions, with same shape as `y_true`.
-  """
-  y_true = tf.cast(y_true, tf.float32)
-  y_pred = tf.cast(y_pred, tf.float32)
-  return tf.reduce_mean(tf.cast(tf.equal(y_true, y_pred), tf.float32))
-
-
-# Classification metrics (with real-valued predictions)
-
-
-def binary_crossentropy(y_true, y_pred):
-  """Binary cross-entropy.
-
-  Args:
-    y_true: tf.Tensor.
-      Tensor of 0s and 1s.
-    y_pred: tf.Tensor.
-      Tensor of real values (logit probabilities), with same shape as
-      `y_true`.
-  """
-  y_true = tf.cast(y_true, tf.float32)
-  y_pred = tf.cast(y_pred, tf.float32)
-  return tf.reduce_mean(
-      tf.nn.sigmoid_cross_entropy_with_logits(logits=y_pred, labels=y_true))
-
-
-def categorical_crossentropy(y_true, y_pred):
-  """Multi-class cross entropy. One-hot representation for `y_true`.
-
-  Args:
-    y_true: tf.Tensor.
-      Tensor of 0s and 1s, where the outermost dimension of size K
-      has only one 1 per row.
-    y_pred: tf.Tensor.
-      Tensor of real values (logit probabilities), with same shape as
-      `y_true`. The outermost dimension is the number of classes.
-  """
-  y_true = tf.cast(y_true, tf.float32)
-  y_pred = tf.cast(y_pred, tf.float32)
-  return tf.reduce_mean(
-      tf.nn.softmax_cross_entropy_with_logits(logits=y_pred, labels=y_true))
-
-
-def sparse_categorical_crossentropy(y_true, y_pred):
-  """Multi-class cross entropy. Label {0, 1, .., K-1} representation
-  for `y_true.`
-
-  Args:
-    y_true: tf.Tensor.
-      Tensor of integers {0, 1, ..., K-1}.
-    y_pred: tf.Tensor.
-      Tensor of real values (logit probabilities), with shape
-      `(y_true.shape, K)`. The outermost dimension is the number of classes.
-  """
-  y_true = tf.cast(y_true, tf.int64)
-  y_pred = tf.cast(y_pred, tf.float32)
-  return tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(
-      logits=y_pred, labels=y_true))
-
-
-def multinomial_accuracy(y_true, y_pred):
-  """Multinomial prediction accuracy. `y_true` is a tensor
-  of integers, where the outermost dimension gives a draw
-  from a Multinomial distribution.
-
-  NB: In evaluating the accuracy between two Multinomials
-  results may vary across evaluations. This is because Edward's
-  algorithm for computing `y_pred`, i.e. the Multinomial
-  mode, yields variable results if `any(isinstance(p, float)
-  for p in total_count * probs)` (where `probs` is a vector
-  of the predicted Multinomial probabilities).
-  """
-  y_true = tf.cast(y_true, tf.float32)
-  y_pred = tf.cast(y_pred, tf.float32)
-  return tf.reduce_mean(tf.cast(tf.equal(y_true, y_pred), tf.float32))
-
-
-def kl_divergence(y_true, y_pred):
-  """Kullback-Leibler divergence between two probability distributions. A
-  vector of probabilities for `y_true`.
-
-  Args:
-    y_true: tf.Tensor.
-      Tensor of real values (probabilities) where the values in each row
-      of the outermost dimension sum to 1.
-    y_pred: tf.Tensor.
-      Same as `y_true`, and with the same shape.
-  """
-  y_true = tf.cast(y_true, tf.float32)
-  y_pred = tf.cast(y_pred, tf.float32)
-  zeros = tf.zeros(shape=(tf.shape(y_true)))
-  summand = tf.where(tf.equal(y_true, 0.0), zeros,
-                     y_true * (tf.log(y_true) - tf.log(y_pred)))
-  return tf.reduce_sum(summand)
-
-
-def hinge(y_true, y_pred):
-  """Hinge loss.
-
-  Args:
-    y_true: tf.Tensor.
-      Tensor of 0s and 1s.
-    y_pred: tf.Tensor.
-      Tensor of real values, with same shape as `y_true`.
-  """
-  y_true = tf.cast(y_true, tf.float32)
-  y_pred = tf.cast(y_pred, tf.float32)
-  return tf.reduce_mean(tf.maximum(1.0 - y_true * y_pred, 0.0))
-
-
-def squared_hinge(y_true, y_pred):
-  """Squared hinge loss.
-
-  Args:
-    y_true: tf.Tensor.
-      Tensor of 0s and 1s.
-    y_pred: tf.Tensor.
-      Tensor of real values, with same shape as `y_true`.
-  """
-  y_true = tf.cast(y_true, tf.float32)
-  y_pred = tf.cast(y_pred, tf.float32)
-  return tf.reduce_mean(tf.square(tf.maximum(1.0 - y_true * y_pred, 0.0)))
-
-
-# Regression metrics
-
-
-@with_binary_averaging
-def mean_squared_error(y_true, y_pred):
-  """Mean squared error loss.
-
-  Args:
-    y_true: tf.Tensor.
-    y_pred: tf.Tensor.
-      Tensors of same shape and type.
-  """
-  return tf.reduce_mean(tf.square(y_pred - y_true), axis=-2)
-
-
-@with_binary_averaging
-def mean_absolute_error(y_true, y_pred):
-  """Mean absolute error loss.
-
-  Args:
-    y_true: tf.Tensor.
-    y_pred: tf.Tensor.
-      Tensors of same shape and type.
-  """
-  return tf.reduce_mean(tf.abs(y_pred - y_true), axis=-2)
-
-
-@with_binary_averaging
-def mean_absolute_percentage_error(y_true, y_pred):
-  """Mean absolute percentage error loss.
-
-  Args:
-    y_true: tf.Tensor.
-    y_pred: tf.Tensor.
-      Tensors of same shape and type.
-  """
-  diff = tf.abs((y_true - y_pred) / tf.clip_by_value(tf.abs(y_true),
-                                                     1e-8, np.inf))
-  return 100.0 * tf.reduce_mean(diff, axis=-2)
-
-
-@with_binary_averaging
-def mean_squared_logarithmic_error(y_true, y_pred):
-  """Mean squared logarithmic error loss.
-
-  Args:
-    y_true: tf.Tensor.
-    y_pred: tf.Tensor.
-      Tensors of same shape and type.
-  """
-  first_log = tf.log(tf.clip_by_value(y_pred, 1e-8, np.inf) + 1.0)
-  second_log = tf.log(tf.clip_by_value(y_true, 1e-8, np.inf) + 1.0)
-  return tf.reduce_mean(tf.square(first_log - second_log), axis=-2)
-
-
-def poisson(y_true, y_pred):
-  """Negative Poisson log-likelihood of data `y_true` given predictions
-  `y_pred` (up to proportion).
-
-  Args:
-    y_true: tf.Tensor.
-    y_pred: tf.Tensor.
-      Tensors of same shape and type.
-  """
-  return tf.reduce_sum(y_pred - y_true * tf.log(y_pred + 1e-8))
-
-
-def cosine_proximity(y_true, y_pred):
-  """Cosine similarity of two vectors.
-
-  Args:
-    y_true: tf.Tensor.
-    y_pred: tf.Tensor.
-      Tensors of same shape and type.
-  """
-  y_true = tf.nn.l2_normalize(y_true, len(y_true.shape) - 1)
-  y_pred = tf.nn.l2_normalize(y_pred, len(y_pred.shape) - 1)
-  return tf.reduce_sum(y_true * y_pred)
diff --git a/edward/criticisms/ppc.py b/edward/criticisms/ppc.py
deleted file mode 100644
index 5dddf6ea4..000000000
--- a/edward/criticisms/ppc.py
+++ /dev/null
@@ -1,121 +0,0 @@
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import numpy as np
-import six
-import tensorflow as tf
-
-from edward.models import RandomVariable
-from edward.util import check_data, check_latent_vars, get_session
-
-
-def ppc(T, data, latent_vars=None, n_samples=100):
-  """Posterior predictive check
-  [@rubin1984bayesianly; @meng1994posterior; @gelman1996posterior].
-
-  PPC's form an empirical distribution for the predictive discrepancy,
-
-  $p(T\mid x) = \int p(T(x^{\\text{rep}})\mid z) p(z\mid x) dz$
-
-  by drawing replicated data sets $x^{\\text{rep}}$ and
-  calculating $T(x^{\\text{rep}})$ for each data set. Then it
-  compares it to $T(x)$.
-
-  If `data` is inputted with the prior predictive distribution, then
-  it is a prior predictive check [@box1980sampling].
-
-  Args:
-    T: function.
-      Discrepancy function, which takes a dictionary of data and
-      dictionary of latent variables as input and outputs a `tf.Tensor`.
-    data: dict.
-      Data to compare to. It binds observed variables (of type
-      `RandomVariable` or `tf.Tensor`) to their realizations (of
-      type `tf.Tensor`). It can also bind placeholders (of type
-      `tf.Tensor`) used in the model to their realizations.
-    latent_vars: dict.
-      Collection of random variables (of type `RandomVariable` or
-      `tf.Tensor`) binded to their inferred posterior. This argument
-      is used when the discrepancy is a function of latent variables.
-    n_samples: int.
-      Number of replicated data sets.
-
-  Returns:
-    list of np.ndarray.
-    List containing the reference distribution, which is a NumPy array
-    with `n_samples` elements,
-
-    $(T(x^{{\\text{rep}},1}, z^{1}), ...,
-       T(x^{\\text{rep,nsamples}}, z^{\\text{nsamples}}))$
-
-    and the realized discrepancy, which is a NumPy array with
-    `n_samples` elements,
-
-    $(T(x, z^{1}), ..., T(x, z^{\\text{nsamples}})).$
-
-
-  #### Examples
-
-  ```python
-  # build posterior predictive after inference:
-  # it is parameterized by a posterior sample
-  x_post = ed.copy(x, {z: qz, beta: qbeta})
-
-  # posterior predictive check
-  # T is a user-defined function of data, T(data)
-  T = lambda xs, zs: tf.reduce_mean(xs[x_post])
-  ed.ppc(T, data={x_post: x_train})
-
-  # in general T is a discrepancy function of the data (both response and
-  # covariates) and latent variables, T(data, latent_vars)
-  T = lambda xs, zs: tf.reduce_mean(zs[z])
-  ed.ppc(T, data={y_post: y_train, x_ph: x_train},
-         latent_vars={z: qz, beta: qbeta})
-
-  # prior predictive check
-  # run ppc on original x
-  ed.ppc(T, data={x: x_train})
-  ```
-  """
-  sess = get_session()
-  if not callable(T):
-    raise TypeError("T must be a callable function.")
-
-  check_data(data)
-  if latent_vars is None:
-    latent_vars = {}
-
-  check_latent_vars(latent_vars)
-  if not isinstance(n_samples, int):
-    raise TypeError("n_samples must have type int.")
-
-  # Build replicated latent variables.
-  zrep = {key: tf.convert_to_tensor(value)
-          for key, value in six.iteritems(latent_vars)}
-
-  # Build replicated data.
-  xrep = {x: (x.value() if isinstance(x, RandomVariable) else obs)
-          for x, obs in six.iteritems(data)}
-
-  # Create feed_dict for data placeholders that the model conditions
-  # on; it is necessary for all session runs.
-  feed_dict = {key: value for key, value in six.iteritems(data)
-               if isinstance(key, tf.Tensor) and "Placeholder" in key.op.type}
-
-  # Calculate discrepancy over many replicated data sets and latent
-  # variables.
-  Trep = T(xrep, zrep)
-  Tobs = T(data, zrep)
-  Treps = []
-  Ts = []
-  for _ in range(n_samples):
-    # Take a forward pass (session run) to get new samples for
-    # each calculation of the discrepancy.
-    # Alternatively, we could unroll the graph by registering this
-    # operation `n_samples` times, each for different parent nodes
-    # representing `xrep` and `zrep`. But it's expensive.
-    Treps += [sess.run(Trep, feed_dict)]
-    Ts += [sess.run(Tobs, feed_dict)]
-
-  return [np.stack(Treps), np.stack(Ts)]
diff --git a/edward/criticisms/ppc_plots.py b/edward/criticisms/ppc_plots.py
deleted file mode 100644
index 2b2677105..000000000
--- a/edward/criticisms/ppc_plots.py
+++ /dev/null
@@ -1,109 +0,0 @@
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-
-def ppc_density_plot(y, y_rep):
-  """Create 1D kernel density plot comparing data to samples from posterior.
-
-  Args:
-    y: np.ndarray.
-      A 1-D NumPy array.
-    y_rep: np.ndarray.
-      A 2-D NumPy array where rows represent different samples from posterior.
-
-  Returns:
-    matplotlib axes
-
-  #### Examples
-
-  ```python
-  import matplotlib.pyplot as plt
-
-  y = np.random.randn(20)
-  y_rep = np.random.randn(20, 20)
-
-  ed.ppc_density_plot(y, y_rep)
-  plt.show()
-  ```
-  """
-  import matplotlib.pyplot as plt
-  import seaborn as sns
-  ax = sns.kdeplot(y, color="maroon")
-
-  n = y_rep.shape[0]
-
-  for i in range(n):
-    ax = sns.kdeplot(y_rep[i, :], color="maroon", alpha=0.2, linewidth=0.8)
-
-  y_line = plt.Line2D([], [], color='maroon', label='y')
-  y_rep_line = plt.Line2D([], [], color='maroon', alpha=0.2, label='y_rep')
-
-  handles = [y_line, y_rep_line]
-  labels = ['y', r'$y_{rep}$']
-
-  ax.legend(handles, labels)
-
-  return ax
-
-
-def ppc_stat_hist_plot(y_stats, yrep_stats, stat_name=None, **kwargs):
-  """Create histogram plot comparing data to samples from posterior.
-
-  Args:
-    y_stats: float.
-      Float representing statistic value of observed data.
-    yrep_stats: np.ndarray.
-      A 1-D NumPy array.
-    stat_name: string.
-      Optional string value for including statistic name in legend.
-    **kwargs:
-      Keyword arguments used by seaborn.distplot can be given to customize plot.
-
-  Returns:
-    matplotlib axes.
-
-  #### Examples
-
-  ```python
-  import matplotlib.pyplot as plt
-
-  # DATA
-  x_data = np.array([0, 1, 0, 0, 0, 0, 0, 0, 0, 1])
-
-  # MODEL
-  p = Beta(1.0, 1.0)
-  x = Bernoulli(probs=p, sample_shape=10)
-
-  # INFERENCE
-  qp = Beta(tf.nn.softplus(tf.Variable(tf.random_normal([]))),
-            tf.nn.softplus(tf.Variable(tf.random_normal([]))))
-
-  inference = ed.KLqp({p: qp}, data={x: x_data})
-  inference.run(n_iter=500)
-
-  # CRITICISM
-  x_post = ed.copy(x, {p: qp})
-  y_rep, y = ed.ppc(
-      lambda xs, zs: tf.reduce_mean(tf.cast(xs[x_post], tf.float32)),
-      data={x_post: x_data})
-
-  ed.ppc_stat_hist_plot(
-      y[0], y_rep, stat_name=r'$T \equiv$mean', bins=10)
-  plt.show()
-  ```
-  """
-  import matplotlib.pyplot as plt
-  import seaborn as sns
-  ax = sns.distplot(yrep_stats, kde=False, label=r'$T(y_{rep})$', **kwargs)
-
-  max_value = ax.get_ylim()[1]
-
-  plt.vlines(y_stats, ymin=0.0, ymax=max_value, label='T(y)')
-
-  if stat_name is not None:
-    plt.legend(title=stat_name)
-  else:
-    plt.legend()
-
-  return ax
diff --git a/edward/inferences/__init__.py b/edward/inferences/__init__.py
index 38262fcb7..0fe41b1f7 100644
--- a/edward/inferences/__init__.py
+++ b/edward/inferences/__init__.py
@@ -1,4 +1,34 @@
 """
+There are two approaches to inference.
+
+1. Idiomatic TensorFlow
+  1. Build train_op (*).
+  2. Build summary file writer.
+  3. Build and run TensorFlow variable initializer ops.
+  4. Within a training loop:
+    + sess.run with infeeding and summary writers.
+    + Log progress by writing to files and/or printing.
+    + Check convergence (*).
+  5. Build and run post-training ops (*).
+2. Idiomatic TensorFlow Estimator
+  + Build `model_fn` by writing a probabilistic program and calling an
+  inference algorithm to produce train ops. Use the Estimator API
+  workflow of `train`, `evaluate`, and `predict` alongside an
+  `input_fn` data pipeline.
+
+Inference provides utilities for both approaches. In the first
+approach, it provides (*), namely: (1) inference algorithms to help
+produce the train_op (and low-level functions to build your own
+algorithms; sometimes post-training ops); and (2) convergence
+diagnostics. In the second approach, these functions build up a
+`model_fn` to form a TensorFlow Estimator.
+
+Inference uses (unbound) pure functions with TensorFlow idiomatic
+exceptions (e.g., mutable state via TensorFlow variables; side effect
+of adding to global collections and TF graph). It forgoes OO.
+
+Specific inference files provide functions to help produce the train
+(and post-training) ops.
 """
 from __future__ import absolute_import
 from __future__ import division
@@ -7,50 +37,38 @@
 from edward.inferences.bigan_inference import *
 from edward.inferences.conjugacy import *
 from edward.inferences.gan_inference import *
-from edward.inferences.gibbs import *
 from edward.inferences.hmc import *
-from edward.inferences.implicit_klqp import *
-from edward.inferences.inference import *
 from edward.inferences.klpq import *
 from edward.inferences.klqp import *
+from edward.inferences.klqp_implicit import *
 from edward.inferences.laplace import *
 from edward.inferences.map import *
 from edward.inferences.metropolis_hastings import *
-from edward.inferences.monte_carlo import *
 from edward.inferences.sgld import *
 from edward.inferences.sghmc import *
-from edward.inferences.variational_inference import *
 from edward.inferences.wake_sleep import *
 from edward.inferences.wgan_inference import *
 
 from tensorflow.python.util.all_util import remove_undocumented
 
 _allowed_symbols = [
-    'BiGANInference',
+    'bigan_inference',
     'complete_conditional',
-    'GANInference',
-    'Gibbs',
-    'HMC',
-    'ImplicitKLqp',
-    'Inference',
-    'KLpq',
-    'KLqp',
-    'ReparameterizationKLqp',
-    'ReparameterizationKLKLqp',
-    'ReparameterizationEntropyKLqp',
-    'ScoreKLqp',
-    'ScoreKLKLqp',
-    'ScoreEntropyKLqp',
-    'ScoreRBKLqp',
-    'Laplace',
-    'MAP',
-    'MetropolisHastings',
-    'MonteCarlo',
-    'SGLD',
-    'SGHMC',
-    'VariationalInference',
-    'WakeSleep',
-    'WGANInference',
+    'gan_inference',
+    'hmc',
+    'klpq',
+    'klqp',
+    'klqp_implicit',
+    'klqp_reparameterization',
+    'klqp_reparameterization_kl',
+    'klqp_score',
+    'laplace',
+    'map',
+    'metropolis_hastings',
+    'sghmc',
+    'sgld',
+    'wake_sleep',
+    'wgan_inference',
 ]
 
 remove_undocumented(__name__, allowed_exception_list=_allowed_symbols)
diff --git a/edward/inferences/bigan_inference.py b/edward/inferences/bigan_inference.py
index 00b2396ba..1c6e6187c 100644
--- a/edward/inferences/bigan_inference.py
+++ b/edward/inferences/bigan_inference.py
@@ -5,88 +5,116 @@
 import six
 import tensorflow as tf
 
-from edward.inferences.gan_inference import GANInference
-from edward.util import get_session
-
-
-class BiGANInference(GANInference):
+from edward.inferences import docstrings as doc
+from edward.inferences.util import (
+    call_with_trace, make_optional_inputs, toposort)
+
+
+@doc.set_doc(
+    args=(doc.arg_model +
+          doc.arg_variational +
+          doc.arg_discriminator +
+          doc.arg_align_latent +
+          doc.arg_align_data +
+          doc.arg_collections +
+          doc.arg_args_kwargs)[:-1],
+    returns=doc.return_loss_loss_d,
+    notes_discriminator_scope=doc.notes_discriminator_scope,
+    notes_regularization_losses=doc.notes_regularization_losses)
+def bigan_inference(model, variational, discriminator, align_latent,
+                    align_data, collections=None, *args, **kwargs):
   """Adversarially Learned Inference [@dumuolin2017adversarially] or
   Bidirectional Generative Adversarial Networks [@donahue2017adversarial]
   for joint learning of generator and inference networks.
 
+  The function matches a mapping from data to latent variables and a
+  mapping from latent variables to data through a joint discriminator.
+
   Works for the class of implicit (and differentiable) probabilistic
   models. These models do not require a tractable density and assume
   only a program that generates samples.
 
-  #### Notes
+  Args:
+  @{args}
 
-  `BiGANInference` matches a mapping from data to latent variables and a
-  mapping from latent variables to data through a joint
-  discriminator.
+  `align_latent` must only align one random variable in `model` and
+  `variational`. `model` must return the generated data. `variational`
+  assumes a random variable output and not an implicit density (or at
+  least recorded on trace).
+
+  Returns:
+  @{returns}
+
+  #### Notes
 
-  In building the computation graph for inference, the
-  discriminator's parameters can be accessed with the variable scope
-  "Disc".
-  In building the computation graph for inference, the
-  encoder and decoder parameters can be accessed with the variable scope
-  "Gen".
+  @{notes_discriminator_scope}
 
-  The objective function also adds to itself a summation over all tensors
-  in the `REGULARIZATION_LOSSES` collection.
+  @{notes_regularization_losses}
 
   #### Examples
 
   ```python
-  with tf.variable_scope("Gen"):
-    xf = gen_data(z_ph)
-    zf = gen_latent(x_ph)
-  inference = ed.BiGANInference({z_ph: zf}, {xf: x_ph}, discriminator)
+  def model():
+    z = Normal(loc=0.0, scale=1.0, sample_shape=[256, 25], name="z")
+    x = generative_network(z, name="x")
+    return x
+
+  def variational(x_data):
+    net = tf.layers.dense(x_data, 25 * 2)
+    qz = Normal(loc=net[:, :25],
+                scale=tf.nn.softplus(net[:, 25:]),
+                sample_shape=[256,],
+                name="qz")
+    return qz
+
+  def discriminator(x, z):
+    net = tf.layers.dense(tf.concat([x, z], 1), 256, activation=tf.nn.relu)
+    return tf.layers.dense(net, 1)
+
+  loss, loss_d = ed.bigan_inference(
+      model, variational, discriminator,
+      align_latent=lambda name: "qz" if name == "z" else None,
+      align_data=lambda name: "x_data" if name == "x" else None,
+      x_data=x_data)
   ```
   """
-  def __init__(self, latent_vars, data, discriminator):
-    if not callable(discriminator):
-      raise TypeError("discriminator must be a callable function.")
-
-    self.discriminator = discriminator
-    # call grandparent's method; avoid parent (GANInference)
-    super(GANInference, self).__init__(latent_vars, data)
-
-  def build_loss_and_gradients(self, var_list):
-    x_true = list(six.itervalues(self.data))[0]
-    x_fake = list(six.iterkeys(self.data))[0]
-
-    z_true = list(six.iterkeys(self.latent_vars))[0]
-    z_fake = list(six.itervalues(self.latent_vars))[0]
-
-    with tf.variable_scope("Disc"):
-        # xtzf := x_true, z_fake
-        d_xtzf = self.discriminator(x_true, z_fake)
-    with tf.variable_scope("Disc", reuse=True):
-        # xfzt := x_fake, z_true
-        d_xfzt = self.discriminator(x_fake, z_true)
-
-    loss_d = tf.nn.sigmoid_cross_entropy_with_logits(
-        labels=tf.ones_like(d_xfzt), logits=d_xfzt) + \
-        tf.nn.sigmoid_cross_entropy_with_logits(
-            labels=tf.zeros_like(d_xtzf), logits=d_xtzf)
-    loss = tf.nn.sigmoid_cross_entropy_with_logits(
-        labels=tf.zeros_like(d_xfzt), logits=d_xfzt) + \
-        tf.nn.sigmoid_cross_entropy_with_logits(
-            labels=tf.ones_like(d_xtzf), logits=d_xtzf)
-
-    reg_terms_d = tf.losses.get_regularization_losses(scope="Disc")
-    reg_terms = tf.losses.get_regularization_losses(scope="Gen")
-
-    loss_d = tf.reduce_mean(loss_d) + tf.reduce_sum(reg_terms_d)
-    loss = tf.reduce_mean(loss) + tf.reduce_sum(reg_terms)
-
-    var_list_d = tf.get_collection(
-        tf.GraphKeys.TRAINABLE_VARIABLES, scope="Disc")
-    var_list = tf.get_collection(
-        tf.GraphKeys.TRAINABLE_VARIABLES, scope="Gen")
-
-    grads_d = tf.gradients(loss_d, var_list_d)
-    grads = tf.gradients(loss, var_list)
-    grads_and_vars_d = list(zip(grads_d, var_list_d))
-    grads_and_vars = list(zip(grads, var_list))
-    return loss, grads_and_vars, loss_d, grads_and_vars_d
+  q_trace = call_with_trace(variational, *args, **kwargs)
+  model = make_optional_inputs(model)
+  x_fake = model(*args, **kwargs)
+
+  key = align_data(x_fake.name.split(':')[0])
+  x_true = args[key] if isinstance(key, int) else kwargs.get(key)
+  if x_true is None:
+    raise ValueError("align_data did not match any entry of args/kwargs.")
+
+  for rv in toposort(x_fake):
+    aligned = align_latent(rv.name)
+    if aligned is not None:
+      z_true, z_fake = rv, q_trace[aligned]
+      break
+  else:
+    raise ValueError("align_latent did not match any variable in model.")
+
+  with tf.variable_scope("Disc"):
+    # xtzf := x_true, z_fake
+    d_xtzf = discriminator(x_true, z_fake)
+  with tf.variable_scope("Disc", reuse=True):
+    # xfzt := x_fake, z_true
+    d_xfzt = discriminator(x_fake, z_true)
+
+  loss_d = tf.nn.sigmoid_cross_entropy_with_logits(
+      labels=tf.ones_like(d_xfzt), logits=d_xfzt) + \
+      tf.nn.sigmoid_cross_entropy_with_logits(
+          labels=tf.zeros_like(d_xtzf), logits=d_xtzf)
+  loss = tf.nn.sigmoid_cross_entropy_with_logits(
+      labels=tf.zeros_like(d_xfzt), logits=d_xfzt) + \
+      tf.nn.sigmoid_cross_entropy_with_logits(
+          labels=tf.ones_like(d_xtzf), logits=d_xtzf)
+
+  reg_terms_d = tf.losses.get_regularization_losses(scope="Disc")
+  reg_terms_all = tf.losses.get_regularization_losses()
+  reg_terms = [r for r in reg_terms_all if r not in reg_terms_d]
+
+  loss_d = tf.reduce_mean(loss_d) + tf.reduce_sum(reg_terms_d)
+  loss = tf.reduce_mean(loss) + tf.reduce_sum(reg_terms)
+  return loss, loss_d
diff --git a/edward/inferences/conjugacy/conjugacy.py b/edward/inferences/conjugacy/conjugacy.py
index ce0941398..1f805c53d 100644
--- a/edward/inferences/conjugacy/conjugacy.py
+++ b/edward/inferences/conjugacy/conjugacy.py
@@ -11,8 +11,8 @@
 from collections import defaultdict
 from edward.inferences.conjugacy.simplify \
     import symbolic_suff_stat, full_simplify, expr_contains, reconstruct_expr
-from edward.models.random_variables import *
-from edward.util import copy, get_blanket
+from edward.models import get_blanket
+from edward.models.core import *
 
 
 def mvn_diag_from_natural_params(p1, p2):
@@ -103,15 +103,15 @@ def complete_conditional(rv, cond_set=None):
     log_joint = get_log_joint(cond_set)
 
     # Pull out the nodes that are nonlinear functions of rv into s_stats.
-    stop_nodes = set([i.value() for i in cond_set])
+    stop_nodes = set([i.value for i in cond_set])
     subgraph = extract_subgraph(log_joint, stop_nodes)
-    s_stats = suff_stat_nodes(subgraph, rv.value(), cond_set)
+    s_stats = suff_stat_nodes(subgraph, rv.value, cond_set)
     s_stats = list(set(s_stats))
 
     # Simplify those nodes, and put any new linear terms into multipliers_i.
     s_stat_exprs = defaultdict(list)
     for s_stat in s_stats:
-      expr = symbolic_suff_stat(s_stat, rv.value(), stop_nodes)
+      expr = symbolic_suff_stat(s_stat, rv.value, stop_nodes)
       expr = full_simplify(expr)
       multipliers_i, s_stats_i = extract_s_stat_multipliers(expr)
       s_stat_exprs[s_stats_i].append(
@@ -139,7 +139,7 @@ def complete_conditional(rv, cond_set=None):
     for s_stat_expr in six.itervalues(s_stat_exprs):
       s_stat_placeholder = tf.placeholder(tf.float32,
                                           s_stat_expr[0][0].get_shape())
-      swap_back[s_stat_placeholder] = tf.cast(rv.value(), tf.float32)
+      swap_back[s_stat_placeholder] = tf.cast(rv.value, tf.float32)
       s_stat_placeholders.append(s_stat_placeholder)
       for s_stat_node, multiplier in s_stat_expr:
         fake_node = s_stat_placeholder * multiplier
@@ -148,7 +148,7 @@ def complete_conditional(rv, cond_set=None):
 
     for i in cond_set:
       if i != rv:
-        val = i.value()
+        val = i.value
         val_placeholder = tf.placeholder(val.dtype)
         swap_dict[val] = val_placeholder
         swap_back[val_placeholder] = val
diff --git a/edward/inferences/conjugacy/conjugate_log_probs.py b/edward/inferences/conjugacy/conjugate_log_probs.py
index a2e25f0ee..a8667a7e5 100644
--- a/edward/inferences/conjugacy/conjugate_log_probs.py
+++ b/edward/inferences/conjugacy/conjugate_log_probs.py
@@ -5,7 +5,7 @@
 import numpy as np
 import tensorflow as tf
 
-from edward.models.random_variables import *
+from edward.models.core import *
 
 
 def _val_wrapper(f):
diff --git a/edward/inferences/docstrings.py b/edward/inferences/docstrings.py
new file mode 100644
index 000000000..c9fd1dfb6
--- /dev/null
+++ b/edward/inferences/docstrings.py
@@ -0,0 +1,265 @@
+"""Programmable docstrings.
+
+The args below represent a global vocabulary of arguments shared
+across at least two inference algorithms. They are sorted
+alphabetically. They are also written with newlines at the end such
+that they can be easily added together. After composing args
+docstrings, remove the last newline.
+"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import re
+import six
+import sys
+
+
+def set_doc(**kwargs):
+  """Decorator that fills `@{key}` placeholders in a docstring from kwargs."""
+  def _update(cls_or_fn):
+    doc = trim(cls_or_fn.__doc__)
+    for k, v in six.iteritems(kwargs):
+      # Replace each literal @{k} reference with v. The key is escaped so
+      # regex metacharacters match literally; the replacement is wrapped
+      # in a function so no backslash escapes are processed in v.
+      pattern = r'@\{' + re.escape(str(k)) + r'\}'
+      doc = re.sub(pattern, lambda match: v, doc)
+    cls_or_fn.__doc__ = doc
+    return cls_or_fn
+  return _update
+
+
+def trim(docstring):
+  """Trims docstring indentation. Adapted from the PEP 257 reference code."""
+  if not docstring:
+    return ''
+  # Convert tabs to spaces (following the normal Python rules)
+  # and split into a list of lines:
+  lines = docstring.expandtabs().splitlines()
+  # Determine minimum indentation (first line doesn't count):
+  indent = sys.maxsize  # sys.maxint does not exist on Python 3
+  for line in lines[1:]:
+    stripped = line.lstrip()
+    if stripped:
+      indent = min(indent, len(line) - len(stripped))
+  # Remove indentation (first line is special):
+  trimmed = [lines[0].strip()]
+  if indent < sys.maxsize:
+    for line in lines[1:]:
+      trimmed.append(line[indent:].rstrip())
+  # Strip off trailing and leading blank lines:
+  while trimmed and not trimmed[-1]:
+    trimmed.pop()
+  while trimmed and not trimmed[0]:
+    trimmed.pop(0)
+  # Return a single string:
+  return '\n'.join(trimmed)
+
+
+arg_align_data = """
+  align_data: function of string, aligning `model` observed
+    variables with data. It takes a model variable's name as input
+    and returns an integer, indexing `args`, or key, indexing
+    `kwargs`. Other inputs must return None.
+"""[1:]
+arg_align_latent = """
+  align_latent: function of string, aligning `model` latent
+    variables with `variational`. It takes a model variable's name
+    as input and returns a string, indexing `variational`'s trace.
+    Other inputs must return None.
+"""[1:]
+arg_align_latent_monte_carlo = """
+  align_latent: function of string, aligning `model` latent
+    variables with posterior trace. It takes a model variable's name
+    as input and returns a string. The return output determines the
+    name of the returned dictionary of states' keys. If None,
+    will not perform inference over them.
+"""[1:]
+arg_args_kwargs = """
+  args, kwargs: data inputs. `kwargs`' keys are directly the argument
+    keys in `model` (and if present, `variational`). Data inputs are
+    passed at compile-time in TF's Graph mode or runtime in TF's Eager
+    mode.
+"""[1:]
+arg_auto_transform = """
+  auto_transform:
+"""[1:]
+arg_collections = """
+  collections:
+"""[1:]
+arg_discriminator = """
+  discriminator: function.
+    Function (with parameters) to discriminate samples. It should
+    output logit probabilities (real-valued) and not probabilities
+    in $[0, 1]$.
+"""[1:]
+arg_current_grads_target_log_prob = """
+  current_grads_target_log_prob:
+"""[1:]
+arg_kl_scaling = """
+  kl_scaling: function of string, aligning `model` latent
+    variables with KL scale factors. This provides option to scale
+    terms when using ELBO with KL divergence. If the KL divergence
+    terms are
+
+    $\\alpha_p \\mathbb{E}_{q(z\\mid x, \\lambda)} [
+          \\log q(z\\mid x, \\lambda) - \\log p(z)],$
+
+    then pass {$p(z)$: $\\alpha_p$} as `kl_scaling`,
+    where $\\alpha_p$ is a tensor. Its shape must be broadcastable;
+    it is multiplied element-wise to the batchwise KL terms.
+"""[1:]
+arg_model = """
+  model: function whose inputs are a subset of `args` (e.g., for
+    discriminative). Output is not used.
+    TODO auto_transform docstring
+    Collection of random variables to perform inference on.
+    If list, each random variable will be implicitly optimized using
+    a `Normal` random variable that is defined internally with a
+    free parameter per location and scale and is initialized using
+    standard normal draws. The random variables to approximate must
+    be continuous.
+    TODO note above only applicable to variational(?) inferences
+"""[1:]
+arg_n_samples = """
+  n_samples: int.
+    Number of samples from variational model for calculating
+    stochastic gradients.
+"""[1:]
+arg_scale = """
+  scale: function of string, aligning `model` observed
+    variables with scale factors. It takes a model variable's name
+    as input and returns a scale factor; else 1.0. The scale
+    factor's shape must be broadcastable; it is multiplied
+    element-wise to the random variable. For example, this is useful
+    for mini-batch scaling when inferring global variables, or
+    applying masks on a random variable.
+"""[1:]
+arg_current_state = """
+  current_state: Tensor or list of Tensors. Each element is a
+    posterior variable whose name is its current state. If the model
+    encounters a latent variable not aligned with a key in `states`,
+    its state is a draw from the distribution. Default is None
+    (equivalent to empty dict).
+"""[1:]
+arg_step_size = """
+  step_size: float.
+    Step size of numerical integrator. The implementation may be
+    extended in the future to enable a step size per random variable
+    (`step_size` would be a callable).
+"""[1:]
+arg_current_target_log_prob = """
+  current_target_log_prob:
+"""[1:]
+arg_variational = """
+  variational: function whose inputs are a subset of `args` (e.g.,
+    for amortized). Output is not used.
+"""[1:]
+notes_conditional_inference = """
+In conditional inference, we infer $z$ in $p(z, \\beta
+\\mid x)$ while fixing inference over $\\beta$ using another
+distribution $q(\\beta)$. During calculations, this function uses an
+estimate of the marginal density,
+
+$\\log p(x, z) = \\log \\mathbb{E}_{q(\\beta)} [ p(x, z, \\beta) ]
+              \\approx \\log p(x, z, \\beta^*)$
+
+leveraging a single Monte Carlo sample, where $\\beta^* \\sim
+q(\\beta)$. This is unbiased (and therefore asymptotically exact as a
+pseudo-marginal method) if $q(\\beta) = p(\\beta \\mid x)$.
+"""[1:-1]
+notes_conditional_inference_samples = """
+In conditional inference, we infer $z$ in $p(z, \\beta
+\\mid x)$ while fixing inference over $\\beta$ using another
+distribution $q(\\beta)$. During gradient calculation, instead
+of using the model's density
+
+$\\log p(x, z^{(s)}), z^{(s)} \\sim q(z; \\lambda),$
+
+for each sample $s=1,\\ldots,S$, this function uses
+
+$\\log p(x, z^{(s)}, \\beta^{(s)}),$
+
+where $z^{(s)} \\sim q(z; \\lambda)$ and $\\beta^{(s)}
+\\sim q(\\beta)$.
+"""[1:-1]
+notes_discriminator_scope = """
+In building the computation graph for inference, the
+discriminator's parameters can be accessed with the variable scope
+"Disc".
+"""[1:-1]
+notes_mcmc_programs = """
+Probabilistic programs may have random variables which vary across
+executions. At each iteration, the MCMC algorithm transitions across
+the (finite) list of latent variables seen during one execution of
+the model. The previous state is read from `states`: if the
+execution encounters a latent variable not existing in `states`, the
+previous state is a draw from the prior.
+
+We recommend updating `states` with the sampler's output after each
+iteration. For example, in Eager mode:
+```python
+states = {}
+for _ in range(10000):
+  new_states, ... = mcmc(..., states=states, ...)
+  states.update(new_states)
+```
+This caches previous states within the `states` dictionary. States
+are only updated when the associated latent variable is seen again
+in the model's execution. As long as every latent variable of
+interest appears in the execution with non-zero probability, the
+distribution of each state is guaranteed to converge to the target
+distribution.
+
+This idea can be seen as a joint version of single-site
+Metropolis-Hastings [@wingate2011lightweight], but note it does not
+rerun any part of the program. In fact, the newly transitioned states
+given old states may not actually be a valid output of the program.
+For example, consider
+```python
+def model():
+  x = Bernoulli(probs=0.5)
+  if tf.cast(x, tf.bool):
+    y = Normal(0.0, 1.0)
+  else:
+    y = Gamma(1.0, 1.0)
+  return x, y
+```
+Given a previous state from (Bernoulli, Normal), the proposal might
+generate (0, -0.3), which is not in the program's support.
+"""[1:-1]
+notes_model_parameters = """
+The function also enables optimizing model parameters $p(z \\mid x;
+\\theta)$. It does this by variational EM, maximizing
+
+$\\mathbb{E}_{q(z; \\lambda)} [ \\log p(x, z; \\theta) ]$
+
+with respect to $\\theta$.
+"""[1:-1]
+notes_regularization_losses = """
+The objective function also adds to itself a summation over all
+tensors in the `REGULARIZATION_LOSSES` collection.
+"""[1:-1]
+return_loss = """
+  Scalar tf.Tensor representing the loss. Its automatic
+  differentiation is the gradient to follow for optimization.
+"""[1:-1]
+return_loss_loss_d = """
+  Pair of scalar tf.Tensors, representing the generative loss and
+  discriminative loss respectively.
+"""[1:-1]
+return_loss_surrogate_loss = """
+  Pair of scalar tf.Tensors, representing the loss and surrogate loss
+  respectively. The surrogate loss' automatic differentiation is the
+  gradient to follow for optimization.
+"""[1:-1]
+return_samples = """
+  Dict of tf.Tensor. The keys are according to the return values of
+  `align_latent`. The associated values are the transitioned states
+  from the Markov chain.
+"""[1:-1]
+return_surrogate_loss = """
+  Scalar tf.Tensor representing the surrogate loss. Its automatic
+  differentiation is the gradient to follow for optimization.
+"""[1:-1]
diff --git a/edward/inferences/gan_inference.py b/edward/inferences/gan_inference.py
index 2a8c9d17c..50a6c410a 100644
--- a/edward/inferences/gan_inference.py
+++ b/edward/inferences/gan_inference.py
@@ -5,11 +5,21 @@
 import six
 import tensorflow as tf
 
-from edward.inferences.variational_inference import VariationalInference
-from edward.util import get_session
-
-
-class GANInference(VariationalInference):
+from edward.inferences import docstrings as doc
+from edward.inferences.util import make_optional_inputs
+
+
+@doc.set_doc(
+    args=(doc.arg_model +
+          doc.arg_discriminator +
+          doc.arg_align_data +
+          doc.arg_collections +
+          doc.arg_args_kwargs)[:-1],
+    returns=doc.return_loss_loss_d,
+    notes_discriminator_scope=doc.notes_discriminator_scope,
+    notes_regularization_losses=doc.notes_regularization_losses)
+def gan_inference(model, discriminator, align_data,
+                  collections=None, *args, **kwargs):
   """Parameter estimation with GAN-style training
   [@goodfellow2014generative].
 
@@ -17,239 +27,66 @@ class GANInference(VariationalInference):
   models. These models do not require a tractable density and assume
   only a program that generates samples.
 
-  #### Notes
+  Args:
+  @{args}
 
-  `GANInference` does not support latent variable inference. Note
-  that GAN-style training also samples from the prior: this does not
-  work well for latent variables that are shared across many data
-  points (global variables).
+  `model` must return the generated data.
 
-  In building the computation graph for inference, the
-  discriminator's parameters can be accessed with the variable scope
-  "Disc".
+  Returns:
+  @{returns}
 
-  GANs also only work for one observed random variable in `data`.
+  #### Notes
+
+  @{notes_discriminator_scope}
 
-  The objective function also adds to itself a summation over all
-  tensors in the `REGULARIZATION_LOSSES` collection.
+  @{notes_regularization_losses}
 
   #### Examples
 
   ```python
-  z = Normal(loc=tf.zeros([100, 10]), scale=tf.ones([100, 10]))
-  x = generative_network(z)
-
-  inference = ed.GANInference({x: x_data}, discriminator)
+  def model():
+    z = Normal(loc=0.0, scale=1.0, sample_shape=[256, 25])
+    x = generative_network(z, name="x")
+    return x
+
+  def discriminator(x):
+    net = tf.layers.dense(x, 256, activation=tf.nn.relu)
+    return tf.layers.dense(net, 1)
+
+  loss, loss_d = ed.gan_inference(
+      model, discriminator,
+      align_data=lambda name: "x_data" if name == "x" else None,
+      x_data=x_data)
   ```
   """
-  def __init__(self, data, discriminator):
-    """Create an inference algorithm.
-
-    Args:
-      data: dict.
-        Data dictionary which binds observed variables (of type
-        `RandomVariable` or `tf.Tensor`) to their realizations (of
-        type `tf.Tensor`).  It can also bind placeholders (of type
-        `tf.Tensor`) used in the model to their realizations.
-      discriminator: function.
-        Function (with parameters) to discriminate samples. It should
-        output logit probabilities (real-valued) and not probabilities
-        in $[0, 1]$.
-    """
-    if not callable(discriminator):
-      raise TypeError("discriminator must be a callable function.")
-
-    self.discriminator = discriminator
-    super(GANInference, self).__init__(None, data)
-
-  def initialize(self, optimizer=None, optimizer_d=None,
-                 global_step=None, global_step_d=None, var_list=None,
-                 *args, **kwargs):
-    """Initialize inference algorithm. It initializes hyperparameters
-    and builds ops for the algorithm's computation graph.
-
-    Args:
-      optimizer: str or tf.train.Optimizer.
-        A TensorFlow optimizer, to use for optimizing the generator
-        objective. Alternatively, one can pass in the name of a
-        TensorFlow optimizer, and default parameters for the optimizer
-        will be used.
-      optimizer_d: str or tf.train.Optimizer.
-        A TensorFlow optimizer, to use for optimizing the discriminator
-        objective. Alternatively, one can pass in the name of a
-        TensorFlow optimizer, and default parameters for the optimizer
-        will be used.
-      global_step: tf.Variable.
-        Optional `Variable` to increment by one after the variables
-        for the generator have been updated. See
-        `tf.train.Optimizer.apply_gradients`.
-      global_step_d: tf.Variable.
-        Optional `Variable` to increment by one after the variables
-        for the discriminator have been updated. See
-        `tf.train.Optimizer.apply_gradients`.
-      var_list: list of tf.Variable.
-        List of TensorFlow variables to optimize over (in the generative
-        model). Default is all trainable variables that `latent_vars`
-        and `data` depend on.
-    """
-    # call grandparent's method; avoid parent (VariationalInference)
-    super(VariationalInference, self).initialize(*args, **kwargs)
-
-    self.loss, grads_and_vars, self.loss_d, grads_and_vars_d = \
-        self.build_loss_and_gradients(var_list)
-
-    optimizer, global_step = _build_optimizer(optimizer, global_step)
-    optimizer_d, global_step_d = _build_optimizer(optimizer_d, global_step_d)
-
-    self.train = optimizer.apply_gradients(grads_and_vars,
-                                           global_step=global_step)
-    self.train_d = optimizer_d.apply_gradients(grads_and_vars_d,
-                                               global_step=global_step_d)
-
-    if self.logging:
-      tf.summary.scalar("loss", self.loss,
-                        collections=[self._summary_key])
-      tf.summary.scalar("loss/discriminative", self.loss_d,
-                        collections=[self._summary_key])
-      self.summarize = tf.summary.merge_all(key=self._summary_key)
-
-  def build_loss_and_gradients(self, var_list):
-    x_true = list(six.itervalues(self.data))[0]
-    x_fake = list(six.iterkeys(self.data))[0]
-    with tf.variable_scope("Disc"):
-      d_true = self.discriminator(x_true)
-
-    with tf.variable_scope("Disc", reuse=True):
-      d_fake = self.discriminator(x_fake)
-
-    if self.logging:
-      tf.summary.histogram("discriminator_outputs",
-                           tf.concat([d_true, d_fake], axis=0),
-                           collections=[self._summary_key])
-
-    reg_terms_d = tf.losses.get_regularization_losses(scope="Disc")
-    reg_terms_all = tf.losses.get_regularization_losses()
-    reg_terms = [r for r in reg_terms_all if r not in reg_terms_d]
-
-    loss_d = tf.nn.sigmoid_cross_entropy_with_logits(
-        labels=tf.ones_like(d_true), logits=d_true) + \
-        tf.nn.sigmoid_cross_entropy_with_logits(
-            labels=tf.zeros_like(d_fake), logits=d_fake)
-    loss = tf.nn.sigmoid_cross_entropy_with_logits(
-        labels=tf.ones_like(d_fake), logits=d_fake)
-    loss_d = tf.reduce_mean(loss_d) + tf.reduce_sum(reg_terms_d)
-    loss = tf.reduce_mean(loss) + tf.reduce_sum(reg_terms)
-
-    var_list_d = tf.get_collection(
-        tf.GraphKeys.TRAINABLE_VARIABLES, scope="Disc")
-    if var_list is None:
-      var_list = [v for v in tf.trainable_variables() if v not in var_list_d]
-
-    grads_d = tf.gradients(loss_d, var_list_d)
-    grads = tf.gradients(loss, var_list)
-    grads_and_vars_d = list(zip(grads_d, var_list_d))
-    grads_and_vars = list(zip(grads, var_list))
-    return loss, grads_and_vars, loss_d, grads_and_vars_d
-
-  def update(self, feed_dict=None, variables=None):
-    """Run one iteration of optimization.
-
-    Args:
-      feed_dict: dict.
-        Feed dictionary for a TensorFlow session run. It is used to feed
-        placeholders that are not fed during initialization.
-      variables: str.
-        Which set of variables to update. Either "Disc" or "Gen".
-        Default is both.
-
-    Returns:
-      dict.
-      Dictionary of algorithm-specific information. In this case, the
-      iteration number and generative and discriminative losses.
-
-    #### Notes
-
-    The outputted iteration number is the total number of calls to
-    `update`. Each update may include updating only a subset of
-    parameters.
-    """
-    if feed_dict is None:
-      feed_dict = {}
-
-    for key, value in six.iteritems(self.data):
-      if isinstance(key, tf.Tensor) and "Placeholder" in key.op.type:
-        feed_dict[key] = value
-
-    sess = get_session()
-    if variables is None:
-      _, _, t, loss, loss_d = sess.run(
-          [self.train, self.train_d, self.increment_t, self.loss, self.loss_d],
-          feed_dict)
-    elif variables == "Gen":
-      _, t, loss = sess.run(
-          [self.train, self.increment_t, self.loss], feed_dict)
-      loss_d = 0.0
-    elif variables == "Disc":
-      _, t, loss_d = sess.run(
-          [self.train_d, self.increment_t, self.loss_d], feed_dict)
-      loss = 0.0
-    else:
-      raise NotImplementedError("variables must be None, 'Gen', or 'Disc'.")
-
-    if self.debug:
-      sess.run(self.op_check, feed_dict)
-
-    if self.logging and self.n_print != 0:
-      if t == 1 or t % self.n_print == 0:
-        summary = sess.run(self.summarize, feed_dict)
-        self.train_writer.add_summary(summary, t)
-
-    return {'t': t, 'loss': loss, 'loss_d': loss_d}
-
-  def print_progress(self, info_dict):
-    """Print progress to output.
-    """
-    if self.n_print != 0:
-      t = info_dict['t']
-      if t == 1 or t % self.n_print == 0:
-        self.progbar.update(t, {'Gen Loss': info_dict['loss'],
-                                'Disc Loss': info_dict['loss_d']})
-
-
-def _build_optimizer(optimizer, global_step):
-  if optimizer is None and global_step is None:
-    # Default optimizer always uses a global step variable.
-    global_step = tf.Variable(0, trainable=False, name="global_step")
-
-  if isinstance(global_step, tf.Variable):
-    starter_learning_rate = 0.1
-    learning_rate = tf.train.exponential_decay(starter_learning_rate,
-                                               global_step,
-                                               100, 0.9, staircase=True)
-  else:
-    learning_rate = 0.01
-
-  # Build optimizer.
-  if optimizer is None:
-    optimizer = tf.train.AdamOptimizer(learning_rate)
-  elif isinstance(optimizer, str):
-    if optimizer == 'gradientdescent':
-      optimizer = tf.train.GradientDescentOptimizer(learning_rate)
-    elif optimizer == 'adadelta':
-      optimizer = tf.train.AdadeltaOptimizer(learning_rate)
-    elif optimizer == 'adagrad':
-      optimizer = tf.train.AdagradOptimizer(learning_rate)
-    elif optimizer == 'momentum':
-      optimizer = tf.train.MomentumOptimizer(learning_rate, 0.9)
-    elif optimizer == 'adam':
-      optimizer = tf.train.AdamOptimizer(learning_rate)
-    elif optimizer == 'ftrl':
-      optimizer = tf.train.FtrlOptimizer(learning_rate)
-    elif optimizer == 'rmsprop':
-      optimizer = tf.train.RMSPropOptimizer(learning_rate)
-    else:
-      raise ValueError('Optimizer class not found:', optimizer)
-  elif not isinstance(optimizer, tf.train.Optimizer):
-    raise TypeError("Optimizer must be str, tf.train.Optimizer, or None.")
-
-  return optimizer, global_step
+  model = make_optional_inputs(model)
+  x_fake = model(*args, **kwargs)
+  key = align_data(x_fake.name.split(':')[0])
+  if isinstance(key, int):
+    x_true = args[key]
+  elif kwargs.get(key, None) is not None:
+    x_true = kwargs.get(key)
+  with tf.variable_scope("Disc"):
+    d_true = discriminator(x_true)
+
+  with tf.variable_scope("Disc", reuse=True):
+    d_fake = discriminator(x_fake)
+
+  if collections is not None:
+    tf.summary.histogram("discriminator_outputs",
+                         tf.concat([d_true, d_fake], axis=0),
+                         collections=collections)
+
+  reg_terms_d = tf.losses.get_regularization_losses(scope="Disc")
+  reg_terms_all = tf.losses.get_regularization_losses()
+  reg_terms = [r for r in reg_terms_all if r not in reg_terms_d]
+
+  loss_d = tf.nn.sigmoid_cross_entropy_with_logits(
+      labels=tf.ones_like(d_true), logits=d_true) + \
+      tf.nn.sigmoid_cross_entropy_with_logits(
+          labels=tf.zeros_like(d_fake), logits=d_fake)
+  loss = tf.nn.sigmoid_cross_entropy_with_logits(
+      labels=tf.ones_like(d_fake), logits=d_fake)
+  loss_d = tf.reduce_mean(loss_d) + tf.reduce_sum(reg_terms_d)
+  loss = tf.reduce_mean(loss) + tf.reduce_sum(reg_terms)
+  return loss, loss_d
diff --git a/edward/inferences/gibbs.py b/edward/inferences/gibbs.py
deleted file mode 100644
index 3efb2d0c9..000000000
--- a/edward/inferences/gibbs.py
+++ /dev/null
@@ -1,152 +0,0 @@
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import random
-import six
-import tensorflow as tf
-
-from collections import OrderedDict
-from edward.inferences.conjugacy import complete_conditional
-from edward.inferences.monte_carlo import MonteCarlo
-from edward.models import RandomVariable
-from edward.util import check_latent_vars, get_session
-
-
-class Gibbs(MonteCarlo):
-  """Gibbs sampling [@geman1984stochastic].
-
-  Note `Gibbs` assumes the proposal distribution has the same
-  support as the prior. The `auto_transform` attribute in
-  the method `initialize()` is not applicable.
-
-  #### Examples
-
-  ```python
-  x_data = np.array([0, 1, 0, 0, 0, 0, 0, 0, 0, 1])
-
-  p = Beta(1.0, 1.0)
-  x = Bernoulli(probs=p, sample_shape=10)
-
-  qp = Empirical(tf.Variable(tf.zeros(500)))
-  inference = ed.Gibbs({p: qp}, data={x: x_data})
-  ```
-  """
-  def __init__(self, latent_vars, proposal_vars=None, data=None):
-    """Create an inference algorithm.
-
-    Args:
-      proposal_vars: dict of RandomVariable to RandomVariable.
-        Collection of random variables to perform inference on; each is
-        binded to its complete conditionals which Gibbs cycles draws on.
-        If not specified, default is to use `ed.complete_conditional`.
-    """
-    if proposal_vars is None:
-      proposal_vars = {z: complete_conditional(z)
-                       for z in six.iterkeys(latent_vars)}
-    else:
-      check_latent_vars(proposal_vars)
-
-    self.proposal_vars = proposal_vars
-    super(Gibbs, self).__init__(latent_vars, data)
-
-  def initialize(self, scan_order='random', *args, **kwargs):
-    """Initialize inference algorithm. It initializes hyperparameters
-    and builds ops for the algorithm's computation graph.
-
-    Args:
-      scan_order: list or str.
-        The scan order for each Gibbs update. If list, it is the
-        deterministic order of latent variables. An element in the list
-        can be a `RandomVariable` or itself a list of
-        `RandomVariable`s (this defines a blocked Gibbs sampler). If
-        'random', will use a random order at each update.
-    """
-    self.scan_order = scan_order
-    self.feed_dict = {}
-    kwargs['auto_transform'] = False
-    return super(Gibbs, self).initialize(*args, **kwargs)
-
-  def update(self, feed_dict=None):
-    """Run one iteration of sampling.
-
-    Args:
-      feed_dict: dict.
-        Feed dictionary for a TensorFlow session run. It is used to feed
-        placeholders that are not fed during initialization.
-
-    Returns:
-      dict.
-      Dictionary of algorithm-specific information. In this case, the
-      acceptance rate of samples since (and including) this iteration.
-    """
-    sess = get_session()
-    if not self.feed_dict:
-      # Initialize feed for all conditionals to be the draws at step 0.
-      samples = OrderedDict(self.latent_vars)
-      inits = sess.run([qz.params[0] for qz in six.itervalues(samples)])
-      for z, init in zip(six.iterkeys(samples), inits):
-        self.feed_dict[z] = init
-
-      for key, value in six.iteritems(self.data):
-        if isinstance(key, tf.Tensor) and "Placeholder" in key.op.type:
-          self.feed_dict[key] = value
-        elif isinstance(key, RandomVariable) and \
-                isinstance(value, (tf.Tensor, tf.Variable)):
-          self.feed_dict[key] = sess.run(value)
-
-    if feed_dict is None:
-      feed_dict = {}
-
-    self.feed_dict.update(feed_dict)
-
-    # Determine scan order.
-    if self.scan_order == 'random':
-      scan_order = list(six.iterkeys(self.latent_vars))
-      random.shuffle(scan_order)
-    else:  # list
-      scan_order = self.scan_order
-
-    # Fetch samples by iterating over complete conditional draws.
-    for z in scan_order:
-      if isinstance(z, RandomVariable):
-        draw = sess.run(self.proposal_vars[z], self.feed_dict)
-        self.feed_dict[z] = draw
-      else:  # list
-        draws = sess.run([self.proposal_vars[zz] for zz in z], self.feed_dict)
-        for zz, draw in zip(z, draws):
-          self.feed_dict[zz] = draw
-
-    # Assign the samples to the Empirical random variables.
-    _, accept_rate = sess.run(
-        [self.train, self.n_accept_over_t], self.feed_dict)
-    t = sess.run(self.increment_t)
-
-    if self.debug:
-      sess.run(self.op_check, self.feed_dict)
-
-    if self.logging and self.n_print != 0:
-      if t == 1 or t % self.n_print == 0:
-        summary = sess.run(self.summarize, self.feed_dict)
-        self.train_writer.add_summary(summary, t)
-
-    return {'t': t, 'accept_rate': accept_rate}
-
-  def build_update(self):
-    """
-    #### Notes
-
-    The updates assume each Empirical random variable is directly
-    parameterized by `tf.Variable`s.
-    """
-    # Update Empirical random variables according to the complete
-    # conditionals. We will feed the conditionals when calling `update()`.
-    assign_ops = []
-    for z, qz in six.iteritems(self.latent_vars):
-      variable = qz.get_variables()[0]
-      assign_ops.append(
-          tf.scatter_update(variable, self.t, self.proposal_vars[z]))
-
-    # Increment n_accept (if accepted).
-    assign_ops.append(self.n_accept.assign_add(1))
-    return tf.group(*assign_ops)
diff --git a/edward/inferences/hmc.py b/edward/inferences/hmc.py
index 825941ea3..9dba22669 100644
--- a/edward/inferences/hmc.py
+++ b/edward/inferences/hmc.py
@@ -5,206 +5,109 @@
 import six
 import tensorflow as tf
 
-from collections import OrderedDict
-from edward.inferences.monte_carlo import MonteCarlo
-from edward.models import RandomVariable
-from edward.util import copy
-
-
-class HMC(MonteCarlo):
+from edward.inferences import docstrings as doc
+from edward.inferences.util import make_log_joint
+
+tfp = tf.contrib.bayesflow
+
+
+@doc.set_doc(
+    args_part_one=(doc.arg_model +
+                   doc.arg_align_latent_monte_carlo +
+                   doc.arg_align_data +
+                   doc.arg_current_state)[:-1],
+    args_part_two=(doc.arg_step_size +
+                   doc.arg_current_target_log_prob +
+                   doc.arg_current_grads_target_log_prob +
+                   doc.arg_auto_transform +
+                   doc.arg_collections +
+                   doc.arg_args_kwargs)[:-1],
+    returns=doc.return_samples,
+    notes_mcmc_programs=doc.notes_mcmc_programs,
+    notes_conditional_inference=doc.notes_conditional_inference)
+def hmc(model,
+        align_latent,
+        align_data,
+        current_state=None,
+        num_leapfrog_steps=2,
+        step_size=0.25,
+        current_target_log_prob=None,
+        current_grads_target_log_prob=None,
+        auto_transform=True,
+        collections=None,
+        *args, **kwargs):
   """Hamiltonian Monte Carlo, also known as hybrid Monte Carlo
   [@duane1987hybrid; @neal2011mcmc].
 
-  #### Notes
+  HMC simulates Hamiltonian dynamics using a numerical integrator. The
+  integrator has a discretization error and is corrected with a
+  Metropolis accept-reject step.
 
-  In conditional inference, we infer $z$ in $p(z, \\beta
-  \mid x)$ while fixing inference over $\\beta$ using another
-  distribution $q(\\beta)$.
-  `HMC` substitutes the model's log marginal density
+  Works for any probabilistic program whose latent variables of
+  interest are differentiable. If `auto_transform=True`, the latent
+  variables may exist on any constrained differentiable support.
 
-  $\log p(x, z) = \log \mathbb{E}_{q(\\beta)} [ p(x, z, \\beta) ]
-                \\approx \log p(x, z, \\beta^*)$
+  Args:
+  @{args_part_one}
+    num_leapfrog_steps: int.
+      Number of leapfrog steps the numerical integrator takes per call.
+  @{args_part_two}
+
+  Returns:
+  @{returns}
+
+  #### Notes
 
-  leveraging a single Monte Carlo sample, where $\\beta^* \sim
-  q(\\beta)$. This is unbiased (and therefore asymptotically exact as a
-  pseudo-marginal method) if $q(\\beta) = p(\\beta \mid x)$.
+  @{notes_mcmc_programs}
+
+  @{notes_conditional_inference}
 
   #### Examples
 
+  Consider the following setup.
   ```python
-  mu = Normal(loc=0.0, scale=1.0)
-  x = Normal(loc=mu, scale=1.0, sample_shape=10)
-
-  qmu = Empirical(tf.Variable(tf.zeros(500)))
-  inference = ed.HMC({mu: qmu}, {x: np.zeros(10, dtype=np.float32)})
+  def model():
+    mu = Normal(loc=0.0, scale=1.0, name="mu")
+    x = Normal(loc=mu, scale=1.0, sample_shape=10, name="x")
+    return x
+  ```
+  In graph mode, build `tf.Variable`s that hold the state of the Markov
+  chain. Fetch the update op repeatedly at runtime to advance the chain.
+  ```python
+  qmu = tf.get_variable("qmu", initializer=1.)
+  next_state, _, _ = ed.hmc(
+      model,
+      ...,
+      current_state=qmu,
+      align_latent=lambda name: "qmu" if name == "mu" else None,
+      align_data=lambda name: "x_data" if name == "x" else None,
+      x_data=x_data)
+  qmu_update = qmu.assign(next_state)
+  ```
+  In eager mode, call the function in a loop at runtime, feeding each
+  call's outputs back in as the next call's inputs (e.g., `current_state`).
+  ```python
+  qmu = 1.
+  next_log_prob = None
+  next_gradients = None
+  for _ in range(1000):
+    next_state, next_log_prob, next_gradients = ed.hmc(
+        model,
+        ...,
+        current_state=qmu,
+        align_latent=lambda name: "qmu" if name == "mu" else None,
+        align_data=lambda name: "x_data" if name == "x" else None,
+        current_target_log_prob=next_log_prob,
+        current_grads_target_log_prob=next_gradients,
+        x_data=x_data)
+    qmu = next_state
   ```
   """
-  def __init__(self, *args, **kwargs):
-    super(HMC, self).__init__(*args, **kwargs)
-
-  def initialize(self, step_size=0.25, n_steps=2, *args, **kwargs):
-    """Initialize inference algorithm. It initializes hyperparameters
-    and builds ops for the algorithm's computation graph.
-
-    Args:
-      step_size: float.
-        Step size of numerical integrator.
-      n_steps: int.
-        Number of steps of numerical integrator.
-    """
-    self.step_size = step_size
-    self.n_steps = n_steps
-    # store global scope for log joint calculations
-    self._scope = tf.get_default_graph().unique_name("inference") + '/'
-    return super(HMC, self).initialize(*args, **kwargs)
-
-  def build_update(self):
-    """Simulate Hamiltonian dynamics using a numerical integrator.
-    Correct for the integrator's discretization error using an
-    acceptance ratio.
-
-    #### Notes
-
-    The updates assume each Empirical random variable is directly
-    parameterized by `tf.Variable`s.
-    """
-
-    # Gather the initial state, transformed to unconstrained space.
-    try:
-      self.latent_vars_unconstrained
-    except:
-      raise ValueError("This implementation of HMC requires that all "
-                       "variables have unconstrained support. Please "
-                       "initialize with auto_transform=True to ensure "
-                       "this. (if your variables already have unconstrained "
-                       "support then doing this is a no-op).")
-    old_sample = {z_unconstrained:
-                  tf.gather(qz_unconstrained.params, tf.maximum(self.t - 1, 0))
-                  for z_unconstrained, qz_unconstrained in
-                  six.iteritems(self.latent_vars_unconstrained)}
-    old_sample = OrderedDict(old_sample)
-
-    # Sample momentum.
-    old_r_sample = OrderedDict()
-    for z, qz in six.iteritems(self.latent_vars_unconstrained):
-      event_shape = qz.event_shape
-      old_r_sample[z] = tf.random_normal(event_shape, dtype=qz.dtype)
-
-    # Simulate Hamiltonian dynamics.
-    new_sample, new_r_sample = leapfrog(old_sample, old_r_sample,
-                                        self.step_size,
-                                        self._log_joint_unconstrained,
-                                        self.n_steps)
-
-    # Calculate acceptance ratio.
-    ratio = tf.reduce_sum([0.5 * tf.reduce_sum(tf.square(r))
-                           for r in six.itervalues(old_r_sample)])
-    ratio -= tf.reduce_sum([0.5 * tf.reduce_sum(tf.square(r))
-                            for r in six.itervalues(new_r_sample)])
-    ratio += self._log_joint_unconstrained(new_sample)
-    ratio -= self._log_joint_unconstrained(old_sample)
-
-    # Accept or reject sample.
-    u = tf.random_uniform([], dtype=ratio.dtype)
-    accept = tf.log(u) < ratio
-    sample_values = tf.cond(accept, lambda: list(six.itervalues(new_sample)),
-                            lambda: list(six.itervalues(old_sample)))
-    if not isinstance(sample_values, list):
-      # `tf.cond` returns tf.Tensor if output is a list of size 1.
-      sample_values = [sample_values]
-
-    sample = {z_unconstrained: sample_value for
-              z_unconstrained, sample_value in
-              zip(six.iterkeys(new_sample), sample_values)}
-
-    # Update Empirical random variables.
-    assign_ops = []
-    for z_unconstrained, qz_unconstrained in six.iteritems(
-            self.latent_vars_unconstrained):
-      variable = qz_unconstrained.get_variables()[0]
-      assign_ops.append(tf.scatter_update(
-          variable, self.t, sample[z_unconstrained]))
-
-    # Increment n_accept (if accepted).
-    assign_ops.append(self.n_accept.assign_add(tf.where(accept, 1, 0)))
-    return tf.group(*assign_ops)
-
-  def _log_joint_unconstrained(self, z_sample):
-    """
-    Given a sample in unconstrained latent space, transform it back into
-    the original space, and compute the log joint density with appropriate
-    Jacobian correction.
-    """
-
-    unconstrained_to_z = {v: k for (k, v) in self.transformations.items()}
-
-    # transform all samples back into the original (potentially
-    # constrained) space.
-    z_sample_transformed = {}
-    log_det_jacobian = 0.0
-    for z_unconstrained, qz_unconstrained in z_sample.items():
-      z = (unconstrained_to_z[z_unconstrained]
-           if z_unconstrained in unconstrained_to_z
-           else z_unconstrained)
-
-      try:
-        bij = self.transformations[z].bijector
-        z_sample_transformed[z] = bij.inverse(qz_unconstrained)
-        log_det_jacobian += tf.reduce_sum(
-            bij.inverse_log_det_jacobian(qz_unconstrained))
-      except:  # if z not in self.transformations,
-               # or is not a TransformedDist w/ bijector
-        z_sample_transformed[z] = qz_unconstrained
-
-    return self._log_joint(z_sample_transformed) + log_det_jacobian
-
-  def _log_joint(self, z_sample):
-    """Utility function to calculate model's log joint density,
-    log p(x, z), for inputs z (and fixed data x).
-
-    Args:
-      z_sample: dict.
-        Latent variable keys to samples.
-    """
-    scope = self._scope + tf.get_default_graph().unique_name("sample")
-    # Form dictionary in order to replace conditioning on prior or
-    # observed variable with conditioning on a specific value.
-    dict_swap = z_sample.copy()
-
-    for x, qx in six.iteritems(self.data):
-      if isinstance(x, RandomVariable):
-        if isinstance(qx, RandomVariable):
-          qx_copy = copy(qx, scope=scope)
-          dict_swap[x] = qx_copy.value()
-        else:
-          dict_swap[x] = qx
-
-    log_joint = 0.0
-    for z in six.iterkeys(self.latent_vars):
-      z_copy = copy(z, dict_swap, scope=scope)
-      log_joint += tf.reduce_sum(z_copy.log_prob(dict_swap[z]))
-
-    for x in six.iterkeys(self.data):
-      if isinstance(x, RandomVariable):
-        x_copy = copy(x, dict_swap, scope=scope)
-        log_joint += tf.reduce_sum(x_copy.log_prob(dict_swap[x]))
-
-    return log_joint
-
-
-def leapfrog(z_old, r_old, step_size, log_joint, n_steps):
-  z_new = z_old.copy()
-  r_new = r_old.copy()
-
-  grad_log_joint = tf.gradients(log_joint(z_new), list(six.itervalues(z_new)))
-  for _ in range(n_steps):
-    for i, key in enumerate(six.iterkeys(z_new)):
-      z, r = z_new[key], r_new[key]
-      r_new[key] = r + 0.5 * step_size * tf.convert_to_tensor(grad_log_joint[i])
-      z_new[key] = z + step_size * r_new[key]
-
-    grad_log_joint = tf.gradients(log_joint(z_new), list(six.itervalues(z_new)))
-    for i, key in enumerate(six.iterkeys(z_new)):
-      r_new[key] += 0.5 * step_size * tf.convert_to_tensor(grad_log_joint[i])
-
-  return z_new, r_new
+  out = tfp.hmc.kernel(
+      target_log_prob_fn=make_log_joint(model, current_state),
+      current_state=current_state,
+      step_size=step_size,
+      num_leapfrog_steps=num_leapfrog_steps,
+      current_target_log_prob=current_target_log_prob,
+      current_grads_target_log_prob=current_grads_target_log_prob)
+  return out
diff --git a/edward/inferences/implicit_klqp.py b/edward/inferences/implicit_klqp.py
deleted file mode 100644
index a794978b2..000000000
--- a/edward/inferences/implicit_klqp.py
+++ /dev/null
@@ -1,242 +0,0 @@
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import six
-import tensorflow as tf
-
-from edward.inferences.gan_inference import GANInference
-from edward.models import RandomVariable
-from edward.util import check_latent_vars, copy, get_session
-
-
-class ImplicitKLqp(GANInference):
-  """Variational inference with implicit probabilistic models
-  [@tran2017deep].
-
-  It minimizes the KL divergence
-
-  $\\text{KL}( q(z, \\beta; \lambda) \| p(z, \\beta \mid x) ),$
-
-  where $z$ are local variables associated to a data point and
-  $\\beta$ are global variables shared across data points.
-
-  Global latent variables require `log_prob()` and need to return a
-  random sample when fetched from the graph. Local latent variables
-  and observed variables require only a random sample when fetched
-  from the graph. (This is true for both $p$ and $q$.)
-
-  All variational factors must be reparameterizable: each of the
-  random variables (`rv`) satisfies `rv.is_reparameterized` and
-  `rv.is_continuous`.
-
-  #### Notes
-
-  Unlike `GANInference`, `discriminator` takes dict's as input,
-  and must subset to the appropriate values through lexical scoping
-  from the previously defined model and latent variables. This is
-  necessary as the discriminator can take an arbitrary set of data,
-  latent, and global variables.
-
-  Note the type for `discriminator`'s output changes when one
-  passes in the `scale` argument to `initialize()`.
-
-  + If `scale` has at most one item, then `discriminator`
-  outputs a tensor whose multiplication with that element is
-  broadcastable. (For example, the output is a tensor and the single
-  scale factor is a scalar.)
-  + If `scale` has more than one item, then in order to scale
-  its corresponding output, `discriminator` must output a
-  dictionary of same size and keys as `scale`.
-
-  The objective function also adds to itself a summation over all
-  tensors in the `REGULARIZATION_LOSSES` collection.
-  """
-  def __init__(self, latent_vars, data=None, discriminator=None,
-               global_vars=None):
-    """Create an inference algorithm.
-
-    Args:
-      discriminator: function.
-        Function (with parameters). Unlike `GANInference`, it is
-        interpreted as a ratio estimator rather than a discriminator.
-        It takes three arguments: a data dict, local latent variable
-        dict, and global latent variable dict. As with GAN
-        discriminators, it can take a batch of data points and local
-        variables, of size $M$, and output a vector of length
-        $M$.
-      global_vars: dict of RandomVariable to RandomVariable.
-        Identifying which variables in `latent_vars` are global
-        variables, shared across data points. These will not be
-        encompassed in the ratio estimation problem, and will be
-        estimated with tractable variational approximations.
-    """
-    if not callable(discriminator):
-      raise TypeError("discriminator must be a callable function.")
-
-    self.discriminator = discriminator
-    if global_vars is None:
-      global_vars = {}
-
-    check_latent_vars(global_vars)
-    self.global_vars = global_vars
-    # call grandparent's method; avoid parent (GANInference)
-    super(GANInference, self).__init__(latent_vars, data)
-
-  def initialize(self, ratio_loss='log', *args, **kwargs):
-    """Initialize inference algorithm. It initializes hyperparameters
-    and builds ops for the algorithm's computation graph.
-
-    Args:
-      ratio_loss: str or fn.
-        Loss function minimized to get the ratio estimator. 'log' or 'hinge'.
-        Alternatively, one can pass in a function of two inputs,
-        `psamples` and `qsamples`, and output a point-wise value
-        with shape matching the shapes of the two inputs.
-    """
-    if callable(ratio_loss):
-      self.ratio_loss = ratio_loss
-    elif ratio_loss == 'log':
-      self.ratio_loss = log_loss
-    elif ratio_loss == 'hinge':
-      self.ratio_loss = hinge_loss
-    else:
-      raise ValueError('Ratio loss not found:', ratio_loss)
-
-    return super(ImplicitKLqp, self).initialize(*args, **kwargs)
-
-  def build_loss_and_gradients(self, var_list):
-    """Build loss function
-
-    $-\Big(\mathbb{E}_{q(\\beta)} [\log p(\\beta) - \log q(\\beta) ] +
-        \sum_{n=1}^N \mathbb{E}_{q(\\beta)q(z_n\mid\\beta)} [
-            r^*(x_n, z_n, \\beta) ] \Big).$
-
-    We minimize it with respect to parameterized variational
-    families $q(z, \\beta; \lambda)$.
-
-    $r^*(x_n, z_n, \\beta)$ is a function of a single data point
-    $x_n$, single local variable $z_n$, and all global
-    variables $\\beta$. It is equal to the log-ratio
-
-    $\log p(x_n, z_n\mid \\beta) - \log q(x_n, z_n\mid \\beta),$
-
-    where $q(x_n)$ is the empirical data distribution. Rather
-    than explicit calculation, $r^*(x, z, \\beta)$ is the
-    solution to a ratio estimation problem, minimizing the specified
-    `ratio_loss`.
-
-    Gradients are taken using the reparameterization trick
-    [@kingma2014auto].
-
-    #### Notes
-
-    This also includes model parameters $p(x, z, \\beta; \\theta)$
-    and variational distributions with inference networks
-    $q(z\mid x)$.
-
-    There are a bunch of extensions we could easily do in this
-    implementation:
-
-    + further factorizations can be used to better leverage the
-      graph structure for more complicated models;
-    + score function gradients for global variables;
-    + use more samples; this would require the `copy()` utility
-      function for q's as well, and an additional loop. we opt not to
-      because it complicates the code;
-    + analytic KL/swapping out the penalty term for the globals.
-    """
-    # Collect tensors used in calculation of losses.
-    scope = tf.get_default_graph().unique_name("inference")
-    qbeta_sample = {}
-    pbeta_log_prob = 0.0
-    qbeta_log_prob = 0.0
-    for beta, qbeta in six.iteritems(self.global_vars):
-      # Draw a sample beta' ~ q(beta) and calculate
-      # log p(beta') and log q(beta').
-      qbeta_sample[beta] = qbeta.value()
-      pbeta_log_prob += tf.reduce_sum(beta.log_prob(qbeta_sample[beta]))
-      qbeta_log_prob += tf.reduce_sum(qbeta.log_prob(qbeta_sample[beta]))
-
-    pz_sample = {}
-    qz_sample = {}
-    for z, qz in six.iteritems(self.latent_vars):
-      if z not in self.global_vars:
-        # Copy local variables p(z), q(z) to draw samples
-        # z' ~ p(z | beta'), z' ~ q(z | beta').
-        pz_copy = copy(z, dict_swap=qbeta_sample, scope=scope)
-        pz_sample[z] = pz_copy.value()
-        qz_sample[z] = qz.value()
-
-    # Collect x' ~ p(x | z', beta') and x' ~ q(x).
-    dict_swap = qbeta_sample.copy()
-    dict_swap.update(qz_sample)
-    x_psample = {}
-    x_qsample = {}
-    for x, x_data in six.iteritems(self.data):
-      if isinstance(x, tf.Tensor):
-        if "Placeholder" not in x.op.type:
-          # Copy p(x | z, beta) to get draw p(x | z', beta').
-          x_copy = copy(x, dict_swap=dict_swap, scope=scope)
-          x_psample[x] = x_copy
-          x_qsample[x] = x_data
-      elif isinstance(x, RandomVariable):
-        # Copy p(x | z, beta) to get draw p(x | z', beta').
-        x_copy = copy(x, dict_swap=dict_swap, scope=scope)
-        x_psample[x] = x_copy.value()
-        x_qsample[x] = x_data
-
-    with tf.variable_scope("Disc"):
-      r_psample = self.discriminator(x_psample, pz_sample, qbeta_sample)
-
-    with tf.variable_scope("Disc", reuse=True):
-      r_qsample = self.discriminator(x_qsample, qz_sample, qbeta_sample)
-
-    # Form ratio loss and ratio estimator.
-    if len(self.scale) <= 1:
-      loss_d = tf.reduce_mean(self.ratio_loss(r_psample, r_qsample))
-      scale = list(six.itervalues(self.scale))
-      scale = scale[0] if scale else 1.0
-      scaled_ratio = tf.reduce_sum(scale * r_qsample)
-    else:
-      loss_d = [tf.reduce_mean(self.ratio_loss(r_psample[key], r_qsample[key]))
-                for key in six.iterkeys(self.scale)]
-      loss_d = tf.reduce_sum(loss_d)
-      scaled_ratio = [tf.reduce_sum(self.scale[key] * r_qsample[key])
-                      for key in six.iterkeys(self.scale)]
-      scaled_ratio = tf.reduce_sum(scaled_ratio)
-
-    reg_terms_d = tf.losses.get_regularization_losses(scope="Disc")
-    reg_terms_all = tf.losses.get_regularization_losses()
-    reg_terms = [r for r in reg_terms_all if r not in reg_terms_d]
-
-    # Form variational objective.
-    loss = -(pbeta_log_prob - qbeta_log_prob + scaled_ratio -
-             tf.reduce_sum(reg_terms))
-    loss_d = loss_d + tf.reduce_sum(reg_terms_d)
-
-    var_list_d = tf.get_collection(
-        tf.GraphKeys.TRAINABLE_VARIABLES, scope="Disc")
-    if var_list is None:
-      var_list = [v for v in tf.trainable_variables() if v not in var_list_d]
-
-    grads = tf.gradients(loss, var_list)
-    grads_d = tf.gradients(loss_d, var_list_d)
-    grads_and_vars = list(zip(grads, var_list))
-    grads_and_vars_d = list(zip(grads_d, var_list_d))
-    return loss, grads_and_vars, loss_d, grads_and_vars_d
-
-
-def log_loss(psample, qsample):
-  """Point-wise log loss."""
-  loss = tf.nn.sigmoid_cross_entropy_with_logits(
-      labels=tf.ones_like(psample), logits=psample) + \
-      tf.nn.sigmoid_cross_entropy_with_logits(
-          labels=tf.zeros_like(qsample), logits=qsample)
-  return loss
-
-
-def hinge_loss(psample, qsample):
-  """Point-wise hinge loss."""
-  loss = tf.nn.relu(1.0 - psample) + tf.nn.relu(1.0 + qsample)
-  return loss
diff --git a/edward/inferences/inference.py b/edward/inferences/inference.py
deleted file mode 100644
index 28614223f..000000000
--- a/edward/inferences/inference.py
+++ /dev/null
@@ -1,376 +0,0 @@
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import abc
-import numpy as np
-import six
-import tensorflow as tf
-import os
-
-from datetime import datetime
-from edward.models import RandomVariable
-from edward.util import check_data, check_latent_vars, get_session, \
-    get_variables, Progbar, transform
-
-from tensorflow.contrib.distributions import bijectors
-
-
-@six.add_metaclass(abc.ABCMeta)
-class Inference(object):
-  """Abstract base class for inference. All inference algorithms in
-  Edward inherit from `Inference`, sharing common methods and
-  properties via a class hierarchy.
-
-  Specific algorithms typically inherit from other subclasses of
-  `Inference` rather than `Inference` directly. For example, one
-  might inherit from the abstract classes `MonteCarlo` or
-  `VariationalInference`.
-
-  To build an algorithm inheriting from `Inference`, one must at the
-  minimum implement `initialize` and `update`: the former builds
-  the computational graph for the algorithm; the latter runs the
-  computational graph for the algorithm.
-
-  To reset inference (e.g., internal variable counters incremented
-  over training), fetch inference's reset ops from session with
-  `sess.run(inference.reset)`.
-
-  #### Examples
-
-  ```python
-  # Set up probability model.
-  mu = Normal(loc=0.0, scale=1.0)
-  x = Normal(loc=mu, scale=1.0, sample_shape=50)
-
-  # Set up posterior approximation.
-  qmu_loc = tf.Variable(tf.random_normal([]))
-  qmu_scale = tf.nn.softplus(tf.Variable(tf.random_normal([])))
-  qmu = Normal(loc=qmu_loc, scale=qmu_scale)
-
-  inference = ed.Inference({mu: qmu}, data={x: tf.zeros(50)})
-  ```
-  """
-  def __init__(self, latent_vars=None, data=None):
-    """Create an inference algorithm.
-
-    Args:
-      latent_vars: dict.
-        Collection of latent variables (of type `RandomVariable` or
-        `tf.Tensor`) to perform inference on. Each random variable is
-        binded to another random variable; the latter will infer the
-        former conditional on data.
-      data: dict.
-        Data dictionary which binds observed variables (of type
-        `RandomVariable` or `tf.Tensor`) to their realizations (of
-        type `tf.Tensor`). It can also bind placeholders (of type
-        `tf.Tensor`) used in the model to their realizations; and
-        prior latent variables (of type `RandomVariable`) to posterior
-        latent variables (of type `RandomVariable`).
-    """
-    sess = get_session()
-    if latent_vars is None:
-      latent_vars = {}
-    if data is None:
-      data = {}
-
-    check_latent_vars(latent_vars)
-    self.latent_vars = latent_vars
-
-    check_data(data)
-    self.data = {}
-    for key, value in six.iteritems(data):
-      if isinstance(key, tf.Tensor) and "Placeholder" in key.op.type:
-        self.data[key] = value
-      elif isinstance(key, (RandomVariable, tf.Tensor)):
-        if isinstance(value, (RandomVariable, tf.Tensor)):
-          self.data[key] = value
-        elif isinstance(value, (float, list, int, np.ndarray, np.number, str)):
-          # If value is a Python type, store it in the graph.
-          # Assign its placeholder with the key's data type.
-          with tf.variable_scope(None, default_name="data"):
-            ph = tf.placeholder(key.dtype, np.shape(value))
-            var = tf.Variable(ph, trainable=False, collections=[])
-            sess.run(var.initializer, {ph: value})
-            self.data[key] = var
-
-  def run(self, variables=None, use_coordinator=True, *args, **kwargs):
-    """A simple wrapper to run inference.
-
-    1. Initialize algorithm via `initialize`.
-    2. (Optional) Build a TensorFlow summary writer for TensorBoard.
-    3. (Optional) Initialize TensorFlow variables.
-    4. (Optional) Start queue runners.
-    5. Run `update` for `self.n_iter` iterations.
-    6. While running, `print_progress`.
-    7. Finalize algorithm via `finalize`.
-    8. (Optional) Stop queue runners.
-
-    To customize the way inference is run, run these steps
-    individually.
-
-    Args:
-      variables: list.
-        A list of TensorFlow variables to initialize during inference.
-        Default is to initialize all variables (this includes
-        reinitializing variables that were already initialized). To
-        avoid initializing any variables, pass in an empty list.
-      use_coordinator: bool.
-        Whether to start and stop queue runners during inference using a
-        TensorFlow coordinator. For example, queue runners are necessary
-        for batch training with file readers.
-      *args, **kwargs:
-        Passed into `initialize`.
-    """
-    self.initialize(*args, **kwargs)
-
-    if variables is None:
-      init = tf.global_variables_initializer()
-    else:
-      init = tf.variables_initializer(variables)
-
-    # Feed placeholders in case initialization depends on them.
-    feed_dict = {}
-    for key, value in six.iteritems(self.data):
-      if isinstance(key, tf.Tensor) and "Placeholder" in key.op.type:
-        feed_dict[key] = value
-
-    init.run(feed_dict)
-
-    if use_coordinator:
-      # Start input enqueue threads.
-      self.coord = tf.train.Coordinator()
-      self.threads = tf.train.start_queue_runners(coord=self.coord)
-
-    for _ in range(self.n_iter):
-      info_dict = self.update()
-      self.print_progress(info_dict)
-
-    self.finalize()
-
-    if use_coordinator:
-      # Ask threads to stop.
-      self.coord.request_stop()
-      self.coord.join(self.threads)
-
-  @abc.abstractmethod
-  def initialize(self, n_iter=1000, n_print=None, scale=None,
-                 auto_transform=True, logdir=None, log_timestamp=True,
-                 log_vars=None, debug=False):
-    """Initialize inference algorithm. It initializes hyperparameters
-    and builds ops for the algorithm's computation graph.
-
-    Any derived class of `Inference` **must** implement this method.
-    No methods which build ops should be called outside `initialize()`.
-
-    Args:
-      n_iter: int.
-        Number of iterations for algorithm when calling `run()`.
-        Alternatively if controlling inference manually, it is the
-        expected number of calls to `update()`; this number determines
-        tracking information during the print progress.
-      n_print: int.
-        Number of iterations for each print progress. To suppress print
-        progress, then specify 0. Default is `int(n_iter / 100)`.
-      scale: dict of RandomVariable to tf.Tensor.
-        A tensor to scale computation for any random variable that it is
-        binded to. Its shape must be broadcastable; it is multiplied
-        element-wise to the random variable. For example, this is useful
-        for mini-batch scaling when inferring global variables, or
-        applying masks on a random variable.
-      auto_transform: bool.
-        Whether to automatically transform continuous latent variables
-        of unequal support to be on the unconstrained space. It is
-        only applied if the argument is `True`, the latent variable
-        pair are `ed.RandomVariable`s with the `support` attribute,
-        the supports are both continuous and unequal.
-      logdir: str.
-        Directory where event file will be written. For details,
-        see `tf.summary.FileWriter`. Default is to log nothing.
-      log_timestamp: bool.
-        If True (and `logdir` is specified), create a subdirectory of
-        `logdir` to save the specific run results. The subdirectory's
-        name is the current UTC timestamp with format 'YYYYMMDD_HHMMSS'.
-      log_vars: list.
-        Specifies the list of variables to log after each `n_print`
-        steps. If None, will log all variables. If `[]`, no variables
-        will be logged. `logdir` must be specified for variables to be
-        logged.
-      debug: bool.
-        If True, add checks for `NaN` and `Inf` to all computations
-        in the graph. May result in substantially slower execution
-        times.
-    """
-    self.n_iter = n_iter
-    if n_print is None:
-      self.n_print = int(n_iter / 100)
-    else:
-      self.n_print = n_print
-
-    self.progbar = Progbar(self.n_iter)
-    self.t = tf.Variable(0, trainable=False, name="iteration")
-
-    self.increment_t = self.t.assign_add(1)
-
-    if scale is None:
-      scale = {}
-    elif not isinstance(scale, dict):
-      raise TypeError("scale must be a dict object.")
-
-    self.scale = scale
-
-    # map from original latent vars to unconstrained versions
-    self.transformations = {}
-    if auto_transform:
-      latent_vars = self.latent_vars.copy()
-      # latent_vars maps original latent vars to constrained Q's.
-      # latent_vars_unconstrained maps unconstrained vars to unconstrained Q's.
-      self.latent_vars = {}
-      self.latent_vars_unconstrained = {}
-      for z, qz in six.iteritems(latent_vars):
-        if hasattr(z, 'support') and hasattr(qz, 'support') and \
-                z.support != qz.support and qz.support != 'point':
-
-          # transform z to an unconstrained space
-          z_unconstrained = transform(z)
-          self.transformations[z] = z_unconstrained
-
-          # make sure we also have a qz that covers the unconstrained space
-          if qz.support == "points":
-            qz_unconstrained = qz
-          else:
-            qz_unconstrained = transform(qz)
-          self.latent_vars_unconstrained[z_unconstrained] = qz_unconstrained
-
-          # additionally construct the transformation of qz
-          # back into the original constrained space
-          if z_unconstrained != z:
-            qz_constrained = transform(
-                qz_unconstrained, bijectors.Invert(z_unconstrained.bijector))
-
-            try:  # attempt to pushforward the params of Empirical distributions
-              qz_constrained.params = z_unconstrained.bijector.inverse(
-                  qz_unconstrained.params)
-            except:  # qz_unconstrained is not an Empirical distribution
-              pass
-
-          else:
-            qz_constrained = qz_unconstrained
-
-          self.latent_vars[z] = qz_constrained
-        else:
-          self.latent_vars[z] = qz
-          self.latent_vars_unconstrained[z] = qz
-      del latent_vars
-
-    if logdir is not None:
-      self.logging = True
-      if log_timestamp:
-        logdir = os.path.expanduser(logdir)
-        logdir = os.path.join(
-            logdir, datetime.strftime(datetime.utcnow(), "%Y%m%d_%H%M%S"))
-
-      self._summary_key = tf.get_default_graph().unique_name("summaries")
-      self._set_log_variables(log_vars)
-      self.train_writer = tf.summary.FileWriter(logdir, tf.get_default_graph())
-    else:
-      self.logging = False
-
-    self.debug = debug
-    if self.debug:
-      self.op_check = tf.add_check_numerics_ops()
-
-    # Store reset ops which user can call. Subclasses should append
-    # any ops needed to reset internal variables in inference.
-    self.reset = [tf.variables_initializer([self.t])]
-
-  @abc.abstractmethod
-  def update(self, feed_dict=None):
-    """Run one iteration of inference.
-
-    Any derived class of `Inference` **must** implement this method.
-
-    Args:
-      feed_dict: dict.
-        Feed dictionary for a TensorFlow session run. It is used to feed
-        placeholders that are not fed during initialization.
-
-    Returns:
-      dict.
-        Dictionary of algorithm-specific information.
-    """
-    if feed_dict is None:
-      feed_dict = {}
-
-    for key, value in six.iteritems(self.data):
-      if isinstance(key, tf.Tensor) and "Placeholder" in key.op.type:
-        feed_dict[key] = value
-
-    sess = get_session()
-    t = sess.run(self.increment_t)
-
-    if self.debug:
-      sess.run(self.op_check, feed_dict)
-
-    if self.logging and self.n_print != 0:
-      if t == 1 or t % self.n_print == 0:
-        summary = sess.run(self.summarize, feed_dict)
-        self.train_writer.add_summary(summary, t)
-
-    return {'t': t}
-
-  def print_progress(self, info_dict):
-    """Print progress to output.
-
-    Args:
-      info_dict: dict.
-        Dictionary of algorithm-specific information.
-    """
-    if self.n_print != 0:
-      t = info_dict['t']
-      if t == 1 or t % self.n_print == 0:
-        self.progbar.update(t)
-
-  def finalize(self):
-    """Function to call after convergence.
-    """
-    if self.logging:
-      self.train_writer.close()
-
-  def _set_log_variables(self, log_vars=None):
-    """Log variables to TensorBoard.
-
-    For each variable in `log_vars`, forms a `tf.summary.scalar` if
-    the variable has scalar shape; otherwise forms a `tf.summary.histogram`.
-
-    Args:
-      log_vars: list.
-        Specifies the list of variables to log after each `n_print`
-        steps. If None, will log all variables. If `[]`, no variables
-        will be logged.
-    """
-    if log_vars is None:
-      log_vars = []
-      for key in six.iterkeys(self.data):
-        log_vars += get_variables(key)
-
-      for key, value in six.iteritems(self.latent_vars):
-        log_vars += get_variables(key)
-        log_vars += get_variables(value)
-
-      log_vars = set(log_vars)
-
-    for var in log_vars:
-      # replace colons which are an invalid character
-      var_name = var.name.replace(':', '/')
-      # Log all scalars.
-      if len(var.shape) == 0:
-        tf.summary.scalar("parameter/{}".format(var_name),
-                          var, collections=[self._summary_key])
-      elif len(var.shape) == 1 and var.shape[0] == 1:
-        tf.summary.scalar("parameter/{}".format(var_name),
-                          var[0], collections=[self._summary_key])
-      else:
-        # If var is multi-dimensional, log a histogram of its values.
-        tf.summary.histogram("parameter/{}".format(var_name),
-                             var, collections=[self._summary_key])
diff --git a/edward/inferences/klpq.py b/edward/inferences/klpq.py
index dfe0796e0..b1924398a 100644
--- a/edward/inferences/klpq.py
+++ b/edward/inferences/klpq.py
@@ -5,9 +5,9 @@
 import six
 import tensorflow as tf
 
-from edward.inferences.variational_inference import VariationalInference
-from edward.models import RandomVariable
-from edward.util import copy, get_descendants
+from edward.inferences import docstrings as doc
+from edward.inferences.util import (
+    call_with_intercept, call_with_trace, toposort)
 
 try:
   from edward.models import Normal
@@ -15,170 +15,120 @@
   raise ImportError("{0}. Your TensorFlow version is not supported.".format(e))
 
 
-class KLpq(VariationalInference):
+@doc.set_doc(
+    args=(doc.arg_model +
+          doc.arg_variational +
+          doc.arg_align_latent +
+          doc.arg_align_data +
+          doc.arg_scale +
+          doc.arg_n_samples +
+          doc.arg_auto_transform +
+          doc.arg_collections +
+          doc.arg_args_kwargs),
+    returns=doc.return_loss_surrogate_loss,
+    notes_model_parameters=doc.notes_model_parameters,
+    notes_conditional_inference=doc.notes_conditional_inference_samples,
+    notes_regularization_losses=doc.notes_regularization_losses)
+def klpq(model, variational, align_latent, align_data,
+         scale=lambda name: 1.0, n_samples=1, auto_transform=True,
+         collections=None, *args, **kwargs):
   """Variational inference with the KL divergence
 
-  $\\text{KL}( p(z \mid x) \| q(z) ).$
+  $\\text{KL}( p(z \mid x) \| q(z) )
+    = \mathbb{E}_{p(z \mid x)} [ \log p(z \mid x) - \log q(z; \lambda) ]$.
 
-  To perform the optimization, this class uses a technique from
+  To perform the optimization, this function uses a technique from
   adaptive importance sampling [@oh1992adaptive].
 
-  #### Notes
+  The loss function can be estimated up to a constant as
+
+  $\sum_{s=1}^S [
+    w_{\\text{norm}}(z^s; \lambda) (\log p(x, z^s) - \log q(z^s; \lambda)) ],$
+
+  where for $z^s \sim q(z; \lambda)$,
+
+  $w_{\\text{norm}}(z^s; \lambda) =
+        w(z^s; \lambda) / \sum_{s=1}^S w(z^s; \lambda)$
+
+  normalizes the importance weights, $w(z^s; \lambda) = p(x,
+  z^s) / q(z^s; \lambda)$.
 
-  `KLpq` also optimizes any model parameters $p(z\mid x;
-  \\theta)$. It does this by variational EM, maximizing
+  This provides a gradient,
+
+  $- \sum_{s=1}^S [
+    w_{\\text{norm}}(z^s; \lambda) \\nabla_{\lambda} \log q(z^s; \lambda) ].$
+
+  Args:
+  @{args}
+
+  Returns:
+  @{returns}
+
+  #### Notes
 
-  $\mathbb{E}_{p(z \mid x; \lambda)} [ \log p(x, z; \\theta) ]$
+  Probabilistic programs may have random variables which vary across
+  executions. The algorithm returns calculations following `n_samples`
+  executions of the model and variational programs.
 
-  with respect to $\\theta$.
+  @{notes_model_parameters}
 
-  In conditional inference, we infer $z` in $p(z, \\beta
-  \mid x)$ while fixing inference over $\\beta$ using another
-  distribution $q(\\beta)$. During gradient calculation, instead
-  of using the model's density
+  @{notes_conditional_inference}
 
-  $\log p(x, z^{(s)}), z^{(s)} \sim q(z; \lambda),$
+  @{notes_regularization_losses}
 
-  for each sample $s=1,\ldots,S$, `KLpq` uses
+  #### Examples
 
-  $\log p(x, z^{(s)}, \\beta^{(s)}),$
+  ```python
+  def model():
+    mu = Normal(loc=0.0, scale=1.0, name="mu")
+    x = Normal(loc=mu, scale=1.0, sample_shape=10, name="x")
+    return x
 
-  where $z^{(s)} \sim q(z; \lambda)$ and$\\beta^{(s)}
-  \sim q(\\beta)$.
+  def variational():
+    qmu = Normal(loc=tf.get_variable("loc", []),
+                 scale=tf.nn.softplus(tf.get_variable("scale", [])),
+                 name="qmu")
+    return qmu
 
-  The objective function also adds to itself a summation over all
-  tensors in the `REGULARIZATION_LOSSES` collection.
+  loss, surrogate_loss = ed.klpq(
+      model, variational,
+      align_latent=lambda name: "qmu" if name == "mu" else None,
+      align_data=lambda name: "x" if name == "x" else None,
+      x=x_data)
+  ```
   """
-  def __init__(self, latent_vars=None, data=None):
-    """Create an inference algorithm.
-
-    Args:
-      latent_vars: list of RandomVariable or
-                   dict of RandomVariable to RandomVariable.
-        Collection of random variables to perform inference on. If
-        list, each random variable will be implictly optimized using a
-        `Normal` random variable that is defined internally with a
-        free parameter per location and scale and is initialized using
-        standard normal draws. The random variables to approximate
-        must be continuous.
-    """
-    if isinstance(latent_vars, list):
-      with tf.variable_scope(None, default_name="posterior"):
-        latent_vars_dict = {}
-        continuous = \
-            ('01', 'nonnegative', 'simplex', 'real', 'multivariate_real')
-        for z in latent_vars:
-          if not hasattr(z, 'support') or z.support not in continuous:
-            raise AttributeError(
-                "Random variable {} is not continuous or a random "
-                "variable with supported continuous support.".format(z))
-          batch_event_shape = z.batch_shape.concatenate(z.event_shape)
-          loc = tf.Variable(tf.random_normal(batch_event_shape))
-          scale = tf.nn.softplus(
-              tf.Variable(tf.random_normal(batch_event_shape)))
-          latent_vars_dict[z] = Normal(loc=loc, scale=scale)
-        latent_vars = latent_vars_dict
-        del latent_vars_dict
-
-    super(KLpq, self).__init__(latent_vars, data)
-
-  def initialize(self, n_samples=1, *args, **kwargs):
-    """Initialize inference algorithm. It initializes hyperparameters
-    and builds ops for the algorithm's computation graph.
-
-    Args:
-      n_samples: int.
-        Number of samples from variational model for calculating
-        stochastic gradients.
-    """
-    if n_samples <= 0:
-      raise ValueError(
-          "n_samples should be greater than zero: {}".format(n_samples))
-    self.n_samples = n_samples
-    return super(KLpq, self).initialize(*args, **kwargs)
-
-  def build_loss_and_gradients(self, var_list):
-    """Build loss function
-
-    $\\text{KL}( p(z \mid x) \| q(z) )
-      = \mathbb{E}_{p(z \mid x)} [ \log p(z \mid x) - \log q(z; \lambda) ]$
-
-    and stochastic gradients based on importance sampling.
-
-    The loss function can be estimated as
-
-    $\sum_{s=1}^S [
-      w_{\\text{norm}}(z^s; \lambda) (\log p(x, z^s) - \log q(z^s; \lambda) ],$
-
-    where for $z^s \sim q(z; \lambda)$,
-
-    $w_{\\text{norm}}(z^s; \lambda) =
-          w(z^s; \lambda) / \sum_{s=1}^S w(z^s; \lambda)$
-
-    normalizes the importance weights, $w(z^s; \lambda) = p(x,
-    z^s) / q(z^s; \lambda)$.
-
-    This provides a gradient,
-
-    $- \sum_{s=1}^S [
-      w_{\\text{norm}}(z^s; \lambda) \\nabla_{\lambda} \log q(z^s; \lambda) ].$
-    """
-    p_log_prob = [0.0] * self.n_samples
-    q_log_prob = [0.0] * self.n_samples
-    base_scope = tf.get_default_graph().unique_name("inference") + '/'
-    for s in range(self.n_samples):
-      # Form dictionary in order to replace conditioning on prior or
-      # observed variable with conditioning on a specific value.
-      scope = base_scope + tf.get_default_graph().unique_name("sample")
-      dict_swap = {}
-      for x, qx in six.iteritems(self.data):
-        if isinstance(x, RandomVariable):
-          if isinstance(qx, RandomVariable):
-            qx_copy = copy(qx, scope=scope)
-            dict_swap[x] = qx_copy.value()
-          else:
-            dict_swap[x] = qx
-
-      for z, qz in six.iteritems(self.latent_vars):
-        # Copy q(z) to obtain new set of posterior samples.
-        qz_copy = copy(qz, scope=scope)
-        dict_swap[z] = qz_copy.value()
+  p_log_prob = [0.0] * n_samples
+  q_log_prob = [0.0] * n_samples
+  for s in range(n_samples):  # one execution of both programs per sample
+    q_trace = call_with_trace(variational, *args, **kwargs)
+    x = call_with_intercept(model, q_trace, align_data, align_latent,
+                            *args, **kwargs)
+    for rv in toposort(x):
+      scale_factor = scale(rv.name)
+      if align_latent(rv.name) is not None or align_data(rv.name) is not None:
+        p_log_prob[s] += tf.reduce_sum(
+            scale_factor * rv.log_prob(tf.stop_gradient(rv.value)))
+      if align_latent(rv.name) is not None:
+        qz = q_trace[align_latent(rv.name)]  # look up in this sample's trace
         q_log_prob[s] += tf.reduce_sum(
-            qz_copy.log_prob(tf.stop_gradient(dict_swap[z])))
-
-      for z in six.iterkeys(self.latent_vars):
-        z_copy = copy(z, dict_swap, scope=scope)
-        p_log_prob[s] += tf.reduce_sum(z_copy.log_prob(dict_swap[z]))
-
-      for x in six.iterkeys(self.data):
-        if isinstance(x, RandomVariable):
-          x_copy = copy(x, dict_swap, scope=scope)
-          p_log_prob[s] += tf.reduce_sum(x_copy.log_prob(dict_swap[x]))
-
-    p_log_prob = tf.stack(p_log_prob)
-    q_log_prob = tf.stack(q_log_prob)
-    reg_penalty = tf.reduce_sum(tf.losses.get_regularization_losses())
-
-    if self.logging:
-      tf.summary.scalar("loss/p_log_prob", tf.reduce_mean(p_log_prob),
-                        collections=[self._summary_key])
-      tf.summary.scalar("loss/q_log_prob", tf.reduce_mean(q_log_prob),
-                        collections=[self._summary_key])
-      tf.summary.scalar("loss/reg_penalty", reg_penalty,
-                        collections=[self._summary_key])
-
-    log_w = p_log_prob - q_log_prob
-    log_w_norm = log_w - tf.reduce_logsumexp(log_w)
-    w_norm = tf.exp(log_w_norm)
-    loss = tf.reduce_sum(w_norm * log_w) - reg_penalty
-
-    q_rvs = list(six.itervalues(self.latent_vars))
-    q_vars = [v for v in var_list
-              if len(get_descendants(tf.convert_to_tensor(v), q_rvs)) != 0]
-    q_grads = tf.gradients(
-        -(tf.reduce_sum(q_log_prob * tf.stop_gradient(w_norm)) - reg_penalty),
-        q_vars)
-    p_vars = [v for v in var_list if v not in q_vars]
-    p_grads = tf.gradients(-loss, p_vars)
-    grads_and_vars = list(zip(q_grads, q_vars)) + list(zip(p_grads, p_vars))
-    return loss, grads_and_vars
+            scale_factor * qz.log_prob(tf.stop_gradient(qz.value)))
+
+  p_log_prob = tf.stack(p_log_prob)
+  q_log_prob = tf.stack(q_log_prob)
+  reg_penalty = tf.reduce_sum(tf.losses.get_regularization_losses())
+  if collections is not None:
+    tf.summary.scalar("loss/p_log_prob", tf.reduce_mean(p_log_prob),
+                      collections=collections)
+    tf.summary.scalar("loss/q_log_prob", tf.reduce_mean(q_log_prob),
+                      collections=collections)
+    tf.summary.scalar("loss/reg_penalty", reg_penalty,
+                      collections=collections)
+
+  log_w = p_log_prob - tf.stop_gradient(q_log_prob)
+  log_w_norm = log_w - tf.reduce_logsumexp(log_w)
+  w_norm = tf.exp(log_w_norm)
+  loss = -tf.reduce_sum(w_norm * log_w) + reg_penalty
+  # Model parameter gradients backprop into loss. Variational parameter
+  # gradients backprop into reg_penalty and the subtracted last term.
+  surrogate_loss = loss - tf.reduce_sum(q_log_prob * tf.stop_gradient(w_norm))
+  return loss, surrogate_loss
diff --git a/edward/inferences/klqp.py b/edward/inferences/klqp.py
index 3cfbc9cea..4610eeac2 100644
--- a/edward/inferences/klqp.py
+++ b/edward/inferences/klqp.py
@@ -5,9 +5,9 @@
 import six
 import tensorflow as tf
 
-from edward.inferences.variational_inference import VariationalInference
-from edward.models import RandomVariable
-from edward.util import copy, get_descendants
+from edward.inferences import docstrings as doc
+from edward.inferences.util import (
+    call_with_intercept, call_with_trace, toposort)
 
 try:
   from edward.models import Normal
@@ -15,703 +15,297 @@
 except Exception as e:
   raise ImportError("{0}. Your TensorFlow version is not supported.".format(e))
 
-
-class KLqp(VariationalInference):
+tfd = tf.contrib.distributions
+
+
+@doc.set_doc(
+    args=(doc.arg_model +
+          doc.arg_variational +
+          doc.arg_align_latent +
+          doc.arg_align_data +
+          doc.arg_scale +
+          doc.arg_n_samples +
+          doc.arg_kl_scaling +
+          doc.arg_auto_transform +
+          doc.arg_collections +
+          doc.arg_args_kwargs)[:-1],
+    returns=doc.return_loss_surrogate_loss,
+    notes_model_parameters=doc.notes_model_parameters,
+    notes_conditional_inference=doc.notes_conditional_inference_samples,
+    notes_regularization_losses=doc.notes_regularization_losses)
+def klqp(model, variational, align_latent, align_data,
+         scale=lambda name: 1.0, n_samples=1, kl_scaling=lambda name: 1.0,
+         auto_transform=True, collections=None, *args, **kwargs):
   """Variational inference with the KL divergence
 
   $\\text{KL}( q(z; \lambda) \| p(z \mid x) ).$
 
-  This class minimizes the objective by automatically selecting from a
-  variety of black box inference techniques.
+  This function returns a loss and a surrogate loss
+  [@schulman2015stochastic; @ruiz2016generalized; @ritchie2016deep].
+  Differentiating the surrogate loss automatically selects one of two
+  black box gradient estimators for each variational factor:
 
-  #### Notes
+  1. score function gradients [@paisley2012variational] with
+     Rao-Blackwellization [@ranganath2014black];
+  2. reparameterization gradients [@kingma2014auto].
 
-  `KLqp` also optimizes any model parameters $p(z \mid x;
-  \\theta)$. It does this by variational EM, maximizing
+  If the KL divergence between a variational factor and its aligned
+  prior is tractable, then the loss function can be written as
 
-  $\mathbb{E}_{q(z; \lambda)} [ \log p(x, z; \\theta) ]$
+  $-\mathbb{E}_{q(z; \lambda)}[\log p(x \mid z)] +
+      \\text{KL}( q(z; \lambda) \| p(z) ),$
 
-  with respect to $\\theta$.
+  where the KL term is computed analytically [@kingma2014auto]. We
+  compute this automatically when $p(z)$ and $q(z; \lambda)$ are
+  Normal.
 
-  In conditional inference, we infer $z$ in $p(z, \\beta
-  \mid x)$ while fixing inference over $\\beta$ using another
-  distribution $q(\\beta)$. During gradient calculation, instead
-  of using the model's density
+  Current Rao-Blackwellization is limited to Rao-Blackwellizing across
+  stochastic nodes in the computation graph. It does not
+  Rao-Blackwellize within a node such as when a node represents
+  multiple random variables via non-scalar batch shape.
+  Rao-Blackwellization is performed at runtime for each sample.
 
-  $\log p(x, z^{(s)}), z^{(s)} \sim q(z; \lambda),$
+  Args:
+  @{args}
 
-  for each sample $s=1,\ldots,S$, `KLqp` uses
+  Returns:
+  @{returns}
 
-  $\log p(x, z^{(s)}, \\beta^{(s)}),$
+  #### Notes
 
-  where $z^{(s)} \sim q(z; \lambda)$ and $\\beta^{(s)}
-  \sim q(\\beta)$.
+  Probabilistic programs may have random variables which vary across
+  executions. The algorithm returns calculations following `n_samples`
+  executions of the model and variational programs.
 
-  The objective function also adds to itself a summation over all
-  tensors in the `REGULARIZATION_LOSSES` collection.
-  """
-  def __init__(self, latent_vars=None, data=None):
-    """Create an inference algorithm.
-
-    Args:
-      latent_vars: list of RandomVariable or
-                   dict of RandomVariable to RandomVariable.
-        Collection of random variables to perform inference on. If
-        list, each random variable will be implictly optimized using a
-        `Normal` random variable that is defined internally with a
-        free parameter per location and scale and is initialized using
-        standard normal draws. The random variables to approximate
-        must be continuous.
-    """
-    if isinstance(latent_vars, list):
-      with tf.variable_scope(None, default_name="posterior"):
-        latent_vars_dict = {}
-        continuous = \
-            ('01', 'nonnegative', 'simplex', 'real', 'multivariate_real')
-        for z in latent_vars:
-          if not hasattr(z, 'support') or z.support not in continuous:
-            raise AttributeError(
-                "Random variable {} is not continuous or a random "
-                "variable with supported continuous support.".format(z))
-          batch_event_shape = z.batch_shape.concatenate(z.event_shape)
-          loc = tf.Variable(tf.random_normal(batch_event_shape))
-          scale = tf.nn.softplus(
-              tf.Variable(tf.random_normal(batch_event_shape)))
-          latent_vars_dict[z] = Normal(loc=loc, scale=scale)
-        latent_vars = latent_vars_dict
-        del latent_vars_dict
-
-    super(KLqp, self).__init__(latent_vars, data)
-
-  def initialize(self, n_samples=1, kl_scaling=None, *args, **kwargs):
-    """Initialize inference algorithm. It initializes hyperparameters
-    and builds ops for the algorithm's computation graph.
-
-    Args:
-      n_samples: int.
-        Number of samples from variational model for calculating
-        stochastic gradients.
-      kl_scaling: dict of RandomVariable to tf.Tensor.
-        Provides option to scale terms when using ELBO with KL divergence.
-        If the KL divergence terms are
-
-        $\\alpha_p \mathbb{E}_{q(z\mid x, \lambda)} [
-              \log q(z\mid x, \lambda) - \log p(z)],$
-
-        then pass {$p(z)$: $\\alpha_p$} as `kl_scaling`,
-        where $\\alpha_p$ is a tensor. Its shape must be broadcastable;
-        it is multiplied element-wise to the batchwise KL terms.
-    """
-    if kl_scaling is None:
-      kl_scaling = {}
-    if n_samples <= 0:
-      raise ValueError(
-          "n_samples should be greater than zero: {}".format(n_samples))
-
-    self.n_samples = n_samples
-    self.kl_scaling = kl_scaling
-    return super(KLqp, self).initialize(*args, **kwargs)
-
-  def build_loss_and_gradients(self, var_list):
-    """Wrapper for the `KLqp` loss function.
-
-    $-\\text{ELBO} =
-        -\mathbb{E}_{q(z; \lambda)} [ \log p(x, z) - \log q(z; \lambda) ]$
-
-    KLqp supports
-
-    1. score function gradients [@paisley2012variational]
-    2. reparameterization gradients [@kingma2014auto]
-
-    of the loss function.
-
-    If the KL divergence between the variational model and the prior
-    is tractable, then the loss function can be written as
-
-    $-\mathbb{E}_{q(z; \lambda)}[\log p(x \mid z)] +
-        \\text{KL}( q(z; \lambda) \| p(z) ),$
-
-    where the KL term is computed analytically [@kingma2014auto]. We
-    compute this automatically when $p(z)$ and $q(z; \lambda)$ are
-    Normal.
-    """
-    is_reparameterizable = all([
-        rv.reparameterization_type ==
-        tf.contrib.distributions.FULLY_REPARAMETERIZED
-        for rv in six.itervalues(self.latent_vars)])
-    is_analytic_kl = all([isinstance(z, Normal) and isinstance(qz, Normal)
-                          for z, qz in six.iteritems(self.latent_vars)])
-    if not is_analytic_kl and self.kl_scaling:
-      raise TypeError("kl_scaling must be None when using non-analytic KL term")
-    if is_reparameterizable:
-      if is_analytic_kl:
-        return build_reparam_kl_loss_and_gradients(self, var_list)
-      # elif is_analytic_entropy:
-      #    return build_reparam_entropy_loss_and_gradients(self, var_list)
-      else:
-        return build_reparam_loss_and_gradients(self, var_list)
-    else:
-      # Prefer Rao-Blackwellization over analytic KL. Unknown what
-      # would happen stability-wise if the two are combined.
-      # if is_analytic_kl:
-      #   return build_score_kl_loss_and_gradients(self, var_list)
-      # Analytic entropies may lead to problems around
-      # convergence; for now it is deactivated.
-      # elif is_analytic_entropy:
-      #    return build_score_entropy_loss_and_gradients(self, var_list)
-      # else:
-      return build_score_rb_loss_and_gradients(self, var_list)
-
-
-class ReparameterizationKLqp(VariationalInference):
-  """Variational inference with the KL divergence
+  @{notes_model_parameters}
 
-  $\\text{KL}( q(z; \lambda) \| p(z \mid x) ).$
+  @{notes_conditional_inference}
 
-  This class minimizes the objective using the reparameterization
-  gradient.
+  @{notes_regularization_losses}
 
-  The objective function also adds to itself a summation over all
-  tensors in the `REGULARIZATION_LOSSES` collection.
-  """
-  def __init__(self, latent_vars=None, data=None):
-    """Create an inference algorithm.
-
-    Args:
-      latent_vars: list of RandomVariable or
-                   dict of RandomVariable to RandomVariable.
-        Collection of random variables to perform inference on. If
-        list, each random variable will be implictly optimized using a
-        `Normal` random variable that is defined internally with a
-        free parameter per location and scale and is initialized using
-        standard normal draws. The random variables to approximate
-        must be continuous.
-    """
-    if isinstance(latent_vars, list):
-      with tf.variable_scope(None, default_name="posterior"):
-        latent_vars_dict = {}
-        continuous = \
-            ('01', 'nonnegative', 'simplex', 'real', 'multivariate_real')
-        for z in latent_vars:
-          if not hasattr(z, 'support') or z.support not in continuous:
-            raise AttributeError(
-                "Random variable {} is not continuous or a random "
-                "variable with supported continuous support.".format(z))
-          batch_event_shape = z.batch_shape.concatenate(z.event_shape)
-          loc = tf.Variable(tf.random_normal(batch_event_shape))
-          scale = tf.nn.softplus(
-              tf.Variable(tf.random_normal(batch_event_shape)))
-          latent_vars_dict[z] = Normal(loc=loc, scale=scale)
-        latent_vars = latent_vars_dict
-        del latent_vars_dict
-
-    super(ReparameterizationKLqp, self).__init__(latent_vars, data)
-
-  def initialize(self, n_samples=1, *args, **kwargs):
-    """Initialize inference algorithm. It initializes hyperparameters
-    and builds ops for the algorithm's computation graph.
-
-    Args:
-      n_samples: int.
-        Number of samples from variational model for calculating
-        stochastic gradients.
-    """
-    if n_samples <= 0:
-      raise ValueError(
-          "n_samples should be greater than zero: {}".format(n_samples))
-    self.n_samples = n_samples
-    return super(ReparameterizationKLqp, self).initialize(*args, **kwargs)
-
-  def build_loss_and_gradients(self, var_list):
-    return build_reparam_loss_and_gradients(self, var_list)
-
-
-class ReparameterizationKLKLqp(VariationalInference):
-  """Variational inference with the KL divergence
+  #### Examples
 
-  $\\text{KL}( q(z; \lambda) \| p(z \mid x) ).$
+  ```python
+  def model():
+    mu = Normal(loc=0.0, scale=1.0, name="mu")
+    x = Normal(loc=mu, scale=1.0, sample_shape=10, name="x")
+    return x
 
-  This class minimizes the objective using the reparameterization
-  gradient and an analytic KL term.
+  def variational():
+    qmu = Normal(loc=tf.get_variable("loc", []),
+                 scale=tf.nn.softplus(tf.get_variable("scale", [])),
+                 name="qmu")
+    return qmu
 
-  The objective function also adds to itself a summation over all
-  tensors in the `REGULARIZATION_LOSSES` collection.
+  loss, surrogate_loss = ed.klqp(
+      model, variational,
+      align_latent=lambda name: "qmu" if name == "mu" else None,
+      align_data=lambda name: "x" if name == "x" else None,
+      x=x_data)
+  ```
   """
-  def __init__(self, latent_vars=None, data=None):
-    """Create an inference algorithm.
-
-    Args:
-      latent_vars: list of RandomVariable or
-                   dict of RandomVariable to RandomVariable.
-        Collection of random variables to perform inference on. If
-        list, each random variable will be implictly optimized using a
-        `Normal` random variable that is defined internally with a
-        free parameter per location and scale and is initialized using
-        standard normal draws. The random variables to approximate
-        must be continuous.
-    """
-    if isinstance(latent_vars, list):
-      with tf.variable_scope(None, default_name="posterior"):
-        latent_vars_dict = {}
-        continuous = \
-            ('01', 'nonnegative', 'simplex', 'real', 'multivariate_real')
-        for z in latent_vars:
-          if not hasattr(z, 'support') or z.support not in continuous:
-            raise AttributeError(
-                "Random variable {} is not continuous or a random "
-                "variable with supported continuous support.".format(z))
-          batch_event_shape = z.batch_shape.concatenate(z.event_shape)
-          loc = tf.Variable(tf.random_normal(batch_event_shape))
-          scale = tf.nn.softplus(
-              tf.Variable(tf.random_normal(batch_event_shape)))
-          latent_vars_dict[z] = Normal(loc=loc, scale=scale)
-        latent_vars = latent_vars_dict
-        del latent_vars_dict
-
-    super(ReparameterizationKLKLqp, self).__init__(latent_vars, data)
-
-  def initialize(self, n_samples=1, kl_scaling=None, *args, **kwargs):
-    """Initialize inference algorithm. It initializes hyperparameters
-    and builds ops for the algorithm's computation graph.
-
-    Args:
-      n_samples: int.
-        Number of samples from variational model for calculating
-        stochastic gradients.
-      kl_scaling: dict of RandomVariable to tf.Tensor.
-        Provides option to scale terms when using ELBO with KL divergence.
-        If the KL divergence terms are
-
-        $\\alpha_p \mathbb{E}_{q(z\mid x, \lambda)} [
-              \log q(z\mid x, \lambda) - \log p(z)],$
-
-        then pass {$p(z)$: $\\alpha_p$} as `kl_scaling`,
-        where $\\alpha_p$ is a tensor. Its shape must be broadcastable;
-        it is multiplied element-wise to the batchwise KL terms.
-    """
-    if kl_scaling is None:
-      kl_scaling = {}
-    if n_samples <= 0:
-      raise ValueError(
-          "n_samples should be greater than zero: {}".format(n_samples))
-
-    self.n_samples = n_samples
-    self.kl_scaling = kl_scaling
-    return super(ReparameterizationKLKLqp, self).initialize(*args, **kwargs)
-
-  def build_loss_and_gradients(self, var_list):
-    return build_reparam_kl_loss_and_gradients(self, var_list)
-
-
-class ReparameterizationEntropyKLqp(VariationalInference):
-  """Variational inference with the KL divergence
+  # TODO control variates
+  # + baseline, learnable baseline
+  # + Ruiz+ 2016
+  # + Tucker+ 2017; Cremer+ 2017
+  # + Miller+ 2017
+  # TODO analytic stuff
+  # + Roeder+ 2017
+  p_log_prob = [None] * n_samples
+  q_log_prob = [None] * n_samples
+  surrogate_loss = [None] * n_samples
+  kl_penalty = 0.0
+  for s in range(n_samples):
+    q_trace = call_with_trace(variational, *args, **kwargs)
+    x = call_with_intercept(model, q_trace, align_data, align_latent,
+                            *args, **kwargs)
+
+    # Collect key-value pairs of (rv, rv's (scaled) log prob).
+    p_dict = {}
+    q_dict = {}
+    inverse_align_latent = {}
+    for rv in toposort(x):
+      scale_factor = scale(rv.name)
+      if align_data(rv.name) is not None:
+        p_dict[rv] = tf.reduce_sum(scale_factor * rv.log_prob(rv.value))
+      if align_latent(rv.name) is not None:
+        qz = q_trace[align_latent(rv.name)]
+        # For pairs with analytic KL, add the KL divergence only on the
+        # first sample (s == 0) so that it enters the loss exactly once.
+        if isinstance(rv, Normal) and isinstance(qz, Normal):
+          if s == 0:
+            kl_penalty += tf.reduce_sum(
+                kl_scaling(rv.name) * kl_divergence(qz, rv))
+        else:
+          p_dict[rv] = tf.reduce_sum(scale_factor * rv.log_prob(rv.value))
+          q_dict[qz] = tf.reduce_sum(scale_factor * qz.log_prob(qz.value))
+          inverse_align_latent[qz] = rv
+
+    # Build surrogate loss: weight each log q(z) by its estimator's factor.
+    scaled_q_log_prob = 0.0
+    for qz, log_prob in six.iteritems(q_dict):
+      if qz.reparameterization_type == tfd.FULLY_REPARAMETERIZED:
+        scale_factor = 1.0
+      else:
+        scale_factor = 0.0
+        for rv in qz.get_blanket(list(six.iterkeys(q_dict))) + [qz]:
+          scale_factor += q_dict[rv]
+          scale_factor -= p_dict[inverse_align_latent[rv]]
+      scaled_q_log_prob += scale_factor * log_prob
 
-  $\\text{KL}( q(z; \lambda) \| p(z \mid x) ).$
+    p_log_prob_s = tf.reduce_sum(list(six.itervalues(p_dict)))
+    p_log_prob[s] = p_log_prob_s
+    q_log_prob[s] = tf.reduce_sum(list(six.itervalues(q_dict)))
+    surrogate_loss[s] = scaled_q_log_prob - p_log_prob_s
+
+  p_log_prob = tf.reduce_mean(p_log_prob)
+  q_log_prob = tf.reduce_mean(q_log_prob)
+  surrogate_loss = tf.reduce_mean(surrogate_loss) + kl_penalty
 
-  This class minimizes the objective using the reparameterization
-  gradient and an analytic entropy term.
+  reg_penalty = tf.reduce_sum(tf.losses.get_regularization_losses())
+  surrogate_loss += reg_penalty
 
-  The objective function also adds to itself a summation over all
-  tensors in the `REGULARIZATION_LOSSES` collection.
-  """
-  def __init__(self, latent_vars=None, data=None):
-    """Create an inference algorithm.
-
-    Args:
-      latent_vars: list of RandomVariable or
-                   dict of RandomVariable to RandomVariable.
-        Collection of random variables to perform inference on. If
-        list, each random variable will be implictly optimized using a
-        `Normal` random variable that is defined internally with a
-        free parameter per location and scale and is initialized using
-        standard normal draws. The random variables to approximate
-        must be continuous.
-    """
-    if isinstance(latent_vars, list):
-      with tf.variable_scope(None, default_name="posterior"):
-        latent_vars_dict = {}
-        continuous = \
-            ('01', 'nonnegative', 'simplex', 'real', 'multivariate_real')
-        for z in latent_vars:
-          if not hasattr(z, 'support') or z.support not in continuous:
-            raise AttributeError(
-                "Random variable {} is not continuous or a random "
-                "variable with supported continuous support.".format(z))
-          batch_event_shape = z.batch_shape.concatenate(z.event_shape)
-          loc = tf.Variable(tf.random_normal(batch_event_shape))
-          scale = tf.nn.softplus(
-              tf.Variable(tf.random_normal(batch_event_shape)))
-          latent_vars_dict[z] = Normal(loc=loc, scale=scale)
-        latent_vars = latent_vars_dict
-        del latent_vars_dict
-
-    super(ReparameterizationEntropyKLqp, self).__init__(latent_vars, data)
-
-  def initialize(self, n_samples=1, *args, **kwargs):
-    """Initialize inference algorithm. It initializes hyperparameters
-    and builds ops for the algorithm's computation graph.
-
-    Args:
-      n_samples: int.
-        Number of samples from variational model for calculating
-        stochastic gradients.
-    """
-    if n_samples <= 0:
-      raise ValueError(
-          "n_samples should be greater than zero: {}".format(n_samples))
-    self.n_samples = n_samples
-    return super(ReparameterizationEntropyKLqp, self).initialize(
-        *args, **kwargs)
-
-  def build_loss_and_gradients(self, var_list):
-    return build_reparam_entropy_loss_and_gradients(self, var_list)
-
-
-class ScoreKLqp(VariationalInference):
+  if collections is not None:
+    tf.summary.scalar("loss/p_log_prob", p_log_prob,
+                      collections=collections)
+    tf.summary.scalar("loss/q_log_prob", q_log_prob,
+                      collections=collections)
+    tf.summary.scalar("loss/reg_penalty", reg_penalty,
+                      collections=collections)
+
+  loss = q_log_prob - p_log_prob + kl_penalty + reg_penalty
+  return loss, surrogate_loss
+
+
+@doc.set_doc(
+    args=(doc.arg_model +
+          doc.arg_variational +
+          doc.arg_align_latent +
+          doc.arg_align_data +
+          doc.arg_scale +
+          doc.arg_n_samples +
+          doc.arg_auto_transform +
+          doc.arg_collections +
+          doc.arg_args_kwargs)[:-1],
+    returns=doc.return_loss,
+    notes_model_parameters=doc.notes_model_parameters,
+    notes_conditional_inference=doc.notes_conditional_inference_samples,
+    notes_regularization_losses=doc.notes_regularization_losses)
+def klqp_reparameterization(model, variational, align_latent, align_data,
+                            scale=lambda name: 1.0, n_samples=1,
+                            auto_transform=True, collections=None,
+                            *args, **kwargs):
   """Variational inference with the KL divergence
 
   $\\text{KL}( q(z; \lambda) \| p(z \mid x) ).$
 
-  This class minimizes the objective using the score function
-  gradient.
+  This function builds a loss function equal to KL(q||p) up to a
+  constant. Its automatic differentiation is a stochastic gradient of
 
-  The objective function also adds to itself a summation over all
-  tensors in the `REGULARIZATION_LOSSES` collection.
-  """
-  def __init__(self, latent_vars=None, data=None):
-    """Create an inference algorithm.
-
-    Args:
-      latent_vars: list of RandomVariable or
-                   dict of RandomVariable to RandomVariable.
-        Collection of random variables to perform inference on. If
-        list, each random variable will be implictly optimized using a
-        `Normal` random variable that is defined internally with a
-        free parameter per location and scale and is initialized using
-        standard normal draws. The random variables to approximate
-        must be continuous.
-    """
-    if isinstance(latent_vars, list):
-      with tf.variable_scope(None, default_name="posterior"):
-        latent_vars_dict = {}
-        continuous = \
-            ('01', 'nonnegative', 'simplex', 'real', 'multivariate_real')
-        for z in latent_vars:
-          if not hasattr(z, 'support') or z.support not in continuous:
-            raise AttributeError(
-                "Random variable {} is not continuous or a random "
-                "variable with supported continuous support.".format(z))
-          batch_event_shape = z.batch_shape.concatenate(z.event_shape)
-          loc = tf.Variable(tf.random_normal(batch_event_shape))
-          scale = tf.nn.softplus(
-              tf.Variable(tf.random_normal(batch_event_shape)))
-          latent_vars_dict[z] = Normal(loc=loc, scale=scale)
-        latent_vars = latent_vars_dict
-        del latent_vars_dict
-
-    super(ScoreKLqp, self).__init__(latent_vars, data)
-
-  def initialize(self, n_samples=1, *args, **kwargs):
-    """Initialize inference algorithm. It initializes hyperparameters
-    and builds ops for the algorithm's computation graph.
-
-    Args:
-      n_samples: int.
-        Number of samples from variational model for calculating
-        stochastic gradients.
-    """
-    if n_samples <= 0:
-      raise ValueError(
-          "n_samples should be greater than zero: {}".format(n_samples))
-    self.n_samples = n_samples
-    return super(ScoreKLqp, self).initialize(*args, **kwargs)
-
-  def build_loss_and_gradients(self, var_list):
-    return build_score_loss_and_gradients(self, var_list)
-
-
-class ScoreKLKLqp(VariationalInference):
-  """Variational inference with the KL divergence
-
-  $\\text{KL}( q(z; \lambda) \| p(z \mid x) ).$
+  $-\\text{ELBO} =
+      -\mathbb{E}_{q(z; \lambda)} [ \log p(x, z) - \log q(z; \lambda) ]$
 
-  This class minimizes the objective using the score function gradient
-  and an analytic KL term.
+  based on the reparameterization trick [@kingma2014auto].
 
-  The objective function also adds to itself a summation over all
-  tensors in the `REGULARIZATION_LOSSES` collection.
-  """
-  def __init__(self, latent_vars=None, data=None):
-    """Create an inference algorithm.
-
-    Args:
-      latent_vars: list of RandomVariable or
-                   dict of RandomVariable to RandomVariable.
-        Collection of random variables to perform inference on. If
-        list, each random variable will be implictly optimized using a
-        `Normal` random variable that is defined internally with a
-        free parameter per location and scale and is initialized using
-        standard normal draws. The random variables to approximate
-        must be continuous.
-    """
-    if isinstance(latent_vars, list):
-      with tf.variable_scope(None, default_name="posterior"):
-        latent_vars_dict = {}
-        continuous = \
-            ('01', 'nonnegative', 'simplex', 'real', 'multivariate_real')
-        for z in latent_vars:
-          if not hasattr(z, 'support') or z.support not in continuous:
-            raise AttributeError(
-                "Random variable {} is not continuous or a random "
-                "variable with supported continuous support.".format(z))
-          batch_event_shape = z.batch_shape.concatenate(z.event_shape)
-          loc = tf.Variable(tf.random_normal(batch_event_shape))
-          scale = tf.nn.softplus(
-              tf.Variable(tf.random_normal(batch_event_shape)))
-          latent_vars_dict[z] = Normal(loc=loc, scale=scale)
-        latent_vars = latent_vars_dict
-        del latent_vars_dict
-
-    super(ScoreKLKLqp, self).__init__(latent_vars, data)
-
-  def initialize(self, n_samples=1, kl_scaling=None, *args, **kwargs):
-    """Initialize inference algorithm. It initializes hyperparameters
-    and builds ops for the algorithm's computation graph.
-
-    Args:
-      n_samples: int.
-        Number of samples from variational model for calculating
-        stochastic gradients.
-      kl_scaling: dict of RandomVariable to tf.Tensor.
-        Provides option to scale terms when using ELBO with KL divergence.
-        If the KL divergence terms are
-
-        $\\alpha_p \mathbb{E}_{q(z\mid x, \lambda)} [
-              \log q(z\mid x, \lambda) - \log p(z)],$
-
-        then pass {$p(z)$: $\\alpha_p$} as `kl_scaling`,
-        where $\\alpha_p$ is a tensor. Its shape must be broadcastable;
-        it is multiplied element-wise to the batchwise KL terms.
-    """
-    if kl_scaling is None:
-      kl_scaling = {}
-    if n_samples <= 0:
-      raise ValueError(
-          "n_samples should be greater than zero: {}".format(n_samples))
-    self.n_samples = n_samples
-    self.kl_scaling = kl_scaling
-    return super(ScoreKLKLqp, self).initialize(*args, **kwargs)
-
-  def build_loss_and_gradients(self, var_list):
-    return build_score_kl_loss_and_gradients(self, var_list)
-
-
-class ScoreEntropyKLqp(VariationalInference):
-  """Variational inference with the KL divergence
+  Computed by sampling from $q(z;\lambda)$ and evaluating the
+  expectation using Monte Carlo sampling.
 
-  $\\text{KL}( q(z; \lambda) \| p(z \mid x) ).$
+  Args:
+  @{args}
 
-  This class minimizes the objective using the score function gradient
-  and an analytic entropy term.
+  Returns:
+  @{returns}
 
-  The objective function also adds to itself a summation over all
-  tensors in the `REGULARIZATION_LOSSES` collection.
-  """
-  def __init__(self, latent_vars=None, data=None):
-    """Create an inference algorithm.
-
-    Args:
-      latent_vars: list of RandomVariable or
-                   dict of RandomVariable to RandomVariable.
-        Collection of random variables to perform inference on. If
-        list, each random variable will be implictly optimized using a
-        `Normal` random variable that is defined internally with a
-        free parameter per location and scale and is initialized using
-        standard normal draws. The random variables to approximate
-        must be continuous.
-    """
-    if isinstance(latent_vars, list):
-      with tf.variable_scope(None, default_name="posterior"):
-        latent_vars_dict = {}
-        continuous = \
-            ('01', 'nonnegative', 'simplex', 'real', 'multivariate_real')
-        for z in latent_vars:
-          if not hasattr(z, 'support') or z.support not in continuous:
-            raise AttributeError(
-                "Random variable {} is not continuous or a random "
-                "variable with supported continuous support.".format(z))
-          batch_event_shape = z.batch_shape.concatenate(z.event_shape)
-          loc = tf.Variable(tf.random_normal(batch_event_shape))
-          scale = tf.nn.softplus(
-              tf.Variable(tf.random_normal(batch_event_shape)))
-          latent_vars_dict[z] = Normal(loc=loc, scale=scale)
-        latent_vars = latent_vars_dict
-        del latent_vars_dict
-
-    super(ScoreEntropyKLqp, self).__init__(latent_vars, data)
-
-  def initialize(self, n_samples=1, *args, **kwargs):
-    """Initialize inference algorithm. It initializes hyperparameters
-    and builds ops for the algorithm's computation graph.
-
-    Args:
-      n_samples: int.
-        Number of samples from variational model for calculating
-        stochastic gradients.
-    """
-    if n_samples <= 0:
-      raise ValueError(
-          "n_samples should be greater than zero: {}".format(n_samples))
-    self.n_samples = n_samples
-    return super(ScoreEntropyKLqp, self).initialize(*args, **kwargs)
-
-  def build_loss_and_gradients(self, var_list):
-    return build_score_entropy_loss_and_gradients(self, var_list)
-
-
-class ScoreRBKLqp(VariationalInference):
-  """Variational inference with the KL divergence
+  #### Notes
 
-  $\\text{KL}( q(z; \lambda) \| p(z \mid x) ).$
+  Probabilistic programs may have random variables which vary across
+  executions. The algorithm returns calculations following `n_samples`
+  executions of the model and variational programs.
 
-  This class minimizes the objective using the score function gradient
-  and Rao-Blackwellization.
+  @{notes_model_parameters}
 
-  #### Notes
+  @{notes_conditional_inference}
 
-  Current Rao-Blackwellization is limited to Rao-Blackwellizing across
-  stochastic nodes in the computation graph. It does not
-  Rao-Blackwellize within a node such as when a node represents
-  multiple random variables via non-scalar batch shape.
+  @{notes_regularization_losses}
 
-  The objective function also adds to itself a summation over all
-  tensors in the `REGULARIZATION_LOSSES` collection.
-  """
-  def __init__(self, latent_vars=None, data=None):
-    """Create an inference algorithm.
-
-    Args:
-      latent_vars: list of RandomVariable or
-                   dict of RandomVariable to RandomVariable.
-        Collection of random variables to perform inference on. If
-        list, each random variable will be implictly optimized using a
-        `Normal` random variable that is defined internally with a
-        free parameter per location and scale and is initialized using
-        standard normal draws. The random variables to approximate
-        must be continuous.
-    """
-    if isinstance(latent_vars, list):
-      with tf.variable_scope(None, default_name="posterior"):
-        latent_vars_dict = {}
-        continuous = \
-            ('01', 'nonnegative', 'simplex', 'real', 'multivariate_real')
-        for z in latent_vars:
-          if not hasattr(z, 'support') or z.support not in continuous:
-            raise AttributeError(
-                "Random variable {} is not continuous or a random "
-                "variable with supported continuous support.".format(z))
-          batch_event_shape = z.batch_shape.concatenate(z.event_shape)
-          loc = tf.Variable(tf.random_normal(batch_event_shape))
-          scale = tf.nn.softplus(
-              tf.Variable(tf.random_normal(batch_event_shape)))
-          latent_vars_dict[z] = Normal(loc=loc, scale=scale)
-        latent_vars = latent_vars_dict
-        del latent_vars_dict
-
-    super(ScoreRBKLqp, self).__init__(latent_vars, data)
-
-  def initialize(self, n_samples=1, *args, **kwargs):
-    """Initialize inference algorithm. It initializes hyperparameters
-    and builds ops for the algorithm's computation graph.
-
-    Args:
-      n_samples: int.
-        Number of samples from variational model for calculating
-        stochastic gradients.
-    """
-    if n_samples <= 0:
-      raise ValueError(
-          "n_samples should be greater than zero: {}".format(n_samples))
-    self.n_samples = n_samples
-    return super(ScoreRBKLqp, self).initialize(*args, **kwargs)
-
-  def build_loss_and_gradients(self, var_list):
-    return build_score_rb_loss_and_gradients(self, var_list)
-
-
-def build_reparam_loss_and_gradients(inference, var_list):
-  """Build loss function. Its automatic differentiation
-  is a stochastic gradient of
+  #### Examples
 
-  $-\\text{ELBO} =
-      -\mathbb{E}_{q(z; \lambda)} [ \log p(x, z) - \log q(z; \lambda) ]$
+  ```python
+  def model():
+    mu = Normal(loc=0.0, scale=1.0, name="mu")
+    x = Normal(loc=mu, scale=1.0, sample_shape=10, name="x")
+    return x
 
-  based on the reparameterization trick [@kingma2014auto].
+  def variational():
+    qmu = Normal(loc=tf.get_variable("loc", []),
+                 scale=tf.nn.softplus(tf.get_variable("scale", [])),
+                 name="qmu")
+    return qmu
 
-  Computed by sampling from $q(z;\lambda)$ and evaluating the
-  expectation using Monte Carlo sampling.
+  loss = ed.klqp_reparameterization(
+      model, variational,
+      align_latent=lambda name: "qmu" if name == "mu" else None,
+      align_data=lambda name: "x" if name == "x" else None,
+      x=x_data)
+  ```
   """
-  p_log_prob = [0.0] * inference.n_samples
-  q_log_prob = [0.0] * inference.n_samples
-  base_scope = tf.get_default_graph().unique_name("inference") + '/'
-  for s in range(inference.n_samples):
-    # Form dictionary in order to replace conditioning on prior or
-    # observed variable with conditioning on a specific value.
-    scope = base_scope + tf.get_default_graph().unique_name("sample")
-    dict_swap = {}
-    for x, qx in six.iteritems(inference.data):
-      if isinstance(x, RandomVariable):
-        if isinstance(qx, RandomVariable):
-          qx_copy = copy(qx, scope=scope)
-          dict_swap[x] = qx_copy.value()
-        else:
-          dict_swap[x] = qx
-
-    for z, qz in six.iteritems(inference.latent_vars):
-      # Copy q(z) to obtain new set of posterior samples.
-      qz_copy = copy(qz, scope=scope)
-      dict_swap[z] = qz_copy.value()
-      q_log_prob[s] += tf.reduce_sum(
-          inference.scale.get(z, 1.0) * qz_copy.log_prob(dict_swap[z]))
-
-    for z in six.iterkeys(inference.latent_vars):
-      z_copy = copy(z, dict_swap, scope=scope)
-      p_log_prob[s] += tf.reduce_sum(
-          inference.scale.get(z, 1.0) * z_copy.log_prob(dict_swap[z]))
-
-    for x in six.iterkeys(inference.data):
-      if isinstance(x, RandomVariable):
-        x_copy = copy(x, dict_swap, scope=scope)
-        p_log_prob[s] += tf.reduce_sum(
-            inference.scale.get(x, 1.0) * x_copy.log_prob(dict_swap[x]))
+  p_log_prob = [0.0] * n_samples
+  q_log_prob = [0.0] * n_samples
+  for s in range(n_samples):
+    q_trace = call_with_trace(variational, *args, **kwargs)
+    x = call_with_intercept(model, q_trace, align_data, align_latent,
+                            *args, **kwargs)
+    for rv in toposort(x):
+      scale_factor = scale(rv.name)
+      if align_latent(rv.name) is not None or align_data(rv.name) is not None:
+        p_log_prob[s] += tf.reduce_sum(scale_factor * rv.log_prob(rv.value))
+      if align_latent(rv.name) is not None:
+        qz = q_trace[align_latent(rv.name)]
+        q_log_prob[s] += tf.reduce_sum(scale_factor * qz.log_prob(qz.value))
 
   p_log_prob = tf.reduce_mean(p_log_prob)
   q_log_prob = tf.reduce_mean(q_log_prob)
   reg_penalty = tf.reduce_sum(tf.losses.get_regularization_losses())
-
-  if inference.logging:
+  if collections is not None:
     tf.summary.scalar("loss/p_log_prob", p_log_prob,
-                      collections=[inference._summary_key])
+                      collections=collections)
     tf.summary.scalar("loss/q_log_prob", q_log_prob,
-                      collections=[inference._summary_key])
+                      collections=collections)
     tf.summary.scalar("loss/reg_penalty", reg_penalty,
-                      collections=[inference._summary_key])
-
-  loss = -(p_log_prob - q_log_prob - reg_penalty)
-
-  grads = tf.gradients(loss, var_list)
-  grads_and_vars = list(zip(grads, var_list))
-  return loss, grads_and_vars
-
+                      collections=collections)
+  loss = q_log_prob - p_log_prob + reg_penalty
+  return loss
+
+
+@doc.set_doc(
+    args=(doc.arg_model +
+          doc.arg_variational +
+          doc.arg_align_latent +
+          doc.arg_align_data +
+          doc.arg_scale +
+          doc.arg_n_samples +
+          doc.arg_kl_scaling +
+          doc.arg_auto_transform +
+          doc.arg_collections +
+          doc.arg_args_kwargs)[:-1],
+    returns=doc.return_loss,
+    notes_model_parameters=doc.notes_model_parameters,
+    notes_conditional_inference=doc.notes_conditional_inference_samples,
+    notes_regularization_losses=doc.notes_regularization_losses)
+def klqp_reparameterization_kl(model, variational, align_latent, align_data,
+                               scale=lambda name: 1.0, n_samples=1,
+                               kl_scaling=lambda name: 1.0,
+                               auto_transform=True, collections=None,
+                               *args, **kwargs):
+  """Variational inference with the KL divergence
 
-def build_reparam_kl_loss_and_gradients(inference, var_list):
-  """Build loss function. Its automatic differentiation
-  is a stochastic gradient of
+  $\\text{KL}( q(z; \lambda) \| p(z \mid x) ).$
 
-  .. math::
+  This function builds a loss function equal to KL(q||p) up to a
+  constant. Its automatic differentiation is a stochastic gradient of
 
-    -\\text{ELBO} =  - ( \mathbb{E}_{q(z; \lambda)} [ \log p(x \mid z) ]
-          + \\text{KL}(q(z; \lambda) \| p(z)) )
+  $-\\text{ELBO} = -( \mathbb{E}_{q(z; \lambda)} [ \log p(x \mid z) ]
+      - \\text{KL}( q(z; \lambda) \| p(z) ) )$
 
   based on the reparameterization trick [@kingma2014auto].
 
@@ -719,434 +313,176 @@ def build_reparam_kl_loss_and_gradients(inference, var_list):
 
   Computed by sampling from $q(z;\lambda)$ and evaluating the
   expectation using Monte Carlo sampling.
-  """
-  p_log_lik = [0.0] * inference.n_samples
-  base_scope = tf.get_default_graph().unique_name("inference") + '/'
-  for s in range(inference.n_samples):
-    # Form dictionary in order to replace conditioning on prior or
-    # observed variable with conditioning on a specific value.
-    scope = base_scope + tf.get_default_graph().unique_name("sample")
-    dict_swap = {}
-    for x, qx in six.iteritems(inference.data):
-      if isinstance(x, RandomVariable):
-        if isinstance(qx, RandomVariable):
-          qx_copy = copy(qx, scope=scope)
-          dict_swap[x] = qx_copy.value()
-        else:
-          dict_swap[x] = qx
-
-    for z, qz in six.iteritems(inference.latent_vars):
-      # Copy q(z) to obtain new set of posterior samples.
-      qz_copy = copy(qz, scope=scope)
-      dict_swap[z] = qz_copy.value()
 
-    for x in six.iterkeys(inference.data):
-      if isinstance(x, RandomVariable):
-        x_copy = copy(x, dict_swap, scope=scope)
-        p_log_lik[s] += tf.reduce_sum(
-            inference.scale.get(x, 1.0) * x_copy.log_prob(dict_swap[x]))
+  Args:
+  @{args}
 
-  p_log_lik = tf.reduce_mean(p_log_lik)
-
-  kl_penalty = tf.reduce_sum([
-      tf.reduce_sum(inference.kl_scaling.get(z, 1.0) * kl_divergence(qz, z))
-      for z, qz in six.iteritems(inference.latent_vars)])
-
-  reg_penalty = tf.reduce_sum(tf.losses.get_regularization_losses())
+  Returns:
+  @{returns}
 
-  if inference.logging:
-    tf.summary.scalar("loss/p_log_lik", p_log_lik,
-                      collections=[inference._summary_key])
-    tf.summary.scalar("loss/kl_penalty", kl_penalty,
-                      collections=[inference._summary_key])
-    tf.summary.scalar("loss/reg_penalty", reg_penalty,
-                      collections=[inference._summary_key])
+  #### Notes
 
-  loss = -(p_log_lik - kl_penalty - reg_penalty)
+  Probabilistic programs may have random variables which vary across
+  executions. The sampled terms of the loss are averaged over
+  `n_samples` executions of the model and variational programs.
 
-  grads = tf.gradients(loss, var_list)
-  grads_and_vars = list(zip(grads, var_list))
-  return loss, grads_and_vars
+  @{notes_model_parameters}
 
+  @{notes_conditional_inference}
 
-def build_reparam_entropy_loss_and_gradients(inference, var_list):
-  """Build loss function. Its automatic differentiation
-  is a stochastic gradient of
+  @{notes_regularization_losses}
 
-  $-\\text{ELBO} =  -( \mathbb{E}_{q(z; \lambda)} [ \log p(x , z) ]
-          + \mathbb{H}(q(z; \lambda)) )$
+  #### Examples
 
-  based on the reparameterization trick [@kingma2014auto].
+  ```python
+  def model():
+    mu = Normal(loc=0.0, scale=1.0, name="mu")
+    x = Normal(loc=mu, scale=1.0, sample_shape=10, name="x")
+    return x
 
-  It assumes the entropy is analytic.
+  def variational():
+    qmu = Normal(loc=tf.get_variable("loc", []),
+                 scale=tf.nn.softplus(tf.get_variable("scale", [])),
+                 name="qmu")
+    return qmu
 
-  Computed by sampling from $q(z;\lambda)$ and evaluating the
-  expectation using Monte Carlo sampling.
+  loss = ed.klqp_reparameterization_kl(
+      model, variational,
+      align_latent=lambda name: "qmu" if name == "mu" else None,
+      align_data=lambda name: "x" if name == "x" else None,
+      x=x_data)
+  ```
   """
-  p_log_prob = [0.0] * inference.n_samples
-  base_scope = tf.get_default_graph().unique_name("inference") + '/'
-  for s in range(inference.n_samples):
-    # Form dictionary in order to replace conditioning on prior or
-    # observed variable with conditioning on a specific value.
-    scope = base_scope + tf.get_default_graph().unique_name("sample")
-    dict_swap = {}
-    for x, qx in six.iteritems(inference.data):
-      if isinstance(x, RandomVariable):
-        if isinstance(qx, RandomVariable):
-          qx_copy = copy(qx, scope=scope)
-          dict_swap[x] = qx_copy.value()
-        else:
-          dict_swap[x] = qx
+  p_log_lik = [0.0] * n_samples
+  for s in range(n_samples):
+    q_trace = call_with_trace(variational, *args, **kwargs)
+    x = call_with_intercept(model, q_trace, align_data, align_latent,
+                            *args, **kwargs)
+    for rv in toposort(x):
+      if align_data(rv.name) is not None:
+        scale_factor = scale(rv.name)
+        p_log_lik[s] += tf.reduce_sum(scale_factor * rv.log_prob(rv.value))
 
-    for z, qz in six.iteritems(inference.latent_vars):
-      # Copy q(z) to obtain new set of posterior samples.
-      qz_copy = copy(qz, scope=scope)
-      dict_swap[z] = qz_copy.value()
-
-    for z in six.iterkeys(inference.latent_vars):
-      z_copy = copy(z, dict_swap, scope=scope)
-      p_log_prob[s] += tf.reduce_sum(
-          inference.scale.get(z, 1.0) * z_copy.log_prob(dict_swap[z]))
-
-    for x in six.iterkeys(inference.data):
-      if isinstance(x, RandomVariable):
-        x_copy = copy(x, dict_swap, scope=scope)
-        p_log_prob[s] += tf.reduce_sum(
-            inference.scale.get(x, 1.0) * x_copy.log_prob(dict_swap[x]))
-
-  p_log_prob = tf.reduce_mean(p_log_prob)
+  p_log_lik = tf.reduce_mean(p_log_lik)
 
-  q_entropy = tf.reduce_sum([
-      tf.reduce_sum(qz.entropy())
-      for z, qz in six.iteritems(inference.latent_vars)])
+  kl_penalty = 0.0  # Sum of KL(q || p) over the aligned latent variables.
+  for rv in toposort(x):  # x, q_trace: from the last sampled execution.
+    if align_latent(rv.name) is not None:
+      qz = q_trace[align_latent(rv.name)]
+      kl_penalty += tf.reduce_sum(
+          kl_scaling(rv.name) * kl_divergence(qz, rv))
 
   reg_penalty = tf.reduce_sum(tf.losses.get_regularization_losses())
-
-  if inference.logging:
-    tf.summary.scalar("loss/p_log_prob", p_log_prob,
-                      collections=[inference._summary_key])
-    tf.summary.scalar("loss/q_entropy", q_entropy,
-                      collections=[inference._summary_key])
+  if collections is not None:
+    tf.summary.scalar("loss/p_log_lik", p_log_lik,
+                      collections=collections)
+    tf.summary.scalar("loss/kl_penalty", kl_penalty,
+                      collections=collections)
     tf.summary.scalar("loss/reg_penalty", reg_penalty,
-                      collections=[inference._summary_key])
+                      collections=collections)
+  loss = -p_log_lik + kl_penalty + reg_penalty
+  return loss
+
+
+@doc.set_doc(
+    args=(doc.arg_model +
+          doc.arg_variational +
+          doc.arg_align_latent +
+          doc.arg_align_data +
+          doc.arg_scale +
+          doc.arg_n_samples +
+          doc.arg_auto_transform +
+          doc.arg_collections +
+          doc.arg_args_kwargs)[:-1],
+    returns=doc.return_loss_surrogate_loss,
+    notes_model_parameters=doc.notes_model_parameters,
+    notes_conditional_inference=doc.notes_conditional_inference_samples,
+    notes_regularization_losses=doc.notes_regularization_losses)
+def klqp_score(model, variational, align_latent, align_data,
+               scale=lambda name: 1.0, n_samples=1, auto_transform=True,
+               collections=None, *args, **kwargs):
+  """Variational inference with the KL divergence
 
-  loss = -(p_log_prob + q_entropy - reg_penalty)
+  $\\text{KL}( q(z; \lambda) \| p(z \mid x) ).$
 
-  grads = tf.gradients(loss, var_list)
-  grads_and_vars = list(zip(grads, var_list))
-  return loss, grads_and_vars
+  This function builds a loss function equal to KL(q||p) up to a
+  constant. It also builds a surrogate loss whose automatic
+  differentiation is a stochastic gradient of
 
+  $-\\text{ELBO} =
+      -\mathbb{E}_{q(z; \lambda)} [ \log p(x, z) - \log q(z; \lambda) ]$
 
-def build_score_loss_and_gradients(inference, var_list):
-  """Build loss function and gradients based on the score function
-  estimator [@paisley2012variational].
+  based on the score function estimator [@paisley2012variational].
 
   Computed by sampling from $q(z;\lambda)$ and evaluating the
   expectation using Monte Carlo sampling.
-  """
-  p_log_prob = [0.0] * inference.n_samples
-  q_log_prob = [0.0] * inference.n_samples
-  base_scope = tf.get_default_graph().unique_name("inference") + '/'
-  for s in range(inference.n_samples):
-    # Form dictionary in order to replace conditioning on prior or
-    # observed variable with conditioning on a specific value.
-    scope = base_scope + tf.get_default_graph().unique_name("sample")
-    dict_swap = {}
-    for x, qx in six.iteritems(inference.data):
-      if isinstance(x, RandomVariable):
-        if isinstance(qx, RandomVariable):
-          qx_copy = copy(qx, scope=scope)
-          dict_swap[x] = qx_copy.value()
-        else:
-          dict_swap[x] = qx
-
-    for z, qz in six.iteritems(inference.latent_vars):
-      # Copy q(z) to obtain new set of posterior samples.
-      qz_copy = copy(qz, scope=scope)
-      dict_swap[z] = qz_copy.value()
-      q_log_prob[s] += tf.reduce_sum(
-          inference.scale.get(z, 1.0) *
-          qz_copy.log_prob(tf.stop_gradient(dict_swap[z])))
-
-    for z in six.iterkeys(inference.latent_vars):
-      z_copy = copy(z, dict_swap, scope=scope)
-      p_log_prob[s] += tf.reduce_sum(
-          inference.scale.get(z, 1.0) * z_copy.log_prob(dict_swap[z]))
-
-    for x in six.iterkeys(inference.data):
-      if isinstance(x, RandomVariable):
-        x_copy = copy(x, dict_swap, scope=scope)
-        p_log_prob[s] += tf.reduce_sum(
-            inference.scale.get(x, 1.0) * x_copy.log_prob(dict_swap[x]))
 
-  p_log_prob = tf.stack(p_log_prob)
-  q_log_prob = tf.stack(q_log_prob)
-  reg_penalty = tf.reduce_sum(tf.losses.get_regularization_losses())
-
-  if inference.logging:
-    tf.summary.scalar("loss/p_log_prob", tf.reduce_mean(p_log_prob),
-                      collections=[inference._summary_key])
-    tf.summary.scalar("loss/q_log_prob", tf.reduce_mean(q_log_prob),
-                      collections=[inference._summary_key])
-    tf.summary.scalar("loss/reg_penalty", reg_penalty,
-                      collections=[inference._summary_key])
-
-  losses = p_log_prob - q_log_prob
-  loss = -(tf.reduce_mean(losses) - reg_penalty)
+  Args:
+  @{args}
 
-  q_rvs = list(six.itervalues(inference.latent_vars))
-  q_vars = [v for v in var_list
-            if len(get_descendants(tf.convert_to_tensor(v), q_rvs)) != 0]
-  q_grads = tf.gradients(
-      -(tf.reduce_mean(q_log_prob * tf.stop_gradient(losses)) - reg_penalty),
-      q_vars)
-  p_vars = [v for v in var_list if v not in q_vars]
-  p_grads = tf.gradients(loss, p_vars)
-  grads_and_vars = list(zip(q_grads, q_vars)) + list(zip(p_grads, p_vars))
-  return loss, grads_and_vars
-
-
-def build_score_kl_loss_and_gradients(inference, var_list):
-  """Build loss function and gradients based on the score function
-  estimator [@paisley2012variational].
-
-  It assumes the KL is analytic.
-
-  Computed by sampling from $q(z;\lambda)$ and evaluating the
-  expectation using Monte Carlo sampling.
-  """
-  p_log_lik = [0.0] * inference.n_samples
-  q_log_prob = [0.0] * inference.n_samples
-  base_scope = tf.get_default_graph().unique_name("inference") + '/'
-  for s in range(inference.n_samples):
-    # Form dictionary in order to replace conditioning on prior or
-    # observed variable with conditioning on a specific value.
-    scope = base_scope + tf.get_default_graph().unique_name("sample")
-    dict_swap = {}
-    for x, qx in six.iteritems(inference.data):
-      if isinstance(x, RandomVariable):
-        if isinstance(qx, RandomVariable):
-          qx_copy = copy(qx, scope=scope)
-          dict_swap[x] = qx_copy.value()
-        else:
-          dict_swap[x] = qx
-
-    for z, qz in six.iteritems(inference.latent_vars):
-      # Copy q(z) to obtain new set of posterior samples.
-      qz_copy = copy(qz, scope=scope)
-      dict_swap[z] = qz_copy.value()
-      q_log_prob[s] += tf.reduce_sum(
-          inference.scale.get(z, 1.0) *
-          qz_copy.log_prob(tf.stop_gradient(dict_swap[z])))
-
-    for x in six.iterkeys(inference.data):
-      if isinstance(x, RandomVariable):
-        x_copy = copy(x, dict_swap, scope=scope)
-        p_log_lik[s] += tf.reduce_sum(
-            inference.scale.get(x, 1.0) * x_copy.log_prob(dict_swap[x]))
-
-  p_log_lik = tf.stack(p_log_lik)
-  q_log_prob = tf.stack(q_log_prob)
+  Returns:
+  @{returns}
 
-  kl_penalty = tf.reduce_sum([
-      tf.reduce_sum(inference.kl_scaling.get(z, 1.0) * kl_divergence(qz, z))
-      for z, qz in six.iteritems(inference.latent_vars)])
+  #### Notes
 
-  reg_penalty = tf.reduce_sum(tf.losses.get_regularization_losses())
+  Probabilistic programs may have random variables which vary across
+  executions. The sampled terms of both returned losses are averaged
+  over `n_samples` executions of the model and variational programs.
 
-  if inference.logging:
-    tf.summary.scalar("loss/p_log_lik", tf.reduce_mean(p_log_lik),
-                      collections=[inference._summary_key])
-    tf.summary.scalar("loss/kl_penalty", kl_penalty,
-                      collections=[inference._summary_key])
-    tf.summary.scalar("loss/reg_penalty", reg_penalty,
-                      collections=[inference._summary_key])
+  @{notes_model_parameters}
 
-  loss = -(tf.reduce_mean(p_log_lik) - kl_penalty - reg_penalty)
+  @{notes_conditional_inference}
 
-  q_rvs = list(six.itervalues(inference.latent_vars))
-  q_vars = [v for v in var_list
-            if len(get_descendants(tf.convert_to_tensor(v), q_rvs)) != 0]
-  q_grads = tf.gradients(
-      -(tf.reduce_mean(q_log_prob * tf.stop_gradient(p_log_lik)) - kl_penalty -
-          reg_penalty),
-      q_vars)
-  p_vars = [v for v in var_list if v not in q_vars]
-  p_grads = tf.gradients(loss, p_vars)
-  grads_and_vars = list(zip(q_grads, q_vars)) + list(zip(p_grads, p_vars))
-  return loss, grads_and_vars
+  @{notes_regularization_losses}
 
+  #### Examples
 
-def build_score_entropy_loss_and_gradients(inference, var_list):
-  """Build loss function and gradients based on the score function
-  estimator [@paisley2012variational].
+  ```python
+  def model():
+    mu = Normal(loc=0.0, scale=1.0, name="mu")
+    x = Normal(loc=mu, scale=1.0, sample_shape=10, name="x")
+    return x
 
-  It assumes the entropy is analytic.
+  def variational():
+    qmu = Normal(loc=tf.get_variable("loc", []),
+                 scale=tf.nn.softplus(tf.get_variable("scale", [])),
+                 name="qmu")
+    return qmu
 
-  Computed by sampling from $q(z;\lambda)$ and evaluating the
-  expectation using Monte Carlo sampling.
+  loss, surrogate_loss = ed.klqp_score(
+      model, variational,
+      align_latent=lambda name: "qmu" if name == "mu" else None,
+      align_data=lambda name: "x" if name == "x" else None,
+      x=x_data)
+  ```
   """
-  p_log_prob = [0.0] * inference.n_samples
-  q_log_prob = [0.0] * inference.n_samples
-  base_scope = tf.get_default_graph().unique_name("inference") + '/'
-  for s in range(inference.n_samples):
-    # Form dictionary in order to replace conditioning on prior or
-    # observed variable with conditioning on a specific value.
-    scope = base_scope + tf.get_default_graph().unique_name("sample")
-    dict_swap = {}
-    for x, qx in six.iteritems(inference.data):
-      if isinstance(x, RandomVariable):
-        if isinstance(qx, RandomVariable):
-          qx_copy = copy(qx, scope=scope)
-          dict_swap[x] = qx_copy.value()
-        else:
-          dict_swap[x] = qx
-
-    for z, qz in six.iteritems(inference.latent_vars):
-      # Copy q(z) to obtain new set of posterior samples.
-      qz_copy = copy(qz, scope=scope)
-      dict_swap[z] = qz_copy.value()
-      q_log_prob[s] += tf.reduce_sum(
-          inference.scale.get(z, 1.0) *
-          qz_copy.log_prob(tf.stop_gradient(dict_swap[z])))
-
-    for z in six.iterkeys(inference.latent_vars):
-      z_copy = copy(z, dict_swap, scope=scope)
-      p_log_prob[s] += tf.reduce_sum(
-          inference.scale.get(z, 1.0) * z_copy.log_prob(dict_swap[z]))
-
-    for x in six.iterkeys(inference.data):
-      if isinstance(x, RandomVariable):
-        x_copy = copy(x, dict_swap, scope=scope)
-        p_log_prob[s] += tf.reduce_sum(
-            inference.scale.get(x, 1.0) * x_copy.log_prob(dict_swap[x]))
+  p_log_prob = [0.0] * n_samples
+  q_log_prob = [0.0] * n_samples
+  for s in range(n_samples):
+    q_trace = call_with_trace(variational, *args, **kwargs)
+    x = call_with_intercept(model, q_trace, align_data, align_latent,
+                            *args, **kwargs)
+    for rv in toposort(x):
+      scale_factor = scale(rv.name)
+      if align_latent(rv.name) is not None or align_data(rv.name) is not None:
+        p_log_prob[s] += tf.reduce_sum(scale_factor * rv.log_prob(rv.value))
+      if align_latent(rv.name) is not None:
+        qz = q_trace[align_latent(rv.name)]
+        q_log_prob[s] += tf.reduce_sum(
+            scale_factor * qz.log_prob(tf.stop_gradient(qz.value)))
 
   p_log_prob = tf.stack(p_log_prob)
   q_log_prob = tf.stack(q_log_prob)
-
-  q_entropy = tf.reduce_sum([
-      tf.reduce_sum(qz.entropy())
-      for z, qz in six.iteritems(inference.latent_vars)])
-
   reg_penalty = tf.reduce_sum(tf.losses.get_regularization_losses())
-
-  if inference.logging:
+  if collections is not None:
     tf.summary.scalar("loss/p_log_prob", tf.reduce_mean(p_log_prob),
-                      collections=[inference._summary_key])
+                      collections=collections)
     tf.summary.scalar("loss/q_log_prob", tf.reduce_mean(q_log_prob),
-                      collections=[inference._summary_key])
-    tf.summary.scalar("loss/q_entropy", q_entropy,
-                      collections=[inference._summary_key])
+                      collections=collections)
     tf.summary.scalar("loss/reg_penalty", reg_penalty,
-                      collections=[inference._summary_key])
-
-  loss = -(tf.reduce_mean(p_log_prob) + q_entropy - reg_penalty)
-
-  q_rvs = list(six.itervalues(inference.latent_vars))
-  q_vars = [v for v in var_list
-            if len(get_descendants(tf.convert_to_tensor(v), q_rvs)) != 0]
-  q_grads = tf.gradients(
-      -(tf.reduce_mean(q_log_prob * tf.stop_gradient(p_log_prob)) +
-          q_entropy - reg_penalty),
-      q_vars)
-  p_vars = [v for v in var_list if v not in q_vars]
-  p_grads = tf.gradients(loss, p_vars)
-  grads_and_vars = list(zip(q_grads, q_vars)) + list(zip(p_grads, p_vars))
-  return loss, grads_and_vars
-
-
-def build_score_rb_loss_and_gradients(inference, var_list):
-  """Build loss function and gradients based on the score function
-  estimator [@paisley2012variational] and Rao-Blackwellization
-  [@ranganath2014black].
-
-  Computed by sampling from :math:`q(z;\lambda)` and evaluating the
-  expectation using Monte Carlo sampling and Rao-Blackwellization.
-  """
-  # Build tensors for loss and gradient calculations. There is one set
-  # for each sample from the variational distribution.
-  p_log_probs = [{}] * inference.n_samples
-  q_log_probs = [{}] * inference.n_samples
-  base_scope = tf.get_default_graph().unique_name("inference") + '/'
-  for s in range(inference.n_samples):
-    # Form dictionary in order to replace conditioning on prior or
-    # observed variable with conditioning on a specific value.
-    scope = base_scope + tf.get_default_graph().unique_name("sample")
-    dict_swap = {}
-    for x, qx in six.iteritems(inference.data):
-      if isinstance(x, RandomVariable):
-        if isinstance(qx, RandomVariable):
-          qx_copy = copy(qx, scope=scope)
-          dict_swap[x] = qx_copy.value()
-        else:
-          dict_swap[x] = qx
-
-    for z, qz in six.iteritems(inference.latent_vars):
-      # Copy q(z) to obtain new set of posterior samples.
-      qz_copy = copy(qz, scope=scope)
-      dict_swap[z] = qz_copy.value()
-      q_log_probs[s][qz] = tf.reduce_sum(
-          inference.scale.get(z, 1.0) *
-          qz_copy.log_prob(tf.stop_gradient(dict_swap[z])))
-
-    for z in six.iterkeys(inference.latent_vars):
-      z_copy = copy(z, dict_swap, scope=scope)
-      p_log_probs[s][z] = tf.reduce_sum(
-          inference.scale.get(z, 1.0) * z_copy.log_prob(dict_swap[z]))
-
-    for x in six.iterkeys(inference.data):
-      if isinstance(x, RandomVariable):
-        x_copy = copy(x, dict_swap, scope=scope)
-        p_log_probs[s][x] = tf.reduce_sum(
-            inference.scale.get(x, 1.0) * x_copy.log_prob(dict_swap[x]))
-
-  # Take gradients of Rao-Blackwellized loss for each variational parameter.
-  p_rvs = list(six.iterkeys(inference.latent_vars)) + \
-      [x for x in six.iterkeys(inference.data) if isinstance(x, RandomVariable)]
-  q_rvs = list(six.itervalues(inference.latent_vars))
-  reverse_latent_vars = {v: k for k, v in six.iteritems(inference.latent_vars)}
-  grads = []
-  grads_vars = []
-  for var in var_list:
-    # Get all variational factors depending on the parameter.
-    descendants = get_descendants(tf.convert_to_tensor(var), q_rvs)
-    if len(descendants) == 0:
-      continue  # skip if not a variational parameter
-    # Get p and q's Markov blanket wrt these latent variables.
-    var_p_rvs = set()
-    for qz in descendants:
-      z = reverse_latent_vars[qz]
-      var_p_rvs.update(z.get_blanket(p_rvs) + [z])
-
-    var_q_rvs = set()
-    for qz in descendants:
-      var_q_rvs.update(qz.get_blanket(q_rvs) + [qz])
-
-    pi_log_prob = [0.0] * inference.n_samples
-    qi_log_prob = [0.0] * inference.n_samples
-    for s in range(inference.n_samples):
-      pi_log_prob[s] = tf.reduce_sum([p_log_probs[s][rv] for rv in var_p_rvs])
-      qi_log_prob[s] = tf.reduce_sum([q_log_probs[s][rv] for rv in var_q_rvs])
-
-    pi_log_prob = tf.stack(pi_log_prob)
-    qi_log_prob = tf.stack(qi_log_prob)
-    grad = tf.gradients(
-        -tf.reduce_mean(qi_log_prob *
-                        tf.stop_gradient(pi_log_prob - qi_log_prob)) +
-        tf.reduce_sum(tf.losses.get_regularization_losses()),
-        var)
-    grads.extend(grad)
-    grads_vars.append(var)
-
-  # Take gradients of total loss function for model parameters.
-  loss = -(tf.reduce_mean([tf.reduce_sum(list(six.itervalues(p_log_prob)))
-                           for p_log_prob in p_log_probs]) -
-           tf.reduce_mean([tf.reduce_sum(list(six.itervalues(q_log_prob)))
-                           for q_log_prob in q_log_probs]) -
-           tf.reduce_sum(tf.losses.get_regularization_losses()))
-  model_vars = [v for v in var_list if v not in grads_vars]
-  model_grads = tf.gradients(loss, model_vars)
-  grads.extend(model_grads)
-  grads_vars.extend(model_vars)
-  grads_and_vars = list(zip(grads, grads_vars))
-  return loss, grads_and_vars
+                      collections=collections)
+  losses = q_log_prob - p_log_prob
+  loss = tf.reduce_mean(losses) + reg_penalty
+  surrogate_loss = (tf.reduce_mean(q_log_prob * tf.stop_gradient(losses)) +
+                    reg_penalty)
+  return loss, surrogate_loss
diff --git a/edward/inferences/klqp_implicit.py b/edward/inferences/klqp_implicit.py
new file mode 100644
index 000000000..243b996ed
--- /dev/null
+++ b/edward/inferences/klqp_implicit.py
@@ -0,0 +1,252 @@
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import six
+import tensorflow as tf
+
+from edward.inferences import docstrings as doc
+from edward.inferences.util import (
+    call_with_intercept, call_with_trace, toposort)
+
+
+@doc.set_doc(
+    args_part_one=(doc.arg_model +
+                   doc.arg_variational)[:-1],
+    args_part_two=(doc.arg_align_latent +
+                   doc.arg_align_data)[:-1],
+    args_part_three=(doc.arg_scale +
+                     doc.arg_auto_transform +
+                     doc.arg_collections +
+                     doc.arg_args_kwargs)[:-1],
+    returns=doc.return_loss_loss_d,
+    notes_discriminator_scope=doc.notes_discriminator_scope,
+    notes_regularization_losses=doc.notes_regularization_losses)
+def klqp_implicit(model, variational, discriminator, align_latent,
+                  align_data, align_latent_global=lambda name: name,
+                  ratio_loss='log', scale=lambda name: 1.0,
+                  auto_transform=True, collections=None, *args, **kwargs):
+  """Variational inference with implicit probabilistic models
+  [@tran2017deep].
+
+  It minimizes the KL divergence
+
+  $\\text{KL}( q(z, \\beta; \lambda) \| p(z, \\beta \mid x) ),$
+
+  where $z$ are local variables associated to a data point and
+  $\\beta$ are global variables shared across data points.
+
+  Global latent variables require `log_prob()` and need to return a
+  random sample when fetched from the graph. Local latent variables
+  and observed variables require only a random sample when fetched
+  from the graph. (This is true for both $p$ and $q$.)
+
+  All variational factors must be reparameterizable: each of the
+  random variables (`rv`) satisfies `rv.is_reparameterized` and
+  `rv.is_continuous`.
+
+  Args:
+  @{args_part_one}
+    discriminator: function.
+      Function (with parameters). Unlike `GANInference`, it is
+      interpreted as a ratio estimator rather than a discriminator.
+      It takes three arguments: a data dict, local latent variable
+      dict, and global latent variable dict. As with GAN
+      discriminators, it can take a batch of data points and local
+      variables, of size $M$, and output vectors of length $M$
+      (returned as the values of a dict; see Notes).
+  @{args_part_two}
+    align_latent_global: function of string, returning string or None.
+      Maps the name of a global model variable (shared across data
+      points) to the name of its variational factor, and any other
+      name to None. Globals are excluded from the ratio estimation
+      problem and are estimated with tractable variational approximations.
+    ratio_loss: str or fn.
+      Loss function minimized to get the ratio estimator. 'log' or 'hinge'.
+      Alternatively, one can pass in a function of two inputs,
+      `psamples` and `qsamples`, and output a point-wise value
+      with shape matching the shapes of the two inputs.
+  @{args_part_three}
+
+  Unlike `GANInference`, `discriminator` takes dict's as input,
+  and must subset to the appropriate values through lexical scoping
+  from the previously defined model and latent variables. This is
+  necessary as the discriminator can take an arbitrary set of data,
+  latent, and global variables.
+
+  `align_latent` aligns all global and local latents;
+  `align_latent_global` only aligns global latents.
+
+  Returns:
+  @{returns}
+
+  #### Notes
+
+  `discriminator` must return a dict of tensors; the entry under each
+  key is weighted by `scale(key)` in the variational objective.
+
+  @{notes_discriminator_scope}
+
+  @{notes_regularization_losses}
+
+  Build loss function
+
+  $-\Big(\mathbb{E}_{q(\\beta)} [\log p(\\beta) - \log q(\\beta) ] +
+      \sum_{n=1}^N \mathbb{E}_{q(\\beta)q(z_n\mid\\beta)} [
+          r^*(x_n, z_n, \\beta) ] \Big).$
+
+  We minimize it with respect to parameterized variational
+  families $q(z, \\beta; \lambda)$.
+
+  $r^*(x_n, z_n, \\beta)$ is a function of a single data point
+  $x_n$, single local variable $z_n$, and all global
+  variables $\\beta$. It is equal to the log-ratio
+
+  $\log p(x_n, z_n\mid \\beta) - \log q(x_n, z_n\mid \\beta),$
+
+  where $q(x_n)$ is the empirical data distribution. Rather
+  than explicit calculation, $r^*(x, z, \\beta)$ is the
+  solution to a ratio estimation problem, minimizing the specified
+  `ratio_loss`.
+
+  Gradients are taken using the reparameterization trick
+  [@kingma2014auto].
+
+  This also includes model parameters $p(x, z, \\beta; \\theta)$
+  and variational distributions with inference networks
+  $q(z\mid x)$.
+
+  There are a bunch of extensions we could easily do in this
+  implementation:
+
+  + further factorizations can be used to better leverage the
+    graph structure for more complicated models;
+  + score function gradients for global variables;
+  + use more samples; this would require the `copy()` utility
+    function for q's as well, and an additional loop. we opt not to
+    because it complicates the code;
+  + analytic KL/swapping out the penalty term for the globals.
+
+  #### Examples
+
+  ```python
+  def model():
+    z = Normal(loc=0.0, scale=1.0, sample_shape=[256, 25], name="z")
+    x = generative_network(z, name="x")
+    return x
+
+  def variational(x):
+    net = tf.layers.dense(x, 25 * 2)
+    qz = Normal(loc=net[:, :25],
+                scale=tf.nn.softplus(net[:, 25:]),
+                name="qz")
+    return qz
+
+  def ratio_estimator(data, local_vars, global_vars):
+    # concatenated input has shape (batch_size, 28*28 + 25)
+    net = tf.concat([data["x"], local_vars["z"]], 1)
+    net = tf.layers.dense(net, 256, activation=tf.nn.relu)
+    return {"x": tf.layers.dense(net, 1, activation=tf.sigmoid)}
+
+  loss, loss_d = ed.klqp_implicit(
+      model, variational, ratio_estimator,
+      align_latent=lambda name: "qz" if name == "z" else None,
+      align_data=lambda name: "x" if name == "x" else None,
+      x=x_data)
+  ```
+  """
+  if not callable(ratio_loss):
+    # Resolve a named loss to its point-wise implementation.
+    if ratio_loss == 'log':
+      ratio_loss = _log_loss
+    elif ratio_loss == 'hinge':
+      ratio_loss = _hinge_loss
+    else:
+      raise ValueError('Ratio loss not found: {}'.format(ratio_loss))
+
+  q_trace = call_with_trace(variational, *args, **kwargs)
+  # Intercept model's global latent variables and set to posterior
+  # samples (but not its locals).
+  x = call_with_intercept(model, q_trace, align_data, align_latent_global,
+                          *args, **kwargs)
+
+  # Collect tensors used in calculation of losses.
+  pbeta_log_prob = 0.0
+  qbeta_log_prob = 0.0
+  qbeta_sample = {}
+  pz_sample = {}
+  qz_sample = {}
+  x_psample = {}
+  x_qsample = {}
+  for rv in toposort(x):
+    # Calculate log p(beta') and log q(beta').
+    if align_latent_global(rv.name) is not None:
+      pbeta = rv
+      qbeta = q_trace[align_latent_global(rv.name)]
+      scale_factor = scale(rv.name)
+      pbeta_log_prob += tf.reduce_sum(
+          scale_factor * pbeta.log_prob(pbeta.value))
+      qbeta_log_prob += tf.reduce_sum(
+          scale_factor * qbeta.log_prob(qbeta.value))
+      qbeta_sample[rv.name] = qbeta.value
+    else:
+      # TODO This assumes implicit variables are tf.Tensors existing
+      # on the Trace stack.
+      if align_latent(rv.name) is not None:
+        pz = rv
+        qz = q_trace[align_latent(rv.name)]
+        pz_sample[rv.name] = pz
+        qz_sample[rv.name] = qz
+      else:
+        key = align_data(rv.name)
+        if isinstance(key, int):
+          data_node = args[key]
+        else:
+          data_node = kwargs.get(key)
+        if data_node is None:
+          continue  # unaligned variable: nothing to feed the estimator
+        x_psample[rv.name] = rv
+        x_qsample[rv.name] = data_node
+
+  # Collect x' ~ p(x | z', beta') and x' ~ q(x).
+  with tf.variable_scope("Disc"):
+    # TODO For now, this assumes the discriminator automagically knows
+    # how to index the dictionaries and computes some forward pass on
+    # them (which can vary across executions). Dictionaries should be
+    # improved to be more idiomatic.
+    r_psample = discriminator(x_psample, pz_sample, qbeta_sample)
+
+  with tf.variable_scope("Disc", reuse=True):
+    r_qsample = discriminator(x_qsample, qz_sample, qbeta_sample)
+
+  # Form ratio loss and ratio estimator.
+  loss_d = 0.0
+  scaled_ratio = 0.0
+  for key, value in six.iteritems(r_qsample):
+    loss_d += tf.reduce_mean(ratio_loss(r_psample[key], value))
+    scaled_ratio += tf.reduce_sum(scale(key) * value)
+
+  reg_terms_d = tf.losses.get_regularization_losses(scope="Disc")
+  reg_terms_all = tf.losses.get_regularization_losses()
+  reg_terms = [r for r in reg_terms_all if r not in reg_terms_d]
+
+  # Form variational objective.
+  loss = (qbeta_log_prob - pbeta_log_prob - scaled_ratio +
+          tf.reduce_sum(reg_terms))
+  loss_d = loss_d + tf.reduce_sum(reg_terms_d)
+  return loss, loss_d
+
+
+def _log_loss(psample, qsample):
+  """Point-wise logistic loss labelling `psample` 1 and `qsample` 0."""
+  p_term = tf.nn.sigmoid_cross_entropy_with_logits(
+      labels=tf.ones_like(psample), logits=psample)
+  q_term = tf.nn.sigmoid_cross_entropy_with_logits(
+      labels=tf.zeros_like(qsample), logits=qsample)
+  return p_term + q_term
+
+
+def _hinge_loss(psample, qsample):
+  """Point-wise hinge loss with unit margin on both sample sets."""
+  p_term, q_term = tf.nn.relu(1.0 - psample), tf.nn.relu(1.0 + qsample)
+  return p_term + q_term
diff --git a/edward/inferences/laplace.py b/edward/inferences/laplace.py
index 67258ca4e..d74241909 100644
--- a/edward/inferences/laplace.py
+++ b/edward/inferences/laplace.py
@@ -5,10 +5,10 @@
 import six
 import tensorflow as tf
 
-from edward.inferences.map import MAP
-from edward.models import PointMass, RandomVariable
-from edward.util import get_session, get_variables
-from edward.util import copy, transform
+from edward.inferences import docstrings as doc
+from edward.inferences.map import map
+from edward.inferences.util import call_with_trace
+from edward.models.queries import get_variables
 
 try:
   from edward.models import \
@@ -17,17 +17,31 @@
   raise ImportError("{0}. Your TensorFlow version is not supported.".format(e))
 
 
-class Laplace(MAP):
+@doc.set_doc(
+    args=(doc.arg_model +
+          doc.arg_variational +
+          doc.arg_align_latent +
+          doc.arg_align_data +
+          doc.arg_scale +
+          doc.arg_auto_transform +
+          doc.arg_collections +
+          doc.arg_args_kwargs)[:-1])
+def laplace(model, variational, align_latent, align_data,
+            scale=lambda name: 1.0, auto_transform=True,
+            collections=None, *args, **kwargs):
   """Laplace approximation [@laplace1986memoir].
 
   It approximates the posterior distribution using a multivariate
   normal distribution centered at the mode of the posterior.
 
-  We implement this by running `MAP` to find the posterior mode.
+  We implement this by running `ed.map` to find the posterior mode.
   This forms the mean of the normal approximation. We then compute the
   inverse Hessian at the mode of the posterior. This forms the
   covariance of the normal approximation.
 
+  Args:
+  @{args}
+
   #### Notes
 
   If `MultivariateNormalDiag` or `Normal` random variables are
@@ -39,8 +53,8 @@ class Laplace(MAP):
   Random variables with both scalar batch and event shape are not
   supported as `tf.hessians` is currently not applicable to scalars.
 
-  Note that `Laplace` finds the location parameter of the normal
-  approximation using `MAP`, which is performed on the latent
+  Note that this function finds the location parameter of the normal
+  approximation using `ed.map`, which is performed on the latent
   variable's original (constrained) support. The scale parameter
   is calculated by evaluating the Hessian of $-\log p(x, z)$ in the
   constrained space and under the mode. This implies the Laplace
@@ -50,104 +64,66 @@ class Laplace(MAP):
   #### Examples
 
   ```python
-  X = tf.placeholder(tf.float32, [N, D])
-  w = Normal(loc=tf.zeros(D), scale=tf.ones(D))
-  y = Normal(loc=ed.dot(X, w), scale=tf.ones(N))
+  def model(X):
+    w = Normal(loc=tf.zeros(D), scale=tf.ones(D), name="w")
+    y = Normal(loc=tf.tensordot(X, w, [[1], [0]]), scale=tf.ones(N), name="y")
+    return y
+
+  def variational():
+    qw = MultivariateNormalTriL(
+        loc=tf.Variable(tf.random_normal([D])),
+        scale_tril=tf.Variable(tf.random_normal([D, D])),
+        name="qw")
+    return qw
+
+  loss, finalize_op = ed.laplace(
+      model, variational,
+      align_latent=lambda name: "qw" if name == "w" else None,
+      align_data=lambda name: "y" if name == "y" else None,
+      X=X_data,
+      y=y_data)
+  ```
+  """
+  # MAP over point masses; NOTE(review): check align_latent sees their names.
+  q_pointmass = _make_variational_pointmass(variational, *args, **kwargs)
+  loss = map(model, q_pointmass, align_latent, align_data,
+             scale, auto_transform, collections, *args, **kwargs)
+  finalize_op = _finalize(loss, variational, *args, **kwargs)
+  return loss, finalize_op
 
-  qw = MultivariateNormalTriL(
-      loc=tf.Variable(tf.random_normal([D])),
-      scale_tril=tf.Variable(tf.random_normal([D, D])))
 
-  inference = ed.Laplace({w: qw}, data={X: X_train, y: y_train})
-  ```
+def _finalize(loss, variational, *args, **kwargs):
+  """Build the op to run after convergence: set scales from the Hessian.
+
+  `args` and `kwargs` are forwarded to `variational` when it is traced.
+  """
+  q_trace = call_with_trace(variational, *args, **kwargs)
+  qzs = [node.value for node in six.itervalues(q_trace)]
+  hessians = tf.hessians(loss, [qz.loc for qz in qzs])
+  finalize_ops = []
+  for qz, hessian in zip(qzs, hessians):
+    if isinstance(qz, (MultivariateNormalDiag, Normal)):
+      scale_var = get_variables(qz.variance())[0]
+      scale = 1.0 / tf.diag_part(hessian)
+    else:  # qz is MultivariateNormalTriL
+      scale_var = get_variables(qz.covariance())[0]
+      scale = tf.matrix_inverse(tf.cholesky(hessian))
+    # Collect assignments so that one grouped op updates every factor.
+    finalize_ops.append(scale_var.assign(scale))
+  return tf.group(*finalize_ops)
+
+
+def _make_variational_pointmass(variational, *args, **kwargs):
+  """Take a variational program and build a new one that replaces all
+  random variables with point masses.
+
+  We assume all latent variables are traceable in one execution.
   """
-  def __init__(self, latent_vars, data=None):
-    """Create an inference algorithm.
-
-    Args:
-      latent_vars: list of RandomVariable or
-                   dict of RandomVariable to RandomVariable.
-        Collection of random variables to perform inference on. If list,
-        each random variable will be implictly optimized using a
-        `MultivariateNormalTriL` random variable that is defined
-        internally with unconstrained support and is initialized using
-        standard normal draws. If dictionary, each random
-        variable must be a `MultivariateNormalDiag`,
-        `MultivariateNormalTriL`, or `Normal` random variable.
-    """
-    if isinstance(latent_vars, list):
-      with tf.variable_scope(None, default_name="posterior"):
-        latent_vars_dict = {}
-        for z in latent_vars:
-          # Define location to have constrained support and
-          # unconstrained free parameters.
-          batch_event_shape = z.batch_shape.concatenate(z.event_shape)
-          loc = tf.Variable(tf.random_normal(batch_event_shape))
-          if hasattr(z, 'support'):
-            z_transform = transform(z)
-            if hasattr(z_transform, 'bijector'):
-              loc = z_transform.bijector.inverse(loc)
-          scale_tril = tf.Variable(tf.random_normal(
-              batch_event_shape.concatenate(batch_event_shape[-1])))
-          qz = MultivariateNormalTriL(loc=loc, scale_tril=scale_tril)
-          latent_vars_dict[z] = qz
-        latent_vars = latent_vars_dict
-        del latent_vars_dict
-    elif isinstance(latent_vars, dict):
-      for qz in six.itervalues(latent_vars):
-        if not isinstance(
-                qz, (MultivariateNormalDiag, MultivariateNormalTriL, Normal)):
-          raise TypeError("Posterior approximation must consist of only "
-                          "MultivariateNormalDiag, MultivariateTriL, or "
-                          "Normal random variables.")
-
-    # call grandparent's method; avoid parent (MAP)
-    super(MAP, self).__init__(latent_vars, data)
-
-  def initialize(self, *args, **kwargs):
-    # Store latent variables in a temporary object; MAP will
-    # optimize `PointMass` random variables, which subsequently
-    # optimizes location parameters of the normal approximations.
-    latent_vars_normal = self.latent_vars.copy()
-    self.latent_vars = {z: PointMass(params=qz.loc)
-                        for z, qz in six.iteritems(latent_vars_normal)}
-
-    super(Laplace, self).initialize(*args, **kwargs)
-
-    hessians = tf.hessians(self.loss, list(six.itervalues(self.latent_vars)))
-    self.finalize_ops = []
-    for z, hessian in zip(six.iterkeys(self.latent_vars), hessians):
-      qz = latent_vars_normal[z]
-      if isinstance(qz, (MultivariateNormalDiag, Normal)):
-        scale_var = get_variables(qz.variance())[0]
-        scale = 1.0 / tf.diag_part(hessian)
-      else:  # qz is MultivariateNormalTriL
-        scale_var = get_variables(qz.covariance())[0]
-        scale = tf.matrix_inverse(tf.cholesky(hessian))
-
-      self.finalize_ops.append(scale_var.assign(scale))
-
-    self.latent_vars = latent_vars_normal.copy()
-    del latent_vars_normal
-
-  def finalize(self, feed_dict=None):
-    """Function to call after convergence.
-
-    Computes the Hessian at the mode.
-
-    Args:
-      feed_dict: dict.
-        Feed dictionary for a TensorFlow session run during evaluation
-        of Hessian. It is used to feed placeholders that are not fed
-        during initialization.
-    """
-    if feed_dict is None:
-      feed_dict = {}
-
-    for key, value in six.iteritems(self.data):
-      if isinstance(key, tf.Tensor) and "Placeholder" in key.op.type:
-        feed_dict[key] = value
-
-    sess = get_session()
-    sess.run(self.finalize_ops, feed_dict)
-    super(Laplace, self).finalize()
+  from edward.models import PointMass  # not imported at module level
+  q_trace = call_with_trace(variational, *args, **kwargs)
+
+  def variational_pointmass(*args, **kwargs):
+    for node in six.itervalues(q_trace):
+      qz = node.value
+      PointMass(params=qz.loc, name=qz.name + "_pointmass", value=qz.loc)
+  return variational_pointmass
diff --git a/edward/inferences/map.py b/edward/inferences/map.py
index 406d461d4..3c9eb3995 100644
--- a/edward/inferences/map.py
+++ b/edward/inferences/map.py
@@ -5,9 +5,9 @@
 import six
 import tensorflow as tf
 
-from edward.inferences.variational_inference import VariationalInference
-from edward.models import RandomVariable, PointMass
-from edward.util import copy, transform
+from edward.inferences import docstrings as doc
+from edward.inferences.util import (
+    call_with_intercept, call_with_trace, toposort)
 
 try:
   from tensorflow.contrib.distributions import bijectors
@@ -15,10 +15,23 @@
   raise ImportError("{0}. Your TensorFlow version is not supported.".format(e))
 
 
-class MAP(VariationalInference):
+@doc.set_doc(
+    args=(doc.arg_model +
+          doc.arg_variational +
+          doc.arg_align_latent +
+          doc.arg_align_data +
+          doc.arg_scale +
+          doc.arg_auto_transform +
+          doc.arg_collections +
+          doc.arg_args_kwargs)[:-1],
+    returns=doc.return_loss,
+    notes_regularization_losses=doc.notes_regularization_losses)
+def map(model, variational, align_latent, align_data,
+        scale=lambda name: 1.0, auto_transform=True, collections=None,
+        *args, **kwargs):
   """Maximum a posteriori.
 
-  This class implements gradient-based optimization to solve the
+  This function implements gradient-based optimization to solve the
   optimization problem,
 
   $\min_{z} - p(z \mid x).$
@@ -28,13 +41,23 @@ class MAP(VariationalInference):
 
   $- \mathbb{E}_{q(z; \lambda)} [ \log p(x, z) ].$
 
+  Args:
+  @{args}
+
+  Returns:
+  @{returns}
+
   #### Notes
 
-  This class is currently restricted to optimization over
+  This function is currently restricted to optimization over
   differentiable latent variables. For example, it does not solve
   discrete optimization.
 
-  This class also minimizes the loss with respect to any model
+  Probabilistic programs may have random variables which vary across
+  executions. The algorithm returns calculations following one
+  execution of the model and variational programs.
+
+  This function also minimizes the loss with respect to any model
   parameters $p(z \mid x; \\theta)$.
 
   In conditional inference, we infer $z$ in $p(z, \\beta
@@ -46,108 +69,51 @@ class MAP(VariationalInference):
   marginal density $\log p(x, z)$, and it is exact if
   $q(\\beta) = p(\\beta \mid x)$ (up to stochasticity).
 
+  @{notes_regularization_losses}
+
   #### Examples
 
-  Most explicitly, `MAP` is specified via a dictionary:
+  Most explicitly, this function is specified via a variational
+  program over pointmasses.
 
   ```python
-  qpi = PointMass(params=ed.to_simplex(tf.Variable(tf.zeros(K-1))))
-  qmu = PointMass(params=tf.Variable(tf.zeros(K*D)))
-  qsigma = PointMass(params=tf.nn.softplus(tf.Variable(tf.zeros(K*D))))
-  ed.MAP({pi: qpi, mu: qmu, sigma: qsigma}, data)
+  def variational():
+    qpi = PointMass(params=to_simplex(tf.Variable(tf.zeros(K-1))),
+                    name="qpi")
+    qmu = PointMass(params=tf.Variable(tf.zeros(K*D)),
+                    name="qmu")
+    qsigma = PointMass(params=tf.nn.softplus(tf.Variable(tf.zeros(K*D))),
+                       name="qsigma")
+    return qpi, qmu, qsigma
+
+  loss = ed.map(..., variational, ...)
   ```
 
-  We also automate the specification of `PointMass` distributions,
-  so one can pass in a list of latent variables instead:
+  We also automate the specification of `PointMass` distributions
+  so you don't pass in `variational`. (TODO not implemented yet.)
 
-  ```python
-  ed.MAP([beta], data)
-  ed.MAP([pi, mu, sigma], data)
-  ```
-
-  Note that for `MAP` to optimize over latent variables with
+  Note that for this function to optimize over latent variables with
   constrained continuous support, the point mass must be constrained
   to have the same support while its free parameters are
   unconstrained; see, e.g., `qsigma` above. This is different than
   performing MAP on the unconstrained space: in general, the MAP of
   the transform is not the transform of the MAP.
-
-  The objective function also adds to itself a summation over all
-  tensors in the `REGULARIZATION_LOSSES` collection.
   """
-  def __init__(self, latent_vars=None, data=None):
-    """Create an inference algorithm.
-
-    Args:
-      latent_vars: list of RandomVariable or
-                   dict of RandomVariable to RandomVariable.
-        Collection of random variables to perform inference on. If
-        list, each random variable will be implictly optimized using a
-        `PointMass` random variable that is defined internally with
-        constrained support, has unconstrained free parameters, and is
-        initialized using standard normal draws. If dictionary, each
-        value in the dictionary must be a `PointMass` random variable
-        with the same support as the key.
-    """
-    if isinstance(latent_vars, list):
-      with tf.variable_scope(None, default_name="posterior"):
-        latent_vars_dict = {}
-        for z in latent_vars:
-          # Define point masses to have constrained support and
-          # unconstrained free parameters.
-          batch_event_shape = z.batch_shape.concatenate(z.event_shape)
-          params = tf.Variable(tf.random_normal(batch_event_shape))
-          if hasattr(z, 'support'):
-            z_transform = transform(z)
-            if hasattr(z_transform, 'bijector'):
-              params = z_transform.bijector.inverse(params)
-          latent_vars_dict[z] = PointMass(params=params)
-        latent_vars = latent_vars_dict
-        del latent_vars_dict
-    elif isinstance(latent_vars, dict):
-      for qz in six.itervalues(latent_vars):
-        if not isinstance(qz, PointMass):
-          raise TypeError("Posterior approximation must consist of only "
-                          "PointMass random variables.")
-
-    super(MAP, self).__init__(latent_vars, data)
-
-  def build_loss_and_gradients(self, var_list):
-    """Build loss function. Its automatic differentiation
-    is the gradient of
-
-    $- \log p(x,z).$
-    """
-    # Form dictionary in order to replace conditioning on prior or
-    # observed variable with conditioning on a specific value.
-    scope = tf.get_default_graph().unique_name("inference")
-    dict_swap = {z: qz.value()
-                 for z, qz in six.iteritems(self.latent_vars)}
-    for x, qx in six.iteritems(self.data):
-      if isinstance(x, RandomVariable):
-        if isinstance(qx, RandomVariable):
-          dict_swap[x] = qx.value()
-        else:
-          dict_swap[x] = qx
-
-    p_log_prob = 0.0
-    for z in six.iterkeys(self.latent_vars):
-      z_copy = copy(z, dict_swap, scope=scope)
-      p_log_prob += tf.reduce_sum(
-          self.scale.get(z, 1.0) * z_copy.log_prob(dict_swap[z]))
-
-    for x in six.iterkeys(self.data):
-      if isinstance(x, RandomVariable):
-        if dict_swap:
-          x_copy = copy(x, dict_swap, scope=scope)
-        else:
-          x_copy = x
-        p_log_prob += tf.reduce_sum(
-            self.scale.get(x, 1.0) * x_copy.log_prob(dict_swap[x]))
-
-    reg_penalty = tf.reduce_sum(tf.losses.get_regularization_losses())
-    loss = -p_log_prob + reg_penalty
-
-    grads = tf.gradients(loss, var_list)
-    grads_and_vars = list(zip(grads, var_list))
-    return loss, grads_and_vars
+  # Trace the variational program, then run the model against that trace.
+  q_trace = call_with_trace(variational, *args, **kwargs)
+  x = call_with_intercept(
+      model, q_trace, align_data, align_latent, *args, **kwargs)
+  p_log_prob = 0.0
+  for rv in toposort(x):
+    if align_latent(rv.name) is None and align_data(rv.name) is None:
+      continue  # neither latent nor observed: excluded from the joint
+    p_log_prob += tf.reduce_sum(scale(rv.name) * rv.log_prob(rv.value))
+
+  reg_penalty = tf.reduce_sum(tf.losses.get_regularization_losses())
+  if collections is not None:
+    for tag, tensor in (("loss/p_log_prob", p_log_prob),
+                        ("loss/reg_penalty", reg_penalty)):
+      tf.summary.scalar(tag, tensor, collections=collections)
+
+  # Negative log joint plus any registered regularization losses.
+  return -p_log_prob + reg_penalty
diff --git a/edward/inferences/metropolis_hastings.py b/edward/inferences/metropolis_hastings.py
index fc3259774..939c1b60e 100644
--- a/edward/inferences/metropolis_hastings.py
+++ b/edward/inferences/metropolis_hastings.py
@@ -2,155 +2,167 @@
 from __future__ import division
 from __future__ import print_function
 
+import collections
 import six
 import tensorflow as tf
 
-from collections import OrderedDict
-from edward.inferences.monte_carlo import MonteCarlo
-from edward.models import RandomVariable
-from edward.util import check_latent_vars, copy
+from edward.inferences import docstrings as doc
+from edward.inferences.util import (
+    call_with_trace, make_optional_inputs, make_log_joint)
+from edward.models.core import call_with_manipulate
+from edward.models.random_variable import RandomVariable
+
+tfp = tf.contrib.bayesflow
+
+
+@doc.set_doc(
+    arg_model=doc.arg_model[:-1],
+    arg_align_latent=doc.arg_align_latent_monte_carlo[:-1],
+    args=(doc.arg_align_data +
+          doc.arg_current_state +
+          doc.arg_current_target_log_prob +
+          doc.arg_collections +
+          doc.arg_args_kwargs)[:-1],
+    returns=doc.return_samples,
+    notes_mcmc_programs=doc.notes_mcmc_programs,
+    notes_conditional_inference=doc.notes_conditional_inference)
+def metropolis_hastings(model,
+                        proposal,
+                        align_latent,
+                        align_proposal,
+                        align_data,
+                        current_state=None,
+                        current_target_log_prob=None,
+                        collections=None,
+                        *args, **kwargs):
+  """Metropolis-Hastings [@metropolis1953equation; @hastings1970monte].
 
+  MH draws a sample from `proposal` given the last sample. The
+  proposed sample is accepted with log-probability given by
 
-class MetropolisHastings(MonteCarlo):
-  """Metropolis-Hastings [@metropolis1953equation; @hastings1970monte].
+  $\\text{ratio} =
+        \log p(x, z^{\\text{new}}) - \log p(x, z^{\\text{old}}) -
+        \log g(z^{\\text{new}} \mid z^{\\text{old}}) +
+        \log g(z^{\\text{old}} \mid z^{\\text{new}})$
 
-  #### Notes
+  where $p$ is the model's joint density over observed and latent
+  variables, and $g$ is the proposal's density.
+
+  Args:
+  @{arg_model}
+    proposal: function whose inputs are each state. It returns a new
+      collection (Python list) of states given the inputs, $z'\sim
+      g(z' \mid z)$.
+  @{arg_align_latent}
+    align_proposal: maps a latent variable name to its name in `proposal`.
+  @{args}
 
-  In conditional inference, we infer $z$ in $p(z, \\beta
-  \mid x)$ while fixing inference over $\\beta$ using another
-  distribution $q(\\beta)$.
-  To calculate the acceptance ratio, `MetropolisHastings` uses an
-  estimate of the marginal density,
+  Returns:
+  @{returns}
 
-  $p(x, z) = \mathbb{E}_{q(\\beta)} [ p(x, z, \\beta) ]
-            \\approx p(x, z, \\beta^*)$
+  #### Notes
 
-  leveraging a single Monte Carlo sample, where $\\beta^* \sim
-  q(\\beta)$. This is unbiased (and therefore asymptotically exact as a
-  pseudo-marginal method) if $q(\\beta) = p(\\beta \mid x)$.
+  @{notes_mcmc_programs}
 
-  `MetropolisHastings` assumes the proposal distribution has the same
-  support as the prior. The `auto_transform` attribute in
-  the method `initialize()` is not applicable.
+  @{notes_conditional_inference}
 
   #### Examples
 
+  Consider the following setup.
   ```python
-  mu = Normal(loc=0.0, scale=1.0)
-  x = Normal(loc=mu, scale=1.0, sample_shape=10)
-
-  qmu = Empirical(tf.Variable(tf.zeros(500)))
-  proposal_mu = Normal(loc=mu, scale=0.5)
-  inference = ed.MetropolisHastings({mu: qmu}, {mu: proposal_mu},
-                                    data={x: np.zeros(10, dtype=np.float32)})
+  def model():
+    mu = Normal(loc=0.0, scale=1.0, name="mu")
+    x = Normal(loc=mu, scale=1.0, sample_shape=10, name="x")
+    return x
+
+  def proposal(mu):
+    proposal_mu = Normal(loc=mu, scale=0.5, name="proposal/mu")
+    return proposal_mu
+  ```
+  In graph mode, build `tf.Variable`s which are updated via the Markov
+  chain. The update op is fetched at runtime over many iterations.
+  ```python
+  qmu = tf.get_variable("qmu", initializer=1.)
+  next_state, _ = ed.metropolis_hastings(
+      model, proposal, current_state=qmu,
+      align_latent=lambda name: "qmu" if name == "mu" else None,
+      align_proposal=lambda name: "proposal/mu" if name == "mu" else None,
+      align_data=lambda name: "x_data" if name == "x" else None,
+      x_data=x_data)
+  qmu_update = qmu.assign(next_state)
+  ```
+  In eager mode, call the function at runtime, updating its inputs
+  such as `current_state`.
+  ```python
+  qmu = 1.
+  next_log_prob = None
+  for _ in range(1000):
+    next_state, next_log_prob = ed.metropolis_hastings(
+        model, proposal,
+        current_state=qmu,
+        align_latent=lambda name: "qmu" if name == "mu" else None,
+        align_proposal=lambda name: "proposal/mu" if name == "mu" else None,
+        align_data=lambda name: "x_data" if name == "x" else None,
+        current_target_log_prob=next_log_prob,
+        x_data=x_data)
+    qmu = next_state
   ```
   """
-  def __init__(self, latent_vars, proposal_vars, data=None):
-    """Create an inference algorithm.
-
-    Args:
-      proposal_vars: dict of RandomVariable to RandomVariable.
-        Collection of random variables to perform inference on; each is
-        binded to a proposal distribution $g(z' \mid z)$.
-    """
-    check_latent_vars(proposal_vars)
-    self.proposal_vars = proposal_vars
-    super(MetropolisHastings, self).__init__(latent_vars, data)
-
-  def initialize(self, *args, **kwargs):
-    kwargs['auto_transform'] = False
-    return super(MetropolisHastings, self).initialize(*args, **kwargs)
-
-  def build_update(self):
-    """Draw sample from proposal conditional on last sample. Then
-    accept or reject the sample based on the ratio,
-
-    $\\text{ratio} =
-          \log p(x, z^{\\text{new}}) - \log p(x, z^{\\text{old}}) -
-          \log g(z^{\\text{new}} \mid z^{\\text{old}}) +
-          \log g(z^{\\text{old}} \mid z^{\\text{new}})$
-
-    #### Notes
+  def _proposal_fn(*fargs):
+    """Takes inputted states and returns (proposed states, log Hastings ratio).
 
-    The updates assume each Empirical random variable is directly
-    parameterized by `tf.Variable`s.
+    This implementation doesn't let `proposal` take `*args, **kwargs` as
+    input (i.e., it cannot be amortized). We also assume `proposal`
+    returns states in the same number and order as the inputted states.
     """
-    old_sample = {z: tf.gather(qz.params, tf.maximum(self.t - 1, 0))
-                  for z, qz in six.iteritems(self.latent_vars)}
-    old_sample = OrderedDict(old_sample)
-
-    # Form dictionary in order to replace conditioning on prior or
-    # observed variable with conditioning on a specific value.
-    dict_swap = {}
-    for x, qx in six.iteritems(self.data):
-      if isinstance(x, RandomVariable):
-        if isinstance(qx, RandomVariable):
-          qx_copy = copy(qx, scope='conditional')
-          dict_swap[x] = qx_copy.value()
-        else:
-          dict_swap[x] = qx
-
-    dict_swap_old = dict_swap.copy()
-    dict_swap_old.update(old_sample)
-    base_scope = tf.get_default_graph().unique_name("inference") + '/'
-    scope_old = base_scope + 'old'
-    scope_new = base_scope + 'new'
-
-    # Draw proposed sample and calculate acceptance ratio.
-    new_sample = old_sample.copy()  # copy to ensure same order
-    ratio = 0.0
-    for z, proposal_z in six.iteritems(self.proposal_vars):
-      # Build proposal g(znew | zold).
-      proposal_znew = copy(proposal_z, dict_swap_old, scope=scope_old)
-      # Sample znew ~ g(znew | zold).
-      new_sample[z] = proposal_znew.value()
-      # Increment ratio.
-      ratio -= tf.reduce_sum(proposal_znew.log_prob(new_sample[z]))
-
-    dict_swap_new = dict_swap.copy()
-    dict_swap_new.update(new_sample)
-
-    for z, proposal_z in six.iteritems(self.proposal_vars):
-      # Build proposal g(zold | znew).
-      proposal_zold = copy(proposal_z, dict_swap_new, scope=scope_new)
-      # Increment ratio.
-      ratio += tf.reduce_sum(proposal_zold.log_prob(dict_swap_old[z]))
-
-    for z in six.iterkeys(self.latent_vars):
-      # Build priors p(znew) and p(zold).
-      znew = copy(z, dict_swap_new, scope=scope_new)
-      zold = copy(z, dict_swap_old, scope=scope_old)
-      # Increment ratio.
-      ratio += tf.reduce_sum(znew.log_prob(dict_swap_new[z]))
-      ratio -= tf.reduce_sum(zold.log_prob(dict_swap_old[z]))
-
-    for x in six.iterkeys(self.data):
-      if isinstance(x, RandomVariable):
-        # Build likelihoods p(x | znew) and p(x | zold).
-        x_znew = copy(x, dict_swap_new, scope=scope_new)
-        x_zold = copy(x, dict_swap_old, scope=scope_old)
-        # Increment ratio.
-        ratio += tf.reduce_sum(x_znew.log_prob(dict_swap[x]))
-        ratio -= tf.reduce_sum(x_zold.log_prob(dict_swap[x]))
-
-    # Accept or reject sample.
-    u = tf.random_uniform([], dtype=ratio.dtype)
-    accept = tf.log(u) < ratio
-    sample_values = tf.cond(accept, lambda: list(six.itervalues(new_sample)),
-                            lambda: list(six.itervalues(old_sample)))
-    if not isinstance(sample_values, list):
-      # `tf.cond` returns tf.Tensor if output is a list of size 1.
-      sample_values = [sample_values]
-
-    sample = {z: sample_value for z, sample_value in
-              zip(six.iterkeys(new_sample), sample_values)}
-
-    # Update Empirical random variables.
-    assign_ops = []
-    for z, qz in six.iteritems(self.latent_vars):
-      variable = qz.get_variables()[0]
-      assign_ops.append(tf.scatter_update(variable, self.t, sample[z]))
-
-    # Increment n_accept (if accepted).
-    assign_ops.append(self.n_accept.assign_add(tf.where(accept, 1, 0)))
-    return tf.group(*assign_ops)
+    global inverse_align_latent  # NOTE(review): not assigned in this diff.
+    # Build g(new | old): trace `proposal` called on the old states (`fargs`).
+    new_trace = call_with_trace(proposal, *fargs)
+    new_states = []
+    old_proposal_trace = {}
+    for state, farg in zip(states, fargs):
+      name = state.name.split(':')[0]  # keep the text before the first ':'
+      new_state = new_trace[align_proposal(inverse_align_latent[name])]
+      old_proposal_trace[new_state.name.split(':')[0]] = farg
+      new_states.append(new_state)
+    # Build g(old | new): re-run `proposal` on the new states with each
+    old_trace = call_with_trace_and_intercept(  # `value` set to the old state.
+        proposal,
+        old_proposal_trace,
+        lambda name: name if name in old_proposal_trace else None,
+        *new_states)
+    old_states = []
+    for state, farg in zip(states, fargs):
+      name = state.name.split(':')[0]
+      old_state = old_trace[align_proposal(inverse_align_latent[name])]
+      old_states.append(old_state)
+    # Compute log g(old | new) - log g(new | old), summed over all states.
+    log_hastings_ratio = 0.0
+    for old_state, new_state in zip(old_states, new_states):
+      log_hastings_ratio += tf.reduce_sum(old_state.log_prob(old_state.value))
+      log_hastings_ratio -= tf.reduce_sum(new_state.log_prob(new_state.value))
+    return new_states, log_hastings_ratio
+
+  maybe_list = lambda x: list(x) if isinstance(x, (tuple, list)) else [x]
+  states = maybe_list(current_state)
+  out = tfp.metropolis_hastings.kernel(
+      target_log_prob_fn=make_log_joint(model, current_state),
+      proposal_fn=_proposal_fn,
+      current_state=current_state,
+      current_target_log_prob=current_target_log_prob)
+  return out
+
+
+def call_with_trace_and_intercept(f, trace, align_latent, *args, **kwargs):
+  """Calls `f`, recording each built object and overriding aligned values."""
+  recorded = collections.OrderedDict()
+  def manipulate(cls_init, self, *fargs, **fkwargs):
+    rv_name = fkwargs.get('name')
+    override = trace.get(align_latent(rv_name))
+    if override is not None:
+      # Pass the aligned `trace` entry as the object's `value` kwarg.
+      fkwargs['value'] = tf.convert_to_tensor(override)
+    cls_init(self, *fargs, **fkwargs)
+    recorded[rv_name] = self
+  call_with_manipulate(make_optional_inputs(f), manipulate, *args, **kwargs)
+  return recorded
diff --git a/edward/inferences/monte_carlo.py b/edward/inferences/monte_carlo.py
deleted file mode 100644
index da22cb285..000000000
--- a/edward/inferences/monte_carlo.py
+++ /dev/null
@@ -1,169 +0,0 @@
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import abc
-import numpy as np
-import six
-import tensorflow as tf
-
-from edward.inferences.inference import Inference
-from edward.models import Empirical, RandomVariable
-from edward.util import get_session
-
-
-@six.add_metaclass(abc.ABCMeta)
-class MonteCarlo(Inference):
-  """Abstract base class for Monte Carlo. Specific Monte Carlo methods
-  inherit from `MonteCarlo`, sharing methods in this class.
-
-  To build an algorithm inheriting from `MonteCarlo`, one must at the
-  minimum implement `build_update`: it determines how to assign
-  the samples in the `Empirical` approximations.
-
-  #### Notes
-
-  The number of Monte Carlo iterations is set according to the
-  minimum of all `Empirical` sizes.
-
-  Initialization is assumed from `params[0, :]`. This generalizes
-  initializing randomly and initializing from user input. Updates
-  are along this outer dimension, where iteration t updates
-  `params[t, :]` in each `Empirical` random variable.
-
-  No warm-up is implemented. Users must run MCMC for a long period
-  of time, then manually burn in the Empirical random variable.
-
-  #### Examples
-
-  Most explicitly, `MonteCarlo` is specified via a dictionary:
-
-  ```python
-  qpi = Empirical(params=tf.Variable(tf.zeros([T, K-1])))
-  qmu = Empirical(params=tf.Variable(tf.zeros([T, K*D])))
-  qsigma = Empirical(params=tf.Variable(tf.zeros([T, K*D])))
-  ed.MonteCarlo({pi: qpi, mu: qmu, sigma: qsigma}, data)
-  ```
-
-  The inferred posterior is comprised of `Empirical` random
-  variables with `T` samples. We also automate the specification
-  of `Empirical` random variables. One can pass in a list of
-  latent variables instead:
-
-  ```python
-  ed.MonteCarlo([beta], data)
-  ed.MonteCarlo([pi, mu, sigma], data)
-  ```
-
-  It defaults to `Empirical` random variables with 10,000 samples for
-  each dimension.
-  """
-  def __init__(self, latent_vars=None, data=None):
-    """Create an inference algorithm.
-
-    Args:
-      latent_vars: list or dict.
-        Collection of random variables (of type `RandomVariable` or
-        `tf.Tensor`) to perform inference on. If list, each random
-        variable will be approximated using a `Empirical` random
-        variable that is defined internally (with unconstrained
-        support). If dictionary, each value in the dictionary must be a
-        `Empirical` random variable.
-      data: dict.
-        Data dictionary which binds observed variables (of type
-        `RandomVariable` or `tf.Tensor`) to their realizations (of
-        type `tf.Tensor`). It can also bind placeholders (of type
-        `tf.Tensor`) used in the model to their realizations.
-    """
-    if isinstance(latent_vars, list):
-      with tf.variable_scope(None, default_name="posterior"):
-        latent_vars = {z: Empirical(params=tf.Variable(tf.zeros(
-            [1e4] + z.batch_shape.concatenate(z.event_shape).as_list())))
-            for z in latent_vars}
-    elif isinstance(latent_vars, dict):
-      for qz in six.itervalues(latent_vars):
-        if not isinstance(qz, Empirical):
-          raise TypeError("Posterior approximation must consist of only "
-                          "Empirical random variables.")
-        elif len(qz.sample_shape) != 0:
-          raise ValueError("Empirical posterior approximations must have "
-                           "a scalar sample shape.")
-
-    super(MonteCarlo, self).__init__(latent_vars, data)
-
-  def initialize(self, *args, **kwargs):
-    kwargs['n_iter'] = np.amin([qz.params.shape.as_list()[0] for
-                                qz in six.itervalues(self.latent_vars)])
-    super(MonteCarlo, self).initialize(*args, **kwargs)
-
-    self.n_accept = tf.Variable(0, trainable=False, name="n_accept")
-    self.n_accept_over_t = self.n_accept / self.t
-    self.train = self.build_update()
-
-    self.reset.append(tf.variables_initializer([self.n_accept]))
-
-    if self.logging:
-      tf.summary.scalar("n_accept", self.n_accept,
-                        collections=[self._summary_key])
-      self.summarize = tf.summary.merge_all(key=self._summary_key)
-
-  def update(self, feed_dict=None):
-    """Run one iteration of sampling.
-
-    Args:
-      feed_dict: dict.
-        Feed dictionary for a TensorFlow session run. It is used to feed
-        placeholders that are not fed during initialization.
-
-    Returns:
-      dict.
-      Dictionary of algorithm-specific information. In this case, the
-      acceptance rate of samples since (and including) this iteration.
-
-    #### Notes
-
-    We run the increment of `t` separately from other ops. Whether the
-    others op run with the `t` before incrementing or after incrementing
-    depends on which is run faster in the TensorFlow graph. Running it
-    separately forces a consistent behavior.
-    """
-    if feed_dict is None:
-      feed_dict = {}
-
-    for key, value in six.iteritems(self.data):
-      if isinstance(key, tf.Tensor) and "Placeholder" in key.op.type:
-        feed_dict[key] = value
-
-    sess = get_session()
-    _, accept_rate = sess.run([self.train, self.n_accept_over_t], feed_dict)
-    t = sess.run(self.increment_t)
-
-    if self.debug:
-      sess.run(self.op_check, feed_dict)
-
-    if self.logging and self.n_print != 0:
-      if t == 1 or t % self.n_print == 0:
-        summary = sess.run(self.summarize, feed_dict)
-        self.train_writer.add_summary(summary, t)
-
-    return {'t': t, 'accept_rate': accept_rate}
-
-  def print_progress(self, info_dict):
-    """Print progress to output.
-    """
-    if self.n_print != 0:
-      t = info_dict['t']
-      if t == 1 or t % self.n_print == 0:
-        self.progbar.update(t, {'Acceptance Rate': info_dict['accept_rate']})
-
-  @abc.abstractmethod
-  def build_update(self):
-    """Build update rules, returning an assign op for parameters in
-    the `Empirical` random variables.
-
-    Any derived class of `MonteCarlo` **must** implement this method.
-
-    Raises:
-      NotImplementedError.
-    """
-    raise NotImplementedError()
diff --git a/edward/inferences/sghmc.py b/edward/inferences/sghmc.py
index d69dc7a0a..801ab054e 100644
--- a/edward/inferences/sghmc.py
+++ b/edward/inferences/sghmc.py
@@ -5,126 +5,227 @@
 import six
 import tensorflow as tf
 
-from edward.inferences.monte_carlo import MonteCarlo
-from edward.models import RandomVariable, Empirical
-from edward.util import copy
-
-
-class SGHMC(MonteCarlo):
+from edward.inferences import docstrings as doc
+from edward.inferences.util import make_log_joint
+
+
+@doc.set_doc(
+    args_part_one=(doc.arg_model +
+                   doc.arg_align_latent_monte_carlo +
+                   doc.arg_align_data +
+                   doc.arg_current_state +
+                   doc.arg_step_size)[:-1],
+    args_part_two=(doc.arg_current_target_log_prob +
+                   doc.arg_current_grads_target_log_prob +
+                   doc.arg_auto_transform +
+                   doc.arg_collections +
+                   doc.arg_args_kwargs)[:-1],
+    returns=doc.return_samples,
+    notes_mcmc_programs=doc.notes_mcmc_programs,
+    notes_conditional_inference=doc.notes_conditional_inference)
+def sghmc(model,
+          align_latent,
+          align_data,
+          # current_state=None,  # TODO kwarg before arg
+          current_state,
+          momentum,
+          momentum_state,
+          learning_rate,
+          friction=0.1,
+          preconditioner_decay_rate=0.95,
+          num_pseudo_batches=1,
+          diagonal_bias=1e-8,
+          target_log_prob=None,
+          grads_target_log_prob=None,
+          auto_transform=True,
+          collections=None,
+          *args, **kwargs):
   """Stochastic gradient Hamiltonian Monte Carlo [@chen2014stochastic].
 
-  #### Notes
+  SGHMC simulates Hamiltonian dynamics with friction using a discretized
+  integrator. Its discretization error goes to zero as the learning
+  rate decreases. Namely, it implements the update equations from (15)
+  of @chen2014stochastic.
+
+  This function implements an adaptive mass matrix using RMSProp.
+  Namely, it uses the update from pre-conditioned SGLD
+  [@li2016preconditioned] extended to second-order Langevin dynamics
+  (SGHMC): the preconditioner is equal to the inverse of the mass
+  matrix [@chen2014stochastic].
+
+  Works for any probabilistic program whose latent variables of
+  interest are differentiable. If `auto_transform=True`, the latent
+  variables may exist on any constrained differentiable support.
+
+  Args:
+  @{args_part_one}
+    friction: float.
+      Constant scale on the friction term in the Hamiltonian system.
+      The implementation may be extended in the future to enable a
+      friction per random variable (`friction` would be a callable).
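+    momentum: moment estimate(s) used by the RMSProp preconditioner.
+    momentum_state: auxiliary momentum(s) for the state(s).
+    learning_rate: step size of the discretized integrator.
+    preconditioner_decay_rate: float. Decay rate of the RMSProp moment
+      estimate.
+    num_pseudo_batches: multiplier applied to the log-density gradient.
+    diagonal_bias: float. Added to `momentum` inside the preconditioner.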
+  @{args_part_two}
+
+  Returns:
+  @{returns}
 
-  In conditional inference, we infer $z$ in $p(z, \\beta
-  \mid x)$ while fixing inference over $\\beta$ using another
-  distribution $q(\\beta)$.
-  `SGHMC` substitutes the model's log marginal density
+  #### Notes
 
-  $\log p(x, z) = \log \mathbb{E}_{q(\\beta)} [ p(x, z, \\beta) ]
-                \\approx \log p(x, z, \\beta^*)$
+  @{notes_mcmc_programs}
 
-  leveraging a single Monte Carlo sample, where $\\beta^* \sim
-  q(\\beta)$. This is unbiased (and therefore asymptotically exact as a
-  pseudo-marginal method) if $q(\\beta) = p(\\beta \mid x)$.
+  @{notes_conditional_inference}
 
   #### Examples
 
+  Consider the following setup.
   ```python
-  mu = Normal(loc=0.0, scale=1.0)
-  x = Normal(loc=mu, scale=1.0, sample_shape=10)
-
-  qmu = Empirical(tf.Variable(tf.zeros(500)))
-  inference = ed.SGHMC({mu: qmu}, {x: np.zeros(10, dtype=np.float32)})
+  def model():
+    mu = Normal(loc=0.0, scale=1.0, name="mu")
+    x = Normal(loc=mu, scale=1.0, sample_shape=10, name="x")
+    return x
   ```
+  In graph mode, build `tf.Variable`s which are updated via the Markov
+  chain. The update op is fetched at runtime over many iterations.
+  ```python
+  qmu = tf.get_variable("qmu", initializer=1.)
+  qmu_mom = tf.get_variable("qmu_mom", initializer=0.)
+  qmu_mom_state = tf.get_variable("qmu_mom_state", initializer=0.)
+  next_state, next_momentum, next_momentum_state = ed.sghmc(
+      model,
+      ...,
+      current_state=qmu,
+      momentum=qmu_mom,
+      momentum_state=qmu_mom_state,
+      align_latent=lambda name: "qmu" if name == "mu" else None,
+      align_data=lambda name: "x_data" if name == "x" else None,
+      x_data=x_data)
+  qmu_update = qmu.assign(next_state)
+  qmu_mom_update = qmu_mom.assign(next_momentum)
+  qmu_mom_state_update = qmu_mom_state.assign(next_momentum_state)
+  ```
+  In eager mode, call the function at runtime, updating its inputs
+  such as `current_state`.
+  ```python
+  qmu = 1.
+  qmu_mom = None
+  qmu_mom_state = None
+  for _ in range(1000):
+    next_state, next_momentum, next_momentum_state = ed.sghmc(
+        model,
+        ...,
+        current_state=qmu,
+        momentum=qmu_mom,
+        momentum_state=qmu_mom_state,
+        align_latent=lambda name: "qmu" if name == "mu" else None,
+        align_data=lambda name: "x_data" if name == "x" else None,
+        x_data=x_data)
+    qmu = next_state
+    qmu_mom = next_momentum
+    qmu_mom_state = next_momentum_state
+  ```
+  """
+  out = kernel(
+      target_log_prob_fn=make_log_joint(model, current_state),
+      current_state=current_state,
+      momentum=momentum,
+      momentum_state=momentum_state,
+      learning_rate=learning_rate,
+      frictions=friction,
+      preconditioner_decay_rate=preconditioner_decay_rate,
+      num_pseudo_batches=num_pseudo_batches,
+      diagonal_bias=diagonal_bias,
+      current_target_log_prob=target_log_prob,
+      current_grads_target_log_prob=grads_target_log_prob)
+  return out
+
+
+def kernel(target_log_prob_fn,
+           current_state,
+           momentum,
+           momentum_state,
+           learning_rate,
+           frictions=0.1,
+           preconditioner_decay_rate=0.95,
+           num_pseudo_batches=1,
+           diagonal_bias=1e-8,
+           current_target_log_prob=None,
+           current_grads_target_log_prob=None,
+           name=None):
+  """Pre-conditioned SGHMC.
+
+  Args:
+    ...
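+    momentum: Moment estimates for the RMSProp preconditioner.
+    momentum_state: Auxiliary momentums for states (distinct from
+      `momentum`, which belongs to the RMSProp preconditioner).
+    learning_rate: From tf.contrib.bayesflow.SGLDOptimizer.
+    frictions: Constant scale on the friction term.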
+    preconditioner_decay_rate: From tf.contrib.bayesflow.SGLDOptimizer.
+    num_pseudo_batches: From tf.contrib.bayesflow.SGLDOptimizer.
+    diagonal_bias: From tf.contrib.bayesflow.SGLDOptimizer.
+    ...
   """
-  def __init__(self, *args, **kwargs):
-    super(SGHMC, self).__init__(*args, **kwargs)
-
-  def initialize(self, step_size=0.25, friction=0.1, *args, **kwargs):
-    """Initialize inference algorithm.
-
-    Args:
-      step_size: float.
-        Constant scale factor of learning rate.
-      friction: float.
-        Constant scale on the friction term in the Hamiltonian system.
-    """
-    self.step_size = step_size
-    self.friction = friction
-    self.v = {z: tf.Variable(tf.zeros(qz.params.shape[1:], dtype=qz.dtype))
-              for z, qz in six.iteritems(self.latent_vars)}
-    return super(SGHMC, self).initialize(*args, **kwargs)
-
-  def build_update(self):
-    """Simulate Hamiltonian dynamics with friction using a discretized
-    integrator. Its discretization error goes to zero as the learning
-    rate decreases.
-
-    Implements the update equations from (15) of @chen2014stochastic.
-    """
-    old_sample = {z: tf.gather(qz.params, tf.maximum(self.t - 1, 0))
-                  for z, qz in six.iteritems(self.latent_vars)}
-    old_v_sample = {z: v for z, v in six.iteritems(self.v)}
-
-    # Simulate Hamiltonian dynamics with friction.
-    learning_rate = self.step_size * 0.01
-    grad_log_joint = tf.gradients(self._log_joint(old_sample),
-                                  list(six.itervalues(old_sample)))
-
-    # v_sample is so named b/c it represents a velocity rather than momentum.
-    sample = {}
-    v_sample = {}
-    for z, grad_log_p in zip(six.iterkeys(old_sample), grad_log_joint):
-      qz = self.latent_vars[z]
-      event_shape = qz.event_shape
-      stddev = tf.sqrt(tf.cast(learning_rate * self.friction, qz.dtype))
-      normal = tf.random_normal(event_shape, dtype=qz.dtype)
-      sample[z] = old_sample[z] + old_v_sample[z]
-      v_sample[z] = ((1.0 - 0.5 * self.friction) * old_v_sample[z] +
-                     learning_rate * tf.convert_to_tensor(grad_log_p) +
-                     stddev * normal)
-
-    # Update Empirical random variables.
-    assign_ops = []
-    for z, qz in six.iteritems(self.latent_vars):
-      variable = qz.get_variables()[0]
-      assign_ops.append(tf.scatter_update(variable, self.t, sample[z]))
-      assign_ops.append(tf.assign(self.v[z], v_sample[z]).op)
-
-    # Increment n_accept.
-    assign_ops.append(self.n_accept.assign_add(1))
-    return tf.group(*assign_ops)
-
-  def _log_joint(self, z_sample):
-    """Utility function to calculate model's log joint density,
-    log p(x, z), for inputs z (and fixed data x).
-
-    Args:
-      z_sample: dict.
-        Latent variable keys to samples.
-    """
-    scope = tf.get_default_graph().unique_name("inference")
-    # Form dictionary in order to replace conditioning on prior or
-    # observed variable with conditioning on a specific value.
-    dict_swap = z_sample.copy()
-    for x, qx in six.iteritems(self.data):
-      if isinstance(x, RandomVariable):
-        if isinstance(qx, RandomVariable):
-          qx_copy = copy(qx, scope=scope)
-          dict_swap[x] = qx_copy.value()
-        else:
-          dict_swap[x] = qx
-
-    log_joint = 0.0
-    for z in six.iterkeys(self.latent_vars):
-      z_copy = copy(z, dict_swap, scope=scope)
-      log_joint += tf.reduce_sum(
-          self.scale.get(z, 1.0) * z_copy.log_prob(dict_swap[z]))
-
-    for x in six.iterkeys(self.data):
-      if isinstance(x, RandomVariable):
-        x_copy = copy(x, dict_swap, scope=scope)
-        log_joint += tf.reduce_sum(
-            self.scale.get(x, 1.0) * x_copy.log_prob(dict_swap[x]))
-
-    return log_joint
+  is_list_like = lambda x: isinstance(x, (tuple, list))
+  maybe_list = lambda x: list(x) if is_list_like(x) else [x]
+  states = maybe_list(current_state)
+  momentums = maybe_list(momentum)
+  momentums_states = maybe_list(momentum_state)
+  with tf.name_scope(name, "sghmc_kernel", states):
+    with tf.name_scope("initialize"):
+      if current_target_log_prob is None:
+        current_target_log_prob = target_log_prob_fn(*states)
+      if current_grads_target_log_prob is None:
+        current_grads_target_log_prob = tf.gradients(
+            current_target_log_prob, states)
+    grads = maybe_list(current_grads_target_log_prob)
+
+    # Outputs are collected in fresh lists so inputs are never returned.
+    next_states = []
+    next_momentums_states = []
+    next_momentums = []
+    for state, mom, mom_state, grad in zip(
+        states, momentums, momentums_states, grads):
+      # Each state uses its own auxiliary momentum and the `frictions` arg.
+      state_update, mom_state_update = _apply_noisy_update(
+          mom, grad, learning_rate, frictions, mom_state,
+          diagonal_bias, num_pseudo_batches)
+      next_states.append(state + learning_rate * state_update)
+      # TODO doesn't this scale the noise incorrectly by additional
+      # learning_rate during the update? (same in sgld_optimizer)
+      next_momentums_states.append(
+          mom_state + learning_rate * mom_state_update)
+      # RMSProp-style moving average of the squared gradient.
+      next_momentums.append(
+          mom + (1.0 - preconditioner_decay_rate) * (tf.square(grad) - mom))
+
+    # Unwrap singletons only when `current_state` was passed bare.
+    maybe_flatten = lambda x: x if is_list_like(current_state) else x[0]
+    # Order: next state, next auxiliary momentum, next RMSProp moment.
+    return [maybe_flatten(x) for x in
+            (next_states, next_momentums_states, next_momentums)]
+
+
+def _apply_noisy_update(mom, grad, learning_rate,
+                        friction, mom_state,
+                        diagonal_bias, num_pseudo_batches):
+  """Returns `(state_update, mom_state_update)` for one noisy step.
+
+  Adapted from tf.contrib.bayesflow.SGLDOptimizer._apply_noisy_update.
+  """
+  from tensorflow.python.ops import array_ops, math_ops, random_ops
+  dtype = grad.dtype
+  stddev = math_ops.cast(math_ops.rsqrt(2 * learning_rate * friction), dtype)
+  preconditioner = math_ops.rsqrt(mom + math_ops.cast(diagonal_bias, dtype))
+  # Diagonal preconditioner: scale elementwise (tf.matmul needs rank >= 2).
+  state_update = preconditioner * mom_state
+  # NOTE(review): 1.0 is the `mean` argument here, as in SGLDOptimizer.
+  noise = random_ops.random_normal(array_ops.shape(grad), 1.0, dtype=dtype)
+  mom_state_update = (-grad * math_ops.cast(num_pseudo_batches, dtype) +
+                      friction * state_update + noise * stddev)
+  return state_update, mom_state_update
diff --git a/edward/inferences/sgld.py b/edward/inferences/sgld.py
index bcd7027da..5e7140a5f 100644
--- a/edward/inferences/sgld.py
+++ b/edward/inferences/sgld.py
@@ -5,117 +5,121 @@
 import six
 import tensorflow as tf
 
-from edward.inferences.monte_carlo import MonteCarlo
-from edward.models import RandomVariable
-from edward.util import copy
+from edward.inferences import docstrings as doc
+from edward.inferences.util import make_log_joint
+
+tfp = tf.contrib.bayesflow
+
+
+@doc.set_doc(
+    args_part_one=(doc.arg_model +
+                   doc.arg_align_latent_monte_carlo +
+                   doc.arg_align_data +
+                   doc.arg_current_state)[:-1],
+    args_part_two=(doc.arg_current_target_log_prob +
+                   doc.arg_current_grads_target_log_prob +
+                   doc.arg_auto_transform +
+                   doc.arg_collections +
+                   doc.arg_args_kwargs)[:-1],
+    returns=doc.return_samples,
+    notes_mcmc_programs=doc.notes_mcmc_programs,
+    notes_conditional_inference=doc.notes_conditional_inference)
+def sgld(model,
+         align_latent,
+         align_data,
+         # current_state=None,  # TODO kwarg before arg
+         current_state,
+         momentum,
+         learning_rate,
+         preconditioner_decay_rate=0.95,
+         num_pseudo_batches=1,
+         diagonal_bias=1e-8,
+         target_log_prob=None,
+         grads_target_log_prob=None,
+         auto_transform=True,
+         collections=None,
+         *args, **kwargs):
+  """Stochastic gradient Langevin dynamics [@welling2011bayesian].
 
+  SGLD simulates Langevin dynamics using a discretized integrator. Its
+  discretization error goes to zero as the learning rate decreases.
 
-class SGLD(MonteCarlo):
-  """Stochastic gradient Langevin dynamics [@welling2011bayesian].
+  This function implements an adaptive preconditioner using RMSProp
+  [@li2016preconditioned].
 
-  #### Notes
+  Works for any probabilistic program whose latent variables of
+  interest are differentiable. If `auto_transform=True`, the latent
+  variables may exist on any constrained differentiable support.
 
-  In conditional inference, we infer $z$ in $p(z, \\beta
-  \mid x)$ while fixing inference over $\\beta$ using another
-  distribution $q(\\beta)$.
-  `SGLD` substitutes the model's log marginal density
+  Args:
+  @{args_part_one}
+    momentum:
+    learning_rate:
+    preconditioner_decay_rate:
+    num_pseudo_batches:
+    diagonal_bias:
+  @{args_part_two}
+
+  Returns:
+  @{returns}
+
+  #### Notes
 
-  $\log p(x, z) = \log \mathbb{E}_{q(\\beta)} [ p(x, z, \\beta) ]
-                \\approx \log p(x, z, \\beta^*)$
+  @{notes_mcmc_programs}
 
-  leveraging a single Monte Carlo sample, where $\\beta^* \sim
-  q(\\beta)$. This is unbiased (and therefore asymptotically exact as a
-  pseudo-marginal method) if $q(\\beta) = p(\\beta \mid x)$.
+  @{notes_conditional_inference}
 
   #### Examples
 
+  Consider the following setup.
   ```python
-  mu = Normal(loc=0.0, scale=1.0)
-  x = Normal(loc=mu, scale=1.0, sample_shape=10)
-
-  qmu = Empirical(tf.Variable(tf.zeros(500)))
-  inference = ed.SGLD({mu: qmu}, {x: np.zeros(10, dtype=np.float32)})
+  def model():
+    mu = Normal(loc=0.0, scale=1.0, name="mu")
+    x = Normal(loc=mu, scale=1.0, sample_shape=10, name="x")
+    return x
+  ```
+  In graph mode, build `tf.Variable`s which are updated via the Markov
+  chain. The update op is fetched at runtime over many iterations.
+  ```python
+  qmu = tf.get_variable("qmu", initializer=1.)
+  qmu_mom = tf.get_variable("qmu_mom", initializer=0.)
+  next_state, next_momentum = ed.sgld(
+      model,
+      ...,
+      current_state=qmu,
+      momentum=qmu_mom,
+      align_latent=lambda name: "qmu" if name == "mu" else None,
+      align_data=lambda name: "x_data" if name == "x" else None,
+      x_data=x_data)
+  qmu_update = qmu.assign(next_state)
+  qmu_mom_update = qmu_mom.assign(next_momentum)
+  ```
+  In eager mode, call the function at runtime, updating its inputs
+  such as `current_state`.
+  ```python
+  qmu = 1.
+  qmu_mom = None
+  for _ in range(1000):
+    next_state, next_momentum = ed.sgld(
+        model,
+        ...,
+        current_state=qmu,
+        momentum=qmu_mom,
+        align_latent=lambda name: "qmu" if name == "mu" else None,
+        align_data=lambda name: "x_data" if name == "x" else None,
+        x_data=x_data)
+    qmu = next_state
+    qmu_mom = next_momentum
   ```
   """
-  def __init__(self, *args, **kwargs):
-    super(SGLD, self).__init__(*args, **kwargs)
-
-  def initialize(self, step_size=0.25, *args, **kwargs):
-    """
-    Args:
-      step_size: float.
-        Constant scale factor of learning rate.
-    """
-    self.step_size = step_size
-    return super(SGLD, self).initialize(*args, **kwargs)
-
-  def build_update(self):
-    """Simulate Langevin dynamics using a discretized integrator. Its
-    discretization error goes to zero as the learning rate decreases.
-
-    #### Notes
-
-    The updates assume each Empirical random variable is directly
-    parameterized by `tf.Variable`s.
-    """
-    old_sample = {z: tf.gather(qz.params, tf.maximum(self.t - 1, 0))
-                  for z, qz in six.iteritems(self.latent_vars)}
-
-    # Simulate Langevin dynamics.
-    learning_rate = self.step_size / tf.pow(
-        tf.cast(self.t + 1, list(six.iterkeys(old_sample))[0].dtype), 0.55)
-    grad_log_joint = tf.gradients(self._log_joint(old_sample),
-                                  list(six.itervalues(old_sample)))
-    sample = {}
-    for z, grad_log_p in zip(six.iterkeys(old_sample), grad_log_joint):
-      qz = self.latent_vars[z]
-      event_shape = qz.event_shape
-      stddev = tf.sqrt(tf.cast(learning_rate, qz.dtype))
-      normal = tf.random_normal(event_shape, dtype=qz.dtype)
-      sample[z] = (old_sample[z] +
-                   0.5 * learning_rate * tf.convert_to_tensor(grad_log_p) +
-                   stddev * normal)
-
-    # Update Empirical random variables.
-    assign_ops = []
-    for z, qz in six.iteritems(self.latent_vars):
-      variable = qz.get_variables()[0]
-      assign_ops.append(tf.scatter_update(variable, self.t, sample[z]))
-
-    # Increment n_accept.
-    assign_ops.append(self.n_accept.assign_add(1))
-    return tf.group(*assign_ops)
-
-  def _log_joint(self, z_sample):
-    """Utility function to calculate model's log joint density,
-    log p(x, z), for inputs z (and fixed data x).
-
-    Args:
-      z_sample: dict.
-        Latent variable keys to samples.
-    """
-    scope = tf.get_default_graph().unique_name("inference")
-    # Form dictionary in order to replace conditioning on prior or
-    # observed variable with conditioning on a specific value.
-    dict_swap = z_sample.copy()
-    for x, qx in six.iteritems(self.data):
-      if isinstance(x, RandomVariable):
-        if isinstance(qx, RandomVariable):
-          qx_copy = copy(qx, scope=scope)
-          dict_swap[x] = qx_copy.value()
-        else:
-          dict_swap[x] = qx
-
-    log_joint = 0.0
-    for z in six.iterkeys(self.latent_vars):
-      z_copy = copy(z, dict_swap, scope=scope)
-      log_joint += tf.reduce_sum(
-          self.scale.get(z, 1.0) * z_copy.log_prob(dict_swap[z]))
-
-    for x in six.iterkeys(self.data):
-      if isinstance(x, RandomVariable):
-        x_copy = copy(x, dict_swap, scope=scope)
-        log_joint += tf.reduce_sum(
-            self.scale.get(x, 1.0) * x_copy.log_prob(dict_swap[x]))
-
-    return log_joint
+  out = tfp.sgld.kernel(
+      target_log_prob_fn=make_log_joint(model, current_state),
+      current_state=current_state,
+      momentum=momentum,
+      learning_rate=learning_rate,
+      preconditioner_decay_rate=preconditioner_decay_rate,
+      num_pseudo_batches=num_pseudo_batches,
+      diagonal_bias=diagonal_bias,
+      current_target_log_prob=target_log_prob,
+      current_grads_target_log_prob=grads_target_log_prob)
+  return out
diff --git a/edward/inferences/util.py b/edward/inferences/util.py
new file mode 100644
index 000000000..a92e7f7a3
--- /dev/null
+++ b/edward/inferences/util.py
@@ -0,0 +1,253 @@
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import collections
+import inspect
+import operator
+import six
+import tensorflow as tf
+
+from edward.models.core import call_with_manipulate
+from edward.models.core import TransformedDistribution
+from edward.models.random_variable import RandomVariable
+
+tfb = tf.contrib.distributions.bijectors
+
+
+def call_with_trace(f, *args, **kwargs):
+  """Runs `f` and returns an ordered dict of what it built, keyed by name."""
+  trace = collections.OrderedDict()
+  def record(cls_init, self, *fargs, **fkwargs):
+    """Runs the wrapped initializer, then files `self` under its name."""
+    cls_init(self, *fargs, **fkwargs)
+    trace[self.name] = self
+  call_with_manipulate(make_optional_inputs(f), record, *args, **kwargs)
+  return trace
+
+
+def call_with_intercept(f, trace, align_data, align_latent,
+                        *args, **kwargs):
+  """Calls `f`, overriding the `value` of the primitives that are aligned.
+
+  Data comes from `args`/`kwargs`; latent values come from `trace`."""
+  def manipulate(f, *fargs, **fkwargs):
+    """Set model's sample values to variational distribution's and data."""
+    name = fkwargs.get('name', None)
+    key = align_data(name)
+    if isinstance(key, int):
+      fkwargs['value'] = args[key]
+    elif kwargs.get(key, None) is not None:
+      fkwargs['value'] = kwargs.get(key)
+    elif align_latent(name) is not None:
+      fkwargs['value'] = tf.convert_to_tensor(trace[align_latent(name)])
+    # TODO if auto_transform and a matching qz exists, return
+    # `transform(f, qz, *fargs, **fkwargs)`. For generation to work the
+    # original distribution must be kept, e.g. on another stack.
+    return f(*fargs, **fkwargs)
+  f = make_optional_inputs(f)
+  return call_with_manipulate(f, manipulate, *args, **kwargs)
+
+
+def make_log_joint(model, states, align_latent=None, align_data=None,
+                   *args, **kwargs):
+  """Factory to make a log-joint probability function (used in MCMC).
+
+  Returns the model's log-joint probability as a function of `states`.
+  By default each state aligns with the model variable sharing its name.
+  """
+  states = list(states) if isinstance(states, (tuple, list)) else [states]
+  names = [state.name.split(':')[0] for state in states]
+  if align_latent is None:
+    align_latent = lambda name: name if name in names else None
+  if align_data is None:
+    align_data = lambda name: None
+  def log_joint(*fargs):
+    """Target's unnormalized log-joint density as a function of states."""
+    x = call_with_intercept(model, dict(zip(names, fargs)),
+                            align_data, align_latent, *args, **kwargs)
+    return sum((tf.reduce_sum(rv.log_prob(rv.value)) for rv in toposort(x)
+                if align_latent(rv.name) is not None or
+                align_data(rv.name) is not None), 0.0)
+  return log_joint
+
+
+def make_optional_inputs(f):
+  """Wraps function to take in optional, unused args/kwargs.
+
+  Keyword arguments that `f` does not name are dropped, and surplus
+  positional arguments are truncated, before `f` is called.
+  """
+  # `inspect.getargspec` is absent on Python >= 3.11 and rejects
+  # annotated or keyword-only signatures; prefer `getfullargspec`.
+  getspec = getattr(inspect, "getfullargspec", None) or inspect.getargspec
+  def f_wrapped(*args, **kwargs):
+    # Templates from tf.make_template() keep the function in `_func`.
+    accepted = getspec(getattr(f, "_func", f)).args
+    fkwargs = {k: v for k, v in six.iteritems(kwargs) if k in accepted}
+    num_args = len(accepted) - len(fkwargs)
+    if num_args > 0:
+      return f(*args[:num_args], **fkwargs)
+    return f(**fkwargs)
+  f_wrapped.__name__ = getattr(f, '__name__', '[unknown name]')
+  f_wrapped.__doc__ = getattr(f, '__doc__', '')
+  return f_wrapped
+
+
+def toposort(end_node, parents=operator.methodcaller('get_parents')):
+  """Yields a DAG's nodes so that every node precedes its parents.
+
+  For any edge U -> V, V is yielded before U: the graph is walked
+  backwards from the end node(s), i.e., the "pull" dataflow model.
+
+  Args:
+    end_node: Input or list of inputs.
+  """
+  as_list = lambda x: list(x) if isinstance(x, (list, tuple)) else [x]
+  # Pass 1: count how often each node is reached from the end node(s).
+  pending_visits = {}
+  frontier = as_list(end_node)
+  while frontier:
+    current = frontier.pop()
+    if current in pending_visits:
+      pending_visits[current] += 1
+      continue
+    pending_visits[current] = 1
+    frontier.extend(parents(current))
+  # Pass 2: yield a node once every arrival counted above is used up.
+  ready = as_list(end_node)
+  while ready:
+    current = ready.pop()
+    yield current
+    for parent in parents(current):
+      pending_visits[parent] -= 1
+      if pending_visits[parent] == 0:
+        ready.append(parent)
+
+
+def transform(f, qz, *args, **kwargs):
+  """Transform prior -> unconstrained -> q's constraint.
+
+  In VI, q stays on its original space (so one intercepting function
+  suffices). NOTE(review): `_transform` is undefined in this module and
+  a later `def transform` rebinds this name; confirm intended names."""
+  # TODO deal with f or qz being 'point' or 'points'
+  if (not hasattr(f, 'support') or not hasattr(qz, 'support') or
+          f.support == qz.support):
+    return f(*args, **kwargs)
+  value = kwargs.pop('value')
+  kwargs['value'] = 0.0  # to avoid sampling; TODO follow sample shape
+  rv = f(*args, **kwargs)
+  # Take shortcuts in logic if p or q are already unconstrained.
+  if qz.support in ('real', 'multivariate_real'):
+    return _transform(rv, value=value)
+  if rv.support in ('real', 'multivariate_real'):
+    rv_unconstrained = rv
+  else:
+    rv_unconstrained = _transform(rv, value=0.0)
+  unconstrained_to_constrained = tfb.Invert(_transform(qz).bijector)
+  return _transform(rv_unconstrained,
+                    unconstrained_to_constrained,
+                    value=value)
+
+
+def transform(x, *args, **kwargs):
+  """Transform a continuous random variable to the unconstrained space.
+
+  `transform` selects among a number of default transformations which
+  depend on the support of the provided random variable:
+
+  + $[0, 1]$ (e.g., Beta): Inverse of sigmoid.
+  + $[0, \infty)$ (e.g., Gamma): Inverse of softplus.
+  + Simplex (e.g., Dirichlet): Inverse of softmax-centered.
+  + $(-\infty, \infty)$ (e.g., Normal, MultivariateNormalTriL): None.
+
+  Args:
+    x: RandomVariable.
+      Continuous random variable to transform.
+    *args, **kwargs:
+      Arguments to overwrite when forming the `TransformedDistribution`.
+      For example, manually specify the transformation by passing in
+      the `bijector` argument.
+
+  Returns:
+    RandomVariable.
+    A `TransformedDistribution` random variable, or the provided random
+    variable if no transformation was applied.
+
+  #### Examples
+
+  ```python
+  x = Gamma(1.0, 1.0)
+  y = ed.transform(x)
+  sess = tf.Session()
+  sess.run(y)
+  -2.2279539
+  ```
+  """
+  if len(args) != 0 or kwargs.get('bijector', None) is not None:
+    return TransformedDistribution(x, *args, **kwargs)
+
+  try:
+    support = x.support
+  except AttributeError as e:
+    msg = """'{}' object has no 'support'
+             so cannot be transformed.""".format(type(x).__name__)
+    raise AttributeError(msg)
+
+  if support == '01':
+    bij = tfb.Invert(tfb.Sigmoid())
+    new_support = 'real'
+  elif support == 'nonnegative':
+    bij = tfb.Invert(tfb.Softplus())
+    new_support = 'real'
+  elif support == 'simplex':
+    bij = tfb.Invert(tfb.SoftmaxCentered(event_ndims=1))
+    new_support = 'multivariate_real'
+  elif support in ('real', 'multivariate_real'):
+    return x
+  else:
+    msg = "'transform' does not handle supports of type '{}'".format(support)
+    raise ValueError(msg)
+
+  new_x = TransformedDistribution(x, bij, *args, **kwargs)
+  new_x.support = new_support
+  return new_x
+
+
+def get_control_variate_coef(f, h):
+  """Returns scalar used by control variates method for variance reduction in
+  Monte Carlo methods.
+
+  If we have a statistic $m$ as an unbiased estimator of $\mu$ and
+  another statistic $t$ which is an unbiased estimator of
+  $\\tau$ then $m^* = m + c(t - \\tau)$ is also an unbiased
+  estimator of $\mu$ for any coefficient $c$.
+
+  This function calculates the optimal coefficient
+
+  $c^* = \\frac{\\text{Cov}(m,t)}{\\text{Var}(t)}$
+
+  for minimizing the variance of $m^*$.
+
+  Args:
+    f: tf.Tensor.
+      A 1-D tensor.
+    h: tf.Tensor.
+      A 1-D tensor.
+
+  Returns:
+    tf.Tensor.
+    A rank-0 tensor: sample covariance of `f`, `h` over sample variance of `h`.
+  """
+  # Center each sample around its empirical mean.
+  f_centered = f - tf.reduce_mean(f)
+  h_centered = h - tf.reduce_mean(h)
+
+  # Normalize by n - 1, with n read from the static shape of `f`.
+  dof = f.shape[0].value - 1
+  sample_cov = tf.reduce_sum(f_centered * h_centered) / dof
+  sample_var = tf.reduce_sum(tf.square(h_centered)) / dof
+
+  # Optimal coefficient: Cov(f, h) / Var(h).
+  return sample_cov / sample_var
diff --git a/edward/inferences/variational_inference.py b/edward/inferences/variational_inference.py
deleted file mode 100644
index f6e8c244d..000000000
--- a/edward/inferences/variational_inference.py
+++ /dev/null
@@ -1,185 +0,0 @@
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import abc
-import numpy as np
-import six
-import tensorflow as tf
-
-from edward.inferences.inference import Inference
-from edward.models import RandomVariable
-from edward.util import get_session, get_variables
-
-
-@six.add_metaclass(abc.ABCMeta)
-class VariationalInference(Inference):
-  """Abstract base class for variational inference. Specific
-  variational inference methods inherit from `VariationalInference`,
-  sharing methods such as a default optimizer.
-
-  To build an algorithm inheriting from `VariationalInference`, one
-  must at the minimum implement `build_loss_and_gradients`: it
-  determines the loss function and gradients to apply for a given
-  optimizer.
-  """
-  def __init__(self, *args, **kwargs):
-    super(VariationalInference, self).__init__(*args, **kwargs)
-
-  def initialize(self, optimizer=None, var_list=None, use_prettytensor=False,
-                 global_step=None, *args, **kwargs):
-    """Initialize inference algorithm. It initializes hyperparameters
-    and builds ops for the algorithm's computation graph.
-
-    Args:
-      optimizer: str or tf.train.Optimizer.
-        A TensorFlow optimizer, to use for optimizing the variational
-        objective. Alternatively, one can pass in the name of a
-        TensorFlow optimizer, and default parameters for the optimizer
-        will be used.
-      var_list: list of tf.Variable.
-        List of TensorFlow variables to optimize over. Default is all
-        trainable variables that `latent_vars` and `data` depend on,
-        excluding those that are only used in conditionals in `data`.
-      use_prettytensor: bool.
-        `True` if aim to use PrettyTensor optimizer (when using
-        PrettyTensor) or `False` if aim to use TensorFlow optimizer.
-        Defaults to TensorFlow.
-      global_step: tf.Variable.
-        A TensorFlow variable to hold the global step.
-    """
-    super(VariationalInference, self).initialize(*args, **kwargs)
-
-    if var_list is None:
-      # Traverse random variable graphs to get default list of variables.
-      var_list = set()
-      trainables = tf.trainable_variables()
-      for z, qz in six.iteritems(self.latent_vars):
-        var_list.update(get_variables(z, collection=trainables))
-        var_list.update(get_variables(qz, collection=trainables))
-
-      for x, qx in six.iteritems(self.data):
-        if isinstance(x, RandomVariable) and \
-                not isinstance(qx, RandomVariable):
-          var_list.update(get_variables(x, collection=trainables))
-
-      var_list = list(var_list)
-
-    self.loss, grads_and_vars = self.build_loss_and_gradients(var_list)
-
-    if self.logging:
-      tf.summary.scalar("loss", self.loss, collections=[self._summary_key])
-      for grad, var in grads_and_vars:
-        # replace colons which are an invalid character
-        tf.summary.histogram("gradient/" +
-                             var.name.replace(':', '/'),
-                             grad, collections=[self._summary_key])
-        tf.summary.scalar("gradient_norm/" +
-                          var.name.replace(':', '/'),
-                          tf.norm(grad), collections=[self._summary_key])
-
-      self.summarize = tf.summary.merge_all(key=self._summary_key)
-
-    if optimizer is None and global_step is None:
-      # Default optimizer always uses a global step variable.
-      global_step = tf.Variable(0, trainable=False, name="global_step")
-
-    if isinstance(global_step, tf.Variable):
-      starter_learning_rate = 0.1
-      learning_rate = tf.train.exponential_decay(starter_learning_rate,
-                                                 global_step,
-                                                 100, 0.9, staircase=True)
-    else:
-      learning_rate = 0.01
-
-    # Build optimizer.
-    if optimizer is None:
-      optimizer = tf.train.AdamOptimizer(learning_rate)
-    elif isinstance(optimizer, str):
-      if optimizer == 'gradientdescent':
-        optimizer = tf.train.GradientDescentOptimizer(learning_rate)
-      elif optimizer == 'adadelta':
-        optimizer = tf.train.AdadeltaOptimizer(learning_rate)
-      elif optimizer == 'adagrad':
-        optimizer = tf.train.AdagradOptimizer(learning_rate)
-      elif optimizer == 'momentum':
-        optimizer = tf.train.MomentumOptimizer(learning_rate, 0.9)
-      elif optimizer == 'adam':
-        optimizer = tf.train.AdamOptimizer(learning_rate)
-      elif optimizer == 'ftrl':
-        optimizer = tf.train.FtrlOptimizer(learning_rate)
-      elif optimizer == 'rmsprop':
-        optimizer = tf.train.RMSPropOptimizer(learning_rate)
-      else:
-        raise ValueError('Optimizer class not found:', optimizer)
-    elif not isinstance(optimizer, tf.train.Optimizer):
-      raise TypeError("Optimizer must be str, tf.train.Optimizer, or None.")
-
-    with tf.variable_scope(None, default_name="optimizer") as scope:
-      if not use_prettytensor:
-        self.train = optimizer.apply_gradients(grads_and_vars,
-                                               global_step=global_step)
-      else:
-        import prettytensor as pt
-        # Note PrettyTensor optimizer does not accept manual updates;
-        # it autodiffs the loss directly.
-        self.train = pt.apply_optimizer(optimizer, losses=[self.loss],
-                                        global_step=global_step,
-                                        var_list=var_list)
-
-    self.reset.append(tf.variables_initializer(
-        tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=scope.name)))
-
-  def update(self, feed_dict=None):
-    """Run one iteration of optimization.
-
-    Args:
-      feed_dict: dict.
-        Feed dictionary for a TensorFlow session run. It is used to feed
-        placeholders that are not fed during initialization.
-
-    Returns:
-      dict.
-      Dictionary of algorithm-specific information. In this case, the
-      loss function value after one iteration.
-    """
-    if feed_dict is None:
-      feed_dict = {}
-
-    for key, value in six.iteritems(self.data):
-      if isinstance(key, tf.Tensor) and "Placeholder" in key.op.type:
-        feed_dict[key] = value
-
-    sess = get_session()
-    _, t, loss = sess.run([self.train, self.increment_t, self.loss], feed_dict)
-
-    if self.debug:
-      sess.run(self.op_check, feed_dict)
-
-    if self.logging and self.n_print != 0:
-      if t == 1 or t % self.n_print == 0:
-        summary = sess.run(self.summarize, feed_dict)
-        self.train_writer.add_summary(summary, t)
-
-    return {'t': t, 'loss': loss}
-
-  def print_progress(self, info_dict):
-    """Print progress to output.
-    """
-    if self.n_print != 0:
-      t = info_dict['t']
-      if t == 1 or t % self.n_print == 0:
-        self.progbar.update(t, {'Loss': info_dict['loss']})
-
-  @abc.abstractmethod
-  def build_loss_and_gradients(self, var_list):
-    """Build loss function and its gradients. They will be leveraged
-    in an optimizer to update the model and variational parameters.
-
-    Any derived class of `VariationalInference` **must** implement
-    this method.
-
-    Raises:
-      NotImplementedError.
-    """
-    raise NotImplementedError()
diff --git a/edward/inferences/wake_sleep.py b/edward/inferences/wake_sleep.py
index 5d4db6d3d..fad6e835a 100644
--- a/edward/inferences/wake_sleep.py
+++ b/edward/inferences/wake_sleep.py
@@ -5,12 +5,26 @@
 import six
 import tensorflow as tf
 
-from edward.inferences.variational_inference import VariationalInference
-from edward.models import RandomVariable
-from edward.util import copy, get_descendants
-
-
-class WakeSleep(VariationalInference):
+from edward.inferences import docstrings as doc
+from edward.inferences.util import (
+    call_with_intercept, call_with_trace, toposort)
+
+
+@doc.set_doc(
+    args_part_one=(doc.arg_model +
+                   doc.arg_variational +
+                   doc.arg_align_latent +
+                   doc.arg_align_data +
+                   doc.arg_scale +
+                   doc.arg_n_samples)[:-1],
+    args_part_two=(doc.arg_auto_transform +
+                   doc.arg_collections +
+                   doc.arg_args_kwargs)[:-1],
+    notes_conditional_inference=doc.notes_conditional_inference_samples,
+    notes_regularization_losses=doc.notes_regularization_losses)
+def wake_sleep(model, variational, align_latent, align_data,
+               scale=lambda name: 1.0, n_samples=1, phase_q='sleep',
+               auto_transform=True, collections=None, *args, **kwargs):
   """Wake-Sleep algorithm [@hinton1995wake].
 
   Given a probability model $p(x, z; \\theta)$ and variational
@@ -36,120 +50,100 @@ class WakeSleep(VariationalInference):
   corresponds to minimizing the reverse KL $\\text{KL}(p\|q)$ in
   expectation over the data distribution.
 
-  #### Notes
-
-  In conditional inference, we infer $z$ in $p(z, \\beta
-  \mid x)$ while fixing inference over $\\beta$ using another
-  distribution $q(\\beta)$. During gradient calculation, instead
-  of using the model's density
+  Args:
+  @{args_part_one}
+    phase_q: str.
+      Phase for updating parameters of q. If 'sleep', update using
+      a sample from p. If 'wake', update using a sample from q.
+      (Unlike reparameterization gradients, the sample is held
+      fixed.)
+  @{args_part_two}
 
-  $\log p(x, z^{(s)}), z^{(s)} \sim q(z; \lambda),$
+  Returns:
+    Pair of scalar tf.Tensors, representing losses for training p
+    and q respectively.
 
-  for each sample $s=1,\ldots,S$, `WakeSleep` uses
-
-  $\log p(x, z^{(s)}, \\beta^{(s)}),$
-
-  where $z^{(s)} \sim q(z; \lambda)$ and $\\beta^{(s)}
-  \sim q(\\beta)$.
+  #### Notes
 
-  The objective function also adds to itself a summation over all
-  tensors in the `REGULARIZATION_LOSSES` collection.
+  Probabilistic programs may have random variables which vary across
+  executions. The algorithm returns calculations following `n_samples`
+  executions of the model and variational programs.
+
+  @{notes_conditional_inference}
+
+  @{notes_regularization_losses}
+
+  #### Examples
+
+  ```python
+  def model():
+    z = Normal(loc=0.0, scale=1.0, sample_shape=[256, 25], name="z")
+    net = tf.layers.dense(z, 512, activation=tf.nn.relu)
+    net = tf.layers.dense(net, 28 * 28, activation=None)
+    x = Normal(loc=net, scale=1.0, name="x")
+    return x
+
+  def variational(x):
+    net = tf.layers.dense(x, 25 * 2)
+    qz = Normal(loc=net[:, :25],
+                scale=tf.nn.softplus(net[:, 25:]),
+                name="qz")
+    return qz
+
+  loss_p, loss_q = ed.wake_sleep(
+      model, variational,
+      align_latent=lambda name: "qz" if name == "z" else None,
+      align_data=lambda name: "x" if name == "x" else None,
+      x=x_data)
+  ```
   """
-  def __init__(self, *args, **kwargs):
-    super(WakeSleep, self).__init__(*args, **kwargs)
-
-  def initialize(self, n_samples=1, phase_q='sleep', *args, **kwargs):
-    """Initialize inference algorithm. It initializes hyperparameters
-    and builds ops for the algorithm's computation graph.
-
-    Args:
-      n_samples: int.
-        Number of samples for calculating stochastic gradients during
-        wake and sleep phases.
-      phase_q: str.
-        Phase for updating parameters of q. If 'sleep', update using
-        a sample from p. If 'wake', update using a sample from q.
-        (Unlike reparameterization gradients, the sample is held
-        fixed.)
-    """
-    self.n_samples = n_samples
-    self.phase_q = phase_q
-    return super(WakeSleep, self).initialize(*args, **kwargs)
-
-  def build_loss_and_gradients(self, var_list):
-    p_log_prob = [0.0] * self.n_samples
-    q_log_prob = [0.0] * self.n_samples
-    base_scope = tf.get_default_graph().unique_name("inference") + '/'
-    for s in range(self.n_samples):
-      # Form dictionary in order to replace conditioning on prior or
-      # observed variable with conditioning on a specific value.
-      scope = base_scope + tf.get_default_graph().unique_name("q_sample")
-      dict_swap = {}
-      for x, qx in six.iteritems(self.data):
-        if isinstance(x, RandomVariable):
-          if isinstance(qx, RandomVariable):
-            qx_copy = copy(qx, scope=scope)
-            dict_swap[x] = qx_copy.value()
-          else:
-            dict_swap[x] = qx
-
-      # Sample z ~ q(z), then compute log p(x, z).
-      q_dict_swap = dict_swap.copy()
-      for z, qz in six.iteritems(self.latent_vars):
-        # Copy q(z) to obtain new set of posterior samples.
-        qz_copy = copy(qz, scope=scope)
-        q_dict_swap[z] = qz_copy.value()
-        if self.phase_q != 'sleep':
-          # If not sleep phase, compute log q(z).
-          q_log_prob[s] += tf.reduce_sum(
-              self.scale.get(z, 1.0) *
-              qz_copy.log_prob(tf.stop_gradient(q_dict_swap[z])))
-
-      for z in six.iterkeys(self.latent_vars):
-        z_copy = copy(z, q_dict_swap, scope=scope)
-        p_log_prob[s] += tf.reduce_sum(
-            self.scale.get(z, 1.0) * z_copy.log_prob(q_dict_swap[z]))
-
-      for x in six.iterkeys(self.data):
-        if isinstance(x, RandomVariable):
-          x_copy = copy(x, q_dict_swap, scope=scope)
-          p_log_prob[s] += tf.reduce_sum(
-              self.scale.get(x, 1.0) * x_copy.log_prob(q_dict_swap[x]))
-
-      if self.phase_q == 'sleep':
-        # Sample z ~ p(z), then compute log q(z).
-        scope = base_scope + tf.get_default_graph().unique_name("p_sample")
-        p_dict_swap = dict_swap.copy()
-        for z, qz in six.iteritems(self.latent_vars):
-          # Copy p(z) to obtain new set of prior samples.
-          z_copy = copy(z, scope=scope)
-          p_dict_swap[qz] = z_copy.value()
-        for qz in six.itervalues(self.latent_vars):
-          qz_copy = copy(qz, p_dict_swap, scope=scope)
-          q_log_prob[s] += tf.reduce_sum(
-              self.scale.get(z, 1.0) *
-              qz_copy.log_prob(tf.stop_gradient(p_dict_swap[qz])))
-
-    p_log_prob = tf.reduce_mean(p_log_prob)
-    q_log_prob = tf.reduce_mean(q_log_prob)
-    reg_penalty = tf.reduce_sum(tf.losses.get_regularization_losses())
-
-    if self.logging:
-      tf.summary.scalar("loss/p_log_prob", p_log_prob,
-                        collections=[self._summary_key])
-      tf.summary.scalar("loss/q_log_prob", q_log_prob,
-                        collections=[self._summary_key])
-      tf.summary.scalar("loss/reg_penalty", reg_penalty,
-                        collections=[self._summary_key])
-
-    loss_p = -p_log_prob + reg_penalty
-    loss_q = -q_log_prob + reg_penalty
-
-    q_rvs = list(six.itervalues(self.latent_vars))
-    q_vars = [v for v in var_list
-              if len(get_descendants(tf.convert_to_tensor(v), q_rvs)) != 0]
-    q_grads = tf.gradients(loss_q, q_vars)
-    p_vars = [v for v in var_list if v not in q_vars]
-    p_grads = tf.gradients(loss_p, p_vars)
-    grads_and_vars = list(zip(q_grads, q_vars)) + list(zip(p_grads, p_vars))
-    return loss_p, grads_and_vars
+  p_log_prob = [0.0] * n_samples
+  q_log_prob = [0.0] * n_samples
+  for s in range(n_samples):
+    q_trace = call_with_trace(variational, *args, **kwargs)
+    x = call_with_intercept(model, q_trace, align_data, align_latent,
+                            *args, **kwargs)
+    for rv in toposort(x):
+      scale_factor = scale(rv.name)
+      if align_data(rv.name) is not None or align_latent(rv.name) is not None:
+        p_log_prob[s] += tf.reduce_sum(scale_factor * rv.log_prob(rv.value))
+      if phase_q != 'sleep' and align_latent(rv.name) is not None:
+        # If not sleep phase, compute log q(z).
+        qz = q_trace[align_latent(rv.name)]
+        q_log_prob[s] += tf.reduce_sum(
+            scale_factor * qz.log_prob(tf.stop_gradient(qz.value)))
+
+    if phase_q == 'sleep':
+      # Sample z ~ p(z), then compute log q(z) at that sample.
+      p_trace = call_with_trace(model, *args, **kwargs)
+      # Invert `align_latent` (prior name -> posterior name) so that a
+      # posterior variable can look up its prior's sample and scale.
+      prior_of = {}
+      for name in six.iterkeys(p_trace):
+        if align_latent(name) is not None:
+          prior_of[align_latent(name)] = name
+      # Alignment functions are passed positionally: passing them by
+      # keyword would collide with any positional `*args`.
+      qz = call_with_intercept(variational, p_trace,
+                               lambda name: None, prior_of.get,
+                               *args, **kwargs)
+      for rv in toposort(qz):
+        if rv.name in prior_of:
+          q_log_prob[s] += tf.reduce_sum(
+              scale(prior_of[rv.name]) *
+              rv.log_prob(tf.stop_gradient(rv.value)))
+
+  p_log_prob = tf.reduce_mean(p_log_prob)
+  q_log_prob = tf.reduce_mean(q_log_prob)
+  reg_penalty = tf.reduce_sum(tf.losses.get_regularization_losses())
+  if collections is not None:
+    tf.summary.scalar("loss/p_log_prob", p_log_prob,
+                      collections=collections)
+    tf.summary.scalar("loss/q_log_prob", q_log_prob,
+                      collections=collections)
+    tf.summary.scalar("loss/reg_penalty", reg_penalty,
+                      collections=collections)
+
+  loss_p = -p_log_prob + reg_penalty
+  loss_q = -q_log_prob + reg_penalty
+  return loss_p, loss_q
diff --git a/edward/inferences/wgan_inference.py b/edward/inferences/wgan_inference.py
index 7f67ab75b..dc5609c17 100644
--- a/edward/inferences/wgan_inference.py
+++ b/edward/inferences/wgan_inference.py
@@ -5,11 +5,21 @@
 import six
 import tensorflow as tf
 
-from edward.inferences.gan_inference import GANInference
-from edward.util import get_session
-
-
-class WGANInference(GANInference):
+from edward.inferences import docstrings as doc
+from edward.inferences.util import make_optional_inputs
+
+
+@doc.set_doc(
+    args_part_one=(doc.arg_model +
+                   doc.arg_discriminator +
+                   doc.arg_align_data)[:-1],
+    args_part_two=(doc.arg_collections +
+                   doc.arg_args_kwargs)[:-1],
+    returns=doc.return_loss_loss_d,
+    notes_discriminator_scope=doc.notes_discriminator_scope,
+    notes_regularization_losses=doc.notes_regularization_losses)
+def wgan_inference(model, discriminator, align_data,
+                   penalty=10.0, collections=None, *args, **kwargs):
   """Parameter estimation with GAN-style training
   [@goodfellow2014generative], using the Wasserstein distance
   [@arjovsky2017wasserstein].
@@ -18,99 +28,87 @@ class WGANInference(GANInference):
   models. These models do not require a tractable density and assume
   only a program that generates samples.
 
+  Args:
+  @{args_part_one}
+    penalty: float.
+      Scalar value to enforce gradient penalty that ensures the
+      gradients have norm equal to 1 [@gulrajani2017improved]. Set to
+      None (or 0.0) if using no penalty.
+  @{args_part_two}
+
+  `model` must return the generated data.
+
+  Returns:
+  @{returns}
+
   #### Notes
 
-  Argument-wise, the only difference from `GANInference` is
-  conceptual: the `discriminator` is better described as a test
-  function or critic. `WGANInference` continues to use
-  `discriminator` only to share methods and attributes with
-  `GANInference`.
+  The original WGAN clips weight parameters of the discriminator as an
+  approximation to the 1-Lipschitz constraint. To clip weights, one
+  must manually add a clipping op and then call it after each gradient
+  update during training. For example:
 
-  The objective function also adds to itself a summation over all
-  tensors in the `REGULARIZATION_LOSSES` collection.
+  ```python
+  ... = wgan_inference(..., penalty=None)
+  var_list = tf.get_collection(
+      tf.GraphKeys.TRAINABLE_VARIABLES, scope="Disc")
+  clip_op = [w.assign(tf.clip_by_value(w, -0.1, 0.1)) for w in var_list]
+  ```
+
+  @{notes_discriminator_scope}
+
+  @{notes_regularization_losses}
 
   #### Examples
 
   ```python
-  z = Normal(loc=tf.zeros([100, 10]), scale=tf.ones([100, 10]))
-  x = generative_network(z)
-
-  inference = ed.WGANInference({x: x_data}, discriminator)
+  def model():
+    z = Normal(loc=0.0, scale=1.0, sample_shape=[256, 25])
+    x = generative_network(z, name="x")
+    return x
+
+  def discriminator(x):
+    net = tf.layers.dense(x, 256, activation=tf.nn.relu)
+    return tf.layers.dense(net, 1, activation=tf.sigmoid)
+
+  loss, loss_d = ed.wgan_inference(
+      model, discriminator,
+      align_data=lambda name: "x_data" if name == "x" else None,
+      x_data=x_data)
   ```
   """
-  def __init__(self, *args, **kwargs):
-    super(WGANInference, self).__init__(*args, **kwargs)
-
-  def initialize(self, penalty=10.0, clip=None, *args, **kwargs):
-    """Initialize inference algorithm. It initializes hyperparameters
-    and builds ops for the algorithm's computation graph.
-
-    Args:
-      penalty: float.
-        Scalar value to enforce gradient penalty that ensures the
-        gradients have norm equal to 1 [@gulrajani2017improved]. Set to
-        None (or 0.0) if using no penalty.
-      clip: float.
-        Value to clip weights by. Default is no clipping.
-    """
-    self.penalty = penalty
-
-    super(WGANInference, self).initialize(*args, **kwargs)
-
-    self.clip_op = None
-    if clip is not None:
-      var_list = tf.get_collection(
-          tf.GraphKeys.TRAINABLE_VARIABLES, scope="Disc")
-      self.clip_op = [w.assign(tf.clip_by_value(w, -clip, clip))
-                      for w in var_list]
-
-  def build_loss_and_gradients(self, var_list):
-    x_true = list(six.itervalues(self.data))[0]
-    x_fake = list(six.iterkeys(self.data))[0]
-    with tf.variable_scope("Disc"):
-      d_true = self.discriminator(x_true)
-
+  model = make_optional_inputs(model)
+  x_fake = model(*args, **kwargs)
+  key = align_data(x_fake.name.split(':')[0])
+  if isinstance(key, int):
+    x_true = args[key]
+  elif kwargs.get(key, None) is not None:
+    x_true = kwargs.get(key)
+  with tf.variable_scope("Disc"):
+    d_true = discriminator(x_true)
+
+  with tf.variable_scope("Disc", reuse=True):
+    d_fake = discriminator(x_fake)
+
+  if penalty is None or penalty == 0:
+    penalty = 0.0
+  else:
+    eps = tf.random_uniform(tf.shape(x_true))
+    x_interpolated = eps * x_true + (1.0 - eps) * x_fake
     with tf.variable_scope("Disc", reuse=True):
-      d_fake = self.discriminator(x_fake)
-
-    if self.penalty is None or self.penalty == 0:
-      penalty = 0.0
-    else:
-      eps = tf.random_uniform(tf.shape(x_true))
-      x_interpolated = eps * x_true + (1.0 - eps) * x_fake
-      with tf.variable_scope("Disc", reuse=True):
-        d_interpolated = self.discriminator(x_interpolated)
-
-      gradients = tf.gradients(d_interpolated, [x_interpolated])[0]
-      slopes = tf.sqrt(tf.reduce_sum(tf.square(gradients),
-                                     list(range(1, gradients.shape.ndims))))
-      penalty = self.penalty * tf.reduce_mean(tf.square(slopes - 1.0))
-
-    reg_terms_d = tf.losses.get_regularization_losses(scope="Disc")
-    reg_terms_all = tf.losses.get_regularization_losses()
-    reg_terms = [r for r in reg_terms_all if r not in reg_terms_d]
-
-    mean_true = tf.reduce_mean(d_true)
-    mean_fake = tf.reduce_mean(d_fake)
-    loss_d = mean_fake - mean_true + penalty + tf.reduce_sum(reg_terms_d)
-    loss = -mean_fake + tf.reduce_sum(reg_terms)
-
-    var_list_d = tf.get_collection(
-        tf.GraphKeys.TRAINABLE_VARIABLES, scope="Disc")
-    if var_list is None:
-      var_list = [v for v in tf.trainable_variables() if v not in var_list_d]
-
-    grads_d = tf.gradients(loss_d, var_list_d)
-    grads = tf.gradients(loss, var_list)
-    grads_and_vars_d = list(zip(grads_d, var_list_d))
-    grads_and_vars = list(zip(grads, var_list))
-    return loss, grads_and_vars, loss_d, grads_and_vars_d
-
-  def update(self, feed_dict=None, variables=None):
-    info_dict = super(WGANInference, self).update(feed_dict, variables)
-
-    sess = get_session()
-    if self.clip_op is not None and variables in (None, "Disc"):
-      sess.run(self.clip_op)
-
-    return info_dict
+      d_interpolated = discriminator(x_interpolated)
+
+    gradients = tf.gradients(d_interpolated, [x_interpolated])[0]
+    slopes = tf.sqrt(tf.reduce_sum(tf.square(gradients),
+                                   list(range(1, gradients.shape.ndims))))
+    penalty = penalty * tf.reduce_mean(tf.square(slopes - 1.0))
+
+  reg_terms_d = tf.losses.get_regularization_losses(scope="Disc")
+  reg_terms_all = tf.losses.get_regularization_losses()
+  reg_terms = [r for r in reg_terms_all if r not in reg_terms_d]
+
+  mean_true = tf.reduce_mean(d_true)
+  mean_fake = tf.reduce_mean(d_fake)
+  loss_d = mean_fake - mean_true + penalty + tf.reduce_sum(reg_terms_d)
+  loss = -mean_fake + tf.reduce_sum(reg_terms)
+  return loss, loss_d
diff --git a/edward/models/__init__.py b/edward/models/__init__.py
index 2b2eaa2cc..fb39afb2c 100644
--- a/edward/models/__init__.py
+++ b/edward/models/__init__.py
@@ -4,22 +4,25 @@
 from __future__ import division
 from __future__ import print_function
 
-from edward.models.dirichlet_process import *
-from edward.models.empirical import *
-from edward.models.param_mixture import *
-from edward.models.point_mass import *
-from edward.models.random_variable import RandomVariable
-from edward.models.random_variables import *
+from edward.models.core import *
+from edward.models.queries import *
+from edward.models.random_variable import *
 
 from tensorflow.python.util.all_util import remove_undocumented
-from edward.models import random_variables as _module
+from edward.models import core as _module
 
 _allowed_symbols = [
-    'DirichletProcess',
-    'Empirical',
-    'ParamMixture',
-    'PointMass',
     'RandomVariable',
+    'call_with_manipulate',
+    'get_ancestors',
+    'get_blanket',
+    'get_children',
+    'get_descendants',
+    'get_parents',
+    'get_siblings',
+    'get_variables',
+    'is_independent',
+    'random_variables',
 ]
 for name in dir(_module):
   obj = getattr(_module, name)
diff --git a/edward/models/random_variables.py b/edward/models/core.py
similarity index 53%
rename from edward/models/random_variables.py
rename to edward/models/core.py
index be5cf8058..91c893b70 100644
--- a/edward/models/random_variables.py
+++ b/edward/models/core.py
@@ -7,6 +7,53 @@
 from edward.models.random_variable import RandomVariable as _RandomVariable
 from tensorflow.contrib import distributions as _distributions
 
+TRACE_STACK = [lambda f, *args, **kwargs: f(*args, **kwargs)]
+
+
+def call_with_manipulate(f, manipulate, *args, **kwargs):
+  """Calls function `f(*args, **kwargs)` with manipulation.
+
+  Args:
+    f: Function to call.
+    manipulate: Function to intercept primitives. It takes each primitive
+      function `f`, inputs `args, kwargs`, and may return any value and/or add
+      side-effects.
+    args, kwargs: Inputs to function.
+
+  Returns:
+    The output of `f`. Any calls to `primitive` operations are replaced by
+    calls to `manipulate`.
+
+  #### Examples
+
+  ```python
+  def f(x):
+    y = Poisson(rate=x, name="y")
+    return y
+
+  def manipulate(f, *args, **kwargs):
+    if kwargs.get("name") == "y":
+      kwargs["value"] = 42
+    return f(*args, **kwargs)
+
+  y = ed.call_with_manipulate(f, manipulate, 1.5)
+  with tf.Session() as sess:
+    assert sess.run(y.value) == 42
+  ```
+  """
+  TRACE_STACK.append(manipulate)
+  output = f(*args, **kwargs)
+  TRACE_STACK.pop()
+  return output
+
+
+def primitive(cls_init):
+  """Wraps class __init__ for manipulating its continuation."""
+  def __init__(self, *args, **kwargs):
+    TRACE_STACK[-1](cls_init, self, *args, **kwargs)
+  return __init__
+
+
 # Automatically generate random variable classes from classes in
 # tf.contrib.distributions.
 _globals = globals()
@@ -16,7 +63,9 @@
           _candidate != _distributions.Distribution and
           issubclass(_candidate, _distributions.Distribution)):
 
-    # to use _candidate's docstring, must write a new __init__ method
+    # write a new __init__ method in order to decorate class as primitive
+    # and share _candidate's docstring
+    @primitive
     def __init__(self, *args, **kwargs):
       _RandomVariable.__init__(self, *args, **kwargs)
     __init__.__doc__ = _candidate.__init__.__doc__
diff --git a/edward/models/empirical.py b/edward/models/empirical.py
deleted file mode 100644
index 7da9b8265..000000000
--- a/edward/models/empirical.py
+++ /dev/null
@@ -1,125 +0,0 @@
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import tensorflow as tf
-
-from edward.models.random_variable import RandomVariable
-from tensorflow.contrib.distributions import Distribution
-
-try:
-  from tensorflow.contrib.distributions import FULLY_REPARAMETERIZED
-except Exception as e:
-  raise ImportError("{0}. Your TensorFlow version is not supported.".format(e))
-
-
-class distributions_Empirical(Distribution):
-  """Empirical random variable.
-
-  #### Examples
-
-  ```python
-  # 100 samples of a scalar
-  x = Empirical(params=tf.zeros(100))
-  assert x.shape == ()
-
-  # 5 samples of a 2 x 3 matrix
-  x = Empirical(params=tf.zeros([5, 2, 3]))
-  assert x.shape == (2, 3)
-  ```
-  """
-  def __init__(self,
-               params,
-               validate_args=False,
-               allow_nan_stats=True,
-               name="Empirical"):
-    """Initialize an `Empirical` random variable.
-
-    Args:
-      params: tf.Tensor.
-      Collection of samples. Its outer (left-most) dimension
-      determines the number of samples.
-    """
-    parameters = locals()
-    with tf.name_scope(name, values=[params]):
-      with tf.control_dependencies([]):
-        self._params = tf.identity(params, name="params")
-        try:
-          self._n = tf.shape(self._params)[0]
-        except ValueError:  # scalar params
-          self._n = tf.constant(1)
-
-    super(distributions_Empirical, self).__init__(
-        dtype=self._params.dtype,
-        reparameterization_type=FULLY_REPARAMETERIZED,
-        validate_args=validate_args,
-        allow_nan_stats=allow_nan_stats,
-        parameters=parameters,
-        graph_parents=[self._params, self._n],
-        name=name)
-
-  @staticmethod
-  def _param_shapes(sample_shape):
-    return {"params": tf.convert_to_tensor(sample_shape, dtype=tf.int32)}
-
-  @property
-  def params(self):
-    """Distribution parameter."""
-    return self._params
-
-  @property
-  def n(self):
-    """Number of samples."""
-    return self._n
-
-  def _batch_shape_tensor(self):
-    return tf.constant([], dtype=tf.int32)
-
-  def _batch_shape(self):
-    return tf.TensorShape([])
-
-  def _event_shape_tensor(self):
-    return tf.shape(self.params)[1:]
-
-  def _event_shape(self):
-    return self.params.shape[1:]
-
-  def _mean(self):
-    return tf.reduce_mean(self.params, 0)
-
-  def _stddev(self):
-    # broadcasting n x shape - shape = n x shape
-    r = self.params - self.mean()
-    return tf.sqrt(tf.reduce_mean(tf.square(r), 0))
-
-  def _variance(self):
-    return tf.square(self.stddev())
-
-  def _sample_n(self, n, seed=None):
-    input_tensor = self.params
-    if len(input_tensor.shape) == 0:
-      input_tensor = tf.expand_dims(input_tensor, 0)
-      multiples = tf.concat(
-          [tf.expand_dims(n, 0), [1] * len(self.event_shape)], 0)
-      return tf.tile(input_tensor, multiples)
-    else:
-      probs = tf.ones([self.n]) / tf.cast(self.n, dtype=tf.float32)
-      cat = tf.contrib.distributions.Categorical(probs)
-      indices = cat._sample_n(n, seed)
-      tensor = tf.gather(input_tensor, indices)
-      return tensor
-
-
-# Generate random variable class similar to autogenerated ones from TensorFlow.
-def __init__(self, *args, **kwargs):
-  RandomVariable.__init__(self, *args, **kwargs)
-
-
-_name = 'Empirical'
-_candidate = distributions_Empirical
-__init__.__doc__ = _candidate.__init__.__doc__
-_globals = globals()
-_params = {'__doc__': _candidate.__doc__,
-           '__init__': __init__,
-           'support': 'points'}
-_globals[_name] = type(_name, (RandomVariable, _candidate), _params)
diff --git a/edward/models/point_mass.py b/edward/models/point_mass.py
deleted file mode 100644
index b63031b6d..000000000
--- a/edward/models/point_mass.py
+++ /dev/null
@@ -1,110 +0,0 @@
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import tensorflow as tf
-
-from edward.models.random_variable import RandomVariable
-from tensorflow.contrib.distributions import Distribution
-
-try:
-  from tensorflow.contrib.distributions import FULLY_REPARAMETERIZED
-except Exception as e:
-  raise ImportError("{0}. Your TensorFlow version is not supported.".format(e))
-
-
-class distributions_PointMass(Distribution):
-  """PointMass random variable.
-
-  It is analogous to an Empirical random variable with one sample, but
-  its parameter argument does not have an outer dimension.
-
-  #### Examples
-
-  ```python
-  # scalar
-  x = PointMass(params=28.3)
-  assert x.shape == ()
-
-  # 5 x 2 x 3 tensor
-  x = PointMass(params=tf.zeros([5, 2, 3]))
-  assert x.shape == (5, 2, 3)
-  ```
-  """
-  def __init__(self,
-               params,
-               validate_args=False,
-               allow_nan_stats=True,
-               name="PointMass"):
-    """Initialize a `PointMass` random variable.
-
-    Args:
-      params: tf.Tensor.
-        The location with all probability mass.
-    """
-    parameters = locals()
-    with tf.name_scope(name, values=[params]):
-      with tf.control_dependencies([]):
-        self._params = tf.identity(params, name="params")
-
-    super(distributions_PointMass, self).__init__(
-        dtype=self._params.dtype,
-        reparameterization_type=FULLY_REPARAMETERIZED,
-        validate_args=validate_args,
-        allow_nan_stats=allow_nan_stats,
-        parameters=parameters,
-        graph_parents=[self._params],
-        name=name)
-
-  @staticmethod
-  def _param_shapes(sample_shape):
-    return {"params": tf.expand_dims(
-        tf.convert_to_tensor(sample_shape, dtype=tf.int32), 0)}
-
-  @property
-  def params(self):
-    """Distribution parameter."""
-    return self._params
-
-  def _batch_shape_tensor(self):
-    return tf.constant([], dtype=tf.int32)
-
-  def _batch_shape(self):
-    return tf.TensorShape([])
-
-  def _event_shape_tensor(self):
-    return tf.shape(self.params)
-
-  def _event_shape(self):
-    return self.params.shape
-
-  def _mean(self):
-    return self.params
-
-  def _stddev(self):
-    return 0.0 * tf.ones_like(self.params)
-
-  def _variance(self):
-    return tf.square(self.stddev())
-
-  def _sample_n(self, n, seed=None):
-    input_tensor = self.params
-    input_tensor = tf.expand_dims(input_tensor, 0)
-    multiples = tf.concat(
-        [tf.expand_dims(n, 0), [1] * len(self.event_shape)], 0)
-    return tf.tile(input_tensor, multiples)
-
-
-# Generate random variable class similar to autogenerated ones from TensorFlow.
-def __init__(self, *args, **kwargs):
-  RandomVariable.__init__(self, *args, **kwargs)
-
-
-_name = 'PointMass'
-_candidate = distributions_PointMass
-__init__.__doc__ = _candidate.__init__.__doc__
-_globals = globals()
-_params = {'__doc__': _candidate.__doc__,
-           '__init__': __init__,
-           'support': 'point'}
-_globals[_name] = type(_name, (RandomVariable, _candidate), _params)
diff --git a/edward/models/queries.py b/edward/models/queries.py
new file mode 100644
index 000000000..307ef0e00
--- /dev/null
+++ b/edward/models/queries.py
@@ -0,0 +1,438 @@
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import six
+import tensorflow as tf
+
+from edward.models.random_variable import RandomVariable, random_variables
+
+
+def get_ancestors(x, collection=None):
+  """Get ancestor random variables of input.
+
+  Args:
+    x: RandomVariable or tf.Tensor.
+      Query node to find ancestors of.
+    collection: list of RandomVariable.
+      The collection of random variables to check with respect to;
+      defaults to all random variables in the graph.
+
+  Returns:
+    list of RandomVariable.
+    Ancestor random variables of x.
+
+  #### Examples
+  ```python
+  a = Normal(0.0, 1.0)
+  b = Normal(a, 1.0)
+  c = Normal(0.0, 1.0)
+  d = Normal(b * c, 1.0)
+  assert set(ed.get_ancestors(d)) == set([a, b, c])
+  ```
+  """
+  if collection is None:
+    collection = random_variables()
+
+  node_dict = {node.value: node for node in collection}
+
+  # Traverse the graph. Add each node to the set if it's in the collection.
+  output = set()
+  visited = set()
+  nodes = {x}
+  while nodes:
+    node = nodes.pop()
+
+    if node in visited:
+      continue
+    visited.add(node)
+
+    if isinstance(node, RandomVariable):
+      node = node.value
+
+    candidate_node = node_dict.get(node, None)
+    if candidate_node is not None and candidate_node != x:
+      output.add(candidate_node)
+
+    nodes.update(node.op.inputs)
+
+  return list(output)
+
+
+def get_blanket(x, collection=None):
+  """Get Markov blanket of input, which consists of its parents, its
+  children, and the other parents of its children.
+
+  Args:
+    x: RandomVariable or tf.Tensor.
+      Query node to find Markov blanket of.
+    collection: list of RandomVariable.
+      The collection of random variables to check with respect to;
+      defaults to all random variables in the graph.
+
+  Returns:
+    list of RandomVariable.
+    Markov blanket of x.
+
+  #### Examples
+
+  ```python
+  a = Normal(0.0, 1.0)
+  b = Normal(0.0, 1.0)
+  c = Normal(a * b, 1.0)
+  d = Normal(0.0, 1.0)
+  e = Normal(c * d, 1.0)
+  assert set(ed.get_blanket(c)) == set([a, b, d, e])
+  ```
+  """
+  output = set()
+  output.update(get_parents(x, collection))
+  children = get_children(x, collection)
+  output.update(children)
+  for child in children:
+    output.update(get_parents(child, collection))
+
+  output.discard(x)
+  return list(output)
+
+
+def get_children(x, collection=None):
+  """Get child random variables of input.
+
+  Args:
+    x: RandomVariable or tf.Tensor.
+      Query node to find children of.
+    collection: list of RandomVariable.
+      The collection of random variables to check with respect to;
+      defaults to all random variables in the graph.
+
+  Returns:
+    list of RandomVariable.
+    Child random variables of x.
+
+  #### Examples
+
+  ```python
+  a = Normal(0.0, 1.0)
+  b = Normal(a, 1.0)
+  c = Normal(a, 1.0)
+  d = Normal(c, 1.0)
+  assert set(ed.get_children(a)) == set([b, c])
+  ```
+  """
+  if collection is None:
+    collection = random_variables()
+
+  node_dict = {node.value: node for node in collection}
+
+  # Traverse the graph. Add each node to the set if it's in the collection.
+  output = set()
+  visited = set()
+  nodes = {x}
+  while nodes:
+    node = nodes.pop()
+
+    if node in visited:
+      continue
+    visited.add(node)
+
+    if isinstance(node, RandomVariable):
+      node = node.value
+
+    candidate_node = node_dict.get(node, None)
+    if candidate_node is not None and candidate_node != x:
+      output.add(candidate_node)
+    else:
+      for op in node.consumers():
+        nodes.update(op.outputs)
+
+  return list(output)
+
+
+def get_descendants(x, collection=None):
+  """Get descendant random variables of input.
+
+  Args:
+    x: RandomVariable or tf.Tensor.
+      Query node to find descendants of.
+    collection: list of RandomVariable.
+      The collection of random variables to check with respect to;
+      defaults to all random variables in the graph.
+
+  Returns:
+    list of RandomVariable.
+    Descendant random variables of x.
+
+  #### Examples
+
+  ```python
+  a = Normal(0.0, 1.0)
+  b = Normal(a, 1.0)
+  c = Normal(a, 1.0)
+  d = Normal(c, 1.0)
+  assert set(ed.get_descendants(a)) == set([b, c, d])
+  ```
+  """
+  if collection is None:
+    collection = random_variables()
+
+  node_dict = {node.value: node for node in collection}
+
+  # Traverse the graph. Add each node to the set if it's in the collection.
+  output = set()
+  visited = set()
+  nodes = {x}
+  while nodes:
+    node = nodes.pop()
+
+    if node in visited:
+      continue
+    visited.add(node)
+
+    if isinstance(node, RandomVariable):
+      node = node.value
+
+    candidate_node = node_dict.get(node, None)
+    if candidate_node is not None and candidate_node != x:
+      output.add(candidate_node)
+
+    for op in node.consumers():
+      nodes.update(op.outputs)
+
+  return list(output)
+
+
+def get_parents(x, collection=None):
+  """Get parent random variables of input.
+
+  Args:
+    x: RandomVariable or tf.Tensor.
+      Query node to find parents of.
+    collection: list of RandomVariable.
+      The collection of random variables to check with respect to;
+      defaults to all random variables in the graph.
+
+  Returns:
+    list of RandomVariable.
+    Parent random variables of x.
+
+  #### Notes
+
+  A `tf.gradients`-based variant is kept (unreachable) after the `return`; it
+  is potentially inefficient compared with the traversal used here, which
+  stops at the first random variable in `collection` on each path. With the
+  gradient-based variant, `tf.stop_gradient` can stop traversal of a node.
+
+  TODO how to extend to eager with its gradients function?
+
+  #### Examples
+
+  ```python
+  a = Normal(0.0, 1.0, name="a")
+  b = Normal(a, 1.0, name="b")
+  c = Normal(0.0, 1.0, name="c")
+  d = Normal(b * c, 1.0, name="d")
+  assert set(ed.get_parents(d)) == set([b, c])
+  ```
+  """
+  if collection is None:
+    collection = random_variables()
+
+  node_dict = {node.value: node for node in collection}
+
+  # Traverse the graph. Add each node to the set if it's in the collection.
+  output = set()
+  visited = set()
+  nodes = {x}
+  while nodes:
+    node = nodes.pop()
+
+    if node in visited:
+      continue
+    visited.add(node)
+
+    if isinstance(node, RandomVariable):
+      node = node.value
+
+    candidate_node = node_dict.get(node, None)
+    if candidate_node is not None and candidate_node != x:
+      output.add(candidate_node)
+    else:
+      nodes.update(node.op.inputs)
+
+  return list(output)
+  # TODO: dead code below; the tf.gradients variant gets ancestors.
+  parents = []
+  if collection is None:
+    collection = random_variables()
+  if isinstance(x,
+                (tf.Variable, tf.SparseTensor, tf.Tensor, RandomVariable)):
+    for g, v in zip(tf.gradients(node, collection), collection):
+      if g is not None:
+        parents.append(v)
+  parents.remove(node)
+  return set(parents)
+
+
+def get_siblings(x, collection=None):
+  """Get sibling random variables of input.
+
+  Args:
+    x: RandomVariable or tf.Tensor.
+      Query node to find siblings of.
+    collection: list of RandomVariable.
+      The collection of random variables to check with respect to;
+      defaults to all random variables in the graph.
+
+  Returns:
+    list of RandomVariable.
+    Sibling random variables of x.
+
+  #### Examples
+
+  ```python
+  a = Normal(0.0, 1.0)
+  b = Normal(a, 1.0)
+  c = Normal(a, 1.0)
+  assert ed.get_siblings(b) == [c]
+  ```
+  """
+  parents = get_parents(x, collection)
+  siblings = set()
+  for parent in parents:
+    siblings.update(get_children(parent, collection))
+
+  siblings.discard(x)
+  return list(siblings)
+
+
+def get_variables(x, collection=None):
+  """Get parent TensorFlow variables of input.
+
+  Args:
+    x: RandomVariable or tf.Tensor.
+      Query node to find parents of.
+    collection: list of tf.Variable.
+      The collection of variables to check with respect to; defaults to
+      all variables in the graph.
+
+  Returns:
+    list of tf.Variable.
+    TensorFlow variables that x depends on.
+
+  #### Examples
+
+  ```python
+  a = tf.Variable(0.0)
+  b = tf.Variable(0.0)
+  c = Normal(a * b, 1.0)
+  assert set(ed.get_variables(c)) == set([a, b])
+  ```
+  """
+  if collection is None:
+    collection = tf.global_variables()
+
+  node_dict = {node.name: node for node in collection}
+
+  # Traverse the graph. Add each node to the set if it's in the collection.
+  output = set()
+  visited = set()
+  nodes = {x}
+  while nodes:
+    node = nodes.pop()
+
+    if node in visited:
+      continue
+    visited.add(node)
+
+    if isinstance(node, RandomVariable):
+      node = node.value
+
+    candidate_node = node_dict.get(node.name, None)
+    if candidate_node is not None and candidate_node != x:
+      output.add(candidate_node)
+
+    nodes.update(node.op.inputs)
+
+  return list(output)
+
+
+def is_independent(a, b, condition=None):
+  """Assess whether a is independent of b given the random variables in
+  condition.
+
+  Implemented using the Bayes-Ball algorithm [@schachter1998bayes].
+
+  Args:
+    a: RandomVariable or list of RandomVariable.
+       Query node(s).
+    b: RandomVariable or list of RandomVariable.
+       Query node(s).
+    condition: RandomVariable or list of RandomVariable.
+       Random variable(s) to condition on.
+
+  Returns:
+    bool.
+    True if a is independent of b given the random variables in condition.
+
+  #### Examples
+
+  ```python
+  a = Normal(0.0, 1.0)
+  b = Normal(a, 1.0)
+  c = Normal(a, 1.0)
+  assert ed.is_independent(b, c, condition=a)
+  ```
+  """
+  if condition is None:
+    condition = []
+  if not isinstance(a, list):
+    a = [a]
+  if not isinstance(b, list):
+    b = [b]
+  if not isinstance(condition, list):
+    condition = [condition]
+  A = set(a)
+  B = set(b)
+  condition = set(condition)
+
+  top_marked = set()
+  # The Bayes-Ball algorithm will traverse the belief network
+  # and add each node that is relevant to B given condition
+  # to the set bottom_marked. A and B are conditionally
+  # independent if no node in A is in bottom_marked.
+  bottom_marked = set()
+
+  schedule = [(node, "child") for node in B]
+  while schedule:
+    node, came_from = schedule.pop()
+
+    if node not in condition and came_from == "child":
+      if node not in top_marked:
+        top_marked.add(node)
+        for parent in get_parents(node):
+          schedule.append((parent, "child"))
+
+      # TODO: point_mass.py is deleted; confirm PointMass still imports.
+      from edward.models import PointMass
+      if not isinstance(node, PointMass) and node not in bottom_marked:
+        bottom_marked.add(node)
+        if node in A:
+          return False  # node in A is relevant to B
+        for child in get_children(node):
+          schedule.append((child, "parent"))
+
+    elif came_from == "parent":
+      if node in condition and node not in top_marked:
+        top_marked.add(node)
+        for parent in get_parents(node):
+          schedule.append((parent, "child"))
+
+      elif node not in condition and node not in bottom_marked:
+        bottom_marked.add(node)
+        if node in A:
+          return False  # node in A is relevant to B
+        for child in get_children(node):
+          schedule.append((child, "parent"))
+
+  return True
diff --git a/edward/models/random_variable.py b/edward/models/random_variable.py
index 65f69e85e..4bcd4373e 100644
--- a/edward/models/random_variable.py
+++ b/edward/models/random_variable.py
@@ -5,6 +5,7 @@
 import tensorflow as tf
 
 from collections import defaultdict
+import six
 
 try:
   from tensorflow.python.client.session import \
@@ -86,29 +87,11 @@ def __init__(self, *args, **kwargs):
         Optional list of graph collections (lists). The random variable is
         added to these collections. Defaults to `[ed.random_variables()]`.
     """
-    # Force the Distribution class to always use the same name scope
-    # when scoping its parameter names and also when calling any
-    # methods such as sample.
-    name = kwargs.get('name', type(self).__name__)
-    with tf.name_scope(name) as ns:
-      kwargs['name'] = ns
-
-    # pop and store RandomVariable-specific parameters in _kwargs
+    # pop and store RandomVariable-specific parameters
     sample_shape = kwargs.pop('sample_shape', ())
     value = kwargs.pop('value', None)
     collections = kwargs.pop('collections', ["random_variables"])
 
-    # store args, kwargs for easy graph copying
-    self._args = args
-    self._kwargs = kwargs.copy()
-
-    if sample_shape != ():
-      self._kwargs['sample_shape'] = sample_shape
-    if value is not None:
-      self._kwargs['value'] = value
-    if collections != ["random_variables"]:
-      self._kwargs['collections'] = collections
-
     super(RandomVariable, self).__init__(*args, **kwargs)
 
     self._sample_shape = tf.TensorShape(sample_shape)
@@ -145,19 +128,32 @@ def sample_shape(self):
   @property
   def shape(self):
     """Shape of random variable."""
-    return self._value.shape
+    return self.value.shape
+
+  @property
+  def value(self):
+    """Get tensor that the random variable corresponds to."""
+    return self._value
 
   def __str__(self):
+    if not hasattr(self.value, "numpy"):
+      name = self.name
+    else:
+      name = numpy_text(self.value)
     return "RandomVariable(\"%s\"%s%s%s)" % (
-        self.name,
+        name,
         (", shape=%s" % self.shape)
         if self.shape.ndims is not None else "",
         (", dtype=%s" % self.dtype.name) if self.dtype else "",
-        (", device=%s" % self.value().device) if self.value().device else "")
+        (", device=%s" % self.value.device) if self.value.device else "")
 
   def __repr__(self):
-    return "<ed.RandomVariable '%s' shape=%s dtype=%s>" % (
+    string = "<ed.RandomVariable '%s' shape=%s dtype=%s>" % (
         self.name, self.shape, self.dtype.name)
+    if hasattr(self.value, "numpy"):
+      string = string[:-1] + " numpy=%s>" % (
+          numpy_text(self.value, is_repr=True))
+    return string
 
   def __hash__(self):
     return id(self)
@@ -213,45 +209,55 @@ def eval(self, session=None, feed_dict=None):
       print(x.eval())
     ```
     """
-    return self.value().eval(session=session, feed_dict=feed_dict)
+    return self.value.eval(session=session, feed_dict=feed_dict)
 
-  def value(self):
-    """Get tensor that the random variable corresponds to."""
-    return self._value
+  def numpy(self):
+    """Value as NumPy array, only available for TF Eager."""
+    return self.value.numpy()
 
   def get_ancestors(self, collection=None):
     """Get ancestor random variables."""
-    from edward.util.random_variables import get_ancestors
+    from edward.models.queries import get_ancestors
     return get_ancestors(self, collection)
 
   def get_blanket(self, collection=None):
     """Get the random variable's Markov blanket."""
-    from edward.util.random_variables import get_blanket
+    from edward.models.queries import get_blanket
     return get_blanket(self, collection)
 
   def get_children(self, collection=None):
     """Get child random variables."""
-    from edward.util.random_variables import get_children
+    from edward.models.queries import get_children
     return get_children(self, collection)
 
   def get_descendants(self, collection=None):
     """Get descendant random variables."""
-    from edward.util.random_variables import get_descendants
+    from edward.models.queries import get_descendants
     return get_descendants(self, collection)
 
   def get_parents(self, collection=None):
     """Get parent random variables."""
-    from edward.util.random_variables import get_parents
-    return get_parents(self, collection)
+    from edward.models.queries import get_parents
+    # The backward pass requires TF graph traversal. In general, consider
+    # primitive -> black box function (TF ops) -> primitive. To go to parent
+    # primitive, we traverse black box function.
+    parents = []
+    for node in six.itervalues(self.parameters):
+      if isinstance(node,
+                    (tf.Variable, tf.SparseTensor, tf.Tensor, RandomVariable)):
+        parents.extend(get_parents(node))
+      if isinstance(node, RandomVariable):
+        parents.append(node)
+    return parents
 
   def get_siblings(self, collection=None):
     """Get sibling random variables."""
-    from edward.util.random_variables import get_siblings
+    from edward.models.queries import get_siblings
     return get_siblings(self, collection)
 
   def get_variables(self, collection=None):
     """Get TensorFlow variables that the random variable depends on."""
-    from edward.util.random_variables import get_variables
+    from edward.models.queries import get_variables
     return get_variables(self, collection)
 
   def get_shape(self):
@@ -274,7 +280,7 @@ def _overload_operator(operator):
       operator: string. The operator name.
     """
     def _run_op(a, *args):
-      return getattr(tf.Tensor, operator)(a.value(), *args)
+      return getattr(tf.Tensor, operator)(a.value, *args)
     # Propagate __doc__ to wrapper
     try:
       _run_op.__doc__ = getattr(tf.Tensor, operator).__doc__
@@ -291,15 +297,15 @@ def _run_op(a, *args):
 
   @staticmethod
   def _session_run_conversion_fetch_function(tensor):
-    return ([tensor.value()], lambda val: val[0])
+    return ([tensor.value], lambda val: val[0])
 
   @staticmethod
   def _session_run_conversion_feed_function(feed, feed_val):
-    return [(feed.value(), feed_val)]
+    return [(feed.value, feed_val)]
 
   @staticmethod
   def _session_run_conversion_feed_function_for_partial_run(feed):
-    return [feed.value()]
+    return [feed.value]
 
   @staticmethod
   def _tensor_conversion_function(v, dtype=None, name=None, as_ref=False):
@@ -308,7 +314,33 @@ def _tensor_conversion_function(v, dtype=None, name=None, as_ref=False):
       raise ValueError(
           "Incompatible type conversion requested to type '%s' for variable "
           "of type '%s'" % (dtype.name, v.dtype.name))
-    return v.value()
+    return v.value
+
+
+def numpy_text(tensor, is_repr=False):  # utility fn from TF Eager codebase
+  """Human readable representation of a tensor's numpy value."""
+  if tensor.dtype.is_numpy_compatible:
+    text = repr(tensor.numpy()) if is_repr else str(tensor.numpy())
+  else:
+    text = "<unprintable>"
+  if "\n" in text:
+    text = "\n" + text
+  return text
+
+
+def random_variables(graph=None):
+  """Return all random variables in the TensorFlow graph.
+
+  Args:
+    graph: TensorFlow graph.
+
+  Returns:
+    list of RandomVariable.
+  """
+  if graph is None:
+    graph = tf.get_default_graph()
+
+  return _RANDOM_VARIABLE_COLLECTION[graph]
 
 
 RandomVariable._overload_all_operators()
diff --git a/edward/util/__init__.py b/edward/util/__init__.py
deleted file mode 100644
index 19bb24b70..000000000
--- a/edward/util/__init__.py
+++ /dev/null
@@ -1,40 +0,0 @@
-"""
-"""
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from edward.util.graphs import *
-from edward.util.metrics import *
-from edward.util.progbar import *
-from edward.util.random_variables import *
-from edward.util.tensorflow import *
-
-from tensorflow.python.util.all_util import remove_undocumented
-
-_allowed_symbols = [
-    'check_data',
-    'check_latent_vars',
-    'compute_multinomial_mode',
-    'copy',
-    'dot',
-    'get_ancestors',
-    'get_blanket',
-    'get_children',
-    'get_control_variate_coef',
-    'get_descendants',
-    'get_parents',
-    'get_session',
-    'get_siblings',
-    'get_variables',
-    'is_independent',
-    'Progbar',
-    'random_variables',
-    'rbf',
-    'set_seed',
-    'to_simplex',
-    'transform',
-    'with_binary_averaging'
-]
-
-remove_undocumented(__name__, allowed_exception_list=_allowed_symbols)
diff --git a/edward/util/graphs.py b/edward/util/graphs.py
deleted file mode 100644
index d56c2ea89..000000000
--- a/edward/util/graphs.py
+++ /dev/null
@@ -1,73 +0,0 @@
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import numpy as np
-import six
-import sys
-import tensorflow as tf
-
-from edward.models.random_variable import _RANDOM_VARIABLE_COLLECTION
-
-
-def get_session():
-  """Get the globally defined TensorFlow session.
-
-  If the session is not already defined, then the function will create
-  a global session.
-
-  Returns:
-    _ED_SESSION: tf.InteractiveSession.
-  """
-  global _ED_SESSION
-  if tf.get_default_session() is None:
-    _ED_SESSION = tf.InteractiveSession()
-  else:
-    _ED_SESSION = tf.get_default_session()
-
-  save_stderr = sys.stderr
-  try:
-    import os
-    sys.stderr = open(os.devnull, 'w')  # suppress keras import
-    from keras import backend as K
-    sys.stderr = save_stderr
-    have_keras = True
-  except ImportError:
-    sys.stderr = save_stderr
-    have_keras = False
-  if have_keras:
-    K.set_session(_ED_SESSION)
-
-  return _ED_SESSION
-
-
-def random_variables(graph=None):
-  """Return all random variables in the TensorFlow graph.
-
-  Args:
-    graph: TensorFlow graph.
-
-  Returns:
-    list of RandomVariable.
-  """
-  if graph is None:
-    graph = tf.get_default_graph()
-
-  return _RANDOM_VARIABLE_COLLECTION[graph]
-
-
-def set_seed(x):
-  """Set seed for both NumPy and TensorFlow.
-
-  Args:
-    x: int, float.
-      seed
-  """
-  node_names = list(six.iterkeys(tf.get_default_graph()._nodes_by_name))
-  if len(node_names) > 0 and node_names != ['keras_learning_phase']:
-    raise RuntimeError("Seeding is not supported after initializing "
-                       "part of the graph. "
-                       "Please move set_seed to the beginning of your code.")
-
-  np.random.seed(x)
-  tf.set_random_seed(x)
diff --git a/edward/util/metrics.py b/edward/util/metrics.py
deleted file mode 100644
index 4caf2e725..000000000
--- a/edward/util/metrics.py
+++ /dev/null
@@ -1,46 +0,0 @@
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from functools import wraps
-
-import tensorflow as tf
-
-
-def with_binary_averaging(metric):
-  """
-  Inspired by scikit-learn's _average_binary_score function:
-  https://github.com/scikit-learn/scikit-learn/blob/d9fdd8b0d1053cb47af8e3823b7a05279dd72054/sklearn/metrics/base.py#L23.
-
-  `None`: computes the specified metric along the second-to-last
-  dimension of `y_true` and `y_pred`. Returns a vector of "class-wise"
-  metrics.
-  `'macro'`: same as `None`, except compute the (unweighted) global
-  average of the resulting vector.
-  `'micro'`: flatten `y_true` and `y_pred` into vectors, then compute
-  `'macro'`
-  """
-  AVERAGE_OPTIONS = (None, 'micro', 'macro')
-
-  @wraps(metric)
-  def with_binary_averaging(*args, **kwargs):
-    y_true, y_pred = args
-    y_true = tf.cast(y_true, tf.float32)
-    y_pred = tf.cast(y_pred, tf.float32)
-    if len(y_true.shape) < 2 and len(y_pred.shape) < 2:
-      y_true = tf.expand_dims(y_true, 0)
-      y_pred = tf.expand_dims(y_pred, 0)
-
-    average = kwargs.get('average', 'macro')
-    if average not in AVERAGE_OPTIONS:
-      raise ValueError('average has to be one of {0}'
-                       ''.format(average_options))
-    if average is None:
-      return metric(y_true, y_pred)
-    if average == 'macro':
-      return tf.reduce_mean(metric(y_true, y_pred))
-    if average == 'micro':
-      y_true = tf.reshape(y_true, [1, -1])
-      y_pred = tf.reshape(y_pred, [1, -1])
-      return tf.reduce_mean(metric(y_true, y_pred))
-  return with_binary_averaging
diff --git a/edward/util/progbar.py b/edward/util/progbar.py
deleted file mode 100644
index 4886d1d91..000000000
--- a/edward/util/progbar.py
+++ /dev/null
@@ -1,115 +0,0 @@
-# -*- coding: utf-8 -*-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import numpy as np
-import six
-import sys
-import time
-
-
-class Progbar(object):
-  def __init__(self, target, width=30, interval=0.01, verbose=1):
-    """(Yet another) progress bar.
-
-    Args:
-      target: int.
-        Total number of steps expected.
-      width: int.
-        Width of progress bar.
-      interval: float.
-        Minimum time (in seconds) for progress bar to be displayed
-        during updates.
-      verbose: int.
-        Level of verbosity. 0 suppresses output; 1 is default.
-    """
-    self.target = target
-    self.width = width
-    self.interval = interval
-    self.verbose = verbose
-
-    self.stored_values = {}
-    self.start = time.time()
-    self.last_update = 0
-    self.total_width = 0
-    self.seen_so_far = 0
-
-  def update(self, current, values=None, force=False):
-    """Update progress bar, and print to standard output if `force`
-    is True, or the last update was completed longer than `interval`
-    amount of time ago, or `current` >= `target`.
-
-    The written output is the progress bar and all unique values.
-
-    Args:
-      current: int.
-        Index of current step.
-      values: dict of str to float.
-        Dict of name by value-for-last-step. The progress bar
-        will display averages for these values.
-      force: bool.
-        Whether to force visual progress update.
-    """
-    if values is None:
-      values = {}
-
-    for k, v in six.iteritems(values):
-      self.stored_values[k] = v
-
-    self.seen_so_far = current
-
-    now = time.time()
-    if (not force and
-            (now - self.last_update) < self.interval and
-            current < self.target):
-      return
-
-    self.last_update = now
-    if self.verbose == 0:
-      return
-
-    prev_total_width = self.total_width
-    sys.stdout.write("\b" * prev_total_width)
-    sys.stdout.write("\r")
-
-    # Write progress bar to stdout.
-    n_digits = len(str(self.target))
-    bar = '%%%dd/%%%dd' % (n_digits, n_digits) % (current, self.target)
-    bar += ' [{0}%]'.format(str(int(current / self.target * 100)).rjust(3))
-    bar += ' '
-    prog_width = int(self.width * float(current) / self.target)
-    if prog_width > 0:
-      try:
-        bar += ('█' * prog_width)
-      except UnicodeEncodeError:
-        bar += ('*' * prog_width)
-
-    bar += (' ' * (self.width - prog_width))
-    sys.stdout.write(bar)
-
-    # Write values to stdout.
-    if current:
-      time_per_unit = (now - self.start) / current
-    else:
-      time_per_unit = 0
-
-    eta = time_per_unit * (self.target - current)
-    info = ''
-    if current < self.target:
-      info += ' ETA: %ds' % eta
-    else:
-      info += ' Elapsed: %ds' % (now - self.start)
-
-    for k, v in six.iteritems(self.stored_values):
-      info += ' | {0:s}: {1:0.3f}'.format(k, v)
-
-    self.total_width = len(bar) + len(info)
-    if prev_total_width > self.total_width:
-      info += ((prev_total_width - self.total_width) * " ")
-
-    sys.stdout.write(info)
-    sys.stdout.flush()
-
-    if current >= self.target:
-      sys.stdout.write("\n")
diff --git a/edward/util/random_variables.py b/edward/util/random_variables.py
deleted file mode 100644
index 3a581505a..000000000
--- a/edward/util/random_variables.py
+++ /dev/null
@@ -1,968 +0,0 @@
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import numpy as np
-import six
-import tensorflow as tf
-
-from copy import deepcopy
-from edward.models.random_variable import RandomVariable
-from edward.models.random_variables import TransformedDistribution
-from edward.models import PointMass
-from edward.util.graphs import random_variables
-from tensorflow.core.framework import attr_value_pb2
-from tensorflow.python.framework.ops import set_shapes_for_outputs
-from tensorflow.python.util import compat
-
-tfb = tf.contrib.distributions.bijectors
-
-
-def check_data(data):
-  """Check that the data dictionary passed during inference and
-  criticism is valid.
-  """
-  if not isinstance(data, dict):
-    raise TypeError("data must have type dict.")
-
-  for key, value in six.iteritems(data):
-    if isinstance(key, tf.Tensor) and "Placeholder" in key.op.type:
-      if isinstance(value, RandomVariable):
-        raise TypeError("The value of a feed cannot be a ed.RandomVariable "
-                        "object. "
-                        "Acceptable feed values include Python scalars, "
-                        "strings, lists, numpy ndarrays, or TensorHandles.")
-      elif isinstance(value, tf.Tensor):
-        raise TypeError("The value of a feed cannot be a tf.Tensor object. "
-                        "Acceptable feed values include Python scalars, "
-                        "strings, lists, numpy ndarrays, or TensorHandles.")
-    elif isinstance(key, (RandomVariable, tf.Tensor)):
-      if isinstance(value, (RandomVariable, tf.Tensor)):
-        if not key.shape.is_compatible_with(value.shape):
-          raise TypeError("Key-value pair in data does not have same "
-                          "shape: {}, {}".format(key.shape, value.shape))
-        elif key.dtype != value.dtype:
-          raise TypeError("Key-value pair in data does not have same "
-                          "dtype: {}, {}".format(key.dtype, value.dtype))
-      elif isinstance(value, (float, list, int, np.ndarray, np.number, str)):
-        if not key.shape.is_compatible_with(np.shape(value)):
-          raise TypeError("Key-value pair in data does not have same "
-                          "shape: {}, {}".format(key.shape, np.shape(value)))
-        elif isinstance(value, (np.ndarray, np.number)) and \
-                not np.issubdtype(value.dtype, np.float) and \
-                not np.issubdtype(value.dtype, np.int) and \
-                not np.issubdtype(value.dtype, np.str):
-          raise TypeError("Data value has an invalid dtype: "
-                          "{}".format(value.dtype))
-      else:
-        raise TypeError("Data value has an invalid type: "
-                        "{}".format(type(value)))
-    else:
-      raise TypeError("Data key has an invalid type: {}".format(type(key)))
-
-
-def check_latent_vars(latent_vars):
-  """Check that the latent variable dictionary passed during inference and
-  criticism is valid.
-  """
-  if not isinstance(latent_vars, dict):
-    raise TypeError("latent_vars must have type dict.")
-
-  for key, value in six.iteritems(latent_vars):
-    if not isinstance(key, (RandomVariable, tf.Tensor)):
-      raise TypeError("Latent variable key has an invalid type: "
-                      "{}".format(type(key)))
-    elif not isinstance(value, (RandomVariable, tf.Tensor)):
-      raise TypeError("Latent variable value has an invalid type: "
-                      "{}".format(type(value)))
-    elif not key.shape.is_compatible_with(value.shape):
-      raise TypeError("Key-value pair in latent_vars does not have same "
-                      "shape: {}, {}".format(key.shape, value.shape))
-    elif key.dtype != value.dtype:
-      raise TypeError("Key-value pair in latent_vars does not have same "
-                      "dtype: {}, {}".format(key.dtype, value.dtype))
-
-
-def _get_context_copy(ctx, scope):
-    # contexts are stored in graph collections
-    # is there a more efficient way to do this?
-
-    graph = tf.get_default_graph()
-
-    for name, collection in six.iteritems(graph._collections):
-      if ctx in collection:
-        for item in collection:
-          if item.name == scope + ctx.name:
-            return item
-
-    return None
-
-
-def _copy_context(ctx, context_matches, dict_swap, scope, copy_q):
-  if ctx is None:
-    return None
-
-  # We'd normally check about returning early, but the context won't
-  # be copied until after all children are, so we check that first.
-
-  graph = tf.get_default_graph()
-
-  # copy all nodes within context
-  for tensorname in ctx._values:
-    tensor = graph.as_graph_element(tensorname)
-    copy(tensor, dict_swap, scope, True, copy_q)
-
-  # now make sure we haven't already copied the context we're currently
-  # trying to copy (in the course of copying another child)
-  ctx_copy = _get_context_copy(ctx, scope)
-  if ctx_copy:
-    return ctx_copy
-
-  ctx_copy = ctx.from_proto(ctx.to_proto(), scope[:-1])
-  outer_copy = _copy_context(ctx.outer_context, context_matches, dict_swap,
-                             scope, copy_q)
-  ctx_copy._outer_context = outer_copy
-
-  for name, collection in six.iteritems(graph._collections):
-      if ctx in collection:
-        graph.add_to_collection(name, ctx_copy)
-  return ctx_copy
-
-
-def _copy_default(x, *args, **kwargs):
-  if isinstance(x, (RandomVariable, tf.Operation, tf.Tensor, tf.Variable)):
-    x = copy(x, *args, **kwargs)
-
-  return x
-
-
-def copy(org_instance, dict_swap=None, scope="copied",
-         replace_itself=False, copy_q=False, copy_parent_rvs=True):
-  """Build a new node in the TensorFlow graph from `org_instance`,
-  where any of its ancestors existing in `dict_swap` are
-  replaced with `dict_swap`'s corresponding value.
-
-  Copying is done recursively. Any `Operation` whose output is
-  required to copy `org_instance` is also copied (if it isn't already
-  copied within the new scope).
-
-  `tf.Variable`s, `tf.placeholder`s, and nodes of type `Queue` are
-  always reused and not copied. In addition, `tf.Operation`s with
-  operation-level seeds are copied with a new operation-level seed.
-
-  Args:
-    org_instance: RandomVariable, tf.Operation, tf.Tensor, or tf.Variable.
-      Node to add in graph with replaced ancestors.
-    dict_swap: dict.
-      Random variables, variables, tensors, or operations to swap with.
-      Its keys are what `org_instance` may depend on, and its values are
-      the corresponding object (not necessarily of the same class
-      instance, but must have the same type, e.g., float32) that is used
-      in exchange.
-    scope: str.
-      A scope for the new node(s). This is used to avoid name
-      conflicts with the original node(s).
-    replace_itself: bool.
-      Whether to replace `org_instance` itself if it exists in
-      `dict_swap`. (This is used for the recursion.)
-    copy_q: bool.
-      Whether to copy the replaced tensors too (if not already
-      copied within the new scope). Otherwise will reuse them.
-    copy_parent_rvs:
-      Whether to copy parent random variables `org_instance` depends
-      on. Otherwise will copy only the sample tensors and not the
-      random variable class itself.
-
-  Returns:
-    RandomVariable, tf.Variable, tf.Tensor, or tf.Operation.
-    The copied node.
-
-  Raises:
-    TypeError.
-    If `org_instance` is not one of the above types.
-
-  #### Examples
-
-  ```python
-  x = tf.constant(2.0)
-  y = tf.constant(3.0)
-  z = x * y
-
-  qx = tf.constant(4.0)
-  # The TensorFlow graph is currently
-  # `x` -> `z` <- y`, `qx`
-
-  # This adds a subgraph with newly copied nodes,
-  # `qx` -> `copied/z` <- `copied/y`
-  z_new = ed.copy(z, {x: qx})
-
-  sess = tf.Session()
-  sess.run(z)
-  6.0
-  sess.run(z_new)
-  12.0
-  ```
-  """
-  if not isinstance(org_instance,
-                    (RandomVariable, tf.Operation, tf.Tensor, tf.Variable)):
-    raise TypeError("Could not copy instance: " + str(org_instance))
-
-  if dict_swap is None:
-    dict_swap = {}
-  if scope[-1] != '/':
-    scope += '/'
-
-  # Swap instance if in dictionary.
-  if org_instance in dict_swap and replace_itself:
-    org_instance = dict_swap[org_instance]
-    if not copy_q:
-      return org_instance
-  elif isinstance(org_instance, tf.Tensor) and replace_itself:
-    # Deal with case when `org_instance` is the associated tensor
-    # from the RandomVariable, e.g., `z.value()`. If
-    # `dict_swap={z: qz}`, we aim to swap it with `qz.value()`.
-    for key, value in six.iteritems(dict_swap):
-      if isinstance(key, RandomVariable):
-        if org_instance == key.value():
-          if isinstance(value, RandomVariable):
-            org_instance = value.value()
-          else:
-            org_instance = value
-          if not copy_q:
-            return org_instance
-          break
-
-  # If instance is a tf.Variable, return it; do not copy any. Note we
-  # check variables via their name. If we get variables through an
-  # op's inputs, it has type tf.Tensor and not tf.Variable.
-  if isinstance(org_instance, (tf.Tensor, tf.Variable)):
-    for variable in tf.global_variables():
-      if org_instance.name == variable.name:
-        if variable in dict_swap and replace_itself:
-          # Deal with case when `org_instance` is the associated _ref
-          # tensor for a tf.Variable.
-          org_instance = dict_swap[variable]
-          if not copy_q or isinstance(org_instance, tf.Variable):
-            return org_instance
-          for variable in tf.global_variables():
-            if org_instance.name == variable.name:
-              return variable
-          break
-        else:
-          return variable
-
-  graph = tf.get_default_graph()
-  new_name = scope + org_instance.name
-
-  # If an instance of the same name exists, return it.
-  if isinstance(org_instance, RandomVariable):
-    for rv in random_variables():
-      if new_name == rv.name:
-        return rv
-  elif isinstance(org_instance, (tf.Tensor, tf.Operation)):
-    try:
-      return graph.as_graph_element(new_name,
-                                    allow_tensor=True,
-                                    allow_operation=True)
-    except:
-      pass
-
-  # Preserve ordering of random variables. Random variables are always
-  # copied first (from parent -> child) before any deterministic
-  # operations that depend on them.
-  if copy_parent_rvs and \
-          isinstance(org_instance, (RandomVariable, tf.Tensor, tf.Variable)):
-    for v in get_parents(org_instance):
-      copy(v, dict_swap, scope, True, copy_q, True)
-
-  if isinstance(org_instance, RandomVariable):
-    rv = org_instance
-
-    # If it has copiable arguments, copy them.
-    args = [_copy_default(arg, dict_swap, scope, True, copy_q, False)
-            for arg in rv._args]
-
-    kwargs = {}
-    for key, value in six.iteritems(rv._kwargs):
-      if isinstance(value, list):
-        kwargs[key] = [_copy_default(v, dict_swap, scope, True, copy_q, False)
-                       for v in value]
-      else:
-        kwargs[key] = _copy_default(
-            value, dict_swap, scope, True, copy_q, False)
-
-    kwargs['name'] = new_name
-    # Create new random variable with copied arguments.
-    try:
-      new_rv = type(rv)(*args, **kwargs)
-    except ValueError:
-      # Handle case where parameters are copied under absolute name
-      # scope. This can cause an error when creating a new random
-      # variable as tf.identity name ops are called on parameters ("op
-      # with name already exists"). To avoid remove absolute name scope.
-      kwargs['name'] = new_name[:-1]
-      new_rv = type(rv)(*args, **kwargs)
-    return new_rv
-  elif isinstance(org_instance, tf.Tensor):
-    tensor = org_instance
-
-    # Do not copy tf.placeholders.
-    if 'Placeholder' in tensor.op.type:
-      return tensor
-
-    # A tensor is one of the outputs of its underlying
-    # op. Therefore copy the op itself.
-    op = tensor.op
-    new_op = copy(op, dict_swap, scope, True, copy_q, False)
-
-    output_index = op.outputs.index(tensor)
-    new_tensor = new_op.outputs[output_index]
-
-    # Add copied tensor to collections that the original one is in.
-    for name, collection in six.iteritems(tensor.graph._collections):
-      if tensor in collection:
-        graph.add_to_collection(name, new_tensor)
-
-    return new_tensor
-  elif isinstance(org_instance, tf.Operation):
-    op = org_instance
-
-    # Do not copy queue operations.
-    if 'Queue' in op.type:
-      return op
-
-    # Copy the node def.
-    # It is unique to every Operation instance. Replace the name and
-    # its operation-level seed if it has one.
-    node_def = deepcopy(op.node_def)
-    node_def.name = new_name
-
-    # when copying control flow contexts,
-    # we need to make sure frame definitions are copied
-    if 'frame_name' in node_def.attr and node_def.attr['frame_name'].s != b'':
-      node_def.attr['frame_name'].s = (scope.encode('utf-8') +
-                                       node_def.attr['frame_name'].s)
-
-    if 'seed2' in node_def.attr and tf.get_seed(None)[1] is not None:
-      node_def.attr['seed2'].i = tf.get_seed(None)[1]
-
-    # Copy other arguments needed for initialization.
-    output_types = op._output_types[:]
-
-    # If it has an original op, copy it.
-    if op._original_op is not None:
-      original_op = copy(op._original_op, dict_swap, scope, True, copy_q, False)
-    else:
-      original_op = None
-
-    # Copy the op def.
-    # It is unique to every Operation type.
-    op_def = deepcopy(op.op_def)
-
-    new_op = tf.Operation(node_def,
-                          graph,
-                          [],  # inputs; will add them afterwards
-                          output_types,
-                          [],  # control inputs; will add them afterwards
-                          [],  # input types; will add them afterwards
-                          original_op,
-                          op_def)
-
-    # advertise op early to break recursions
-    graph._add_op(new_op)
-
-    # If it has control inputs, copy them.
-    control_inputs = []
-    for x in op.control_inputs:
-      elem = copy(x, dict_swap, scope, True, copy_q, False)
-      if not isinstance(elem, tf.Operation):
-        elem = tf.convert_to_tensor(elem)
-
-      control_inputs.append(elem)
-
-    new_op._add_control_inputs(control_inputs)
-
-    # If it has inputs, copy them.
-    for x in op.inputs:
-      elem = copy(x, dict_swap, scope, True, copy_q, False)
-      if not isinstance(elem, tf.Operation):
-        elem = tf.convert_to_tensor(elem)
-
-      new_op._add_input(elem)
-
-    # Copy the control flow context.
-    control_flow_context = _copy_context(op._get_control_flow_context(), {},
-                                         dict_swap, scope, copy_q)
-    new_op._set_control_flow_context(control_flow_context)
-
-    # Use Graph's private methods to add the op, following
-    # implementation of `tf.Graph().create_op()`.
-    compute_shapes = True
-    compute_device = True
-    op_type = new_name
-
-    if compute_shapes:
-      set_shapes_for_outputs(new_op)
-    graph._record_op_seen_by_control_dependencies(new_op)
-
-    if compute_device:
-      graph._apply_device_functions(new_op)
-
-    if graph._colocation_stack:
-      all_colocation_groups = []
-      for colocation_op in graph._colocation_stack:
-        all_colocation_groups.extend(colocation_op.colocation_groups())
-        if colocation_op.device:
-          # Make this device match the device of the colocated op, to
-          # provide consistency between the device and the colocation
-          # property.
-          if new_op.device and new_op.device != colocation_op.device:
-            logging.warning("Tried to colocate %s with an op %s that had "
-                            "a different device: %s vs %s. "
-                            "Ignoring colocation property.",
-                            name, colocation_op.name, new_op.device,
-                            colocation_op.device)
-
-      all_colocation_groups = sorted(set(all_colocation_groups))
-      new_op.node_def.attr["_class"].CopyFrom(attr_value_pb2.AttrValue(
-          list=attr_value_pb2.AttrValue.ListValue(s=all_colocation_groups)))
-
-    # Sets "container" attribute if
-    # (1) graph._container is not None
-    # (2) "is_stateful" is set in OpDef
-    # (3) "container" attribute is in OpDef
-    # (4) "container" attribute is None
-    if (graph._container and
-        op_type in graph._registered_ops and
-        graph._registered_ops[op_type].is_stateful and
-        "container" in new_op.node_def.attr and
-            not new_op.node_def.attr["container"].s):
-      new_op.node_def.attr["container"].CopyFrom(
-          attr_value_pb2.AttrValue(s=compat.as_bytes(graph._container)))
-
-    return new_op
-  else:
-    raise TypeError("Could not copy instance: " + str(org_instance))
-
-
-def get_ancestors(x, collection=None):
-  """Get ancestor random variables of input.
-
-  Args:
-    x: RandomVariable or tf.Tensor.
-      Query node to find ancestors of.
-    collection: list of RandomVariable.
-      The collection of random variables to check with respect to;
-      defaults to all random variables in the graph.
-
-  Returns:
-    list of RandomVariable.
-    Ancestor random variables of x.
-
-  #### Examples
-  ```python
-  a = Normal(0.0, 1.0)
-  b = Normal(a, 1.0)
-  c = Normal(0.0, 1.0)
-  d = Normal(b * c, 1.0)
-  assert set(ed.get_ancestors(d)) == set([a, b, c])
-  ```
-  """
-  if collection is None:
-    collection = random_variables()
-
-  node_dict = {node.value(): node for node in collection}
-
-  # Traverse the graph. Add each node to the set if it's in the collection.
-  output = set()
-  visited = set()
-  nodes = {x}
-  while nodes:
-    node = nodes.pop()
-
-    if node in visited:
-      continue
-    visited.add(node)
-
-    if isinstance(node, RandomVariable):
-      node = node.value()
-
-    candidate_node = node_dict.get(node, None)
-    if candidate_node is not None and candidate_node != x:
-      output.add(candidate_node)
-
-    nodes.update(node.op.inputs)
-
-  return list(output)
-
-
-def get_blanket(x, collection=None):
-  """Get Markov blanket of input, which consists of its parents, its
-  children, and the other parents of its children.
-
-  Args:
-    x: RandomVariable or tf.Tensor.
-      Query node to find Markov blanket of.
-    collection: list of RandomVariable.
-      The collection of random variables to check with respect to;
-      defaults to all random variables in the graph.
-
-  Returns:
-    list of RandomVariable.
-    Markov blanket of x.
-
-  #### Examples
-
-  ```python
-  a = Normal(0.0, 1.0)
-  b = Normal(0.0, 1.0)
-  c = Normal(a * b, 1.0)
-  d = Normal(0.0, 1.0)
-  e = Normal(c * d, 1.0)
-  assert set(ed.get_blanket(c)) == set([a, b, d, e])
-  ```
-  """
-  output = set()
-  output.update(get_parents(x, collection))
-  children = get_children(x, collection)
-  output.update(children)
-  for child in children:
-    output.update(get_parents(child, collection))
-
-  output.discard(x)
-  return list(output)
-
-
-def get_children(x, collection=None):
-  """Get child random variables of input.
-
-  Args:
-    x: RandomVariable or tf.Tensor>
-      Query node to find children of.
-    collection: list of RandomVariable.
-      The collection of random variables to check with respect to;
-      defaults to all random variables in the graph.
-
-  Returns:
-    list of RandomVariable.
-    Child random variables of x.
-
-  #### Examples
-
-  ```python
-  a = Normal(0.0, 1.0)
-  b = Normal(a, 1.0)
-  c = Normal(a, 1.0)
-  d = Normal(c, 1.0)
-  assert set(ed.get_children(a)) == set([b, c])
-  ```
-  """
-  if collection is None:
-    collection = random_variables()
-
-  node_dict = {node.value(): node for node in collection}
-
-  # Traverse the graph. Add each node to the set if it's in the collection.
-  output = set()
-  visited = set()
-  nodes = {x}
-  while nodes:
-    node = nodes.pop()
-
-    if node in visited:
-      continue
-    visited.add(node)
-
-    if isinstance(node, RandomVariable):
-      node = node.value()
-
-    candidate_node = node_dict.get(node, None)
-    if candidate_node is not None and candidate_node != x:
-      output.add(candidate_node)
-    else:
-      for op in node.consumers():
-        nodes.update(op.outputs)
-
-  return list(output)
-
-
-def get_descendants(x, collection=None):
-  """Get descendant random variables of input.
-
-  Args:
-    x: RandomVariable or tf.Tensor.
-      Query node to find descendants of.
-    collection: list of RandomVariable.
-      The collection of random variables to check with respect to;
-      defaults to all random variables in the graph.
-
-  Returns:
-    list of RandomVariable.
-    Descendant random variables of x.
-
-  #### Examples
-
-  ```python
-  a = Normal(0.0, 1.0)
-  b = Normal(a, 1.0)
-  c = Normal(a, 1.0)
-  d = Normal(c, 1.0)
-  assert set(ed.get_descendants(a)) == set([b, c, d])
-  ```
-  """
-  if collection is None:
-    collection = random_variables()
-
-  node_dict = {node.value(): node for node in collection}
-
-  # Traverse the graph. Add each node to the set if it's in the collection.
-  output = set()
-  visited = set()
-  nodes = {x}
-  while nodes:
-    node = nodes.pop()
-
-    if node in visited:
-      continue
-    visited.add(node)
-
-    if isinstance(node, RandomVariable):
-      node = node.value()
-
-    candidate_node = node_dict.get(node, None)
-    if candidate_node is not None and candidate_node != x:
-      output.add(candidate_node)
-
-    for op in node.consumers():
-      nodes.update(op.outputs)
-
-  return list(output)
-
-
-def get_parents(x, collection=None):
-  """Get parent random variables of input.
-
-  Args:
-    x: RandomVariable or tf.Tensor.
-      Query node to find parents of.
-    collection: list of RandomVariable.
-      The collection of random variables to check with respect to;
-      defaults to all random variables in the graph.
-
-  Returns:
-    list of RandomVariable.
-    Parent random variables of x.
-
-  #### Examples
-
-  ```python
-  a = Normal(0.0, 1.0)
-  b = Normal(a, 1.0)
-  c = Normal(0.0, 1.0)
-  d = Normal(b * c, 1.0)
-  assert set(ed.get_parents(d)) == set([b, c])
-  ```
-  """
-  if collection is None:
-    collection = random_variables()
-
-  node_dict = {node.value(): node for node in collection}
-
-  # Traverse the graph. Add each node to the set if it's in the collection.
-  output = set()
-  visited = set()
-  nodes = {x}
-  while nodes:
-    node = nodes.pop()
-
-    if node in visited:
-      continue
-    visited.add(node)
-
-    if isinstance(node, RandomVariable):
-      node = node.value()
-
-    candidate_node = node_dict.get(node, None)
-    if candidate_node is not None and candidate_node != x:
-      output.add(candidate_node)
-    else:
-      nodes.update(node.op.inputs)
-
-  return list(output)
-
-
-def get_siblings(x, collection=None):
-  """Get sibling random variables of input.
-
-  Args:
-    x: RandomVariable or tf.Tensor.
-      Query node to find siblings of.
-    collection: list of RandomVariable.
-      The collection of random variables to check with respect to;
-      defaults to all random variables in the graph.
-
-  Returns:
-    list of RandomVariable.
-    Sibling random variables of x.
-
-  #### Examples
-
-  ```python
-  a = Normal(0.0, 1.0)
-  b = Normal(a, 1.0)
-  c = Normal(a, 1.0)
-  assert ed.get_siblings(b) == [c]
-  ```
-  """
-  parents = get_parents(x, collection)
-  siblings = set()
-  for parent in parents:
-    siblings.update(get_children(parent, collection))
-
-  siblings.discard(x)
-  return list(siblings)
-
-
-def get_variables(x, collection=None):
-  """Get parent TensorFlow variables of input.
-
-  Args:
-    x: RandomVariable or tf.Tensor.
-      Query node to find parents of.
-    collection: list of tf.Variable.
-      The collection of variables to check with respect to; defaults to
-      all variables in the graph.
-
-  Returns:
-    list of tf.Variable.
-    TensorFlow variables that x depends on.
-
-  #### Examples
-
-  ```python
-  a = tf.Variable(0.0)
-  b = tf.Variable(0.0)
-  c = Normal(a * b, 1.0)
-  assert set(ed.get_variables(c)) == set([a, b])
-  ```
-  """
-  if collection is None:
-    collection = tf.global_variables()
-
-  node_dict = {node.name: node for node in collection}
-
-  # Traverse the graph. Add each node to the set if it's in the collection.
-  output = set()
-  visited = set()
-  nodes = {x}
-  while nodes:
-    node = nodes.pop()
-
-    if node in visited:
-      continue
-    visited.add(node)
-
-    if isinstance(node, RandomVariable):
-      node = node.value()
-
-    candidate_node = node_dict.get(node.name, None)
-    if candidate_node is not None and candidate_node != x:
-      output.add(candidate_node)
-
-    nodes.update(node.op.inputs)
-
-  return list(output)
-
-
-def is_independent(a, b, condition=None):
-  """Assess whether a is independent of b given the random variables in
-  condition.
-
-  Implemented using the Bayes-Ball algorithm [@schachter1998bayes].
-
-  Args:
-    a: RandomVariable or list of RandomVariable.
-       Query node(s).
-    b: RandomVariable or list of RandomVariable.
-       Query node(s).
-    condition: RandomVariable or list of RandomVariable.
-       Random variable(s) to condition on.
-
-  Returns:
-    bool.
-    True if a is independent of b given the random variables in condition.
-
-  #### Examples
-
-  ```python
-  a = Normal(0.0, 1.0)
-  b = Normal(a, 1.0)
-  c = Normal(a, 1.0)
-  assert ed.is_independent(b, c, condition=a)
-  ```
-  """
-  if condition is None:
-    condition = []
-  if not isinstance(a, list):
-    a = [a]
-  if not isinstance(b, list):
-    b = [b]
-  if not isinstance(condition, list):
-    condition = [condition]
-  A = set(a)
-  B = set(b)
-  condition = set(condition)
-
-  top_marked = set()
-  # The Bayes-Ball algorithm will traverse the belief network
-  # and add each node that is relevant to B given condition
-  # to the set bottom_marked. A and B are conditionally
-  # independent if no node in A is in bottom_marked.
-  bottom_marked = set()
-
-  schedule = [(node, "child") for node in B]
-  while schedule:
-    node, came_from = schedule.pop()
-
-    if node not in condition and came_from == "child":
-      if node not in top_marked:
-        top_marked.add(node)
-        for parent in get_parents(node):
-          schedule.append((parent, "child"))
-
-      if not isinstance(node, PointMass) and node not in bottom_marked:
-        bottom_marked.add(node)
-        if node in A:
-          return False  # node in A is relevant to B
-        for child in get_children(node):
-          schedule.append((child, "parent"))
-
-    elif came_from == "parent":
-      if node in condition and node not in top_marked:
-        top_marked.add(node)
-        for parent in get_parents(node):
-          schedule.append((parent, "child"))
-
-      elif node not in condition and node not in bottom_marked:
-        bottom_marked.add(node)
-        if node in A:
-          return False  # node in A is relevant to B
-        for child in get_children(node):
-          schedule.append((child, "parent"))
-
-  return True
-
-
-def transform(x, *args, **kwargs):
-  """Transform a continuous random variable to the unconstrained space.
-
-  `transform` selects among a number of default transformations which
-  depend on the support of the provided random variable:
-
-  + $[0, 1]$ (e.g., Beta): Inverse of sigmoid.
-  + $[0, \infty)$ (e.g., Gamma): Inverse of softplus.
-  + Simplex (e.g., Dirichlet): Inverse of softmax-centered.
-  + $(-\infty, \infty)$ (e.g., Normal, MultivariateNormalTriL): None.
-
-  Args:
-    x: RandomVariable.
-      Continuous random variable to transform.
-    *args, **kwargs:
-      Arguments to overwrite when forming the `TransformedDistribution`.
-      For example, manually specify the transformation by passing in
-      the `bijector` argument.
-
-  Returns:
-    RandomVariable.
-    A `TransformedDistribution` random variable, or the provided random
-    variable if no transformation was applied.
-
-  #### Examples
-
-  ```python
-  x = Gamma(1.0, 1.0)
-  y = ed.transform(x)
-  sess = tf.Session()
-  sess.run(y)
-  -2.2279539
-  ```
-  """
-  if len(args) != 0 or kwargs.get('bijector', None) is not None:
-    return TransformedDistribution(x, *args, **kwargs)
-
-  try:
-    support = x.support
-  except AttributeError as e:
-    msg = """'{}' object has no 'support'
-             so cannot be transformed.""".format(type(x).__name__)
-    raise AttributeError(msg)
-
-  if support == '01':
-    bij = tfb.Invert(tfb.Sigmoid())
-    new_support = 'real'
-  elif support == 'nonnegative':
-    bij = tfb.Invert(tfb.Softplus())
-    new_support = 'real'
-  elif support == 'simplex':
-    bij = tfb.Invert(tfb.SoftmaxCentered(event_ndims=1))
-    new_support = 'multivariate_real'
-  elif support in ('real', 'multivariate_real'):
-    return x
-  else:
-    msg = "'transform' does not handle supports of type '{}'".format(support)
-    raise ValueError(msg)
-
-  new_x = TransformedDistribution(x, bij, *args, **kwargs)
-  new_x.support = new_support
-  return new_x
-
-
-def compute_multinomial_mode(probs, total_count=1, seed=None):
-  """Compute the mode of a Multinomial random variable.
-
-  Args:
-    probs: 1-D Numpy array of Multinomial class probabilities
-    total_count: integer number of trials in single Multinomial draw
-    seed: a Python integer. Used to create a random seed for the
-      distribution
-
-  #### Examples
-
-  ```python
-  # returns either [2, 2, 1], [2, 1, 2] or [1, 2, 2]
-  probs = np.array(3 * [1/3])
-  total_count = 5
-  compute_multinomial_mode(probs, total_count)
-
-  # returns [3, 2, 0]
-  probs = np.array(3 * [1/3])
-  total_count = 5
-  compute_multinomial_mode(probs, total_count)
-  ```
-  """
-  def softmax(vec):
-    numerator = np.exp(vec)
-    return numerator / numerator.sum(axis=0)
-
-  random_state = np.random.RandomState(seed)
-  mode = np.zeros_like(probs, dtype=np.int32)
-  if total_count == 1:
-    mode[np.argmax(probs)] += 1
-    return list(mode)
-  remaining_count = total_count
-  uniform_prob = 1 / total_count
-
-  while remaining_count > 0:
-    if (probs < uniform_prob).all():
-      probs = softmax(probs)
-    mask = probs >= uniform_prob
-    overflow_count = int(mask.sum() - remaining_count)
-    if overflow_count > 0:
-      hot_indices = np.where(mask)[0]
-      cold_indices = random_state.choice(hot_indices, overflow_count,
-                                         replace=False)
-      mask[cold_indices] = False
-    mode[mask] += 1
-    probs[mask] -= uniform_prob
-    remaining_count -= np.sum(mask)
-  return mode
diff --git a/edward/util/tensorflow.py b/edward/util/tensorflow.py
deleted file mode 100644
index ca0976471..000000000
--- a/edward/util/tensorflow.py
+++ /dev/null
@@ -1,188 +0,0 @@
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import tensorflow as tf
-
-from tensorflow.python.ops import control_flow_ops
-
-
-def dot(x, y):
-  """Compute dot product between a 2-D tensor and a 1-D tensor.
-
-  If x is a `[M x N]` matrix, then y is a `M`-vector.
-
-  If x is a `M`-vector, then y is a `[M x N]` matrix.
-
-  Args:
-    x: tf.Tensor.
-      A 1-D or 2-D tensor (see above).
-    y: tf.Tensor.
-      A 1-D or 2-D tensor (see above).
-
-  Returns:
-    tf.Tensor.
-    A 1-D tensor of length `N`.
-
-  Raises:
-    InvalidArgumentError.
-    If the inputs have Inf or NaN values.
-  """
-  x = tf.convert_to_tensor(x)
-  y = tf.convert_to_tensor(y)
-  dependencies = [tf.verify_tensor_all_finite(x, msg=''),
-                  tf.verify_tensor_all_finite(y, msg='')]
-  x = control_flow_ops.with_dependencies(dependencies, x)
-  y = control_flow_ops.with_dependencies(dependencies, y)
-
-  if len(x.shape) == 1:
-    vec = x
-    mat = y
-    return tf.reshape(tf.matmul(tf.expand_dims(vec, 0), mat), [-1])
-  else:
-    mat = x
-    vec = y
-    return tf.reshape(tf.matmul(mat, tf.expand_dims(vec, 1)), [-1])
-
-
-def rbf(X, X2=None, lengthscale=1.0, variance=1.0):
-  """Radial basis function kernel, also known as the squared
-  exponential or exponentiated quadratic. It is defined as
-
-  $k(x, x') = \sigma^2 \exp\Big(
-      -\\frac{1}{2} \sum_{d=1}^D \\frac{1}{\ell_d^2} (x_d - x'_d)^2 \Big)$
-
-  for output variance $\sigma^2$ and lengthscale $\ell^2$.
-
-  The kernel is evaluated over all pairs of rows, `k(X[i, ], X2[j, ])`.
-  If `X2` is not specified, then it evaluates over all pairs
-  of rows in `X`, `k(X[i, ], X[j, ])`. The output is a matrix
-  where each entry (i, j) is the kernel over the ith and jth rows.
-
-  Args:
-    X: tf.Tensor.
-      N x D matrix of N data points each with D features.
-    X2: tf.Tensor.
-      N x D matrix of N data points each with D features.
-    lengthscale: tf.Tensor.
-      Lengthscale parameter, a positive scalar or D-dimensional vector.
-    variance: tf.Tensor.
-      Output variance parameter, a positive scalar.
-
-  #### Examples
-
-  ```python
-  X = tf.random_normal([100, 5])
-  K = ed.rbf(X)
-  assert K.shape == (100, 100)
-  ```
-  """
-  lengthscale = tf.convert_to_tensor(lengthscale)
-  variance = tf.convert_to_tensor(variance)
-  dependencies = [tf.assert_positive(lengthscale),
-                  tf.assert_positive(variance)]
-  lengthscale = control_flow_ops.with_dependencies(dependencies, lengthscale)
-  variance = control_flow_ops.with_dependencies(dependencies, variance)
-
-  X = tf.convert_to_tensor(X)
-  X = X / lengthscale
-  Xs = tf.reduce_sum(tf.square(X), 1)
-  if X2 is None:
-    X2 = X
-    X2s = Xs
-  else:
-    X2 = tf.convert_to_tensor(X2)
-    X2 = X2 / lengthscale
-    X2s = tf.reduce_sum(tf.square(X2), 1)
-
-  square = tf.reshape(Xs, [-1, 1]) + tf.reshape(X2s, [1, -1]) - \
-      2 * tf.matmul(X, X2, transpose_b=True)
-  output = variance * tf.exp(-square / 2)
-  return output
-
-
-def to_simplex(x):
-  """Transform real vector of length `(K-1)` to a simplex of dimension `K`
-  using a backward stick breaking construction.
-
-  Args:
-    x: tf.Tensor.
-      A 1-D or 2-D tensor.
-
-  Returns:
-    tf.Tensor.
-    A tensor of same shape as input but with last dimension of
-    size `K`.
-
-  Raises:
-    InvalidArgumentError.
-    If the input has Inf or NaN values.
-
-  #### Notes
-
-  x as a 3-D or higher tensor is not guaranteed to be supported.
-  """
-  x = tf.cast(x, dtype=tf.float32)
-  dependencies = [tf.verify_tensor_all_finite(x, msg='')]
-  x = control_flow_ops.with_dependencies(dependencies, x)
-
-  if isinstance(x, (tf.Tensor, tf.Variable)):
-    shape = x.get_shape().as_list()
-  else:
-    shape = x.shape
-
-  if len(shape) == 1:
-    K_minus_one = shape[0]
-    eq = -tf.log(tf.cast(K_minus_one - tf.range(K_minus_one), dtype=tf.float32))
-    z = tf.sigmoid(eq + x)
-    pil = tf.concat([z, tf.constant([1.0])], 0)
-    piu = tf.concat([tf.constant([1.0]), 1.0 - z], 0)
-    S = tf.cumprod(piu)
-    return S * pil
-  else:
-    n_rows = shape[0]
-    K_minus_one = shape[1]
-    eq = -tf.log(tf.cast(K_minus_one - tf.range(K_minus_one), dtype=tf.float32))
-    z = tf.sigmoid(eq + x)
-    pil = tf.concat([z, tf.ones([n_rows, 1])], 1)
-    piu = tf.concat([tf.ones([n_rows, 1]), 1.0 - z], 1)
-    S = tf.cumprod(piu, axis=1)
-    return S * pil
-
-
-def get_control_variate_coef(f, h):
-  """Returns scalar used by control variates method for variance reduction in
-  Monte Carlo methods.
-
-  If we have a statistic $m$ as an unbiased estimator of $\mu$ and
-  and another statistic $t$ which is an unbiased estimator of
-  $\\tau$ then $m^* = m + c(t - \\tau)$ is also an unbiased
-  estimator of $\mu$ for any coefficient $c$.
-
-  This function calculates the optimal coefficient
-
-  $c^* = \\frac{\\text{Cov}(m,t)}{\\text{Var}(t)}$
-
-  for minimizing the variance of $m^*$.
-
-  Args:
-    f: tf.Tensor.
-      A 1-D tensor.
-    h: tf.Tensor.
-      A 1-D tensor.
-
-  Returns:
-    tf.Tensor.
-    A 0 rank tensor
-  """
-  f_mu = tf.reduce_mean(f)
-  h_mu = tf.reduce_mean(h)
-
-  n = f.shape[0].value
-
-  cov_fh = tf.reduce_sum((f - f_mu) * (h - h_mu)) / (n - 1)
-  var_h = tf.reduce_sum(tf.square(h - h_mu)) / (n - 1)
-
-  a = cov_fh / var_h
-
-  return a
diff --git a/edward/version.py b/edward/version.py
index 8832a6a84..cab57cf58 100644
--- a/edward/version.py
+++ b/edward/version.py
@@ -1,2 +1,2 @@
-__version__ = '1.3.5'
+__version__ = '2.0.0'
 VERSION = __version__
diff --git a/examples/bayesian_linear_regression.py b/examples/bayesian_linear_regression.py
index d3ce100b8..de1fb7909 100644
--- a/examples/bayesian_linear_regression.py
+++ b/examples/bayesian_linear_regression.py
@@ -30,78 +30,151 @@
 FLAGS = tf.flags.FLAGS
 
 
-def build_toy_dataset(N, noise_std=0.5):
-  X = np.concatenate([np.linspace(0, 2, num=N / 2),
-                      np.linspace(6, 8, num=N / 2)])
-  y = 2.0 * X + 10 * np.random.normal(0, noise_std, size=N)
-  X = X.reshape((N, 1))
-  return X, y
+def get_input_fn():
+  """Returns `input_fn` for train and eval."""
+  def build_toy_dataset(N, noise_std=0.5):
+    X = np.concatenate([np.linspace(0, 2, num=N / 2),
+                        np.linspace(6, 8, num=N / 2)])
+    y = 2.0 * X + 10 * np.random.normal(0, noise_std, size=N)
+    X = X.reshape((N, 1))
+    return X, y
+  features, labels = build_toy_dataset(N)
+  def input_fn(params):
+    """A simple input_fn using the experimental input pipeline."""
+    batch_size = params["batch_size"]
+    # TODO
+    dataset = tf.data.TFRecordDataset(filename, buffer_size=None)
+    dataset = dataset.cache().repeat()
+    features, labels = dataset.make_one_shot_iterator().get_next()
+    return features, labels
+  return input_fn
+
+
+def model(X):
+  w = Normal(loc=tf.zeros(FLAGS.D), scale=tf.ones(FLAGS.D))
+  b = Normal(loc=tf.zeros(1), scale=tf.ones(1))
+  y = Normal(loc=tf.tensordot(X, w, [[1], [0]]) + b,
+             scale=tf.ones(FLAGS.N))
+  return y
+
+
+def model_fn(features, labels, mode, params):
+  """Model fn which runs on TPU.
+
+  Args:
+    features: [FLAGS.N, FLAGS.D]
+    labels: [FLAGS.N]
+    mode: tf.estimator.ModeKeys.*
+    params: dict of hyperparams.
+  """
+  qw = tf.get_variable("qw", [FLAGS.D])
+  qb = tf.get_variable("qb", [])
+  counter = tf.get_variable("counter", initializer=0.)
+  qw_mom = tf.get_variable("qw_mom", [FLAGS.D],
+                           initializer=tf.zeros_initializer())
+  qb_mom = tf.get_variable("qb_mom", [], initializer=tf.zeros_initializer())
+
+  new_states, new_counter, _, new_momentums = ed.sghmc(
+      model,
+      current_state=[qw, qb],
+      counter=counter,
+      momentums=[qw_mom, qb_mom],
+      learning_rate=1e-3,
+      align_latent=lambda name: {"w": "qw", "b": "qb"}.get(name),
+      align_data=lambda name: {"y": "y"}.get(name),
+      X=features,
+      y=labels)
+
+  if mode == tf.estimator.ModeKeys.PREDICT:
+    predicted_classes = tf.argmax(logits, 1)
+    predictions = {
+        "class_ids": predicted_classes[:, tf.newaxis],
+        "probabilities": tf.nn.softmax(logits),
+    }
+    return tf.estimator.EstimatorSpec(mode, loss=None, predictions=predictions)
+
+  predictions = tf.argmax(logits, 1)
+  accuracy = tf.metrics.accuracy(labels=labels, predictions=predictions)
+
+  n_accept = tf.get_variable("n_accept", initializer=0, trainable=False)
+  n_accept_over_t = n_accept / t
+
+  tf.summary.scalar("accuracy", accuracy[1])
+  tf.summary.scalar("n_accept", n_accept)
+
+  if mode == tf.estimator.ModeKeys.EVAL:
+    return tpu_estimator.TPUEstimatorSpec(
+        mode=mode,
+        loss=None,
+        eval_metrics={"accuracy": accuracy,
+                      "n_accept": n_accept,})
+
+  train_op = []
+  train_op.append(qw.assign(new_states[0]))
+  train_op.append(qb.assign(new_states[1]))
+  train_op.append(counter.assign(new_counter))
+  train_op.append(qw_mom.assign(new_momentums[0]))
+  train_op.append(qb_mom.assign(new_momentums[1]))
+  train_op = tf.group(*train_op)
+  return tpu_estimator.TPUEstimatorSpec(mode=mode, loss=None, train_op=train_op)
 
 
 def main(_):
-  ed.set_seed(42)
-
-  # DATA
-  X_train, y_train = build_toy_dataset(FLAGS.N)
-  X_test, y_test = build_toy_dataset(FLAGS.N)
-
-  # MODEL
-  X = tf.placeholder(tf.float32, [FLAGS.N, FLAGS.D])
-  w = Normal(loc=tf.zeros(FLAGS.D), scale=tf.ones(FLAGS.D))
-  b = Normal(loc=tf.zeros(1), scale=tf.ones(1))
-  y = Normal(loc=ed.dot(X, w) + b, scale=tf.ones(FLAGS.N))
+  tf.set_random_seed(42)
 
-  # INFERENCE
-  qw = Empirical(params=tf.get_variable("qw/params", [FLAGS.T, FLAGS.D]))
-  qb = Empirical(params=tf.get_variable("qb/params", [FLAGS.T, 1]))
+  train_input_fn = get_input_fn()
+  eval_input_fn = get_input_fn()
 
-  inference = ed.SGHMC({w: qw, b: qb}, data={X: X_train, y: y_train})
-  inference.run(step_size=1e-3)
+  estimator = tf.Estimator(model_fn=model_fn)
+  estimator.train(input_fn=train_input_fn,
+                  max_steps=FLAGS.train_steps)
 
-  # CRITICISM
+  eval_result = estimator.evaluate(input_fn=eval_input_fn)
+  print('\nTest set accuracy: {accuracy:0.3f}\n'.format(**eval_result))
 
-  # Plot posterior samples.
-  sns.jointplot(qb.params.eval()[FLAGS.nburn:FLAGS.T:FLAGS.stride],
-                qw.params.eval()[FLAGS.nburn:FLAGS.T:FLAGS.stride])
-  plt.show()
+  # # Plot posterior samples.
+  # sns.jointplot(qb.params.eval()[FLAGS.nburn:FLAGS.T:FLAGS.stride],
+  #               qw.params.eval()[FLAGS.nburn:FLAGS.T:FLAGS.stride])
+  # plt.show()
 
-  # Posterior predictive checks.
-  y_post = ed.copy(y, {w: qw, b: qb})
-  # This is equivalent to
-  # y_post = Normal(loc=ed.dot(X, qw) + qb, scale=tf.ones(FLAGS.N))
+  # # Posterior predictive checks.
+  # y_post = ed.copy(y, {w: qw, b: qb})
+  # # This is equivalent to
+  # # y_post = Normal(loc=tf.tensordot(X, qw, [[1], [0]]) + qb,
+  #                   scale=tf.ones(FLAGS.N))
 
-  print("Mean squared error on test data:")
-  print(ed.evaluate('mean_squared_error', data={X: X_test, y_post: y_test}))
+  # print("Mean squared error on test data:")
+  # print(ed.evaluate('mean_squared_error', data={X: X_test, y_post: y_test}))
 
-  print("Displaying prior predictive samples.")
-  n_prior_samples = 10
+  # print("Displaying prior predictive samples.")
+  # n_prior_samples = 10
 
-  w_prior = w.sample(n_prior_samples).eval()
-  b_prior = b.sample(n_prior_samples).eval()
+  # w_prior = w.sample(n_prior_samples).eval()
+  # b_prior = b.sample(n_prior_samples).eval()
 
-  plt.scatter(X_train, y_train)
+  # plt.scatter(X_train, y_train)
 
-  inputs = np.linspace(-1, 10, num=400)
-  for ns in range(n_prior_samples):
-      output = inputs * w_prior[ns] + b_prior[ns]
-      plt.plot(inputs, output)
+  # inputs = np.linspace(-1, 10, num=400)
+  # for ns in range(n_prior_samples):
+  #     output = inputs * w_prior[ns] + b_prior[ns]
+  #     plt.plot(inputs, output)
 
-  plt.show()
+  # plt.show()
 
-  print("Displaying posterior predictive samples.")
-  n_posterior_samples = 10
+  # print("Displaying posterior predictive samples.")
+  # n_posterior_samples = 10
 
-  w_post = qw.sample(n_posterior_samples).eval()
-  b_post = qb.sample(n_posterior_samples).eval()
+  # w_post = qw.sample(n_posterior_samples).eval()
+  # b_post = qb.sample(n_posterior_samples).eval()
 
-  plt.scatter(X_train, y_train)
+  # plt.scatter(X_train, y_train)
 
-  inputs = np.linspace(-1, 10, num=400)
-  for ns in range(n_posterior_samples):
-      output = inputs * w_post[ns] + b_post[ns]
-      plt.plot(inputs, output)
+  # inputs = np.linspace(-1, 10, num=400)
+  # for ns in range(n_posterior_samples):
+  #     output = inputs * w_post[ns] + b_post[ns]
+  #     plt.plot(inputs, output)
 
-  plt.show()
+  # plt.show()
 
 if __name__ == "__main__":
   tf.app.run()
diff --git a/examples/bayesian_linear_regression_implicitklqp.py b/examples/bayesian_linear_regression_implicitklqp.py
index 41a72a132..145869b32 100644
--- a/examples/bayesian_linear_regression_implicitklqp.py
+++ b/examples/bayesian_linear_regression_implicitklqp.py
@@ -86,7 +86,7 @@ def ratio_estimator(data, local_vars, global_vars):
   X = tf.placeholder(tf.float32, [FLAGS.M, FLAGS.D])
   y_ph = tf.placeholder(tf.float32, [FLAGS.M])
   w = Normal(loc=tf.zeros(FLAGS.D), scale=tf.ones(FLAGS.D))
-  y = Normal(loc=ed.dot(X, w), scale=tf.ones(FLAGS.M))
+  y = Normal(loc=tf.tensordot(X, w, [[1], [0]]), scale=tf.ones(M))
 
   # INFERENCE
   qw = Normal(loc=tf.get_variable("qw/loc", [FLAGS.D]) + 1.0,
diff --git a/examples/bayesian_logistic_regression.py b/examples/bayesian_logistic_regression.py
index bf4305a88..fb56ce23f 100644
--- a/examples/bayesian_logistic_regression.py
+++ b/examples/bayesian_logistic_regression.py
@@ -11,7 +11,7 @@
 import numpy as np
 import tensorflow as tf
 
-from edward.models import Bernoulli, Normal, Empirical
+from edward.models import Bernoulli, Normal
 
 tf.flags.DEFINE_integer("N", default=40, help="Number of data points.")
 tf.flags.DEFINE_integer("D", default=1, help="Number of features.")
@@ -27,42 +27,53 @@ def build_toy_dataset(N, noise_std=0.1):
   y[y < 0.5] = 0
   y[y >= 0.5] = 1
   X = (X - 4.0) / 4.0
-  X = X.reshape((N, D))
+  X = X.reshape((N, D)).astype(np.float32)
+  y = y.astype(np.float32)
   return X, y
 
 
+def model(X):
+  w = Normal(loc=tf.zeros(FLAGS.D), scale=3.0 * tf.ones(FLAGS.D), name="w")
+  b = Normal(loc=tf.zeros([]), scale=3.0 * tf.ones([]), name="b")
+  y = Bernoulli(logits=tf.tensordot(X, w, [[1], [0]]) + b, name="y")
+  return y
+
+
 def main(_):
-  ed.set_seed(42)
+  tf.set_random_seed(42)
 
-  # DATA
   X_train, y_train = build_toy_dataset(FLAGS.N)
 
-  # MODEL
-  X = tf.placeholder(tf.float32, [FLAGS.N, FLAGS.D])
-  w = Normal(loc=tf.zeros(FLAGS.D), scale=3.0 * tf.ones(FLAGS.D))
-  b = Normal(loc=tf.zeros([]), scale=3.0 * tf.ones([]))
-  y = Bernoulli(logits=ed.dot(X, w) + b)
+  qw = tf.get_variable("qw", [FLAGS.D])
+  qb = tf.get_variable("qb", [])
 
-  # INFERENCE
-  qw = Empirical(params=tf.get_variable("qw/params", [FLAGS.T, FLAGS.D]))
-  qb = Empirical(params=tf.get_variable("qb/params", [FLAGS.T]))
+  new_state, _, _ = ed.hmc(
+      model,
+      step_size=0.6,
+      current_state=[qw, qb],
+      align_latent=lambda name: {"w": "qw", "b": "qb"}.get(name),
+      align_data=lambda name: {"y": "y"}.get(name),
+      X=X_train,
+      y=y_train)
 
-  inference = ed.HMC({w: qw, b: qb}, data={X: X_train, y: y_train})
-  inference.initialize(n_print=10, step_size=0.6)
+  qw_update = qw.assign(new_state[0])
+  qb_update = qb.assign(new_state[1])
 
   # Alternatively, use variational inference.
-  # qw_loc = tf.get_variable("qw_loc", [FLAGS.D])
-  # qw_scale = tf.nn.softplus(tf.get_variable("qw_scale", [FLAGS.D]))
-  # qb_loc = tf.get_variable("qb_loc", []) + 10.0
-  # qb_scale = tf.nn.softplus(tf.get_variable("qb_scale", []))
-
-  # qw = Normal(loc=qw_loc, scale=qw_scale)
-  # qb = Normal(loc=qb_loc, scale=qb_scale)
-
-  # inference = ed.KLqp({w: qw, b: qb}, data={X: X_train, y: y_train})
-  # inference.initialize(n_print=10, n_iter=600)
-
-  tf.global_variables_initializer().run()
+  # def variational():
+  #   qw_loc = tf.get_variable("qw_loc", [FLAGS.D])
+  #   qw_scale = tf.nn.softplus(tf.get_variable("qw_scale", [FLAGS.D]))
+  #   qb_loc = tf.get_variable("qb_loc", []) + 10.0
+  #   qb_scale = tf.nn.softplus(tf.get_variable("qb_scale", []))
+  #   qw = Normal(loc=qw_loc, scale=qw_scale, name="qw")
+  #   qb = Normal(loc=qb_loc, scale=qb_scale, name="qb")
+  #   return qw, qb
+  #
+  # loss, surrogate_loss = ed.klqp(...)
+  # train_op = tf.train.AdamOptimizer().minimize(surrogate_loss)
+
+  sess = tf.Session()
+  sess.run(tf.global_variables_initializer())
 
   # Set up figure.
   fig = plt.figure(figsize=(8, 8), facecolor='white')
@@ -73,15 +84,14 @@ def main(_):
   # Build samples from inferred posterior.
   n_samples = 50
   inputs = np.linspace(-5, 3, num=400, dtype=np.float32).reshape((400, 1))
-  probs = tf.stack([tf.sigmoid(ed.dot(inputs, qw.sample()) + qb.sample())
+  # TODO n_samples; will need to store and use last X posterior samples
+  probs = tf.stack([tf.sigmoid(tf.tensordot(inputs, qw, [[1], [0]]) + qb)
                     for _ in range(n_samples)])
 
-  for t in range(inference.n_iter):
-    info_dict = inference.update()
-    inference.print_progress(info_dict)
-
-    if t % inference.n_print == 0:
-      outputs = probs.eval()
+  for t in range(5000):
+    sess.run([qw_update, qb_update])
+    if t % 10 == 0:
+      outputs = sess.run(probs)
 
       # Plot data and functions
       plt.cla()
diff --git a/examples/beta_bernoulli.py b/examples/beta_bernoulli.py
index c3a674091..77b8b2e85 100644
--- a/examples/beta_bernoulli.py
+++ b/examples/beta_bernoulli.py
@@ -9,44 +9,65 @@
 import numpy as np
 import tensorflow as tf
 
-from edward.models import Bernoulli, Beta, Empirical
+from edward.models import Bernoulli, Beta
+
+
+def model():
+  p = Beta(1.0, 1.0, name="p")
+  x = Bernoulli(probs=p, sample_shape=10, name="x")
+  return x
+
+
+def proposal(p):
+  proposal_p = Beta(3.0, 9.0, name="proposal/p")
+  return proposal_p
 
 
 def main(_):
-  ed.set_seed(42)
+  tf.set_random_seed(42)
 
-  # DATA
   x_data = np.array([0, 1, 0, 0, 0, 0, 0, 0, 0, 1])
 
-  # MODEL
-  p = Beta(1.0, 1.0)
-  x = Bernoulli(probs=p, sample_shape=10)
+  qp = tf.get_variable("qp", initializer=0.5)
+  new_state, is_accepted, _, _ = ed.metropolis_hastings(
+      model, proposal,
+      current_state=qp,
+      align_latent=lambda name: {"p": "qp"}.get(name),
+      align_proposal=lambda name: {"p": "proposal/p"}.get(name),
+      align_data=lambda name: {"x": "x_data"}.get(name),
+      x_data=x_data)
+  qp_update = qp.assign(new_state)
 
-  # INFERENCE
-  qp = Empirical(params=tf.get_variable(
-      "qp/params", [1000], initializer=tf.constant_initializer(0.5)))
+  sess = tf.Session()
+  sess.run(tf.global_variables_initializer())
 
-  proposal_p = Beta(3.0, 9.0)
+  samples = []
+  num_accept = 0
+  for t in range(2500):
+    sample, accept = sess.run([qp_update, is_accepted])
+    samples.append(sample)
+    num_accept += float(accept)
+    if t % 100 == 0:
+      print("Step {}, Acceptance Rate {:.3}".format(t, num_accept / max(t, 1)))
 
-  inference = ed.MetropolisHastings({p: qp}, {p: proposal_p}, data={x: x_data})
-  inference.run()
+  samples = samples[500:]
 
-  # CRITICISM
   # exact posterior has mean 0.25 and std 0.12
-  sess = ed.get_session()
-  mean, stddev = sess.run([qp.mean(), qp.stddev()])
+  mean = np.mean(samples)
+  stddev = np.std(samples)
   print("Inferred posterior mean:")
   print(mean)
   print("Inferred posterior stddev:")
   print(stddev)
 
-  x_post = ed.copy(x, {p: qp})
-  tx_rep, tx = ed.ppc(
-      lambda xs, zs: tf.reduce_mean(tf.cast(xs[x_post], tf.float32)),
-      data={x_post: x_data})
-  ed.ppc_stat_hist_plot(
-      tx[0], tx_rep, stat_name=r'$T \equiv$mean', bins=10)
-  plt.show()
+  # TODO
+  # x_post = ed.copy(x, {p: qp})
+  # tx_rep, tx = ed.ppc(
+  #     lambda xs, zs: tf.reduce_mean(tf.cast(xs[x_post], tf.float32)),
+  #     data={x_post: x_data})
+  # ed.ppc_stat_hist_plot(
+  #     tx[0], tx_rep, stat_name=r'$T \equiv$mean', bins=10)
+  # plt.show()
 
 if __name__ == "__main__":
   tf.app.run()
diff --git a/examples/cox_process.py b/examples/cox_process.py
index 1a6f50dbb..c58539d20 100644
--- a/examples/cox_process.py
+++ b/examples/cox_process.py
@@ -25,7 +25,6 @@
 import tensorflow as tf
 
 from edward.models import MultivariateNormalTriL, Normal, Poisson
-from edward.util import rbf
 from scipy.stats import multivariate_normal, poisson
 
 tf.flags.DEFINE_integer("N", default=308, help="Number of NBA players.")
@@ -48,6 +47,62 @@ def build_toy_dataset(N, V):
   return x
 
 
+def rbf(X, X2=None, lengthscale=1.0, variance=1.0):
+  """Radial basis function kernel, also known as the squared
+  exponential or exponentiated quadratic. It is defined as
+
+  $k(x, x') = \sigma^2 \exp\Big(
+      -\\frac{1}{2} \sum_{d=1}^D \\frac{1}{\ell_d^2} (x_d - x'_d)^2 \Big)$
+
+  for output variance $\sigma^2$ and lengthscale $\ell$.
+
+  The kernel is evaluated over all pairs of rows, `k(X[i, ], X2[j, ])`.
+  If `X2` is not specified, then it evaluates over all pairs
+  of rows in `X`, `k(X[i, ], X[j, ])`. The output is a matrix
+  where each entry (i, j) is the kernel over the ith and jth rows.
+
+  Args:
+    X: tf.Tensor.
+      N x D matrix of N data points each with D features.
+    X2: tf.Tensor.
+      M x D matrix of M data points each with D features. Defaults to `X`.
+    lengthscale: tf.Tensor.
+      Lengthscale parameter, a positive scalar or D-dimensional vector.
+    variance: tf.Tensor.
+      Output variance parameter, a positive scalar.
+
+  #### Examples
+
+  ```python
+  X = tf.random_normal([100, 5])
+  K = rbf(X)
+  assert K.shape == (100, 100)
+  ```
+  """
+  lengthscale = tf.convert_to_tensor(lengthscale)
+  variance = tf.convert_to_tensor(variance)
+  dependencies = [tf.assert_positive(lengthscale),
+                  tf.assert_positive(variance)]
+  with tf.control_dependencies(dependencies):
+    lengthscale, variance = tf.identity(lengthscale), tf.identity(variance)
+
+  X = tf.convert_to_tensor(X)
+  X = X / lengthscale
+  Xs = tf.reduce_sum(tf.square(X), 1)
+  if X2 is None:
+    X2 = X
+    X2s = Xs
+  else:
+    X2 = tf.convert_to_tensor(X2)
+    X2 = X2 / lengthscale
+    X2s = tf.reduce_sum(tf.square(X2), 1)
+
+  square = tf.reshape(Xs, [-1, 1]) + tf.reshape(X2s, [1, -1]) - \
+      2 * tf.matmul(X, X2, transpose_b=True)
+  output = variance * tf.exp(-square / 2)
+  return output
+
+
 def main(_):
   ed.set_seed(42)
 
diff --git a/edward/models/dirichlet_process.py b/examples/dirichlet_process.py
similarity index 100%
rename from edward/models/dirichlet_process.py
rename to examples/dirichlet_process.py
diff --git a/examples/eager.py b/examples/eager.py
new file mode 100644
index 000000000..91c7a1ae5
--- /dev/null
+++ b/examples/eager.py
@@ -0,0 +1,72 @@
+"""Variational inference with `ed.klqp` under TensorFlow eager execution."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import edward as ed
+import numpy as np
+import tensorflow as tf
+
+from edward.models import Gamma, Normal
+
+import tensorflow.contrib.eager as tfe
+tfe.enable_eager_execution()
+
+
+def model():
+  """Draw 1000 Gamma observations `x` given a standard Normal latent `z`."""
+  z = Normal(loc=0., scale=1., name='z')
+  x = Gamma(tf.nn.softplus(z), 1., sample_shape=1000, name='x')
+  return x
+
+
+def variational():
+  """Normal approximation `qz` with trainable loc and softplus-ed scale."""
+  qz = Normal(loc=tf.get_variable("loc", shape=[]),
+              scale=tf.nn.softplus(tf.get_variable("scale", shape=[])),
+              name='qz')
+  return qz
+
+
+# Template so that every call reuses the same "loc" and "scale" variables.
+variational = tf.make_template("variational", variational)
+
+x_data = np.random.gamma(5.2, 1.2, size=1000).astype(np.float32)
+
+optimizer = tf.train.AdamOptimizer(1e-2)
+
+# Session-based variant of the same optimization, kept for reference:
+# loss, surrogate_loss = ed.klqp(
+#     model,
+#     variational,
+#     align_latent=lambda name: {'z': 'qz'}.get(name),
+#     align_data=lambda name: {'x': 'x'}.get(name),
+#     x=x_data)
+# grads_and_vars = optimizer.compute_gradients(surrogate_loss)
+# train_op = optimizer.apply_gradients(grads_and_vars)
+
+# sess = tf.Session()
+# sess.run(tf.global_variables_initializer())
+# for _ in range(2000):
+#   sess.run(train_op)
+
+
+def loss_fn(*args):
+  """Return the surrogate loss, i.e. the second output of `ed.klqp`."""
+  return ed.klqp(
+      model,
+      variational,
+      lambda name: {'z': 'qz'}.get(name),
+      lambda name: {'x': 'x'}.get(name),
+      *args)[1]
+
+
+value_and_gradients_fn = tfe.implicit_value_and_gradients(loss_fn)
+
+for _ in range(100):
+  loss, gradients_and_variables = value_and_gradients_fn(x_data)
+  optimizer.apply_gradients(gradients_and_variables)
+
+qz = variational()
+print("Posterior mean: {}".format(qz.loc))
+print("Posterior stddev: {}".format(qz.scale))
diff --git a/examples/iwvi.py b/examples/iwvi.py
index be199aae3..0df1a0447 100644
--- a/examples/iwvi.py
+++ b/examples/iwvi.py
@@ -105,7 +105,7 @@ def main(_):
   # MODEL
   X = tf.placeholder(tf.float32, [N, D])
   w = Normal(loc=tf.zeros(D), scale=tf.ones(D))
-  y = Bernoulli(logits=ed.dot(X, w))
+  y = Bernoulli(logits=tf.tensordot(X, w, [[1], [0]]))
 
   # INFERENCE
   qw = Normal(loc=tf.get_variable("qw/loc", [D]),
diff --git a/examples/lstm.py b/examples/lstm.py
index 1a7d89b5c..a5a01ea21 100644
--- a/examples/lstm.py
+++ b/examples/lstm.py
@@ -131,7 +131,7 @@ def language_model_gen(batch_size, vocab_size):
     x = tf.one_hot(x, depth=vocab_size, dtype=tf.float32)
     h, c = lstm_cell(x, h, c, name="lstm")
     logits = tf.layers.dense(h, vocab_size, name="dense")
-    x = Categorical(logits=logits).value()
+    x = Categorical(logits=logits).value
     xs.append(x)
 
   xs = tf.cast(tf.stack(xs, 1), tf.int32)
diff --git a/examples/normal_normal.py b/examples/normal_normal.py
index caad1ac31..4928a4138 100644
--- a/examples/normal_normal.py
+++ b/examples/normal_normal.py
@@ -8,38 +8,53 @@
 import numpy as np
 import tensorflow as tf
 
-from edward.models import Empirical, Normal
+from edward.models import Normal
+
+
+def model():
+  mu = Normal(loc=0.0, scale=1.0, name="mu")
+  x = Normal(loc=mu, scale=1.0, sample_shape=50, name="x")
+  return x
 
 
 def main(_):
-  ed.set_seed(42)
+  tf.set_random_seed(42)
 
-  # DATA
   x_data = np.array([0.0] * 50)
 
-  # MODEL: Normal-Normal with known variance
-  mu = Normal(loc=0.0, scale=1.0)
-  x = Normal(loc=mu, scale=1.0, sample_shape=50)
+  # analytic solution: N(loc=0.0, scale=\sqrt{1/51}=0.140)
+  qmu = tf.get_variable("qmu", [])
+  new_state, kernel_results = ed.hmc(
+      model,
+      step_size=0.2,
+      current_state=qmu,
+      align_latent=lambda name: {"mu": "qmu"}.get(name),
+      align_data=lambda name: {"x": "x"}.get(name),
+      x=x_data)
 
-  # INFERENCE
-  qmu = Empirical(params=tf.get_variable("qmu/params", [1000],
-                                         initializer=tf.zeros_initializer()))
+  qmu_update = qmu.assign(new_state)
 
-  # analytic solution: N(loc=0.0, scale=\sqrt{1/51}=0.140)
-  inference = ed.HMC({mu: qmu}, data={x: x_data})
-  inference.run()
+  sess = tf.Session()
+  sess.run(tf.global_variables_initializer())
 
-  # CRITICISM
-  sess = ed.get_session()
-  mean, stddev = sess.run([qmu.mean(), qmu.stddev()])
+  samples = []
+  num_accept = 0
+  for t in range(2500):
+    sample, accept = sess.run([qmu_update, kernel_results.is_accepted])
+    samples.append(sample)
+    num_accept += float(accept)
+    if t % 100 == 0:
+      print("Step {}, Acceptance Rate {:.3}".format(t, num_accept / (t + 1)))
+
+  samples = samples[500:]
+
+  mean = np.mean(samples)
+  stddev = np.std(samples)
   print("Inferred posterior mean:")
   print(mean)
   print("Inferred posterior stddev:")
   print(stddev)
 
-  # Check convergence with visual diagnostics.
-  samples = sess.run(qmu.params)
-
   # Plot histogram.
   plt.hist(samples, bins='auto')
   plt.show()
diff --git a/examples/normal_normal_eager.py b/examples/normal_normal_eager.py
new file mode 100644
index 000000000..316d74ba6
--- /dev/null
+++ b/examples/normal_normal_eager.py
@@ -0,0 +1,54 @@
+"""Normal-normal model using variational inference (`ed.klqp`)."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import edward as ed
+import numpy as np
+import tensorflow as tf
+
+from edward.models import Normal
+
+
+def model():
+  """Normal-Normal with known variance."""
+  loc = Normal(loc=0.0, scale=1.0, name="loc")
+  x = Normal(loc=loc, scale=1.0, sample_shape=50, name="x")
+  return x
+
+
+def variational():
+  qloc = Normal(loc=tf.get_variable("loc", []),
+                scale=tf.nn.softplus(tf.get_variable("scale", [])),
+                name="qloc")
+  return qloc
+
+
+variational = tf.make_template("variational", variational)
+
+tf.set_random_seed(42)
+x_data = np.array([0.0] * 50)
+
+# analytic solution: N(loc=0.0, scale=\sqrt{1/51}=0.140)
+loss, surrogate_loss = ed.klqp(
+    model,
+    variational,
+    align_latent=lambda name: 'qloc' if name == 'loc' else None,
+    align_data=lambda name: 'x_data' if name == 'x' else None,
+    x_data=x_data)
+
+optimizer = tf.train.AdamOptimizer(1e-2)
+grads_and_vars = optimizer.compute_gradients(surrogate_loss)
+train_op = optimizer.apply_gradients(grads_and_vars)
+
+qloc = variational()
+sess = tf.Session()
+
+sess.run(tf.global_variables_initializer())
+for t in range(1, 5001):
+  loss_val, _ = sess.run([loss, train_op])
+  if t % 50 == 0:
+    mean, stddev = sess.run([qloc.mean(), qloc.stddev()])
+    print({"Loss": loss_val,
+           "Posterior mean": mean,
+           "Posterior stddev": stddev})
diff --git a/examples/normal_sgld.py b/examples/normal_sgld.py
index 7ea554391..fb226acfa 100644
--- a/examples/normal_sgld.py
+++ b/examples/normal_sgld.py
@@ -6,28 +6,52 @@
 from __future__ import print_function
 
 import edward as ed
+import numpy as np
 import tensorflow as tf
 
-from edward.models import Empirical, MultivariateNormalTriL
+from edward.models import MultivariateNormalTriL
 
 
-def main(_):
-  ed.set_seed(42)
-
-  # MODEL
+def model():
   z = MultivariateNormalTriL(
       loc=tf.ones(2),
-      scale_tril=tf.cholesky(tf.constant([[1.0, 0.8], [0.8, 1.0]])))
+      scale_tril=tf.cholesky(tf.constant([[1.0, 0.8], [0.8, 1.0]])),
+      name="z")
+  return z
 
-  # INFERENCE
-  qz = Empirical(params=tf.get_variable("qz/params", [2000, 2]))
 
-  inference = ed.SGLD({z: qz})
-  inference.run(step_size=5.0)
-
-  # CRITICISM
-  sess = ed.get_session()
-  mean, stddev = sess.run([qz.mean(), qz.stddev()])
+def main(_):
+  tf.set_random_seed(42)
+
+  qz = tf.get_variable("qz", [2])
+  counter = tf.get_variable("counter", initializer=0.)
+  qz_mom = tf.get_variable("qz_mom", [2], initializer=tf.zeros_initializer())
+  # TODO what's up with the samples?
+  new_state, new_counter, new_momentum = ed.sgld(
+      model,
+      state=qz,
+      counter=counter,
+      momentum=qz_mom,
+      learning_rate=1e-3,
+      align_latent=lambda name: "qz" if name == "z" else None,
+      align_data=lambda name: None)
+  qz_update = qz.assign(new_state)
+  counter_update = counter.assign(new_counter)
+  qz_mom_update = qz_mom.assign(new_momentum)
+
+  sess = tf.Session()
+  sess.run(tf.global_variables_initializer())
+  samples = []
+  for t in range(2500):
+    sample, _, _ = sess.run([qz_update, counter_update, qz_mom_update])
+    samples.append(sample)
+    if t % 100 == 0:
+      print("Step {}".format(t))
+
+  samples = samples[500:]
+
+  mean = np.mean(samples, axis=0)
+  stddev = np.std(samples, axis=0)
   print("Inferred posterior mean:")
   print(mean)
   print("Inferred posterior stddev:")
diff --git a/edward/models/param_mixture.py b/examples/param_mixture.py
similarity index 99%
rename from edward/models/param_mixture.py
rename to examples/param_mixture.py
index 4f2f3f5b1..64cd2689d 100644
--- a/edward/models/param_mixture.py
+++ b/examples/param_mixture.py
@@ -143,7 +143,7 @@ def __init__(self,
         validate_args=validate_args,
         allow_nan_stats=allow_nan_stats,
         parameters=parameters,
-        graph_parents=[self._cat.value(), self._components.value()],
+        graph_parents=[self._cat.value, self._components.value],
         name=name)
 
   @property
diff --git a/notebooks/batch_training.ipynb b/notebooks/batch_training.ipynb
index 3320b24bd..5a25d40a1 100644
--- a/notebooks/batch_training.ipynb
+++ b/notebooks/batch_training.ipynb
@@ -167,7 +167,7 @@
     "\n",
     "w = Normal(loc=tf.zeros(D), scale=tf.ones(D))\n",
     "b = Normal(loc=tf.zeros(1), scale=tf.ones(1))\n",
-    "y = Normal(loc=ed.dot(X, w) + b, scale=1.0)"
+    "y = Normal(loc=tf.tensordot(X, w, [[1], [0]]) + b, scale=1.0)"
    ]
   },
   {
@@ -334,7 +334,7 @@
    "source": [
     "y_post = ed.copy(y, {w: qw, b: qb})\n",
     "# This is equivalent to\n",
-    "# y_post = Normal(loc=ed.dot(X, qw) + qb, scale=tf.ones(N))"
+    "# y_post = Normal(loc=tf.tensordot(X, qw, [[1], [0]]) + qb, scale=tf.ones(N))"
    ]
   },
   {
diff --git a/notebooks/supervised_classification.ipynb b/notebooks/supervised_classification.ipynb
index 5c78c0da2..d4ba9c599 100644
--- a/notebooks/supervised_classification.ipynb
+++ b/notebooks/supervised_classification.ipynb
@@ -31,10 +31,73 @@
     "import tensorflow as tf\n",
     "\n",
     "from edward.models import Bernoulli, MultivariateNormalTriL, Normal\n",
-    "from edward.util import rbf\n",
     "from observations import crabs"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "def rbf(X, X2=None, lengthscale=1.0, variance=1.0):\n",
+    "  \"\"\"Radial basis function kernel, also known as the squared\n",
+    "  exponential or exponentiated quadratic. It is defined as\n",
+    "\n",
+    "  $k(x, x') = \\sigma^2 \\exp\\Big(\n",
+    "      -\\\\frac{1}{2} \\sum_{d=1}^D \\\\frac{1}{\\ell_d^2} (x_d - x'_d)^2 \\Big)$\n",
+    "\n",
+    "  for output variance $\\sigma^2$ and lengthscale $\\ell$.\n",
+    "\n",
+    "  The kernel is evaluated over all pairs of rows, `k(X[i, ], X2[j, ])`.\n",
+    "  If `X2` is not specified, then it evaluates over all pairs\n",
+    "  of rows in `X`, `k(X[i, ], X[j, ])`. The output is a matrix\n",
+    "  where each entry (i, j) is the kernel over the ith and jth rows.\n",
+    "\n",
+    "  Args:\n",
+    "    X: tf.Tensor.\n",
+    "      N x D matrix of N data points each with D features.\n",
+    "    X2: tf.Tensor.\n",
+    "      M x D matrix of M data points each with D features. Defaults to `X`.\n",
+    "    lengthscale: tf.Tensor.\n",
+    "      Lengthscale parameter, a positive scalar or D-dimensional vector.\n",
+    "    variance: tf.Tensor.\n",
+    "      Output variance parameter, a positive scalar.\n",
+    "\n",
+    "  #### Examples\n",
+    "\n",
+    "  ```python\n",
+    "  X = tf.random_normal([100, 5])\n",
+    "  K = rbf(X)\n",
+    "  assert K.shape == (100, 100)\n",
+    "  ```\n",
+    "  \"\"\"\n",
+    "  lengthscale = tf.convert_to_tensor(lengthscale)\n",
+    "  variance = tf.convert_to_tensor(variance)\n",
+    "  dependencies = [tf.assert_positive(lengthscale),\n",
+    "                  tf.assert_positive(variance)]\n",
+    "  with tf.control_dependencies(dependencies):\n",
+    "    lengthscale, variance = tf.identity(lengthscale), tf.identity(variance)\n",
+    "\n",
+    "  X = tf.convert_to_tensor(X)\n",
+    "  X = X / lengthscale\n",
+    "  Xs = tf.reduce_sum(tf.square(X), 1)\n",
+    "  if X2 is None:\n",
+    "    X2 = X\n",
+    "    X2s = Xs\n",
+    "  else:\n",
+    "    X2 = tf.convert_to_tensor(X2)\n",
+    "    X2 = X2 / lengthscale\n",
+    "    X2s = tf.reduce_sum(tf.square(X2), 1)\n",
+    "\n",
+    "  square = tf.reshape(Xs, [-1, 1]) + tf.reshape(X2s, [1, -1]) - \\\n",
+    "      2 * tf.matmul(X, X2, transpose_b=True)\n",
+    "  output = variance * tf.exp(-square / 2)\n",
+    "  return output"
+   ]
+  },
   {
    "cell_type": "markdown",
    "metadata": {},
diff --git a/notebooks/supervised_regression.ipynb b/notebooks/supervised_regression.ipynb
index 0ad68bee2..bf4bf8f31 100644
--- a/notebooks/supervised_regression.ipynb
+++ b/notebooks/supervised_regression.ipynb
@@ -124,7 +124,7 @@
     "X = tf.placeholder(tf.float32, [N, D])\n",
     "w = Normal(loc=tf.zeros(D), scale=tf.ones(D))\n",
     "b = Normal(loc=tf.zeros(1), scale=tf.ones(1))\n",
-    "y = Normal(loc=ed.dot(X, w) + b, scale=tf.ones(N))"
+    "y = Normal(loc=tf.tensordot(X, w, [[1], [0]]) + b, scale=tf.ones(N))"
    ]
   },
   {
@@ -217,7 +217,7 @@
    "source": [
     "y_post = ed.copy(y, {w: qw, b: qb})\n",
     "# This is equivalent to\n",
-    "# y_post = Normal(loc=ed.dot(X, qw) + qb, scale=tf.ones(N))"
+    "# y_post = Normal(loc=tf.tensordot(X, qw, [[1], [0]]) + qb, scale=tf.ones(N))"
    ]
   },
   {
diff --git a/notebooks/tensorboard.ipynb b/notebooks/tensorboard.ipynb
index c60aa3207..e32633ced 100644
--- a/notebooks/tensorboard.ipynb
+++ b/notebooks/tensorboard.ipynb
@@ -157,7 +157,7 @@
     "  b = Normal(loc=tf.zeros(1, name=\"bias/loc\"),\n",
     "             scale=tf.ones(1, name=\"bias/scale\"),\n",
     "             name=\"bias\")\n",
-    "  y = Normal(loc=ed.dot(X, w) + b,\n",
+    "  y = Normal(loc=tf.tensordot(X, w, [[1], [0]]) + b,\n",
     "             scale=tf.ones(N, name=\"y/scale\"),\n",
     "             name=\"y\")"
    ]
diff --git a/setup.py b/setup.py
index 45bd5742f..76e95542b 100644
--- a/setup.py
+++ b/setup.py
@@ -15,9 +15,8 @@
     install_requires=['numpy>=1.7',
                       'six>=1.10.0'],
     extras_require={
-        'tensorflow': ['tensorflow>=1.2.0rc0'],
-        'tensorflow with gpu': ['tensorflow-gpu>=1.2.0rc0'],
-        'neural networks': ['keras>=2.0.0', 'prettytensor>=0.7.4'],
+        'tensorflow': ['tensorflow>=1.6.0'],
+        'tensorflow with gpu': ['tensorflow-gpu>=1.6.0'],
         'datasets': ['observations>=0.1.2'],
         'notebooks': ['jupyter>=1.0.0'],
         'visualization': ['matplotlib>=1.3',
diff --git a/tests/criticisms/evaluate_test.py b/tests/criticisms/evaluate_test.py
deleted file mode 100644
index f12e90b43..000000000
--- a/tests/criticisms/evaluate_test.py
+++ /dev/null
@@ -1,188 +0,0 @@
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import edward as ed
-import numpy as np
-import tensorflow as tf
-
-from edward.models import Bernoulli, Categorical, Multinomial, Normal
-
-
-class test_evaluate_class(tf.test.TestCase):
-
-  RANDOM_SEED = 12345
-
-  def test_metrics(self):
-    with self.test_session():
-      x = Normal(loc=0.0, scale=1.0)
-      x_data = tf.constant(0.0)
-      ed.evaluate('mean_squared_error', {x: x_data}, n_samples=1)
-      ed.evaluate(['mean_squared_error'], {x: x_data}, n_samples=1)
-      ed.evaluate(['mean_squared_error', 'mean_absolute_error'],
-                  {x: x_data}, n_samples=1)
-      self.assertRaises(TypeError, ed.evaluate, x, {x: x_data}, n_samples=1)
-      self.assertRaises(NotImplementedError, ed.evaluate, 'hello world',
-                        {x: x_data}, n_samples=1)
-
-  def test_metrics_classification(self):
-    with self.test_session():
-      x = Bernoulli(probs=0.51)
-      x_data = tf.constant(1)
-      self.assertAllClose(
-          1.0,
-          ed.evaluate('binary_accuracy', {x: x_data}, n_samples=1))
-      x = Bernoulli(probs=0.51, sample_shape=5)
-      x_data = tf.constant([1, 1, 1, 0, 0])
-      self.assertAllClose(
-          0.6,
-          ed.evaluate('binary_accuracy', {x: x_data}, n_samples=1))
-      x = Bernoulli(probs=tf.constant([0.51, 0.49, 0.49]))
-      x_data = tf.constant([1, 0, 1])
-      self.assertAllClose(
-          2.0 / 3,
-          ed.evaluate('binary_accuracy', {x: x_data}, n_samples=1))
-
-      x = Categorical(probs=tf.constant([0.48, 0.51, 0.01]))
-      x_data = tf.constant(1)
-      self.assertAllClose(
-          1.0,
-          ed.evaluate('sparse_categorical_accuracy', {x: x_data}, n_samples=1))
-      x = Categorical(probs=tf.constant([0.48, 0.51, 0.01]), sample_shape=5)
-      x_data = tf.constant([1, 1, 1, 0, 2])
-      self.assertAllClose(
-          0.6,
-          ed.evaluate('sparse_categorical_accuracy', {x: x_data}, n_samples=1))
-      x = Categorical(
-          probs=tf.constant([[0.48, 0.51, 0.01], [0.51, 0.48, 0.01]]))
-      x_data = tf.constant([1, 2])
-      self.assertAllClose(
-          0.5,
-          ed.evaluate('sparse_categorical_accuracy', {x: x_data}, n_samples=1))
-
-      x = Multinomial(total_count=1.0, probs=tf.constant([0.48, 0.51, 0.01]))
-      x_data = tf.constant([0, 1, 0], dtype=x.dtype.as_numpy_dtype)
-      self.assertAllClose(
-          1.0,
-          ed.evaluate('categorical_accuracy', {x: x_data}, n_samples=1))
-      x = Multinomial(total_count=1.0, probs=tf.constant([0.48, 0.51, 0.01]),
-                      sample_shape=5)
-      x_data = tf.constant(
-          [[0, 1, 0], [0, 1, 0], [0, 1, 0], [1, 0, 0], [0, 0, 1]],
-          dtype=x.dtype.as_numpy_dtype)
-      self.assertAllClose(
-          0.6,
-          ed.evaluate('categorical_accuracy', {x: x_data}, n_samples=1))
-
-      x = Multinomial(total_count=5.0, probs=tf.constant([0.4, 0.6, 0.0]))
-      x_data = tf.constant([2, 3, 0], dtype=x.dtype.as_numpy_dtype)
-      self.assertAllClose(
-          1.0,
-          ed.evaluate('multinomial_accuracy', {x: x_data}, n_samples=1))
-
-  def test_metrics_with_binary_averaging(self):
-    x = Multinomial(total_count=10.0, probs=tf.constant([0.2, 0.7, 0.1]))
-    x_data = tf.constant([5, 4, 1], dtype=x.dtype.as_numpy_dtype)
-    self.assertAllEqual(
-        np.array([9.0, 4.0, 1.0], dtype=np.float32),
-        ed.evaluate([('mean_squared_error', {'average': None})],
-                    {x: x_data}, n_samples=1, seed=self.RANDOM_SEED))
-    x = Multinomial(total_count=10.0, probs=tf.constant([0.2, 0.7, 0.1]))
-    x_data = tf.constant([5, 4, 1], dtype=x.dtype.as_numpy_dtype)
-    self.assertAllClose(
-        4.6666665,
-        ed.evaluate([('mean_squared_error', {'average': 'macro'})],
-                    {x: x_data}, n_samples=1, seed=self.RANDOM_SEED))
-    x = Multinomial(total_count=10.0, probs=tf.constant([0.2, 0.7, 0.1]))
-    x_data = tf.constant([5, 4, 1], dtype=x.dtype.as_numpy_dtype)
-    self.assertAllClose(
-        4.6666665,
-        ed.evaluate([('mean_squared_error', {'average': 'micro'})],
-                    {x: x_data}, n_samples=1, seed=self.RANDOM_SEED))
-
-    x = Multinomial(total_count=10.0, probs=tf.constant([0.2, 0.7, 0.1]),
-                    sample_shape=5)
-    x_data = tf.constant(
-        [[2, 7, 1], [3, 6, 1], [3, 5, 2], [4, 4, 2], [2, 7, 1]],
-        dtype=x.dtype.as_numpy_dtype)
-    self.assertAllEqual(
-        np.array([1.2, 1.4, 0.6], dtype=np.float32),
-        ed.evaluate([('mean_squared_error', {'average': None})],
-                    {x: x_data}, n_samples=1, seed=self.RANDOM_SEED))
-    x = Multinomial(total_count=10.0, probs=tf.constant([0.2, 0.7, 0.1]),
-                    sample_shape=5)
-    x_data = tf.constant(
-        [[2, 7, 1], [3, 6, 1], [3, 5, 2], [4, 4, 2], [2, 7, 1]],
-        dtype=x.dtype.as_numpy_dtype)
-    self.assertAllClose(
-        1.066666603088379,
-        ed.evaluate([('mean_squared_error', {'average': 'macro'})],
-                    {x: x_data}, n_samples=1, seed=self.RANDOM_SEED))
-    x = Multinomial(total_count=10.0, probs=tf.constant([0.2, 0.7, 0.1]),
-                    sample_shape=5)
-    x_data = tf.constant(
-        [[2, 7, 1], [3, 6, 1], [3, 5, 2], [4, 4, 2], [2, 7, 1]],
-        dtype=x.dtype.as_numpy_dtype)
-    self.assertAllClose(
-        1.0666667222976685,
-        ed.evaluate([('mean_squared_error', {'average': 'micro'})],
-                    {x: x_data}, n_samples=1, seed=self.RANDOM_SEED))
-
-  def test_data(self):
-    with self.test_session():
-      x_ph = tf.placeholder(tf.float32, [])
-      x = Normal(loc=x_ph, scale=1.0)
-      y = 2.0 * Normal(loc=0.0, scale=1.0)
-      x_data = tf.constant(0.0)
-      x_ph_data = np.array(0.0)
-      y_data = tf.constant(20.0)
-      ed.evaluate('mean_squared_error', {x: x_data, x_ph: x_ph_data},
-                  n_samples=1)
-      ed.evaluate('mean_squared_error', {y: y_data}, n_samples=1)
-      self.assertRaises(TypeError, ed.evaluate, 'mean_squared_error',
-                        {'y': y_data}, n_samples=1)
-
-  def test_n_samples(self):
-    with self.test_session():
-      x = Normal(loc=0.0, scale=1.0)
-      x_data = tf.constant(0.0)
-      ed.evaluate('mean_squared_error', {x: x_data}, n_samples=1)
-      ed.evaluate('mean_squared_error', {x: x_data}, n_samples=5)
-      self.assertRaises(TypeError, ed.evaluate, 'mean_squared_error',
-                        {x: x_data}, n_samples='1')
-
-  def test_output_key(self):
-    with self.test_session():
-      x_ph = tf.placeholder(tf.float32, [])
-      x = Normal(loc=x_ph, scale=1.0)
-      y = 2.0 * x
-      x_data = tf.constant(0.0)
-      x_ph_data = np.array(0.0)
-      y_data = tf.constant(20.0)
-      ed.evaluate('mean_squared_error', {x: x_data, x_ph: x_ph_data},
-                  n_samples=1)
-      ed.evaluate('mean_squared_error', {y: y_data, x_ph: x_ph_data},
-                  n_samples=1)
-      ed.evaluate('mean_squared_error', {x: x_data, y: y_data, x_ph: x_ph_data},
-                  n_samples=1, output_key=x)
-      self.assertRaises(KeyError, ed.evaluate, 'mean_squared_error',
-                        {x: x_data, y: y_data, x_ph: x_ph_data}, n_samples=1)
-      self.assertRaises(TypeError, ed.evaluate, 'mean_squared_error',
-                        {x: x_data, y: y_data, x_ph: x_ph_data}, n_samples=1,
-                        output_key='x')
-
-  def test_custom_metric(self):
-    def logcosh(y_true, y_pred):
-      diff = y_pred - y_true
-      return tf.reduce_mean(diff + tf.nn.softplus(-2.0 * diff) - tf.log(2.0),
-                            axis=-1)
-    with self.test_session():
-      x = Normal(loc=0.0, scale=1.0)
-      x_data = tf.constant(0.0)
-      ed.evaluate(logcosh, {x: x_data}, n_samples=1)
-      ed.evaluate(['mean_squared_error', logcosh], {x: x_data}, n_samples=1)
-      self.assertRaises(NotImplementedError, ed.evaluate, 'logcosh',
-                        {x: x_data}, n_samples=1)
-
-if __name__ == '__main__':
-  tf.test.main()
diff --git a/tests/criticisms/metrics_test.py b/tests/criticisms/metrics_test.py
deleted file mode 100644
index 452e357e8..000000000
--- a/tests/criticisms/metrics_test.py
+++ /dev/null
@@ -1,114 +0,0 @@
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import numpy as np
-import tensorflow as tf
-
-from edward.criticisms.evaluate import *
-
-all_classification_metrics = [
-    binary_accuracy,
-    sparse_categorical_accuracy,
-]
-
-all_real_classification_metrics = [
-    binary_crossentropy,
-    categorical_crossentropy,
-    hinge,
-    squared_hinge,
-]
-
-all_regression_metrics = [
-    mean_squared_error,
-    mean_absolute_error,
-    mean_absolute_percentage_error,
-    mean_squared_logarithmic_error,
-    poisson,
-    cosine_proximity,
-]
-
-all_specialized_input_output_metrics = [
-    categorical_accuracy,
-    sparse_categorical_crossentropy,
-    kl_divergence
-]
-
-all_metrics_with_binary_averaging = [
-    mean_squared_error,
-    mean_absolute_error,
-    mean_absolute_percentage_error,
-    mean_squared_logarithmic_error
-]
-
-
-class test_metrics_class(tf.test.TestCase):
-
-  def _check_averaging(self, metric, y_true, y_pred):
-    n_classes = tf.squeeze(tf.shape(y_true)[-1]).eval()
-    class_scores = [metric(y_true[i], y_pred[i]) for i in range(n_classes)]
-
-    # No averaging
-    no_average = metric(y_true, y_pred, average=None)
-    expected_no_average = tf.stack(class_scores)
-    self.assertAllEqual(no_average.eval(), expected_no_average.eval())
-
-    # Macro-averaging
-    macro_average = metric(y_true, y_pred, average='macro')
-    expected_macro_average = tf.reduce_mean(tf.stack(class_scores))
-    self.assertAllEqual(macro_average.eval(), expected_macro_average.eval())
-
-    # Micro-averaging
-    micro_average = metric(y_true, y_pred, average='micro')
-    expected_micro_average = metric(tf.reshape(y_true, [1, -1]),
-                                    tf.reshape(y_pred, [1, -1]))
-    self.assertAllEqual(micro_average.eval(), expected_micro_average.eval())
-
-  def test_classification_metrics(self):
-    with self.test_session():
-      y_true = tf.convert_to_tensor(np.random.randint(0, 1, (2, 3)))
-      y_pred = tf.convert_to_tensor(np.random.randint(0, 1, (2, 3)))
-      for metric in all_classification_metrics:
-        self.assertEqual(metric(y_true, y_pred).eval().shape, ())
-
-  def test_real_classification_metrics(self):
-    with self.test_session():
-      y_true = tf.convert_to_tensor(np.random.randint(0, 5, (6, 7)))
-      y_pred = tf.random_normal([6, 7])
-      for metric in all_real_classification_metrics:
-        self.assertEqual(metric(y_true, y_pred).eval().shape, ())
-
-  def test_regression_metrics(self):
-    with self.test_session():
-      y_true = tf.random_normal([6, 7])
-      y_pred = tf.random_normal([6, 7])
-      for metric in all_regression_metrics:
-        self.assertEqual(metric(y_true, y_pred).eval().shape, ())
-
-  def test_specialized_input_output_metrics(self):
-    with self.test_session():
-      for metric in all_specialized_input_output_metrics:
-        if metric == categorical_accuracy:
-          y_true = tf.convert_to_tensor(np.random.randint(0, 1, (6, 7)))
-          y_pred = tf.convert_to_tensor(np.random.randint(0, 7, (6,)))
-          self.assertEqual(metric(y_true, y_pred).eval().shape, ())
-        elif metric == sparse_categorical_crossentropy:
-          y_true = tf.convert_to_tensor(np.random.randint(0, 5, (6)))
-          y_pred = tf.random_normal([6, 7])
-          self.assertEqual(metric(y_true, y_pred).eval().shape, ())
-        elif metric == kl_divergence:
-          y_true = tf.nn.softmax(tf.random_normal([6]))
-          y_pred = tf.nn.softmax(tf.random_normal([6]))
-          self.assertEqual(metric(y_true, y_pred).eval().shape, ())
-        else:
-          raise NotImplementedError()
-
-  def test_metrics_with_binary_averaging(self):
-    with self.test_session():
-      y_true = tf.constant([[1.0, 2.0, 3.0], [2.0, 3.0, 4.0], [3.0, 4.0, 5.0]])
-      y_pred = tf.constant([[2.0, 4.0, 6.0], [4.0, 6.0, 8.0], [6.0, 8.0, 10.0]])
-      for metric in all_metrics_with_binary_averaging:
-        self._check_averaging(metric, y_true, y_pred)
-
-if __name__ == '__main__':
-  tf.test.main()
diff --git a/tests/criticisms/ppc_plots_test.py b/tests/criticisms/ppc_plots_test.py
deleted file mode 100644
index bf76bc12d..000000000
--- a/tests/criticisms/ppc_plots_test.py
+++ /dev/null
@@ -1,25 +0,0 @@
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import edward as ed
-import numpy as np
-import tensorflow as tf
-
-
-class test_ppc_plots_class(tf.test.TestCase):
-
-  def test_ppc_density_plot(self):
-    y = np.random.randn(20)
-    y_rep = np.random.randn(20, 20)
-
-    ed.ppc_density_plot(y, y_rep)
-
-  def test_ppc_stat_hist_plot(self):
-    y = np.random.randn(20)
-    t = 0.0
-
-    ed.ppc_stat_hist_plot(t, y, stat_name="mean", bins=10)
-
-if __name__ == '__main__':
-  tf.test.main()
diff --git a/tests/criticisms/ppc_test.py b/tests/criticisms/ppc_test.py
deleted file mode 100644
index d06ba0153..000000000
--- a/tests/criticisms/ppc_test.py
+++ /dev/null
@@ -1,52 +0,0 @@
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import edward as ed
-import tensorflow as tf
-
-from edward.models import Normal
-
-
-class test_ppc_class(tf.test.TestCase):
-
-  def test_data(self):
-    with self.test_session():
-      x = Normal(loc=0.0, scale=1.0)
-      y = 2.0 * x
-      x_data = tf.constant(0.0)
-      y_data = tf.constant(0.0)
-      ed.ppc(lambda xs, zs: tf.reduce_mean(xs[x]), {x: x_data}, n_samples=1)
-      ed.ppc(lambda xs, zs: tf.reduce_mean(xs[y]), {y: y_data}, n_samples=1)
-      self.assertRaises(TypeError, ed.ppc, lambda xs, zs: tf.reduce_mean(xs[y]),
-                        {'y': y_data}, n_samples=1)
-
-  def test_latent_vars(self):
-    with self.test_session():
-      x = Normal(loc=0.0, scale=1.0)
-      y = 2.0 * x
-      z = Normal(loc=0.0, scale=1.0)
-      x_data = tf.constant(0.0)
-      y_data = tf.constant(0.0)
-      ed.ppc(lambda xs, zs: tf.reduce_mean(xs[x]) + tf.reduce_mean(zs[z]),
-             {x: x_data}, {z: z}, n_samples=1)
-      ed.ppc(lambda xs, zs: tf.reduce_mean(xs[x]) + tf.reduce_mean(zs[z]),
-             {x: x_data}, {z: y}, n_samples=1)
-      ed.ppc(lambda xs, zs: tf.reduce_mean(xs[x]) + tf.reduce_mean(zs[y]),
-             {x: x_data}, {y: y}, n_samples=1)
-      ed.ppc(lambda xs, zs: tf.reduce_mean(xs[x]) + tf.reduce_mean(zs[y]),
-             {x: x_data}, {y: z}, n_samples=1)
-      self.assertRaises(TypeError, ed.ppc, lambda xs, zs: tf.reduce_mean(xs[x]),
-                        {x: x_data}, {'y': z}, n_samples=1)
-
-  def test_n_samples(self):
-    with self.test_session():
-      x = Normal(loc=0.0, scale=1.0)
-      x_data = tf.constant(0.0)
-      ed.ppc(lambda xs, zs: tf.reduce_mean(xs[x]), {x: x_data}, n_samples=1)
-      ed.ppc(lambda xs, zs: tf.reduce_mean(xs[x]), {x: x_data}, n_samples=5)
-      self.assertRaises(TypeError, ed.ppc, lambda xs, zs: tf.reduce_mean(xs[x]),
-                        {x: x_data}, n_samples='1')
-
-if __name__ == '__main__':
-  tf.test.main()
diff --git a/tests/util/get_control_variate_coef_test.py b/tests/inferences/get_control_variate_coef_test.py
similarity index 89%
rename from tests/util/get_control_variate_coef_test.py
rename to tests/inferences/get_control_variate_coef_test.py
index 5138f20e8..a502b7b28 100644
--- a/tests/util/get_control_variate_coef_test.py
+++ b/tests/inferences/get_control_variate_coef_test.py
@@ -4,7 +4,7 @@
 
 import tensorflow as tf
 
-from edward.util.tensorflow import get_control_variate_coef
+from edward.inferences.util import get_control_variate_coef
 
 
 class test_get_control_variate_coef(tf.test.TestCase):
diff --git a/tests/util/transform_test.py b/tests/inferences/transform_test.py
similarity index 89%
rename from tests/util/transform_test.py
rename to tests/inferences/transform_test.py
index f541f5008..903f0bd81 100644
--- a/tests/util/transform_test.py
+++ b/tests/inferences/transform_test.py
@@ -7,6 +7,7 @@
 import tensorflow as tf
 
 from collections import namedtuple
+from edward.inferences.util import transform
 from edward.models import (
     Beta, Dirichlet, DirichletProcess, Gamma, MultivariateNormalDiag,
     Normal, Poisson, TransformedDistribution)
@@ -24,21 +25,21 @@ def assertSamplePosNeg(self, sample):
   def test_args(self):
     with self.test_session():
       x = Normal(-100.0, 1.0)
-      y = ed.transform(x, bijectors.Softplus())
+      y = transform(x, bijectors.Softplus())
       sample = y.sample(10).eval()
       self.assertTrue((sample >= 0.0).all())
 
   def test_kwargs(self):
     with self.test_session():
       x = Normal(-100.0, 1.0)
-      y = ed.transform(x, bijector=bijectors.Softplus())
+      y = transform(x, bijector=bijectors.Softplus())
       sample = y.sample(10).eval()
       self.assertTrue((sample >= 0.0).all())
 
   def test_01(self):
     with self.test_session():
       x = Beta(1.0, 1.0)
-      y = ed.transform(x)
+      y = transform(x)
       self.assertIsInstance(y, TransformedDistribution)
       sample = y.sample(10, seed=1).eval()
       self.assertSamplePosNeg(sample)
@@ -46,7 +47,7 @@ def test_01(self):
   def test_nonnegative(self):
     with self.test_session():
       x = Gamma(1.0, 1.0)
-      y = ed.transform(x)
+      y = transform(x)
       self.assertIsInstance(y, TransformedDistribution)
       sample = y.sample(10, seed=1).eval()
       self.assertSamplePosNeg(sample)
@@ -54,7 +55,7 @@ def test_nonnegative(self):
   def test_simplex(self):
     with self.test_session():
       x = Dirichlet([1.1, 1.2, 1.3, 1.4])
-      y = ed.transform(x)
+      y = transform(x)
       self.assertIsInstance(y, TransformedDistribution)
       sample = y.sample(10, seed=1).eval()
       self.assertSamplePosNeg(sample)
@@ -62,7 +63,7 @@ def test_simplex(self):
   def test_real(self):
     with self.test_session():
       x = Normal(0.0, 1.0)
-      y = ed.transform(x)
+      y = transform(x)
       self.assertIsInstance(y, Normal)
       sample = y.sample(10, seed=1).eval()
       self.assertSamplePosNeg(sample)
@@ -70,7 +71,7 @@ def test_real(self):
   def test_multivariate_real(self):
     with self.test_session():
       x = MultivariateNormalDiag(tf.zeros(2), tf.ones(2))
-      y = ed.transform(x)
+      y = transform(x)
       sample = y.sample(10, seed=1).eval()
       self.assertSamplePosNeg(sample)
 
@@ -78,14 +79,14 @@ def test_no_support(self):
     with self.test_session():
       x = DirichletProcess(1.0, Normal(0.0, 1.0))
       with self.assertRaises(AttributeError):
-        y = ed.transform(x)
+        y = transform(x)
 
   def test_unhandled_support(self):
     with self.test_session():
       FakeRV = namedtuple('FakeRV', ['support'])
       x = FakeRV(support='rational')
       with self.assertRaises(ValueError):
-        y = ed.transform(x)
+        y = transform(x)
 
 if __name__ == '__main__':
   tf.test.main()
diff --git a/tests/models/call_with_manipulate_test.py b/tests/models/call_with_manipulate_test.py
new file mode 100644
index 000000000..2f53e381c
--- /dev/null
+++ b/tests/models/call_with_manipulate_test.py
@@ -0,0 +1,32 @@
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import edward as ed
+import tensorflow as tf
+
+from edward.models import Normal, Poisson
+
+
+class test_call_with_manipulate_class(tf.test.TestCase):
+
+  def _test_intercept_value(self, RV, value, *args, **kwargs):
+    """Check that `manipulate` can inject rv1's value into the rv2 call."""
+    def manipulate(f, *fargs, **fkwargs):
+      # Inspect the intercepted call's own kwargs (`fkwargs`); the enclosing
+      # `kwargs` never contains 'name', so it could never match "rv2".
+      if fkwargs.get('name', None) == "rv2":
+        fkwargs['value'] = rv1.value
+      return f(*fargs, **fkwargs)
+    rv1 = RV(*args, value=value, name="rv1", **kwargs)
+    rv2 = ed.call_with_manipulate(RV, manipulate, *args, name="rv2", **kwargs)
+    self.assertEqual(rv1.value.shape, rv2.value.shape)
+
+  def test_intercept_value(self):
+    with self.test_session():
+      self._test_intercept_value(Normal, 2, loc=0.5, scale=1.0)
+      self._test_intercept_value(Normal, [2], loc=[0.5], scale=[1.0])
+      self._test_intercept_value(Poisson, 2, rate=0.5)
+
+if __name__ == '__main__':
+  tf.test.main()
diff --git a/tests/models/empirical_sample_test.py b/tests/models/empirical_sample_test.py
deleted file mode 100644
index c58462dc7..000000000
--- a/tests/models/empirical_sample_test.py
+++ /dev/null
@@ -1,34 +0,0 @@
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import numpy as np
-import tensorflow as tf
-
-from edward.models import Empirical
-
-
-class test_empirical_sample_class(tf.test.TestCase):
-
-  def _test(self, params, n):
-    x = Empirical(params=params)
-    val_est = x.sample(n).shape.as_list()
-    val_true = n + tf.convert_to_tensor(params).shape.as_list()[1:]
-    self.assertEqual(val_est, val_true)
-
-  def test_0d(self):
-    with self.test_session():
-      self._test(0.5, [1])
-      self._test(np.array(0.5), [1])
-      self._test(tf.constant(0.5), [1])
-      self._test(np.array([0.5]), [1])
-      self._test(np.array([0.5]), [5])
-      self._test(np.array([0.2, 0.8]), [1])
-      self._test(np.array([0.2, 0.8]), [10])
-      self._test(tf.constant([0.5]), [1])
-      self._test(tf.constant([0.5]), [5])
-      self._test(tf.constant([0.2, 0.8]), [1])
-      self._test(tf.constant([0.2, 0.8]), [10])
-
-if __name__ == '__main__':
-  tf.test.main()
diff --git a/tests/util/get_ancestors_test.py b/tests/models/get_ancestors_test.py
similarity index 97%
rename from tests/util/get_ancestors_test.py
rename to tests/models/get_ancestors_test.py
index eaaa67ad4..9b2852cf6 100644
--- a/tests/util/get_ancestors_test.py
+++ b/tests/models/get_ancestors_test.py
@@ -4,8 +4,7 @@
 
 import tensorflow as tf
 
-from edward.models import Bernoulli, Normal
-from edward.util import get_ancestors
+from edward.models import Bernoulli, Normal, get_ancestors
 
 
 class test_get_ancestors_class(tf.test.TestCase):
diff --git a/tests/util/get_blanket_test.py b/tests/models/get_blanket_test.py
similarity index 91%
rename from tests/util/get_blanket_test.py
rename to tests/models/get_blanket_test.py
index 21f32eec6..e1a6ed109 100644
--- a/tests/util/get_blanket_test.py
+++ b/tests/models/get_blanket_test.py
@@ -4,8 +4,7 @@
 
 import tensorflow as tf
 
-from edward.models import Bernoulli, Normal
-from edward.util import get_blanket
+from edward.models import Bernoulli, Normal, get_blanket
 
 
 class test_get_blanket_class(tf.test.TestCase):
diff --git a/tests/util/get_children_test.py b/tests/models/get_children_test.py
similarity index 97%
rename from tests/util/get_children_test.py
rename to tests/models/get_children_test.py
index bf6c05a99..345d4b500 100644
--- a/tests/util/get_children_test.py
+++ b/tests/models/get_children_test.py
@@ -4,8 +4,7 @@
 
 import tensorflow as tf
 
-from edward.models import Bernoulli, Normal
-from edward.util import get_children
+from edward.models import Bernoulli, Normal, get_children
 
 
 class test_get_children_class(tf.test.TestCase):
diff --git a/tests/util/get_descendants_test.py b/tests/models/get_descendants_test.py
similarity index 97%
rename from tests/util/get_descendants_test.py
rename to tests/models/get_descendants_test.py
index f70eeb6bd..4e5fbb72c 100644
--- a/tests/util/get_descendants_test.py
+++ b/tests/models/get_descendants_test.py
@@ -4,8 +4,7 @@
 
 import tensorflow as tf
 
-from edward.models import Bernoulli, Normal
-from edward.util import get_descendants
+from edward.models import Bernoulli, Normal, get_descendants
 
 
 class test_get_descendants_class(tf.test.TestCase):
diff --git a/tests/util/get_parents_test.py b/tests/models/get_parents_test.py
similarity index 97%
rename from tests/util/get_parents_test.py
rename to tests/models/get_parents_test.py
index eccb0782f..7fd40b619 100644
--- a/tests/util/get_parents_test.py
+++ b/tests/models/get_parents_test.py
@@ -4,8 +4,7 @@
 
 import tensorflow as tf
 
-from edward.models import Bernoulli, Normal
-from edward.util import get_parents
+from edward.models import Bernoulli, Normal, get_parents
 
 
 class test_get_parents_class(tf.test.TestCase):
diff --git a/tests/util/get_siblings_test.py b/tests/models/get_siblings_test.py
similarity index 97%
rename from tests/util/get_siblings_test.py
rename to tests/models/get_siblings_test.py
index b54543d6b..5a792468a 100644
--- a/tests/util/get_siblings_test.py
+++ b/tests/models/get_siblings_test.py
@@ -4,8 +4,7 @@
 
 import tensorflow as tf
 
-from edward.models import Bernoulli, Normal
-from edward.util import get_siblings
+from edward.models import Bernoulli, Normal, get_siblings
 
 
 class test_get_siblings_class(tf.test.TestCase):
diff --git a/tests/util/get_variables_test.py b/tests/models/get_variables_test.py
similarity index 97%
rename from tests/util/get_variables_test.py
rename to tests/models/get_variables_test.py
index cd5e91f78..c73a9bf96 100644
--- a/tests/util/get_variables_test.py
+++ b/tests/models/get_variables_test.py
@@ -4,8 +4,7 @@
 
 import tensorflow as tf
 
-from edward.models import Bernoulli, Normal
-from edward.util import get_variables
+from edward.models import Bernoulli, Normal, get_variables
 
 
 class test_get_variables_class(tf.test.TestCase):
diff --git a/tests/util/is_independent_test.py b/tests/models/is_independent_test.py
similarity index 96%
rename from tests/util/is_independent_test.py
rename to tests/models/is_independent_test.py
index adeaf9ab5..fec5a2324 100644
--- a/tests/util/is_independent_test.py
+++ b/tests/models/is_independent_test.py
@@ -4,8 +4,7 @@
 
 import tensorflow as tf
 
-from edward.models import Normal
-from edward.util import is_independent
+from edward.models import Normal, is_independent
 
 
 class test_is_independent_class(tf.test.TestCase):
diff --git a/tests/models/keras_core_layers_test.py b/tests/models/keras_core_layers_test.py
index 7da455141..a92a71983 100644
--- a/tests/models/keras_core_layers_test.py
+++ b/tests/models/keras_core_layers_test.py
@@ -13,51 +13,51 @@ class test_keras_core_layers_class(tf.test.TestCase):
 
   def test_dense(self):
     x = Normal(loc=tf.zeros([100, 10, 5]), scale=tf.ones([100, 10, 5]))
-    y = layers.Dense(32)(x.value())
+    y = layers.Dense(32)(x.value)
 
   def test_activation(self):
     x = Normal(loc=tf.zeros([100, 10, 5]), scale=tf.ones([100, 10, 5]))
-    y = layers.Activation('tanh')(x.value())
+    y = layers.Activation('tanh')(x.value)
 
   def test_dropout(self):
     x = Normal(loc=tf.zeros([100, 10, 5]), scale=tf.ones([100, 10, 5]))
-    y = layers.Dropout(0.5)(x.value())
+    y = layers.Dropout(0.5)(x.value)
 
   def test_flatten(self):
     x = Normal(loc=tf.zeros([100, 10, 5]), scale=tf.ones([100, 10, 5]))
-    y = layers.Flatten()(x.value())
+    y = layers.Flatten()(x.value)
     with self.test_session():
       self.assertEqual(y.eval().shape, (100, 50))
 
   def test_reshape(self):
     x = Normal(loc=tf.zeros([100, 10, 5]), scale=tf.ones([100, 10, 5]))
-    y = layers.Reshape((5, 10))(x.value())
+    y = layers.Reshape((5, 10))(x.value)
     with self.test_session():
       self.assertEqual(y.eval().shape, (100, 5, 10))
 
   def test_permute(self):
     x = Normal(loc=tf.zeros([100, 10, 5]), scale=tf.ones([100, 10, 5]))
-    y = layers.Permute((2, 1))(x.value())
+    y = layers.Permute((2, 1))(x.value)
     with self.test_session():
       self.assertEqual(y.eval().shape, (100, 5, 10))
 
   def test_repeat_vector(self):
     x = Normal(loc=tf.zeros([100, 10]), scale=tf.ones([100, 10]))
-    y = layers.RepeatVector(2)(x.value())
+    y = layers.RepeatVector(2)(x.value)
     with self.test_session():
       self.assertEqual(y.eval().shape, (100, 2, 10))
 
   def test_lambda(self):
     x = Normal(loc=tf.zeros([100, 10, 5]), scale=tf.ones([100, 10, 5]))
-    y = layers.Lambda(lambda x: x ** 2)(x.value())
+    y = layers.Lambda(lambda x: x ** 2)(x.value)
 
   def test_activity_regularization(self):
     x = Normal(loc=tf.zeros([100, 10, 5]), scale=tf.ones([100, 10, 5]))
-    y = layers.ActivityRegularization(l1=0.1)(x.value())
+    y = layers.ActivityRegularization(l1=0.1)(x.value)
 
   def test_masking(self):
     x = Normal(loc=tf.zeros([100, 10, 5]), scale=tf.ones([100, 10, 5]))
-    y = layers.Masking()(x.value())
+    y = layers.Masking()(x.value)
 
 if __name__ == '__main__':
   tf.test.main()
diff --git a/tests/models/point_mass_sample_test.py b/tests/models/point_mass_sample_test.py
deleted file mode 100644
index 7d240bb0f..000000000
--- a/tests/models/point_mass_sample_test.py
+++ /dev/null
@@ -1,37 +0,0 @@
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import numpy as np
-import tensorflow as tf
-
-from edward.models import PointMass
-
-
-class test_pointmass_sample_class(tf.test.TestCase):
-
-  def _test(self, params, n):
-    x = PointMass(params=params)
-    val_est = x.sample(n).shape.as_list()
-    val_true = n + tf.convert_to_tensor(params).shape.as_list()
-    self.assertEqual(val_est, val_true)
-
-  def test_0d(self):
-    with self.test_session():
-      self._test(0.5, [1])
-      self._test(np.array(0.5), [1])
-      self._test(tf.constant(0.5), [1])
-
-  def test_1d(self):
-    with self.test_session():
-      self._test(np.array([0.5]), [1])
-      self._test(np.array([0.5]), [5])
-      self._test(np.array([0.2, 0.8]), [1])
-      self._test(np.array([0.2, 0.8]), [10])
-      self._test(tf.constant([0.5]), [1])
-      self._test(tf.constant([0.5]), [5])
-      self._test(tf.constant([0.2, 0.8]), [1])
-      self._test(tf.constant([0.2, 0.8]), [10])
-
-if __name__ == '__main__':
-  tf.test.main()
diff --git a/tests/models/random_variable_operators_test.py b/tests/models/random_variable_operators_test.py
index 72ae5ab82..dd3e33093 100644
--- a/tests/models/random_variable_operators_test.py
+++ b/tests/models/random_variable_operators_test.py
@@ -15,7 +15,7 @@ def test_add(self):
       x = Normal(0.0, 1.0)
       y = 5.0
       z = x + y
-      z_value = x.value() + y
+      z_value = x.value + y
       z_eval, z_value_eval = sess.run([z, z_value])
       self.assertAllEqual(z_eval, z_value_eval)
 
@@ -24,7 +24,7 @@ def test_radd(self):
       x = Normal(0.0, 1.0)
       y = 5.0
       z = y + x
-      z_value = y + x.value()
+      z_value = y + x.value
       z_eval, z_value_eval = sess.run([z, z_value])
       self.assertAllEqual(z_eval, z_value_eval)
 
@@ -33,7 +33,7 @@ def test_sub(self):
       x = Normal(0.0, 1.0)
       y = 5.0
       z = x - y
-      z_value = x.value() - y
+      z_value = x.value - y
       z_eval, z_value_eval = sess.run([z, z_value])
       self.assertAllEqual(z_eval, z_value_eval)
 
@@ -42,7 +42,7 @@ def test_rsub(self):
       x = Normal(0.0, 1.0)
       y = 5.0
       z = y - x
-      z_value = y - x.value()
+      z_value = y - x.value
       z_eval, z_value_eval = sess.run([z, z_value])
       self.assertAllEqual(z_eval, z_value_eval)
 
@@ -51,7 +51,7 @@ def test_mul(self):
       x = Normal(0.0, 1.0)
       y = 5.0
       z = x * y
-      z_value = x.value() * y
+      z_value = x.value * y
       z_eval, z_value_eval = sess.run([z, z_value])
       self.assertAllEqual(z_eval, z_value_eval)
 
@@ -60,7 +60,7 @@ def test_rmul(self):
       x = Normal(0.0, 1.0)
       y = 5.0
       z = y * x
-      z_value = y * x.value()
+      z_value = y * x.value
       z_eval, z_value_eval = sess.run([z, z_value])
       self.assertAllEqual(z_eval, z_value_eval)
 
@@ -69,7 +69,7 @@ def test_div(self):
       x = Normal(0.0, 1.0)
       y = 5.0
       z = x / y
-      z_value = x.value() / y
+      z_value = x.value / y
       z_eval, z_value_eval = sess.run([z, z_value])
       self.assertAllEqual(z_eval, z_value_eval)
 
@@ -78,7 +78,7 @@ def test_rdiv(self):
       x = Normal(0.0, 1.0)
       y = 5.0
       z = y / x
-      z_value = y / x.value()
+      z_value = y / x.value
       z_eval, z_value_eval = sess.run([z, z_value])
       self.assertAllEqual(z_eval, z_value_eval)
 
@@ -87,7 +87,7 @@ def test_floordiv(self):
       x = Normal(0.0, 1.0)
       y = 5.0
       z = x // y
-      z_value = x.value() // y
+      z_value = x.value // y
       z_eval, z_value_eval = sess.run([z, z_value])
       self.assertAllEqual(z_eval, z_value_eval)
 
@@ -96,7 +96,7 @@ def test_rfloordiv(self):
       x = Normal(0.0, 1.0)
       y = 5.0
       z = y // x
-      z_value = y // x.value()
+      z_value = y // x.value
       z_eval, z_value_eval = sess.run([z, z_value])
       self.assertAllEqual(z_eval, z_value_eval)
 
@@ -105,7 +105,7 @@ def test_mod(self):
       x = Normal(0.0, 1.0)
       y = 5.0
       z = x % y
-      z_value = x.value() % y
+      z_value = x.value % y
       z_eval, z_value_eval = sess.run([z, z_value])
       self.assertAllEqual(z_eval, z_value_eval)
 
@@ -114,7 +114,7 @@ def test_rmod(self):
       x = Normal(0.0, 1.0)
       y = 5.0
       z = y % x
-      z_value = y % x.value()
+      z_value = y % x.value
       z_eval, z_value_eval = sess.run([z, z_value])
       self.assertAllEqual(z_eval, z_value_eval)
 
@@ -123,7 +123,7 @@ def test_lt(self):
       x = Normal(0.0, 1.0)
       y = 5.0
       z = x < y
-      z_value = x.value() < y
+      z_value = x.value < y
       z_eval, z_value_eval = sess.run([z, z_value])
       self.assertAllEqual(z_eval, z_value_eval)
 
@@ -132,7 +132,7 @@ def test_le(self):
       x = Normal(0.0, 1.0)
       y = 5.0
       z = x <= y
-      z_value = x.value() <= y
+      z_value = x.value <= y
       z_eval, z_value_eval = sess.run([z, z_value])
       self.assertAllEqual(z_eval, z_value_eval)
 
@@ -141,7 +141,7 @@ def test_gt(self):
       x = Normal(0.0, 1.0)
       y = 5.0
       z = x > y
-      z_value = x.value() > y
+      z_value = x.value > y
       z_eval, z_value_eval = sess.run([z, z_value])
       self.assertAllEqual(z_eval, z_value_eval)
 
@@ -150,7 +150,7 @@ def test_ge(self):
       x = Normal(0.0, 1.0)
       y = 5.0
       z = x >= y
-      z_value = x.value() >= y
+      z_value = x.value >= y
       z_eval, z_value_eval = sess.run([z, z_value])
       self.assertAllEqual(z_eval, z_value_eval)
 
@@ -160,7 +160,7 @@ def test_ge(self):
       # x = tf.cast(Bernoulli(0.5), tf.bool)
       # y = True
       # z = x & y
-      # z_value = x.value() & y
+      # z_value = x.value & y
       # z_eval, z_value_eval = sess.run([z, z_value])
       # self.assertAllEqual(z_eval, z_value_eval)
 
@@ -174,7 +174,7 @@ def test_getitem(self):
     with self.test_session() as sess:
       x = Normal(tf.zeros([3, 4]), tf.ones([3, 4]))
       z = x[0:2, 2:3]
-      z_value = x.value()[0:2, 2:3]
+      z_value = x.value[0:2, 2:3]
       z_eval, z_value_eval = sess.run([z, z_value])
       self.assertAllEqual(z_eval, z_value_eval)
 
@@ -183,7 +183,7 @@ def test_pow(self):
       x = Normal(0.0, 1.0)
       y = 5.0
       z = x ** y
-      z_value = x.value() ** y
+      z_value = x.value ** y
       z_eval, z_value_eval = sess.run([z, z_value])
       self.assertAllEqual(z_eval, z_value_eval)
 
@@ -192,7 +192,7 @@ def test_rpow(self):
       x = Normal(0.0, 1.0)
       y = 5.0
       z = y ** x
-      z_value = y ** x.value()
+      z_value = y ** x.value
       z_eval, z_value_eval = sess.run([z, z_value])
       self.assertAllEqual(z_eval, z_value_eval)
 
@@ -202,7 +202,7 @@ def test_neg(self):
     with self.test_session() as sess:
       x = Normal(0.0, 1.0)
       z = -x
-      z_value = -x.value()
+      z_value = -x.value
       z_eval, z_value_eval = sess.run([z, z_value])
       self.assertAllEqual(z_eval, z_value_eval)
 
@@ -210,7 +210,7 @@ def test_abs(self):
     with self.test_session() as sess:
       x = Normal(0.0, 1.0)
       z = abs(x)
-      z_value = abs(x.value())
+      z_value = abs(x.value)
       z_eval, z_value_eval = sess.run([z, z_value])
       self.assertAllEqual(z_eval, z_value_eval)
 
diff --git a/tests/models/random_variable_value_test.py b/tests/models/random_variable_value_test.py
index 0a351e645..1be69ab1a 100644
--- a/tests/models/random_variable_value_test.py
+++ b/tests/models/random_variable_value_test.py
@@ -6,25 +6,17 @@
 import tensorflow as tf
 
 from edward.models import Bernoulli, Normal, Poisson, RandomVariable
-from edward.util import copy
 
 
 class test_random_variable_value_class(tf.test.TestCase):
 
   def _test_sample(self, RV, value, *args, **kwargs):
     rv = RV(*args, value=value, **kwargs)
-    value_shape = rv.value().shape
+    value_shape = rv.value.shape
     expected_shape = rv.sample_shape.concatenate(
         rv.batch_shape).concatenate(rv.event_shape)
     self.assertEqual(value_shape, expected_shape)
-    self.assertEqual(rv.dtype, rv.value().dtype)
-
-  def _test_copy(self, RV, value, *args, **kwargs):
-    rv1 = RV(*args, value=value, **kwargs)
-    rv2 = copy(rv1)
-    value_shape1 = rv1.value().shape
-    value_shape2 = rv2.value().shape
-    self.assertEqual(value_shape1, value_shape2)
+    self.assertEqual(rv.dtype, rv.value.dtype)
 
   def test_shape_and_dtype(self):
     with self.test_session():
@@ -45,11 +37,5 @@ def test_mismatch_raises(self):
       self.assertRaises(ValueError, self._test_sample, Normal,
                         np.zeros([10, 3]), loc=[0.5, 0.5], scale=[1.0, 1.0])
 
-  def test_copy(self):
-    with self.test_session():
-      self._test_copy(Normal, 2, loc=0.5, scale=1.0)
-      self._test_copy(Normal, [2], loc=[0.5], scale=[1.0])
-      self._test_copy(Poisson, 2, rate=0.5)
-
 if __name__ == '__main__':
   tf.test.main()
diff --git a/tests/util/check_data_test.py b/tests/util/check_data_test.py
deleted file mode 100644
index 2a4fe2fa9..000000000
--- a/tests/util/check_data_test.py
+++ /dev/null
@@ -1,49 +0,0 @@
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import numpy as np
-import tensorflow as tf
-
-from edward.models import Normal
-from edward.util import check_data
-
-
-class test_check_data_class(tf.test.TestCase):
-
-  def test(self):
-    with self.test_session():
-      x = Normal(0.0, 1.0)
-      qx = Normal(0.0, 1.0)
-      x_ph = tf.placeholder(tf.float32, [])
-
-      check_data({x: tf.constant(0.0)})
-      check_data({x: np.float64(0.0)})
-      check_data({x: np.int64(0)})
-      check_data({x: 0.0})
-      check_data({x: 0})
-      check_data({x: False})
-      check_data({x: '0'})
-      check_data({x: x_ph})
-      check_data({x: qx})
-      check_data({2.0 * x: tf.constant(0.0)})
-      self.assertRaises(TypeError, check_data, {0.0: x})
-      self.assertRaises(TypeError, check_data, {x: tf.zeros(5)})
-      self.assertRaises(TypeError, check_data, {x_ph: x})
-      self.assertRaises(TypeError, check_data, {x_ph: x})
-      self.assertRaises(TypeError, check_data,
-                        {x: tf.constant(0, tf.float64)})
-      self.assertRaises(TypeError, check_data,
-                        {x_ph: tf.constant(0.0)})
-
-      x_vec = Normal(tf.constant([0.0]), tf.constant([1.0]))
-      qx_vec = Normal(tf.constant([0.0]), tf.constant([1.0]))
-
-      check_data({x_vec: qx_vec})
-      check_data({x_vec: [0.0]})
-      check_data({x_vec: [0]})
-      check_data({x_vec: ['0']})
-      self.assertRaises(TypeError, check_data, {x: qx_vec})
-
-if __name__ == '__main__':
-  tf.test.main()
diff --git a/tests/util/check_latent_vars_test.py b/tests/util/check_latent_vars_test.py
deleted file mode 100644
index a967629e2..000000000
--- a/tests/util/check_latent_vars_test.py
+++ /dev/null
@@ -1,26 +0,0 @@
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import tensorflow as tf
-
-from edward.models import Normal
-from edward.util import check_latent_vars
-
-
-class test_check_latent_vars_class(tf.test.TestCase):
-
-  def test(self):
-    with self.test_session():
-      mu = Normal(0.0, 1.0)
-      qmu = Normal(tf.Variable(0.0), tf.constant(1.0))
-      qmu_vec = Normal(tf.constant([0.0]), tf.constant([1.0]))
-
-      check_latent_vars({mu: qmu})
-      check_latent_vars({mu: tf.constant(0.0)})
-      check_latent_vars({tf.constant(0.0): qmu})
-      self.assertRaises(TypeError, check_latent_vars, {mu: '5'})
-      self.assertRaises(TypeError, check_latent_vars, {mu: qmu_vec})
-
-if __name__ == '__main__':
-  tf.test.main()
diff --git a/tests/util/copy_test.py b/tests/util/copy_test.py
deleted file mode 100644
index 6b8913073..000000000
--- a/tests/util/copy_test.py
+++ /dev/null
@@ -1,248 +0,0 @@
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import edward as ed
-import numpy as np
-import tensorflow as tf
-
-from edward.models import Bernoulli, Categorical, Mixture, Normal
-
-
-class test_copy_class(tf.test.TestCase):
-
-  def test_scope(self):
-    with self.test_session():
-      x = tf.constant(2.0)
-      x_new = ed.copy(x, scope='new_scope')
-      self.assertTrue(x_new.name.startswith('new_scope'))
-
-  def test_replace_itself(self):
-    with self.test_session():
-      x = tf.constant(2.0)
-      y = tf.constant(3.0)
-      x_new = ed.copy(x, {x: y}, replace_itself=False)
-      self.assertEqual(x_new.eval(), 2.0)
-      x_new = ed.copy(x, {x: y}, replace_itself=True)
-      self.assertEqual(x_new.eval(), 3.0)
-
-  def test_copy_q(self):
-    with self.test_session() as sess:
-      x = tf.constant(2.0)
-      y = tf.random_normal([])
-      x_new = ed.copy(x, {x: y}, replace_itself=True, copy_q=False)
-      x_new_val, y_val = sess.run([x_new, y])
-      self.assertEqual(x_new_val, y_val)
-      x_new = ed.copy(x, {x: y}, replace_itself=True, copy_q=True)
-      x_new_val, x_val, y_val = sess.run([x_new, x, y])
-      self.assertNotEqual(x_new_val, x_val)
-      self.assertNotEqual(x_new_val, y_val)
-
-  def test_copy_parent_rvs(self):
-    with self.test_session() as sess:
-      x = Normal(0.0, 1.0)
-      y = tf.constant(3.0)
-      z = x * y
-      z_new = ed.copy(z, scope='no_copy_parent_rvs', copy_parent_rvs=False)
-      self.assertEqual(len(ed.random_variables()), 1)
-      z_new = ed.copy(z, scope='copy_parent_rvs', copy_parent_rvs=True)
-      self.assertEqual(len(ed.random_variables()), 2)
-
-  def test_placeholder(self):
-    with self.test_session() as sess:
-      x = tf.placeholder(tf.float32, name="CustomName")
-      y = tf.constant(3.0)
-      z = x * y
-      z_new = ed.copy(z)
-      self.assertEqual(sess.run(z_new, feed_dict={x: 4.0}), 12.0)
-
-  def test_variable(self):
-    with self.test_session() as sess:
-      x = tf.Variable(2.0, name="CustomName")
-      y = tf.constant(3.0)
-      z = x * y
-      z_new = ed.copy(z)
-      tf.variables_initializer([x]).run()
-      self.assertEqual(z_new.eval(), 6.0)
-
-  def test_queue(self):
-    with self.test_session() as sess:
-      tensor = tf.constant([0.0, 1.0, 2.0, 3.0])
-      x = tf.train.batch([tensor], batch_size=2, enqueue_many=True,
-                         name='CustomName')
-      y = tf.constant(3.0)
-      z = x * y
-      z_new = ed.copy(z)
-      coord = tf.train.Coordinator()
-      threads = tf.train.start_queue_runners(coord=coord)
-      self.assertAllEqual(sess.run(z_new), np.array([0.0, 3.0]))
-      self.assertAllEqual(sess.run(z_new), np.array([6.0, 9.0]))
-      coord.request_stop()
-      coord.join(threads)
-
-  def test_list(self):
-    with self.test_session() as sess:
-      x = Normal(tf.constant(0.0), tf.constant(0.1))
-      y = Normal(tf.constant(10.0), tf.constant(0.1))
-      cat = Categorical(logits=tf.zeros(5))
-      components = [Normal(x, tf.constant(0.1))
-                    for _ in range(5)]
-      z = Mixture(cat=cat, components=components)
-      z_new = ed.copy(z, {x: y.value()})
-      self.assertGreater(z_new.value().eval(), 5.0)
-
-  def test_random(self):
-    with self.test_session() as sess:
-      ed.set_seed(3742)
-      x = tf.random_normal([])
-      x_copy = ed.copy(x)
-
-      result_copy, result = sess.run([x_copy, x])
-      self.assertNotAlmostEquals(result_copy, result)
-
-  def test_scan(self):
-    with self.test_session() as sess:
-      ed.set_seed(42)
-      op = tf.scan(lambda a, x: a + x, tf.constant([2.0, 3.0, 1.0]))
-      copy_op = ed.copy(op)
-
-      result_copy, result = sess.run([copy_op, op])
-      self.assertAllClose(result_copy, [2.0, 5.0, 6.0])
-      self.assertAllClose(result, [2.0, 5.0, 6.0])
-
-  def test_scan_gradients(self):
-    with self.test_session() as sess:
-      a = tf.Variable([1.0, 2.0, 3.0])
-      op = tf.scan(lambda a, x: a + x, a)
-      copy_op = ed.copy(op)
-      gradient = tf.gradients(op, [a])[0]
-      copy_gradient = tf.gradients(copy_op, [a])[0]
-
-      tf.variables_initializer([a]).run()
-      result_copy, result = sess.run([copy_gradient, gradient])
-      self.assertAllClose(result, [3.0, 2.0, 1.0])
-      self.assertAllClose(result_copy, [3.0, 2.0, 1.0])
-
-  def test_nested_scan_gradients(self):
-    with self.test_session() as sess:
-      a = tf.Variable([1.0, 2.0, 3.0])
-      i = tf.constant(0.0)
-      tot = tf.constant([0.0, 0.0, 0.0])
-      op = tf.while_loop(lambda i, tot: i < 5,
-                         lambda i, tot: (i + 1,
-                                         tot + tf.scan(lambda x0, x:
-                                                       x0 + i * x, a, 0.0)),
-                         [i, tot])[1]
-      copy_op = ed.copy(op)
-      gradient = tf.gradients(op, [a])[0]
-      copy_gradient = tf.gradients(copy_op, [a])[0]
-
-      tf.variables_initializer([a]).run()
-      result_copy, result = sess.run([copy_gradient, gradient])
-      self.assertAllClose(result, [30.0, 20.0, 10.0])
-      self.assertAllClose(result_copy, [30.0, 20.0, 10.0])
-
-  def test_swap_tensor_tensor(self):
-    with self.test_session():
-      x = tf.constant(2.0)
-      y = tf.constant(3.0)
-      z = x * y
-      qx = tf.constant(4.0)
-      z_new = ed.copy(z, {x: qx})
-      self.assertEqual(z_new.eval(), 12.0)
-
-  def test_swap_placeholder_tensor(self):
-    with self.test_session():
-      x = tf.placeholder(tf.float32, name="CustomName")
-      y = tf.constant(3.0)
-      z = x * y
-      qx = tf.constant(4.0)
-      z_new = ed.copy(z, {x: qx})
-      self.assertEqual(z_new.eval(), 12.0)
-
-  def test_swap_tensor_placeholder(self):
-    with self.test_session() as sess:
-      x = tf.constant(2.0)
-      y = tf.constant(3.0)
-      z = x * y
-      qx = tf.placeholder(tf.float32, name="CustomName")
-      z_new = ed.copy(z, {x: qx})
-      self.assertEqual(sess.run(z_new, feed_dict={qx: 4.0}), 12.0)
-
-  def test_swap_variable_tensor(self):
-    with self.test_session():
-      x = tf.Variable(2.0, name="CustomName")
-      y = tf.constant(3.0)
-      z = x * y
-      qx = tf.constant(4.0)
-      z_new = ed.copy(z, {x: qx})
-      tf.variables_initializer([x]).run()
-      self.assertEqual(z_new.eval(), 12.0)
-
-  def test_swap_tensor_variable(self):
-    with self.test_session() as sess:
-      x = tf.constant(2.0)
-      y = tf.constant(3.0)
-      z = x * y
-      qx = tf.Variable(4.0, name="CustomName")
-      z_new = ed.copy(z, {x: qx})
-      tf.variables_initializer([qx]).run()
-      self.assertEqual(z_new.eval(), 12.0)
-
-  def test_swap_rv_rv(self):
-    with self.test_session():
-      ed.set_seed(325135)
-      x = Normal(0.0, 0.1)
-      y = tf.constant(1.0)
-      z = x * y
-      qx = Normal(10.0, 0.1)
-      z_new = ed.copy(z, {x: qx})
-      self.assertGreater(z_new.eval(), 5.0)
-
-  def test_swap_rv_tensor(self):
-    with self.test_session():
-      ed.set_seed(289362)
-      x = Normal(0.0, 0.1)
-      y = tf.constant(1.0)
-      z = x * y
-      qx = Normal(10.0, 0.1)
-      z_new = ed.copy(z, {x: qx.value()})
-      self.assertGreater(z_new.eval(), 5.0)
-
-  def test_swap_tensor_rv(self):
-    with self.test_session():
-      ed.set_seed(95258)
-      x = Normal(0.0, 0.1)
-      y = tf.constant(1.0)
-      z = x * y
-      qx = Normal(10.0, 0.1)
-      z_new = ed.copy(z, {x.value(): qx})
-      self.assertGreater(z_new.eval(), 5.0)
-
-  def test_ordering_rv_tensor(self):
-    # Check that random variables are copied correctly in dependency
-    # structure.
-    with self.test_session() as sess:
-      ed.set_seed(12432)
-      x = Bernoulli(logits=0.0)
-      y = tf.cast(x, tf.float32)
-      y_new = ed.copy(y)
-      x_new = ed.copy(x)
-      x_new_val, y_new_val = sess.run([x_new, y_new])
-      self.assertEqual(x_new_val, y_new_val)
-
-  def test_ordering_rv_rv(self):
-    # Check that random variables are copied correctly in dependency
-    # structure.
-    with self.test_session() as sess:
-      ed.set_seed(21782)
-      x = Normal(loc=0.0, scale=10.0)
-      x_abs = tf.abs(x)
-      y = Normal(loc=x_abs, scale=1e-8)
-      y_new = ed.copy(y)
-      x_new = ed.copy(x)
-      x_new_val, y_new_val = sess.run([x_new, y_new])
-      self.assertAllClose(abs(x_new_val), y_new_val)
-
-if __name__ == '__main__':
-  tf.test.main()
diff --git a/tests/util/dot_test.py b/tests/util/dot_test.py
deleted file mode 100644
index 99f56d238..000000000
--- a/tests/util/dot_test.py
+++ /dev/null
@@ -1,34 +0,0 @@
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import numpy as np
-import tensorflow as tf
-
-from edward.util import dot
-
-
-class test_dot_class(tf.test.TestCase):
-
-  def test_dot(self):
-    with self.test_session():
-      a = tf.constant(np.arange(5, dtype=np.float32))
-      b = tf.diag(tf.ones([5]))
-      self.assertAllEqual(dot(a, b).eval(),
-                          np.dot(a.eval(), b.eval()))
-      self.assertAllEqual(dot(b, a).eval(),
-                          np.dot(b.eval(), a.eval()))
-
-  def test_all_finite_raises(self):
-    with self.test_session():
-      a = np.inf * tf.ones([5])
-      b = tf.diag(tf.ones([5]))
-      with self.assertRaisesOpError('Inf'):
-        dot(a, b).eval()
-      a = tf.ones([5]) * np.arange(5)
-      b = np.inf * tf.diag(tf.ones([5]))
-      with self.assertRaisesOpError('Inf'):
-        dot(a, b).eval()
-
-if __name__ == '__main__':
-  tf.test.main()
diff --git a/tests/util/random_variables_test.py b/tests/util/random_variables_test.py
deleted file mode 100644
index 1b99f625f..000000000
--- a/tests/util/random_variables_test.py
+++ /dev/null
@@ -1,29 +0,0 @@
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import numpy as np
-import tensorflow as tf
-
-from edward.util.random_variables import compute_multinomial_mode
-
-
-class test_compute_multinomial_mode(tf.test.TestCase):
-
-  RANDOM_SEED = 12345
-
-  def test_correct_mode_computed_with_uniform_probabilities(self):
-    with self.test_session():
-      probs = np.array(3 * [1 / 3.0])
-      total_count = 5
-      self.assertAllEqual(
-          compute_multinomial_mode(probs, total_count, seed=self.RANDOM_SEED),
-          np.array([1, 2, 2]))
-      probs = np.array([0.6, 0.4, 0.0])
-      total_count = 5
-      self.assertAllEqual(
-          compute_multinomial_mode(probs, total_count, seed=self.RANDOM_SEED),
-          np.array([2, 2, 1]))
-
-if __name__ == '__main__':
-  tf.test.main()
diff --git a/tests/util/rbf_test.py b/tests/util/rbf_test.py
deleted file mode 100644
index 0b91a9128..000000000
--- a/tests/util/rbf_test.py
+++ /dev/null
@@ -1,91 +0,0 @@
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import numpy as np
-import tensorflow as tf
-
-from edward.util import rbf
-
-
-class test_rbf_class(tf.test.TestCase):
-
-  def test_x(self):
-    with self.test_session():
-      X = tf.constant([[0.0], [0.0]])
-      X2 = tf.constant([[0.0], [0.0]])
-      self.assertAllClose(rbf(X).eval(),
-                          [[1.0, 1.0], [1.0, 1.0]])
-      self.assertAllClose(rbf(X, X2).eval(),
-                          [[1.0, 1.0], [1.0, 1.0]])
-
-  def test_x2(self):
-    with self.test_session():
-      X = tf.constant([[10.0], [2.0]])
-      X2 = tf.constant([[2.0], [10.0]])
-      self.assertAllClose(rbf(X, X2).eval(),
-                          [[1.266417e-14, 1.0], [1.0, 1.266417e-14]])
-      self.assertAllClose(rbf(X2, X).eval(),
-                          [[1.266417e-14, 1.0], [1.0, 1.266417e-14]])
-
-      X = tf.constant([[2.0, 2.5], [4.1, 5.0]])
-      X2 = tf.constant([[1.5, 2.0], [3.1, 4.2]])
-      self.assertAllClose(rbf(X, X2).eval(),
-                          [[0.778800, 0.128734],
-                           [0.000378, 0.440431]], atol=1e-5, rtol=1e-5)
-
-  def test_lengthscale(self):
-    """checked calculations by hand, e.g.,
-    np.exp(-((2.0 - 1.5)**2 / (2.0**2) + (2.5 - 2.0)**2 / (1.5**2)) / 2)
-    np.exp(-((2.0 - 3.1)**2 / (2.0**2) + (2.5 - 4.2)**2 / (1.5**2)) / 2)
-    np.exp(-((4.1 - 1.5)**2 / (2.0**2) + (5.0 - 2.0)**2 / (1.5**2)) / 2)
-    np.exp(-((4.1 - 3.1)**2 / (2.0**2) + (5.0 - 4.2)**2 / (1.5**2)) / 2)
-    """
-    with self.test_session():
-      X = tf.constant([[2.0, 2.5], [4.1, 5.0]])
-      X2 = tf.constant([[1.5, 2.0], [3.1, 4.2]])
-      lengthscale1 = tf.constant(2.0)
-      lengthscale2 = tf.constant([2.0, 2.0])
-      lengthscale3 = tf.constant([2.0, 1.5])
-      self.assertAllClose(rbf(X, X2, lengthscale1).eval(),
-                          [[0.939413, 0.598996],
-                           [0.139456, 0.814647]], atol=1e-5, rtol=1e-5)
-      self.assertAllClose(rbf(X, X2, lengthscale2).eval(),
-                          [[0.939413, 0.598996],
-                           [0.139456, 0.814647]], atol=1e-5, rtol=1e-5)
-      self.assertAllClose(rbf(X, X2, lengthscale3).eval(),
-                          [[0.916855, 0.452271],
-                           [0.058134, 0.765502]], atol=1e-5, rtol=1e-5)
-
-  def test_variance(self):
-    with self.test_session():
-      X = tf.constant([[2.0, 2.5], [4.1, 5.0]])
-      X2 = tf.constant([[1.5, 2.0], [3.1, 4.2]])
-      variance = tf.constant(1.4)
-      self.assertAllClose(rbf(X, X2, variance=variance).eval(),
-                          [[1.090321, 0.180228],
-                           [0.000529, 0.616604]], atol=1e-5, rtol=1e-5)
-
-  def test_all(self):
-    with self.test_session():
-      X = tf.constant([[2.0, 2.5], [4.1, 5.0]])
-      X2 = tf.constant([[1.5, 2.0], [3.1, 4.2]])
-      lengthscale = tf.constant([2.0, 1.5])
-      variance = tf.constant(1.4)
-      self.assertAllClose(rbf(X, X2, lengthscale, variance).eval(),
-                          [[1.283597, 0.633180],
-                           [0.081387, 1.071704]], atol=1e-5, rtol=1e-5)
-
-  def test_raises(self):
-    with self.test_session():
-      X1 = tf.constant([[0.0]])
-      X2 = tf.constant([[0.0]])
-      lengthscale = tf.constant(-5.0)
-      variance = tf.constant(-1.0)
-      with self.assertRaisesOpError('Condition'):
-        rbf(X1, X2, variance=variance).eval()
-        rbf(X1, X2, lengthscale).eval()
-        rbf(X1, X2, lengthscale, variance).eval()
-
-if __name__ == '__main__':
-  tf.test.main()
diff --git a/tests/util/to_simplex_test.py b/tests/util/to_simplex_test.py
deleted file mode 100644
index fbe83fc15..000000000
--- a/tests/util/to_simplex_test.py
+++ /dev/null
@@ -1,42 +0,0 @@
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import numpy as np
-import tensorflow as tf
-
-from edward.util import to_simplex
-
-
-class test_to_simplex_class(tf.test.TestCase):
-
-  def test_to_simplex_1d(self):
-    with self.test_session():
-      x = tf.constant([0.0])
-      self.assertAllClose(to_simplex(x).eval(),
-                          [0.5, 0.5])
-      x = tf.constant([0.0, 10.0])
-      self.assertAllClose(to_simplex(x).eval(),
-                          [3.333333e-01, 6.666363e-01, 3.027916e-05])
-
-  def test_to_simplex_2d(self):
-    with self.test_session():
-      x = tf.constant([[0.0], [0.0]])
-      self.assertAllClose(to_simplex(x).eval(),
-                          [[0.5, 0.5], [0.5, 0.5]])
-      x = tf.constant([[0.0, 10.0], [0.0, 10.0]])
-      self.assertAllClose(to_simplex(x).eval(),
-                          [[3.333333e-01, 6.666363e-01, 3.027916e-05],
-                           [3.333333e-01, 6.666363e-01, 3.027916e-05]])
-
-  def test_all_finite_raises(self):
-    with self.test_session():
-      x = tf.constant([12.5, np.inf])
-      with self.assertRaisesOpError('Inf'):
-        to_simplex(x).eval()
-      x = tf.constant([12.5, np.nan])
-      with self.assertRaisesOpError('NaN'):
-        to_simplex(x).eval()
-
-if __name__ == '__main__':
-  tf.test.main()