From dec26730450e99e92bbfa94e8e1d4631fd34d4b5 Mon Sep 17 00:00:00 2001
From: Dustin Tran <dustinviettran@gmail.com>
Date: Sun, 7 Jan 2018 13:41:34 -0800
Subject: [PATCH 01/27] Change rv.value() method to rv.value property

---
 edward/criticisms/ppc.py                      |  2 +-
 edward/inferences/conjugacy/conjugacy.py      | 10 ++---
 edward/inferences/hmc.py                      |  2 +-
 edward/inferences/implicit_klqp.py            |  8 ++--
 edward/inferences/klpq.py                     |  4 +-
 edward/inferences/klqp.py                     | 28 ++++++------
 edward/inferences/map.py                      |  4 +-
 edward/inferences/metropolis_hastings.py      |  4 +-
 edward/inferences/sghmc.py                    |  2 +-
 edward/inferences/sgld.py                     |  2 +-
 edward/inferences/wake_sleep.py               |  6 +--
 edward/models/param_mixture.py                |  2 +-
 edward/models/random_variable.py              | 25 ++++++-----
 edward/util/random_variables.py               | 26 +++++------
 examples/lstm.py                              |  2 +-
 tests/models/keras_core_layers_test.py        | 20 ++++-----
 .../models/random_variable_operators_test.py  | 44 +++++++++----------
 tests/models/random_variable_value_test.py    |  8 ++--
 tests/util/copy_test.py                       |  8 ++--
 19 files changed, 104 insertions(+), 103 deletions(-)

diff --git a/edward/criticisms/ppc.py b/edward/criticisms/ppc.py
index 5dddf6ea4..ad591b18d 100644
--- a/edward/criticisms/ppc.py
+++ b/edward/criticisms/ppc.py
@@ -95,7 +95,7 @@ def ppc(T, data, latent_vars=None, n_samples=100):
           for key, value in six.iteritems(latent_vars)}
 
   # Build replicated data.
-  xrep = {x: (x.value() if isinstance(x, RandomVariable) else obs)
+  xrep = {x: (x.value if isinstance(x, RandomVariable) else obs)
           for x, obs in six.iteritems(data)}
 
   # Create feed_dict for data placeholders that the model conditions
diff --git a/edward/inferences/conjugacy/conjugacy.py b/edward/inferences/conjugacy/conjugacy.py
index ce0941398..235e39146 100644
--- a/edward/inferences/conjugacy/conjugacy.py
+++ b/edward/inferences/conjugacy/conjugacy.py
@@ -103,15 +103,15 @@ def complete_conditional(rv, cond_set=None):
     log_joint = get_log_joint(cond_set)
 
     # Pull out the nodes that are nonlinear functions of rv into s_stats.
-    stop_nodes = set([i.value() for i in cond_set])
+    stop_nodes = set([i.value for i in cond_set])
     subgraph = extract_subgraph(log_joint, stop_nodes)
-    s_stats = suff_stat_nodes(subgraph, rv.value(), cond_set)
+    s_stats = suff_stat_nodes(subgraph, rv.value, cond_set)
     s_stats = list(set(s_stats))
 
     # Simplify those nodes, and put any new linear terms into multipliers_i.
     s_stat_exprs = defaultdict(list)
     for s_stat in s_stats:
-      expr = symbolic_suff_stat(s_stat, rv.value(), stop_nodes)
+      expr = symbolic_suff_stat(s_stat, rv.value, stop_nodes)
       expr = full_simplify(expr)
       multipliers_i, s_stats_i = extract_s_stat_multipliers(expr)
       s_stat_exprs[s_stats_i].append(
@@ -139,7 +139,7 @@ def complete_conditional(rv, cond_set=None):
     for s_stat_expr in six.itervalues(s_stat_exprs):
       s_stat_placeholder = tf.placeholder(tf.float32,
                                           s_stat_expr[0][0].get_shape())
-      swap_back[s_stat_placeholder] = tf.cast(rv.value(), tf.float32)
+      swap_back[s_stat_placeholder] = tf.cast(rv.value, tf.float32)
       s_stat_placeholders.append(s_stat_placeholder)
       for s_stat_node, multiplier in s_stat_expr:
         fake_node = s_stat_placeholder * multiplier
@@ -148,7 +148,7 @@ def complete_conditional(rv, cond_set=None):
 
     for i in cond_set:
       if i != rv:
-        val = i.value()
+        val = i.value
         val_placeholder = tf.placeholder(val.dtype)
         swap_dict[val] = val_placeholder
         swap_back[val_placeholder] = val
diff --git a/edward/inferences/hmc.py b/edward/inferences/hmc.py
index 825941ea3..23b8a9875 100644
--- a/edward/inferences/hmc.py
+++ b/edward/inferences/hmc.py
@@ -175,7 +175,7 @@ def _log_joint(self, z_sample):
       if isinstance(x, RandomVariable):
         if isinstance(qx, RandomVariable):
           qx_copy = copy(qx, scope=scope)
-          dict_swap[x] = qx_copy.value()
+          dict_swap[x] = qx_copy.value
         else:
           dict_swap[x] = qx
 
diff --git a/edward/inferences/implicit_klqp.py b/edward/inferences/implicit_klqp.py
index a794978b2..3baf9d813 100644
--- a/edward/inferences/implicit_klqp.py
+++ b/edward/inferences/implicit_klqp.py
@@ -154,7 +154,7 @@ def build_loss_and_gradients(self, var_list):
     for beta, qbeta in six.iteritems(self.global_vars):
       # Draw a sample beta' ~ q(beta) and calculate
       # log p(beta') and log q(beta').
-      qbeta_sample[beta] = qbeta.value()
+      qbeta_sample[beta] = qbeta.value
       pbeta_log_prob += tf.reduce_sum(beta.log_prob(qbeta_sample[beta]))
       qbeta_log_prob += tf.reduce_sum(qbeta.log_prob(qbeta_sample[beta]))
 
@@ -165,8 +165,8 @@ def build_loss_and_gradients(self, var_list):
         # Copy local variables p(z), q(z) to draw samples
         # z' ~ p(z | beta'), z' ~ q(z | beta').
         pz_copy = copy(z, dict_swap=qbeta_sample, scope=scope)
-        pz_sample[z] = pz_copy.value()
-        qz_sample[z] = qz.value()
+        pz_sample[z] = pz_copy.value
+        qz_sample[z] = qz.value
 
     # Collect x' ~ p(x | z', beta') and x' ~ q(x).
     dict_swap = qbeta_sample.copy()
@@ -183,7 +183,7 @@ def build_loss_and_gradients(self, var_list):
       elif isinstance(x, RandomVariable):
         # Copy p(x | z, beta) to get draw p(x | z', beta').
         x_copy = copy(x, dict_swap=dict_swap, scope=scope)
-        x_psample[x] = x_copy.value()
+        x_psample[x] = x_copy.value
         x_qsample[x] = x_data
 
     with tf.variable_scope("Disc"):
diff --git a/edward/inferences/klpq.py b/edward/inferences/klpq.py
index dfe0796e0..5639270ce 100644
--- a/edward/inferences/klpq.py
+++ b/edward/inferences/klpq.py
@@ -135,14 +135,14 @@ def build_loss_and_gradients(self, var_list):
         if isinstance(x, RandomVariable):
           if isinstance(qx, RandomVariable):
             qx_copy = copy(qx, scope=scope)
-            dict_swap[x] = qx_copy.value()
+            dict_swap[x] = qx_copy.value
           else:
             dict_swap[x] = qx
 
       for z, qz in six.iteritems(self.latent_vars):
         # Copy q(z) to obtain new set of posterior samples.
         qz_copy = copy(qz, scope=scope)
-        dict_swap[z] = qz_copy.value()
+        dict_swap[z] = qz_copy.value
         q_log_prob[s] += tf.reduce_sum(
             qz_copy.log_prob(tf.stop_gradient(dict_swap[z])))
 
diff --git a/edward/inferences/klqp.py b/edward/inferences/klqp.py
index 3cfbc9cea..d88088016 100644
--- a/edward/inferences/klqp.py
+++ b/edward/inferences/klqp.py
@@ -663,14 +663,14 @@ def build_reparam_loss_and_gradients(inference, var_list):
       if isinstance(x, RandomVariable):
         if isinstance(qx, RandomVariable):
           qx_copy = copy(qx, scope=scope)
-          dict_swap[x] = qx_copy.value()
+          dict_swap[x] = qx_copy.value
         else:
           dict_swap[x] = qx
 
     for z, qz in six.iteritems(inference.latent_vars):
       # Copy q(z) to obtain new set of posterior samples.
       qz_copy = copy(qz, scope=scope)
-      dict_swap[z] = qz_copy.value()
+      dict_swap[z] = qz_copy.value
       q_log_prob[s] += tf.reduce_sum(
           inference.scale.get(z, 1.0) * qz_copy.log_prob(dict_swap[z]))
 
@@ -731,14 +731,14 @@ def build_reparam_kl_loss_and_gradients(inference, var_list):
       if isinstance(x, RandomVariable):
         if isinstance(qx, RandomVariable):
           qx_copy = copy(qx, scope=scope)
-          dict_swap[x] = qx_copy.value()
+          dict_swap[x] = qx_copy.value
         else:
           dict_swap[x] = qx
 
     for z, qz in six.iteritems(inference.latent_vars):
       # Copy q(z) to obtain new set of posterior samples.
       qz_copy = copy(qz, scope=scope)
-      dict_swap[z] = qz_copy.value()
+      dict_swap[z] = qz_copy.value
 
     for x in six.iterkeys(inference.data):
       if isinstance(x, RandomVariable):
@@ -794,14 +794,14 @@ def build_reparam_entropy_loss_and_gradients(inference, var_list):
       if isinstance(x, RandomVariable):
         if isinstance(qx, RandomVariable):
           qx_copy = copy(qx, scope=scope)
-          dict_swap[x] = qx_copy.value()
+          dict_swap[x] = qx_copy.value
         else:
           dict_swap[x] = qx
 
     for z, qz in six.iteritems(inference.latent_vars):
       # Copy q(z) to obtain new set of posterior samples.
       qz_copy = copy(qz, scope=scope)
-      dict_swap[z] = qz_copy.value()
+      dict_swap[z] = qz_copy.value
 
     for z in six.iterkeys(inference.latent_vars):
       z_copy = copy(z, dict_swap, scope=scope)
@@ -856,14 +856,14 @@ def build_score_loss_and_gradients(inference, var_list):
       if isinstance(x, RandomVariable):
         if isinstance(qx, RandomVariable):
           qx_copy = copy(qx, scope=scope)
-          dict_swap[x] = qx_copy.value()
+          dict_swap[x] = qx_copy.value
         else:
           dict_swap[x] = qx
 
     for z, qz in six.iteritems(inference.latent_vars):
       # Copy q(z) to obtain new set of posterior samples.
       qz_copy = copy(qz, scope=scope)
-      dict_swap[z] = qz_copy.value()
+      dict_swap[z] = qz_copy.value
       q_log_prob[s] += tf.reduce_sum(
           inference.scale.get(z, 1.0) *
           qz_copy.log_prob(tf.stop_gradient(dict_swap[z])))
@@ -927,14 +927,14 @@ def build_score_kl_loss_and_gradients(inference, var_list):
       if isinstance(x, RandomVariable):
         if isinstance(qx, RandomVariable):
           qx_copy = copy(qx, scope=scope)
-          dict_swap[x] = qx_copy.value()
+          dict_swap[x] = qx_copy.value
         else:
           dict_swap[x] = qx
 
     for z, qz in six.iteritems(inference.latent_vars):
       # Copy q(z) to obtain new set of posterior samples.
       qz_copy = copy(qz, scope=scope)
-      dict_swap[z] = qz_copy.value()
+      dict_swap[z] = qz_copy.value
       q_log_prob[s] += tf.reduce_sum(
           inference.scale.get(z, 1.0) *
           qz_copy.log_prob(tf.stop_gradient(dict_swap[z])))
@@ -998,14 +998,14 @@ def build_score_entropy_loss_and_gradients(inference, var_list):
       if isinstance(x, RandomVariable):
         if isinstance(qx, RandomVariable):
           qx_copy = copy(qx, scope=scope)
-          dict_swap[x] = qx_copy.value()
+          dict_swap[x] = qx_copy.value
         else:
           dict_swap[x] = qx
 
     for z, qz in six.iteritems(inference.latent_vars):
       # Copy q(z) to obtain new set of posterior samples.
       qz_copy = copy(qz, scope=scope)
-      dict_swap[z] = qz_copy.value()
+      dict_swap[z] = qz_copy.value
       q_log_prob[s] += tf.reduce_sum(
           inference.scale.get(z, 1.0) *
           qz_copy.log_prob(tf.stop_gradient(dict_swap[z])))
@@ -1077,14 +1077,14 @@ def build_score_rb_loss_and_gradients(inference, var_list):
       if isinstance(x, RandomVariable):
         if isinstance(qx, RandomVariable):
           qx_copy = copy(qx, scope=scope)
-          dict_swap[x] = qx_copy.value()
+          dict_swap[x] = qx_copy.value
         else:
           dict_swap[x] = qx
 
     for z, qz in six.iteritems(inference.latent_vars):
       # Copy q(z) to obtain new set of posterior samples.
       qz_copy = copy(qz, scope=scope)
-      dict_swap[z] = qz_copy.value()
+      dict_swap[z] = qz_copy.value
       q_log_probs[s][qz] = tf.reduce_sum(
           inference.scale.get(z, 1.0) *
           qz_copy.log_prob(tf.stop_gradient(dict_swap[z])))
diff --git a/edward/inferences/map.py b/edward/inferences/map.py
index 406d461d4..fb0bf1d82 100644
--- a/edward/inferences/map.py
+++ b/edward/inferences/map.py
@@ -121,12 +121,12 @@ def build_loss_and_gradients(self, var_list):
     # Form dictionary in order to replace conditioning on prior or
     # observed variable with conditioning on a specific value.
     scope = tf.get_default_graph().unique_name("inference")
-    dict_swap = {z: qz.value()
+    dict_swap = {z: qz.value
                  for z, qz in six.iteritems(self.latent_vars)}
     for x, qx in six.iteritems(self.data):
       if isinstance(x, RandomVariable):
         if isinstance(qx, RandomVariable):
-          dict_swap[x] = qx.value()
+          dict_swap[x] = qx.value
         else:
           dict_swap[x] = qx
 
diff --git a/edward/inferences/metropolis_hastings.py b/edward/inferences/metropolis_hastings.py
index fc3259774..d57cff698 100644
--- a/edward/inferences/metropolis_hastings.py
+++ b/edward/inferences/metropolis_hastings.py
@@ -86,7 +86,7 @@ def build_update(self):
       if isinstance(x, RandomVariable):
         if isinstance(qx, RandomVariable):
           qx_copy = copy(qx, scope='conditional')
-          dict_swap[x] = qx_copy.value()
+          dict_swap[x] = qx_copy.value
         else:
           dict_swap[x] = qx
 
@@ -103,7 +103,7 @@ def build_update(self):
       # Build proposal g(znew | zold).
       proposal_znew = copy(proposal_z, dict_swap_old, scope=scope_old)
       # Sample znew ~ g(znew | zold).
-      new_sample[z] = proposal_znew.value()
+      new_sample[z] = proposal_znew.value
       # Increment ratio.
       ratio -= tf.reduce_sum(proposal_znew.log_prob(new_sample[z]))
 
diff --git a/edward/inferences/sghmc.py b/edward/inferences/sghmc.py
index d69dc7a0a..50a744194 100644
--- a/edward/inferences/sghmc.py
+++ b/edward/inferences/sghmc.py
@@ -111,7 +111,7 @@ def _log_joint(self, z_sample):
       if isinstance(x, RandomVariable):
         if isinstance(qx, RandomVariable):
           qx_copy = copy(qx, scope=scope)
-          dict_swap[x] = qx_copy.value()
+          dict_swap[x] = qx_copy.value
         else:
           dict_swap[x] = qx
 
diff --git a/edward/inferences/sgld.py b/edward/inferences/sgld.py
index bcd7027da..82c5163a4 100644
--- a/edward/inferences/sgld.py
+++ b/edward/inferences/sgld.py
@@ -102,7 +102,7 @@ def _log_joint(self, z_sample):
       if isinstance(x, RandomVariable):
         if isinstance(qx, RandomVariable):
           qx_copy = copy(qx, scope=scope)
-          dict_swap[x] = qx_copy.value()
+          dict_swap[x] = qx_copy.value
         else:
           dict_swap[x] = qx
 
diff --git a/edward/inferences/wake_sleep.py b/edward/inferences/wake_sleep.py
index 5d4db6d3d..0a6b41350 100644
--- a/edward/inferences/wake_sleep.py
+++ b/edward/inferences/wake_sleep.py
@@ -89,7 +89,7 @@ def build_loss_and_gradients(self, var_list):
         if isinstance(x, RandomVariable):
           if isinstance(qx, RandomVariable):
             qx_copy = copy(qx, scope=scope)
-            dict_swap[x] = qx_copy.value()
+            dict_swap[x] = qx_copy.value
           else:
             dict_swap[x] = qx
 
@@ -98,7 +98,7 @@ def build_loss_and_gradients(self, var_list):
       for z, qz in six.iteritems(self.latent_vars):
         # Copy q(z) to obtain new set of posterior samples.
         qz_copy = copy(qz, scope=scope)
-        q_dict_swap[z] = qz_copy.value()
+        q_dict_swap[z] = qz_copy.value
         if self.phase_q != 'sleep':
           # If not sleep phase, compute log q(z).
           q_log_prob[s] += tf.reduce_sum(
@@ -123,7 +123,7 @@ def build_loss_and_gradients(self, var_list):
         for z, qz in six.iteritems(self.latent_vars):
           # Copy p(z) to obtain new set of prior samples.
           z_copy = copy(z, scope=scope)
-          p_dict_swap[qz] = z_copy.value()
+          p_dict_swap[qz] = z_copy.value
         for qz in six.itervalues(self.latent_vars):
           qz_copy = copy(qz, p_dict_swap, scope=scope)
           q_log_prob[s] += tf.reduce_sum(
diff --git a/edward/models/param_mixture.py b/edward/models/param_mixture.py
index 4f2f3f5b1..64cd2689d 100644
--- a/edward/models/param_mixture.py
+++ b/edward/models/param_mixture.py
@@ -143,7 +143,7 @@ def __init__(self,
         validate_args=validate_args,
         allow_nan_stats=allow_nan_stats,
         parameters=parameters,
-        graph_parents=[self._cat.value(), self._components.value()],
+        graph_parents=[self._cat.value, self._components.value],
         name=name)
 
   @property
diff --git a/edward/models/random_variable.py b/edward/models/random_variable.py
index 65f69e85e..3bd9c0e02 100644
--- a/edward/models/random_variable.py
+++ b/edward/models/random_variable.py
@@ -145,7 +145,12 @@ def sample_shape(self):
   @property
   def shape(self):
     """Shape of random variable."""
-    return self._value.shape
+    return self.value.shape
+
+  @property
+  def value(self):
+    """Get tensor that the random variable corresponds to."""
+    return self._value
 
   def __str__(self):
     return "RandomVariable(\"%s\"%s%s%s)" % (
@@ -153,7 +158,7 @@ def __str__(self):
         (", shape=%s" % self.shape)
         if self.shape.ndims is not None else "",
         (", dtype=%s" % self.dtype.name) if self.dtype else "",
-        (", device=%s" % self.value().device) if self.value().device else "")
+        (", device=%s" % self.value.device) if self.value.device else "")
 
   def __repr__(self):
     return "<ed.RandomVariable '%s' shape=%s dtype=%s>" % (
@@ -213,11 +218,7 @@ def eval(self, session=None, feed_dict=None):
       print(x.eval())
     ```
     """
-    return self.value().eval(session=session, feed_dict=feed_dict)
-
-  def value(self):
-    """Get tensor that the random variable corresponds to."""
-    return self._value
+    return self.value.eval(session=session, feed_dict=feed_dict)
 
   def get_ancestors(self, collection=None):
     """Get ancestor random variables."""
@@ -274,7 +275,7 @@ def _overload_operator(operator):
       operator: string. The operator name.
     """
     def _run_op(a, *args):
-      return getattr(tf.Tensor, operator)(a.value(), *args)
+      return getattr(tf.Tensor, operator)(a.value, *args)
     # Propagate __doc__ to wrapper
     try:
       _run_op.__doc__ = getattr(tf.Tensor, operator).__doc__
@@ -291,15 +292,15 @@ def _run_op(a, *args):
 
   @staticmethod
   def _session_run_conversion_fetch_function(tensor):
-    return ([tensor.value()], lambda val: val[0])
+    return ([tensor.value], lambda val: val[0])
 
   @staticmethod
   def _session_run_conversion_feed_function(feed, feed_val):
-    return [(feed.value(), feed_val)]
+    return [(feed.value, feed_val)]
 
   @staticmethod
   def _session_run_conversion_feed_function_for_partial_run(feed):
-    return [feed.value()]
+    return [feed.value]
 
   @staticmethod
   def _tensor_conversion_function(v, dtype=None, name=None, as_ref=False):
@@ -308,7 +309,7 @@ def _tensor_conversion_function(v, dtype=None, name=None, as_ref=False):
       raise ValueError(
           "Incompatible type conversion requested to type '%s' for variable "
           "of type '%s'" % (dtype.name, v.dtype.name))
-    return v.value()
+    return v.value
 
 
 RandomVariable._overload_all_operators()
diff --git a/edward/util/random_variables.py b/edward/util/random_variables.py
index 3a581505a..2ec4e7ba1 100644
--- a/edward/util/random_variables.py
+++ b/edward/util/random_variables.py
@@ -219,13 +219,13 @@ def copy(org_instance, dict_swap=None, scope="copied",
       return org_instance
   elif isinstance(org_instance, tf.Tensor) and replace_itself:
     # Deal with case when `org_instance` is the associated tensor
-    # from the RandomVariable, e.g., `z.value()`. If
-    # `dict_swap={z: qz}`, we aim to swap it with `qz.value()`.
+    # from the RandomVariable, e.g., `z.value`. If
+    # `dict_swap={z: qz}`, we aim to swap it with `qz.value`.
     for key, value in six.iteritems(dict_swap):
       if isinstance(key, RandomVariable):
-        if org_instance == key.value():
+        if org_instance == key.value:
           if isinstance(value, RandomVariable):
-            org_instance = value.value()
+            org_instance = value.value
           else:
             org_instance = value
           if not copy_q:
@@ -471,7 +471,7 @@ def get_ancestors(x, collection=None):
   if collection is None:
     collection = random_variables()
 
-  node_dict = {node.value(): node for node in collection}
+  node_dict = {node.value: node for node in collection}
 
   # Traverse the graph. Add each node to the set if it's in the collection.
   output = set()
@@ -485,7 +485,7 @@ def get_ancestors(x, collection=None):
     visited.add(node)
 
     if isinstance(node, RandomVariable):
-      node = node.value()
+      node = node.value
 
     candidate_node = node_dict.get(node, None)
     if candidate_node is not None and candidate_node != x:
@@ -560,7 +560,7 @@ def get_children(x, collection=None):
   if collection is None:
     collection = random_variables()
 
-  node_dict = {node.value(): node for node in collection}
+  node_dict = {node.value: node for node in collection}
 
   # Traverse the graph. Add each node to the set if it's in the collection.
   output = set()
@@ -574,7 +574,7 @@ def get_children(x, collection=None):
     visited.add(node)
 
     if isinstance(node, RandomVariable):
-      node = node.value()
+      node = node.value
 
     candidate_node = node_dict.get(node, None)
     if candidate_node is not None and candidate_node != x:
@@ -613,7 +613,7 @@ def get_descendants(x, collection=None):
   if collection is None:
     collection = random_variables()
 
-  node_dict = {node.value(): node for node in collection}
+  node_dict = {node.value: node for node in collection}
 
   # Traverse the graph. Add each node to the set if it's in the collection.
   output = set()
@@ -627,7 +627,7 @@ def get_descendants(x, collection=None):
     visited.add(node)
 
     if isinstance(node, RandomVariable):
-      node = node.value()
+      node = node.value
 
     candidate_node = node_dict.get(node, None)
     if candidate_node is not None and candidate_node != x:
@@ -666,7 +666,7 @@ def get_parents(x, collection=None):
   if collection is None:
     collection = random_variables()
 
-  node_dict = {node.value(): node for node in collection}
+  node_dict = {node.value: node for node in collection}
 
   # Traverse the graph. Add each node to the set if it's in the collection.
   output = set()
@@ -680,7 +680,7 @@ def get_parents(x, collection=None):
     visited.add(node)
 
     if isinstance(node, RandomVariable):
-      node = node.value()
+      node = node.value
 
     candidate_node = node_dict.get(node, None)
     if candidate_node is not None and candidate_node != x:
@@ -763,7 +763,7 @@ def get_variables(x, collection=None):
     visited.add(node)
 
     if isinstance(node, RandomVariable):
-      node = node.value()
+      node = node.value
 
     candidate_node = node_dict.get(node.name, None)
     if candidate_node is not None and candidate_node != x:
diff --git a/examples/lstm.py b/examples/lstm.py
index 1a7d89b5c..a5a01ea21 100644
--- a/examples/lstm.py
+++ b/examples/lstm.py
@@ -131,7 +131,7 @@ def language_model_gen(batch_size, vocab_size):
     x = tf.one_hot(x, depth=vocab_size, dtype=tf.float32)
     h, c = lstm_cell(x, h, c, name="lstm")
     logits = tf.layers.dense(h, vocab_size, name="dense")
-    x = Categorical(logits=logits).value()
+    x = Categorical(logits=logits).value
     xs.append(x)
 
   xs = tf.cast(tf.stack(xs, 1), tf.int32)
diff --git a/tests/models/keras_core_layers_test.py b/tests/models/keras_core_layers_test.py
index 7da455141..a92a71983 100644
--- a/tests/models/keras_core_layers_test.py
+++ b/tests/models/keras_core_layers_test.py
@@ -13,51 +13,51 @@ class test_keras_core_layers_class(tf.test.TestCase):
 
   def test_dense(self):
     x = Normal(loc=tf.zeros([100, 10, 5]), scale=tf.ones([100, 10, 5]))
-    y = layers.Dense(32)(x.value())
+    y = layers.Dense(32)(x.value)
 
   def test_activation(self):
     x = Normal(loc=tf.zeros([100, 10, 5]), scale=tf.ones([100, 10, 5]))
-    y = layers.Activation('tanh')(x.value())
+    y = layers.Activation('tanh')(x.value)
 
   def test_dropout(self):
     x = Normal(loc=tf.zeros([100, 10, 5]), scale=tf.ones([100, 10, 5]))
-    y = layers.Dropout(0.5)(x.value())
+    y = layers.Dropout(0.5)(x.value)
 
   def test_flatten(self):
     x = Normal(loc=tf.zeros([100, 10, 5]), scale=tf.ones([100, 10, 5]))
-    y = layers.Flatten()(x.value())
+    y = layers.Flatten()(x.value)
     with self.test_session():
       self.assertEqual(y.eval().shape, (100, 50))
 
   def test_reshape(self):
     x = Normal(loc=tf.zeros([100, 10, 5]), scale=tf.ones([100, 10, 5]))
-    y = layers.Reshape((5, 10))(x.value())
+    y = layers.Reshape((5, 10))(x.value)
     with self.test_session():
       self.assertEqual(y.eval().shape, (100, 5, 10))
 
   def test_permute(self):
     x = Normal(loc=tf.zeros([100, 10, 5]), scale=tf.ones([100, 10, 5]))
-    y = layers.Permute((2, 1))(x.value())
+    y = layers.Permute((2, 1))(x.value)
     with self.test_session():
       self.assertEqual(y.eval().shape, (100, 5, 10))
 
   def test_repeat_vector(self):
     x = Normal(loc=tf.zeros([100, 10]), scale=tf.ones([100, 10]))
-    y = layers.RepeatVector(2)(x.value())
+    y = layers.RepeatVector(2)(x.value)
     with self.test_session():
       self.assertEqual(y.eval().shape, (100, 2, 10))
 
   def test_lambda(self):
     x = Normal(loc=tf.zeros([100, 10, 5]), scale=tf.ones([100, 10, 5]))
-    y = layers.Lambda(lambda x: x ** 2)(x.value())
+    y = layers.Lambda(lambda x: x ** 2)(x.value)
 
   def test_activity_regularization(self):
     x = Normal(loc=tf.zeros([100, 10, 5]), scale=tf.ones([100, 10, 5]))
-    y = layers.ActivityRegularization(l1=0.1)(x.value())
+    y = layers.ActivityRegularization(l1=0.1)(x.value)
 
   def test_masking(self):
     x = Normal(loc=tf.zeros([100, 10, 5]), scale=tf.ones([100, 10, 5]))
-    y = layers.Masking()(x.value())
+    y = layers.Masking()(x.value)
 
 if __name__ == '__main__':
   tf.test.main()
diff --git a/tests/models/random_variable_operators_test.py b/tests/models/random_variable_operators_test.py
index 72ae5ab82..dd3e33093 100644
--- a/tests/models/random_variable_operators_test.py
+++ b/tests/models/random_variable_operators_test.py
@@ -15,7 +15,7 @@ def test_add(self):
       x = Normal(0.0, 1.0)
       y = 5.0
       z = x + y
-      z_value = x.value() + y
+      z_value = x.value + y
       z_eval, z_value_eval = sess.run([z, z_value])
       self.assertAllEqual(z_eval, z_value_eval)
 
@@ -24,7 +24,7 @@ def test_radd(self):
       x = Normal(0.0, 1.0)
       y = 5.0
       z = y + x
-      z_value = y + x.value()
+      z_value = y + x.value
       z_eval, z_value_eval = sess.run([z, z_value])
       self.assertAllEqual(z_eval, z_value_eval)
 
@@ -33,7 +33,7 @@ def test_sub(self):
       x = Normal(0.0, 1.0)
       y = 5.0
       z = x - y
-      z_value = x.value() - y
+      z_value = x.value - y
       z_eval, z_value_eval = sess.run([z, z_value])
       self.assertAllEqual(z_eval, z_value_eval)
 
@@ -42,7 +42,7 @@ def test_rsub(self):
       x = Normal(0.0, 1.0)
       y = 5.0
       z = y - x
-      z_value = y - x.value()
+      z_value = y - x.value
       z_eval, z_value_eval = sess.run([z, z_value])
       self.assertAllEqual(z_eval, z_value_eval)
 
@@ -51,7 +51,7 @@ def test_mul(self):
       x = Normal(0.0, 1.0)
       y = 5.0
       z = x * y
-      z_value = x.value() * y
+      z_value = x.value * y
       z_eval, z_value_eval = sess.run([z, z_value])
       self.assertAllEqual(z_eval, z_value_eval)
 
@@ -60,7 +60,7 @@ def test_rmul(self):
       x = Normal(0.0, 1.0)
       y = 5.0
       z = y * x
-      z_value = y * x.value()
+      z_value = y * x.value
       z_eval, z_value_eval = sess.run([z, z_value])
       self.assertAllEqual(z_eval, z_value_eval)
 
@@ -69,7 +69,7 @@ def test_div(self):
       x = Normal(0.0, 1.0)
       y = 5.0
       z = x / y
-      z_value = x.value() / y
+      z_value = x.value / y
       z_eval, z_value_eval = sess.run([z, z_value])
       self.assertAllEqual(z_eval, z_value_eval)
 
@@ -78,7 +78,7 @@ def test_rdiv(self):
       x = Normal(0.0, 1.0)
       y = 5.0
       z = y / x
-      z_value = y / x.value()
+      z_value = y / x.value
       z_eval, z_value_eval = sess.run([z, z_value])
       self.assertAllEqual(z_eval, z_value_eval)
 
@@ -87,7 +87,7 @@ def test_floordiv(self):
       x = Normal(0.0, 1.0)
       y = 5.0
       z = x // y
-      z_value = x.value() // y
+      z_value = x.value // y
       z_eval, z_value_eval = sess.run([z, z_value])
       self.assertAllEqual(z_eval, z_value_eval)
 
@@ -96,7 +96,7 @@ def test_rfloordiv(self):
       x = Normal(0.0, 1.0)
       y = 5.0
       z = y // x
-      z_value = y // x.value()
+      z_value = y // x.value
       z_eval, z_value_eval = sess.run([z, z_value])
       self.assertAllEqual(z_eval, z_value_eval)
 
@@ -105,7 +105,7 @@ def test_mod(self):
       x = Normal(0.0, 1.0)
       y = 5.0
       z = x % y
-      z_value = x.value() % y
+      z_value = x.value % y
       z_eval, z_value_eval = sess.run([z, z_value])
       self.assertAllEqual(z_eval, z_value_eval)
 
@@ -114,7 +114,7 @@ def test_rmod(self):
       x = Normal(0.0, 1.0)
       y = 5.0
       z = y % x
-      z_value = y % x.value()
+      z_value = y % x.value
       z_eval, z_value_eval = sess.run([z, z_value])
       self.assertAllEqual(z_eval, z_value_eval)
 
@@ -123,7 +123,7 @@ def test_lt(self):
       x = Normal(0.0, 1.0)
       y = 5.0
       z = x < y
-      z_value = x.value() < y
+      z_value = x.value < y
       z_eval, z_value_eval = sess.run([z, z_value])
       self.assertAllEqual(z_eval, z_value_eval)
 
@@ -132,7 +132,7 @@ def test_le(self):
       x = Normal(0.0, 1.0)
       y = 5.0
       z = x <= y
-      z_value = x.value() <= y
+      z_value = x.value <= y
       z_eval, z_value_eval = sess.run([z, z_value])
       self.assertAllEqual(z_eval, z_value_eval)
 
@@ -141,7 +141,7 @@ def test_gt(self):
       x = Normal(0.0, 1.0)
       y = 5.0
       z = x > y
-      z_value = x.value() > y
+      z_value = x.value > y
       z_eval, z_value_eval = sess.run([z, z_value])
       self.assertAllEqual(z_eval, z_value_eval)
 
@@ -150,7 +150,7 @@ def test_ge(self):
       x = Normal(0.0, 1.0)
       y = 5.0
       z = x >= y
-      z_value = x.value() >= y
+      z_value = x.value >= y
       z_eval, z_value_eval = sess.run([z, z_value])
       self.assertAllEqual(z_eval, z_value_eval)
 
@@ -160,7 +160,7 @@ def test_ge(self):
       # x = tf.cast(Bernoulli(0.5), tf.bool)
       # y = True
       # z = x & y
-      # z_value = x.value() & y
+      # z_value = x.value & y
       # z_eval, z_value_eval = sess.run([z, z_value])
       # self.assertAllEqual(z_eval, z_value_eval)
 
@@ -174,7 +174,7 @@ def test_getitem(self):
     with self.test_session() as sess:
       x = Normal(tf.zeros([3, 4]), tf.ones([3, 4]))
       z = x[0:2, 2:3]
-      z_value = x.value()[0:2, 2:3]
+      z_value = x.value[0:2, 2:3]
       z_eval, z_value_eval = sess.run([z, z_value])
       self.assertAllEqual(z_eval, z_value_eval)
 
@@ -183,7 +183,7 @@ def test_pow(self):
       x = Normal(0.0, 1.0)
       y = 5.0
       z = x ** y
-      z_value = x.value() ** y
+      z_value = x.value ** y
       z_eval, z_value_eval = sess.run([z, z_value])
       self.assertAllEqual(z_eval, z_value_eval)
 
@@ -192,7 +192,7 @@ def test_rpow(self):
       x = Normal(0.0, 1.0)
       y = 5.0
       z = y ** x
-      z_value = y ** x.value()
+      z_value = y ** x.value
       z_eval, z_value_eval = sess.run([z, z_value])
       self.assertAllEqual(z_eval, z_value_eval)
 
@@ -202,7 +202,7 @@ def test_neg(self):
     with self.test_session() as sess:
       x = Normal(0.0, 1.0)
       z = -x
-      z_value = -x.value()
+      z_value = -x.value
       z_eval, z_value_eval = sess.run([z, z_value])
       self.assertAllEqual(z_eval, z_value_eval)
 
@@ -210,7 +210,7 @@ def test_abs(self):
     with self.test_session() as sess:
       x = Normal(0.0, 1.0)
       z = abs(x)
-      z_value = abs(x.value())
+      z_value = abs(x.value)
       z_eval, z_value_eval = sess.run([z, z_value])
       self.assertAllEqual(z_eval, z_value_eval)
 
diff --git a/tests/models/random_variable_value_test.py b/tests/models/random_variable_value_test.py
index 0a351e645..988e86d7b 100644
--- a/tests/models/random_variable_value_test.py
+++ b/tests/models/random_variable_value_test.py
@@ -13,17 +13,17 @@ class test_random_variable_value_class(tf.test.TestCase):
 
   def _test_sample(self, RV, value, *args, **kwargs):
     rv = RV(*args, value=value, **kwargs)
-    value_shape = rv.value().shape
+    value_shape = rv.value.shape
     expected_shape = rv.sample_shape.concatenate(
         rv.batch_shape).concatenate(rv.event_shape)
     self.assertEqual(value_shape, expected_shape)
-    self.assertEqual(rv.dtype, rv.value().dtype)
+    self.assertEqual(rv.dtype, rv.value.dtype)
 
   def _test_copy(self, RV, value, *args, **kwargs):
     rv1 = RV(*args, value=value, **kwargs)
     rv2 = copy(rv1)
-    value_shape1 = rv1.value().shape
-    value_shape2 = rv2.value().shape
+    value_shape1 = rv1.value.shape
+    value_shape2 = rv2.value.shape
     self.assertEqual(value_shape1, value_shape2)
 
   def test_shape_and_dtype(self):
diff --git a/tests/util/copy_test.py b/tests/util/copy_test.py
index 6b8913073..7aa714f4d 100644
--- a/tests/util/copy_test.py
+++ b/tests/util/copy_test.py
@@ -88,8 +88,8 @@ def test_list(self):
       components = [Normal(x, tf.constant(0.1))
                     for _ in range(5)]
       z = Mixture(cat=cat, components=components)
-      z_new = ed.copy(z, {x: y.value()})
-      self.assertGreater(z_new.value().eval(), 5.0)
+      z_new = ed.copy(z, {x: y.value})
+      self.assertGreater(z_new.value.eval(), 5.0)
 
   def test_random(self):
     with self.test_session() as sess:
@@ -206,7 +206,7 @@ def test_swap_rv_tensor(self):
       y = tf.constant(1.0)
       z = x * y
       qx = Normal(10.0, 0.1)
-      z_new = ed.copy(z, {x: qx.value()})
+      z_new = ed.copy(z, {x: qx.value})
       self.assertGreater(z_new.eval(), 5.0)
 
   def test_swap_tensor_rv(self):
@@ -216,7 +216,7 @@ def test_swap_tensor_rv(self):
       y = tf.constant(1.0)
       z = x * y
       qx = Normal(10.0, 0.1)
-      z_new = ed.copy(z, {x.value(): qx})
+      z_new = ed.copy(z, {x.value: qx})
       self.assertGreater(z_new.eval(), 5.0)
 
   def test_ordering_rv_tensor(self):

From 486b7e16f9d9c46483179b7f39cca7f5cbf9c157 Mon Sep 17 00:00:00 2001
From: Dustin Tran <dustinviettran@gmail.com>
Date: Mon, 8 Jan 2018 01:04:01 -0800
Subject: [PATCH 02/27] make {klpq,klqp,laplace,map,wake_sleep}.py,gans
 functional

---
 edward/__init__.py                         |   64 +-
 edward/criticisms/evaluate.py              |    5 +-
 edward/criticisms/ppc.py                   |   11 +-
 edward/inferences/__init__.py              |   47 +-
 edward/inferences/bigan_inference.py       |  100 +-
 edward/inferences/gan_inference.py         |  275 ++---
 edward/inferences/gibbs.py                 |    4 +-
 edward/inferences/implicit_klqp.py         |  339 +++---
 edward/inferences/inference.py             |  911 +++++++++------
 edward/inferences/klpq.py                  |  261 +++--
 edward/inferences/klqp.py                  | 1176 +++++++++-----------
 edward/inferences/laplace.py               |  146 ++-
 edward/inferences/map.py                   |  157 +--
 edward/inferences/metropolis_hastings.py   |    4 +-
 edward/inferences/monte_carlo.py           |  125 +--
 edward/inferences/variational_inference.py |  185 ---
 edward/inferences/wake_sleep.py            |  195 ++--
 edward/inferences/wgan_inference.py        |  148 ++-
 edward/util/__init__.py                    |    2 -
 edward/util/random_variables.py            |   65 --
 tests/util/check_data_test.py              |   49 -
 tests/util/check_latent_vars_test.py       |   26 -
 22 files changed, 1930 insertions(+), 2365 deletions(-)
 delete mode 100644 edward/inferences/variational_inference.py
 delete mode 100644 tests/util/check_data_test.py
 delete mode 100644 tests/util/check_latent_vars_test.py

diff --git a/edward/__init__.py b/edward/__init__.py
index 5892f3b65..ddb1380fb 100644
--- a/edward/__init__.py
+++ b/edward/__init__.py
@@ -11,15 +11,27 @@
 from edward.criticisms import (
     evaluate, ppc, ppc_density_plot, ppc_stat_hist_plot)
 from edward.inferences import (
-    Inference, MonteCarlo, VariationalInference,
-    HMC, MetropolisHastings, SGLD, SGHMC,
-    KLpq, KLqp, ReparameterizationKLqp, ReparameterizationKLKLqp,
-    ReparameterizationEntropyKLqp, ScoreKLqp, ScoreKLKLqp, ScoreEntropyKLqp,
-    ScoreRBKLqp, WakeSleep, GANInference, BiGANInference, WGANInference,
-    ImplicitKLqp, MAP, Laplace, complete_conditional, Gibbs)
+    bigan_inference,
+    complete_conditional,
+    gan_inference,
+    implicit_klqp,
+    klpq,
+    klqp,
+    reparameterization_klqp,
+    reparameterization_kl_klqp,
+    reparameterization_entropy_klqp,
+    score_klqp,
+    score_kl_klqp,
+    score_entropy_klqp,
+    score_rb_klqp,
+    laplace,
+    map,
+    wake_sleep,
+    wgan_inference)
+# from edward.inferences import MonteCarlo, HMC, MetropolisHastings, SGLD, SGHMC, Gibbs
 from edward.models import RandomVariable
 from edward.util import (
-    check_data, check_latent_vars, copy, dot,
+    copy, dot,
     get_ancestors, get_blanket, get_children, get_control_variate_coef,
     get_descendants, get_parents, get_session, get_siblings, get_variables,
     is_independent, Progbar, random_variables, rbf, set_seed,
@@ -38,34 +50,30 @@
     'ppc',
     'ppc_density_plot',
     'ppc_stat_hist_plot',
-    'Inference',
+    'bigan_inference',
+    'complete_conditional',
+    'gan_inference',
+    'implicit_klqp',
     'MonteCarlo',
-    'VariationalInference',
     'HMC',
     'MetropolisHastings',
     'SGLD',
     'SGHMC',
-    'KLpq',
-    'KLqp',
-    'ReparameterizationKLqp',
-    'ReparameterizationKLKLqp',
-    'ReparameterizationEntropyKLqp',
-    'ScoreKLqp',
-    'ScoreKLKLqp',
-    'ScoreEntropyKLqp',
-    'ScoreRBKLqp',
-    'WakeSleep',
-    'GANInference',
-    'BiGANInference',
-    'WGANInference',
-    'ImplicitKLqp',
-    'MAP',
-    'Laplace',
-    'complete_conditional',
+    'klpq',
+    'klqp',
+    'reparameterization_klqp',
+    'reparameterization_kl_klqp',
+    'reparameterization_entropy_klqp',
+    'score_klqp',
+    'score_kl_klqp',
+    'score_entropy_klqp',
+    'score_rb_klqp',
+    'laplace',
+    'map',
+    'wake_sleep',
+    'wgan_inference',
     'Gibbs',
     'RandomVariable',
-    'check_data',
-    'check_latent_vars',
     'copy',
     'dot',
     'get_ancestors',
diff --git a/edward/criticisms/evaluate.py b/edward/criticisms/evaluate.py
index 44074aa59..80be843bf 100644
--- a/edward/criticisms/evaluate.py
+++ b/edward/criticisms/evaluate.py
@@ -6,8 +6,9 @@
 import six
 import tensorflow as tf
 
+from edward.inferences.inference import check_and_maybe_build_data
 from edward.models import RandomVariable
-from edward.util import check_data, get_session, compute_multinomial_mode, \
+from edward.util import get_session, compute_multinomial_mode, \
     with_binary_averaging
 
 try:
@@ -105,7 +106,7 @@ def evaluate(metrics, data, n_samples=500, output_key=None, seed=None):
   elif not isinstance(metrics, list):
     raise TypeError("metrics must have type str or list, or be callable.")
 
-  check_data(data)
+  data = check_and_maybe_build_data(data)
   if not isinstance(n_samples, int):
     raise TypeError("n_samples must have type int.")
 
diff --git a/edward/criticisms/ppc.py b/edward/criticisms/ppc.py
index ad591b18d..462aaa623 100644
--- a/edward/criticisms/ppc.py
+++ b/edward/criticisms/ppc.py
@@ -6,8 +6,10 @@
 import six
 import tensorflow as tf
 
+from edward.inferences.inference import (check_and_maybe_build_data,
+    check_and_maybe_build_latent_vars)
 from edward.models import RandomVariable
-from edward.util import check_data, check_latent_vars, get_session
+from edward.util import get_session
 
 
 def ppc(T, data, latent_vars=None, n_samples=100):
@@ -82,11 +84,8 @@ def ppc(T, data, latent_vars=None, n_samples=100):
   if not callable(T):
     raise TypeError("T must be a callable function.")
 
-  check_data(data)
-  if latent_vars is None:
-    latent_vars = {}
-
-  check_latent_vars(latent_vars)
+  data = check_and_maybe_build_data(data)
+  latent_vars = check_and_maybe_build_latent_vars(latent_vars)
   if not isinstance(n_samples, int):
     raise TypeError("n_samples must have type int.")
 
diff --git a/edward/inferences/__init__.py b/edward/inferences/__init__.py
index 38262fcb7..10c664cb3 100644
--- a/edward/inferences/__init__.py
+++ b/edward/inferences/__init__.py
@@ -7,50 +7,47 @@
 from edward.inferences.bigan_inference import *
 from edward.inferences.conjugacy import *
 from edward.inferences.gan_inference import *
-from edward.inferences.gibbs import *
-from edward.inferences.hmc import *
+# from edward.inferences.gibbs import *
+# from edward.inferences.hmc import *
 from edward.inferences.implicit_klqp import *
 from edward.inferences.inference import *
 from edward.inferences.klpq import *
 from edward.inferences.klqp import *
 from edward.inferences.laplace import *
 from edward.inferences.map import *
-from edward.inferences.metropolis_hastings import *
-from edward.inferences.monte_carlo import *
-from edward.inferences.sgld import *
-from edward.inferences.sghmc import *
-from edward.inferences.variational_inference import *
+# from edward.inferences.metropolis_hastings import *
+# from edward.inferences.monte_carlo import *
+# from edward.inferences.sgld import *
+# from edward.inferences.sghmc import *
 from edward.inferences.wake_sleep import *
 from edward.inferences.wgan_inference import *
 
 from tensorflow.python.util.all_util import remove_undocumented
 
 _allowed_symbols = [
-    'BiGANInference',
+    'bigan_inference',
     'complete_conditional',
-    'GANInference',
+    'gan_inference',
+    'implicit_klqp',
     'Gibbs',
     'HMC',
-    'ImplicitKLqp',
-    'Inference',
-    'KLpq',
-    'KLqp',
-    'ReparameterizationKLqp',
-    'ReparameterizationKLKLqp',
-    'ReparameterizationEntropyKLqp',
-    'ScoreKLqp',
-    'ScoreKLKLqp',
-    'ScoreEntropyKLqp',
-    'ScoreRBKLqp',
-    'Laplace',
-    'MAP',
+    'klpq',
+    'klqp',
+    'reparameterization_klqp',
+    'reparameterization_kl_klqp',
+    'reparameterization_entropy_klqp',
+    'score_klqp',
+    'score_kl_klqp',
+    'score_entropy_klqp',
+    'score_rb_klqp',
+    'laplace',
+    'map',
     'MetropolisHastings',
     'MonteCarlo',
     'SGLD',
     'SGHMC',
-    'VariationalInference',
-    'WakeSleep',
-    'WGANInference',
+    'wake_sleep',
+    'wgan_inference',
 ]
 
 remove_undocumented(__name__, allowed_exception_list=_allowed_symbols)
diff --git a/edward/inferences/bigan_inference.py b/edward/inferences/bigan_inference.py
index 00b2396ba..57e7c99e6 100644
--- a/edward/inferences/bigan_inference.py
+++ b/edward/inferences/bigan_inference.py
@@ -5,11 +5,14 @@
 import six
 import tensorflow as tf
 
-from edward.inferences.gan_inference import GANInference
+from edward.inferences.inference import (check_and_maybe_build_data,
+    check_and_maybe_build_latent_vars, transform, check_and_maybe_build_dict, check_and_maybe_build_var_list)
 from edward.util import get_session
 
 
-class BiGANInference(GANInference):
+def bigan_inference(latent_vars=None, data=None, discriminator=None,
+                    auto_transform=True, scale=None, var_list=None,
+                    collections=None):
   """Adversarially Learned Inference [@dumuolin2017adversarially] or
   Bidirectional Generative Adversarial Networks [@donahue2017adversarial]
   for joint learning of generator and inference networks.
@@ -43,50 +46,49 @@ class BiGANInference(GANInference):
   inference = ed.BiGANInference({z_ph: zf}, {xf: x_ph}, discriminator)
   ```
   """
-  def __init__(self, latent_vars, data, discriminator):
-    if not callable(discriminator):
-      raise TypeError("discriminator must be a callable function.")
-
-    self.discriminator = discriminator
-    # call grandparent's method; avoid parent (GANInference)
-    super(GANInference, self).__init__(latent_vars, data)
-
-  def build_loss_and_gradients(self, var_list):
-    x_true = list(six.itervalues(self.data))[0]
-    x_fake = list(six.iterkeys(self.data))[0]
-
-    z_true = list(six.iterkeys(self.latent_vars))[0]
-    z_fake = list(six.itervalues(self.latent_vars))[0]
-
-    with tf.variable_scope("Disc"):
-        # xtzf := x_true, z_fake
-        d_xtzf = self.discriminator(x_true, z_fake)
-    with tf.variable_scope("Disc", reuse=True):
-        # xfzt := x_fake, z_true
-        d_xfzt = self.discriminator(x_fake, z_true)
-
-    loss_d = tf.nn.sigmoid_cross_entropy_with_logits(
-        labels=tf.ones_like(d_xfzt), logits=d_xfzt) + \
-        tf.nn.sigmoid_cross_entropy_with_logits(
-            labels=tf.zeros_like(d_xtzf), logits=d_xtzf)
-    loss = tf.nn.sigmoid_cross_entropy_with_logits(
-        labels=tf.zeros_like(d_xfzt), logits=d_xfzt) + \
-        tf.nn.sigmoid_cross_entropy_with_logits(
-            labels=tf.ones_like(d_xtzf), logits=d_xtzf)
-
-    reg_terms_d = tf.losses.get_regularization_losses(scope="Disc")
-    reg_terms = tf.losses.get_regularization_losses(scope="Gen")
-
-    loss_d = tf.reduce_mean(loss_d) + tf.reduce_sum(reg_terms_d)
-    loss = tf.reduce_mean(loss) + tf.reduce_sum(reg_terms)
-
-    var_list_d = tf.get_collection(
-        tf.GraphKeys.TRAINABLE_VARIABLES, scope="Disc")
-    var_list = tf.get_collection(
-        tf.GraphKeys.TRAINABLE_VARIABLES, scope="Gen")
-
-    grads_d = tf.gradients(loss_d, var_list_d)
-    grads = tf.gradients(loss, var_list)
-    grads_and_vars_d = list(zip(grads_d, var_list_d))
-    grads_and_vars = list(zip(grads, var_list))
-    return loss, grads_and_vars, loss_d, grads_and_vars_d
+  if not callable(discriminator):
+    raise TypeError("discriminator must be a callable function.")
+  latent_vars = check_and_maybe_build_latent_vars(latent_vars)
+  data = check_and_maybe_build_data(data)
+  latent_vars, _ = transform(latent_vars, auto_transform)
+  scale = check_and_maybe_build_dict(scale)
+  var_list = check_and_maybe_build_var_list(var_list, latent_vars, data)
+
+  x_true = list(six.itervalues(data))[0]
+  x_fake = list(six.iterkeys(data))[0]
+
+  z_true = list(six.iterkeys(latent_vars))[0]
+  z_fake = list(six.itervalues(latent_vars))[0]
+
+  with tf.variable_scope("Disc"):
+    # xtzf := discriminator output on (x_true, z_fake)
+    d_xtzf = discriminator(x_true, z_fake)
+  with tf.variable_scope("Disc", reuse=True):
+    # xfzt := discriminator output on (x_fake, z_true); reuses "Disc" variables
+    d_xfzt = discriminator(x_fake, z_true)
+
+  loss_d = tf.nn.sigmoid_cross_entropy_with_logits(
+      labels=tf.ones_like(d_xfzt), logits=d_xfzt) + \
+      tf.nn.sigmoid_cross_entropy_with_logits(
+          labels=tf.zeros_like(d_xtzf), logits=d_xtzf)
+  loss = tf.nn.sigmoid_cross_entropy_with_logits(
+      labels=tf.zeros_like(d_xfzt), logits=d_xfzt) + \
+      tf.nn.sigmoid_cross_entropy_with_logits(
+          labels=tf.ones_like(d_xtzf), logits=d_xtzf)
+
+  reg_terms_d = tf.losses.get_regularization_losses(scope="Disc")
+  reg_terms = tf.losses.get_regularization_losses(scope="Gen")
+
+  loss_d = tf.reduce_mean(loss_d) + tf.reduce_sum(reg_terms_d)
+  loss = tf.reduce_mean(loss) + tf.reduce_sum(reg_terms)
+
+  var_list_d = tf.get_collection(
+      tf.GraphKeys.TRAINABLE_VARIABLES, scope="Disc")
+  var_list = tf.get_collection(
+      tf.GraphKeys.TRAINABLE_VARIABLES, scope="Gen")
+
+  grads_d = tf.gradients(loss_d, var_list_d)
+  grads = tf.gradients(loss, var_list)
+  grads_and_vars_d = list(zip(grads_d, var_list_d))
+  grads_and_vars = list(zip(grads, var_list))
+  return loss, grads_and_vars, loss_d, grads_and_vars_d
diff --git a/edward/inferences/gan_inference.py b/edward/inferences/gan_inference.py
index 2a8c9d17c..40dd99859 100644
--- a/edward/inferences/gan_inference.py
+++ b/edward/inferences/gan_inference.py
@@ -5,11 +5,13 @@
 import six
 import tensorflow as tf
 
-from edward.inferences.variational_inference import VariationalInference
+from edward.inferences.inference import (check_and_maybe_build_data,
+    transform, check_and_maybe_build_dict, check_and_maybe_build_var_list)
 from edward.util import get_session
 
 
-class GANInference(VariationalInference):
+def gan_inference(data=None, discriminator=None,
+                  scale=None, var_list=None, collections=None):
   """Parameter estimation with GAN-style training
   [@goodfellow2014generative].
 
@@ -42,214 +44,61 @@ class GANInference(VariationalInference):
   inference = ed.GANInference({x: x_data}, discriminator)
   ```
   """
-  def __init__(self, data, discriminator):
-    """Create an inference algorithm.
-
-    Args:
-      data: dict.
-        Data dictionary which binds observed variables (of type
-        `RandomVariable` or `tf.Tensor`) to their realizations (of
-        type `tf.Tensor`).  It can also bind placeholders (of type
-        `tf.Tensor`) used in the model to their realizations.
-      discriminator: function.
-        Function (with parameters) to discriminate samples. It should
-        output logit probabilities (real-valued) and not probabilities
-        in $[0, 1]$.
-    """
-    if not callable(discriminator):
-      raise TypeError("discriminator must be a callable function.")
-
-    self.discriminator = discriminator
-    super(GANInference, self).__init__(None, data)
-
-  def initialize(self, optimizer=None, optimizer_d=None,
-                 global_step=None, global_step_d=None, var_list=None,
-                 *args, **kwargs):
-    """Initialize inference algorithm. It initializes hyperparameters
-    and builds ops for the algorithm's computation graph.
-
-    Args:
-      optimizer: str or tf.train.Optimizer.
-        A TensorFlow optimizer, to use for optimizing the generator
-        objective. Alternatively, one can pass in the name of a
-        TensorFlow optimizer, and default parameters for the optimizer
-        will be used.
-      optimizer_d: str or tf.train.Optimizer.
-        A TensorFlow optimizer, to use for optimizing the discriminator
-        objective. Alternatively, one can pass in the name of a
-        TensorFlow optimizer, and default parameters for the optimizer
-        will be used.
-      global_step: tf.Variable.
-        Optional `Variable` to increment by one after the variables
-        for the generator have been updated. See
-        `tf.train.Optimizer.apply_gradients`.
-      global_step_d: tf.Variable.
-        Optional `Variable` to increment by one after the variables
-        for the discriminator have been updated. See
-        `tf.train.Optimizer.apply_gradients`.
-      var_list: list of tf.Variable.
-        List of TensorFlow variables to optimize over (in the generative
-        model). Default is all trainable variables that `latent_vars`
-        and `data` depend on.
-    """
-    # call grandparent's method; avoid parent (VariationalInference)
-    super(VariationalInference, self).initialize(*args, **kwargs)
-
-    self.loss, grads_and_vars, self.loss_d, grads_and_vars_d = \
-        self.build_loss_and_gradients(var_list)
-
-    optimizer, global_step = _build_optimizer(optimizer, global_step)
-    optimizer_d, global_step_d = _build_optimizer(optimizer_d, global_step_d)
-
-    self.train = optimizer.apply_gradients(grads_and_vars,
-                                           global_step=global_step)
-    self.train_d = optimizer_d.apply_gradients(grads_and_vars_d,
-                                               global_step=global_step_d)
-
-    if self.logging:
-      tf.summary.scalar("loss", self.loss,
-                        collections=[self._summary_key])
-      tf.summary.scalar("loss/discriminative", self.loss_d,
-                        collections=[self._summary_key])
-      self.summarize = tf.summary.merge_all(key=self._summary_key)
-
-  def build_loss_and_gradients(self, var_list):
-    x_true = list(six.itervalues(self.data))[0]
-    x_fake = list(six.iterkeys(self.data))[0]
-    with tf.variable_scope("Disc"):
-      d_true = self.discriminator(x_true)
-
-    with tf.variable_scope("Disc", reuse=True):
-      d_fake = self.discriminator(x_fake)
-
-    if self.logging:
-      tf.summary.histogram("discriminator_outputs",
-                           tf.concat([d_true, d_fake], axis=0),
-                           collections=[self._summary_key])
-
-    reg_terms_d = tf.losses.get_regularization_losses(scope="Disc")
-    reg_terms_all = tf.losses.get_regularization_losses()
-    reg_terms = [r for r in reg_terms_all if r not in reg_terms_d]
-
-    loss_d = tf.nn.sigmoid_cross_entropy_with_logits(
-        labels=tf.ones_like(d_true), logits=d_true) + \
-        tf.nn.sigmoid_cross_entropy_with_logits(
-            labels=tf.zeros_like(d_fake), logits=d_fake)
-    loss = tf.nn.sigmoid_cross_entropy_with_logits(
-        labels=tf.ones_like(d_fake), logits=d_fake)
-    loss_d = tf.reduce_mean(loss_d) + tf.reduce_sum(reg_terms_d)
-    loss = tf.reduce_mean(loss) + tf.reduce_sum(reg_terms)
-
-    var_list_d = tf.get_collection(
-        tf.GraphKeys.TRAINABLE_VARIABLES, scope="Disc")
-    if var_list is None:
-      var_list = [v for v in tf.trainable_variables() if v not in var_list_d]
-
-    grads_d = tf.gradients(loss_d, var_list_d)
-    grads = tf.gradients(loss, var_list)
-    grads_and_vars_d = list(zip(grads_d, var_list_d))
-    grads_and_vars = list(zip(grads, var_list))
-    return loss, grads_and_vars, loss_d, grads_and_vars_d
-
-  def update(self, feed_dict=None, variables=None):
-    """Run one iteration of optimization.
-
-    Args:
-      feed_dict: dict.
-        Feed dictionary for a TensorFlow session run. It is used to feed
-        placeholders that are not fed during initialization.
-      variables: str.
-        Which set of variables to update. Either "Disc" or "Gen".
-        Default is both.
-
-    Returns:
-      dict.
-      Dictionary of algorithm-specific information. In this case, the
-      iteration number and generative and discriminative losses.
-
-    #### Notes
-
-    The outputted iteration number is the total number of calls to
-    `update`. Each update may include updating only a subset of
-    parameters.
-    """
-    if feed_dict is None:
-      feed_dict = {}
-
-    for key, value in six.iteritems(self.data):
-      if isinstance(key, tf.Tensor) and "Placeholder" in key.op.type:
-        feed_dict[key] = value
-
-    sess = get_session()
-    if variables is None:
-      _, _, t, loss, loss_d = sess.run(
-          [self.train, self.train_d, self.increment_t, self.loss, self.loss_d],
-          feed_dict)
-    elif variables == "Gen":
-      _, t, loss = sess.run(
-          [self.train, self.increment_t, self.loss], feed_dict)
-      loss_d = 0.0
-    elif variables == "Disc":
-      _, t, loss_d = sess.run(
-          [self.train_d, self.increment_t, self.loss_d], feed_dict)
-      loss = 0.0
-    else:
-      raise NotImplementedError("variables must be None, 'Gen', or 'Disc'.")
-
-    if self.debug:
-      sess.run(self.op_check, feed_dict)
-
-    if self.logging and self.n_print != 0:
-      if t == 1 or t % self.n_print == 0:
-        summary = sess.run(self.summarize, feed_dict)
-        self.train_writer.add_summary(summary, t)
-
-    return {'t': t, 'loss': loss, 'loss_d': loss_d}
-
-  def print_progress(self, info_dict):
-    """Print progress to output.
-    """
-    if self.n_print != 0:
-      t = info_dict['t']
-      if t == 1 or t % self.n_print == 0:
-        self.progbar.update(t, {'Gen Loss': info_dict['loss'],
-                                'Disc Loss': info_dict['loss_d']})
-
-
-def _build_optimizer(optimizer, global_step):
-  if optimizer is None and global_step is None:
-    # Default optimizer always uses a global step variable.
-    global_step = tf.Variable(0, trainable=False, name="global_step")
-
-  if isinstance(global_step, tf.Variable):
-    starter_learning_rate = 0.1
-    learning_rate = tf.train.exponential_decay(starter_learning_rate,
-                                               global_step,
-                                               100, 0.9, staircase=True)
-  else:
-    learning_rate = 0.01
-
-  # Build optimizer.
-  if optimizer is None:
-    optimizer = tf.train.AdamOptimizer(learning_rate)
-  elif isinstance(optimizer, str):
-    if optimizer == 'gradientdescent':
-      optimizer = tf.train.GradientDescentOptimizer(learning_rate)
-    elif optimizer == 'adadelta':
-      optimizer = tf.train.AdadeltaOptimizer(learning_rate)
-    elif optimizer == 'adagrad':
-      optimizer = tf.train.AdagradOptimizer(learning_rate)
-    elif optimizer == 'momentum':
-      optimizer = tf.train.MomentumOptimizer(learning_rate, 0.9)
-    elif optimizer == 'adam':
-      optimizer = tf.train.AdamOptimizer(learning_rate)
-    elif optimizer == 'ftrl':
-      optimizer = tf.train.FtrlOptimizer(learning_rate)
-    elif optimizer == 'rmsprop':
-      optimizer = tf.train.RMSPropOptimizer(learning_rate)
-    else:
-      raise ValueError('Optimizer class not found:', optimizer)
-  elif not isinstance(optimizer, tf.train.Optimizer):
-    raise TypeError("Optimizer must be str, tf.train.Optimizer, or None.")
-
-  return optimizer, global_step
+  """Create an inference algorithm.
+
+  Args:
+    data: dict.
+      Data dictionary which binds observed variables (of type
+      `RandomVariable` or `tf.Tensor`) to their realizations (of
+      type `tf.Tensor`).  It can also bind placeholders (of type
+      `tf.Tensor`) used in the model to their realizations.
+    discriminator: function.
+      Function (with parameters) to discriminate samples. It should
+      output logit probabilities (real-valued) and not probabilities
+      in $[0, 1]$.
+    var_list: list of tf.Variable, optional.
+      List of TensorFlow variables to optimize over (in the generative
+      model). Default is all trainable variables that `data` depends on.
+  """
+  if not callable(discriminator):
+    raise TypeError("discriminator must be a callable function.")
+  data = check_and_maybe_build_data(data)
+  scale = check_and_maybe_build_dict(scale)
+  var_list = check_and_maybe_build_var_list(var_list, {}, data)
+
+  x_true = list(six.itervalues(data))[0]
+  x_fake = list(six.iterkeys(data))[0]
+  with tf.variable_scope("Disc"):
+    d_true = discriminator(x_true)
+
+  with tf.variable_scope("Disc", reuse=True):
+    d_fake = discriminator(x_fake)
+
+  if collections is not None:
+    tf.summary.histogram("discriminator_outputs",
+                         tf.concat([d_true, d_fake], axis=0),
+                         collections=collections)
+
+  reg_terms_d = tf.losses.get_regularization_losses(scope="Disc")
+  reg_terms_all = tf.losses.get_regularization_losses()
+  reg_terms = [r for r in reg_terms_all if r not in reg_terms_d]
+
+  loss_d = tf.nn.sigmoid_cross_entropy_with_logits(
+      labels=tf.ones_like(d_true), logits=d_true) + \
+      tf.nn.sigmoid_cross_entropy_with_logits(
+          labels=tf.zeros_like(d_fake), logits=d_fake)
+  loss = tf.nn.sigmoid_cross_entropy_with_logits(
+      labels=tf.ones_like(d_fake), logits=d_fake)
+  loss_d = tf.reduce_mean(loss_d) + tf.reduce_sum(reg_terms_d)
+  loss = tf.reduce_mean(loss) + tf.reduce_sum(reg_terms)
+
+  var_list_d = tf.get_collection(
+      tf.GraphKeys.TRAINABLE_VARIABLES, scope="Disc")
+  if var_list is None:
+    var_list = [v for v in tf.trainable_variables() if v not in var_list_d]
+
+  grads_d = tf.gradients(loss_d, var_list_d)
+  grads = tf.gradients(loss, var_list)
+  grads_and_vars_d = list(zip(grads_d, var_list_d))
+  grads_and_vars = list(zip(grads, var_list))
+  return loss, grads_and_vars, loss_d, grads_and_vars_d
diff --git a/edward/inferences/gibbs.py b/edward/inferences/gibbs.py
index 3efb2d0c9..a5af7bd5d 100644
--- a/edward/inferences/gibbs.py
+++ b/edward/inferences/gibbs.py
@@ -10,7 +10,7 @@
 from edward.inferences.conjugacy import complete_conditional
 from edward.inferences.monte_carlo import MonteCarlo
 from edward.models import RandomVariable
-from edward.util import check_latent_vars, get_session
+from edward.util import check_and_maybe_build_latent_vars, get_session
 
 
 class Gibbs(MonteCarlo):
@@ -45,7 +45,7 @@ def __init__(self, latent_vars, proposal_vars=None, data=None):
       proposal_vars = {z: complete_conditional(z)
                        for z in six.iterkeys(latent_vars)}
     else:
-      check_latent_vars(proposal_vars)
+      proposal_vars = check_and_maybe_build_latent_vars(proposal_vars)
 
     self.proposal_vars = proposal_vars
     super(Gibbs, self).__init__(latent_vars, data)
diff --git a/edward/inferences/implicit_klqp.py b/edward/inferences/implicit_klqp.py
index 3baf9d813..5dcc485f0 100644
--- a/edward/inferences/implicit_klqp.py
+++ b/edward/inferences/implicit_klqp.py
@@ -5,12 +5,15 @@
 import six
 import tensorflow as tf
 
-from edward.inferences.gan_inference import GANInference
+from edward.inferences.inference import (check_and_maybe_build_data,
+    check_and_maybe_build_latent_vars, transform, check_and_maybe_build_dict, check_and_maybe_build_var_list)
 from edward.models import RandomVariable
-from edward.util import check_latent_vars, copy, get_session
+from edward.util import copy, get_session
 
 
-class ImplicitKLqp(GANInference):
+def implicit_klqp(latent_vars=None, data=None, discriminator=None,
+                  global_vars=None, ratio_loss='log',
+                  auto_transform=True, scale=None, var_list=None, collections=None):
   """Variational inference with implicit probabilistic models
   [@tran2017deep].
 
@@ -52,179 +55,169 @@ class ImplicitKLqp(GANInference):
   The objective function also adds to itself a summation over all
   tensors in the `REGULARIZATION_LOSSES` collection.
   """
-  def __init__(self, latent_vars, data=None, discriminator=None,
-               global_vars=None):
-    """Create an inference algorithm.
-
-    Args:
-      discriminator: function.
-        Function (with parameters). Unlike `GANInference`, it is
-        interpreted as a ratio estimator rather than a discriminator.
-        It takes three arguments: a data dict, local latent variable
-        dict, and global latent variable dict. As with GAN
-        discriminators, it can take a batch of data points and local
-        variables, of size $M$, and output a vector of length
-        $M$.
-      global_vars: dict of RandomVariable to RandomVariable.
-        Identifying which variables in `latent_vars` are global
-        variables, shared across data points. These will not be
-        encompassed in the ratio estimation problem, and will be
-        estimated with tractable variational approximations.
-    """
-    if not callable(discriminator):
-      raise TypeError("discriminator must be a callable function.")
-
-    self.discriminator = discriminator
-    if global_vars is None:
-      global_vars = {}
-
-    check_latent_vars(global_vars)
-    self.global_vars = global_vars
-    # call grandparent's method; avoid parent (GANInference)
-    super(GANInference, self).__init__(latent_vars, data)
-
-  def initialize(self, ratio_loss='log', *args, **kwargs):
-    """Initialize inference algorithm. It initializes hyperparameters
-    and builds ops for the algorithm's computation graph.
-
-    Args:
-      ratio_loss: str or fn.
-        Loss function minimized to get the ratio estimator. 'log' or 'hinge'.
-        Alternatively, one can pass in a function of two inputs,
-        `psamples` and `qsamples`, and output a point-wise value
-        with shape matching the shapes of the two inputs.
-    """
-    if callable(ratio_loss):
-      self.ratio_loss = ratio_loss
-    elif ratio_loss == 'log':
-      self.ratio_loss = log_loss
-    elif ratio_loss == 'hinge':
-      self.ratio_loss = hinge_loss
-    else:
-      raise ValueError('Ratio loss not found:', ratio_loss)
-
-    return super(ImplicitKLqp, self).initialize(*args, **kwargs)
-
-  def build_loss_and_gradients(self, var_list):
-    """Build loss function
-
-    $-\Big(\mathbb{E}_{q(\\beta)} [\log p(\\beta) - \log q(\\beta) ] +
-        \sum_{n=1}^N \mathbb{E}_{q(\\beta)q(z_n\mid\\beta)} [
-            r^*(x_n, z_n, \\beta) ] \Big).$
-
-    We minimize it with respect to parameterized variational
-    families $q(z, \\beta; \lambda)$.
-
-    $r^*(x_n, z_n, \\beta)$ is a function of a single data point
-    $x_n$, single local variable $z_n$, and all global
-    variables $\\beta$. It is equal to the log-ratio
-
-    $\log p(x_n, z_n\mid \\beta) - \log q(x_n, z_n\mid \\beta),$
-
-    where $q(x_n)$ is the empirical data distribution. Rather
-    than explicit calculation, $r^*(x, z, \\beta)$ is the
-    solution to a ratio estimation problem, minimizing the specified
-    `ratio_loss`.
-
-    Gradients are taken using the reparameterization trick
-    [@kingma2014auto].
-
-    #### Notes
-
-    This also includes model parameters $p(x, z, \\beta; \\theta)$
-    and variational distributions with inference networks
-    $q(z\mid x)$.
-
-    There are a bunch of extensions we could easily do in this
-    implementation:
-
-    + further factorizations can be used to better leverage the
-      graph structure for more complicated models;
-    + score function gradients for global variables;
-    + use more samples; this would require the `copy()` utility
-      function for q's as well, and an additional loop. we opt not to
-      because it complicates the code;
-    + analytic KL/swapping out the penalty term for the globals.
-    """
-    # Collect tensors used in calculation of losses.
-    scope = tf.get_default_graph().unique_name("inference")
-    qbeta_sample = {}
-    pbeta_log_prob = 0.0
-    qbeta_log_prob = 0.0
-    for beta, qbeta in six.iteritems(self.global_vars):
-      # Draw a sample beta' ~ q(beta) and calculate
-      # log p(beta') and log q(beta').
-      qbeta_sample[beta] = qbeta.value
-      pbeta_log_prob += tf.reduce_sum(beta.log_prob(qbeta_sample[beta]))
-      qbeta_log_prob += tf.reduce_sum(qbeta.log_prob(qbeta_sample[beta]))
-
-    pz_sample = {}
-    qz_sample = {}
-    for z, qz in six.iteritems(self.latent_vars):
-      if z not in self.global_vars:
-        # Copy local variables p(z), q(z) to draw samples
-        # z' ~ p(z | beta'), z' ~ q(z | beta').
-        pz_copy = copy(z, dict_swap=qbeta_sample, scope=scope)
-        pz_sample[z] = pz_copy.value
-        qz_sample[z] = qz.value
-
-    # Collect x' ~ p(x | z', beta') and x' ~ q(x).
-    dict_swap = qbeta_sample.copy()
-    dict_swap.update(qz_sample)
-    x_psample = {}
-    x_qsample = {}
-    for x, x_data in six.iteritems(self.data):
-      if isinstance(x, tf.Tensor):
-        if "Placeholder" not in x.op.type:
-          # Copy p(x | z, beta) to get draw p(x | z', beta').
-          x_copy = copy(x, dict_swap=dict_swap, scope=scope)
-          x_psample[x] = x_copy
-          x_qsample[x] = x_data
-      elif isinstance(x, RandomVariable):
+  """Create an inference algorithm.
+
+  Args:
+    discriminator: function.
+      Function (with parameters). Unlike `GANInference`, it is
+      interpreted as a ratio estimator rather than a discriminator.
+      It takes three arguments: a data dict, local latent variable
+      dict, and global latent variable dict. As with GAN
+      discriminators, it can take a batch of data points and local
+      variables, of size $M$, and output a vector of length
+      $M$.
+    global_vars: dict of RandomVariable to RandomVariable, optional.
+      Identifying which variables in `latent_vars` are global
+      variables, shared across data points. These will not be
+      encompassed in the ratio estimation problem, and will be
+      estimated with tractable variational approximations.
+  """
+  """Initialize inference algorithm. It initializes hyperparameters
+  and builds ops for the algorithm's computation graph.
+
+  Args:
+    ratio_loss: str or fn, optional.
+      Loss function minimized to get the ratio estimator. 'log' or 'hinge'.
+      Alternatively, one can pass in a function of two inputs,
+      `psamples` and `qsamples`, and output a point-wise value
+      with shape matching the shapes of the two inputs.
+  """
+  """Build loss function
+
+  $-\Big(\mathbb{E}_{q(\\beta)} [\log p(\\beta) - \log q(\\beta) ] +
+      \sum_{n=1}^N \mathbb{E}_{q(\\beta)q(z_n\mid\\beta)} [
+          r^*(x_n, z_n, \\beta) ] \Big).$
+
+  We minimize it with respect to parameterized variational
+  families $q(z, \\beta; \lambda)$.
+
+  $r^*(x_n, z_n, \\beta)$ is a function of a single data point
+  $x_n$, single local variable $z_n$, and all global
+  variables $\\beta$. It is equal to the log-ratio
+
+  $\log p(x_n, z_n\mid \\beta) - \log q(x_n, z_n\mid \\beta),$
+
+  where $q(x_n)$ is the empirical data distribution. Rather
+  than explicit calculation, $r^*(x, z, \\beta)$ is the
+  solution to a ratio estimation problem, minimizing the specified
+  `ratio_loss`.
+
+  Gradients are taken using the reparameterization trick
+  [@kingma2014auto].
+
+  #### Notes
+
+  This also includes model parameters $p(x, z, \\beta; \\theta)$
+  and variational distributions with inference networks
+  $q(z\mid x)$.
+
+  There are a bunch of extensions we could easily do in this
+  implementation:
+
+  + further factorizations can be used to better leverage the
+    graph structure for more complicated models;
+  + score function gradients for global variables;
+  + use more samples; this would require the `copy()` utility
+    function for q's as well, and an additional loop. we opt not to
+    because it complicates the code;
+  + analytic KL/swapping out the penalty term for the globals.
+  """
+  if not callable(discriminator):
+    raise TypeError("discriminator must be a callable function.")
+  if callable(ratio_loss):
+    ratio_loss = ratio_loss
+  elif ratio_loss == 'log':
+    ratio_loss = log_loss
+  elif ratio_loss == 'hinge':
+    ratio_loss = hinge_loss
+  else:
+    raise ValueError('Ratio loss not found:', ratio_loss)
+  latent_vars = check_and_maybe_build_latent_vars(latent_vars)
+  data = check_and_maybe_build_data(data)
+  global_vars = check_and_maybe_build_latent_vars(global_vars)
+  latent_vars, _ = transform(latent_vars, auto_transform)
+  scale = check_and_maybe_build_dict(scale)
+  var_list = check_and_maybe_build_var_list(var_list, latent_vars, data)
+
+  # Collect tensors used in calculation of losses.
+  scope = tf.get_default_graph().unique_name("inference")
+  qbeta_sample = {}
+  pbeta_log_prob = 0.0
+  qbeta_log_prob = 0.0
+  for beta, qbeta in six.iteritems(global_vars):
+    # Draw a sample beta' ~ q(beta) and calculate
+    # log p(beta') and log q(beta').
+    qbeta_sample[beta] = qbeta.value
+    pbeta_log_prob += tf.reduce_sum(beta.log_prob(qbeta_sample[beta]))
+    qbeta_log_prob += tf.reduce_sum(qbeta.log_prob(qbeta_sample[beta]))
+
+  pz_sample = {}
+  qz_sample = {}
+  for z, qz in six.iteritems(latent_vars):
+    if z not in global_vars:
+      # Copy local variables p(z), q(z) to draw samples
+      # z' ~ p(z | beta'), z' ~ q(z | beta').
+      pz_copy = copy(z, dict_swap=qbeta_sample, scope=scope)
+      pz_sample[z] = pz_copy.value
+      qz_sample[z] = qz.value
+
+  # Collect x' ~ p(x | z', beta') and x' ~ q(x).
+  dict_swap = qbeta_sample.copy()
+  dict_swap.update(qz_sample)
+  x_psample = {}
+  x_qsample = {}
+  for x, x_data in six.iteritems(data):
+    if isinstance(x, tf.Tensor):
+      if "Placeholder" not in x.op.type:
         # Copy p(x | z, beta) to get draw p(x | z', beta').
         x_copy = copy(x, dict_swap=dict_swap, scope=scope)
-        x_psample[x] = x_copy.value
+        x_psample[x] = x_copy
         x_qsample[x] = x_data
-
-    with tf.variable_scope("Disc"):
-      r_psample = self.discriminator(x_psample, pz_sample, qbeta_sample)
-
-    with tf.variable_scope("Disc", reuse=True):
-      r_qsample = self.discriminator(x_qsample, qz_sample, qbeta_sample)
-
-    # Form ratio loss and ratio estimator.
-    if len(self.scale) <= 1:
-      loss_d = tf.reduce_mean(self.ratio_loss(r_psample, r_qsample))
-      scale = list(six.itervalues(self.scale))
-      scale = scale[0] if scale else 1.0
-      scaled_ratio = tf.reduce_sum(scale * r_qsample)
-    else:
-      loss_d = [tf.reduce_mean(self.ratio_loss(r_psample[key], r_qsample[key]))
-                for key in six.iterkeys(self.scale)]
-      loss_d = tf.reduce_sum(loss_d)
-      scaled_ratio = [tf.reduce_sum(self.scale[key] * r_qsample[key])
-                      for key in six.iterkeys(self.scale)]
-      scaled_ratio = tf.reduce_sum(scaled_ratio)
-
-    reg_terms_d = tf.losses.get_regularization_losses(scope="Disc")
-    reg_terms_all = tf.losses.get_regularization_losses()
-    reg_terms = [r for r in reg_terms_all if r not in reg_terms_d]
-
-    # Form variational objective.
-    loss = -(pbeta_log_prob - qbeta_log_prob + scaled_ratio -
-             tf.reduce_sum(reg_terms))
-    loss_d = loss_d + tf.reduce_sum(reg_terms_d)
-
-    var_list_d = tf.get_collection(
-        tf.GraphKeys.TRAINABLE_VARIABLES, scope="Disc")
-    if var_list is None:
-      var_list = [v for v in tf.trainable_variables() if v not in var_list_d]
-
-    grads = tf.gradients(loss, var_list)
-    grads_d = tf.gradients(loss_d, var_list_d)
-    grads_and_vars = list(zip(grads, var_list))
-    grads_and_vars_d = list(zip(grads_d, var_list_d))
-    return loss, grads_and_vars, loss_d, grads_and_vars_d
+    elif isinstance(x, RandomVariable):
+      # Copy p(x | z, beta) to get draw p(x | z', beta').
+      x_copy = copy(x, dict_swap=dict_swap, scope=scope)
+      x_psample[x] = x_copy.value
+      x_qsample[x] = x_data
+
+  with tf.variable_scope("Disc"):
+    r_psample = discriminator(x_psample, pz_sample, qbeta_sample)
+
+  with tf.variable_scope("Disc", reuse=True):
+    r_qsample = discriminator(x_qsample, qz_sample, qbeta_sample)
+
+  # Form ratio loss and ratio estimator.
+  if len(scale) <= 1:
+    loss_d = tf.reduce_mean(ratio_loss(r_psample, r_qsample))
+    scale = list(six.itervalues(scale))
+    scale = scale[0] if scale else 1.0
+    scaled_ratio = tf.reduce_sum(scale * r_qsample)
+  else:
+    loss_d = [tf.reduce_mean(ratio_loss(r_psample[key], r_qsample[key]))
+              for key in six.iterkeys(scale)]
+    loss_d = tf.reduce_sum(loss_d)
+    scaled_ratio = [tf.reduce_sum(scale[key] * r_qsample[key])
+                    for key in six.iterkeys(scale)]
+    scaled_ratio = tf.reduce_sum(scaled_ratio)
+
+  reg_terms_d = tf.losses.get_regularization_losses(scope="Disc")
+  reg_terms_all = tf.losses.get_regularization_losses()
+  reg_terms = [r for r in reg_terms_all if r not in reg_terms_d]
+
+  # Form variational objective.
+  loss = -(pbeta_log_prob - qbeta_log_prob + scaled_ratio -
+           tf.reduce_sum(reg_terms))
+  loss_d = loss_d + tf.reduce_sum(reg_terms_d)
+
+  var_list_d = tf.get_collection(
+      tf.GraphKeys.TRAINABLE_VARIABLES, scope="Disc")
+  if var_list is None:
+    var_list = [v for v in tf.trainable_variables() if v not in var_list_d]
+
+  grads = tf.gradients(loss, var_list)
+  grads_d = tf.gradients(loss_d, var_list_d)
+  grads_and_vars = list(zip(grads, var_list))
+  grads_and_vars_d = list(zip(grads_d, var_list_d))
+  return loss, grads_and_vars, loss_d, grads_and_vars_d
 
 
 def log_loss(psample, qsample):
diff --git a/edward/inferences/inference.py b/edward/inferences/inference.py
index 28614223f..24f8238c4 100644
--- a/edward/inferences/inference.py
+++ b/edward/inferences/inference.py
@@ -1,8 +1,50 @@
+"""
+There are two approaches to inference.
+
+1. Idiomatic TensorFlow
+  1. Build train_op (need functions).
+  2. Build summary file writer.
+  3. Build and run TensorFlow variable initializer ops.
+  4. Build progressbar (need functions).
+  5. Within a training loop:
+    + sess.run with infeeding and summary writers.
+    + Update progressbar (need functions).
+    + Check convergence (need functions).
+  6. Build and run post-training ops (need functions).
+2. Idiomatic TensorFlow Estimator
+  + Call train(). It is a higher-order function taking in the model
+    program, inference function to build the train_op, and various
+    other things.
+
+Inference provides functions for both approaches. In the first
+approach, it provides (1) inference algorithms to help produce the
+train_op (and low-level functions to build your own algorithms); (2) a
+progressbar to build and update; (3) convergence diagnostics; and (4)
+post-training ops for certain algorithms. In the second approach, it
+provides the fully automated train().
+
+Inference uses (unbound) pure functions with TensorFlow idiomatic
+exceptions (e.g., mutable state via TensorFlow variables; side effect
+of adding to global collections and TF graph). It forgoes OO.
+
+This file is a collection of functions shared across inference
+algorithms, used for the following:
+
++ input checking and default constructors
++ programmatic docstrings
++ automated transforms
++ summaries
++ variable scoping
++ train()
++ for a subset of algorithms, optimizer and Monte Carlo utilities (TBA).
+
+Other files provide functions to help produce the train (and
+post-training) ops.
+"""
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import abc
 import numpy as np
 import six
 import tensorflow as tf
@@ -10,367 +52,540 @@
 
 from datetime import datetime
 from edward.models import RandomVariable
-from edward.util import check_data, check_latent_vars, get_session, \
-    get_variables, Progbar, transform
+from edward.util import get_session, get_variables, Progbar
+from edward.util import transform as _transform
 
 from tensorflow.contrib.distributions import bijectors
 
 
-@six.add_metaclass(abc.ABCMeta)
-class Inference(object):
-  """Abstract base class for inference. All inference algorithms in
-  Edward inherit from `Inference`, sharing common methods and
-  properties via a class hierarchy.
+def check_and_maybe_build_data(data):
+  """Check that the data dictionary passed during inference and
+  criticism is valid.
 
-  Specific algorithms typically inherit from other subclasses of
-  `Inference` rather than `Inference` directly. For example, one
-  might inherit from the abstract classes `MonteCarlo` or
-  `VariationalInference`.
-
-  To build an algorithm inheriting from `Inference`, one must at the
-  minimum implement `initialize` and `update`: the former builds
-  the computational graph for the algorithm; the latter runs the
-  computational graph for the algorithm.
-
-  To reset inference (e.g., internal variable counters incremented
-  over training), fetch inference's reset ops from session with
-  `sess.run(inference.reset)`.
-
-  #### Examples
-
-  ```python
-  # Set up probability model.
-  mu = Normal(loc=0.0, scale=1.0)
-  x = Normal(loc=mu, scale=1.0, sample_shape=50)
-
-  # Set up posterior approximation.
-  qmu_loc = tf.Variable(tf.random_normal([]))
-  qmu_scale = tf.nn.softplus(tf.Variable(tf.random_normal([])))
-  qmu = Normal(loc=qmu_loc, scale=qmu_scale)
-
-  inference = ed.Inference({mu: qmu}, data={x: tf.zeros(50)})
-  ```
+  Args:
+    data: dict.
+      Data dictionary which binds observed variables (of type
+      `RandomVariable` or `tf.Tensor`) to their realizations (of
+      type `tf.Tensor`). It can also bind placeholders (of type
+      `tf.Tensor`) used in the model to their realizations; and
+      prior latent variables (of type `RandomVariable`) to posterior
+      latent variables (of type `RandomVariable`).
   """
-  def __init__(self, latent_vars=None, data=None):
-    """Create an inference algorithm.
-
-    Args:
-      latent_vars: dict.
-        Collection of latent variables (of type `RandomVariable` or
-        `tf.Tensor`) to perform inference on. Each random variable is
-        binded to another random variable; the latter will infer the
-        former conditional on data.
-      data: dict.
-        Data dictionary which binds observed variables (of type
-        `RandomVariable` or `tf.Tensor`) to their realizations (of
-        type `tf.Tensor`). It can also bind placeholders (of type
-        `tf.Tensor`) used in the model to their realizations; and
-        prior latent variables (of type `RandomVariable`) to posterior
-        latent variables (of type `RandomVariable`).
-    """
-    sess = get_session()
-    if latent_vars is None:
-      latent_vars = {}
-    if data is None:
-      data = {}
-
-    check_latent_vars(latent_vars)
-    self.latent_vars = latent_vars
-
-    check_data(data)
-    self.data = {}
-    for key, value in six.iteritems(data):
-      if isinstance(key, tf.Tensor) and "Placeholder" in key.op.type:
-        self.data[key] = value
-      elif isinstance(key, (RandomVariable, tf.Tensor)):
-        if isinstance(value, (RandomVariable, tf.Tensor)):
-          self.data[key] = value
-        elif isinstance(value, (float, list, int, np.ndarray, np.number, str)):
-          # If value is a Python type, store it in the graph.
-          # Assign its placeholder with the key's data type.
-          with tf.variable_scope(None, default_name="data"):
-            ph = tf.placeholder(key.dtype, np.shape(value))
-            var = tf.Variable(ph, trainable=False, collections=[])
-            sess.run(var.initializer, {ph: value})
-            self.data[key] = var
-
-  def run(self, variables=None, use_coordinator=True, *args, **kwargs):
-    """A simple wrapper to run inference.
-
-    1. Initialize algorithm via `initialize`.
-    2. (Optional) Build a TensorFlow summary writer for TensorBoard.
-    3. (Optional) Initialize TensorFlow variables.
-    4. (Optional) Start queue runners.
-    5. Run `update` for `self.n_iter` iterations.
-    6. While running, `print_progress`.
-    7. Finalize algorithm via `finalize`.
-    8. (Optional) Stop queue runners.
-
-    To customize the way inference is run, run these steps
-    individually.
-
-    Args:
-      variables: list.
-        A list of TensorFlow variables to initialize during inference.
-        Default is to initialize all variables (this includes
-        reinitializing variables that were already initialized). To
-        avoid initializing any variables, pass in an empty list.
-      use_coordinator: bool.
-        Whether to start and stop queue runners during inference using a
-        TensorFlow coordinator. For example, queue runners are necessary
-        for batch training with file readers.
-      *args, **kwargs:
-        Passed into `initialize`.
-    """
-    self.initialize(*args, **kwargs)
-
-    if variables is None:
-      init = tf.global_variables_initializer()
-    else:
-      init = tf.variables_initializer(variables)
-
-    # Feed placeholders in case initialization depends on them.
-    feed_dict = {}
-    for key, value in six.iteritems(self.data):
-      if isinstance(key, tf.Tensor) and "Placeholder" in key.op.type:
-        feed_dict[key] = value
-
-    init.run(feed_dict)
-
-    if use_coordinator:
-      # Start input enqueue threads.
-      self.coord = tf.train.Coordinator()
-      self.threads = tf.train.start_queue_runners(coord=self.coord)
-
-    for _ in range(self.n_iter):
-      info_dict = self.update()
-      self.print_progress(info_dict)
-
-    self.finalize()
-
-    if use_coordinator:
-      # Ask threads to stop.
-      self.coord.request_stop()
-      self.coord.join(self.threads)
-
-  @abc.abstractmethod
-  def initialize(self, n_iter=1000, n_print=None, scale=None,
-                 auto_transform=True, logdir=None, log_timestamp=True,
-                 log_vars=None, debug=False):
-    """Initialize inference algorithm. It initializes hyperparameters
-    and builds ops for the algorithm's computation graph.
-
-    Any derived class of `Inference` **must** implement this method.
-    No methods which build ops should be called outside `initialize()`.
-
-    Args:
-      n_iter: int.
-        Number of iterations for algorithm when calling `run()`.
-        Alternatively if controlling inference manually, it is the
-        expected number of calls to `update()`; this number determines
-        tracking information during the print progress.
-      n_print: int.
-        Number of iterations for each print progress. To suppress print
-        progress, then specify 0. Default is `int(n_iter / 100)`.
-      scale: dict of RandomVariable to tf.Tensor.
-        A tensor to scale computation for any random variable that it is
-        binded to. Its shape must be broadcastable; it is multiplied
-        element-wise to the random variable. For example, this is useful
-        for mini-batch scaling when inferring global variables, or
-        applying masks on a random variable.
-      auto_transform: bool.
-        Whether to automatically transform continuous latent variables
-        of unequal support to be on the unconstrained space. It is
-        only applied if the argument is `True`, the latent variable
-        pair are `ed.RandomVariable`s with the `support` attribute,
-        the supports are both continuous and unequal.
-      logdir: str.
-        Directory where event file will be written. For details,
-        see `tf.summary.FileWriter`. Default is to log nothing.
-      log_timestamp: bool.
-        If True (and `logdir` is specified), create a subdirectory of
-        `logdir` to save the specific run results. The subdirectory's
-        name is the current UTC timestamp with format 'YYYYMMDD_HHMMSS'.
-      log_vars: list.
-        Specifies the list of variables to log after each `n_print`
-        steps. If None, will log all variables. If `[]`, no variables
-        will be logged. `logdir` must be specified for variables to be
-        logged.
-      debug: bool.
-        If True, add checks for `NaN` and `Inf` to all computations
-        in the graph. May result in substantially slower execution
-        times.
-    """
-    self.n_iter = n_iter
-    if n_print is None:
-      self.n_print = int(n_iter / 100)
-    else:
-      self.n_print = n_print
-
-    self.progbar = Progbar(self.n_iter)
-    self.t = tf.Variable(0, trainable=False, name="iteration")
-
-    self.increment_t = self.t.assign_add(1)
-
-    if scale is None:
-      scale = {}
-    elif not isinstance(scale, dict):
-      raise TypeError("scale must be a dict object.")
-
-    self.scale = scale
-
-    # map from original latent vars to unconstrained versions
-    self.transformations = {}
-    if auto_transform:
-      latent_vars = self.latent_vars.copy()
-      # latent_vars maps original latent vars to constrained Q's.
-      # latent_vars_unconstrained maps unconstrained vars to unconstrained Q's.
-      self.latent_vars = {}
-      self.latent_vars_unconstrained = {}
-      for z, qz in six.iteritems(latent_vars):
-        if hasattr(z, 'support') and hasattr(qz, 'support') and \
-                z.support != qz.support and qz.support != 'point':
-
-          # transform z to an unconstrained space
-          z_unconstrained = transform(z)
-          self.transformations[z] = z_unconstrained
-
-          # make sure we also have a qz that covers the unconstrained space
-          if qz.support == "points":
-            qz_unconstrained = qz
-          else:
-            qz_unconstrained = transform(qz)
-          self.latent_vars_unconstrained[z_unconstrained] = qz_unconstrained
-
-          # additionally construct the transformation of qz
-          # back into the original constrained space
-          if z_unconstrained != z:
-            qz_constrained = transform(
-                qz_unconstrained, bijectors.Invert(z_unconstrained.bijector))
-
-            try:  # attempt to pushforward the params of Empirical distributions
-              qz_constrained.params = z_unconstrained.bijector.inverse(
-                  qz_unconstrained.params)
-            except:  # qz_unconstrained is not an Empirical distribution
-              pass
-
-          else:
-            qz_constrained = qz_unconstrained
-
-          self.latent_vars[z] = qz_constrained
-        else:
-          self.latent_vars[z] = qz
-          self.latent_vars_unconstrained[z] = qz
-      del latent_vars
-
-    if logdir is not None:
-      self.logging = True
-      if log_timestamp:
-        logdir = os.path.expanduser(logdir)
-        logdir = os.path.join(
-            logdir, datetime.strftime(datetime.utcnow(), "%Y%m%d_%H%M%S"))
-
-      self._summary_key = tf.get_default_graph().unique_name("summaries")
-      self._set_log_variables(log_vars)
-      self.train_writer = tf.summary.FileWriter(logdir, tf.get_default_graph())
+  sess = get_session()
+  if data is None:
+    data = {}
+  elif not isinstance(data, dict):
+    raise TypeError("data must have type dict.")
+
+  for key, value in six.iteritems(data):
+    if isinstance(key, tf.Tensor) and "Placeholder" in key.op.type:
+      if isinstance(value, RandomVariable):
+        raise TypeError("The value of a feed cannot be a ed.RandomVariable "
+                        "object. "
+                        "Acceptable feed values include Python scalars, "
+                        "strings, lists, numpy ndarrays, or TensorHandles.")
+      elif isinstance(value, tf.Tensor):
+        raise TypeError("The value of a feed cannot be a tf.Tensor object. "
+                        "Acceptable feed values include Python scalars, "
+                        "strings, lists, numpy ndarrays, or TensorHandles.")
+    elif isinstance(key, (RandomVariable, tf.Tensor)):
+      if isinstance(value, (RandomVariable, tf.Tensor)):
+        if not key.shape.is_compatible_with(value.shape):
+          raise TypeError("Key-value pair in data does not have same "
+                          "shape: {}, {}".format(key.shape, value.shape))
+        elif key.dtype != value.dtype:
+          raise TypeError("Key-value pair in data does not have same "
+                          "dtype: {}, {}".format(key.dtype, value.dtype))
+      elif isinstance(value, (float, list, int, np.ndarray, np.number, str)):
+        if not key.shape.is_compatible_with(np.shape(value)):
+          raise TypeError("Key-value pair in data does not have same "
+                          "shape: {}, {}".format(key.shape, np.shape(value)))
+        elif isinstance(value, (np.ndarray, np.number)) and \
+                not np.issubdtype(value.dtype, np.float) and \
+                not np.issubdtype(value.dtype, np.int) and \
+                not np.issubdtype(value.dtype, np.str):
+          raise TypeError("Data value has an invalid dtype: "
+                          "{}".format(value.dtype))
+      else:
+        raise TypeError("Data value has an invalid type: "
+                        "{}".format(type(value)))
     else:
-      self.logging = False
+      raise TypeError("Data key has an invalid type: {}".format(type(key)))
+
+  processed_data = {}
+  for key, value in six.iteritems(data):
+    if isinstance(key, tf.Tensor) and "Placeholder" in key.op.type:
+      processed_data[key] = value
+    elif isinstance(key, (RandomVariable, tf.Tensor)):
+      if isinstance(value, (RandomVariable, tf.Tensor)):
+        processed_data[key] = value
+      elif isinstance(value, (float, list, int, np.ndarray, np.number, str)):
+        # If value is a Python type, store it in the graph.
+        # Assign its placeholder with the key's data type.
+        with tf.variable_scope(None, default_name="data"):
+          ph = tf.placeholder(key.dtype, np.shape(value))
+          var = tf.Variable(ph, trainable=False, collections=[])
+          sess.run(var.initializer, {ph: value})
+          processed_data[key] = var
+  return processed_data
+
+
+def check_and_maybe_build_latent_vars(latent_vars):
+  """Check that the latent variable dictionary passed during inference and
+  criticism is valid.
+
+  Args:
+    latent_vars: dict.
+      Collection of latent variables (of type `RandomVariable` or
+      `tf.Tensor`) to perform inference on. Each random variable is
+      bound to another random variable; the latter will infer the
+      former conditional on data.
+  """
+  if latent_vars is None:
+    latent_vars = {}
+  elif not isinstance(latent_vars, dict):
+    raise TypeError("latent_vars must have type dict.")
+
+  for key, value in six.iteritems(latent_vars):
+    if not isinstance(key, (RandomVariable, tf.Tensor)):
+      raise TypeError("Latent variable key has an invalid type: "
+                      "{}".format(type(key)))
+    elif not isinstance(value, (RandomVariable, tf.Tensor)):
+      raise TypeError("Latent variable value has an invalid type: "
+                      "{}".format(type(value)))
+    elif not key.shape.is_compatible_with(value.shape):
+      raise TypeError("Key-value pair in latent_vars does not have same "
+                      "shape: {}, {}".format(key.shape, value.shape))
+    elif key.dtype != value.dtype:
+      raise TypeError("Key-value pair in latent_vars does not have same "
+                      "dtype: {}, {}".format(key.dtype, value.dtype))
+  return latent_vars
+
+
+def check_and_maybe_build_dict(x):
+  if x is None:
+    x = {}
+  elif not isinstance(x, dict):
+    raise TypeError("x must be dict; got {}".format(type(x).__name__))
+  return x
+
+
+def check_and_maybe_build_var_list(var_list, latent_vars, data):
+  """
+  Returns:
+    List of TensorFlow variables to optimize over. Default is all
+    trainable variables that `latent_vars` and `data` depend on,
+    excluding those that are only used in conditionals in `data`.
+  """
+  # Traverse random variable graphs to get default list of variables.
+  if var_list is None:
+    var_list = set()
+    trainables = tf.trainable_variables()
+    for z, qz in six.iteritems(latent_vars):
+      var_list.update(get_variables(z, collection=trainables))
+      var_list.update(get_variables(qz, collection=trainables))
 
-    self.debug = debug
-    if self.debug:
-      self.op_check = tf.add_check_numerics_ops()
+    for x, qx in six.iteritems(data):
+      if isinstance(x, RandomVariable) and \
+              not isinstance(qx, RandomVariable):
+        var_list.update(get_variables(x, collection=trainables))
 
-    # Store reset ops which user can call. Subclasses should append
-    # any ops needed to reset internal variables in inference.
-    self.reset = [tf.variables_initializer([self.t])]
+    var_list = list(var_list)
+  return var_list
 
-  @abc.abstractmethod
-  def update(self, feed_dict=None):
-    """Run one iteration of inference.
 
-    Any derived class of `Inference` **must** implement this method.
+def transform(latent_vars, auto_transform=True):
+  """
+  Args:
+    auto_transform: bool, optional.
+      Whether to automatically transform continuous latent variables
+      of unequal support to be on the unconstrained space. It is
+      only applied if the argument is `True`, the latent variable
+      pair are `ed.RandomVariable`s with the `support` attribute,
+      the supports are both continuous and unequal.
+  """
+  # map from original latent vars to unconstrained versions
+  if auto_transform:
+    latent_vars_temp = latent_vars.copy()
+    # latent_vars maps original latent vars to constrained Q's.
+    # latent_vars_unconstrained maps unconstrained vars to unconstrained Q's.
+    latent_vars = {}
+    latent_vars_unconstrained = {}
+    for z, qz in six.iteritems(latent_vars_temp):
+      if hasattr(z, 'support') and hasattr(qz, 'support') and \
+            z.support != qz.support and qz.support != 'point':
+
+        # transform z to an unconstrained space
+        z_unconstrained = _transform(z)
+
+        # make sure we also have a qz that covers the unconstrained space
+        if qz.support == "points":
+          qz_unconstrained = qz
+        else:
+          qz_unconstrained = _transform(qz)
+        latent_vars_unconstrained[z_unconstrained] = qz_unconstrained
 
-    Args:
-      feed_dict: dict.
-        Feed dictionary for a TensorFlow session run. It is used to feed
-        placeholders that are not fed during initialization.
+        # additionally construct the transformation of qz
+        # back into the original constrained space
+        if z_unconstrained != z:
+          qz_constrained = _transform(
+            qz_unconstrained, bijectors.Invert(z_unconstrained.bijector))
 
-    Returns:
-      dict.
-        Dictionary of algorithm-specific information.
-    """
-    if feed_dict is None:
-      feed_dict = {}
+          try: # attempt to pushforward the params of Empirical distributions
+            qz_constrained.params = z_unconstrained.bijector.inverse(
+              qz_unconstrained.params)
+          except: # qz_unconstrained is not an Empirical distribution
+            pass
 
-    for key, value in six.iteritems(self.data):
-      if isinstance(key, tf.Tensor) and "Placeholder" in key.op.type:
-        feed_dict[key] = value
+        else:
+          qz_constrained = qz_unconstrained
 
+        latent_vars[z] = qz_constrained
+      else:
+        latent_vars[z] = qz
+        latent_vars_unconstrained[z] = qz
+  else:
+    latent_vars_unconstrained = None
+  return latent_vars, latent_vars_unconstrained
+
+
+def summary_variables(latent_vars=None, data=None, variables=None,
+                      *args, **kwargs):
+  # Note: to use summary_key, set
+  # collections=[tf.get_default_graph().unique_name("summaries")]
+  # TODO include in TensorBoard tutorial
+  """Log variables to TensorBoard.
+
+  For each variable in `variables`, forms a `tf.summary.scalar` if
+  the variable has scalar shape; otherwise forms a `tf.summary.histogram`.
+
+  Args:
+    variables: list, optional.
+      Specifies the list of variables to log after each `n_print`
+      steps. If None, will log all variables. If `[]`, no variables
+      will be logged.
+  """
+  if variables is None:
+    variables = []
+    for key in six.iterkeys(data):
+      variables += get_variables(key)
+
+    for key, value in six.iteritems(latent_vars):
+      variables += get_variables(key)
+      variables += get_variables(value)
+
+    variables = set(variables)
+
+  for var in variables:
+    # replace colons which are an invalid character
+    var_name = var.name.replace(':', '/')
+    # Log all scalars.
+    if len(var.shape) == 0:
+      tf.summary.scalar("parameter/{}".format(var_name),
+                        var, *args, **kwargs)
+    elif len(var.shape) == 1 and var.shape[0] == 1:
+      tf.summary.scalar("parameter/{}".format(var_name),
+                        var[0], *args, **kwargs)
+    else:
+      # If var is multi-dimensional, log a histogram of its values.
+      tf.summary.histogram("parameter/{}".format(var_name),
+                           var, *args, **kwargs)
+
+
+def train(train_op, summary_key=None, n_iter=1000, n_print=None,
+          logdir=None, log_timestamp=True,
+          debug=False, variables=None,
+          use_coordinator=True, *args, **kwargs):
+  """A wrapper to run inference.
+
+  1. (Optional) Build a TensorFlow summary writer for TensorBoard.
+  2. (Optional) Initialize TensorFlow variables.
+  3. (Optional) Start queue runners.
+  4. Run `update` for `n_iter` iterations.
+  5. Finalize algorithm via `finalize`.
+  6. (Optional) Stop queue runners.
+  + summary writer
+  + variable initialization
+  + update
+  + convergence diagnostics
+  + finalize
+
+  To customize the way inference is run, run these steps
+  individually.
+
+  Args:
+    n_iter: int, optional.
+      Number of iterations for algorithm when calling `run()`.
+      Alternatively if controlling inference manually, it is the
+      expected number of calls to `update()`; this number determines
+      tracking information during the print progress.
+    n_print: int, optional.
+      Number of iterations for each print progress. To suppress print
+      progress, then specify 0. Default is `int(n_iter / 100)`.
+    logdir: str, optional.
+      Directory where event file will be written. For details,
+      see `tf.summary.FileWriter`. Default is to log nothing.
+    log_timestamp: bool, optional.
+      If True (and `logdir` is specified), create a subdirectory of
+      `logdir` to save the specific run results. The subdirectory's
+      name is the current UTC timestamp with format 'YYYYMMDD_HHMMSS'.
+    debug: bool, optional.
+      If True, add checks for `NaN` and `Inf` to all computations
+      in the graph. May result in substantially slower execution
+      times.
+    variables: list, optional.
+      A list of TensorFlow variables to initialize during inference.
+      Default is to initialize all variables (this includes
+      reinitializing variables that were already initialized). To
+      avoid initializing any variables, pass in an empty list.
+    use_coordinator: bool, optional.
+      Whether to start and stop queue runners during inference using a
+      TensorFlow coordinator. For example, queue runners are necessary
+      for batch training with file readers.
+  """
+  if n_print is None:
+    n_print = int(n_iter / 100)
+  progbar = Progbar(n_iter)
+  t = tf.Variable(0, trainable=False, name="iteration")
+  kwargs['t'] = t.assign_add(1)  # add to update()
+
+  if summary_key is not None:
+    # TODO should run() also add summaries; or should user call
+    # summary_variables() manually?
+    summarize = tf.summary.merge_all(key=summary_key)
+    if log_timestamp:
+      logdir = os.path.expanduser(logdir)
+      logdir = os.path.join(
+          logdir, datetime.strftime(datetime.utcnow(), "%Y%m%d_%H%M%S"))
+    train_writer = tf.summary.FileWriter(logdir, tf.get_default_graph())
+  else:
+    summarize = None
+    train_writer = None
+
+  if debug:
+    op_check = tf.add_check_numerics_ops()
+  else:
+    op_check = None
+
+  if variables is None:
+    init = tf.global_variables_initializer()
+  else:
+    init = tf.variables_initializer(variables)
+
+  # Feed placeholders in case initialization depends on them.
+  feed_dict = kwargs.get('feed_dict', {})
+  init.run(feed_dict)
+
+  if use_coordinator:
+    # Start input enqueue threads.
+    coord = tf.train.Coordinator()
+    threads = tf.train.start_queue_runners(coord=coord)
+
+  for _ in range(n_iter):
+    info_dict = update(progbar, n_print, summarize,
+                       train_writer, debug, op_check,
+                       train_op, *args, **kwargs)
+
+  finalize = None
+  if finalize is not None:
+    finalize_ops = finalize()
     sess = get_session()
-    t = sess.run(self.increment_t)
-
-    if self.debug:
-      sess.run(self.op_check, feed_dict)
-
-    if self.logging and self.n_print != 0:
-      if t == 1 or t % self.n_print == 0:
-        summary = sess.run(self.summarize, feed_dict)
-        self.train_writer.add_summary(summary, t)
-
-    return {'t': t}
-
-  def print_progress(self, info_dict):
-    """Print progress to output.
-
-    Args:
-      info_dict: dict.
-        Dictionary of algorithm-specific information.
-    """
-    if self.n_print != 0:
-      t = info_dict['t']
-      if t == 1 or t % self.n_print == 0:
-        self.progbar.update(t)
-
-  def finalize(self):
-    """Function to call after convergence.
-    """
-    if self.logging:
-      self.train_writer.close()
-
-  def _set_log_variables(self, log_vars=None):
-    """Log variables to TensorBoard.
-
-    For each variable in `log_vars`, forms a `tf.summary.scalar` if
-    the variable has scalar shape; otherwise forms a `tf.summary.histogram`.
-
-    Args:
-      log_vars: list.
-        Specifies the list of variables to log after each `n_print`
-        steps. If None, will log all variables. If `[]`, no variables
-        will be logged.
-    """
-    if log_vars is None:
-      log_vars = []
-      for key in six.iterkeys(self.data):
-        log_vars += get_variables(key)
-
-      for key, value in six.iteritems(self.latent_vars):
-        log_vars += get_variables(key)
-        log_vars += get_variables(value)
-
-      log_vars = set(log_vars)
-
-    for var in log_vars:
+    sess.run(finalize_op, feed_dict)
+  else:
+    if summary_key is not None:
+      train_writer.close()
+
+  if use_coordinator:
+    # Ask threads to stop.
+    coord.request_stop()
+    coord.join(threads)
+
+def optimize(loss, grads_and_vars, collections=None, var_list=None,
+             optimizer=None, use_prettytensor=False, global_step=None):
+  """Build optimizer and its train op applied to loss or
+  grads_and_vars.
+
+  Args:
+    optimizer: str or tf.train.Optimizer, optional.
+      A TensorFlow optimizer, to use for optimizing the variational
+      objective. Alternatively, one can pass in the name of a
+      TensorFlow optimizer, and default parameters for the optimizer
+      will be used.
+    use_prettytensor: bool, optional.
+      `True` if aim to use PrettyTensor optimizer (when using
+      PrettyTensor) or `False` if aim to use TensorFlow optimizer.
+      Defaults to TensorFlow.
+    global_step: tf.Variable, optional.
+      A TensorFlow variable to hold the global step.
+  """
+  if collections is not None:
+    # TODO when users call this, this duplicates for GANs
+    # train = optimize(loss, grads_and_vars, summary_key)
+    # train_d = optimize(loss_d, grads_and_vars_d, summary_key)
+    tf.summary.scalar("loss", loss, collections=collections)
+    for grad, var in grads_and_vars:
       # replace colons which are an invalid character
-      var_name = var.name.replace(':', '/')
-      # Log all scalars.
-      if len(var.shape) == 0:
-        tf.summary.scalar("parameter/{}".format(var_name),
-                          var, collections=[self._summary_key])
-      elif len(var.shape) == 1 and var.shape[0] == 1:
-        tf.summary.scalar("parameter/{}".format(var_name),
-                          var[0], collections=[self._summary_key])
-      else:
-        # If var is multi-dimensional, log a histogram of its values.
-        tf.summary.histogram("parameter/{}".format(var_name),
-                             var, collections=[self._summary_key])
+      tf.summary.histogram("gradient/" +
+                           var.name.replace(':', '/'),
+                           grad, collections=collections)
+      tf.summary.scalar("gradient_norm/" +
+                        var.name.replace(':', '/'),
+                        tf.norm(grad), collections=collections)
+
+  if optimizer is None and global_step is None:
+    # Default optimizer always uses a global step variable.
+    global_step = tf.Variable(0, trainable=False, name="global_step")
+
+  if isinstance(global_step, tf.Variable):
+    starter_learning_rate = 0.1
+    learning_rate = tf.train.exponential_decay(starter_learning_rate,
+                                               global_step,
+                                               100, 0.9, staircase=True)
+  else:
+    learning_rate = 0.01
+
+  # Build optimizer.
+  if optimizer is None:
+    optimizer = tf.train.AdamOptimizer(learning_rate)
+  elif isinstance(optimizer, str):
+    if optimizer == 'gradientdescent':
+      optimizer = tf.train.GradientDescentOptimizer(learning_rate)
+    elif optimizer == 'adadelta':
+      optimizer = tf.train.AdadeltaOptimizer(learning_rate)
+    elif optimizer == 'adagrad':
+      optimizer = tf.train.AdagradOptimizer(learning_rate)
+    elif optimizer == 'momentum':
+      optimizer = tf.train.MomentumOptimizer(learning_rate, 0.9)
+    elif optimizer == 'adam':
+      optimizer = tf.train.AdamOptimizer(learning_rate)
+    elif optimizer == 'ftrl':
+      optimizer = tf.train.FtrlOptimizer(learning_rate)
+    elif optimizer == 'rmsprop':
+      optimizer = tf.train.RMSPropOptimizer(learning_rate)
+    else:
+      raise ValueError('Optimizer class not found:', optimizer)
+  elif not isinstance(optimizer, tf.train.Optimizer):
+    raise TypeError("Optimizer must be str, tf.train.Optimizer, or None.")
+
+  with tf.variable_scope(None, default_name="optimizer") as scope:
+    if not use_prettytensor:
+      train_op = optimizer.apply_gradients(grads_and_vars,
+                                           global_step=global_step)
+    else:
+      import prettytensor as pt
+      # Note PrettyTensor optimizer does not accept manual updates;
+      # it autodiffs the loss directly.
+      train_op = pt.apply_optimizer(optimizer, losses=[loss],
+                                    global_step=global_step,
+                                    var_list=var_list)
+  return train_op
+
+
+def update(progbar, n_print, summarize=None, train_writer=None,
+           debug=False, op_check=None, *args, **kwargs):
+  """Run one iteration of optimization.
+
+  Args:
+    args: things like `loss`
+    kwargs: things like 'feed_dict'
+    feed_dict: dict, optional.
+      Feed dictionary for a TensorFlow session run. It is used to feed
+      placeholders that are not fed during initialization.
+
+  Returns:
+    dict.
+    Dictionary of algorithm-specific information. In this case, the
+    loss function value after one iteration.
+  """
+  # TODO use if more automated
+  # feed_dict = {}
+  # for key, value in six.iteritems(data):
+  #   if isinstance(key, tf.Tensor) and "Placeholder" in key.op.type:
+  #     feed_dict[key] = value
+  sess = get_session()
+  feed_dict = kwargs.pop('feed_dict', {})
+  values = sess.run(list(args) + list(kwargs.values()), feed_dict)
+  info_dict = dict(zip(kwargs.keys(), values[len(args):]))
+
+  if debug:
+    sess.run(op_check, feed_dict)
+
+  if n_print != 0:
+    t = info_dict['t']
+    if t == 1 or t % n_print == 0:
+      # TODO do we want specific key names? User can specify whatever
+      # in kwargs during run(...).
+      # progbar.update(t, {'Loss': info_dict['loss']})
+      # progbar.update(t, {'Gen Loss': info_dict['loss'],
+      #                    'Disc Loss': info_dict['loss_d']})
+      progbar.update(t, {k: v for k, v in six.iteritems(info_dict)
+                         if k != 't'})
+      if summarize is not None:
+        summary = sess.run(summarize, feed_dict)
+        train_writer.add_summary(summary, t)
+
+  return info_dict
+
+
+# TODO within run(), use this for gan_inference, wgan_inference,
+# implicit_klqp, bigan_inference
+def update(train_op, train_op_d, n_print, summarize=None, train_writer=None,
+           debug=False, op_check=None, variables=None, *args, **kwargs):
+  """Run one iteration of optimization.
+
+  Args:
+    variables: str, optional.
+      Which set of variables to update. Either "Disc" or "Gen".
+      Default is both.
+
+  Returns:
+    dict.
+    Dictionary of algorithm-specific information. In this case, the
+    iteration number and generative and discriminative losses.
+
+  #### Notes
+
+  The outputted iteration number is the total number of calls to
+  `update`. Each update may include updating only a subset of
+  parameters.
+  """
+  # if feed_dict is None:
+  #   feed_dict = {}
+  # for key, value in six.iteritems(self.data):
+  #   if isinstance(key, tf.Tensor) and "Placeholder" in key.op.type:
+  #     feed_dict[key] = value
+  sess = get_session()
+  feed_dict = kwargs.pop('feed_dict', {})
+  if variables is None:
+    values = sess.run([train_op, train_op_d] + list(kwargs.values()), feed_dict)
+    values = values[2:]
+  elif variables == "Gen":
+    kwargs['loss_d'] = 0.0
+    values = sess.run([train_op] + list(kwargs_temp.values()), feed_dict)
+    values = values[1:]
+  elif variables == "Disc":
+    kwargs['loss'] = 0.0
+    values = sess.run([train_op_d] + list(kwargs_temp.values()), feed_dict)
+    values = values[1:]
+  else:
+    raise NotImplementedError("variables must be None, 'Gen', or 'Disc'.")
+
+  if debug:
+    sess.run(op_check, feed_dict)
+
+  if summarize is not None and n_print != 0:
+    if t == 1 or t % self.n_print == 0:
+      summary = sess.run(summarize, feed_dict)
+      train_writer.add_summary(summary, t)
+
+  return dict(zip(kwargs_temp.keys(), values))
+
+# TODO within run(), use this for wgan_inference
+def update(clip_op, variables=None, *args, **kwargs):
+  info_dict = gan_inference.update(variables=variables, *args, **kwargs)
+
+  sess = get_session()
+  if clip_op is not None and variables in (None, "Disc"):
+    sess.run(clip_op)
+
+  return info_dict
diff --git a/edward/inferences/klpq.py b/edward/inferences/klpq.py
index 5639270ce..797612a74 100644
--- a/edward/inferences/klpq.py
+++ b/edward/inferences/klpq.py
@@ -5,7 +5,8 @@
 import six
 import tensorflow as tf
 
-from edward.inferences.variational_inference import VariationalInference
+from edward.inferences.inference import (check_and_maybe_build_data,
+    check_and_maybe_build_latent_vars, transform, check_and_maybe_build_dict, check_and_maybe_build_var_list)
 from edward.models import RandomVariable
 from edward.util import copy, get_descendants
 
@@ -15,7 +16,8 @@
   raise ImportError("{0}. Your TensorFlow version is not supported.".format(e))
 
 
-class KLpq(VariationalInference):
+def klpq(latent_vars=None, data=None, n_samples=1,
+         auto_transform=True, scale=None, var_list=None, collections=None):
   """Variational inference with the KL divergence
 
   $\\text{KL}( p(z \mid x) \| q(z) ).$
@@ -49,136 +51,125 @@ class KLpq(VariationalInference):
   The objective function also adds to itself a summation over all
   tensors in the `REGULARIZATION_LOSSES` collection.
   """
-  def __init__(self, latent_vars=None, data=None):
-    """Create an inference algorithm.
-
-    Args:
-      latent_vars: list of RandomVariable or
-                   dict of RandomVariable to RandomVariable.
-        Collection of random variables to perform inference on. If
-        list, each random variable will be implictly optimized using a
-        `Normal` random variable that is defined internally with a
-        free parameter per location and scale and is initialized using
-        standard normal draws. The random variables to approximate
-        must be continuous.
-    """
-    if isinstance(latent_vars, list):
-      with tf.variable_scope(None, default_name="posterior"):
-        latent_vars_dict = {}
-        continuous = \
-            ('01', 'nonnegative', 'simplex', 'real', 'multivariate_real')
-        for z in latent_vars:
-          if not hasattr(z, 'support') or z.support not in continuous:
-            raise AttributeError(
-                "Random variable {} is not continuous or a random "
-                "variable with supported continuous support.".format(z))
-          batch_event_shape = z.batch_shape.concatenate(z.event_shape)
-          loc = tf.Variable(tf.random_normal(batch_event_shape))
-          scale = tf.nn.softplus(
-              tf.Variable(tf.random_normal(batch_event_shape)))
-          latent_vars_dict[z] = Normal(loc=loc, scale=scale)
-        latent_vars = latent_vars_dict
-        del latent_vars_dict
-
-    super(KLpq, self).__init__(latent_vars, data)
-
-  def initialize(self, n_samples=1, *args, **kwargs):
-    """Initialize inference algorithm. It initializes hyperparameters
-    and builds ops for the algorithm's computation graph.
-
-    Args:
-      n_samples: int.
-        Number of samples from variational model for calculating
-        stochastic gradients.
-    """
-    if n_samples <= 0:
-      raise ValueError(
-          "n_samples should be greater than zero: {}".format(n_samples))
-    self.n_samples = n_samples
-    return super(KLpq, self).initialize(*args, **kwargs)
-
-  def build_loss_and_gradients(self, var_list):
-    """Build loss function
-
-    $\\text{KL}( p(z \mid x) \| q(z) )
-      = \mathbb{E}_{p(z \mid x)} [ \log p(z \mid x) - \log q(z; \lambda) ]$
-
-    and stochastic gradients based on importance sampling.
-
-    The loss function can be estimated as
-
-    $\sum_{s=1}^S [
-      w_{\\text{norm}}(z^s; \lambda) (\log p(x, z^s) - \log q(z^s; \lambda) ],$
-
-    where for $z^s \sim q(z; \lambda)$,
-
-    $w_{\\text{norm}}(z^s; \lambda) =
-          w(z^s; \lambda) / \sum_{s=1}^S w(z^s; \lambda)$
-
-    normalizes the importance weights, $w(z^s; \lambda) = p(x,
-    z^s) / q(z^s; \lambda)$.
-
-    This provides a gradient,
-
-    $- \sum_{s=1}^S [
-      w_{\\text{norm}}(z^s; \lambda) \\nabla_{\lambda} \log q(z^s; \lambda) ].$
-    """
-    p_log_prob = [0.0] * self.n_samples
-    q_log_prob = [0.0] * self.n_samples
-    base_scope = tf.get_default_graph().unique_name("inference") + '/'
-    for s in range(self.n_samples):
-      # Form dictionary in order to replace conditioning on prior or
-      # observed variable with conditioning on a specific value.
-      scope = base_scope + tf.get_default_graph().unique_name("sample")
-      dict_swap = {}
-      for x, qx in six.iteritems(self.data):
-        if isinstance(x, RandomVariable):
-          if isinstance(qx, RandomVariable):
-            qx_copy = copy(qx, scope=scope)
-            dict_swap[x] = qx_copy.value
-          else:
-            dict_swap[x] = qx
-
-      for z, qz in six.iteritems(self.latent_vars):
-        # Copy q(z) to obtain new set of posterior samples.
-        qz_copy = copy(qz, scope=scope)
-        dict_swap[z] = qz_copy.value
-        q_log_prob[s] += tf.reduce_sum(
-            qz_copy.log_prob(tf.stop_gradient(dict_swap[z])))
-
-      for z in six.iterkeys(self.latent_vars):
-        z_copy = copy(z, dict_swap, scope=scope)
-        p_log_prob[s] += tf.reduce_sum(z_copy.log_prob(dict_swap[z]))
-
-      for x in six.iterkeys(self.data):
-        if isinstance(x, RandomVariable):
-          x_copy = copy(x, dict_swap, scope=scope)
-          p_log_prob[s] += tf.reduce_sum(x_copy.log_prob(dict_swap[x]))
-
-    p_log_prob = tf.stack(p_log_prob)
-    q_log_prob = tf.stack(q_log_prob)
-    reg_penalty = tf.reduce_sum(tf.losses.get_regularization_losses())
-
-    if self.logging:
-      tf.summary.scalar("loss/p_log_prob", tf.reduce_mean(p_log_prob),
-                        collections=[self._summary_key])
-      tf.summary.scalar("loss/q_log_prob", tf.reduce_mean(q_log_prob),
-                        collections=[self._summary_key])
-      tf.summary.scalar("loss/reg_penalty", reg_penalty,
-                        collections=[self._summary_key])
-
-    log_w = p_log_prob - q_log_prob
-    log_w_norm = log_w - tf.reduce_logsumexp(log_w)
-    w_norm = tf.exp(log_w_norm)
-    loss = tf.reduce_sum(w_norm * log_w) - reg_penalty
-
-    q_rvs = list(six.itervalues(self.latent_vars))
-    q_vars = [v for v in var_list
-              if len(get_descendants(tf.convert_to_tensor(v), q_rvs)) != 0]
-    q_grads = tf.gradients(
-        -(tf.reduce_sum(q_log_prob * tf.stop_gradient(w_norm)) - reg_penalty),
-        q_vars)
-    p_vars = [v for v in var_list if v not in q_vars]
-    p_grads = tf.gradients(-loss, p_vars)
-    grads_and_vars = list(zip(q_grads, q_vars)) + list(zip(p_grads, p_vars))
-    return loss, grads_and_vars
+  """Create an inference algorithm.
+
+  Args:
+    latent_vars: list of RandomVariable or
+                 dict of RandomVariable to RandomVariable.
+      Collection of random variables to perform inference on. If
+      list, each random variable will be implicitly optimized using a
+      `Normal` random variable that is defined internally with a
+      free parameter per location and scale and is initialized using
+      standard normal draws. The random variables to approximate
+      must be continuous.
+    n_samples: int, optional.
+      Number of samples from variational model for calculating
+      stochastic gradients.
+  """
+  """Build loss function
+
+  $\\text{KL}( p(z \mid x) \| q(z) )
+    = \mathbb{E}_{p(z \mid x)} [ \log p(z \mid x) - \log q(z; \lambda) ]$
+
+  and stochastic gradients based on importance sampling.
+
+  The loss function can be estimated as
+
+  $\sum_{s=1}^S [
+    w_{\\text{norm}}(z^s; \lambda) (\log p(x, z^s) - \log q(z^s; \lambda) ],$
+
+  where for $z^s \sim q(z; \lambda)$,
+
+  $w_{\\text{norm}}(z^s; \lambda) =
+        w(z^s; \lambda) / \sum_{s=1}^S w(z^s; \lambda)$
+
+  normalizes the importance weights, $w(z^s; \lambda) = p(x,
+  z^s) / q(z^s; \lambda)$.
+
+  This provides a gradient,
+
+  $- \sum_{s=1}^S [
+    w_{\\text{norm}}(z^s; \lambda) \\nabla_{\lambda} \log q(z^s; \lambda) ].$
+  """
+  if isinstance(latent_vars, list):
+    with tf.variable_scope(None, default_name="posterior"):
+      latent_vars_dict = {}
+      continuous = \
+          ('01', 'nonnegative', 'simplex', 'real', 'multivariate_real')
+      for z in latent_vars:
+        if not hasattr(z, 'support') or z.support not in continuous:
+          raise AttributeError(
+              "Random variable {} is not continuous or a random "
+              "variable with supported continuous support.".format(z))
+        batch_event_shape = z.batch_shape.concatenate(z.event_shape)
+        loc = tf.Variable(tf.random_normal(batch_event_shape))
+        scale = tf.nn.softplus(
+            tf.Variable(tf.random_normal(batch_event_shape)))
+        latent_vars_dict[z] = Normal(loc=loc, scale=scale)
+      latent_vars = latent_vars_dict
+      del latent_vars_dict
+  latent_vars = check_and_maybe_build_latent_vars(latent_vars)
+  data = check_and_maybe_build_data(data)
+  latent_vars, _ = transform(latent_vars, auto_transform)
+  scale = check_and_maybe_build_dict(scale)
+  var_list = check_and_maybe_build_var_list(var_list, latent_vars, data)
+
+  p_log_prob = [0.0] * n_samples
+  q_log_prob = [0.0] * n_samples
+  base_scope = tf.get_default_graph().unique_name("inference") + '/'
+  for s in range(n_samples):
+    # Form dictionary in order to replace conditioning on prior or
+    # observed variable with conditioning on a specific value.
+    scope = base_scope + tf.get_default_graph().unique_name("sample")
+    dict_swap = {}
+    for x, qx in six.iteritems(data):
+      if isinstance(x, RandomVariable):
+        if isinstance(qx, RandomVariable):
+          qx_copy = copy(qx, scope=scope)
+          dict_swap[x] = qx_copy.value
+        else:
+          dict_swap[x] = qx
+
+    for z, qz in six.iteritems(latent_vars):
+      # Copy q(z) to obtain new set of posterior samples.
+      qz_copy = copy(qz, scope=scope)
+      dict_swap[z] = qz_copy.value
+      q_log_prob[s] += tf.reduce_sum(
+          qz_copy.log_prob(tf.stop_gradient(dict_swap[z])))
+
+    for z in six.iterkeys(latent_vars):
+      z_copy = copy(z, dict_swap, scope=scope)
+      p_log_prob[s] += tf.reduce_sum(z_copy.log_prob(dict_swap[z]))
+
+    for x in six.iterkeys(data):
+      if isinstance(x, RandomVariable):
+        x_copy = copy(x, dict_swap, scope=scope)
+        p_log_prob[s] += tf.reduce_sum(x_copy.log_prob(dict_swap[x]))
+
+  p_log_prob = tf.stack(p_log_prob)
+  q_log_prob = tf.stack(q_log_prob)
+  reg_penalty = tf.reduce_sum(tf.losses.get_regularization_losses())
+
+  if collections is not None:
+    tf.summary.scalar("loss/p_log_prob", tf.reduce_mean(p_log_prob),
+                      collections=collections)
+    tf.summary.scalar("loss/q_log_prob", tf.reduce_mean(q_log_prob),
+                      collections=collections)
+    tf.summary.scalar("loss/reg_penalty", reg_penalty,
+                      collections=collections)
+
+  log_w = p_log_prob - q_log_prob
+  log_w_norm = log_w - tf.reduce_logsumexp(log_w)
+  w_norm = tf.exp(log_w_norm)
+  loss = tf.reduce_sum(w_norm * log_w) - reg_penalty
+
+  q_rvs = list(six.itervalues(latent_vars))
+  q_vars = [v for v in var_list
+            if len(get_descendants(tf.convert_to_tensor(v), q_rvs)) != 0]
+  q_grads = tf.gradients(
+      -(tf.reduce_sum(q_log_prob * tf.stop_gradient(w_norm)) - reg_penalty),
+      q_vars)
+  p_vars = [v for v in var_list if v not in q_vars]
+  p_grads = tf.gradients(-loss, p_vars)
+  grads_and_vars = list(zip(q_grads, q_vars)) + list(zip(p_grads, p_vars))
+  return loss, grads_and_vars
diff --git a/edward/inferences/klqp.py b/edward/inferences/klqp.py
index d88088016..8bdf3a674 100644
--- a/edward/inferences/klqp.py
+++ b/edward/inferences/klqp.py
@@ -5,7 +5,8 @@
 import six
 import tensorflow as tf
 
-from edward.inferences.variational_inference import VariationalInference
+from edward.inferences.inference import (check_and_maybe_build_data,
+    check_and_maybe_build_latent_vars, transform, check_and_maybe_build_dict, check_and_maybe_build_var_list)
 from edward.models import RandomVariable
 from edward.util import copy, get_descendants
 
@@ -16,7 +17,8 @@
   raise ImportError("{0}. Your TensorFlow version is not supported.".format(e))
 
 
-class KLqp(VariationalInference):
+def klqp(latent_vars=None, data=None, n_samples=1, kl_scaling=None,
+         auto_transform=True, scale=None, var_list=None, summary_key=None):
   """Variational inference with the KL divergence
 
   $\\text{KL}( q(z; \lambda) \| p(z \mid x) ).$
@@ -24,6 +26,35 @@ class KLqp(VariationalInference):
   This class minimizes the objective by automatically selecting from a
   variety of black box inference techniques.
 
+  Args:
+    latent_vars: list of RandomVariable or
+                 dict of RandomVariable to RandomVariable.
+      Collection of random variables to perform inference on. If
+      list, each random variable will be implicitly optimized using a
+      `Normal` random variable that is defined internally with a
+      free parameter per location and scale and is initialized using
+      standard normal draws. The random variables to approximate
+      must be continuous.
+    n_samples: int, optional.
+      Number of samples from variational model for calculating
+      stochastic gradients.
+    kl_scaling: dict of RandomVariable to tf.Tensor, optional.
+      Provides option to scale terms when using ELBO with KL divergence.
+      If the KL divergence terms are
+
+      $\\alpha_p \mathbb{E}_{q(z\mid x, \lambda)} [
+            \log q(z\mid x, \lambda) - \log p(z)],$
+
+      then pass {$p(z)$: $\\alpha_p$} as `kl_scaling`,
+      where $\\alpha_p$ is a tensor. Its shape must be broadcastable;
+      it is multiplied element-wise to the batchwise KL terms.
+    scale: dict of RandomVariable to tf.Tensor, optional.
+      A tensor to scale computation for any random variable that it is
+      bound to. Its shape must be broadcastable; it is multiplied
+      element-wise to the random variable. For example, this is useful
+      for mini-batch scaling when inferring global variables, or
+      applying masks on a random variable.
+
   #### Notes
 
   `KLqp` also optimizes any model parameters $p(z \mid x;
@@ -49,121 +80,90 @@ class KLqp(VariationalInference):
 
   The objective function also adds to itself a summation over all
   tensors in the `REGULARIZATION_LOSSES` collection.
+
+  #### Loss function
+
+  $-\\text{ELBO} =
+      -\mathbb{E}_{q(z; \lambda)} [ \log p(x, z) - \log q(z; \lambda) ]$
+
+  KLqp supports
+
+  1. score function gradients [@paisley2012variational]
+  2. reparameterization gradients [@kingma2014auto]
+
+  of the loss function.
+
+  If the KL divergence between the variational model and the prior
+  is tractable, then the loss function can be written as
+
+  $-\mathbb{E}_{q(z; \lambda)}[\log p(x \mid z)] +
+      \\text{KL}( q(z; \lambda) \| p(z) ),$
+
+  where the KL term is computed analytically [@kingma2014auto]. We
+  compute this automatically when $p(z)$ and $q(z; \lambda)$ are
+  Normal.
   """
-  def __init__(self, latent_vars=None, data=None):
-    """Create an inference algorithm.
-
-    Args:
-      latent_vars: list of RandomVariable or
-                   dict of RandomVariable to RandomVariable.
-        Collection of random variables to perform inference on. If
-        list, each random variable will be implictly optimized using a
-        `Normal` random variable that is defined internally with a
-        free parameter per location and scale and is initialized using
-        standard normal draws. The random variables to approximate
-        must be continuous.
-    """
-    if isinstance(latent_vars, list):
-      with tf.variable_scope(None, default_name="posterior"):
-        latent_vars_dict = {}
-        continuous = \
-            ('01', 'nonnegative', 'simplex', 'real', 'multivariate_real')
-        for z in latent_vars:
-          if not hasattr(z, 'support') or z.support not in continuous:
-            raise AttributeError(
-                "Random variable {} is not continuous or a random "
-                "variable with supported continuous support.".format(z))
-          batch_event_shape = z.batch_shape.concatenate(z.event_shape)
-          loc = tf.Variable(tf.random_normal(batch_event_shape))
-          scale = tf.nn.softplus(
-              tf.Variable(tf.random_normal(batch_event_shape)))
-          latent_vars_dict[z] = Normal(loc=loc, scale=scale)
-        latent_vars = latent_vars_dict
-        del latent_vars_dict
-
-    super(KLqp, self).__init__(latent_vars, data)
-
-  def initialize(self, n_samples=1, kl_scaling=None, *args, **kwargs):
-    """Initialize inference algorithm. It initializes hyperparameters
-    and builds ops for the algorithm's computation graph.
-
-    Args:
-      n_samples: int.
-        Number of samples from variational model for calculating
-        stochastic gradients.
-      kl_scaling: dict of RandomVariable to tf.Tensor.
-        Provides option to scale terms when using ELBO with KL divergence.
-        If the KL divergence terms are
-
-        $\\alpha_p \mathbb{E}_{q(z\mid x, \lambda)} [
-              \log q(z\mid x, \lambda) - \log p(z)],$
-
-        then pass {$p(z)$: $\\alpha_p$} as `kl_scaling`,
-        where $\\alpha_p$ is a tensor. Its shape must be broadcastable;
-        it is multiplied element-wise to the batchwise KL terms.
-    """
-    if kl_scaling is None:
-      kl_scaling = {}
-    if n_samples <= 0:
-      raise ValueError(
-          "n_samples should be greater than zero: {}".format(n_samples))
-
-    self.n_samples = n_samples
-    self.kl_scaling = kl_scaling
-    return super(KLqp, self).initialize(*args, **kwargs)
-
-  def build_loss_and_gradients(self, var_list):
-    """Wrapper for the `KLqp` loss function.
-
-    $-\\text{ELBO} =
-        -\mathbb{E}_{q(z; \lambda)} [ \log p(x, z) - \log q(z; \lambda) ]$
-
-    KLqp supports
-
-    1. score function gradients [@paisley2012variational]
-    2. reparameterization gradients [@kingma2014auto]
-
-    of the loss function.
-
-    If the KL divergence between the variational model and the prior
-    is tractable, then the loss function can be written as
-
-    $-\mathbb{E}_{q(z; \lambda)}[\log p(x \mid z)] +
-        \\text{KL}( q(z; \lambda) \| p(z) ),$
-
-    where the KL term is computed analytically [@kingma2014auto]. We
-    compute this automatically when $p(z)$ and $q(z; \lambda)$ are
-    Normal.
-    """
-    is_reparameterizable = all([
-        rv.reparameterization_type ==
-        tf.contrib.distributions.FULLY_REPARAMETERIZED
-        for rv in six.itervalues(self.latent_vars)])
-    is_analytic_kl = all([isinstance(z, Normal) and isinstance(qz, Normal)
-                          for z, qz in six.iteritems(self.latent_vars)])
-    if not is_analytic_kl and self.kl_scaling:
-      raise TypeError("kl_scaling must be None when using non-analytic KL term")
-    if is_reparameterizable:
-      if is_analytic_kl:
-        return build_reparam_kl_loss_and_gradients(self, var_list)
-      # elif is_analytic_entropy:
-      #    return build_reparam_entropy_loss_and_gradients(self, var_list)
-      else:
-        return build_reparam_loss_and_gradients(self, var_list)
+  if isinstance(latent_vars, list):
+    with tf.variable_scope(None, default_name="posterior"):
+      latent_vars_dict = {}
+      continuous = \
+          ('01', 'nonnegative', 'simplex', 'real', 'multivariate_real')
+      for z in latent_vars:
+        if not hasattr(z, 'support') or z.support not in continuous:
+          raise AttributeError(
+              "Random variable {} is not continuous or a random "
+              "variable with supported continuous support.".format(z))
+        batch_event_shape = z.batch_shape.concatenate(z.event_shape)
+        loc = tf.Variable(tf.random_normal(batch_event_shape))
+        scale = tf.nn.softplus(
+            tf.Variable(tf.random_normal(batch_event_shape)))
+        latent_vars_dict[z] = Normal(loc=loc, scale=scale)
+      latent_vars = latent_vars_dict
+      del latent_vars_dict
+  latent_vars = check_and_maybe_build_latent_vars(latent_vars)
+  data = check_and_maybe_build_data(data)
+  latent_vars, _ = transform(latent_vars, auto_transform)
+  kl_scaling = check_and_maybe_build_dict(kl_scaling)
+  scale = check_and_maybe_build_dict(scale)
+  var_list = check_and_maybe_build_var_list(var_list, latent_vars, data)
+
+  is_reparameterizable = all([
+      rv.reparameterization_type ==
+      tf.contrib.distributions.FULLY_REPARAMETERIZED
+      for rv in six.itervalues(latent_vars)])
+  is_analytic_kl = all([isinstance(z, Normal) and isinstance(qz, Normal)
+                        for z, qz in six.iteritems(latent_vars)])
+  if not is_analytic_kl and kl_scaling:
+    raise TypeError("kl_scaling must be None when using non-analytic KL term")
+  if is_reparameterizable:
+    if is_analytic_kl:
+      return build_reparam_kl_loss_and_gradients(
+          latent_vars, data, var_list,
+          scale, n_samples, kl_scaling, summary_key)
+    # elif is_analytic_entropy:
+    #    return build_reparam_entropy_loss_and_gradients(...)
     else:
-      # Prefer Rao-Blackwellization over analytic KL. Unknown what
-      # would happen stability-wise if the two are combined.
-      # if is_analytic_kl:
-      #   return build_score_kl_loss_and_gradients(self, var_list)
-      # Analytic entropies may lead to problems around
-      # convergence; for now it is deactivated.
-      # elif is_analytic_entropy:
-      #    return build_score_entropy_loss_and_gradients(self, var_list)
-      # else:
-      return build_score_rb_loss_and_gradients(self, var_list)
-
-
-class ReparameterizationKLqp(VariationalInference):
+      return build_reparam_loss_and_gradients(
+          latent_vars, data, var_list,
+          scale, n_samples, summary_key)
+  else:
+    # Prefer Rao-Blackwellization over analytic KL. Unknown what
+    # would happen stability-wise if the two are combined.
+    # if is_analytic_kl:
+    #   return build_score_kl_loss_and_gradients(...)
+    # Analytic entropies may lead to problems around
+    # convergence; for now it is deactivated.
+    # elif is_analytic_entropy:
+    #    return build_score_entropy_loss_and_gradients(...)
+    # else:
+    return build_score_rb_loss_and_gradients(
+        latent_vars, data, var_list,
+        scale, n_samples, summary_key)
+
+
+def reparameterization_klqp(
+    latent_vars=None, data=None, n_samples=1,
+    auto_transform=True, scale=None, var_list=None, summary_key=None):
   """Variational inference with the KL divergence
 
   $\\text{KL}( q(z; \lambda) \| p(z \mid x) ).$
@@ -171,62 +171,53 @@ class ReparameterizationKLqp(VariationalInference):
   This class minimizes the objective using the reparameterization
   gradient.
 
+  Args:
+    latent_vars: list of RandomVariable or
+                 dict of RandomVariable to RandomVariable.
+      Collection of random variables to perform inference on. If
+      list, each random variable will be implicitly optimized using a
+      `Normal` random variable that is defined internally with a
+      free parameter per location and scale and is initialized using
+      standard normal draws. The random variables to approximate
+      must be continuous.
+    n_samples: int, optional.
+      Number of samples from variational model for calculating
+      stochastic gradients.
+
   The objective function also adds to itself a summation over all
   tensors in the `REGULARIZATION_LOSSES` collection.
   """
-  def __init__(self, latent_vars=None, data=None):
-    """Create an inference algorithm.
-
-    Args:
-      latent_vars: list of RandomVariable or
-                   dict of RandomVariable to RandomVariable.
-        Collection of random variables to perform inference on. If
-        list, each random variable will be implictly optimized using a
-        `Normal` random variable that is defined internally with a
-        free parameter per location and scale and is initialized using
-        standard normal draws. The random variables to approximate
-        must be continuous.
-    """
-    if isinstance(latent_vars, list):
-      with tf.variable_scope(None, default_name="posterior"):
-        latent_vars_dict = {}
-        continuous = \
-            ('01', 'nonnegative', 'simplex', 'real', 'multivariate_real')
-        for z in latent_vars:
-          if not hasattr(z, 'support') or z.support not in continuous:
-            raise AttributeError(
-                "Random variable {} is not continuous or a random "
-                "variable with supported continuous support.".format(z))
-          batch_event_shape = z.batch_shape.concatenate(z.event_shape)
-          loc = tf.Variable(tf.random_normal(batch_event_shape))
-          scale = tf.nn.softplus(
-              tf.Variable(tf.random_normal(batch_event_shape)))
-          latent_vars_dict[z] = Normal(loc=loc, scale=scale)
-        latent_vars = latent_vars_dict
-        del latent_vars_dict
-
-    super(ReparameterizationKLqp, self).__init__(latent_vars, data)
-
-  def initialize(self, n_samples=1, *args, **kwargs):
-    """Initialize inference algorithm. It initializes hyperparameters
-    and builds ops for the algorithm's computation graph.
-
-    Args:
-      n_samples: int.
-        Number of samples from variational model for calculating
-        stochastic gradients.
-    """
-    if n_samples <= 0:
-      raise ValueError(
-          "n_samples should be greater than zero: {}".format(n_samples))
-    self.n_samples = n_samples
-    return super(ReparameterizationKLqp, self).initialize(*args, **kwargs)
-
-  def build_loss_and_gradients(self, var_list):
-    return build_reparam_loss_and_gradients(self, var_list)
-
-
-class ReparameterizationKLKLqp(VariationalInference):
+  if isinstance(latent_vars, list):
+    with tf.variable_scope(None, default_name="posterior"):
+      latent_vars_dict = {}
+      continuous = \
+          ('01', 'nonnegative', 'simplex', 'real', 'multivariate_real')
+      for z in latent_vars:
+        if not hasattr(z, 'support') or z.support not in continuous:
+          raise AttributeError(
+              "Random variable {} is not continuous or a random "
+              "variable with supported continuous support.".format(z))
+        batch_event_shape = z.batch_shape.concatenate(z.event_shape)
+        loc = tf.Variable(tf.random_normal(batch_event_shape))
+        scale = tf.nn.softplus(
+            tf.Variable(tf.random_normal(batch_event_shape)))
+        latent_vars_dict[z] = Normal(loc=loc, scale=scale)
+      latent_vars = latent_vars_dict
+      del latent_vars_dict
+  latent_vars = check_and_maybe_build_latent_vars(latent_vars)
+  data = check_and_maybe_build_data(data)
+  latent_vars, _ = transform(latent_vars, auto_transform)
+  scale = check_and_maybe_build_dict(scale)
+  var_list = check_and_maybe_build_var_list(var_list, latent_vars, data)
+
+  return build_reparam_loss_and_gradients(
+      latent_vars, data, var_list,
+      scale, n_samples, summary_key)
+
+
+def reparameterization_kl_klqp(
+    latent_vars=None, data=None, n_samples=1, kl_scaling=None,
+    auto_transform=True, scale=None, var_list=None, summary_key=None):
   """Variational inference with the KL divergence
 
   $\\text{KL}( q(z; \lambda) \| p(z \mid x) ).$
@@ -234,76 +225,64 @@ class ReparameterizationKLKLqp(VariationalInference):
   This class minimizes the objective using the reparameterization
   gradient and an analytic KL term.
 
+  Args:
+    latent_vars: list of RandomVariable or
+                 dict of RandomVariable to RandomVariable.
+      Collection of random variables to perform inference on. If
+      list, each random variable will be implicitly optimized using a
+      `Normal` random variable that is defined internally with a
+      free parameter per location and scale and is initialized using
+      standard normal draws. The random variables to approximate
+      must be continuous.
+    n_samples: int, optional.
+      Number of samples from variational model for calculating
+      stochastic gradients.
+    kl_scaling: dict of RandomVariable to tf.Tensor, optional.
+      Provides option to scale terms when using ELBO with KL divergence.
+      If the KL divergence terms are
+
+      $\\alpha_p \mathbb{E}_{q(z\mid x, \lambda)} [
+            \log q(z\mid x, \lambda) - \log p(z)],$
+
+      then pass {$p(z)$: $\\alpha_p$} as `kl_scaling`,
+      where $\\alpha_p$ is a tensor. Its shape must be broadcastable;
+      it is multiplied element-wise to the batchwise KL terms.
+
   The objective function also adds to itself a summation over all
   tensors in the `REGULARIZATION_LOSSES` collection.
   """
-  def __init__(self, latent_vars=None, data=None):
-    """Create an inference algorithm.
-
-    Args:
-      latent_vars: list of RandomVariable or
-                   dict of RandomVariable to RandomVariable.
-        Collection of random variables to perform inference on. If
-        list, each random variable will be implictly optimized using a
-        `Normal` random variable that is defined internally with a
-        free parameter per location and scale and is initialized using
-        standard normal draws. The random variables to approximate
-        must be continuous.
-    """
-    if isinstance(latent_vars, list):
-      with tf.variable_scope(None, default_name="posterior"):
-        latent_vars_dict = {}
-        continuous = \
-            ('01', 'nonnegative', 'simplex', 'real', 'multivariate_real')
-        for z in latent_vars:
-          if not hasattr(z, 'support') or z.support not in continuous:
-            raise AttributeError(
-                "Random variable {} is not continuous or a random "
-                "variable with supported continuous support.".format(z))
-          batch_event_shape = z.batch_shape.concatenate(z.event_shape)
-          loc = tf.Variable(tf.random_normal(batch_event_shape))
-          scale = tf.nn.softplus(
-              tf.Variable(tf.random_normal(batch_event_shape)))
-          latent_vars_dict[z] = Normal(loc=loc, scale=scale)
-        latent_vars = latent_vars_dict
-        del latent_vars_dict
-
-    super(ReparameterizationKLKLqp, self).__init__(latent_vars, data)
-
-  def initialize(self, n_samples=1, kl_scaling=None, *args, **kwargs):
-    """Initialize inference algorithm. It initializes hyperparameters
-    and builds ops for the algorithm's computation graph.
-
-    Args:
-      n_samples: int.
-        Number of samples from variational model for calculating
-        stochastic gradients.
-      kl_scaling: dict of RandomVariable to tf.Tensor.
-        Provides option to scale terms when using ELBO with KL divergence.
-        If the KL divergence terms are
-
-        $\\alpha_p \mathbb{E}_{q(z\mid x, \lambda)} [
-              \log q(z\mid x, \lambda) - \log p(z)],$
-
-        then pass {$p(z)$: $\\alpha_p$} as `kl_scaling`,
-        where $\\alpha_p$ is a tensor. Its shape must be broadcastable;
-        it is multiplied element-wise to the batchwise KL terms.
-    """
-    if kl_scaling is None:
-      kl_scaling = {}
-    if n_samples <= 0:
-      raise ValueError(
-          "n_samples should be greater than zero: {}".format(n_samples))
-
-    self.n_samples = n_samples
-    self.kl_scaling = kl_scaling
-    return super(ReparameterizationKLKLqp, self).initialize(*args, **kwargs)
-
-  def build_loss_and_gradients(self, var_list):
-    return build_reparam_kl_loss_and_gradients(self, var_list)
-
-
-class ReparameterizationEntropyKLqp(VariationalInference):
+  if isinstance(latent_vars, list):
+    with tf.variable_scope(None, default_name="posterior"):
+      latent_vars_dict = {}
+      continuous = \
+          ('01', 'nonnegative', 'simplex', 'real', 'multivariate_real')
+      for z in latent_vars:
+        if not hasattr(z, 'support') or z.support not in continuous:
+          raise AttributeError(
+              "Random variable {} is not continuous or a random "
+              "variable with supported continuous support.".format(z))
+        batch_event_shape = z.batch_shape.concatenate(z.event_shape)
+        loc = tf.Variable(tf.random_normal(batch_event_shape))
+        scale = tf.nn.softplus(
+            tf.Variable(tf.random_normal(batch_event_shape)))
+        latent_vars_dict[z] = Normal(loc=loc, scale=scale)
+      latent_vars = latent_vars_dict
+      del latent_vars_dict
+  latent_vars = check_and_maybe_build_latent_vars(latent_vars)
+  data = check_and_maybe_build_data(data)
+  latent_vars, _ = transform(latent_vars, auto_transform)
+  kl_scaling = check_and_maybe_build_dict(kl_scaling)
+  scale = check_and_maybe_build_dict(scale)
+  var_list = check_and_maybe_build_var_list(var_list, latent_vars, data)
+
+  return build_reparam_kl_loss_and_gradients(
+      latent_vars, data, var_list,
+      scale, n_samples, kl_scaling, summary_key)
+
+
+def reparameterization_entropy_klqp(
+    latent_vars=None, data=None, n_samples=1,
+    auto_transform=True, scale=None, var_list=None, summary_key=None):
   """Variational inference with the KL divergence
 
   $\\text{KL}( q(z; \lambda) \| p(z \mid x) ).$
@@ -311,63 +290,53 @@ class ReparameterizationEntropyKLqp(VariationalInference):
   This class minimizes the objective using the reparameterization
   gradient and an analytic entropy term.
 
+  Args:
+    latent_vars: list of RandomVariable or
+                 dict of RandomVariable to RandomVariable.
+      Collection of random variables to perform inference on. If
+      list, each random variable will be implicitly optimized using a
+      `Normal` random variable that is defined internally with a
+      free parameter per location and scale and is initialized using
+      standard normal draws. The random variables to approximate
+      must be continuous.
+    n_samples: int, optional.
+      Number of samples from variational model for calculating
+      stochastic gradients.
+
   The objective function also adds to itself a summation over all
   tensors in the `REGULARIZATION_LOSSES` collection.
   """
-  def __init__(self, latent_vars=None, data=None):
-    """Create an inference algorithm.
-
-    Args:
-      latent_vars: list of RandomVariable or
-                   dict of RandomVariable to RandomVariable.
-        Collection of random variables to perform inference on. If
-        list, each random variable will be implictly optimized using a
-        `Normal` random variable that is defined internally with a
-        free parameter per location and scale and is initialized using
-        standard normal draws. The random variables to approximate
-        must be continuous.
-    """
-    if isinstance(latent_vars, list):
-      with tf.variable_scope(None, default_name="posterior"):
-        latent_vars_dict = {}
-        continuous = \
-            ('01', 'nonnegative', 'simplex', 'real', 'multivariate_real')
-        for z in latent_vars:
-          if not hasattr(z, 'support') or z.support not in continuous:
-            raise AttributeError(
-                "Random variable {} is not continuous or a random "
-                "variable with supported continuous support.".format(z))
-          batch_event_shape = z.batch_shape.concatenate(z.event_shape)
-          loc = tf.Variable(tf.random_normal(batch_event_shape))
-          scale = tf.nn.softplus(
-              tf.Variable(tf.random_normal(batch_event_shape)))
-          latent_vars_dict[z] = Normal(loc=loc, scale=scale)
-        latent_vars = latent_vars_dict
-        del latent_vars_dict
-
-    super(ReparameterizationEntropyKLqp, self).__init__(latent_vars, data)
-
-  def initialize(self, n_samples=1, *args, **kwargs):
-    """Initialize inference algorithm. It initializes hyperparameters
-    and builds ops for the algorithm's computation graph.
-
-    Args:
-      n_samples: int.
-        Number of samples from variational model for calculating
-        stochastic gradients.
-    """
-    if n_samples <= 0:
-      raise ValueError(
-          "n_samples should be greater than zero: {}".format(n_samples))
-    self.n_samples = n_samples
-    return super(ReparameterizationEntropyKLqp, self).initialize(
-        *args, **kwargs)
-
-  def build_loss_and_gradients(self, var_list):
-    return build_reparam_entropy_loss_and_gradients(self, var_list)
-
-
-class ScoreKLqp(VariationalInference):
+  if isinstance(latent_vars, list):
+    with tf.variable_scope(None, default_name="posterior"):
+      latent_vars_dict = {}
+      continuous = \
+          ('01', 'nonnegative', 'simplex', 'real', 'multivariate_real')
+      for z in latent_vars:
+        if not hasattr(z, 'support') or z.support not in continuous:
+          raise AttributeError(
+              "Random variable {} is not continuous or a random "
+              "variable with supported continuous support.".format(z))
+        batch_event_shape = z.batch_shape.concatenate(z.event_shape)
+        loc = tf.Variable(tf.random_normal(batch_event_shape))
+        scale = tf.nn.softplus(
+            tf.Variable(tf.random_normal(batch_event_shape)))
+        latent_vars_dict[z] = Normal(loc=loc, scale=scale)
+      latent_vars = latent_vars_dict
+      del latent_vars_dict
+  latent_vars = check_and_maybe_build_latent_vars(latent_vars)
+  data = check_and_maybe_build_data(data)
+  latent_vars, _ = transform(latent_vars, auto_transform)
+  scale = check_and_maybe_build_dict(scale)
+  var_list = check_and_maybe_build_var_list(var_list, latent_vars, data)
+
+  return build_reparam_entropy_loss_and_gradients(
+      latent_vars, data, var_list,
+      scale, n_samples, summary_key)
+
+
+def score_klqp(
+    latent_vars=None, data=None, n_samples=1,
+    auto_transform=True, scale=None, var_list=None, summary_key=None):
   """Variational inference with the KL divergence
 
   $\\text{KL}( q(z; \lambda) \| p(z \mid x) ).$
@@ -375,62 +344,50 @@ class ScoreKLqp(VariationalInference):
   This class minimizes the objective using the score function
   gradient.
 
+  Args:
+    latent_vars: list of RandomVariable or
+                 dict of RandomVariable to RandomVariable.
+      Collection of random variables to perform inference on. If
+      list, each random variable will be implicitly optimized using a
+      `Normal` random variable that is defined internally with a
+      free parameter per location and scale and is initialized using
+      standard normal draws. The random variables to approximate
+      must be continuous.
+
   The objective function also adds to itself a summation over all
   tensors in the `REGULARIZATION_LOSSES` collection.
   """
-  def __init__(self, latent_vars=None, data=None):
-    """Create an inference algorithm.
-
-    Args:
-      latent_vars: list of RandomVariable or
-                   dict of RandomVariable to RandomVariable.
-        Collection of random variables to perform inference on. If
-        list, each random variable will be implictly optimized using a
-        `Normal` random variable that is defined internally with a
-        free parameter per location and scale and is initialized using
-        standard normal draws. The random variables to approximate
-        must be continuous.
-    """
-    if isinstance(latent_vars, list):
-      with tf.variable_scope(None, default_name="posterior"):
-        latent_vars_dict = {}
-        continuous = \
-            ('01', 'nonnegative', 'simplex', 'real', 'multivariate_real')
-        for z in latent_vars:
-          if not hasattr(z, 'support') or z.support not in continuous:
-            raise AttributeError(
-                "Random variable {} is not continuous or a random "
-                "variable with supported continuous support.".format(z))
-          batch_event_shape = z.batch_shape.concatenate(z.event_shape)
-          loc = tf.Variable(tf.random_normal(batch_event_shape))
-          scale = tf.nn.softplus(
-              tf.Variable(tf.random_normal(batch_event_shape)))
-          latent_vars_dict[z] = Normal(loc=loc, scale=scale)
-        latent_vars = latent_vars_dict
-        del latent_vars_dict
-
-    super(ScoreKLqp, self).__init__(latent_vars, data)
-
-  def initialize(self, n_samples=1, *args, **kwargs):
-    """Initialize inference algorithm. It initializes hyperparameters
-    and builds ops for the algorithm's computation graph.
-
-    Args:
-      n_samples: int.
-        Number of samples from variational model for calculating
-        stochastic gradients.
-    """
-    if n_samples <= 0:
-      raise ValueError(
-          "n_samples should be greater than zero: {}".format(n_samples))
-    self.n_samples = n_samples
-    return super(ScoreKLqp, self).initialize(*args, **kwargs)
-
-  def build_loss_and_gradients(self, var_list):
-    return build_score_loss_and_gradients(self, var_list)
-
-
-class ScoreKLKLqp(VariationalInference):
+  if isinstance(latent_vars, list):
+    with tf.variable_scope(None, default_name="posterior"):
+      latent_vars_dict = {}
+      continuous = \
+          ('01', 'nonnegative', 'simplex', 'real', 'multivariate_real')
+      for z in latent_vars:
+        if not hasattr(z, 'support') or z.support not in continuous:
+          raise AttributeError(
+              "Random variable {} is not continuous or a random "
+              "variable with supported continuous support.".format(z))
+        batch_event_shape = z.batch_shape.concatenate(z.event_shape)
+        loc = tf.Variable(tf.random_normal(batch_event_shape))
+        scale = tf.nn.softplus(
+            tf.Variable(tf.random_normal(batch_event_shape)))
+        latent_vars_dict[z] = Normal(loc=loc, scale=scale)
+      latent_vars = latent_vars_dict
+      del latent_vars_dict
+  latent_vars = check_and_maybe_build_latent_vars(latent_vars)
+  data = check_and_maybe_build_data(data)
+  latent_vars, _ = transform(latent_vars, auto_transform)
+  scale = check_and_maybe_build_dict(scale)
+  var_list = check_and_maybe_build_var_list(var_list, latent_vars, data)
+
+  return build_score_loss_and_gradients(
+      latent_vars, data, var_list,
+      scale, n_samples, summary_key)
+
+
+def score_kl_klqp(
+    latent_vars=None, data=None, n_samples=1, kl_scaling=None,
+    auto_transform=True, scale=None, var_list=None, summary_key=None):
   """Variational inference with the KL divergence
 
   $\\text{KL}( q(z; \lambda) \| p(z \mid x) ).$
@@ -438,75 +395,64 @@ class ScoreKLKLqp(VariationalInference):
   This class minimizes the objective using the score function gradient
   and an analytic KL term.
 
+  Args:
+    latent_vars: list of RandomVariable or
+                 dict of RandomVariable to RandomVariable.
+      Collection of random variables to perform inference on. If
+      list, each random variable will be implicitly optimized using a
+      `Normal` random variable that is defined internally with a
+      free parameter per location and scale and is initialized using
+      standard normal draws. The random variables to approximate
+      must be continuous.
+    n_samples: int, optional.
+      Number of samples from variational model for calculating
+      stochastic gradients.
+    kl_scaling: dict of RandomVariable to tf.Tensor, optional.
+      Provides option to scale terms when using ELBO with KL divergence.
+      If the KL divergence terms are
+
+      $\\alpha_p \mathbb{E}_{q(z\mid x, \lambda)} [
+            \log q(z\mid x, \lambda) - \log p(z)],$
+
+      then pass {$p(z)$: $\\alpha_p$} as `kl_scaling`,
+      where $\\alpha_p$ is a tensor. Its shape must be broadcastable;
+      it is multiplied element-wise to the batchwise KL terms.
+
   The objective function also adds to itself a summation over all
   tensors in the `REGULARIZATION_LOSSES` collection.
   """
-  def __init__(self, latent_vars=None, data=None):
-    """Create an inference algorithm.
-
-    Args:
-      latent_vars: list of RandomVariable or
-                   dict of RandomVariable to RandomVariable.
-        Collection of random variables to perform inference on. If
-        list, each random variable will be implictly optimized using a
-        `Normal` random variable that is defined internally with a
-        free parameter per location and scale and is initialized using
-        standard normal draws. The random variables to approximate
-        must be continuous.
-    """
-    if isinstance(latent_vars, list):
-      with tf.variable_scope(None, default_name="posterior"):
-        latent_vars_dict = {}
-        continuous = \
-            ('01', 'nonnegative', 'simplex', 'real', 'multivariate_real')
-        for z in latent_vars:
-          if not hasattr(z, 'support') or z.support not in continuous:
-            raise AttributeError(
-                "Random variable {} is not continuous or a random "
-                "variable with supported continuous support.".format(z))
-          batch_event_shape = z.batch_shape.concatenate(z.event_shape)
-          loc = tf.Variable(tf.random_normal(batch_event_shape))
-          scale = tf.nn.softplus(
-              tf.Variable(tf.random_normal(batch_event_shape)))
-          latent_vars_dict[z] = Normal(loc=loc, scale=scale)
-        latent_vars = latent_vars_dict
-        del latent_vars_dict
-
-    super(ScoreKLKLqp, self).__init__(latent_vars, data)
-
-  def initialize(self, n_samples=1, kl_scaling=None, *args, **kwargs):
-    """Initialize inference algorithm. It initializes hyperparameters
-    and builds ops for the algorithm's computation graph.
-
-    Args:
-      n_samples: int.
-        Number of samples from variational model for calculating
-        stochastic gradients.
-      kl_scaling: dict of RandomVariable to tf.Tensor.
-        Provides option to scale terms when using ELBO with KL divergence.
-        If the KL divergence terms are
-
-        $\\alpha_p \mathbb{E}_{q(z\mid x, \lambda)} [
-              \log q(z\mid x, \lambda) - \log p(z)],$
-
-        then pass {$p(z)$: $\\alpha_p$} as `kl_scaling`,
-        where $\\alpha_p$ is a tensor. Its shape must be broadcastable;
-        it is multiplied element-wise to the batchwise KL terms.
-    """
-    if kl_scaling is None:
-      kl_scaling = {}
-    if n_samples <= 0:
-      raise ValueError(
-          "n_samples should be greater than zero: {}".format(n_samples))
-    self.n_samples = n_samples
-    self.kl_scaling = kl_scaling
-    return super(ScoreKLKLqp, self).initialize(*args, **kwargs)
-
-  def build_loss_and_gradients(self, var_list):
-    return build_score_kl_loss_and_gradients(self, var_list)
-
-
-class ScoreEntropyKLqp(VariationalInference):
+  if isinstance(latent_vars, list):
+    with tf.variable_scope(None, default_name="posterior"):
+      latent_vars_dict = {}
+      continuous = \
+          ('01', 'nonnegative', 'simplex', 'real', 'multivariate_real')
+      for z in latent_vars:
+        if not hasattr(z, 'support') or z.support not in continuous:
+          raise AttributeError(
+              "Random variable {} is not continuous or a random "
+              "variable with supported continuous support.".format(z))
+        batch_event_shape = z.batch_shape.concatenate(z.event_shape)
+        loc = tf.Variable(tf.random_normal(batch_event_shape))
+        scale = tf.nn.softplus(
+            tf.Variable(tf.random_normal(batch_event_shape)))
+        latent_vars_dict[z] = Normal(loc=loc, scale=scale)
+      latent_vars = latent_vars_dict
+      del latent_vars_dict
+  latent_vars = check_and_maybe_build_latent_vars(latent_vars)
+  data = check_and_maybe_build_data(data)
+  latent_vars, _ = transform(latent_vars, auto_transform)
+  kl_scaling = check_and_maybe_build_dict(kl_scaling)
+  scale = check_and_maybe_build_dict(scale)
+  var_list = check_and_maybe_build_var_list(var_list, latent_vars, data)
+
+  return build_score_kl_loss_and_gradients(
+      latent_vars, data, var_list,
+      scale, n_samples, kl_scaling, summary_key)
+
+
+def score_entropy_klqp(
+    latent_vars=None, data=None, n_samples=1,
+    auto_transform=True, scale=None, var_list=None, summary_key=None):
   """Variational inference with the KL divergence
 
   $\\text{KL}( q(z; \lambda) \| p(z \mid x) ).$
@@ -514,62 +460,53 @@ class ScoreEntropyKLqp(VariationalInference):
   This class minimizes the objective using the score function gradient
   and an analytic entropy term.
 
+  Args:
+    latent_vars: list of RandomVariable or
+                 dict of RandomVariable to RandomVariable.
+      Collection of random variables to perform inference on. If
+      list, each random variable will be implicitly optimized using a
+      `Normal` random variable that is defined internally with a
+      free parameter per location and scale and is initialized using
+      standard normal draws. The random variables to approximate
+      must be continuous.
+    n_samples: int, optional.
+      Number of samples from variational model for calculating
+      stochastic gradients.
+
   The objective function also adds to itself a summation over all
   tensors in the `REGULARIZATION_LOSSES` collection.
   """
-  def __init__(self, latent_vars=None, data=None):
-    """Create an inference algorithm.
-
-    Args:
-      latent_vars: list of RandomVariable or
-                   dict of RandomVariable to RandomVariable.
-        Collection of random variables to perform inference on. If
-        list, each random variable will be implictly optimized using a
-        `Normal` random variable that is defined internally with a
-        free parameter per location and scale and is initialized using
-        standard normal draws. The random variables to approximate
-        must be continuous.
-    """
-    if isinstance(latent_vars, list):
-      with tf.variable_scope(None, default_name="posterior"):
-        latent_vars_dict = {}
-        continuous = \
-            ('01', 'nonnegative', 'simplex', 'real', 'multivariate_real')
-        for z in latent_vars:
-          if not hasattr(z, 'support') or z.support not in continuous:
-            raise AttributeError(
-                "Random variable {} is not continuous or a random "
-                "variable with supported continuous support.".format(z))
-          batch_event_shape = z.batch_shape.concatenate(z.event_shape)
-          loc = tf.Variable(tf.random_normal(batch_event_shape))
-          scale = tf.nn.softplus(
-              tf.Variable(tf.random_normal(batch_event_shape)))
-          latent_vars_dict[z] = Normal(loc=loc, scale=scale)
-        latent_vars = latent_vars_dict
-        del latent_vars_dict
-
-    super(ScoreEntropyKLqp, self).__init__(latent_vars, data)
-
-  def initialize(self, n_samples=1, *args, **kwargs):
-    """Initialize inference algorithm. It initializes hyperparameters
-    and builds ops for the algorithm's computation graph.
-
-    Args:
-      n_samples: int.
-        Number of samples from variational model for calculating
-        stochastic gradients.
-    """
-    if n_samples <= 0:
-      raise ValueError(
-          "n_samples should be greater than zero: {}".format(n_samples))
-    self.n_samples = n_samples
-    return super(ScoreEntropyKLqp, self).initialize(*args, **kwargs)
-
-  def build_loss_and_gradients(self, var_list):
-    return build_score_entropy_loss_and_gradients(self, var_list)
-
-
-class ScoreRBKLqp(VariationalInference):
+  if isinstance(latent_vars, list):
+    with tf.variable_scope(None, default_name="posterior"):
+      latent_vars_dict = {}
+      continuous = \
+          ('01', 'nonnegative', 'simplex', 'real', 'multivariate_real')
+      for z in latent_vars:
+        if not hasattr(z, 'support') or z.support not in continuous:
+          raise AttributeError(
+              "Random variable {} is not continuous or a random "
+              "variable with supported continuous support.".format(z))
+        batch_event_shape = z.batch_shape.concatenate(z.event_shape)
+        loc = tf.Variable(tf.random_normal(batch_event_shape))
+        scale = tf.nn.softplus(
+            tf.Variable(tf.random_normal(batch_event_shape)))
+        latent_vars_dict[z] = Normal(loc=loc, scale=scale)
+      latent_vars = latent_vars_dict
+      del latent_vars_dict
+  latent_vars = check_and_maybe_build_latent_vars(latent_vars)
+  data = check_and_maybe_build_data(data)
+  latent_vars, _ = transform(latent_vars, auto_transform)
+  scale = check_and_maybe_build_dict(scale)
+  var_list = check_and_maybe_build_var_list(var_list, latent_vars, data)
+
+  return build_score_entropy_loss_and_gradients(
+      latent_vars, data, var_list,
+      scale, n_samples, summary_key)
+
+
+def score_rb_klqp(
+    latent_vars=None, data=None, n_samples=1,
+    auto_transform=True, scale=None, var_list=None, summary_key=None):
   """Variational inference with the KL divergence
 
   $\\text{KL}( q(z; \lambda) \| p(z \mid x) ).$
@@ -577,6 +514,16 @@ class ScoreRBKLqp(VariationalInference):
   This class minimizes the objective using the score function gradient
   and Rao-Blackwellization.
 
+  Args:
+    latent_vars: list of RandomVariable or
+                 dict of RandomVariable to RandomVariable.
+      Collection of random variables to perform inference on. If
+      list, each random variable will be implicitly optimized using a
+      `Normal` random variable that is defined internally with a
+      free parameter per location and scale and is initialized using
+      standard normal draws. The random variables to approximate
+      must be continuous.
+
   #### Notes
 
   Current Rao-Blackwellization is limited to Rao-Blackwellizing across
@@ -587,59 +534,36 @@ class ScoreRBKLqp(VariationalInference):
   The objective function also adds to itself a summation over all
   tensors in the `REGULARIZATION_LOSSES` collection.
   """
-  def __init__(self, latent_vars=None, data=None):
-    """Create an inference algorithm.
-
-    Args:
-      latent_vars: list of RandomVariable or
-                   dict of RandomVariable to RandomVariable.
-        Collection of random variables to perform inference on. If
-        list, each random variable will be implictly optimized using a
-        `Normal` random variable that is defined internally with a
-        free parameter per location and scale and is initialized using
-        standard normal draws. The random variables to approximate
-        must be continuous.
-    """
-    if isinstance(latent_vars, list):
-      with tf.variable_scope(None, default_name="posterior"):
-        latent_vars_dict = {}
-        continuous = \
-            ('01', 'nonnegative', 'simplex', 'real', 'multivariate_real')
-        for z in latent_vars:
-          if not hasattr(z, 'support') or z.support not in continuous:
-            raise AttributeError(
-                "Random variable {} is not continuous or a random "
-                "variable with supported continuous support.".format(z))
-          batch_event_shape = z.batch_shape.concatenate(z.event_shape)
-          loc = tf.Variable(tf.random_normal(batch_event_shape))
-          scale = tf.nn.softplus(
-              tf.Variable(tf.random_normal(batch_event_shape)))
-          latent_vars_dict[z] = Normal(loc=loc, scale=scale)
-        latent_vars = latent_vars_dict
-        del latent_vars_dict
-
-    super(ScoreRBKLqp, self).__init__(latent_vars, data)
-
-  def initialize(self, n_samples=1, *args, **kwargs):
-    """Initialize inference algorithm. It initializes hyperparameters
-    and builds ops for the algorithm's computation graph.
-
-    Args:
-      n_samples: int.
-        Number of samples from variational model for calculating
-        stochastic gradients.
-    """
-    if n_samples <= 0:
-      raise ValueError(
-          "n_samples should be greater than zero: {}".format(n_samples))
-    self.n_samples = n_samples
-    return super(ScoreRBKLqp, self).initialize(*args, **kwargs)
-
-  def build_loss_and_gradients(self, var_list):
-    return build_score_rb_loss_and_gradients(self, var_list)
-
-
-def build_reparam_loss_and_gradients(inference, var_list):
+  if isinstance(latent_vars, list):
+    with tf.variable_scope(None, default_name="posterior"):
+      latent_vars_dict = {}
+      continuous = \
+          ('01', 'nonnegative', 'simplex', 'real', 'multivariate_real')
+      for z in latent_vars:
+        if not hasattr(z, 'support') or z.support not in continuous:
+          raise AttributeError(
+              "Random variable {} is not continuous or a random "
+              "variable with supported continuous support.".format(z))
+        batch_event_shape = z.batch_shape.concatenate(z.event_shape)
+        loc = tf.Variable(tf.random_normal(batch_event_shape))
+        scale = tf.nn.softplus(
+            tf.Variable(tf.random_normal(batch_event_shape)))
+        latent_vars_dict[z] = Normal(loc=loc, scale=scale)
+      latent_vars = latent_vars_dict
+      del latent_vars_dict
+  latent_vars = check_and_maybe_build_latent_vars(latent_vars)
+  data = check_and_maybe_build_data(data)
+  latent_vars, _ = transform(latent_vars, auto_transform)
+  scale = check_and_maybe_build_dict(scale)
+  var_list = check_and_maybe_build_var_list(var_list, latent_vars, data)
+
+  return build_score_rb_loss_and_gradients(
+      latent_vars, data, var_list,
+      scale, n_samples, summary_key)
+
+
+def build_reparam_loss_and_gradients(
+    latent_vars, data, var_list, scale, n_samples, summary_key):
   """Build loss function. Its automatic differentiation
   is a stochastic gradient of
 
@@ -651,15 +575,15 @@ def build_reparam_loss_and_gradients(inference, var_list):
   Computed by sampling from $q(z;\lambda)$ and evaluating the
   expectation using Monte Carlo sampling.
   """
-  p_log_prob = [0.0] * inference.n_samples
-  q_log_prob = [0.0] * inference.n_samples
+  p_log_prob = [0.0] * n_samples
+  q_log_prob = [0.0] * n_samples
   base_scope = tf.get_default_graph().unique_name("inference") + '/'
-  for s in range(inference.n_samples):
+  for s in range(n_samples):
     # Form dictionary in order to replace conditioning on prior or
     # observed variable with conditioning on a specific value.
     scope = base_scope + tf.get_default_graph().unique_name("sample")
     dict_swap = {}
-    for x, qx in six.iteritems(inference.data):
+    for x, qx in six.iteritems(data):
       if isinstance(x, RandomVariable):
         if isinstance(qx, RandomVariable):
           qx_copy = copy(qx, scope=scope)
@@ -667,35 +591,35 @@ def build_reparam_loss_and_gradients(inference, var_list):
         else:
           dict_swap[x] = qx
 
-    for z, qz in six.iteritems(inference.latent_vars):
+    for z, qz in six.iteritems(latent_vars):
       # Copy q(z) to obtain new set of posterior samples.
       qz_copy = copy(qz, scope=scope)
       dict_swap[z] = qz_copy.value
       q_log_prob[s] += tf.reduce_sum(
-          inference.scale.get(z, 1.0) * qz_copy.log_prob(dict_swap[z]))
+          scale.get(z, 1.0) * qz_copy.log_prob(dict_swap[z]))
 
-    for z in six.iterkeys(inference.latent_vars):
+    for z in six.iterkeys(latent_vars):
       z_copy = copy(z, dict_swap, scope=scope)
       p_log_prob[s] += tf.reduce_sum(
-          inference.scale.get(z, 1.0) * z_copy.log_prob(dict_swap[z]))
+          scale.get(z, 1.0) * z_copy.log_prob(dict_swap[z]))
 
-    for x in six.iterkeys(inference.data):
+    for x in six.iterkeys(data):
       if isinstance(x, RandomVariable):
         x_copy = copy(x, dict_swap, scope=scope)
         p_log_prob[s] += tf.reduce_sum(
-            inference.scale.get(x, 1.0) * x_copy.log_prob(dict_swap[x]))
+            scale.get(x, 1.0) * x_copy.log_prob(dict_swap[x]))
 
   p_log_prob = tf.reduce_mean(p_log_prob)
   q_log_prob = tf.reduce_mean(q_log_prob)
   reg_penalty = tf.reduce_sum(tf.losses.get_regularization_losses())
 
-  if inference.logging:
+  if summary_key is not None:
     tf.summary.scalar("loss/p_log_prob", p_log_prob,
-                      collections=[inference._summary_key])
+                      collections=[summary_key])
     tf.summary.scalar("loss/q_log_prob", q_log_prob,
-                      collections=[inference._summary_key])
+                      collections=[summary_key])
     tf.summary.scalar("loss/reg_penalty", reg_penalty,
-                      collections=[inference._summary_key])
+                      collections=[summary_key])
 
   loss = -(p_log_prob - q_log_prob - reg_penalty)
 
@@ -704,7 +628,8 @@ def build_reparam_loss_and_gradients(inference, var_list):
   return loss, grads_and_vars
 
 
-def build_reparam_kl_loss_and_gradients(inference, var_list):
+def build_reparam_kl_loss_and_gradients(
+    latent_vars, data, var_list, scale, n_samples, kl_scaling, summary_key):
   """Build loss function. Its automatic differentiation
   is a stochastic gradient of
 
@@ -720,14 +645,14 @@ def build_reparam_kl_loss_and_gradients(inference, var_list):
   Computed by sampling from $q(z;\lambda)$ and evaluating the
   expectation using Monte Carlo sampling.
   """
-  p_log_lik = [0.0] * inference.n_samples
+  p_log_lik = [0.0] * n_samples
   base_scope = tf.get_default_graph().unique_name("inference") + '/'
-  for s in range(inference.n_samples):
+  for s in range(n_samples):
     # Form dictionary in order to replace conditioning on prior or
     # observed variable with conditioning on a specific value.
     scope = base_scope + tf.get_default_graph().unique_name("sample")
     dict_swap = {}
-    for x, qx in six.iteritems(inference.data):
+    for x, qx in six.iteritems(data):
       if isinstance(x, RandomVariable):
         if isinstance(qx, RandomVariable):
           qx_copy = copy(qx, scope=scope)
@@ -735,32 +660,32 @@ def build_reparam_kl_loss_and_gradients(inference, var_list):
         else:
           dict_swap[x] = qx
 
-    for z, qz in six.iteritems(inference.latent_vars):
+    for z, qz in six.iteritems(latent_vars):
       # Copy q(z) to obtain new set of posterior samples.
       qz_copy = copy(qz, scope=scope)
       dict_swap[z] = qz_copy.value
 
-    for x in six.iterkeys(inference.data):
+    for x in six.iterkeys(data):
       if isinstance(x, RandomVariable):
         x_copy = copy(x, dict_swap, scope=scope)
         p_log_lik[s] += tf.reduce_sum(
-            inference.scale.get(x, 1.0) * x_copy.log_prob(dict_swap[x]))
+            scale.get(x, 1.0) * x_copy.log_prob(dict_swap[x]))
 
   p_log_lik = tf.reduce_mean(p_log_lik)
 
   kl_penalty = tf.reduce_sum([
-      tf.reduce_sum(inference.kl_scaling.get(z, 1.0) * kl_divergence(qz, z))
-      for z, qz in six.iteritems(inference.latent_vars)])
+      tf.reduce_sum(kl_scaling.get(z, 1.0) * kl_divergence(qz, z))
+      for z, qz in six.iteritems(latent_vars)])
 
   reg_penalty = tf.reduce_sum(tf.losses.get_regularization_losses())
 
-  if inference.logging:
+  if summary_key is not None:
     tf.summary.scalar("loss/p_log_lik", p_log_lik,
-                      collections=[inference._summary_key])
+                      collections=[summary_key])
     tf.summary.scalar("loss/kl_penalty", kl_penalty,
-                      collections=[inference._summary_key])
+                      collections=[summary_key])
     tf.summary.scalar("loss/reg_penalty", reg_penalty,
-                      collections=[inference._summary_key])
+                      collections=[summary_key])
 
   loss = -(p_log_lik - kl_penalty - reg_penalty)
 
@@ -769,7 +694,8 @@ def build_reparam_kl_loss_and_gradients(inference, var_list):
   return loss, grads_and_vars
 
 
-def build_reparam_entropy_loss_and_gradients(inference, var_list):
+def build_reparam_entropy_loss_and_gradients(
+    latent_vars, data, var_list, scale, n_samples, summary_key):
   """Build loss function. Its automatic differentiation
   is a stochastic gradient of
 
@@ -783,14 +709,14 @@ def build_reparam_entropy_loss_and_gradients(inference, var_list):
   Computed by sampling from $q(z;\lambda)$ and evaluating the
   expectation using Monte Carlo sampling.
   """
-  p_log_prob = [0.0] * inference.n_samples
+  p_log_prob = [0.0] * n_samples
   base_scope = tf.get_default_graph().unique_name("inference") + '/'
-  for s in range(inference.n_samples):
+  for s in range(n_samples):
     # Form dictionary in order to replace conditioning on prior or
     # observed variable with conditioning on a specific value.
     scope = base_scope + tf.get_default_graph().unique_name("sample")
     dict_swap = {}
-    for x, qx in six.iteritems(inference.data):
+    for x, qx in six.iteritems(data):
       if isinstance(x, RandomVariable):
         if isinstance(qx, RandomVariable):
           qx_copy = copy(qx, scope=scope)
@@ -798,37 +724,37 @@ def build_reparam_entropy_loss_and_gradients(inference, var_list):
         else:
           dict_swap[x] = qx
 
-    for z, qz in six.iteritems(inference.latent_vars):
+    for z, qz in six.iteritems(latent_vars):
       # Copy q(z) to obtain new set of posterior samples.
       qz_copy = copy(qz, scope=scope)
       dict_swap[z] = qz_copy.value
 
-    for z in six.iterkeys(inference.latent_vars):
+    for z in six.iterkeys(latent_vars):
       z_copy = copy(z, dict_swap, scope=scope)
       p_log_prob[s] += tf.reduce_sum(
-          inference.scale.get(z, 1.0) * z_copy.log_prob(dict_swap[z]))
+          scale.get(z, 1.0) * z_copy.log_prob(dict_swap[z]))
 
-    for x in six.iterkeys(inference.data):
+    for x in six.iterkeys(data):
       if isinstance(x, RandomVariable):
         x_copy = copy(x, dict_swap, scope=scope)
         p_log_prob[s] += tf.reduce_sum(
-            inference.scale.get(x, 1.0) * x_copy.log_prob(dict_swap[x]))
+            scale.get(x, 1.0) * x_copy.log_prob(dict_swap[x]))
 
   p_log_prob = tf.reduce_mean(p_log_prob)
 
   q_entropy = tf.reduce_sum([
       tf.reduce_sum(qz.entropy())
-      for z, qz in six.iteritems(inference.latent_vars)])
+      for z, qz in six.iteritems(latent_vars)])
 
   reg_penalty = tf.reduce_sum(tf.losses.get_regularization_losses())
 
-  if inference.logging:
+  if summary_key is not None:
     tf.summary.scalar("loss/p_log_prob", p_log_prob,
-                      collections=[inference._summary_key])
+                      collections=[summary_key])
     tf.summary.scalar("loss/q_entropy", q_entropy,
-                      collections=[inference._summary_key])
+                      collections=[summary_key])
     tf.summary.scalar("loss/reg_penalty", reg_penalty,
-                      collections=[inference._summary_key])
+                      collections=[summary_key])
 
   loss = -(p_log_prob + q_entropy - reg_penalty)
 
@@ -837,22 +763,23 @@ def build_reparam_entropy_loss_and_gradients(inference, var_list):
   return loss, grads_and_vars
 
 
-def build_score_loss_and_gradients(inference, var_list):
+def build_score_loss_and_gradients(
+    latent_vars, data, var_list, scale, n_samples, summary_key):
   """Build loss function and gradients based on the score function
   estimator [@paisley2012variational].
 
   Computed by sampling from $q(z;\lambda)$ and evaluating the
   expectation using Monte Carlo sampling.
   """
-  p_log_prob = [0.0] * inference.n_samples
-  q_log_prob = [0.0] * inference.n_samples
+  p_log_prob = [0.0] * n_samples
+  q_log_prob = [0.0] * n_samples
   base_scope = tf.get_default_graph().unique_name("inference") + '/'
-  for s in range(inference.n_samples):
+  for s in range(n_samples):
     # Form dictionary in order to replace conditioning on prior or
     # observed variable with conditioning on a specific value.
     scope = base_scope + tf.get_default_graph().unique_name("sample")
     dict_swap = {}
-    for x, qx in six.iteritems(inference.data):
+    for x, qx in six.iteritems(data):
       if isinstance(x, RandomVariable):
         if isinstance(qx, RandomVariable):
           qx_copy = copy(qx, scope=scope)
@@ -860,41 +787,41 @@ def build_score_loss_and_gradients(inference, var_list):
         else:
           dict_swap[x] = qx
 
-    for z, qz in six.iteritems(inference.latent_vars):
+    for z, qz in six.iteritems(latent_vars):
       # Copy q(z) to obtain new set of posterior samples.
       qz_copy = copy(qz, scope=scope)
       dict_swap[z] = qz_copy.value
       q_log_prob[s] += tf.reduce_sum(
-          inference.scale.get(z, 1.0) *
+          scale.get(z, 1.0) *
           qz_copy.log_prob(tf.stop_gradient(dict_swap[z])))
 
-    for z in six.iterkeys(inference.latent_vars):
+    for z in six.iterkeys(latent_vars):
       z_copy = copy(z, dict_swap, scope=scope)
       p_log_prob[s] += tf.reduce_sum(
-          inference.scale.get(z, 1.0) * z_copy.log_prob(dict_swap[z]))
+          scale.get(z, 1.0) * z_copy.log_prob(dict_swap[z]))
 
-    for x in six.iterkeys(inference.data):
+    for x in six.iterkeys(data):
       if isinstance(x, RandomVariable):
         x_copy = copy(x, dict_swap, scope=scope)
         p_log_prob[s] += tf.reduce_sum(
-            inference.scale.get(x, 1.0) * x_copy.log_prob(dict_swap[x]))
+            scale.get(x, 1.0) * x_copy.log_prob(dict_swap[x]))
 
   p_log_prob = tf.stack(p_log_prob)
   q_log_prob = tf.stack(q_log_prob)
   reg_penalty = tf.reduce_sum(tf.losses.get_regularization_losses())
 
-  if inference.logging:
+  if summary_key is not None:
     tf.summary.scalar("loss/p_log_prob", tf.reduce_mean(p_log_prob),
-                      collections=[inference._summary_key])
+                      collections=[summary_key])
     tf.summary.scalar("loss/q_log_prob", tf.reduce_mean(q_log_prob),
-                      collections=[inference._summary_key])
+                      collections=[summary_key])
     tf.summary.scalar("loss/reg_penalty", reg_penalty,
-                      collections=[inference._summary_key])
+                      collections=[summary_key])
 
   losses = p_log_prob - q_log_prob
   loss = -(tf.reduce_mean(losses) - reg_penalty)
 
-  q_rvs = list(six.itervalues(inference.latent_vars))
+  q_rvs = list(six.itervalues(latent_vars))
   q_vars = [v for v in var_list
             if len(get_descendants(tf.convert_to_tensor(v), q_rvs)) != 0]
   q_grads = tf.gradients(
@@ -906,7 +833,8 @@ def build_score_loss_and_gradients(inference, var_list):
   return loss, grads_and_vars
 
 
-def build_score_kl_loss_and_gradients(inference, var_list):
+def build_score_kl_loss_and_gradients(
+    latent_vars, data, var_list, scale, n_samples, kl_scaling, summary_key):
   """Build loss function and gradients based on the score function
   estimator [@paisley2012variational].
 
@@ -915,15 +843,15 @@ def build_score_kl_loss_and_gradients(inference, var_list):
   Computed by sampling from $q(z;\lambda)$ and evaluating the
   expectation using Monte Carlo sampling.
   """
-  p_log_lik = [0.0] * inference.n_samples
-  q_log_prob = [0.0] * inference.n_samples
+  p_log_lik = [0.0] * n_samples
+  q_log_prob = [0.0] * n_samples
   base_scope = tf.get_default_graph().unique_name("inference") + '/'
-  for s in range(inference.n_samples):
+  for s in range(n_samples):
     # Form dictionary in order to replace conditioning on prior or
     # observed variable with conditioning on a specific value.
     scope = base_scope + tf.get_default_graph().unique_name("sample")
     dict_swap = {}
-    for x, qx in six.iteritems(inference.data):
+    for x, qx in six.iteritems(data):
       if isinstance(x, RandomVariable):
         if isinstance(qx, RandomVariable):
           qx_copy = copy(qx, scope=scope)
@@ -931,40 +859,40 @@ def build_score_kl_loss_and_gradients(inference, var_list):
         else:
           dict_swap[x] = qx
 
-    for z, qz in six.iteritems(inference.latent_vars):
+    for z, qz in six.iteritems(latent_vars):
       # Copy q(z) to obtain new set of posterior samples.
       qz_copy = copy(qz, scope=scope)
       dict_swap[z] = qz_copy.value
       q_log_prob[s] += tf.reduce_sum(
-          inference.scale.get(z, 1.0) *
+          scale.get(z, 1.0) *
           qz_copy.log_prob(tf.stop_gradient(dict_swap[z])))
 
-    for x in six.iterkeys(inference.data):
+    for x in six.iterkeys(data):
       if isinstance(x, RandomVariable):
         x_copy = copy(x, dict_swap, scope=scope)
         p_log_lik[s] += tf.reduce_sum(
-            inference.scale.get(x, 1.0) * x_copy.log_prob(dict_swap[x]))
+            scale.get(x, 1.0) * x_copy.log_prob(dict_swap[x]))
 
   p_log_lik = tf.stack(p_log_lik)
   q_log_prob = tf.stack(q_log_prob)
 
   kl_penalty = tf.reduce_sum([
-      tf.reduce_sum(inference.kl_scaling.get(z, 1.0) * kl_divergence(qz, z))
-      for z, qz in six.iteritems(inference.latent_vars)])
+      tf.reduce_sum(kl_scaling.get(z, 1.0) * kl_divergence(qz, z))
+      for z, qz in six.iteritems(latent_vars)])
 
   reg_penalty = tf.reduce_sum(tf.losses.get_regularization_losses())
 
-  if inference.logging:
+  if summary_key is not None:
     tf.summary.scalar("loss/p_log_lik", tf.reduce_mean(p_log_lik),
-                      collections=[inference._summary_key])
+                      collections=[summary_key])
     tf.summary.scalar("loss/kl_penalty", kl_penalty,
-                      collections=[inference._summary_key])
+                      collections=[summary_key])
     tf.summary.scalar("loss/reg_penalty", reg_penalty,
-                      collections=[inference._summary_key])
+                      collections=[summary_key])
 
   loss = -(tf.reduce_mean(p_log_lik) - kl_penalty - reg_penalty)
 
-  q_rvs = list(six.itervalues(inference.latent_vars))
+  q_rvs = list(six.itervalues(latent_vars))
   q_vars = [v for v in var_list
             if len(get_descendants(tf.convert_to_tensor(v), q_rvs)) != 0]
   q_grads = tf.gradients(
@@ -977,7 +905,8 @@ def build_score_kl_loss_and_gradients(inference, var_list):
   return loss, grads_and_vars
 
 
-def build_score_entropy_loss_and_gradients(inference, var_list):
+def build_score_entropy_loss_and_gradients(
+    latent_vars, data, var_list, scale, n_samples, summary_key):
   """Build loss function and gradients based on the score function
   estimator [@paisley2012variational].
 
@@ -986,15 +915,15 @@ def build_score_entropy_loss_and_gradients(inference, var_list):
   Computed by sampling from $q(z;\lambda)$ and evaluating the
   expectation using Monte Carlo sampling.
   """
-  p_log_prob = [0.0] * inference.n_samples
-  q_log_prob = [0.0] * inference.n_samples
+  p_log_prob = [0.0] * n_samples
+  q_log_prob = [0.0] * n_samples
   base_scope = tf.get_default_graph().unique_name("inference") + '/'
-  for s in range(inference.n_samples):
+  for s in range(n_samples):
     # Form dictionary in order to replace conditioning on prior or
     # observed variable with conditioning on a specific value.
     scope = base_scope + tf.get_default_graph().unique_name("sample")
     dict_swap = {}
-    for x, qx in six.iteritems(inference.data):
+    for x, qx in six.iteritems(data):
       if isinstance(x, RandomVariable):
         if isinstance(qx, RandomVariable):
           qx_copy = copy(qx, scope=scope)
@@ -1002,47 +931,47 @@ def build_score_entropy_loss_and_gradients(inference, var_list):
         else:
           dict_swap[x] = qx
 
-    for z, qz in six.iteritems(inference.latent_vars):
+    for z, qz in six.iteritems(latent_vars):
       # Copy q(z) to obtain new set of posterior samples.
       qz_copy = copy(qz, scope=scope)
       dict_swap[z] = qz_copy.value
       q_log_prob[s] += tf.reduce_sum(
-          inference.scale.get(z, 1.0) *
+          scale.get(z, 1.0) *
           qz_copy.log_prob(tf.stop_gradient(dict_swap[z])))
 
-    for z in six.iterkeys(inference.latent_vars):
+    for z in six.iterkeys(latent_vars):
       z_copy = copy(z, dict_swap, scope=scope)
       p_log_prob[s] += tf.reduce_sum(
-          inference.scale.get(z, 1.0) * z_copy.log_prob(dict_swap[z]))
+          scale.get(z, 1.0) * z_copy.log_prob(dict_swap[z]))
 
-    for x in six.iterkeys(inference.data):
+    for x in six.iterkeys(data):
       if isinstance(x, RandomVariable):
         x_copy = copy(x, dict_swap, scope=scope)
         p_log_prob[s] += tf.reduce_sum(
-            inference.scale.get(x, 1.0) * x_copy.log_prob(dict_swap[x]))
+            scale.get(x, 1.0) * x_copy.log_prob(dict_swap[x]))
 
   p_log_prob = tf.stack(p_log_prob)
   q_log_prob = tf.stack(q_log_prob)
 
   q_entropy = tf.reduce_sum([
       tf.reduce_sum(qz.entropy())
-      for z, qz in six.iteritems(inference.latent_vars)])
+      for z, qz in six.iteritems(latent_vars)])
 
   reg_penalty = tf.reduce_sum(tf.losses.get_regularization_losses())
 
-  if inference.logging:
+  if summary_key is not None:
     tf.summary.scalar("loss/p_log_prob", tf.reduce_mean(p_log_prob),
-                      collections=[inference._summary_key])
+                      collections=[summary_key])
     tf.summary.scalar("loss/q_log_prob", tf.reduce_mean(q_log_prob),
-                      collections=[inference._summary_key])
+                      collections=[summary_key])
     tf.summary.scalar("loss/q_entropy", q_entropy,
-                      collections=[inference._summary_key])
+                      collections=[summary_key])
     tf.summary.scalar("loss/reg_penalty", reg_penalty,
-                      collections=[inference._summary_key])
+                      collections=[summary_key])
 
   loss = -(tf.reduce_mean(p_log_prob) + q_entropy - reg_penalty)
 
-  q_rvs = list(six.itervalues(inference.latent_vars))
+  q_rvs = list(six.itervalues(latent_vars))
   q_vars = [v for v in var_list
             if len(get_descendants(tf.convert_to_tensor(v), q_rvs)) != 0]
   q_grads = tf.gradients(
@@ -1055,7 +984,8 @@ def build_score_entropy_loss_and_gradients(inference, var_list):
   return loss, grads_and_vars
 
 
-def build_score_rb_loss_and_gradients(inference, var_list):
+def build_score_rb_loss_and_gradients(
+    latent_vars, data, var_list, scale, n_samples, summary_key):
   """Build loss function and gradients based on the score function
   estimator [@paisley2012variational] and Rao-Blackwellization
   [@ranganath2014black].
@@ -1065,15 +995,15 @@ def build_score_rb_loss_and_gradients(inference, var_list):
   """
   # Build tensors for loss and gradient calculations. There is one set
   # for each sample from the variational distribution.
-  p_log_probs = [{}] * inference.n_samples
-  q_log_probs = [{}] * inference.n_samples
+  p_log_probs = [{}] * n_samples
+  q_log_probs = [{}] * n_samples
   base_scope = tf.get_default_graph().unique_name("inference") + '/'
-  for s in range(inference.n_samples):
+  for s in range(n_samples):
     # Form dictionary in order to replace conditioning on prior or
     # observed variable with conditioning on a specific value.
     scope = base_scope + tf.get_default_graph().unique_name("sample")
     dict_swap = {}
-    for x, qx in six.iteritems(inference.data):
+    for x, qx in six.iteritems(data):
       if isinstance(x, RandomVariable):
         if isinstance(qx, RandomVariable):
           qx_copy = copy(qx, scope=scope)
@@ -1081,30 +1011,30 @@ def build_score_rb_loss_and_gradients(inference, var_list):
         else:
           dict_swap[x] = qx
 
-    for z, qz in six.iteritems(inference.latent_vars):
+    for z, qz in six.iteritems(latent_vars):
       # Copy q(z) to obtain new set of posterior samples.
       qz_copy = copy(qz, scope=scope)
       dict_swap[z] = qz_copy.value
       q_log_probs[s][qz] = tf.reduce_sum(
-          inference.scale.get(z, 1.0) *
+          scale.get(z, 1.0) *
           qz_copy.log_prob(tf.stop_gradient(dict_swap[z])))
 
-    for z in six.iterkeys(inference.latent_vars):
+    for z in six.iterkeys(latent_vars):
       z_copy = copy(z, dict_swap, scope=scope)
       p_log_probs[s][z] = tf.reduce_sum(
-          inference.scale.get(z, 1.0) * z_copy.log_prob(dict_swap[z]))
+          scale.get(z, 1.0) * z_copy.log_prob(dict_swap[z]))
 
-    for x in six.iterkeys(inference.data):
+    for x in six.iterkeys(data):
       if isinstance(x, RandomVariable):
         x_copy = copy(x, dict_swap, scope=scope)
         p_log_probs[s][x] = tf.reduce_sum(
-            inference.scale.get(x, 1.0) * x_copy.log_prob(dict_swap[x]))
+            scale.get(x, 1.0) * x_copy.log_prob(dict_swap[x]))
 
   # Take gradients of Rao-Blackwellized loss for each variational parameter.
-  p_rvs = list(six.iterkeys(inference.latent_vars)) + \
-      [x for x in six.iterkeys(inference.data) if isinstance(x, RandomVariable)]
-  q_rvs = list(six.itervalues(inference.latent_vars))
-  reverse_latent_vars = {v: k for k, v in six.iteritems(inference.latent_vars)}
+  p_rvs = list(six.iterkeys(latent_vars)) + \
+      [x for x in six.iterkeys(data) if isinstance(x, RandomVariable)]
+  q_rvs = list(six.itervalues(latent_vars))
+  reverse_latent_vars = {v: k for k, v in six.iteritems(latent_vars)}
   grads = []
   grads_vars = []
   for var in var_list:
@@ -1122,9 +1052,9 @@ def build_score_rb_loss_and_gradients(inference, var_list):
     for qz in descendants:
       var_q_rvs.update(qz.get_blanket(q_rvs) + [qz])
 
-    pi_log_prob = [0.0] * inference.n_samples
-    qi_log_prob = [0.0] * inference.n_samples
-    for s in range(inference.n_samples):
+    pi_log_prob = [0.0] * n_samples
+    qi_log_prob = [0.0] * n_samples
+    for s in range(n_samples):
       pi_log_prob[s] = tf.reduce_sum([p_log_probs[s][rv] for rv in var_p_rvs])
       qi_log_prob[s] = tf.reduce_sum([q_log_probs[s][rv] for rv in var_q_rvs])
 
diff --git a/edward/inferences/laplace.py b/edward/inferences/laplace.py
index 67258ca4e..c44d2455c 100644
--- a/edward/inferences/laplace.py
+++ b/edward/inferences/laplace.py
@@ -5,7 +5,7 @@
 import six
 import tensorflow as tf
 
-from edward.inferences.map import MAP
+from edward.inferences.map import map
 from edward.models import PointMass, RandomVariable
 from edward.util import get_session, get_variables
 from edward.util import copy, transform
@@ -17,7 +17,8 @@
   raise ImportError("{0}. Your TensorFlow version is not supported.".format(e))
 
 
-class Laplace(MAP):
+def laplace(latent_vars=None, data=None,
+            auto_transform=True, scale=None, var_list=None, collections=None):
   """Laplace approximation [@laplace1986memoir].
 
   It approximates the posterior distribution using a multivariate
@@ -61,62 +62,63 @@ class Laplace(MAP):
   inference = ed.Laplace({w: qw}, data={X: X_train, y: y_train})
   ```
   """
-  def __init__(self, latent_vars, data=None):
-    """Create an inference algorithm.
-
-    Args:
-      latent_vars: list of RandomVariable or
-                   dict of RandomVariable to RandomVariable.
-        Collection of random variables to perform inference on. If list,
-        each random variable will be implictly optimized using a
-        `MultivariateNormalTriL` random variable that is defined
-        internally with unconstrained support and is initialized using
-        standard normal draws. If dictionary, each random
-        variable must be a `MultivariateNormalDiag`,
-        `MultivariateNormalTriL`, or `Normal` random variable.
+  """Create an inference algorithm.
+
+  Args:
+    latent_vars: list of RandomVariable or
+                 dict of RandomVariable to RandomVariable.
+      Collection of random variables to perform inference on. If list,
+      each random variable will be implicitly optimized using a
+      `MultivariateNormalTriL` random variable that is defined
+      internally with unconstrained support and is initialized using
+      standard normal draws. If dictionary, each random
+      variable must be a `MultivariateNormalDiag`,
+      `MultivariateNormalTriL`, or `Normal` random variable.
+  """
+  if isinstance(latent_vars, list):
+    with tf.variable_scope(None, default_name="posterior"):
+      latent_vars_dict = {}
+      for z in latent_vars:
+        # Define location to have constrained support and
+        # unconstrained free parameters.
+        batch_event_shape = z.batch_shape.concatenate(z.event_shape)
+        loc = tf.Variable(tf.random_normal(batch_event_shape))
+        if hasattr(z, 'support'):
+          z_transform = transform(z)
+          if hasattr(z_transform, 'bijector'):
+            loc = z_transform.bijector.inverse(loc)
+        scale_tril = tf.Variable(tf.random_normal(
+            batch_event_shape.concatenate(batch_event_shape[-1])))
+        qz = MultivariateNormalTriL(loc=loc, scale_tril=scale_tril)
+        latent_vars_dict[z] = qz
+      latent_vars = latent_vars_dict
+      del latent_vars_dict
+  elif isinstance(latent_vars, dict):
+    for qz in six.itervalues(latent_vars):
+      if not isinstance(
+              qz, (MultivariateNormalDiag, MultivariateNormalTriL, Normal)):
+        raise TypeError("Posterior approximation must consist of only "
+                        "MultivariateNormalDiag, MultivariateTriL, or "
+                        "Normal random variables.")
+
+  # Store latent variables in a temporary object; MAP will
+  # optimize `PointMass` random variables, which subsequently
+  # optimizes location parameters of the normal approximations.
+  latent_vars_normal = latent_vars.copy()
+  latent_vars = {z: PointMass(params=qz.loc)
+                 for z, qz in six.iteritems(latent_vars_normal)}
+
+  loss, grads_and_vars = map(
+      latent_vars, data,
+      auto_transform, scale, var_list, collections)
+  def _finalize(loss, latent_vars, latent_vars_normal):
+    """Function to call after convergence.
+
+    Computes the Hessian at the mode.
     """
-    if isinstance(latent_vars, list):
-      with tf.variable_scope(None, default_name="posterior"):
-        latent_vars_dict = {}
-        for z in latent_vars:
-          # Define location to have constrained support and
-          # unconstrained free parameters.
-          batch_event_shape = z.batch_shape.concatenate(z.event_shape)
-          loc = tf.Variable(tf.random_normal(batch_event_shape))
-          if hasattr(z, 'support'):
-            z_transform = transform(z)
-            if hasattr(z_transform, 'bijector'):
-              loc = z_transform.bijector.inverse(loc)
-          scale_tril = tf.Variable(tf.random_normal(
-              batch_event_shape.concatenate(batch_event_shape[-1])))
-          qz = MultivariateNormalTriL(loc=loc, scale_tril=scale_tril)
-          latent_vars_dict[z] = qz
-        latent_vars = latent_vars_dict
-        del latent_vars_dict
-    elif isinstance(latent_vars, dict):
-      for qz in six.itervalues(latent_vars):
-        if not isinstance(
-                qz, (MultivariateNormalDiag, MultivariateNormalTriL, Normal)):
-          raise TypeError("Posterior approximation must consist of only "
-                          "MultivariateNormalDiag, MultivariateTriL, or "
-                          "Normal random variables.")
-
-    # call grandparent's method; avoid parent (MAP)
-    super(MAP, self).__init__(latent_vars, data)
-
-  def initialize(self, *args, **kwargs):
-    # Store latent variables in a temporary object; MAP will
-    # optimize `PointMass` random variables, which subsequently
-    # optimizes location parameters of the normal approximations.
-    latent_vars_normal = self.latent_vars.copy()
-    self.latent_vars = {z: PointMass(params=qz.loc)
-                        for z, qz in six.iteritems(latent_vars_normal)}
-
-    super(Laplace, self).initialize(*args, **kwargs)
-
-    hessians = tf.hessians(self.loss, list(six.itervalues(self.latent_vars)))
-    self.finalize_ops = []
-    for z, hessian in zip(six.iterkeys(self.latent_vars), hessians):
+    hessians = tf.hessians(loss, list(six.itervalues(latent_vars)))
+    finalize_ops = []
+    for z, hessian in zip(six.iterkeys(latent_vars), hessians):
       qz = latent_vars_normal[z]
       if isinstance(qz, (MultivariateNormalDiag, Normal)):
         scale_var = get_variables(qz.variance())[0]
@@ -125,29 +127,7 @@ def initialize(self, *args, **kwargs):
         scale_var = get_variables(qz.covariance())[0]
         scale = tf.matrix_inverse(tf.cholesky(hessian))
 
-      self.finalize_ops.append(scale_var.assign(scale))
-
-    self.latent_vars = latent_vars_normal.copy()
-    del latent_vars_normal
-
-  def finalize(self, feed_dict=None):
-    """Function to call after convergence.
-
-    Computes the Hessian at the mode.
-
-    Args:
-      feed_dict: dict.
-        Feed dictionary for a TensorFlow session run during evaluation
-        of Hessian. It is used to feed placeholders that are not fed
-        during initialization.
-    """
-    if feed_dict is None:
-      feed_dict = {}
-
-    for key, value in six.iteritems(self.data):
-      if isinstance(key, tf.Tensor) and "Placeholder" in key.op.type:
-        feed_dict[key] = value
-
-    sess = get_session()
-    sess.run(self.finalize_ops, feed_dict)
-    super(Laplace, self).finalize()
+      finalize_ops.append(scale_var.assign(scale))
+    return tf.group(*finalize_ops)
+  finalize_op = _finalize(loss, latent_vars, latent_vars_normal)
+  return loss, grads_and_vars, finalize_op
diff --git a/edward/inferences/map.py b/edward/inferences/map.py
index fb0bf1d82..593f3efbc 100644
--- a/edward/inferences/map.py
+++ b/edward/inferences/map.py
@@ -5,7 +5,8 @@
 import six
 import tensorflow as tf
 
-from edward.inferences.variational_inference import VariationalInference
+from edward.inferences.inference import (check_and_maybe_build_data,
+    check_and_maybe_build_latent_vars, transform, check_and_maybe_build_dict, check_and_maybe_build_var_list)
 from edward.models import RandomVariable, PointMass
 from edward.util import copy, transform
 
@@ -15,7 +16,8 @@
   raise ImportError("{0}. Your TensorFlow version is not supported.".format(e))
 
 
-class MAP(VariationalInference):
+def map(latent_vars=None, data=None,
+        auto_transform=True, scale=None, var_list=None, collections=None):
   """Maximum a posteriori.
 
   This class implements gradient-based optimization to solve the
@@ -75,79 +77,80 @@ class MAP(VariationalInference):
   The objective function also adds to itself a summation over all
   tensors in the `REGULARIZATION_LOSSES` collection.
   """
-  def __init__(self, latent_vars=None, data=None):
-    """Create an inference algorithm.
-
-    Args:
-      latent_vars: list of RandomVariable or
-                   dict of RandomVariable to RandomVariable.
-        Collection of random variables to perform inference on. If
-        list, each random variable will be implictly optimized using a
-        `PointMass` random variable that is defined internally with
-        constrained support, has unconstrained free parameters, and is
-        initialized using standard normal draws. If dictionary, each
-        value in the dictionary must be a `PointMass` random variable
-        with the same support as the key.
-    """
-    if isinstance(latent_vars, list):
-      with tf.variable_scope(None, default_name="posterior"):
-        latent_vars_dict = {}
-        for z in latent_vars:
-          # Define point masses to have constrained support and
-          # unconstrained free parameters.
-          batch_event_shape = z.batch_shape.concatenate(z.event_shape)
-          params = tf.Variable(tf.random_normal(batch_event_shape))
-          if hasattr(z, 'support'):
-            z_transform = transform(z)
-            if hasattr(z_transform, 'bijector'):
-              params = z_transform.bijector.inverse(params)
-          latent_vars_dict[z] = PointMass(params=params)
-        latent_vars = latent_vars_dict
-        del latent_vars_dict
-    elif isinstance(latent_vars, dict):
-      for qz in six.itervalues(latent_vars):
-        if not isinstance(qz, PointMass):
-          raise TypeError("Posterior approximation must consist of only "
-                          "PointMass random variables.")
-
-    super(MAP, self).__init__(latent_vars, data)
-
-  def build_loss_and_gradients(self, var_list):
-    """Build loss function. Its automatic differentiation
-    is the gradient of
-
-    $- \log p(x,z).$
-    """
-    # Form dictionary in order to replace conditioning on prior or
-    # observed variable with conditioning on a specific value.
-    scope = tf.get_default_graph().unique_name("inference")
-    dict_swap = {z: qz.value
-                 for z, qz in six.iteritems(self.latent_vars)}
-    for x, qx in six.iteritems(self.data):
-      if isinstance(x, RandomVariable):
-        if isinstance(qx, RandomVariable):
-          dict_swap[x] = qx.value
-        else:
-          dict_swap[x] = qx
-
-    p_log_prob = 0.0
-    for z in six.iterkeys(self.latent_vars):
-      z_copy = copy(z, dict_swap, scope=scope)
+  """Create an inference algorithm.
+
+  Args:
+    latent_vars: list of RandomVariable or
+                 dict of RandomVariable to RandomVariable.
+      Collection of random variables to perform inference on. If
+      list, each random variable will be implicitly optimized using a
+      `PointMass` random variable that is defined internally with
+      constrained support, has unconstrained free parameters, and is
+      initialized using standard normal draws. If dictionary, each
+      value in the dictionary must be a `PointMass` random variable
+      with the same support as the key.
+  """
+  """Build loss function. Its automatic differentiation
+  is the gradient of
+
+  $- \log p(x,z).$
+  """
+  if isinstance(latent_vars, list):
+    with tf.variable_scope(None, default_name="posterior"):
+      latent_vars_dict = {}
+      for z in latent_vars:
+        # Define point masses to have constrained support and
+        # unconstrained free parameters.
+        batch_event_shape = z.batch_shape.concatenate(z.event_shape)
+        params = tf.Variable(tf.random_normal(batch_event_shape))
+        if hasattr(z, 'support'):
+          z_transform = transform(z)
+          if hasattr(z_transform, 'bijector'):
+            params = z_transform.bijector.inverse(params)
+        latent_vars_dict[z] = PointMass(params=params)
+      latent_vars = latent_vars_dict
+      del latent_vars_dict
+  elif isinstance(latent_vars, dict):
+    for qz in six.itervalues(latent_vars):
+      if not isinstance(qz, PointMass):
+        raise TypeError("Posterior approximation must consist of only "
+                        "PointMass random variables.")
+  latent_vars = check_and_maybe_build_latent_vars(latent_vars)
+  data = check_and_maybe_build_data(data)
+  latent_vars, _ = transform(latent_vars, auto_transform)
+  scale = check_and_maybe_build_dict(scale)
+  var_list = check_and_maybe_build_var_list(var_list, latent_vars, data)
+
+  # Form dictionary in order to replace conditioning on prior or
+  # observed variable with conditioning on a specific value.
+  scope = tf.get_default_graph().unique_name("inference")
+  dict_swap = {z: qz.value
+               for z, qz in six.iteritems(latent_vars)}
+  for x, qx in six.iteritems(data):
+    if isinstance(x, RandomVariable):
+      if isinstance(qx, RandomVariable):
+        dict_swap[x] = qx.value
+      else:
+        dict_swap[x] = qx
+
+  p_log_prob = 0.0
+  for z in six.iterkeys(latent_vars):
+    z_copy = copy(z, dict_swap, scope=scope)
+    p_log_prob += tf.reduce_sum(
+        scale.get(z, 1.0) * z_copy.log_prob(dict_swap[z]))
+
+  for x in six.iterkeys(data):
+    if isinstance(x, RandomVariable):
+      if dict_swap:
+        x_copy = copy(x, dict_swap, scope=scope)
+      else:
+        x_copy = x
       p_log_prob += tf.reduce_sum(
-          self.scale.get(z, 1.0) * z_copy.log_prob(dict_swap[z]))
-
-    for x in six.iterkeys(self.data):
-      if isinstance(x, RandomVariable):
-        if dict_swap:
-          x_copy = copy(x, dict_swap, scope=scope)
-        else:
-          x_copy = x
-        p_log_prob += tf.reduce_sum(
-            self.scale.get(x, 1.0) * x_copy.log_prob(dict_swap[x]))
-
-    reg_penalty = tf.reduce_sum(tf.losses.get_regularization_losses())
-    loss = -p_log_prob + reg_penalty
-
-    grads = tf.gradients(loss, var_list)
-    grads_and_vars = list(zip(grads, var_list))
-    return loss, grads_and_vars
+          scale.get(x, 1.0) * x_copy.log_prob(dict_swap[x]))
+
+  reg_penalty = tf.reduce_sum(tf.losses.get_regularization_losses())
+  loss = -p_log_prob + reg_penalty
+
+  grads = tf.gradients(loss, var_list)
+  grads_and_vars = list(zip(grads, var_list))
+  return loss, grads_and_vars
diff --git a/edward/inferences/metropolis_hastings.py b/edward/inferences/metropolis_hastings.py
index d57cff698..69c1a5307 100644
--- a/edward/inferences/metropolis_hastings.py
+++ b/edward/inferences/metropolis_hastings.py
@@ -8,7 +8,7 @@
 from collections import OrderedDict
 from edward.inferences.monte_carlo import MonteCarlo
 from edward.models import RandomVariable
-from edward.util import check_latent_vars, copy
+from edward.util import check_and_maybe_build_latent_vars, copy
 
 
 class MetropolisHastings(MonteCarlo):
@@ -53,7 +53,7 @@ def __init__(self, latent_vars, proposal_vars, data=None):
         Collection of random variables to perform inference on; each is
         binded to a proposal distribution $g(z' \mid z)$.
     """
-    check_latent_vars(proposal_vars)
+    proposal_vars = check_and_maybe_build_latent_vars(proposal_vars)
     self.proposal_vars = proposal_vars
     super(MetropolisHastings, self).__init__(latent_vars, data)
 
diff --git a/edward/inferences/monte_carlo.py b/edward/inferences/monte_carlo.py
index da22cb285..d5ef6a2fe 100644
--- a/edward/inferences/monte_carlo.py
+++ b/edward/inferences/monte_carlo.py
@@ -2,17 +2,14 @@
 from __future__ import division
 from __future__ import print_function
 
-import abc
 import numpy as np
 import six
 import tensorflow as tf
 
-from edward.inferences.inference import Inference
 from edward.models import Empirical, RandomVariable
 from edward.util import get_session
 
 
-@six.add_metaclass(abc.ABCMeta)
 class MonteCarlo(Inference):
   """Abstract base class for Monte Carlo. Specific Monte Carlo methods
   inherit from `MonteCarlo`, sharing methods in this class.
@@ -58,38 +55,35 @@ class MonteCarlo(Inference):
   It defaults to `Empirical` random variables with 10,000 samples for
   each dimension.
   """
-  def __init__(self, latent_vars=None, data=None):
-    """Create an inference algorithm.
-
-    Args:
-      latent_vars: list or dict.
-        Collection of random variables (of type `RandomVariable` or
-        `tf.Tensor`) to perform inference on. If list, each random
-        variable will be approximated using a `Empirical` random
-        variable that is defined internally (with unconstrained
-        support). If dictionary, each value in the dictionary must be a
-        `Empirical` random variable.
-      data: dict.
-        Data dictionary which binds observed variables (of type
-        `RandomVariable` or `tf.Tensor`) to their realizations (of
-        type `tf.Tensor`). It can also bind placeholders (of type
-        `tf.Tensor`) used in the model to their realizations.
-    """
-    if isinstance(latent_vars, list):
-      with tf.variable_scope(None, default_name="posterior"):
-        latent_vars = {z: Empirical(params=tf.Variable(tf.zeros(
-            [1e4] + z.batch_shape.concatenate(z.event_shape).as_list())))
-            for z in latent_vars}
-    elif isinstance(latent_vars, dict):
-      for qz in six.itervalues(latent_vars):
-        if not isinstance(qz, Empirical):
-          raise TypeError("Posterior approximation must consist of only "
-                          "Empirical random variables.")
-        elif len(qz.sample_shape) != 0:
-          raise ValueError("Empirical posterior approximations must have "
-                           "a scalar sample shape.")
-
-    super(MonteCarlo, self).__init__(latent_vars, data)
+  """Create an inference algorithm.
+
+  Args:
+    latent_vars: list or dict, optional.
+      Collection of random variables (of type `RandomVariable` or
+      `tf.Tensor`) to perform inference on. If list, each random
+      variable will be approximated using an `Empirical` random
+      variable that is defined internally (with unconstrained
+      support). If dictionary, each value in the dictionary must be a
+      `Empirical` random variable.
+    data: dict, optional.
+      Data dictionary which binds observed variables (of type
+      `RandomVariable` or `tf.Tensor`) to their realizations (of
+      type `tf.Tensor`). It can also bind placeholders (of type
+      `tf.Tensor`) used in the model to their realizations.
+  """
+  if isinstance(latent_vars, list):
+    with tf.variable_scope(None, default_name="posterior"):
+      latent_vars = {z: Empirical(params=tf.Variable(tf.zeros(
+          [1e4] + z.batch_shape.concatenate(z.event_shape).as_list())))
+          for z in latent_vars}
+  elif isinstance(latent_vars, dict):
+    for qz in six.itervalues(latent_vars):
+      if not isinstance(qz, Empirical):
+        raise TypeError("Posterior approximation must consist of only "
+                        "Empirical random variables.")
+      elif len(qz.sample_shape) != 0:
+        raise ValueError("Empirical posterior approximations must have "
+                         "a scalar sample shape.")
 
   def initialize(self, *args, **kwargs):
     kwargs['n_iter'] = np.amin([qz.params.shape.as_list()[0] for
@@ -106,64 +100,3 @@ def initialize(self, *args, **kwargs):
       tf.summary.scalar("n_accept", self.n_accept,
                         collections=[self._summary_key])
       self.summarize = tf.summary.merge_all(key=self._summary_key)
-
-  def update(self, feed_dict=None):
-    """Run one iteration of sampling.
-
-    Args:
-      feed_dict: dict.
-        Feed dictionary for a TensorFlow session run. It is used to feed
-        placeholders that are not fed during initialization.
-
-    Returns:
-      dict.
-      Dictionary of algorithm-specific information. In this case, the
-      acceptance rate of samples since (and including) this iteration.
-
-    #### Notes
-
-    We run the increment of `t` separately from other ops. Whether the
-    others op run with the `t` before incrementing or after incrementing
-    depends on which is run faster in the TensorFlow graph. Running it
-    separately forces a consistent behavior.
-    """
-    if feed_dict is None:
-      feed_dict = {}
-
-    for key, value in six.iteritems(self.data):
-      if isinstance(key, tf.Tensor) and "Placeholder" in key.op.type:
-        feed_dict[key] = value
-
-    sess = get_session()
-    _, accept_rate = sess.run([self.train, self.n_accept_over_t], feed_dict)
-    t = sess.run(self.increment_t)
-
-    if self.debug:
-      sess.run(self.op_check, feed_dict)
-
-    if self.logging and self.n_print != 0:
-      if t == 1 or t % self.n_print == 0:
-        summary = sess.run(self.summarize, feed_dict)
-        self.train_writer.add_summary(summary, t)
-
-    return {'t': t, 'accept_rate': accept_rate}
-
-  def print_progress(self, info_dict):
-    """Print progress to output.
-    """
-    if self.n_print != 0:
-      t = info_dict['t']
-      if t == 1 or t % self.n_print == 0:
-        self.progbar.update(t, {'Acceptance Rate': info_dict['accept_rate']})
-
-  @abc.abstractmethod
-  def build_update(self):
-    """Build update rules, returning an assign op for parameters in
-    the `Empirical` random variables.
-
-    Any derived class of `MonteCarlo` **must** implement this method.
-
-    Raises:
-      NotImplementedError.
-    """
-    raise NotImplementedError()
diff --git a/edward/inferences/variational_inference.py b/edward/inferences/variational_inference.py
deleted file mode 100644
index f6e8c244d..000000000
--- a/edward/inferences/variational_inference.py
+++ /dev/null
@@ -1,185 +0,0 @@
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import abc
-import numpy as np
-import six
-import tensorflow as tf
-
-from edward.inferences.inference import Inference
-from edward.models import RandomVariable
-from edward.util import get_session, get_variables
-
-
-@six.add_metaclass(abc.ABCMeta)
-class VariationalInference(Inference):
-  """Abstract base class for variational inference. Specific
-  variational inference methods inherit from `VariationalInference`,
-  sharing methods such as a default optimizer.
-
-  To build an algorithm inheriting from `VariationalInference`, one
-  must at the minimum implement `build_loss_and_gradients`: it
-  determines the loss function and gradients to apply for a given
-  optimizer.
-  """
-  def __init__(self, *args, **kwargs):
-    super(VariationalInference, self).__init__(*args, **kwargs)
-
-  def initialize(self, optimizer=None, var_list=None, use_prettytensor=False,
-                 global_step=None, *args, **kwargs):
-    """Initialize inference algorithm. It initializes hyperparameters
-    and builds ops for the algorithm's computation graph.
-
-    Args:
-      optimizer: str or tf.train.Optimizer.
-        A TensorFlow optimizer, to use for optimizing the variational
-        objective. Alternatively, one can pass in the name of a
-        TensorFlow optimizer, and default parameters for the optimizer
-        will be used.
-      var_list: list of tf.Variable.
-        List of TensorFlow variables to optimize over. Default is all
-        trainable variables that `latent_vars` and `data` depend on,
-        excluding those that are only used in conditionals in `data`.
-      use_prettytensor: bool.
-        `True` if aim to use PrettyTensor optimizer (when using
-        PrettyTensor) or `False` if aim to use TensorFlow optimizer.
-        Defaults to TensorFlow.
-      global_step: tf.Variable.
-        A TensorFlow variable to hold the global step.
-    """
-    super(VariationalInference, self).initialize(*args, **kwargs)
-
-    if var_list is None:
-      # Traverse random variable graphs to get default list of variables.
-      var_list = set()
-      trainables = tf.trainable_variables()
-      for z, qz in six.iteritems(self.latent_vars):
-        var_list.update(get_variables(z, collection=trainables))
-        var_list.update(get_variables(qz, collection=trainables))
-
-      for x, qx in six.iteritems(self.data):
-        if isinstance(x, RandomVariable) and \
-                not isinstance(qx, RandomVariable):
-          var_list.update(get_variables(x, collection=trainables))
-
-      var_list = list(var_list)
-
-    self.loss, grads_and_vars = self.build_loss_and_gradients(var_list)
-
-    if self.logging:
-      tf.summary.scalar("loss", self.loss, collections=[self._summary_key])
-      for grad, var in grads_and_vars:
-        # replace colons which are an invalid character
-        tf.summary.histogram("gradient/" +
-                             var.name.replace(':', '/'),
-                             grad, collections=[self._summary_key])
-        tf.summary.scalar("gradient_norm/" +
-                          var.name.replace(':', '/'),
-                          tf.norm(grad), collections=[self._summary_key])
-
-      self.summarize = tf.summary.merge_all(key=self._summary_key)
-
-    if optimizer is None and global_step is None:
-      # Default optimizer always uses a global step variable.
-      global_step = tf.Variable(0, trainable=False, name="global_step")
-
-    if isinstance(global_step, tf.Variable):
-      starter_learning_rate = 0.1
-      learning_rate = tf.train.exponential_decay(starter_learning_rate,
-                                                 global_step,
-                                                 100, 0.9, staircase=True)
-    else:
-      learning_rate = 0.01
-
-    # Build optimizer.
-    if optimizer is None:
-      optimizer = tf.train.AdamOptimizer(learning_rate)
-    elif isinstance(optimizer, str):
-      if optimizer == 'gradientdescent':
-        optimizer = tf.train.GradientDescentOptimizer(learning_rate)
-      elif optimizer == 'adadelta':
-        optimizer = tf.train.AdadeltaOptimizer(learning_rate)
-      elif optimizer == 'adagrad':
-        optimizer = tf.train.AdagradOptimizer(learning_rate)
-      elif optimizer == 'momentum':
-        optimizer = tf.train.MomentumOptimizer(learning_rate, 0.9)
-      elif optimizer == 'adam':
-        optimizer = tf.train.AdamOptimizer(learning_rate)
-      elif optimizer == 'ftrl':
-        optimizer = tf.train.FtrlOptimizer(learning_rate)
-      elif optimizer == 'rmsprop':
-        optimizer = tf.train.RMSPropOptimizer(learning_rate)
-      else:
-        raise ValueError('Optimizer class not found:', optimizer)
-    elif not isinstance(optimizer, tf.train.Optimizer):
-      raise TypeError("Optimizer must be str, tf.train.Optimizer, or None.")
-
-    with tf.variable_scope(None, default_name="optimizer") as scope:
-      if not use_prettytensor:
-        self.train = optimizer.apply_gradients(grads_and_vars,
-                                               global_step=global_step)
-      else:
-        import prettytensor as pt
-        # Note PrettyTensor optimizer does not accept manual updates;
-        # it autodiffs the loss directly.
-        self.train = pt.apply_optimizer(optimizer, losses=[self.loss],
-                                        global_step=global_step,
-                                        var_list=var_list)
-
-    self.reset.append(tf.variables_initializer(
-        tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=scope.name)))
-
-  def update(self, feed_dict=None):
-    """Run one iteration of optimization.
-
-    Args:
-      feed_dict: dict.
-        Feed dictionary for a TensorFlow session run. It is used to feed
-        placeholders that are not fed during initialization.
-
-    Returns:
-      dict.
-      Dictionary of algorithm-specific information. In this case, the
-      loss function value after one iteration.
-    """
-    if feed_dict is None:
-      feed_dict = {}
-
-    for key, value in six.iteritems(self.data):
-      if isinstance(key, tf.Tensor) and "Placeholder" in key.op.type:
-        feed_dict[key] = value
-
-    sess = get_session()
-    _, t, loss = sess.run([self.train, self.increment_t, self.loss], feed_dict)
-
-    if self.debug:
-      sess.run(self.op_check, feed_dict)
-
-    if self.logging and self.n_print != 0:
-      if t == 1 or t % self.n_print == 0:
-        summary = sess.run(self.summarize, feed_dict)
-        self.train_writer.add_summary(summary, t)
-
-    return {'t': t, 'loss': loss}
-
-  def print_progress(self, info_dict):
-    """Print progress to output.
-    """
-    if self.n_print != 0:
-      t = info_dict['t']
-      if t == 1 or t % self.n_print == 0:
-        self.progbar.update(t, {'Loss': info_dict['loss']})
-
-  @abc.abstractmethod
-  def build_loss_and_gradients(self, var_list):
-    """Build loss function and its gradients. They will be leveraged
-    in an optimizer to update the model and variational parameters.
-
-    Any derived class of `VariationalInference` **must** implement
-    this method.
-
-    Raises:
-      NotImplementedError.
-    """
-    raise NotImplementedError()
diff --git a/edward/inferences/wake_sleep.py b/edward/inferences/wake_sleep.py
index 0a6b41350..4e98fa897 100644
--- a/edward/inferences/wake_sleep.py
+++ b/edward/inferences/wake_sleep.py
@@ -5,12 +5,14 @@
 import six
 import tensorflow as tf
 
-from edward.inferences.variational_inference import VariationalInference
+from edward.inferences.inference import (check_and_maybe_build_data,
+    check_and_maybe_build_latent_vars, transform, check_and_maybe_build_dict, check_and_maybe_build_var_list)
 from edward.models import RandomVariable
 from edward.util import copy, get_descendants
 
 
-class WakeSleep(VariationalInference):
+def wake_sleep(latent_vars=None, data=None, n_samples=1, phase_q='sleep',
+               auto_transform=True, scale=None, var_list=None, collections=None):
   """Wake-Sleep algorithm [@hinton1995wake].
 
   Given a probability model $p(x, z; \\theta)$ and variational
@@ -55,101 +57,96 @@ class WakeSleep(VariationalInference):
   The objective function also adds to itself a summation over all
   tensors in the `REGULARIZATION_LOSSES` collection.
   """
-  def __init__(self, *args, **kwargs):
-    super(WakeSleep, self).__init__(*args, **kwargs)
-
-  def initialize(self, n_samples=1, phase_q='sleep', *args, **kwargs):
-    """Initialize inference algorithm. It initializes hyperparameters
-    and builds ops for the algorithm's computation graph.
-
-    Args:
-      n_samples: int.
-        Number of samples for calculating stochastic gradients during
-        wake and sleep phases.
-      phase_q: str.
-        Phase for updating parameters of q. If 'sleep', update using
-        a sample from p. If 'wake', update using a sample from q.
-        (Unlike reparameterization gradients, the sample is held
-        fixed.)
-    """
-    self.n_samples = n_samples
-    self.phase_q = phase_q
-    return super(WakeSleep, self).initialize(*args, **kwargs)
-
-  def build_loss_and_gradients(self, var_list):
-    p_log_prob = [0.0] * self.n_samples
-    q_log_prob = [0.0] * self.n_samples
-    base_scope = tf.get_default_graph().unique_name("inference") + '/'
-    for s in range(self.n_samples):
-      # Form dictionary in order to replace conditioning on prior or
-      # observed variable with conditioning on a specific value.
-      scope = base_scope + tf.get_default_graph().unique_name("q_sample")
-      dict_swap = {}
-      for x, qx in six.iteritems(self.data):
-        if isinstance(x, RandomVariable):
-          if isinstance(qx, RandomVariable):
-            qx_copy = copy(qx, scope=scope)
-            dict_swap[x] = qx_copy.value
-          else:
-            dict_swap[x] = qx
-
-      # Sample z ~ q(z), then compute log p(x, z).
-      q_dict_swap = dict_swap.copy()
-      for z, qz in six.iteritems(self.latent_vars):
-        # Copy q(z) to obtain new set of posterior samples.
-        qz_copy = copy(qz, scope=scope)
-        q_dict_swap[z] = qz_copy.value
-        if self.phase_q != 'sleep':
-          # If not sleep phase, compute log q(z).
-          q_log_prob[s] += tf.reduce_sum(
-              self.scale.get(z, 1.0) *
-              qz_copy.log_prob(tf.stop_gradient(q_dict_swap[z])))
-
-      for z in six.iterkeys(self.latent_vars):
-        z_copy = copy(z, q_dict_swap, scope=scope)
+  """
+  Args:
+    n_samples: int, optional.
+      Number of samples for calculating stochastic gradients during
+      wake and sleep phases.
+    phase_q: str, optional.
+      Phase for updating parameters of q. If 'sleep', update using
+      a sample from p. If 'wake', update using a sample from q.
+      (Unlike reparameterization gradients, the sample is held
+      fixed.)
+  """
+  latent_vars = check_and_maybe_build_latent_vars(latent_vars)
+  data = check_and_maybe_build_data(data)
+  latent_vars, _ = transform(latent_vars, auto_transform)
+  scale = check_and_maybe_build_dict(scale)
+  var_list = check_and_maybe_build_var_list(var_list, latent_vars, data)
+
+  p_log_prob = [0.0] * n_samples
+  q_log_prob = [0.0] * n_samples
+  base_scope = tf.get_default_graph().unique_name("inference") + '/'
+  for s in range(n_samples):
+    # Form dictionary in order to replace conditioning on prior or
+    # observed variable with conditioning on a specific value.
+    scope = base_scope + tf.get_default_graph().unique_name("q_sample")
+    dict_swap = {}
+    for x, qx in six.iteritems(data):
+      if isinstance(x, RandomVariable):
+        if isinstance(qx, RandomVariable):
+          qx_copy = copy(qx, scope=scope)
+          dict_swap[x] = qx_copy.value
+        else:
+          dict_swap[x] = qx
+
+    # Sample z ~ q(z), then compute log p(x, z).
+    q_dict_swap = dict_swap.copy()
+    for z, qz in six.iteritems(latent_vars):
+      # Copy q(z) to obtain new set of posterior samples.
+      qz_copy = copy(qz, scope=scope)
+      q_dict_swap[z] = qz_copy.value
+      if phase_q != 'sleep':
+        # If not sleep phase, compute log q(z).
+        q_log_prob[s] += tf.reduce_sum(
+            scale.get(z, 1.0) *
+            qz_copy.log_prob(tf.stop_gradient(q_dict_swap[z])))
+
+    for z in six.iterkeys(latent_vars):
+      z_copy = copy(z, q_dict_swap, scope=scope)
+      p_log_prob[s] += tf.reduce_sum(
+          scale.get(z, 1.0) * z_copy.log_prob(q_dict_swap[z]))
+
+    for x in six.iterkeys(data):
+      if isinstance(x, RandomVariable):
+        x_copy = copy(x, q_dict_swap, scope=scope)
         p_log_prob[s] += tf.reduce_sum(
-            self.scale.get(z, 1.0) * z_copy.log_prob(q_dict_swap[z]))
-
-      for x in six.iterkeys(self.data):
-        if isinstance(x, RandomVariable):
-          x_copy = copy(x, q_dict_swap, scope=scope)
-          p_log_prob[s] += tf.reduce_sum(
-              self.scale.get(x, 1.0) * x_copy.log_prob(q_dict_swap[x]))
-
-      if self.phase_q == 'sleep':
-        # Sample z ~ p(z), then compute log q(z).
-        scope = base_scope + tf.get_default_graph().unique_name("p_sample")
-        p_dict_swap = dict_swap.copy()
-        for z, qz in six.iteritems(self.latent_vars):
-          # Copy p(z) to obtain new set of prior samples.
-          z_copy = copy(z, scope=scope)
-          p_dict_swap[qz] = z_copy.value
-        for qz in six.itervalues(self.latent_vars):
-          qz_copy = copy(qz, p_dict_swap, scope=scope)
-          q_log_prob[s] += tf.reduce_sum(
-              self.scale.get(z, 1.0) *
-              qz_copy.log_prob(tf.stop_gradient(p_dict_swap[qz])))
-
-    p_log_prob = tf.reduce_mean(p_log_prob)
-    q_log_prob = tf.reduce_mean(q_log_prob)
-    reg_penalty = tf.reduce_sum(tf.losses.get_regularization_losses())
-
-    if self.logging:
-      tf.summary.scalar("loss/p_log_prob", p_log_prob,
-                        collections=[self._summary_key])
-      tf.summary.scalar("loss/q_log_prob", q_log_prob,
-                        collections=[self._summary_key])
-      tf.summary.scalar("loss/reg_penalty", reg_penalty,
-                        collections=[self._summary_key])
-
-    loss_p = -p_log_prob + reg_penalty
-    loss_q = -q_log_prob + reg_penalty
-
-    q_rvs = list(six.itervalues(self.latent_vars))
-    q_vars = [v for v in var_list
-              if len(get_descendants(tf.convert_to_tensor(v), q_rvs)) != 0]
-    q_grads = tf.gradients(loss_q, q_vars)
-    p_vars = [v for v in var_list if v not in q_vars]
-    p_grads = tf.gradients(loss_p, p_vars)
-    grads_and_vars = list(zip(q_grads, q_vars)) + list(zip(p_grads, p_vars))
-    return loss_p, grads_and_vars
+            scale.get(x, 1.0) * x_copy.log_prob(q_dict_swap[x]))
+
+    if phase_q == 'sleep':
+      # Sample z ~ p(z), then compute log q(z).
+      scope = base_scope + tf.get_default_graph().unique_name("p_sample")
+      p_dict_swap = dict_swap.copy()
+      for z, qz in six.iteritems(latent_vars):
+        # Copy p(z) to obtain new set of prior samples.
+        z_copy = copy(z, scope=scope)
+        p_dict_swap[qz] = z_copy.value
+      for qz in six.itervalues(latent_vars):
+        qz_copy = copy(qz, p_dict_swap, scope=scope)
+        q_log_prob[s] += tf.reduce_sum(
+            scale.get(z, 1.0) *
+            qz_copy.log_prob(tf.stop_gradient(p_dict_swap[qz])))
+
+  p_log_prob = tf.reduce_mean(p_log_prob)
+  q_log_prob = tf.reduce_mean(q_log_prob)
+  reg_penalty = tf.reduce_sum(tf.losses.get_regularization_losses())
+
+  if collections is not None:
+    tf.summary.scalar("loss/p_log_prob", p_log_prob,
+                      collections=collections)
+    tf.summary.scalar("loss/q_log_prob", q_log_prob,
+                      collections=collections)
+    tf.summary.scalar("loss/reg_penalty", reg_penalty,
+                      collections=collections)
+
+  loss_p = -p_log_prob + reg_penalty
+  loss_q = -q_log_prob + reg_penalty
+
+  q_rvs = list(six.itervalues(latent_vars))
+  q_vars = [v for v in var_list
+            if len(get_descendants(tf.convert_to_tensor(v), q_rvs)) != 0]
+  q_grads = tf.gradients(loss_q, q_vars)
+  p_vars = [v for v in var_list if v not in q_vars]
+  p_grads = tf.gradients(loss_p, p_vars)
+  grads_and_vars = list(zip(q_grads, q_vars)) + list(zip(p_grads, p_vars))
+  return loss_p, grads_and_vars
diff --git a/edward/inferences/wgan_inference.py b/edward/inferences/wgan_inference.py
index 7f67ab75b..1c850cf2e 100644
--- a/edward/inferences/wgan_inference.py
+++ b/edward/inferences/wgan_inference.py
@@ -5,11 +5,14 @@
 import six
 import tensorflow as tf
 
-from edward.inferences.gan_inference import GANInference
+from edward.inferences.inference import (check_and_maybe_build_data,
+    transform, check_and_maybe_build_dict, check_and_maybe_build_var_list)
 from edward.util import get_session
 
 
-class WGANInference(GANInference):
+def wgan_inference(data=None, discriminator=None,
+                   penalty=10.0,
+                   scale=None, var_list=None, collections=None):
   """Parameter estimation with GAN-style training
   [@goodfellow2014generative], using the Wasserstein distance
   [@arjovsky2017wasserstein].
@@ -18,6 +21,18 @@ class WGANInference(GANInference):
   models. These models do not require a tractable density and assume
   only a program that generates samples.
 
+  The original WGAN clips weight parameters of the discriminator as an
+  approximation to the 1-Lipschitz constraint. To clip weights, one
+  must manually add a clipping op and then call it after each gradient
+  update during training. For example:
+
+  ```python
+  ... = wgan_inference(data, discriminator, penalty=None)
+  var_list = tf.get_collection(
+      tf.GraphKeys.TRAINABLE_VARIABLES, scope="Disc")
+  clip_op = [w.assign(tf.clip_by_value(w, -0.1, 0.1)) for w in var_list]
+  ```
+
   #### Notes
 
   Argument-wise, the only difference from `GANInference` is
@@ -38,79 +53,58 @@ class WGANInference(GANInference):
   inference = ed.WGANInference({x: x_data}, discriminator)
   ```
   """
-  def __init__(self, *args, **kwargs):
-    super(WGANInference, self).__init__(*args, **kwargs)
-
-  def initialize(self, penalty=10.0, clip=None, *args, **kwargs):
-    """Initialize inference algorithm. It initializes hyperparameters
-    and builds ops for the algorithm's computation graph.
-
-    Args:
-      penalty: float.
-        Scalar value to enforce gradient penalty that ensures the
-        gradients have norm equal to 1 [@gulrajani2017improved]. Set to
-        None (or 0.0) if using no penalty.
-      clip: float.
-        Value to clip weights by. Default is no clipping.
-    """
-    self.penalty = penalty
-
-    super(WGANInference, self).initialize(*args, **kwargs)
-
-    self.clip_op = None
-    if clip is not None:
-      var_list = tf.get_collection(
-          tf.GraphKeys.TRAINABLE_VARIABLES, scope="Disc")
-      self.clip_op = [w.assign(tf.clip_by_value(w, -clip, clip))
-                      for w in var_list]
-
-  def build_loss_and_gradients(self, var_list):
-    x_true = list(six.itervalues(self.data))[0]
-    x_fake = list(six.iterkeys(self.data))[0]
-    with tf.variable_scope("Disc"):
-      d_true = self.discriminator(x_true)
-
+  """Build the model and discriminator losses and the gradients of each
+  with respect to its variables.
+
+  Args:
+    penalty: float, optional.
+      Scalar value to enforce gradient penalty that ensures the
+      gradients have norm equal to 1 [@gulrajani2017improved]. Set to
+      None (or 0.0) if using no penalty.
+
+  Weight clipping is not an argument; add a clipping op manually.
+  """
+  data = check_and_maybe_build_data(data)
+  scale = check_and_maybe_build_dict(scale)
+  var_list = check_and_maybe_build_var_list(var_list, {}, data)
+
+  x_true = list(six.itervalues(data))[0]
+  x_fake = list(six.iterkeys(data))[0]
+  with tf.variable_scope("Disc"):
+    d_true = discriminator(x_true)
+
+  with tf.variable_scope("Disc", reuse=True):
+    d_fake = discriminator(x_fake)
+
+  if penalty is None or penalty == 0:
+    penalty = 0.0
+  else:
+    eps = tf.random_uniform(tf.shape(x_true))
+    x_interpolated = eps * x_true + (1.0 - eps) * x_fake
     with tf.variable_scope("Disc", reuse=True):
-      d_fake = self.discriminator(x_fake)
-
-    if self.penalty is None or self.penalty == 0:
-      penalty = 0.0
-    else:
-      eps = tf.random_uniform(tf.shape(x_true))
-      x_interpolated = eps * x_true + (1.0 - eps) * x_fake
-      with tf.variable_scope("Disc", reuse=True):
-        d_interpolated = self.discriminator(x_interpolated)
-
-      gradients = tf.gradients(d_interpolated, [x_interpolated])[0]
-      slopes = tf.sqrt(tf.reduce_sum(tf.square(gradients),
-                                     list(range(1, gradients.shape.ndims))))
-      penalty = self.penalty * tf.reduce_mean(tf.square(slopes - 1.0))
-
-    reg_terms_d = tf.losses.get_regularization_losses(scope="Disc")
-    reg_terms_all = tf.losses.get_regularization_losses()
-    reg_terms = [r for r in reg_terms_all if r not in reg_terms_d]
-
-    mean_true = tf.reduce_mean(d_true)
-    mean_fake = tf.reduce_mean(d_fake)
-    loss_d = mean_fake - mean_true + penalty + tf.reduce_sum(reg_terms_d)
-    loss = -mean_fake + tf.reduce_sum(reg_terms)
-
-    var_list_d = tf.get_collection(
-        tf.GraphKeys.TRAINABLE_VARIABLES, scope="Disc")
-    if var_list is None:
-      var_list = [v for v in tf.trainable_variables() if v not in var_list_d]
-
-    grads_d = tf.gradients(loss_d, var_list_d)
-    grads = tf.gradients(loss, var_list)
-    grads_and_vars_d = list(zip(grads_d, var_list_d))
-    grads_and_vars = list(zip(grads, var_list))
-    return loss, grads_and_vars, loss_d, grads_and_vars_d
-
-  def update(self, feed_dict=None, variables=None):
-    info_dict = super(WGANInference, self).update(feed_dict, variables)
-
-    sess = get_session()
-    if self.clip_op is not None and variables in (None, "Disc"):
-      sess.run(self.clip_op)
-
-    return info_dict
+      d_interpolated = discriminator(x_interpolated)
+
+    gradients = tf.gradients(d_interpolated, [x_interpolated])[0]
+    slopes = tf.sqrt(tf.reduce_sum(tf.square(gradients),
+                                   list(range(1, gradients.shape.ndims))))
+    penalty = penalty * tf.reduce_mean(tf.square(slopes - 1.0))
+
+  reg_terms_d = tf.losses.get_regularization_losses(scope="Disc")
+  reg_terms_all = tf.losses.get_regularization_losses()
+  reg_terms = [r for r in reg_terms_all if r not in reg_terms_d]
+
+  mean_true = tf.reduce_mean(d_true)
+  mean_fake = tf.reduce_mean(d_fake)
+  loss_d = mean_fake - mean_true + penalty + tf.reduce_sum(reg_terms_d)
+  loss = -mean_fake + tf.reduce_sum(reg_terms)
+
+  var_list_d = tf.get_collection(
+      tf.GraphKeys.TRAINABLE_VARIABLES, scope="Disc")
+  if var_list is None:
+    var_list = [v for v in tf.trainable_variables() if v not in var_list_d]
+
+  grads_d = tf.gradients(loss_d, var_list_d)
+  grads = tf.gradients(loss, var_list)
+  grads_and_vars_d = list(zip(grads_d, var_list_d))
+  grads_and_vars = list(zip(grads, var_list))
+  return loss, grads_and_vars, loss_d, grads_and_vars_d
diff --git a/edward/util/__init__.py b/edward/util/__init__.py
index 19bb24b70..7a2b650ba 100644
--- a/edward/util/__init__.py
+++ b/edward/util/__init__.py
@@ -13,8 +13,6 @@
 from tensorflow.python.util.all_util import remove_undocumented
 
 _allowed_symbols = [
-    'check_data',
-    'check_latent_vars',
     'compute_multinomial_mode',
     'copy',
     'dot',
diff --git a/edward/util/random_variables.py b/edward/util/random_variables.py
index 2ec4e7ba1..fd9b15659 100644
--- a/edward/util/random_variables.py
+++ b/edward/util/random_variables.py
@@ -18,71 +18,6 @@
 tfb = tf.contrib.distributions.bijectors
 
 
-def check_data(data):
-  """Check that the data dictionary passed during inference and
-  criticism is valid.
-  """
-  if not isinstance(data, dict):
-    raise TypeError("data must have type dict.")
-
-  for key, value in six.iteritems(data):
-    if isinstance(key, tf.Tensor) and "Placeholder" in key.op.type:
-      if isinstance(value, RandomVariable):
-        raise TypeError("The value of a feed cannot be a ed.RandomVariable "
-                        "object. "
-                        "Acceptable feed values include Python scalars, "
-                        "strings, lists, numpy ndarrays, or TensorHandles.")
-      elif isinstance(value, tf.Tensor):
-        raise TypeError("The value of a feed cannot be a tf.Tensor object. "
-                        "Acceptable feed values include Python scalars, "
-                        "strings, lists, numpy ndarrays, or TensorHandles.")
-    elif isinstance(key, (RandomVariable, tf.Tensor)):
-      if isinstance(value, (RandomVariable, tf.Tensor)):
-        if not key.shape.is_compatible_with(value.shape):
-          raise TypeError("Key-value pair in data does not have same "
-                          "shape: {}, {}".format(key.shape, value.shape))
-        elif key.dtype != value.dtype:
-          raise TypeError("Key-value pair in data does not have same "
-                          "dtype: {}, {}".format(key.dtype, value.dtype))
-      elif isinstance(value, (float, list, int, np.ndarray, np.number, str)):
-        if not key.shape.is_compatible_with(np.shape(value)):
-          raise TypeError("Key-value pair in data does not have same "
-                          "shape: {}, {}".format(key.shape, np.shape(value)))
-        elif isinstance(value, (np.ndarray, np.number)) and \
-                not np.issubdtype(value.dtype, np.float) and \
-                not np.issubdtype(value.dtype, np.int) and \
-                not np.issubdtype(value.dtype, np.str):
-          raise TypeError("Data value has an invalid dtype: "
-                          "{}".format(value.dtype))
-      else:
-        raise TypeError("Data value has an invalid type: "
-                        "{}".format(type(value)))
-    else:
-      raise TypeError("Data key has an invalid type: {}".format(type(key)))
-
-
-def check_latent_vars(latent_vars):
-  """Check that the latent variable dictionary passed during inference and
-  criticism is valid.
-  """
-  if not isinstance(latent_vars, dict):
-    raise TypeError("latent_vars must have type dict.")
-
-  for key, value in six.iteritems(latent_vars):
-    if not isinstance(key, (RandomVariable, tf.Tensor)):
-      raise TypeError("Latent variable key has an invalid type: "
-                      "{}".format(type(key)))
-    elif not isinstance(value, (RandomVariable, tf.Tensor)):
-      raise TypeError("Latent variable value has an invalid type: "
-                      "{}".format(type(value)))
-    elif not key.shape.is_compatible_with(value.shape):
-      raise TypeError("Key-value pair in latent_vars does not have same "
-                      "shape: {}, {}".format(key.shape, value.shape))
-    elif key.dtype != value.dtype:
-      raise TypeError("Key-value pair in latent_vars does not have same "
-                      "dtype: {}, {}".format(key.dtype, value.dtype))
-
-
 def _get_context_copy(ctx, scope):
     # contexts are stored in graph collections
     # is there a more efficient way to do this?
diff --git a/tests/util/check_data_test.py b/tests/util/check_data_test.py
deleted file mode 100644
index 2a4fe2fa9..000000000
--- a/tests/util/check_data_test.py
+++ /dev/null
@@ -1,49 +0,0 @@
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import numpy as np
-import tensorflow as tf
-
-from edward.models import Normal
-from edward.util import check_data
-
-
-class test_check_data_class(tf.test.TestCase):
-
-  def test(self):
-    with self.test_session():
-      x = Normal(0.0, 1.0)
-      qx = Normal(0.0, 1.0)
-      x_ph = tf.placeholder(tf.float32, [])
-
-      check_data({x: tf.constant(0.0)})
-      check_data({x: np.float64(0.0)})
-      check_data({x: np.int64(0)})
-      check_data({x: 0.0})
-      check_data({x: 0})
-      check_data({x: False})
-      check_data({x: '0'})
-      check_data({x: x_ph})
-      check_data({x: qx})
-      check_data({2.0 * x: tf.constant(0.0)})
-      self.assertRaises(TypeError, check_data, {0.0: x})
-      self.assertRaises(TypeError, check_data, {x: tf.zeros(5)})
-      self.assertRaises(TypeError, check_data, {x_ph: x})
-      self.assertRaises(TypeError, check_data, {x_ph: x})
-      self.assertRaises(TypeError, check_data,
-                        {x: tf.constant(0, tf.float64)})
-      self.assertRaises(TypeError, check_data,
-                        {x_ph: tf.constant(0.0)})
-
-      x_vec = Normal(tf.constant([0.0]), tf.constant([1.0]))
-      qx_vec = Normal(tf.constant([0.0]), tf.constant([1.0]))
-
-      check_data({x_vec: qx_vec})
-      check_data({x_vec: [0.0]})
-      check_data({x_vec: [0]})
-      check_data({x_vec: ['0']})
-      self.assertRaises(TypeError, check_data, {x: qx_vec})
-
-if __name__ == '__main__':
-  tf.test.main()
diff --git a/tests/util/check_latent_vars_test.py b/tests/util/check_latent_vars_test.py
deleted file mode 100644
index a967629e2..000000000
--- a/tests/util/check_latent_vars_test.py
+++ /dev/null
@@ -1,26 +0,0 @@
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import tensorflow as tf
-
-from edward.models import Normal
-from edward.util import check_latent_vars
-
-
-class test_check_latent_vars_class(tf.test.TestCase):
-
-  def test(self):
-    with self.test_session():
-      mu = Normal(0.0, 1.0)
-      qmu = Normal(tf.Variable(0.0), tf.constant(1.0))
-      qmu_vec = Normal(tf.constant([0.0]), tf.constant([1.0]))
-
-      check_latent_vars({mu: qmu})
-      check_latent_vars({mu: tf.constant(0.0)})
-      check_latent_vars({tf.constant(0.0): qmu})
-      self.assertRaises(TypeError, check_latent_vars, {mu: '5'})
-      self.assertRaises(TypeError, check_latent_vars, {mu: qmu_vec})
-
-if __name__ == '__main__':
-  tf.test.main()

From 3381070be57fbb3c928d6cbb8dd174ebebf9c622 Mon Sep 17 00:00:00 2001
From: Dustin Tran <dustinviettran@gmail.com>
Date: Sun, 28 Jan 2018 03:55:12 -0800
Subject: [PATCH 03/27] remove edward.criticisms

---
 edward/__init__.py                 |  10 +-
 edward/criticisms/__init__.py      |  20 --
 edward/criticisms/evaluate.py      | 477 -----------------------------
 edward/criticisms/ppc.py           | 120 --------
 edward/criticisms/ppc_plots.py     | 109 -------
 tests/criticisms/evaluate_test.py  | 188 ------------
 tests/criticisms/metrics_test.py   | 114 -------
 tests/criticisms/ppc_plots_test.py |  25 --
 tests/criticisms/ppc_test.py       |  52 ----
 9 files changed, 1 insertion(+), 1114 deletions(-)
 delete mode 100644 edward/criticisms/__init__.py
 delete mode 100644 edward/criticisms/evaluate.py
 delete mode 100644 edward/criticisms/ppc.py
 delete mode 100644 edward/criticisms/ppc_plots.py
 delete mode 100644 tests/criticisms/evaluate_test.py
 delete mode 100644 tests/criticisms/metrics_test.py
 delete mode 100644 tests/criticisms/ppc_plots_test.py
 delete mode 100644 tests/criticisms/ppc_test.py

diff --git a/edward/__init__.py b/edward/__init__.py
index ddb1380fb..d087e8379 100644
--- a/edward/__init__.py
+++ b/edward/__init__.py
@@ -2,14 +2,11 @@
 from __future__ import division
 from __future__ import print_function
 
-from edward import criticisms
 from edward import inferences
 from edward import models
 from edward import util
 
 # Direct imports for convenience
-from edward.criticisms import (
-    evaluate, ppc, ppc_density_plot, ppc_stat_hist_plot)
 from edward.inferences import (
     bigan_inference,
     complete_conditional,
@@ -42,14 +39,9 @@
 
 # Export modules and constants.
 _allowed_symbols = [
-    'criticisms',
     'inferences',
     'models',
     'util',
-    'evaluate',
-    'ppc',
-    'ppc_density_plot',
-    'ppc_stat_hist_plot',
     'bigan_inference',
     'complete_conditional',
     'gan_inference',
@@ -99,5 +91,5 @@
 # Remove all extra symbols that don't have a docstring or are not explicitly
 # referenced in the whitelist.
 remove_undocumented(__name__, _allowed_symbols, [
-    criticisms, inferences, models, util
+    inferences, models, util
 ])
diff --git a/edward/criticisms/__init__.py b/edward/criticisms/__init__.py
deleted file mode 100644
index 5a9aff3d6..000000000
--- a/edward/criticisms/__init__.py
+++ /dev/null
@@ -1,20 +0,0 @@
-"""
-"""
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from edward.criticisms.evaluate import *
-from edward.criticisms.ppc import *
-from edward.criticisms.ppc_plots import *
-
-from tensorflow.python.util.all_util import remove_undocumented
-
-_allowed_symbols = [
-    'evaluate',
-    'ppc',
-    'ppc_density_plot',
-    'ppc_stat_hist_plot',
-]
-
-remove_undocumented(__name__, allowed_exception_list=_allowed_symbols)
diff --git a/edward/criticisms/evaluate.py b/edward/criticisms/evaluate.py
deleted file mode 100644
index 80be843bf..000000000
--- a/edward/criticisms/evaluate.py
+++ /dev/null
@@ -1,477 +0,0 @@
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import numpy as np
-import six
-import tensorflow as tf
-
-from edward.inferences.inference import check_and_maybe_build_data
-from edward.models import RandomVariable
-from edward.util import get_session, compute_multinomial_mode, \
-    with_binary_averaging
-
-try:
-  from edward.models import Bernoulli, Binomial, Categorical, \
-      Multinomial, OneHotCategorical
-except Exception as e:
-  raise ImportError("{0}. Your TensorFlow version is not supported.".format(e))
-
-
-def evaluate(metrics, data, n_samples=500, output_key=None, seed=None):
-  """Evaluate fitted model using a set of metrics.
-
-  A metric, or scoring rule [@winkler1994evaluating], is a function of
-  observed data under the posterior predictive distribution. For
-  example in supervised metrics such as classification accuracy, the
-  observed data (true output) is compared to the posterior
-  predictive's mean (predicted output). In unsupervised metrics such
-  as log-likelihood, the probability of observing the data is
-  calculated under the posterior predictive's log-density.
-
-  Args:
-    metrics: list of str and/or (str, params: dict) tuples, str,
-    or (str, params: dict) tuple.
-      List of metrics or a single metric:
-      `'binary_accuracy'`,
-      `'categorical_accuracy'`,
-      `'sparse_categorical_accuracy'`,
-      `'log_loss'` or `'binary_crossentropy'`,
-      `'categorical_crossentropy'`,
-      `'sparse_categorical_crossentropy'`,
-      `'hinge'`,
-      `'squared_hinge'`,
-      `'mse'` or `'MSE'` or `'mean_squared_error'`,
-      `'mae'` or `'MAE'` or `'mean_absolute_error'`,
-      `'mape'` or `'MAPE'` or `'mean_absolute_percentage_error'`,
-      `'msle'` or `'MSLE'` or `'mean_squared_logarithmic_error'`,
-      `'poisson'`,
-      `'cosine'` or `'cosine_proximity'`,
-      `'log_lik'` or `'log_likelihood'`.
-      In lieu of a metric string, this method also accepts (str, params: dict)
-      tuples; the first element of this tuple is the metric string, and
-      the second is a dict of associated params. At present, this dict only
-      expects one key, `'average'`, which stipulates the type of averaging to
-      perform on those metrics that permit binary averaging. Permissible
-      options include: `None`, `'macro'` and `'micro'`.
-    data: dict.
-      Data to evaluate model with. It binds observed variables (of type
-      `RandomVariable` or `tf.Tensor`) to their realizations (of
-      type `tf.Tensor`). It can also bind placeholders (of type
-      `tf.Tensor`) used in the model to their realizations.
-    n_samples: int.
-      Number of posterior samples for making predictions, using the
-      posterior predictive distribution.
-    output_key: RandomVariable or tf.Tensor.
-      It is the key in `data` which corresponds to the model's output.
-    seed: a Python integer. Used to create a random seed for the
-      distribution
-
-  Returns:
-    list of float or float.
-    A list of evaluations or a single evaluation.
-
-  Raises:
-    NotImplementedError.
-    If an input metric does not match an implemented metric in Edward.
-
-  #### Examples
-
-  ```python
-  # build posterior predictive after inference: it is
-  # parameterized by a posterior sample
-  x_post = ed.copy(x, {z: qz, beta: qbeta})
-
-  # log-likelihood performance
-  ed.evaluate('log_likelihood', data={x_post: x_train})
-
-  # classification accuracy
-  # here, `x_ph` is any features the model is defined with respect to,
-  # and `y_post` is the posterior predictive distribution
-  ed.evaluate('binary_accuracy', data={y_post: y_train, x_ph: x_train})
-
-  # mean squared error
-  ed.evaluate('mean_squared_error', data={y: y_data, x: x_data})
-  ```
-
-  # mean squared logarithmic error with `'micro'` averaging
-  ed.evaluate(('mean_squared_logarithmic_error', {'average': 'micro'}),
-              data={y: y_data, x: x_data})
-  """
-  sess = get_session()
-  if isinstance(metrics, str):
-    metrics = [metrics]
-  elif callable(metrics):
-    metrics = [metrics]
-  elif not isinstance(metrics, list):
-    raise TypeError("metrics must have type str or list, or be callable.")
-
-  data = check_and_maybe_build_data(data)
-  if not isinstance(n_samples, int):
-    raise TypeError("n_samples must have type int.")
-
-  if output_key is None:
-    # Default output_key to the only data key that isn't a placeholder.
-    keys = [key for key in six.iterkeys(data) if not
-            isinstance(key, tf.Tensor) or "Placeholder" not in key.op.type]
-    if len(keys) == 1:
-      output_key = keys[0]
-    else:
-      raise KeyError("User must specify output_key.")
-  elif not isinstance(output_key, RandomVariable):
-    raise TypeError("output_key must have type RandomVariable.")
-
-  # Create feed_dict for data placeholders that the model conditions
-  # on; it is necessary for all session runs.
-  feed_dict = {key: value for key, value in six.iteritems(data)
-               if isinstance(key, tf.Tensor) and "Placeholder" in key.op.type}
-
-  # Form true data.
-  y_true = data[output_key]
-  # Make predictions (if there are any supervised metrics).
-  if metrics != ['log_lik'] and metrics != ['log_likelihood']:
-    binary_discrete = (Bernoulli, Binomial)
-    categorical_discrete = (Categorical, Multinomial, OneHotCategorical)
-    total_count = sess.run(getattr(output_key, 'total_count', tf.constant(1.)))
-    if isinstance(output_key, binary_discrete + categorical_discrete):
-      # Average over realizations of their probabilities, then predict
-      # via argmax over probabilities.
-      probs = [sess.run(output_key.probs, feed_dict) for _ in range(n_samples)]
-      probs = np.sum(probs, axis=0) / n_samples
-      if isinstance(output_key, binary_discrete):
-        # make random prediction whenever probs is exactly 0.5
-        random = tf.random_uniform(shape=tf.shape(probs))
-        y_pred = tf.round(tf.where(tf.equal(0.5, probs), random, probs))
-      else:
-        if total_count > 1:
-          mode = compute_multinomial_mode(probs, total_count, seed)
-          if len(output_key.sample_shape):
-            y_pred = tf.reshape(tf.tile(mode, output_key.sample_shape),
-                                [-1, len(probs)])
-          else:
-            y_pred = mode
-        else:
-          y_pred = tf.argmax(probs, len(probs.shape) - 1)
-      probs = tf.constant(probs)
-    else:
-      # Monte Carlo estimate the mean of the posterior predictive.
-      y_pred = [sess.run(output_key, feed_dict) for _ in range(n_samples)]
-      y_pred = tf.cast(tf.add_n(y_pred), y_pred[0].dtype) / \
-          tf.cast(n_samples, y_pred[0].dtype)
-    if len(y_true.shape) == 0:
-      y_true = tf.expand_dims(y_true, 0)
-      y_pred = tf.expand_dims(y_pred, 0)
-
-  # Evaluate y_true (according to y_pred if supervised) for all metrics.
-  evaluations = []
-  for metric in metrics:
-    if isinstance(metric, tuple):
-      metric, params = metric
-    else:
-      params = {}
-    if metric == 'accuracy' or metric == 'crossentropy':
-      # automate binary or sparse cat depending on its support
-      support = sess.run(tf.reduce_max(y_true), feed_dict)
-      if support <= 1:
-        metric = 'binary_' + metric
-      else:
-        metric = 'sparse_categorical_' + metric
-
-    if metric == 'binary_accuracy':
-      evaluations += [binary_accuracy(y_true, y_pred, **params)]
-    elif metric == 'categorical_accuracy':
-      evaluations += [categorical_accuracy(y_true, y_pred, **params)]
-    elif metric == 'sparse_categorical_accuracy':
-      evaluations += [sparse_categorical_accuracy(y_true, y_pred, **params)]
-    elif metric == 'log_loss' or metric == 'binary_crossentropy':
-      evaluations += [binary_crossentropy(y_true, y_pred, **params)]
-    elif metric == 'categorical_crossentropy':
-      evaluations += [categorical_crossentropy(y_true, y_pred, **params)]
-    elif metric == 'sparse_categorical_crossentropy':
-      evaluations += [sparse_categorical_crossentropy(y_true, y_pred, **params)]
-    elif metric == 'multinomial_accuracy':
-      evaluations += [multinomial_accuracy(y_true, y_pred, **params)]
-    elif metric == 'kl_divergence':
-      y_true_ = y_true / total_count
-      y_pred_ = probs
-      evaluations += [kl_divergence(y_true_, y_pred_, **params)]
-    elif metric == 'hinge':
-      evaluations += [hinge(y_true, y_pred, **params)]
-    elif metric == 'squared_hinge':
-      evaluations += [squared_hinge(y_true, y_pred, **params)]
-    elif (metric == 'mse' or metric == 'MSE' or
-          metric == 'mean_squared_error'):
-      evaluations += [mean_squared_error(y_true, y_pred, **params)]
-    elif (metric == 'mae' or metric == 'MAE' or
-          metric == 'mean_absolute_error'):
-      evaluations += [mean_absolute_error(y_true, y_pred, **params)]
-    elif (metric == 'mape' or metric == 'MAPE' or
-          metric == 'mean_absolute_percentage_error'):
-      evaluations += [mean_absolute_percentage_error(y_true, y_pred, **params)]
-    elif (metric == 'msle' or metric == 'MSLE' or
-          metric == 'mean_squared_logarithmic_error'):
-      evaluations += [mean_squared_logarithmic_error(y_true, y_pred, **params)]
-    elif metric == 'poisson':
-      evaluations += [poisson(y_true, y_pred, **params)]
-    elif metric == 'cosine' or metric == 'cosine_proximity':
-      evaluations += [cosine_proximity(y_true, y_pred, **params)]
-    elif metric == 'log_lik' or metric == 'log_likelihood':
-      # Monte Carlo estimate the log-density of the posterior predictive.
-      tensor = tf.reduce_mean(output_key.log_prob(y_true))
-      log_pred = [sess.run(tensor, feed_dict) for _ in range(n_samples)]
-      log_pred = tf.add_n(log_pred) / tf.cast(n_samples, tensor.dtype)
-      evaluations += [log_pred]
-    elif callable(metric):
-      evaluations += [metric(y_true, y_pred, **params)]
-    else:
-      raise NotImplementedError("Metric is not implemented: {}".format(metric))
-
-  if len(evaluations) == 1:
-    return sess.run(evaluations[0], feed_dict)
-  else:
-    return sess.run(evaluations, feed_dict)
-
-
-# Classification metrics
-
-
-def binary_accuracy(y_true, y_pred):
-  """Binary prediction accuracy, also known as 0/1-loss.
-
-  Args:
-    y_true: tf.Tensor.
-      Tensor of 0s and 1s (most generally, any real values a and b).
-    y_pred: tf.Tensor.
-      Tensor of predictions, with same shape as `y_true`.
-  """
-  y_true = tf.cast(y_true, tf.float32)
-  y_pred = tf.cast(y_pred, tf.float32)
-  return tf.reduce_mean(tf.cast(tf.equal(y_true, y_pred), tf.float32))
-
-
-def categorical_accuracy(y_true, y_pred):
-  """Multi-class prediction accuracy. One-hot representation for `y_true`.
-
-  Args:
-    y_true: tf.Tensor.
-      Tensor of 0s and 1s, where the outermost dimension of size `K`
-      has only one 1 per row.
-    y_pred: tf.Tensor.
-      Tensor of predictions, with shape `y_true.shape[:-1]`. Each
-      entry is an integer {0, 1, ..., K-1}.
-  """
-  y_true = tf.cast(tf.argmax(y_true, len(y_true.shape) - 1), tf.float32)
-  y_pred = tf.cast(y_pred, tf.float32)
-  return tf.reduce_mean(tf.cast(tf.equal(y_true, y_pred), tf.float32))
-
-
-def sparse_categorical_accuracy(y_true, y_pred):
-  """Multi-class prediction accuracy. Label {0, 1, .., K-1}
-  representation for `y_true`.
-
-  Args:
-    y_true: tf.Tensor.
-      Tensor of integers {0, 1, ..., K-1}.
-    y_pred: tf.Tensor.
-      Tensor of predictions, with same shape as `y_true`.
-  """
-  y_true = tf.cast(y_true, tf.float32)
-  y_pred = tf.cast(y_pred, tf.float32)
-  return tf.reduce_mean(tf.cast(tf.equal(y_true, y_pred), tf.float32))
-
-
-# Classification metrics (with real-valued predictions)
-
-
-def binary_crossentropy(y_true, y_pred):
-  """Binary cross-entropy.
-
-  Args:
-    y_true: tf.Tensor.
-      Tensor of 0s and 1s.
-    y_pred: tf.Tensor.
-      Tensor of real values (logit probabilities), with same shape as
-      `y_true`.
-  """
-  y_true = tf.cast(y_true, tf.float32)
-  y_pred = tf.cast(y_pred, tf.float32)
-  return tf.reduce_mean(
-      tf.nn.sigmoid_cross_entropy_with_logits(logits=y_pred, labels=y_true))
-
-
-def categorical_crossentropy(y_true, y_pred):
-  """Multi-class cross entropy. One-hot representation for `y_true`.
-
-  Args:
-    y_true: tf.Tensor.
-      Tensor of 0s and 1s, where the outermost dimension of size K
-      has only one 1 per row.
-    y_pred: tf.Tensor.
-      Tensor of real values (logit probabilities), with same shape as
-      `y_true`. The outermost dimension is the number of classes.
-  """
-  y_true = tf.cast(y_true, tf.float32)
-  y_pred = tf.cast(y_pred, tf.float32)
-  return tf.reduce_mean(
-      tf.nn.softmax_cross_entropy_with_logits(logits=y_pred, labels=y_true))
-
-
-def sparse_categorical_crossentropy(y_true, y_pred):
-  """Multi-class cross entropy. Label {0, 1, .., K-1} representation
-  for `y_true.`
-
-  Args:
-    y_true: tf.Tensor.
-      Tensor of integers {0, 1, ..., K-1}.
-    y_pred: tf.Tensor.
-      Tensor of real values (logit probabilities), with shape
-      `(y_true.shape, K)`. The outermost dimension is the number of classes.
-  """
-  y_true = tf.cast(y_true, tf.int64)
-  y_pred = tf.cast(y_pred, tf.float32)
-  return tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(
-      logits=y_pred, labels=y_true))
-
-
-def multinomial_accuracy(y_true, y_pred):
-  """Multinomial prediction accuracy. `y_true` is a tensor
-  of integers, where the outermost dimension gives a draw
-  from a Multinomial distribution.
-
-  NB: In evaluating the accuracy between two Multinomials
-  results may vary across evaluations. This is because Edward's
-  algorithm for computing `y_pred`, i.e. the Multinomial
-  mode, yields variable results if `any(isinstance(p, float)
-  for p in total_count * probs)` (where `probs` is a vector
-  of the predicted Multinomial probabilities).
-  """
-  y_true = tf.cast(y_true, tf.float32)
-  y_pred = tf.cast(y_pred, tf.float32)
-  return tf.reduce_mean(tf.cast(tf.equal(y_true, y_pred), tf.float32))
-
-
-def kl_divergence(y_true, y_pred):
-  """Kullback-Leibler divergence between two probability distributions. A
-  vector of probabilities for `y_true`.
-
-  Args:
-    y_true: tf.Tensor.
-      Tensor of real values (probabilities) where the values in each row
-      of the outermost dimension sum to 1.
-    y_pred: tf.Tensor.
-      Same as `y_true`, and with the same shape.
-  """
-  y_true = tf.cast(y_true, tf.float32)
-  y_pred = tf.cast(y_pred, tf.float32)
-  zeros = tf.zeros(shape=(tf.shape(y_true)))
-  summand = tf.where(tf.equal(y_true, 0.0), zeros,
-                     y_true * (tf.log(y_true) - tf.log(y_pred)))
-  return tf.reduce_sum(summand)
-
-
-def hinge(y_true, y_pred):
-  """Hinge loss.
-
-  Args:
-    y_true: tf.Tensor.
-      Tensor of 0s and 1s.
-    y_pred: tf.Tensor.
-      Tensor of real values, with same shape as `y_true`.
-  """
-  y_true = tf.cast(y_true, tf.float32)
-  y_pred = tf.cast(y_pred, tf.float32)
-  return tf.reduce_mean(tf.maximum(1.0 - y_true * y_pred, 0.0))
-
-
-def squared_hinge(y_true, y_pred):
-  """Squared hinge loss.
-
-  Args:
-    y_true: tf.Tensor.
-      Tensor of 0s and 1s.
-    y_pred: tf.Tensor.
-      Tensor of real values, with same shape as `y_true`.
-  """
-  y_true = tf.cast(y_true, tf.float32)
-  y_pred = tf.cast(y_pred, tf.float32)
-  return tf.reduce_mean(tf.square(tf.maximum(1.0 - y_true * y_pred, 0.0)))
-
-
-# Regression metrics
-
-
-@with_binary_averaging
-def mean_squared_error(y_true, y_pred):
-  """Mean squared error loss.
-
-  Args:
-    y_true: tf.Tensor.
-    y_pred: tf.Tensor.
-      Tensors of same shape and type.
-  """
-  return tf.reduce_mean(tf.square(y_pred - y_true), axis=-2)
-
-
-@with_binary_averaging
-def mean_absolute_error(y_true, y_pred):
-  """Mean absolute error loss.
-
-  Args:
-    y_true: tf.Tensor.
-    y_pred: tf.Tensor.
-      Tensors of same shape and type.
-  """
-  return tf.reduce_mean(tf.abs(y_pred - y_true), axis=-2)
-
-
-@with_binary_averaging
-def mean_absolute_percentage_error(y_true, y_pred):
-  """Mean absolute percentage error loss.
-
-  Args:
-    y_true: tf.Tensor.
-    y_pred: tf.Tensor.
-      Tensors of same shape and type.
-  """
-  diff = tf.abs((y_true - y_pred) / tf.clip_by_value(tf.abs(y_true),
-                                                     1e-8, np.inf))
-  return 100.0 * tf.reduce_mean(diff, axis=-2)
-
-
-@with_binary_averaging
-def mean_squared_logarithmic_error(y_true, y_pred):
-  """Mean squared logarithmic error loss.
-
-  Args:
-    y_true: tf.Tensor.
-    y_pred: tf.Tensor.
-      Tensors of same shape and type.
-  """
-  first_log = tf.log(tf.clip_by_value(y_pred, 1e-8, np.inf) + 1.0)
-  second_log = tf.log(tf.clip_by_value(y_true, 1e-8, np.inf) + 1.0)
-  return tf.reduce_mean(tf.square(first_log - second_log), axis=-2)
-
-
-def poisson(y_true, y_pred):
-  """Negative Poisson log-likelihood of data `y_true` given predictions
-  `y_pred` (up to proportion).
-
-  Args:
-    y_true: tf.Tensor.
-    y_pred: tf.Tensor.
-      Tensors of same shape and type.
-  """
-  return tf.reduce_sum(y_pred - y_true * tf.log(y_pred + 1e-8))
-
-
-def cosine_proximity(y_true, y_pred):
-  """Cosine similarity of two vectors.
-
-  Args:
-    y_true: tf.Tensor.
-    y_pred: tf.Tensor.
-      Tensors of same shape and type.
-  """
-  y_true = tf.nn.l2_normalize(y_true, len(y_true.shape) - 1)
-  y_pred = tf.nn.l2_normalize(y_pred, len(y_pred.shape) - 1)
-  return tf.reduce_sum(y_true * y_pred)
diff --git a/edward/criticisms/ppc.py b/edward/criticisms/ppc.py
deleted file mode 100644
index 462aaa623..000000000
--- a/edward/criticisms/ppc.py
+++ /dev/null
@@ -1,120 +0,0 @@
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import numpy as np
-import six
-import tensorflow as tf
-
-from edward.inferences.inference import (check_and_maybe_build_data,
-    check_and_maybe_build_latent_vars)
-from edward.models import RandomVariable
-from edward.util import get_session
-
-
-def ppc(T, data, latent_vars=None, n_samples=100):
-  """Posterior predictive check
-  [@rubin1984bayesianly; @meng1994posterior; @gelman1996posterior].
-
-  PPC's form an empirical distribution for the predictive discrepancy,
-
-  $p(T\mid x) = \int p(T(x^{\\text{rep}})\mid z) p(z\mid x) dz$
-
-  by drawing replicated data sets $x^{\\text{rep}}$ and
-  calculating $T(x^{\\text{rep}})$ for each data set. Then it
-  compares it to $T(x)$.
-
-  If `data` is inputted with the prior predictive distribution, then
-  it is a prior predictive check [@box1980sampling].
-
-  Args:
-    T: function.
-      Discrepancy function, which takes a dictionary of data and
-      dictionary of latent variables as input and outputs a `tf.Tensor`.
-    data: dict.
-      Data to compare to. It binds observed variables (of type
-      `RandomVariable` or `tf.Tensor`) to their realizations (of
-      type `tf.Tensor`). It can also bind placeholders (of type
-      `tf.Tensor`) used in the model to their realizations.
-    latent_vars: dict.
-      Collection of random variables (of type `RandomVariable` or
-      `tf.Tensor`) binded to their inferred posterior. This argument
-      is used when the discrepancy is a function of latent variables.
-    n_samples: int.
-      Number of replicated data sets.
-
-  Returns:
-    list of np.ndarray.
-    List containing the reference distribution, which is a NumPy array
-    with `n_samples` elements,
-
-    $(T(x^{{\\text{rep}},1}, z^{1}), ...,
-       T(x^{\\text{rep,nsamples}}, z^{\\text{nsamples}}))$
-
-    and the realized discrepancy, which is a NumPy array with
-    `n_samples` elements,
-
-    $(T(x, z^{1}), ..., T(x, z^{\\text{nsamples}})).$
-
-
-  #### Examples
-
-  ```python
-  # build posterior predictive after inference:
-  # it is parameterized by a posterior sample
-  x_post = ed.copy(x, {z: qz, beta: qbeta})
-
-  # posterior predictive check
-  # T is a user-defined function of data, T(data)
-  T = lambda xs, zs: tf.reduce_mean(xs[x_post])
-  ed.ppc(T, data={x_post: x_train})
-
-  # in general T is a discrepancy function of the data (both response and
-  # covariates) and latent variables, T(data, latent_vars)
-  T = lambda xs, zs: tf.reduce_mean(zs[z])
-  ed.ppc(T, data={y_post: y_train, x_ph: x_train},
-         latent_vars={z: qz, beta: qbeta})
-
-  # prior predictive check
-  # run ppc on original x
-  ed.ppc(T, data={x: x_train})
-  ```
-  """
-  sess = get_session()
-  if not callable(T):
-    raise TypeError("T must be a callable function.")
-
-  data = check_and_maybe_build_data(data)
-  latent_vars = check_and_maybe_build_latent_vars(latent_vars)
-  if not isinstance(n_samples, int):
-    raise TypeError("n_samples must have type int.")
-
-  # Build replicated latent variables.
-  zrep = {key: tf.convert_to_tensor(value)
-          for key, value in six.iteritems(latent_vars)}
-
-  # Build replicated data.
-  xrep = {x: (x.value if isinstance(x, RandomVariable) else obs)
-          for x, obs in six.iteritems(data)}
-
-  # Create feed_dict for data placeholders that the model conditions
-  # on; it is necessary for all session runs.
-  feed_dict = {key: value for key, value in six.iteritems(data)
-               if isinstance(key, tf.Tensor) and "Placeholder" in key.op.type}
-
-  # Calculate discrepancy over many replicated data sets and latent
-  # variables.
-  Trep = T(xrep, zrep)
-  Tobs = T(data, zrep)
-  Treps = []
-  Ts = []
-  for _ in range(n_samples):
-    # Take a forward pass (session run) to get new samples for
-    # each calculation of the discrepancy.
-    # Alternatively, we could unroll the graph by registering this
-    # operation `n_samples` times, each for different parent nodes
-    # representing `xrep` and `zrep`. But it's expensive.
-    Treps += [sess.run(Trep, feed_dict)]
-    Ts += [sess.run(Tobs, feed_dict)]
-
-  return [np.stack(Treps), np.stack(Ts)]
diff --git a/edward/criticisms/ppc_plots.py b/edward/criticisms/ppc_plots.py
deleted file mode 100644
index 2b2677105..000000000
--- a/edward/criticisms/ppc_plots.py
+++ /dev/null
@@ -1,109 +0,0 @@
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-
-def ppc_density_plot(y, y_rep):
-  """Create 1D kernel density plot comparing data to samples from posterior.
-
-  Args:
-    y: np.ndarray.
-      A 1-D NumPy array.
-    y_rep: np.ndarray.
-      A 2-D NumPy array where rows represent different samples from posterior.
-
-  Returns:
-    matplotlib axes
-
-  #### Examples
-
-  ```python
-  import matplotlib.pyplot as plt
-
-  y = np.random.randn(20)
-  y_rep = np.random.randn(20, 20)
-
-  ed.ppc_density_plot(y, y_rep)
-  plt.show()
-  ```
-  """
-  import matplotlib.pyplot as plt
-  import seaborn as sns
-  ax = sns.kdeplot(y, color="maroon")
-
-  n = y_rep.shape[0]
-
-  for i in range(n):
-    ax = sns.kdeplot(y_rep[i, :], color="maroon", alpha=0.2, linewidth=0.8)
-
-  y_line = plt.Line2D([], [], color='maroon', label='y')
-  y_rep_line = plt.Line2D([], [], color='maroon', alpha=0.2, label='y_rep')
-
-  handles = [y_line, y_rep_line]
-  labels = ['y', r'$y_{rep}$']
-
-  ax.legend(handles, labels)
-
-  return ax
-
-
-def ppc_stat_hist_plot(y_stats, yrep_stats, stat_name=None, **kwargs):
-  """Create histogram plot comparing data to samples from posterior.
-
-  Args:
-    y_stats: float.
-      Float representing statistic value of observed data.
-    yrep_stats: np.ndarray.
-      A 1-D NumPy array.
-    stat_name: string.
-      Optional string value for including statistic name in legend.
-    **kwargs:
-      Keyword arguments used by seaborn.distplot can be given to customize plot.
-
-  Returns:
-    matplotlib axes.
-
-  #### Examples
-
-  ```python
-  import matplotlib.pyplot as plt
-
-  # DATA
-  x_data = np.array([0, 1, 0, 0, 0, 0, 0, 0, 0, 1])
-
-  # MODEL
-  p = Beta(1.0, 1.0)
-  x = Bernoulli(probs=p, sample_shape=10)
-
-  # INFERENCE
-  qp = Beta(tf.nn.softplus(tf.Variable(tf.random_normal([]))),
-            tf.nn.softplus(tf.Variable(tf.random_normal([]))))
-
-  inference = ed.KLqp({p: qp}, data={x: x_data})
-  inference.run(n_iter=500)
-
-  # CRITICISM
-  x_post = ed.copy(x, {p: qp})
-  y_rep, y = ed.ppc(
-      lambda xs, zs: tf.reduce_mean(tf.cast(xs[x_post], tf.float32)),
-      data={x_post: x_data})
-
-  ed.ppc_stat_hist_plot(
-      y[0], y_rep, stat_name=r'$T \equiv$mean', bins=10)
-  plt.show()
-  ```
-  """
-  import matplotlib.pyplot as plt
-  import seaborn as sns
-  ax = sns.distplot(yrep_stats, kde=False, label=r'$T(y_{rep})$', **kwargs)
-
-  max_value = ax.get_ylim()[1]
-
-  plt.vlines(y_stats, ymin=0.0, ymax=max_value, label='T(y)')
-
-  if stat_name is not None:
-    plt.legend(title=stat_name)
-  else:
-    plt.legend()
-
-  return ax
diff --git a/tests/criticisms/evaluate_test.py b/tests/criticisms/evaluate_test.py
deleted file mode 100644
index f12e90b43..000000000
--- a/tests/criticisms/evaluate_test.py
+++ /dev/null
@@ -1,188 +0,0 @@
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import edward as ed
-import numpy as np
-import tensorflow as tf
-
-from edward.models import Bernoulli, Categorical, Multinomial, Normal
-
-
-class test_evaluate_class(tf.test.TestCase):
-
-  RANDOM_SEED = 12345
-
-  def test_metrics(self):
-    with self.test_session():
-      x = Normal(loc=0.0, scale=1.0)
-      x_data = tf.constant(0.0)
-      ed.evaluate('mean_squared_error', {x: x_data}, n_samples=1)
-      ed.evaluate(['mean_squared_error'], {x: x_data}, n_samples=1)
-      ed.evaluate(['mean_squared_error', 'mean_absolute_error'],
-                  {x: x_data}, n_samples=1)
-      self.assertRaises(TypeError, ed.evaluate, x, {x: x_data}, n_samples=1)
-      self.assertRaises(NotImplementedError, ed.evaluate, 'hello world',
-                        {x: x_data}, n_samples=1)
-
-  def test_metrics_classification(self):
-    with self.test_session():
-      x = Bernoulli(probs=0.51)
-      x_data = tf.constant(1)
-      self.assertAllClose(
-          1.0,
-          ed.evaluate('binary_accuracy', {x: x_data}, n_samples=1))
-      x = Bernoulli(probs=0.51, sample_shape=5)
-      x_data = tf.constant([1, 1, 1, 0, 0])
-      self.assertAllClose(
-          0.6,
-          ed.evaluate('binary_accuracy', {x: x_data}, n_samples=1))
-      x = Bernoulli(probs=tf.constant([0.51, 0.49, 0.49]))
-      x_data = tf.constant([1, 0, 1])
-      self.assertAllClose(
-          2.0 / 3,
-          ed.evaluate('binary_accuracy', {x: x_data}, n_samples=1))
-
-      x = Categorical(probs=tf.constant([0.48, 0.51, 0.01]))
-      x_data = tf.constant(1)
-      self.assertAllClose(
-          1.0,
-          ed.evaluate('sparse_categorical_accuracy', {x: x_data}, n_samples=1))
-      x = Categorical(probs=tf.constant([0.48, 0.51, 0.01]), sample_shape=5)
-      x_data = tf.constant([1, 1, 1, 0, 2])
-      self.assertAllClose(
-          0.6,
-          ed.evaluate('sparse_categorical_accuracy', {x: x_data}, n_samples=1))
-      x = Categorical(
-          probs=tf.constant([[0.48, 0.51, 0.01], [0.51, 0.48, 0.01]]))
-      x_data = tf.constant([1, 2])
-      self.assertAllClose(
-          0.5,
-          ed.evaluate('sparse_categorical_accuracy', {x: x_data}, n_samples=1))
-
-      x = Multinomial(total_count=1.0, probs=tf.constant([0.48, 0.51, 0.01]))
-      x_data = tf.constant([0, 1, 0], dtype=x.dtype.as_numpy_dtype)
-      self.assertAllClose(
-          1.0,
-          ed.evaluate('categorical_accuracy', {x: x_data}, n_samples=1))
-      x = Multinomial(total_count=1.0, probs=tf.constant([0.48, 0.51, 0.01]),
-                      sample_shape=5)
-      x_data = tf.constant(
-          [[0, 1, 0], [0, 1, 0], [0, 1, 0], [1, 0, 0], [0, 0, 1]],
-          dtype=x.dtype.as_numpy_dtype)
-      self.assertAllClose(
-          0.6,
-          ed.evaluate('categorical_accuracy', {x: x_data}, n_samples=1))
-
-      x = Multinomial(total_count=5.0, probs=tf.constant([0.4, 0.6, 0.0]))
-      x_data = tf.constant([2, 3, 0], dtype=x.dtype.as_numpy_dtype)
-      self.assertAllClose(
-          1.0,
-          ed.evaluate('multinomial_accuracy', {x: x_data}, n_samples=1))
-
-  def test_metrics_with_binary_averaging(self):
-    x = Multinomial(total_count=10.0, probs=tf.constant([0.2, 0.7, 0.1]))
-    x_data = tf.constant([5, 4, 1], dtype=x.dtype.as_numpy_dtype)
-    self.assertAllEqual(
-        np.array([9.0, 4.0, 1.0], dtype=np.float32),
-        ed.evaluate([('mean_squared_error', {'average': None})],
-                    {x: x_data}, n_samples=1, seed=self.RANDOM_SEED))
-    x = Multinomial(total_count=10.0, probs=tf.constant([0.2, 0.7, 0.1]))
-    x_data = tf.constant([5, 4, 1], dtype=x.dtype.as_numpy_dtype)
-    self.assertAllClose(
-        4.6666665,
-        ed.evaluate([('mean_squared_error', {'average': 'macro'})],
-                    {x: x_data}, n_samples=1, seed=self.RANDOM_SEED))
-    x = Multinomial(total_count=10.0, probs=tf.constant([0.2, 0.7, 0.1]))
-    x_data = tf.constant([5, 4, 1], dtype=x.dtype.as_numpy_dtype)
-    self.assertAllClose(
-        4.6666665,
-        ed.evaluate([('mean_squared_error', {'average': 'micro'})],
-                    {x: x_data}, n_samples=1, seed=self.RANDOM_SEED))
-
-    x = Multinomial(total_count=10.0, probs=tf.constant([0.2, 0.7, 0.1]),
-                    sample_shape=5)
-    x_data = tf.constant(
-        [[2, 7, 1], [3, 6, 1], [3, 5, 2], [4, 4, 2], [2, 7, 1]],
-        dtype=x.dtype.as_numpy_dtype)
-    self.assertAllEqual(
-        np.array([1.2, 1.4, 0.6], dtype=np.float32),
-        ed.evaluate([('mean_squared_error', {'average': None})],
-                    {x: x_data}, n_samples=1, seed=self.RANDOM_SEED))
-    x = Multinomial(total_count=10.0, probs=tf.constant([0.2, 0.7, 0.1]),
-                    sample_shape=5)
-    x_data = tf.constant(
-        [[2, 7, 1], [3, 6, 1], [3, 5, 2], [4, 4, 2], [2, 7, 1]],
-        dtype=x.dtype.as_numpy_dtype)
-    self.assertAllClose(
-        1.066666603088379,
-        ed.evaluate([('mean_squared_error', {'average': 'macro'})],
-                    {x: x_data}, n_samples=1, seed=self.RANDOM_SEED))
-    x = Multinomial(total_count=10.0, probs=tf.constant([0.2, 0.7, 0.1]),
-                    sample_shape=5)
-    x_data = tf.constant(
-        [[2, 7, 1], [3, 6, 1], [3, 5, 2], [4, 4, 2], [2, 7, 1]],
-        dtype=x.dtype.as_numpy_dtype)
-    self.assertAllClose(
-        1.0666667222976685,
-        ed.evaluate([('mean_squared_error', {'average': 'micro'})],
-                    {x: x_data}, n_samples=1, seed=self.RANDOM_SEED))
-
-  def test_data(self):
-    with self.test_session():
-      x_ph = tf.placeholder(tf.float32, [])
-      x = Normal(loc=x_ph, scale=1.0)
-      y = 2.0 * Normal(loc=0.0, scale=1.0)
-      x_data = tf.constant(0.0)
-      x_ph_data = np.array(0.0)
-      y_data = tf.constant(20.0)
-      ed.evaluate('mean_squared_error', {x: x_data, x_ph: x_ph_data},
-                  n_samples=1)
-      ed.evaluate('mean_squared_error', {y: y_data}, n_samples=1)
-      self.assertRaises(TypeError, ed.evaluate, 'mean_squared_error',
-                        {'y': y_data}, n_samples=1)
-
-  def test_n_samples(self):
-    with self.test_session():
-      x = Normal(loc=0.0, scale=1.0)
-      x_data = tf.constant(0.0)
-      ed.evaluate('mean_squared_error', {x: x_data}, n_samples=1)
-      ed.evaluate('mean_squared_error', {x: x_data}, n_samples=5)
-      self.assertRaises(TypeError, ed.evaluate, 'mean_squared_error',
-                        {x: x_data}, n_samples='1')
-
-  def test_output_key(self):
-    with self.test_session():
-      x_ph = tf.placeholder(tf.float32, [])
-      x = Normal(loc=x_ph, scale=1.0)
-      y = 2.0 * x
-      x_data = tf.constant(0.0)
-      x_ph_data = np.array(0.0)
-      y_data = tf.constant(20.0)
-      ed.evaluate('mean_squared_error', {x: x_data, x_ph: x_ph_data},
-                  n_samples=1)
-      ed.evaluate('mean_squared_error', {y: y_data, x_ph: x_ph_data},
-                  n_samples=1)
-      ed.evaluate('mean_squared_error', {x: x_data, y: y_data, x_ph: x_ph_data},
-                  n_samples=1, output_key=x)
-      self.assertRaises(KeyError, ed.evaluate, 'mean_squared_error',
-                        {x: x_data, y: y_data, x_ph: x_ph_data}, n_samples=1)
-      self.assertRaises(TypeError, ed.evaluate, 'mean_squared_error',
-                        {x: x_data, y: y_data, x_ph: x_ph_data}, n_samples=1,
-                        output_key='x')
-
-  def test_custom_metric(self):
-    def logcosh(y_true, y_pred):
-      diff = y_pred - y_true
-      return tf.reduce_mean(diff + tf.nn.softplus(-2.0 * diff) - tf.log(2.0),
-                            axis=-1)
-    with self.test_session():
-      x = Normal(loc=0.0, scale=1.0)
-      x_data = tf.constant(0.0)
-      ed.evaluate(logcosh, {x: x_data}, n_samples=1)
-      ed.evaluate(['mean_squared_error', logcosh], {x: x_data}, n_samples=1)
-      self.assertRaises(NotImplementedError, ed.evaluate, 'logcosh',
-                        {x: x_data}, n_samples=1)
-
-if __name__ == '__main__':
-  tf.test.main()
diff --git a/tests/criticisms/metrics_test.py b/tests/criticisms/metrics_test.py
deleted file mode 100644
index 452e357e8..000000000
--- a/tests/criticisms/metrics_test.py
+++ /dev/null
@@ -1,114 +0,0 @@
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import numpy as np
-import tensorflow as tf
-
-from edward.criticisms.evaluate import *
-
-all_classification_metrics = [
-    binary_accuracy,
-    sparse_categorical_accuracy,
-]
-
-all_real_classification_metrics = [
-    binary_crossentropy,
-    categorical_crossentropy,
-    hinge,
-    squared_hinge,
-]
-
-all_regression_metrics = [
-    mean_squared_error,
-    mean_absolute_error,
-    mean_absolute_percentage_error,
-    mean_squared_logarithmic_error,
-    poisson,
-    cosine_proximity,
-]
-
-all_specialized_input_output_metrics = [
-    categorical_accuracy,
-    sparse_categorical_crossentropy,
-    kl_divergence
-]
-
-all_metrics_with_binary_averaging = [
-    mean_squared_error,
-    mean_absolute_error,
-    mean_absolute_percentage_error,
-    mean_squared_logarithmic_error
-]
-
-
-class test_metrics_class(tf.test.TestCase):
-
-  def _check_averaging(self, metric, y_true, y_pred):
-    n_classes = tf.squeeze(tf.shape(y_true)[-1]).eval()
-    class_scores = [metric(y_true[i], y_pred[i]) for i in range(n_classes)]
-
-    # No averaging
-    no_average = metric(y_true, y_pred, average=None)
-    expected_no_average = tf.stack(class_scores)
-    self.assertAllEqual(no_average.eval(), expected_no_average.eval())
-
-    # Macro-averaging
-    macro_average = metric(y_true, y_pred, average='macro')
-    expected_macro_average = tf.reduce_mean(tf.stack(class_scores))
-    self.assertAllEqual(macro_average.eval(), expected_macro_average.eval())
-
-    # Micro-averaging
-    micro_average = metric(y_true, y_pred, average='micro')
-    expected_micro_average = metric(tf.reshape(y_true, [1, -1]),
-                                    tf.reshape(y_pred, [1, -1]))
-    self.assertAllEqual(micro_average.eval(), expected_micro_average.eval())
-
-  def test_classification_metrics(self):
-    with self.test_session():
-      y_true = tf.convert_to_tensor(np.random.randint(0, 1, (2, 3)))
-      y_pred = tf.convert_to_tensor(np.random.randint(0, 1, (2, 3)))
-      for metric in all_classification_metrics:
-        self.assertEqual(metric(y_true, y_pred).eval().shape, ())
-
-  def test_real_classification_metrics(self):
-    with self.test_session():
-      y_true = tf.convert_to_tensor(np.random.randint(0, 5, (6, 7)))
-      y_pred = tf.random_normal([6, 7])
-      for metric in all_real_classification_metrics:
-        self.assertEqual(metric(y_true, y_pred).eval().shape, ())
-
-  def test_regression_metrics(self):
-    with self.test_session():
-      y_true = tf.random_normal([6, 7])
-      y_pred = tf.random_normal([6, 7])
-      for metric in all_regression_metrics:
-        self.assertEqual(metric(y_true, y_pred).eval().shape, ())
-
-  def test_specialized_input_output_metrics(self):
-    with self.test_session():
-      for metric in all_specialized_input_output_metrics:
-        if metric == categorical_accuracy:
-          y_true = tf.convert_to_tensor(np.random.randint(0, 1, (6, 7)))
-          y_pred = tf.convert_to_tensor(np.random.randint(0, 7, (6,)))
-          self.assertEqual(metric(y_true, y_pred).eval().shape, ())
-        elif metric == sparse_categorical_crossentropy:
-          y_true = tf.convert_to_tensor(np.random.randint(0, 5, (6)))
-          y_pred = tf.random_normal([6, 7])
-          self.assertEqual(metric(y_true, y_pred).eval().shape, ())
-        elif metric == kl_divergence:
-          y_true = tf.nn.softmax(tf.random_normal([6]))
-          y_pred = tf.nn.softmax(tf.random_normal([6]))
-          self.assertEqual(metric(y_true, y_pred).eval().shape, ())
-        else:
-          raise NotImplementedError()
-
-  def test_metrics_with_binary_averaging(self):
-    with self.test_session():
-      y_true = tf.constant([[1.0, 2.0, 3.0], [2.0, 3.0, 4.0], [3.0, 4.0, 5.0]])
-      y_pred = tf.constant([[2.0, 4.0, 6.0], [4.0, 6.0, 8.0], [6.0, 8.0, 10.0]])
-      for metric in all_metrics_with_binary_averaging:
-        self._check_averaging(metric, y_true, y_pred)
-
-if __name__ == '__main__':
-  tf.test.main()
diff --git a/tests/criticisms/ppc_plots_test.py b/tests/criticisms/ppc_plots_test.py
deleted file mode 100644
index bf76bc12d..000000000
--- a/tests/criticisms/ppc_plots_test.py
+++ /dev/null
@@ -1,25 +0,0 @@
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import edward as ed
-import numpy as np
-import tensorflow as tf
-
-
-class test_ppc_plots_class(tf.test.TestCase):
-
-  def test_ppc_density_plot(self):
-    y = np.random.randn(20)
-    y_rep = np.random.randn(20, 20)
-
-    ed.ppc_density_plot(y, y_rep)
-
-  def test_ppc_stat_hist_plot(self):
-    y = np.random.randn(20)
-    t = 0.0
-
-    ed.ppc_stat_hist_plot(t, y, stat_name="mean", bins=10)
-
-if __name__ == '__main__':
-  tf.test.main()
diff --git a/tests/criticisms/ppc_test.py b/tests/criticisms/ppc_test.py
deleted file mode 100644
index d06ba0153..000000000
--- a/tests/criticisms/ppc_test.py
+++ /dev/null
@@ -1,52 +0,0 @@
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import edward as ed
-import tensorflow as tf
-
-from edward.models import Normal
-
-
-class test_ppc_class(tf.test.TestCase):
-
-  def test_data(self):
-    with self.test_session():
-      x = Normal(loc=0.0, scale=1.0)
-      y = 2.0 * x
-      x_data = tf.constant(0.0)
-      y_data = tf.constant(0.0)
-      ed.ppc(lambda xs, zs: tf.reduce_mean(xs[x]), {x: x_data}, n_samples=1)
-      ed.ppc(lambda xs, zs: tf.reduce_mean(xs[y]), {y: y_data}, n_samples=1)
-      self.assertRaises(TypeError, ed.ppc, lambda xs, zs: tf.reduce_mean(xs[y]),
-                        {'y': y_data}, n_samples=1)
-
-  def test_latent_vars(self):
-    with self.test_session():
-      x = Normal(loc=0.0, scale=1.0)
-      y = 2.0 * x
-      z = Normal(loc=0.0, scale=1.0)
-      x_data = tf.constant(0.0)
-      y_data = tf.constant(0.0)
-      ed.ppc(lambda xs, zs: tf.reduce_mean(xs[x]) + tf.reduce_mean(zs[z]),
-             {x: x_data}, {z: z}, n_samples=1)
-      ed.ppc(lambda xs, zs: tf.reduce_mean(xs[x]) + tf.reduce_mean(zs[z]),
-             {x: x_data}, {z: y}, n_samples=1)
-      ed.ppc(lambda xs, zs: tf.reduce_mean(xs[x]) + tf.reduce_mean(zs[y]),
-             {x: x_data}, {y: y}, n_samples=1)
-      ed.ppc(lambda xs, zs: tf.reduce_mean(xs[x]) + tf.reduce_mean(zs[y]),
-             {x: x_data}, {y: z}, n_samples=1)
-      self.assertRaises(TypeError, ed.ppc, lambda xs, zs: tf.reduce_mean(xs[x]),
-                        {x: x_data}, {'y': z}, n_samples=1)
-
-  def test_n_samples(self):
-    with self.test_session():
-      x = Normal(loc=0.0, scale=1.0)
-      x_data = tf.constant(0.0)
-      ed.ppc(lambda xs, zs: tf.reduce_mean(xs[x]), {x: x_data}, n_samples=1)
-      ed.ppc(lambda xs, zs: tf.reduce_mean(xs[x]), {x: x_data}, n_samples=5)
-      self.assertRaises(TypeError, ed.ppc, lambda xs, zs: tf.reduce_mean(xs[x]),
-                        {x: x_data}, n_samples='1')
-
-if __name__ == '__main__':
-  tf.test.main()

From bc6caaf1497324447010132adc240c04773c3de8 Mon Sep 17 00:00:00 2001
From: Dustin Tran <dustinviettran@gmail.com>
Date: Mon, 5 Feb 2018 07:23:27 -0800
Subject: [PATCH 04/27] remove RandomVariable from ed namespace

---
 edward/__init__.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/edward/__init__.py b/edward/__init__.py
index d087e8379..78dd81746 100644
--- a/edward/__init__.py
+++ b/edward/__init__.py
@@ -26,7 +26,6 @@
     wake_sleep,
     wgan_inference)
 # from edward.inferences import MonteCarlo, HMC, MetropolisHastings, SGLD, SGHMC, Gibbs
-from edward.models import RandomVariable
 from edward.util import (
     copy, dot,
     get_ancestors, get_blanket, get_children, get_control_variate_coef,
@@ -65,7 +64,6 @@
     'wake_sleep',
     'wgan_inference',
     'Gibbs',
-    'RandomVariable',
     'copy',
     'dot',
     'get_ancestors',

From ead535bb4d7c92275574e8392cc5e91d341311f7 Mon Sep 17 00:00:00 2001
From: Dustin Tran <dustinviettran@gmail.com>
Date: Sun, 28 Jan 2018 04:09:52 -0800
Subject: [PATCH 05/27] remove with_binary_averaging,compute_multinomial_mode

---
 edward/util/__init__.py             |  3 --
 edward/util/metrics.py              | 46 -------------------------
 edward/util/random_variables.py     | 52 -----------------------------
 tests/util/random_variables_test.py | 29 ----------------
 4 files changed, 130 deletions(-)
 delete mode 100644 edward/util/metrics.py
 delete mode 100644 tests/util/random_variables_test.py

diff --git a/edward/util/__init__.py b/edward/util/__init__.py
index 7a2b650ba..40d4417e6 100644
--- a/edward/util/__init__.py
+++ b/edward/util/__init__.py
@@ -5,7 +5,6 @@
 from __future__ import print_function
 
 from edward.util.graphs import *
-from edward.util.metrics import *
 from edward.util.progbar import *
 from edward.util.random_variables import *
 from edward.util.tensorflow import *
@@ -13,7 +12,6 @@
 from tensorflow.python.util.all_util import remove_undocumented
 
 _allowed_symbols = [
-    'compute_multinomial_mode',
     'copy',
     'dot',
     'get_ancestors',
@@ -32,7 +30,6 @@
     'set_seed',
     'to_simplex',
     'transform',
-    'with_binary_averaging'
 ]
 
 remove_undocumented(__name__, allowed_exception_list=_allowed_symbols)
diff --git a/edward/util/metrics.py b/edward/util/metrics.py
deleted file mode 100644
index 4caf2e725..000000000
--- a/edward/util/metrics.py
+++ /dev/null
@@ -1,46 +0,0 @@
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from functools import wraps
-
-import tensorflow as tf
-
-
-def with_binary_averaging(metric):
-  """
-  Inspired by scikit-learn's _average_binary_score function:
-  https://github.com/scikit-learn/scikit-learn/blob/d9fdd8b0d1053cb47af8e3823b7a05279dd72054/sklearn/metrics/base.py#L23.
-
-  `None`: computes the specified metric along the second-to-last
-  dimension of `y_true` and `y_pred`. Returns a vector of "class-wise"
-  metrics.
-  `'macro'`: same as `None`, except compute the (unweighted) global
-  average of the resulting vector.
-  `'micro'`: flatten `y_true` and `y_pred` into vectors, then compute
-  `'macro'`
-  """
-  AVERAGE_OPTIONS = (None, 'micro', 'macro')
-
-  @wraps(metric)
-  def with_binary_averaging(*args, **kwargs):
-    y_true, y_pred = args
-    y_true = tf.cast(y_true, tf.float32)
-    y_pred = tf.cast(y_pred, tf.float32)
-    if len(y_true.shape) < 2 and len(y_pred.shape) < 2:
-      y_true = tf.expand_dims(y_true, 0)
-      y_pred = tf.expand_dims(y_pred, 0)
-
-    average = kwargs.get('average', 'macro')
-    if average not in AVERAGE_OPTIONS:
-      raise ValueError('average has to be one of {0}'
-                       ''.format(average_options))
-    if average is None:
-      return metric(y_true, y_pred)
-    if average == 'macro':
-      return tf.reduce_mean(metric(y_true, y_pred))
-    if average == 'micro':
-      y_true = tf.reshape(y_true, [1, -1])
-      y_pred = tf.reshape(y_pred, [1, -1])
-      return tf.reduce_mean(metric(y_true, y_pred))
-  return with_binary_averaging
diff --git a/edward/util/random_variables.py b/edward/util/random_variables.py
index fd9b15659..8498c0b94 100644
--- a/edward/util/random_variables.py
+++ b/edward/util/random_variables.py
@@ -2,7 +2,6 @@
 from __future__ import division
 from __future__ import print_function
 
-import numpy as np
 import six
 import tensorflow as tf
 
@@ -850,54 +849,3 @@ def transform(x, *args, **kwargs):
   new_x = TransformedDistribution(x, bij, *args, **kwargs)
   new_x.support = new_support
   return new_x
-
-
-def compute_multinomial_mode(probs, total_count=1, seed=None):
-  """Compute the mode of a Multinomial random variable.
-
-  Args:
-    probs: 1-D Numpy array of Multinomial class probabilities
-    total_count: integer number of trials in single Multinomial draw
-    seed: a Python integer. Used to create a random seed for the
-      distribution
-
-  #### Examples
-
-  ```python
-  # returns either [2, 2, 1], [2, 1, 2] or [1, 2, 2]
-  probs = np.array(3 * [1/3])
-  total_count = 5
-  compute_multinomial_mode(probs, total_count)
-
-  # returns [3, 2, 0]
-  probs = np.array(3 * [1/3])
-  total_count = 5
-  compute_multinomial_mode(probs, total_count)
-  ```
-  """
-  def softmax(vec):
-    numerator = np.exp(vec)
-    return numerator / numerator.sum(axis=0)
-
-  random_state = np.random.RandomState(seed)
-  mode = np.zeros_like(probs, dtype=np.int32)
-  if total_count == 1:
-    mode[np.argmax(probs)] += 1
-    return list(mode)
-  remaining_count = total_count
-  uniform_prob = 1 / total_count
-
-  while remaining_count > 0:
-    if (probs < uniform_prob).all():
-      probs = softmax(probs)
-    mask = probs >= uniform_prob
-    overflow_count = int(mask.sum() - remaining_count)
-    if overflow_count > 0:
-      hot_indices = np.where(mask)[0]
-      cold_indices = random_state.choice(hot_indices, overflow_count,
-                                         replace=False)
-      mask[cold_indices] = False
-    mode[mask] += 1
-    probs[mask] -= uniform_prob
-    remaining_count -= np.sum(mask)
-  return mode
diff --git a/tests/util/random_variables_test.py b/tests/util/random_variables_test.py
deleted file mode 100644
index 1b99f625f..000000000
--- a/tests/util/random_variables_test.py
+++ /dev/null
@@ -1,29 +0,0 @@
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import numpy as np
-import tensorflow as tf
-
-from edward.util.random_variables import compute_multinomial_mode
-
-
-class test_compute_multinomial_mode(tf.test.TestCase):
-
-  RANDOM_SEED = 12345
-
-  def test_correct_mode_computed_with_uniform_probabilities(self):
-    with self.test_session():
-      probs = np.array(3 * [1 / 3.0])
-      total_count = 5
-      self.assertAllEqual(
-          compute_multinomial_mode(probs, total_count, seed=self.RANDOM_SEED),
-          np.array([1, 2, 2]))
-      probs = np.array([0.6, 0.4, 0.0])
-      total_count = 5
-      self.assertAllEqual(
-          compute_multinomial_mode(probs, total_count, seed=self.RANDOM_SEED),
-          np.array([2, 2, 1]))
-
-if __name__ == '__main__':
-  tf.test.main()

From 4d1a20a9508113e71b54fea9e9e15b10cb8bd462 Mon Sep 17 00:00:00 2001
From: Dustin Tran <dustinviettran@gmail.com>
Date: Wed, 24 Jan 2018 17:32:32 -0800
Subject: [PATCH 06/27] remove PointMass,Empirical; use tfd.Deterministic

---
 edward/inferences/laplace.py           |   2 +-
 edward/models/__init__.py              |   4 -
 edward/models/empirical.py             | 125 -------------------------
 edward/models/point_mass.py            | 110 ----------------------
 edward/util/random_variables.py        |   3 +-
 tests/models/empirical_sample_test.py  |  34 -------
 tests/models/point_mass_sample_test.py |  37 --------
 7 files changed, 3 insertions(+), 312 deletions(-)
 delete mode 100644 edward/models/empirical.py
 delete mode 100644 edward/models/point_mass.py
 delete mode 100644 tests/models/empirical_sample_test.py
 delete mode 100644 tests/models/point_mass_sample_test.py

diff --git a/edward/inferences/laplace.py b/edward/inferences/laplace.py
index c44d2455c..f97d2cff2 100644
--- a/edward/inferences/laplace.py
+++ b/edward/inferences/laplace.py
@@ -6,7 +6,7 @@
 import tensorflow as tf
 
 from edward.inferences.map import map
-from edward.models import PointMass, RandomVariable
+from edward.models import RandomVariable
 from edward.util import get_session, get_variables
 from edward.util import copy, transform
 
diff --git a/edward/models/__init__.py b/edward/models/__init__.py
index 2b2eaa2cc..fc1963f89 100644
--- a/edward/models/__init__.py
+++ b/edward/models/__init__.py
@@ -5,9 +5,7 @@
 from __future__ import print_function
 
 from edward.models.dirichlet_process import *
-from edward.models.empirical import *
 from edward.models.param_mixture import *
-from edward.models.point_mass import *
 from edward.models.random_variable import RandomVariable
 from edward.models.random_variables import *
 
@@ -16,9 +14,7 @@
 
 _allowed_symbols = [
     'DirichletProcess',
-    'Empirical',
     'ParamMixture',
-    'PointMass',
     'RandomVariable',
 ]
 for name in dir(_module):
diff --git a/edward/models/empirical.py b/edward/models/empirical.py
deleted file mode 100644
index 7da9b8265..000000000
--- a/edward/models/empirical.py
+++ /dev/null
@@ -1,125 +0,0 @@
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import tensorflow as tf
-
-from edward.models.random_variable import RandomVariable
-from tensorflow.contrib.distributions import Distribution
-
-try:
-  from tensorflow.contrib.distributions import FULLY_REPARAMETERIZED
-except Exception as e:
-  raise ImportError("{0}. Your TensorFlow version is not supported.".format(e))
-
-
-class distributions_Empirical(Distribution):
-  """Empirical random variable.
-
-  #### Examples
-
-  ```python
-  # 100 samples of a scalar
-  x = Empirical(params=tf.zeros(100))
-  assert x.shape == ()
-
-  # 5 samples of a 2 x 3 matrix
-  x = Empirical(params=tf.zeros([5, 2, 3]))
-  assert x.shape == (2, 3)
-  ```
-  """
-  def __init__(self,
-               params,
-               validate_args=False,
-               allow_nan_stats=True,
-               name="Empirical"):
-    """Initialize an `Empirical` random variable.
-
-    Args:
-      params: tf.Tensor.
-      Collection of samples. Its outer (left-most) dimension
-      determines the number of samples.
-    """
-    parameters = locals()
-    with tf.name_scope(name, values=[params]):
-      with tf.control_dependencies([]):
-        self._params = tf.identity(params, name="params")
-        try:
-          self._n = tf.shape(self._params)[0]
-        except ValueError:  # scalar params
-          self._n = tf.constant(1)
-
-    super(distributions_Empirical, self).__init__(
-        dtype=self._params.dtype,
-        reparameterization_type=FULLY_REPARAMETERIZED,
-        validate_args=validate_args,
-        allow_nan_stats=allow_nan_stats,
-        parameters=parameters,
-        graph_parents=[self._params, self._n],
-        name=name)
-
-  @staticmethod
-  def _param_shapes(sample_shape):
-    return {"params": tf.convert_to_tensor(sample_shape, dtype=tf.int32)}
-
-  @property
-  def params(self):
-    """Distribution parameter."""
-    return self._params
-
-  @property
-  def n(self):
-    """Number of samples."""
-    return self._n
-
-  def _batch_shape_tensor(self):
-    return tf.constant([], dtype=tf.int32)
-
-  def _batch_shape(self):
-    return tf.TensorShape([])
-
-  def _event_shape_tensor(self):
-    return tf.shape(self.params)[1:]
-
-  def _event_shape(self):
-    return self.params.shape[1:]
-
-  def _mean(self):
-    return tf.reduce_mean(self.params, 0)
-
-  def _stddev(self):
-    # broadcasting n x shape - shape = n x shape
-    r = self.params - self.mean()
-    return tf.sqrt(tf.reduce_mean(tf.square(r), 0))
-
-  def _variance(self):
-    return tf.square(self.stddev())
-
-  def _sample_n(self, n, seed=None):
-    input_tensor = self.params
-    if len(input_tensor.shape) == 0:
-      input_tensor = tf.expand_dims(input_tensor, 0)
-      multiples = tf.concat(
-          [tf.expand_dims(n, 0), [1] * len(self.event_shape)], 0)
-      return tf.tile(input_tensor, multiples)
-    else:
-      probs = tf.ones([self.n]) / tf.cast(self.n, dtype=tf.float32)
-      cat = tf.contrib.distributions.Categorical(probs)
-      indices = cat._sample_n(n, seed)
-      tensor = tf.gather(input_tensor, indices)
-      return tensor
-
-
-# Generate random variable class similar to autogenerated ones from TensorFlow.
-def __init__(self, *args, **kwargs):
-  RandomVariable.__init__(self, *args, **kwargs)
-
-
-_name = 'Empirical'
-_candidate = distributions_Empirical
-__init__.__doc__ = _candidate.__init__.__doc__
-_globals = globals()
-_params = {'__doc__': _candidate.__doc__,
-           '__init__': __init__,
-           'support': 'points'}
-_globals[_name] = type(_name, (RandomVariable, _candidate), _params)
diff --git a/edward/models/point_mass.py b/edward/models/point_mass.py
deleted file mode 100644
index b63031b6d..000000000
--- a/edward/models/point_mass.py
+++ /dev/null
@@ -1,110 +0,0 @@
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import tensorflow as tf
-
-from edward.models.random_variable import RandomVariable
-from tensorflow.contrib.distributions import Distribution
-
-try:
-  from tensorflow.contrib.distributions import FULLY_REPARAMETERIZED
-except Exception as e:
-  raise ImportError("{0}. Your TensorFlow version is not supported.".format(e))
-
-
-class distributions_PointMass(Distribution):
-  """PointMass random variable.
-
-  It is analogous to an Empirical random variable with one sample, but
-  its parameter argument does not have an outer dimension.
-
-  #### Examples
-
-  ```python
-  # scalar
-  x = PointMass(params=28.3)
-  assert x.shape == ()
-
-  # 5 x 2 x 3 tensor
-  x = PointMass(params=tf.zeros([5, 2, 3]))
-  assert x.shape == (5, 2, 3)
-  ```
-  """
-  def __init__(self,
-               params,
-               validate_args=False,
-               allow_nan_stats=True,
-               name="PointMass"):
-    """Initialize a `PointMass` random variable.
-
-    Args:
-      params: tf.Tensor.
-        The location with all probability mass.
-    """
-    parameters = locals()
-    with tf.name_scope(name, values=[params]):
-      with tf.control_dependencies([]):
-        self._params = tf.identity(params, name="params")
-
-    super(distributions_PointMass, self).__init__(
-        dtype=self._params.dtype,
-        reparameterization_type=FULLY_REPARAMETERIZED,
-        validate_args=validate_args,
-        allow_nan_stats=allow_nan_stats,
-        parameters=parameters,
-        graph_parents=[self._params],
-        name=name)
-
-  @staticmethod
-  def _param_shapes(sample_shape):
-    return {"params": tf.expand_dims(
-        tf.convert_to_tensor(sample_shape, dtype=tf.int32), 0)}
-
-  @property
-  def params(self):
-    """Distribution parameter."""
-    return self._params
-
-  def _batch_shape_tensor(self):
-    return tf.constant([], dtype=tf.int32)
-
-  def _batch_shape(self):
-    return tf.TensorShape([])
-
-  def _event_shape_tensor(self):
-    return tf.shape(self.params)
-
-  def _event_shape(self):
-    return self.params.shape
-
-  def _mean(self):
-    return self.params
-
-  def _stddev(self):
-    return 0.0 * tf.ones_like(self.params)
-
-  def _variance(self):
-    return tf.square(self.stddev())
-
-  def _sample_n(self, n, seed=None):
-    input_tensor = self.params
-    input_tensor = tf.expand_dims(input_tensor, 0)
-    multiples = tf.concat(
-        [tf.expand_dims(n, 0), [1] * len(self.event_shape)], 0)
-    return tf.tile(input_tensor, multiples)
-
-
-# Generate random variable class similar to autogenerated ones from TensorFlow.
-def __init__(self, *args, **kwargs):
-  RandomVariable.__init__(self, *args, **kwargs)
-
-
-_name = 'PointMass'
-_candidate = distributions_PointMass
-__init__.__doc__ = _candidate.__init__.__doc__
-_globals = globals()
-_params = {'__doc__': _candidate.__doc__,
-           '__init__': __init__,
-           'support': 'point'}
-_globals[_name] = type(_name, (RandomVariable, _candidate), _params)
diff --git a/edward/util/random_variables.py b/edward/util/random_variables.py
index 8498c0b94..b1e62073f 100644
--- a/edward/util/random_variables.py
+++ b/edward/util/random_variables.py
@@ -8,7 +8,6 @@
 from copy import deepcopy
 from edward.models.random_variable import RandomVariable
 from edward.models.random_variables import TransformedDistribution
-from edward.models import PointMass
 from edward.util.graphs import random_variables
 from tensorflow.core.framework import attr_value_pb2
 from tensorflow.python.framework.ops import set_shapes_for_outputs
@@ -764,6 +763,8 @@ def is_independent(a, b, condition=None):
         for parent in get_parents(node):
           schedule.append((parent, "child"))
 
+      # TODO: PointMass was removed in this commit; this import fails.
+      from edward.models import PointMass
       if not isinstance(node, PointMass) and node not in bottom_marked:
         bottom_marked.add(node)
         if node in A:
diff --git a/tests/models/empirical_sample_test.py b/tests/models/empirical_sample_test.py
deleted file mode 100644
index c58462dc7..000000000
--- a/tests/models/empirical_sample_test.py
+++ /dev/null
@@ -1,34 +0,0 @@
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import numpy as np
-import tensorflow as tf
-
-from edward.models import Empirical
-
-
-class test_empirical_sample_class(tf.test.TestCase):
-
-  def _test(self, params, n):
-    x = Empirical(params=params)
-    val_est = x.sample(n).shape.as_list()
-    val_true = n + tf.convert_to_tensor(params).shape.as_list()[1:]
-    self.assertEqual(val_est, val_true)
-
-  def test_0d(self):
-    with self.test_session():
-      self._test(0.5, [1])
-      self._test(np.array(0.5), [1])
-      self._test(tf.constant(0.5), [1])
-      self._test(np.array([0.5]), [1])
-      self._test(np.array([0.5]), [5])
-      self._test(np.array([0.2, 0.8]), [1])
-      self._test(np.array([0.2, 0.8]), [10])
-      self._test(tf.constant([0.5]), [1])
-      self._test(tf.constant([0.5]), [5])
-      self._test(tf.constant([0.2, 0.8]), [1])
-      self._test(tf.constant([0.2, 0.8]), [10])
-
-if __name__ == '__main__':
-  tf.test.main()
diff --git a/tests/models/point_mass_sample_test.py b/tests/models/point_mass_sample_test.py
deleted file mode 100644
index 7d240bb0f..000000000
--- a/tests/models/point_mass_sample_test.py
+++ /dev/null
@@ -1,37 +0,0 @@
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import numpy as np
-import tensorflow as tf
-
-from edward.models import PointMass
-
-
-class test_pointmass_sample_class(tf.test.TestCase):
-
-  def _test(self, params, n):
-    x = PointMass(params=params)
-    val_est = x.sample(n).shape.as_list()
-    val_true = n + tf.convert_to_tensor(params).shape.as_list()
-    self.assertEqual(val_est, val_true)
-
-  def test_0d(self):
-    with self.test_session():
-      self._test(0.5, [1])
-      self._test(np.array(0.5), [1])
-      self._test(tf.constant(0.5), [1])
-
-  def test_1d(self):
-    with self.test_session():
-      self._test(np.array([0.5]), [1])
-      self._test(np.array([0.5]), [5])
-      self._test(np.array([0.2, 0.8]), [1])
-      self._test(np.array([0.2, 0.8]), [10])
-      self._test(tf.constant([0.5]), [1])
-      self._test(tf.constant([0.5]), [5])
-      self._test(tf.constant([0.2, 0.8]), [1])
-      self._test(tf.constant([0.2, 0.8]), [10])
-
-if __name__ == '__main__':
-  tf.test.main()

From 2e930e67d29933647244d98f7ba41bcc51a7e0dd Mon Sep 17 00:00:00 2001
From: Dustin Tran <dustinviettran@gmail.com>
Date: Thu, 1 Feb 2018 22:54:39 -0800
Subject: [PATCH 07/27] mv DirichletProcess,ParamMixture to examples/

---
 edward/models/__init__.py                        | 4 ----
 {edward/models => examples}/dirichlet_process.py | 0
 {edward/models => examples}/param_mixture.py     | 0
 3 files changed, 4 deletions(-)
 rename {edward/models => examples}/dirichlet_process.py (100%)
 rename {edward/models => examples}/param_mixture.py (100%)

diff --git a/edward/models/__init__.py b/edward/models/__init__.py
index fc1963f89..3cba416a6 100644
--- a/edward/models/__init__.py
+++ b/edward/models/__init__.py
@@ -4,8 +4,6 @@
 from __future__ import division
 from __future__ import print_function
 
-from edward.models.dirichlet_process import *
-from edward.models.param_mixture import *
 from edward.models.random_variable import RandomVariable
 from edward.models.random_variables import *
 
@@ -13,8 +11,6 @@
 from edward.models import random_variables as _module
 
 _allowed_symbols = [
-    'DirichletProcess',
-    'ParamMixture',
     'RandomVariable',
 ]
 for name in dir(_module):
diff --git a/edward/models/dirichlet_process.py b/examples/dirichlet_process.py
similarity index 100%
rename from edward/models/dirichlet_process.py
rename to examples/dirichlet_process.py
diff --git a/edward/models/param_mixture.py b/examples/param_mixture.py
similarity index 100%
rename from edward/models/param_mixture.py
rename to examples/param_mixture.py

From 4d063b5331f98bf15184c1d28fd8d5dd66c6c205 Mon Sep 17 00:00:00 2001
From: Dustin Tran <dustinviettran@gmail.com>
Date: Mon, 15 Jan 2018 23:03:19 -0800
Subject: [PATCH 08/27] remove set_seed

---
 edward/__init__.py      |  3 +--
 edward/util/__init__.py |  1 -
 edward/util/graphs.py   | 19 -------------------
 3 files changed, 1 insertion(+), 22 deletions(-)

diff --git a/edward/__init__.py b/edward/__init__.py
index 78dd81746..562f9b71f 100644
--- a/edward/__init__.py
+++ b/edward/__init__.py
@@ -30,7 +30,7 @@
     copy, dot,
     get_ancestors, get_blanket, get_children, get_control_variate_coef,
     get_descendants, get_parents, get_session, get_siblings, get_variables,
-    is_independent, Progbar, random_variables, rbf, set_seed,
+    is_independent, Progbar, random_variables, rbf,
     to_simplex, transform)
 from edward.version import __version__, VERSION
 
@@ -79,7 +79,6 @@
     'Progbar',
     'random_variables',
     'rbf',
-    'set_seed',
     'to_simplex',
     'transform',
     '__version__',
diff --git a/edward/util/__init__.py b/edward/util/__init__.py
index 40d4417e6..0a7cdc90c 100644
--- a/edward/util/__init__.py
+++ b/edward/util/__init__.py
@@ -27,7 +27,6 @@
     'Progbar',
     'random_variables',
     'rbf',
-    'set_seed',
     'to_simplex',
     'transform',
 ]
diff --git a/edward/util/graphs.py b/edward/util/graphs.py
index d56c2ea89..8a26aaaa0 100644
--- a/edward/util/graphs.py
+++ b/edward/util/graphs.py
@@ -2,8 +2,6 @@
 from __future__ import division
 from __future__ import print_function
 
-import numpy as np
-import six
 import sys
 import tensorflow as tf
 
@@ -54,20 +52,3 @@ def random_variables(graph=None):
     graph = tf.get_default_graph()
 
   return _RANDOM_VARIABLE_COLLECTION[graph]
-
-
-def set_seed(x):
-  """Set seed for both NumPy and TensorFlow.
-
-  Args:
-    x: int, float.
-      seed
-  """
-  node_names = list(six.iterkeys(tf.get_default_graph()._nodes_by_name))
-  if len(node_names) > 0 and node_names != ['keras_learning_phase']:
-    raise RuntimeError("Seeding is not supported after initializing "
-                       "part of the graph. "
-                       "Please move set_seed to the beginning of your code.")
-
-  np.random.seed(x)
-  tf.set_random_seed(x)

From 85461402b30ab8a9963e2b2abcb462e06f835f93 Mon Sep 17 00:00:00 2001
From: Dustin Tran <dustinviettran@gmail.com>
Date: Mon, 15 Jan 2018 23:06:06 -0800
Subject: [PATCH 09/27] remove get_session

---
 edward/__init__.py                   |  3 +--
 edward/inferences/bigan_inference.py |  1 -
 edward/inferences/gan_inference.py   |  1 -
 edward/inferences/implicit_klqp.py   |  2 +-
 edward/inferences/laplace.py         |  2 +-
 edward/inferences/wgan_inference.py  |  1 -
 edward/util/__init__.py              |  1 -
 edward/util/graphs.py                | 32 ----------------------------
 8 files changed, 3 insertions(+), 40 deletions(-)

diff --git a/edward/__init__.py b/edward/__init__.py
index 562f9b71f..a99ed78b0 100644
--- a/edward/__init__.py
+++ b/edward/__init__.py
@@ -29,7 +29,7 @@
 from edward.util import (
     copy, dot,
     get_ancestors, get_blanket, get_children, get_control_variate_coef,
-    get_descendants, get_parents, get_session, get_siblings, get_variables,
+    get_descendants, get_parents, get_siblings, get_variables,
     is_independent, Progbar, random_variables, rbf,
     to_simplex, transform)
 from edward.version import __version__, VERSION
@@ -72,7 +72,6 @@
     'get_control_variate_coef',
     'get_descendants',
     'get_parents',
-    'get_session',
     'get_siblings',
     'get_variables',
     'is_independent',
diff --git a/edward/inferences/bigan_inference.py b/edward/inferences/bigan_inference.py
index 57e7c99e6..7a440a547 100644
--- a/edward/inferences/bigan_inference.py
+++ b/edward/inferences/bigan_inference.py
@@ -7,7 +7,6 @@
 
 from edward.inferences.inference import (check_and_maybe_build_data,
     check_and_maybe_build_latent_vars, transform, check_and_maybe_build_dict, check_and_maybe_build_var_list)
-from edward.util import get_session
 
 
 def bigan_inference(latent_vars=None, data=None, discriminator=None,
diff --git a/edward/inferences/gan_inference.py b/edward/inferences/gan_inference.py
index 40dd99859..a1b979733 100644
--- a/edward/inferences/gan_inference.py
+++ b/edward/inferences/gan_inference.py
@@ -7,7 +7,6 @@
 
 from edward.inferences.inference import (check_and_maybe_build_data,
     transform, check_and_maybe_build_dict, check_and_maybe_build_var_list)
-from edward.util import get_session
 
 
 def gan_inference(data=None, discriminator=None,
diff --git a/edward/inferences/implicit_klqp.py b/edward/inferences/implicit_klqp.py
index 5dcc485f0..9dd5cb66a 100644
--- a/edward/inferences/implicit_klqp.py
+++ b/edward/inferences/implicit_klqp.py
@@ -8,7 +8,7 @@
 from edward.inferences.inference import (check_and_maybe_build_data,
     check_and_maybe_build_latent_vars, transform, check_and_maybe_build_dict, check_and_maybe_build_var_list)
 from edward.models import RandomVariable
-from edward.util import copy, get_session
+from edward.util import copy
 
 
 def implicit_klqp(latent_vars=None, data=None, discriminator=None,
diff --git a/edward/inferences/laplace.py b/edward/inferences/laplace.py
index f97d2cff2..13de2659b 100644
--- a/edward/inferences/laplace.py
+++ b/edward/inferences/laplace.py
@@ -7,7 +7,7 @@
 
 from edward.inferences.map import map
 from edward.models import RandomVariable
-from edward.util import get_session, get_variables
+from edward.util import get_variables
 from edward.util import copy, transform
 
 try:
diff --git a/edward/inferences/wgan_inference.py b/edward/inferences/wgan_inference.py
index 1c850cf2e..8914339a2 100644
--- a/edward/inferences/wgan_inference.py
+++ b/edward/inferences/wgan_inference.py
@@ -7,7 +7,6 @@
 
 from edward.inferences.inference import (check_and_maybe_build_data,
     transform, check_and_maybe_build_dict, check_and_maybe_build_var_list)
-from edward.util import get_session
 
 
 def wgan_inference(data=None, discriminator=None,
diff --git a/edward/util/__init__.py b/edward/util/__init__.py
index 0a7cdc90c..d9744fe07 100644
--- a/edward/util/__init__.py
+++ b/edward/util/__init__.py
@@ -20,7 +20,6 @@
     'get_control_variate_coef',
     'get_descendants',
     'get_parents',
-    'get_session',
     'get_siblings',
     'get_variables',
     'is_independent',
diff --git a/edward/util/graphs.py b/edward/util/graphs.py
index 8a26aaaa0..0ebbef80d 100644
--- a/edward/util/graphs.py
+++ b/edward/util/graphs.py
@@ -2,43 +2,11 @@
 from __future__ import division
 from __future__ import print_function
 
-import sys
 import tensorflow as tf
 
 from edward.models.random_variable import _RANDOM_VARIABLE_COLLECTION
 
 
-def get_session():
-  """Get the globally defined TensorFlow session.
-
-  If the session is not already defined, then the function will create
-  a global session.
-
-  Returns:
-    _ED_SESSION: tf.InteractiveSession.
-  """
-  global _ED_SESSION
-  if tf.get_default_session() is None:
-    _ED_SESSION = tf.InteractiveSession()
-  else:
-    _ED_SESSION = tf.get_default_session()
-
-  save_stderr = sys.stderr
-  try:
-    import os
-    sys.stderr = open(os.devnull, 'w')  # suppress keras import
-    from keras import backend as K
-    sys.stderr = save_stderr
-    have_keras = True
-  except ImportError:
-    sys.stderr = save_stderr
-    have_keras = False
-  if have_keras:
-    K.set_session(_ED_SESSION)
-
-  return _ED_SESSION
-
-
 def random_variables(graph=None):
   """Return all random variables in the TensorFlow graph.
 

From 9146fba38ed6fc8bf2ec7911c503b60dcdd3fafb Mon Sep 17 00:00:00 2001
From: Dustin Tran <dustinviettran@gmail.com>
Date: Sun, 28 Jan 2018 03:34:55 -0800
Subject: [PATCH 10/27] remove Progbar

---
 edward/__init__.py      |   3 +-
 edward/util/__init__.py |   2 -
 edward/util/progbar.py  | 115 ----------------------------------------
 3 files changed, 1 insertion(+), 119 deletions(-)
 delete mode 100644 edward/util/progbar.py

diff --git a/edward/__init__.py b/edward/__init__.py
index a99ed78b0..825fecb19 100644
--- a/edward/__init__.py
+++ b/edward/__init__.py
@@ -30,7 +30,7 @@
     copy, dot,
     get_ancestors, get_blanket, get_children, get_control_variate_coef,
     get_descendants, get_parents, get_siblings, get_variables,
-    is_independent, Progbar, random_variables, rbf,
+    is_independent, random_variables, rbf,
     to_simplex, transform)
 from edward.version import __version__, VERSION
 
@@ -75,7 +75,6 @@
     'get_siblings',
     'get_variables',
     'is_independent',
-    'Progbar',
     'random_variables',
     'rbf',
     'to_simplex',
diff --git a/edward/util/__init__.py b/edward/util/__init__.py
index d9744fe07..279a1354a 100644
--- a/edward/util/__init__.py
+++ b/edward/util/__init__.py
@@ -5,7 +5,6 @@
 from __future__ import print_function
 
 from edward.util.graphs import *
-from edward.util.progbar import *
 from edward.util.random_variables import *
 from edward.util.tensorflow import *
 
@@ -23,7 +22,6 @@
     'get_siblings',
     'get_variables',
     'is_independent',
-    'Progbar',
     'random_variables',
     'rbf',
     'to_simplex',
diff --git a/edward/util/progbar.py b/edward/util/progbar.py
deleted file mode 100644
index 4886d1d91..000000000
--- a/edward/util/progbar.py
+++ /dev/null
@@ -1,115 +0,0 @@
-# -*- coding: utf-8 -*-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import numpy as np
-import six
-import sys
-import time
-
-
-class Progbar(object):
-  def __init__(self, target, width=30, interval=0.01, verbose=1):
-    """(Yet another) progress bar.
-
-    Args:
-      target: int.
-        Total number of steps expected.
-      width: int.
-        Width of progress bar.
-      interval: float.
-        Minimum time (in seconds) for progress bar to be displayed
-        during updates.
-      verbose: int.
-        Level of verbosity. 0 suppresses output; 1 is default.
-    """
-    self.target = target
-    self.width = width
-    self.interval = interval
-    self.verbose = verbose
-
-    self.stored_values = {}
-    self.start = time.time()
-    self.last_update = 0
-    self.total_width = 0
-    self.seen_so_far = 0
-
-  def update(self, current, values=None, force=False):
-    """Update progress bar, and print to standard output if `force`
-    is True, or the last update was completed longer than `interval`
-    amount of time ago, or `current` >= `target`.
-
-    The written output is the progress bar and all unique values.
-
-    Args:
-      current: int.
-        Index of current step.
-      values: dict of str to float.
-        Dict of name by value-for-last-step. The progress bar
-        will display averages for these values.
-      force: bool.
-        Whether to force visual progress update.
-    """
-    if values is None:
-      values = {}
-
-    for k, v in six.iteritems(values):
-      self.stored_values[k] = v
-
-    self.seen_so_far = current
-
-    now = time.time()
-    if (not force and
-            (now - self.last_update) < self.interval and
-            current < self.target):
-      return
-
-    self.last_update = now
-    if self.verbose == 0:
-      return
-
-    prev_total_width = self.total_width
-    sys.stdout.write("\b" * prev_total_width)
-    sys.stdout.write("\r")
-
-    # Write progress bar to stdout.
-    n_digits = len(str(self.target))
-    bar = '%%%dd/%%%dd' % (n_digits, n_digits) % (current, self.target)
-    bar += ' [{0}%]'.format(str(int(current / self.target * 100)).rjust(3))
-    bar += ' '
-    prog_width = int(self.width * float(current) / self.target)
-    if prog_width > 0:
-      try:
-        bar += ('█' * prog_width)
-      except UnicodeEncodeError:
-        bar += ('*' * prog_width)
-
-    bar += (' ' * (self.width - prog_width))
-    sys.stdout.write(bar)
-
-    # Write values to stdout.
-    if current:
-      time_per_unit = (now - self.start) / current
-    else:
-      time_per_unit = 0
-
-    eta = time_per_unit * (self.target - current)
-    info = ''
-    if current < self.target:
-      info += ' ETA: %ds' % eta
-    else:
-      info += ' Elapsed: %ds' % (now - self.start)
-
-    for k, v in six.iteritems(self.stored_values):
-      info += ' | {0:s}: {1:0.3f}'.format(k, v)
-
-    self.total_width = len(bar) + len(info)
-    if prev_total_width > self.total_width:
-      info += ((prev_total_width - self.total_width) * " ")
-
-    sys.stdout.write(info)
-    sys.stdout.flush()
-
-    if current >= self.target:
-      sys.stdout.write("\n")

From a7a306ae51c2045a99f2620ca23fe720bfabd532 Mon Sep 17 00:00:00 2001
From: Dustin Tran <dustinviettran@gmail.com>
Date: Sun, 28 Jan 2018 03:39:07 -0800
Subject: [PATCH 11/27] remove rbf

---
 edward/__init__.py                        |  3 +-
 edward/util/__init__.py                   |  1 -
 edward/util/tensorflow.py                 | 58 ---------------
 examples/cox_process.py                   | 57 +++++++++++++-
 notebooks/supervised_classification.ipynb | 65 +++++++++++++++-
 tests/util/rbf_test.py                    | 91 -----------------------
 6 files changed, 121 insertions(+), 154 deletions(-)
 delete mode 100644 tests/util/rbf_test.py

diff --git a/edward/__init__.py b/edward/__init__.py
index 825fecb19..ca729ee5e 100644
--- a/edward/__init__.py
+++ b/edward/__init__.py
@@ -30,7 +30,7 @@
     copy, dot,
     get_ancestors, get_blanket, get_children, get_control_variate_coef,
     get_descendants, get_parents, get_siblings, get_variables,
-    is_independent, random_variables, rbf,
+    is_independent, random_variables,
     to_simplex, transform)
 from edward.version import __version__, VERSION
 
@@ -76,7 +76,6 @@
     'get_variables',
     'is_independent',
     'random_variables',
-    'rbf',
     'to_simplex',
     'transform',
     '__version__',
diff --git a/edward/util/__init__.py b/edward/util/__init__.py
index 279a1354a..0cdd9a480 100644
--- a/edward/util/__init__.py
+++ b/edward/util/__init__.py
@@ -23,7 +23,6 @@
     'get_variables',
     'is_independent',
     'random_variables',
-    'rbf',
     'to_simplex',
     'transform',
 ]
diff --git a/edward/util/tensorflow.py b/edward/util/tensorflow.py
index ca0976471..892db5055 100644
--- a/edward/util/tensorflow.py
+++ b/edward/util/tensorflow.py
@@ -4,8 +4,6 @@
 
 import tensorflow as tf
 
-from tensorflow.python.ops import control_flow_ops
-
 
 def dot(x, y):
   """Compute dot product between a 2-D tensor and a 1-D tensor.
@@ -45,62 +43,6 @@ def dot(x, y):
     return tf.reshape(tf.matmul(mat, tf.expand_dims(vec, 1)), [-1])
 
 
-def rbf(X, X2=None, lengthscale=1.0, variance=1.0):
-  """Radial basis function kernel, also known as the squared
-  exponential or exponentiated quadratic. It is defined as
-
-  $k(x, x') = \sigma^2 \exp\Big(
-      -\\frac{1}{2} \sum_{d=1}^D \\frac{1}{\ell_d^2} (x_d - x'_d)^2 \Big)$
-
-  for output variance $\sigma^2$ and lengthscale $\ell^2$.
-
-  The kernel is evaluated over all pairs of rows, `k(X[i, ], X2[j, ])`.
-  If `X2` is not specified, then it evaluates over all pairs
-  of rows in `X`, `k(X[i, ], X[j, ])`. The output is a matrix
-  where each entry (i, j) is the kernel over the ith and jth rows.
-
-  Args:
-    X: tf.Tensor.
-      N x D matrix of N data points each with D features.
-    X2: tf.Tensor.
-      N x D matrix of N data points each with D features.
-    lengthscale: tf.Tensor.
-      Lengthscale parameter, a positive scalar or D-dimensional vector.
-    variance: tf.Tensor.
-      Output variance parameter, a positive scalar.
-
-  #### Examples
-
-  ```python
-  X = tf.random_normal([100, 5])
-  K = ed.rbf(X)
-  assert K.shape == (100, 100)
-  ```
-  """
-  lengthscale = tf.convert_to_tensor(lengthscale)
-  variance = tf.convert_to_tensor(variance)
-  dependencies = [tf.assert_positive(lengthscale),
-                  tf.assert_positive(variance)]
-  lengthscale = control_flow_ops.with_dependencies(dependencies, lengthscale)
-  variance = control_flow_ops.with_dependencies(dependencies, variance)
-
-  X = tf.convert_to_tensor(X)
-  X = X / lengthscale
-  Xs = tf.reduce_sum(tf.square(X), 1)
-  if X2 is None:
-    X2 = X
-    X2s = Xs
-  else:
-    X2 = tf.convert_to_tensor(X2)
-    X2 = X2 / lengthscale
-    X2s = tf.reduce_sum(tf.square(X2), 1)
-
-  square = tf.reshape(Xs, [-1, 1]) + tf.reshape(X2s, [1, -1]) - \
-      2 * tf.matmul(X, X2, transpose_b=True)
-  output = variance * tf.exp(-square / 2)
-  return output
-
-
 def to_simplex(x):
   """Transform real vector of length `(K-1)` to a simplex of dimension `K`
   using a backward stick breaking construction.
diff --git a/examples/cox_process.py b/examples/cox_process.py
index 1a6f50dbb..c58539d20 100644
--- a/examples/cox_process.py
+++ b/examples/cox_process.py
@@ -25,7 +25,6 @@
 import tensorflow as tf
 
 from edward.models import MultivariateNormalTriL, Normal, Poisson
-from edward.util import rbf
 from scipy.stats import multivariate_normal, poisson
 
 tf.flags.DEFINE_integer("N", default=308, help="Number of NBA players.")
@@ -48,6 +47,62 @@ def build_toy_dataset(N, V):
   return x
 
 
+def rbf(X, X2=None, lengthscale=1.0, variance=1.0):
+  r"""Radial basis function kernel, also known as the squared
+  exponential or exponentiated quadratic. It is defined as
+
+  $k(x, x') = \sigma^2 \exp\Big(
+      -\frac{1}{2} \sum_{d=1}^D \frac{1}{\ell_d^2} (x_d - x'_d)^2 \Big)$
+
+  for output variance $\sigma^2$ and lengthscale $\ell^2$.
+
+  The kernel is evaluated over all pairs of rows, `k(X[i, ], X2[j, ])`.
+  If `X2` is not specified, then it evaluates over all pairs
+  of rows in `X`, `k(X[i, ], X[j, ])`. The output is a matrix
+  where each entry (i, j) is the kernel over the ith and jth rows.
+
+  Args:
+    X: tf.Tensor.
+      N x D matrix of N data points each with D features.
+    X2: tf.Tensor.
+      M x D matrix of M data points each with D features; defaults to `X`.
+    lengthscale: tf.Tensor.
+      Lengthscale parameter, a positive scalar or D-dimensional vector.
+    variance: tf.Tensor.
+      Output variance parameter, a positive scalar.
+
+  #### Examples
+
+  ```python
+  X = tf.random_normal([100, 5])
+  K = rbf(X)
+  assert K.shape == (100, 100)
+  ```
+  """
+  lengthscale = tf.convert_to_tensor(lengthscale)
+  variance = tf.convert_to_tensor(variance)
+  # Gate both parameters on the positivity checks using public TF ops.
+  with tf.control_dependencies([tf.assert_positive(lengthscale),
+                                tf.assert_positive(variance)]):
+    lengthscale = tf.identity(lengthscale)
+    variance = tf.identity(variance)
+
+  X = tf.convert_to_tensor(X)
+  X = X / lengthscale
+  Xs = tf.reduce_sum(tf.square(X), 1)
+  if X2 is None:
+    X2 = X
+    X2s = Xs
+  else:
+    X2 = tf.convert_to_tensor(X2)
+    X2 = X2 / lengthscale
+    X2s = tf.reduce_sum(tf.square(X2), 1)
+
+  square = tf.reshape(Xs, [-1, 1]) + tf.reshape(X2s, [1, -1]) - \
+      2 * tf.matmul(X, X2, transpose_b=True)
+  return variance * tf.exp(-square / 2)
+
+
 def main(_):
   ed.set_seed(42)
 
diff --git a/notebooks/supervised_classification.ipynb b/notebooks/supervised_classification.ipynb
index 5c78c0da2..d4ba9c599 100644
--- a/notebooks/supervised_classification.ipynb
+++ b/notebooks/supervised_classification.ipynb
@@ -31,10 +31,73 @@
     "import tensorflow as tf\n",
     "\n",
     "from edward.models import Bernoulli, MultivariateNormalTriL, Normal\n",
-    "from edward.util import rbf\n",
     "from observations import crabs"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "def rbf(X, X2=None, lengthscale=1.0, variance=1.0):\n",
+    "  r\"\"\"Radial basis function kernel, also known as the squared\n",
+    "  exponential or exponentiated quadratic. It is defined as\n",
+    "\n",
+    "  $k(x, x') = \\sigma^2 \\exp\\Big(\n",
+    "      -\\frac{1}{2} \\sum_{d=1}^D \\frac{1}{\\ell_d^2} (x_d - x'_d)^2 \\Big)$\n",
+    "\n",
+    "  for output variance $\\sigma^2$ and lengthscale $\\ell^2$.\n",
+    "\n",
+    "  The kernel is evaluated over all pairs of rows, `k(X[i, ], X2[j, ])`.\n",
+    "  If `X2` is not specified, then it evaluates over all pairs\n",
+    "  of rows in `X`, `k(X[i, ], X[j, ])`. The output is a matrix\n",
+    "  where each entry (i, j) is the kernel over the ith and jth rows.\n",
+    "\n",
+    "  Args:\n",
+    "    X: tf.Tensor.\n",
+    "      N x D matrix of N data points each with D features.\n",
+    "    X2: tf.Tensor.\n",
+    "      M x D matrix of M data points each with D features; defaults to `X`.\n",
+    "    lengthscale: tf.Tensor.\n",
+    "      Lengthscale parameter, a positive scalar or D-dimensional vector.\n",
+    "    variance: tf.Tensor.\n",
+    "      Output variance parameter, a positive scalar.\n",
+    "\n",
+    "  #### Examples\n",
+    "\n",
+    "  ```python\n",
+    "  X = tf.random_normal([100, 5])\n",
+    "  K = rbf(X)\n",
+    "  assert K.shape == (100, 100)\n",
+    "  ```\n",
+    "  \"\"\"\n",
+    "  lengthscale = tf.convert_to_tensor(lengthscale)\n",
+    "  variance = tf.convert_to_tensor(variance)\n",
+    "  # Gate both parameters on the positivity checks using public TF ops.\n",
+    "  with tf.control_dependencies([tf.assert_positive(lengthscale),\n",
+    "                                tf.assert_positive(variance)]):\n",
+    "    lengthscale = tf.identity(lengthscale)\n",
+    "    variance = tf.identity(variance)\n",
+    "\n",
+    "  X = tf.convert_to_tensor(X)\n",
+    "  X = X / lengthscale\n",
+    "  Xs = tf.reduce_sum(tf.square(X), 1)\n",
+    "  if X2 is None:\n",
+    "    X2 = X\n",
+    "    X2s = Xs\n",
+    "  else:\n",
+    "    X2 = tf.convert_to_tensor(X2)\n",
+    "    X2 = X2 / lengthscale\n",
+    "    X2s = tf.reduce_sum(tf.square(X2), 1)\n",
+    "\n",
+    "  square = tf.reshape(Xs, [-1, 1]) + tf.reshape(X2s, [1, -1]) - \\\n",
+    "      2 * tf.matmul(X, X2, transpose_b=True)\n",
+    "  return variance * tf.exp(-square / 2)"
+   ]
+  },
   {
    "cell_type": "markdown",
    "metadata": {},
diff --git a/tests/util/rbf_test.py b/tests/util/rbf_test.py
deleted file mode 100644
index 0b91a9128..000000000
--- a/tests/util/rbf_test.py
+++ /dev/null
@@ -1,91 +0,0 @@
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import numpy as np
-import tensorflow as tf
-
-from edward.util import rbf
-
-
-class test_rbf_class(tf.test.TestCase):
-
-  def test_x(self):
-    with self.test_session():
-      X = tf.constant([[0.0], [0.0]])
-      X2 = tf.constant([[0.0], [0.0]])
-      self.assertAllClose(rbf(X).eval(),
-                          [[1.0, 1.0], [1.0, 1.0]])
-      self.assertAllClose(rbf(X, X2).eval(),
-                          [[1.0, 1.0], [1.0, 1.0]])
-
-  def test_x2(self):
-    with self.test_session():
-      X = tf.constant([[10.0], [2.0]])
-      X2 = tf.constant([[2.0], [10.0]])
-      self.assertAllClose(rbf(X, X2).eval(),
-                          [[1.266417e-14, 1.0], [1.0, 1.266417e-14]])
-      self.assertAllClose(rbf(X2, X).eval(),
-                          [[1.266417e-14, 1.0], [1.0, 1.266417e-14]])
-
-      X = tf.constant([[2.0, 2.5], [4.1, 5.0]])
-      X2 = tf.constant([[1.5, 2.0], [3.1, 4.2]])
-      self.assertAllClose(rbf(X, X2).eval(),
-                          [[0.778800, 0.128734],
-                           [0.000378, 0.440431]], atol=1e-5, rtol=1e-5)
-
-  def test_lengthscale(self):
-    """checked calculations by hand, e.g.,
-    np.exp(-((2.0 - 1.5)**2 / (2.0**2) + (2.5 - 2.0)**2 / (1.5**2)) / 2)
-    np.exp(-((2.0 - 3.1)**2 / (2.0**2) + (2.5 - 4.2)**2 / (1.5**2)) / 2)
-    np.exp(-((4.1 - 1.5)**2 / (2.0**2) + (5.0 - 2.0)**2 / (1.5**2)) / 2)
-    np.exp(-((4.1 - 3.1)**2 / (2.0**2) + (5.0 - 4.2)**2 / (1.5**2)) / 2)
-    """
-    with self.test_session():
-      X = tf.constant([[2.0, 2.5], [4.1, 5.0]])
-      X2 = tf.constant([[1.5, 2.0], [3.1, 4.2]])
-      lengthscale1 = tf.constant(2.0)
-      lengthscale2 = tf.constant([2.0, 2.0])
-      lengthscale3 = tf.constant([2.0, 1.5])
-      self.assertAllClose(rbf(X, X2, lengthscale1).eval(),
-                          [[0.939413, 0.598996],
-                           [0.139456, 0.814647]], atol=1e-5, rtol=1e-5)
-      self.assertAllClose(rbf(X, X2, lengthscale2).eval(),
-                          [[0.939413, 0.598996],
-                           [0.139456, 0.814647]], atol=1e-5, rtol=1e-5)
-      self.assertAllClose(rbf(X, X2, lengthscale3).eval(),
-                          [[0.916855, 0.452271],
-                           [0.058134, 0.765502]], atol=1e-5, rtol=1e-5)
-
-  def test_variance(self):
-    with self.test_session():
-      X = tf.constant([[2.0, 2.5], [4.1, 5.0]])
-      X2 = tf.constant([[1.5, 2.0], [3.1, 4.2]])
-      variance = tf.constant(1.4)
-      self.assertAllClose(rbf(X, X2, variance=variance).eval(),
-                          [[1.090321, 0.180228],
-                           [0.000529, 0.616604]], atol=1e-5, rtol=1e-5)
-
-  def test_all(self):
-    with self.test_session():
-      X = tf.constant([[2.0, 2.5], [4.1, 5.0]])
-      X2 = tf.constant([[1.5, 2.0], [3.1, 4.2]])
-      lengthscale = tf.constant([2.0, 1.5])
-      variance = tf.constant(1.4)
-      self.assertAllClose(rbf(X, X2, lengthscale, variance).eval(),
-                          [[1.283597, 0.633180],
-                           [0.081387, 1.071704]], atol=1e-5, rtol=1e-5)
-
-  def test_raises(self):
-    with self.test_session():
-      X1 = tf.constant([[0.0]])
-      X2 = tf.constant([[0.0]])
-      lengthscale = tf.constant(-5.0)
-      variance = tf.constant(-1.0)
-      with self.assertRaisesOpError('Condition'):
-        rbf(X1, X2, variance=variance).eval()
-        rbf(X1, X2, lengthscale).eval()
-        rbf(X1, X2, lengthscale, variance).eval()
-
-if __name__ == '__main__':
-  tf.test.main()

From 144386671b9377af69f5197d09f76776328aedb6 Mon Sep 17 00:00:00 2001
From: Dustin Tran <dustinviettran@gmail.com>
Date: Sun, 28 Jan 2018 03:43:19 -0800
Subject: [PATCH 12/27] remove to_simplex

---
 edward/__init__.py            |  3 +--
 edward/util/__init__.py       |  1 -
 edward/util/tensorflow.py     | 49 -----------------------------------
 tests/util/to_simplex_test.py | 42 ------------------------------
 4 files changed, 1 insertion(+), 94 deletions(-)
 delete mode 100644 tests/util/to_simplex_test.py

diff --git a/edward/__init__.py b/edward/__init__.py
index ca729ee5e..0792cfa9b 100644
--- a/edward/__init__.py
+++ b/edward/__init__.py
@@ -31,7 +31,7 @@
     get_ancestors, get_blanket, get_children, get_control_variate_coef,
     get_descendants, get_parents, get_siblings, get_variables,
     is_independent, random_variables,
-    to_simplex, transform)
+    transform)
 from edward.version import __version__, VERSION
 
 from tensorflow.python.util.all_util import remove_undocumented
@@ -76,7 +76,6 @@
     'get_variables',
     'is_independent',
     'random_variables',
-    'to_simplex',
     'transform',
     '__version__',
     'VERSION',
diff --git a/edward/util/__init__.py b/edward/util/__init__.py
index 0cdd9a480..393352746 100644
--- a/edward/util/__init__.py
+++ b/edward/util/__init__.py
@@ -23,7 +23,6 @@
     'get_variables',
     'is_independent',
     'random_variables',
-    'to_simplex',
     'transform',
 ]
 
diff --git a/edward/util/tensorflow.py b/edward/util/tensorflow.py
index 892db5055..c6b0e1387 100644
--- a/edward/util/tensorflow.py
+++ b/edward/util/tensorflow.py
@@ -43,55 +43,6 @@ def dot(x, y):
     return tf.reshape(tf.matmul(mat, tf.expand_dims(vec, 1)), [-1])
 
 
-def to_simplex(x):
-  """Transform real vector of length `(K-1)` to a simplex of dimension `K`
-  using a backward stick breaking construction.
-
-  Args:
-    x: tf.Tensor.
-      A 1-D or 2-D tensor.
-
-  Returns:
-    tf.Tensor.
-    A tensor of same shape as input but with last dimension of
-    size `K`.
-
-  Raises:
-    InvalidArgumentError.
-    If the input has Inf or NaN values.
-
-  #### Notes
-
-  x as a 3-D or higher tensor is not guaranteed to be supported.
-  """
-  x = tf.cast(x, dtype=tf.float32)
-  dependencies = [tf.verify_tensor_all_finite(x, msg='')]
-  x = control_flow_ops.with_dependencies(dependencies, x)
-
-  if isinstance(x, (tf.Tensor, tf.Variable)):
-    shape = x.get_shape().as_list()
-  else:
-    shape = x.shape
-
-  if len(shape) == 1:
-    K_minus_one = shape[0]
-    eq = -tf.log(tf.cast(K_minus_one - tf.range(K_minus_one), dtype=tf.float32))
-    z = tf.sigmoid(eq + x)
-    pil = tf.concat([z, tf.constant([1.0])], 0)
-    piu = tf.concat([tf.constant([1.0]), 1.0 - z], 0)
-    S = tf.cumprod(piu)
-    return S * pil
-  else:
-    n_rows = shape[0]
-    K_minus_one = shape[1]
-    eq = -tf.log(tf.cast(K_minus_one - tf.range(K_minus_one), dtype=tf.float32))
-    z = tf.sigmoid(eq + x)
-    pil = tf.concat([z, tf.ones([n_rows, 1])], 1)
-    piu = tf.concat([tf.ones([n_rows, 1]), 1.0 - z], 1)
-    S = tf.cumprod(piu, axis=1)
-    return S * pil
-
-
 def get_control_variate_coef(f, h):
   """Returns scalar used by control variates method for variance reduction in
   Monte Carlo methods.
diff --git a/tests/util/to_simplex_test.py b/tests/util/to_simplex_test.py
deleted file mode 100644
index fbe83fc15..000000000
--- a/tests/util/to_simplex_test.py
+++ /dev/null
@@ -1,42 +0,0 @@
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import numpy as np
-import tensorflow as tf
-
-from edward.util import to_simplex
-
-
-class test_to_simplex_class(tf.test.TestCase):
-
-  def test_to_simplex_1d(self):
-    with self.test_session():
-      x = tf.constant([0.0])
-      self.assertAllClose(to_simplex(x).eval(),
-                          [0.5, 0.5])
-      x = tf.constant([0.0, 10.0])
-      self.assertAllClose(to_simplex(x).eval(),
-                          [3.333333e-01, 6.666363e-01, 3.027916e-05])
-
-  def test_to_simplex_2d(self):
-    with self.test_session():
-      x = tf.constant([[0.0], [0.0]])
-      self.assertAllClose(to_simplex(x).eval(),
-                          [[0.5, 0.5], [0.5, 0.5]])
-      x = tf.constant([[0.0, 10.0], [0.0, 10.0]])
-      self.assertAllClose(to_simplex(x).eval(),
-                          [[3.333333e-01, 6.666363e-01, 3.027916e-05],
-                           [3.333333e-01, 6.666363e-01, 3.027916e-05]])
-
-  def test_all_finite_raises(self):
-    with self.test_session():
-      x = tf.constant([12.5, np.inf])
-      with self.assertRaisesOpError('Inf'):
-        to_simplex(x).eval()
-      x = tf.constant([12.5, np.nan])
-      with self.assertRaisesOpError('NaN'):
-        to_simplex(x).eval()
-
-if __name__ == '__main__':
-  tf.test.main()

From 924fbf00d97a48df6d047e72925a30725d9b618f Mon Sep 17 00:00:00 2001
From: Dustin Tran <dustinviettran@gmail.com>
Date: Thu, 25 Jan 2018 23:46:42 -0800
Subject: [PATCH 13/27] replace ed.dot with tf.tensordot(..., [[1], [0]])

---
 edward/__init__.py                            |  3 +-
 edward/util/__init__.py                       |  1 -
 edward/util/tensorflow.py                     | 38 -------------------
 examples/bayesian_linear_regression.py        |  4 +-
 ...bayesian_linear_regression_implicitklqp.py |  2 +-
 examples/bayesian_logistic_regression.py      |  5 ++-
 examples/iwvi.py                              |  2 +-
 notebooks/batch_training.ipynb                |  4 +-
 notebooks/supervised_regression.ipynb         |  4 +-
 notebooks/tensorboard.ipynb                   |  2 +-
 tests/util/dot_test.py                        | 34 -----------------
 11 files changed, 13 insertions(+), 86 deletions(-)
 delete mode 100644 tests/util/dot_test.py

diff --git a/edward/__init__.py b/edward/__init__.py
index 0792cfa9b..6fcefdc6f 100644
--- a/edward/__init__.py
+++ b/edward/__init__.py
@@ -27,7 +27,7 @@
     wgan_inference)
 # from edward.inferences import MonteCarlo, HMC, MetropolisHastings, SGLD, SGHMC, Gibbs
 from edward.util import (
-    copy, dot,
+    copy,
     get_ancestors, get_blanket, get_children, get_control_variate_coef,
     get_descendants, get_parents, get_siblings, get_variables,
     is_independent, random_variables,
@@ -65,7 +65,6 @@
     'wgan_inference',
     'Gibbs',
     'copy',
-    'dot',
     'get_ancestors',
     'get_blanket',
     'get_children',
diff --git a/edward/util/__init__.py b/edward/util/__init__.py
index 393352746..c66e7e0ac 100644
--- a/edward/util/__init__.py
+++ b/edward/util/__init__.py
@@ -12,7 +12,6 @@
 
 _allowed_symbols = [
     'copy',
-    'dot',
     'get_ancestors',
     'get_blanket',
     'get_children',
diff --git a/edward/util/tensorflow.py b/edward/util/tensorflow.py
index c6b0e1387..03a8c9d41 100644
--- a/edward/util/tensorflow.py
+++ b/edward/util/tensorflow.py
@@ -5,44 +5,6 @@
 import tensorflow as tf
 
 
-def dot(x, y):
-  """Compute dot product between a 2-D tensor and a 1-D tensor.
-
-  If x is a `[M x N]` matrix, then y is a `M`-vector.
-
-  If x is a `M`-vector, then y is a `[M x N]` matrix.
-
-  Args:
-    x: tf.Tensor.
-      A 1-D or 2-D tensor (see above).
-    y: tf.Tensor.
-      A 1-D or 2-D tensor (see above).
-
-  Returns:
-    tf.Tensor.
-    A 1-D tensor of length `N`.
-
-  Raises:
-    InvalidArgumentError.
-    If the inputs have Inf or NaN values.
-  """
-  x = tf.convert_to_tensor(x)
-  y = tf.convert_to_tensor(y)
-  dependencies = [tf.verify_tensor_all_finite(x, msg=''),
-                  tf.verify_tensor_all_finite(y, msg='')]
-  x = control_flow_ops.with_dependencies(dependencies, x)
-  y = control_flow_ops.with_dependencies(dependencies, y)
-
-  if len(x.shape) == 1:
-    vec = x
-    mat = y
-    return tf.reshape(tf.matmul(tf.expand_dims(vec, 0), mat), [-1])
-  else:
-    mat = x
-    vec = y
-    return tf.reshape(tf.matmul(mat, tf.expand_dims(vec, 1)), [-1])
-
-
 def get_control_variate_coef(f, h):
   """Returns scalar used by control variates method for variance reduction in
   Monte Carlo methods.
diff --git a/examples/bayesian_linear_regression.py b/examples/bayesian_linear_regression.py
index d3ce100b8..c23390fc0 100644
--- a/examples/bayesian_linear_regression.py
+++ b/examples/bayesian_linear_regression.py
@@ -49,7 +49,7 @@ def main(_):
   X = tf.placeholder(tf.float32, [FLAGS.N, FLAGS.D])
   w = Normal(loc=tf.zeros(FLAGS.D), scale=tf.ones(FLAGS.D))
   b = Normal(loc=tf.zeros(1), scale=tf.ones(1))
-  y = Normal(loc=ed.dot(X, w) + b, scale=tf.ones(FLAGS.N))
+  y = Normal(loc=tf.tensordot(X, w, [[1], [0]]) + b, scale=tf.ones(FLAGS.N))
 
   # INFERENCE
   qw = Empirical(params=tf.get_variable("qw/params", [FLAGS.T, FLAGS.D]))
@@ -68,7 +68,7 @@ def main(_):
   # Posterior predictive checks.
   y_post = ed.copy(y, {w: qw, b: qb})
   # This is equivalent to
-  # y_post = Normal(loc=ed.dot(X, qw) + qb, scale=tf.ones(FLAGS.N))
+  # y_post = Normal(loc=tf.tensordot(X, qw, [[1], [0]]) + qb, scale=tf.ones(N))
 
   print("Mean squared error on test data:")
   print(ed.evaluate('mean_squared_error', data={X: X_test, y_post: y_test}))
diff --git a/examples/bayesian_linear_regression_implicitklqp.py b/examples/bayesian_linear_regression_implicitklqp.py
index 41a72a132..145869b32 100644
--- a/examples/bayesian_linear_regression_implicitklqp.py
+++ b/examples/bayesian_linear_regression_implicitklqp.py
@@ -86,7 +86,7 @@ def ratio_estimator(data, local_vars, global_vars):
   X = tf.placeholder(tf.float32, [FLAGS.M, FLAGS.D])
   y_ph = tf.placeholder(tf.float32, [FLAGS.M])
   w = Normal(loc=tf.zeros(FLAGS.D), scale=tf.ones(FLAGS.D))
-  y = Normal(loc=ed.dot(X, w), scale=tf.ones(FLAGS.M))
+  y = Normal(loc=tf.tensordot(X, w, [[1], [0]]), scale=tf.ones(FLAGS.M))
 
   # INFERENCE
   qw = Normal(loc=tf.get_variable("qw/loc", [FLAGS.D]) + 1.0,
diff --git a/examples/bayesian_logistic_regression.py b/examples/bayesian_logistic_regression.py
index bf4305a88..a08c83c3a 100644
--- a/examples/bayesian_logistic_regression.py
+++ b/examples/bayesian_logistic_regression.py
@@ -41,7 +41,7 @@ def main(_):
   X = tf.placeholder(tf.float32, [FLAGS.N, FLAGS.D])
   w = Normal(loc=tf.zeros(FLAGS.D), scale=3.0 * tf.ones(FLAGS.D))
   b = Normal(loc=tf.zeros([]), scale=3.0 * tf.ones([]))
-  y = Bernoulli(logits=ed.dot(X, w) + b)
+  y = Bernoulli(logits=tf.tensordot(X, w, [[1], [0]]) + b)
 
   # INFERENCE
   qw = Empirical(params=tf.get_variable("qw/params", [FLAGS.T, FLAGS.D]))
@@ -73,7 +73,8 @@ def main(_):
   # Build samples from inferred posterior.
   n_samples = 50
   inputs = np.linspace(-5, 3, num=400, dtype=np.float32).reshape((400, 1))
-  probs = tf.stack([tf.sigmoid(ed.dot(inputs, qw.sample()) + qb.sample())
+  probs = tf.stack([tf.sigmoid(tf.tensordot(inputs, qw.sample(), [[1], [0]]) +
+                               qb.sample())
                     for _ in range(n_samples)])
 
   for t in range(inference.n_iter):
diff --git a/examples/iwvi.py b/examples/iwvi.py
index be199aae3..0df1a0447 100644
--- a/examples/iwvi.py
+++ b/examples/iwvi.py
@@ -105,7 +105,7 @@ def main(_):
   # MODEL
   X = tf.placeholder(tf.float32, [N, D])
   w = Normal(loc=tf.zeros(D), scale=tf.ones(D))
-  y = Bernoulli(logits=ed.dot(X, w))
+  y = Bernoulli(logits=tf.tensordot(X, w, [[1], [0]]))
 
   # INFERENCE
   qw = Normal(loc=tf.get_variable("qw/loc", [D]),
diff --git a/notebooks/batch_training.ipynb b/notebooks/batch_training.ipynb
index 3320b24bd..5a25d40a1 100644
--- a/notebooks/batch_training.ipynb
+++ b/notebooks/batch_training.ipynb
@@ -167,7 +167,7 @@
     "\n",
     "w = Normal(loc=tf.zeros(D), scale=tf.ones(D))\n",
     "b = Normal(loc=tf.zeros(1), scale=tf.ones(1))\n",
-    "y = Normal(loc=ed.dot(X, w) + b, scale=1.0)"
+    "y = Normal(loc=tf.tensordot(X, w, [[1], [0]]) + b, scale=1.0)"
    ]
   },
   {
@@ -334,7 +334,7 @@
    "source": [
     "y_post = ed.copy(y, {w: qw, b: qb})\n",
     "# This is equivalent to\n",
-    "# y_post = Normal(loc=ed.dot(X, qw) + qb, scale=tf.ones(N))"
+    "# y_post = Normal(loc=tf.tensordot(X, qw, [[1], [0]]) + qb, scale=tf.ones(N))"
    ]
   },
   {
diff --git a/notebooks/supervised_regression.ipynb b/notebooks/supervised_regression.ipynb
index 0ad68bee2..bf4bf8f31 100644
--- a/notebooks/supervised_regression.ipynb
+++ b/notebooks/supervised_regression.ipynb
@@ -124,7 +124,7 @@
     "X = tf.placeholder(tf.float32, [N, D])\n",
     "w = Normal(loc=tf.zeros(D), scale=tf.ones(D))\n",
     "b = Normal(loc=tf.zeros(1), scale=tf.ones(1))\n",
-    "y = Normal(loc=ed.dot(X, w) + b, scale=tf.ones(N))"
+    "y = Normal(loc=tf.tensordot(X, w, [[1], [0]]) + b, scale=tf.ones(N))"
    ]
   },
   {
@@ -217,7 +217,7 @@
    "source": [
     "y_post = ed.copy(y, {w: qw, b: qb})\n",
     "# This is equivalent to\n",
-    "# y_post = Normal(loc=ed.dot(X, qw) + qb, scale=tf.ones(N))"
+    "# y_post = Normal(loc=tf.tensordot(X, qw, [[1], [0]]) + qb, scale=tf.ones(N))"
    ]
   },
   {
diff --git a/notebooks/tensorboard.ipynb b/notebooks/tensorboard.ipynb
index c60aa3207..e32633ced 100644
--- a/notebooks/tensorboard.ipynb
+++ b/notebooks/tensorboard.ipynb
@@ -157,7 +157,7 @@
     "  b = Normal(loc=tf.zeros(1, name=\"bias/loc\"),\n",
     "             scale=tf.ones(1, name=\"bias/scale\"),\n",
     "             name=\"bias\")\n",
-    "  y = Normal(loc=ed.dot(X, w) + b,\n",
+    "  y = Normal(loc=tf.tensordot(X, w, [[1], [0]]) + b,\n",
     "             scale=tf.ones(N, name=\"y/scale\"),\n",
     "             name=\"y\")"
    ]
diff --git a/tests/util/dot_test.py b/tests/util/dot_test.py
deleted file mode 100644
index 99f56d238..000000000
--- a/tests/util/dot_test.py
+++ /dev/null
@@ -1,34 +0,0 @@
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import numpy as np
-import tensorflow as tf
-
-from edward.util import dot
-
-
-class test_dot_class(tf.test.TestCase):
-
-  def test_dot(self):
-    with self.test_session():
-      a = tf.constant(np.arange(5, dtype=np.float32))
-      b = tf.diag(tf.ones([5]))
-      self.assertAllEqual(dot(a, b).eval(),
-                          np.dot(a.eval(), b.eval()))
-      self.assertAllEqual(dot(b, a).eval(),
-                          np.dot(b.eval(), a.eval()))
-
-  def test_all_finite_raises(self):
-    with self.test_session():
-      a = np.inf * tf.ones([5])
-      b = tf.diag(tf.ones([5]))
-      with self.assertRaisesOpError('Inf'):
-        dot(a, b).eval()
-      a = tf.ones([5]) * np.arange(5)
-      b = np.inf * tf.diag(tf.ones([5]))
-      with self.assertRaisesOpError('Inf'):
-        dot(a, b).eval()
-
-if __name__ == '__main__':
-  tf.test.main()

From 275ba715b9a6950245b6251d9fd379b21a999771 Mon Sep 17 00:00:00 2001
From: Dustin Tran <dustinviettran@gmail.com>
Date: Sun, 7 Jan 2018 13:32:44 -0800
Subject: [PATCH 14/27] add core with trace v1

---
 edward/__init__.py                            |   2 +
 edward/inferences/conjugacy/conjugacy.py      |   2 +-
 .../conjugacy/conjugate_log_probs.py          |   2 +-
 edward/models/__init__.py                     |   7 +-
 edward/models/core.py                         | 187 ++++++++++++++++++
 edward/models/random_variable.py              |  47 +++--
 edward/models/random_variables.py             |  47 -----
 edward/util/random_variables.py               |   2 +-
 tests/models/random_variable_value_test.py    |  14 --
 tests/models/trace_test.py                    |  35 ++++
 10 files changed, 257 insertions(+), 88 deletions(-)
 create mode 100644 edward/models/core.py
 delete mode 100644 edward/models/random_variables.py
 create mode 100644 tests/models/trace_test.py

diff --git a/edward/__init__.py b/edward/__init__.py
index 6fcefdc6f..13b98df67 100644
--- a/edward/__init__.py
+++ b/edward/__init__.py
@@ -26,6 +26,7 @@
     wake_sleep,
     wgan_inference)
 # from edward.inferences import MonteCarlo, HMC, MetropolisHastings, SGLD, SGHMC, Gibbs
+from edward.models import Trace
 from edward.util import (
     copy,
     get_ancestors, get_blanket, get_children, get_control_variate_coef,
@@ -64,6 +65,7 @@
     'wake_sleep',
     'wgan_inference',
     'Gibbs',
+    'Trace',
     'copy',
     'get_ancestors',
     'get_blanket',
diff --git a/edward/inferences/conjugacy/conjugacy.py b/edward/inferences/conjugacy/conjugacy.py
index 235e39146..dea978627 100644
--- a/edward/inferences/conjugacy/conjugacy.py
+++ b/edward/inferences/conjugacy/conjugacy.py
@@ -11,7 +11,7 @@
 from collections import defaultdict
 from edward.inferences.conjugacy.simplify \
     import symbolic_suff_stat, full_simplify, expr_contains, reconstruct_expr
-from edward.models.random_variables import *
+from edward.models.core import *
 from edward.util import copy, get_blanket
 
 
diff --git a/edward/inferences/conjugacy/conjugate_log_probs.py b/edward/inferences/conjugacy/conjugate_log_probs.py
index a2e25f0ee..a8667a7e5 100644
--- a/edward/inferences/conjugacy/conjugate_log_probs.py
+++ b/edward/inferences/conjugacy/conjugate_log_probs.py
@@ -5,7 +5,7 @@
 import numpy as np
 import tensorflow as tf
 
-from edward.models.random_variables import *
+from edward.models.core import *
 
 
 def _val_wrapper(f):
diff --git a/edward/models/__init__.py b/edward/models/__init__.py
index 3cba416a6..73e0bb608 100644
--- a/edward/models/__init__.py
+++ b/edward/models/__init__.py
@@ -4,14 +4,15 @@
 from __future__ import division
 from __future__ import print_function
 
-from edward.models.random_variable import RandomVariable
-from edward.models.random_variables import *
+from edward.models.core import *
+from edward.models.random_variable import *
 
 from tensorflow.python.util.all_util import remove_undocumented
-from edward.models import random_variables as _module
+from edward.models import core as _module
 
 _allowed_symbols = [
     'RandomVariable',
+    'Trace',
 ]
 for name in dir(_module):
   obj = getattr(_module, name)
diff --git a/edward/models/core.py b/edward/models/core.py
new file mode 100644
index 000000000..e3c7e5886
--- /dev/null
+++ b/edward/models/core.py
@@ -0,0 +1,187 @@
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import collections as _collections
+import inspect as _inspect
+
+from edward.models.random_variable import RandomVariable as _RandomVariable
+from tensorflow.contrib import distributions as _distributions
+
+
+class Trace(object):
+  """Context manager with two objects:
+
+  + The trace stack stores executions from each primitive fn.
+  + (Optional) The intercept callable intercepts the continuation of a function.
+
+  Optionally, the trace stack stores the function call, its inputs, and
+  its parent primitives. This lets us trace the continuation
+  structure. Storing inputs can be memory-intensive as it prevents
+  garbage collection; hence it's optional.
+  """
+  def __init__(self, intercept=None, trace_continuation=False):
+    self._intercept = intercept
+    self._trace_continuation = trace_continuation
+    # We use OrderedDict. It is essentially a stack where each element is a node
+    # (value) and its name (key); the name is a pointer to the node.
+    self._trace_stack = _collections.OrderedDict({})
+
+  def __enter__(self):
+    # Remember what an enclosing Trace (if any) installed so __exit__ can
+    # restore it. While nested, the innermost context's variables win; an
+    # inner Trace without its own intercept keeps using the outer one.
+    names = ('_INTERCEPT', '_TRACE_CONTINUATION', '_TRACE_STACK')
+    self._outer = {k: globals()[k] for k in names if k in globals()}
+    if self._intercept is not None:
+      globals()['_INTERCEPT'] = self._intercept
+    globals()['_TRACE_CONTINUATION'] = self._trace_continuation
+    globals()['_TRACE_STACK'] = self._trace_stack
+    return self
+
+  def __exit__(self, t, v, tb):
+    # Remove this context's globals (primitives test for their presence),
+    # then reinstate the enclosing Trace's, so leaving a nested Trace neither
+    # disables the outer one nor makes the outer __exit__ raise NameError.
+    for name in ('_INTERCEPT', '_TRACE_CONTINUATION', '_TRACE_STACK'):
+      globals().pop(name, None)
+    globals().update(self._outer)
+
+  # operator-overloading for convenience
+  def __repr__(self):
+    return self._trace_stack.__repr__()
+
+  def __str__(self):
+    return self._trace_stack.__str__()
+
+  def __delitem__(self, key):
+    del self._trace_stack[key]
+
+  def __getitem__(self, key):
+    return self._trace_stack[key]
+
+  def __setitem__(self, key, value):
+    self._trace_stack[key] = value
+
+  def get(self, key, value=None):
+    return self._trace_stack.get(key, value)
+
+  def iteritems(self):
+    return self._trace_stack.items()
+
+  def iterkeys(self):
+    return self._trace_stack.keys()
+
+  def itervalues(self):
+    return self._trace_stack.values()
+
+  def items(self):
+    return self._trace_stack.items()
+
+  def keys(self):
+    return self._trace_stack.keys()
+
+  def values(self):
+    return self._trace_stack.values()
+
+
+class Node(object):
+  """Trace stack entry. Together, nodes form a directed acyclic graph."""
+  __slots__ = ['value', 'f', 'args', 'kwargs', 'parents']
+
+  def __init__(self, value, f=None, args=None, kwargs=None, parents=None):
+    # Besides `value`, every field is optional and defaults to None.
+    self.parents = parents
+    self.args, self.kwargs = args, kwargs
+    self.f = f
+    self.value = value
+
+
+def primitive(fn):
+  """Wraps `fn` so an active `Trace` can intercept the call, as
+  `intercept(fn, *args, **kwargs)`, and record the output on its stack.
+
+  Apply this to decorate primitive functions.
+  """
+  def wrapped_fn(*args, **kwargs):
+    if '_INTERCEPT' in globals():
+      out = _INTERCEPT(fn, *args, **kwargs)
+    else:
+      out = fn(*args, **kwargs)
+    if '_TRACE_CONTINUATION' in globals() and '_TRACE_STACK' in globals():
+      if _TRACE_CONTINUATION:
+        # list(): on Python 3, `list + dict view` raises TypeError.
+        parents = [v for v in list(args) + list(kwargs.values())
+                   if hasattr(v, "name") and v.name in _TRACE_STACK]
+        _TRACE_STACK[out.name] = Node(out, fn, args, kwargs, parents)
+      else:
+        _TRACE_STACK[out.name] = Node(out)
+    return out
+  return wrapped_fn
+
+
+# TODO(trandustin): wrapping via init, not primitive() so wrapped
+# class still belongs in RandomVariable. Is this distinction
+# necessary?
+def _primitive_cls(__init__):
+  """Wraps class' __init__ so an active `Trace` can intercept the call, as
+  `intercept(__init__, self, *args, **kwargs)`, and record `self` on its stack.
+
+  Apply this to decorate primitive classes.
+  """
+  def wrapped_fn(self, *args, **kwargs):
+    if '_INTERCEPT' in globals():
+      _INTERCEPT(__init__, self, *args, **kwargs)
+    else:
+      __init__(self, *args, **kwargs)
+    if '_TRACE_CONTINUATION' in globals() and '_TRACE_STACK' in globals():
+      if _TRACE_CONTINUATION:
+        # list(): on Python 3, `list + dict view` raises TypeError.
+        parents = [v for v in list(args) + list(kwargs.values())
+                   if hasattr(v, "name") and v.name in _TRACE_STACK]
+        _TRACE_STACK[self.name] = Node(self, __init__, args, kwargs, parents)
+      else:
+        _TRACE_STACK[self.name] = Node(self)
+  return wrapped_fn
+
+
+# Automatically generate a random variable class for every Distribution
+# subclass found in tf.contrib.distributions.
+_globals = globals()
+for _name in sorted(dir(_distributions)):
+  _candidate = getattr(_distributions, _name)
+  if (_inspect.isclass(_candidate) and
+          _candidate != _distributions.Distribution and
+          issubclass(_candidate, _distributions.Distribution)):
+
+    # Give each class its own __init__: it is decorated as a primitive and
+    # carries the docstring of _candidate's __init__.
+    @_primitive_cls
+    def __init__(self, *args, **kwargs):
+      _RandomVariable.__init__(self, *args, **kwargs)
+    __init__.__doc__ = _candidate.__init__.__doc__
+    _params = {'__doc__': _candidate.__doc__,
+               '__init__': __init__}
+    _globals[_name] = type(_name, (_RandomVariable, _candidate), _params)
+
+    del _candidate
+
+# Tag selected classes with a `support` string; used, e.g., in conjugacy.
+Bernoulli.support = 'binary'
+Beta.support = '01'
+Binomial.support = 'onehot'
+Categorical.support = 'categorical'
+Chi2.support = 'nonnegative'
+Dirichlet.support = 'simplex'
+Exponential.support = 'nonnegative'
+Gamma.support = 'nonnegative'
+InverseGamma.support = 'nonnegative'
+Laplace.support = 'real'
+Multinomial.support = 'onehot'
+MultivariateNormalDiag.support = 'multivariate_real'
+Normal.support = 'real'
+Poisson.support = 'countable'
+
+del absolute_import
+del division
+del print_function
diff --git a/edward/models/random_variable.py b/edward/models/random_variable.py
index 3bd9c0e02..7a4142ea5 100644
--- a/edward/models/random_variable.py
+++ b/edward/models/random_variable.py
@@ -86,29 +86,11 @@ def __init__(self, *args, **kwargs):
         Optional list of graph collections (lists). The random variable is
         added to these collections. Defaults to `[ed.random_variables()]`.
     """
-    # Force the Distribution class to always use the same name scope
-    # when scoping its parameter names and also when calling any
-    # methods such as sample.
-    name = kwargs.get('name', type(self).__name__)
-    with tf.name_scope(name) as ns:
-      kwargs['name'] = ns
-
-    # pop and store RandomVariable-specific parameters in _kwargs
+    # pop and store RandomVariable-specific parameters
     sample_shape = kwargs.pop('sample_shape', ())
     value = kwargs.pop('value', None)
     collections = kwargs.pop('collections', ["random_variables"])
 
-    # store args, kwargs for easy graph copying
-    self._args = args
-    self._kwargs = kwargs.copy()
-
-    if sample_shape != ():
-      self._kwargs['sample_shape'] = sample_shape
-    if value is not None:
-      self._kwargs['value'] = value
-    if collections != ["random_variables"]:
-      self._kwargs['collections'] = collections
-
     super(RandomVariable, self).__init__(*args, **kwargs)
 
     self._sample_shape = tf.TensorShape(sample_shape)
@@ -153,16 +135,24 @@ def value(self):
     return self._value
 
   def __str__(self):
+    if not hasattr(self.value, "numpy"):
+      name = self.name
+    else:
+      name = numpy_text(self.value)
     return "RandomVariable(\"%s\"%s%s%s)" % (
-        self.name,
+        name,
         (", shape=%s" % self.shape)
         if self.shape.ndims is not None else "",
         (", dtype=%s" % self.dtype.name) if self.dtype else "",
         (", device=%s" % self.value.device) if self.value.device else "")
 
   def __repr__(self):
-    return "<ed.RandomVariable '%s' shape=%s dtype=%s>" % (
+    string = "<ed.RandomVariable '%s' shape=%s dtype=%s>" % (
         self.name, self.shape, self.dtype.name)
+    if hasattr(self.value, "numpy"):
+      string = string[:-1] + " numpy=%s>" % (
+          numpy_text(self.value, is_repr=True))
+    return string
 
   def __hash__(self):
     return id(self)
@@ -220,6 +210,10 @@ def eval(self, session=None, feed_dict=None):
     """
     return self.value.eval(session=session, feed_dict=feed_dict)
 
+  def numpy(self):
+    """Return `self.value.numpy()`, a NumPy array; TF Eager only."""
+    return self.value.numpy()
+
   def get_ancestors(self, collection=None):
     """Get ancestor random variables."""
     from edward.util.random_variables import get_ancestors
@@ -312,6 +306,17 @@ def _tensor_conversion_function(v, dtype=None, name=None, as_ref=False):
     return v.value
 
 
+def numpy_text(tensor, is_repr=False):  # utility fn from TF Eager codebase
+  """Human-readable text for a tensor's numpy value (`repr` if `is_repr`)."""
+  if not tensor.dtype.is_numpy_compatible:
+    rendered = "<unprintable>"
+  else:
+    to_text = repr if is_repr else str
+    rendered = to_text(tensor.numpy())
+  # Multi-line renderings start on a fresh line.
+  return "\n" + rendered if "\n" in rendered else rendered
+
+
 RandomVariable._overload_all_operators()
 
 register_session_run_conversion_functions(
diff --git a/edward/models/random_variables.py b/edward/models/random_variables.py
deleted file mode 100644
index be5cf8058..000000000
--- a/edward/models/random_variables.py
+++ /dev/null
@@ -1,47 +0,0 @@
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import inspect as _inspect
-
-from edward.models.random_variable import RandomVariable as _RandomVariable
-from tensorflow.contrib import distributions as _distributions
-
-# Automatically generate random variable classes from classes in
-# tf.contrib.distributions.
-_globals = globals()
-for _name in sorted(dir(_distributions)):
-  _candidate = getattr(_distributions, _name)
-  if (_inspect.isclass(_candidate) and
-          _candidate != _distributions.Distribution and
-          issubclass(_candidate, _distributions.Distribution)):
-
-    # to use _candidate's docstring, must write a new __init__ method
-    def __init__(self, *args, **kwargs):
-      _RandomVariable.__init__(self, *args, **kwargs)
-    __init__.__doc__ = _candidate.__init__.__doc__
-    _params = {'__doc__': _candidate.__doc__,
-               '__init__': __init__}
-    _globals[_name] = type(_name, (_RandomVariable, _candidate), _params)
-
-    del _candidate
-
-# Add supports; these are used, e.g., in conjugacy.
-Bernoulli.support = 'binary'
-Beta.support = '01'
-Binomial.support = 'onehot'
-Categorical.support = 'categorical'
-Chi2.support = 'nonnegative'
-Dirichlet.support = 'simplex'
-Exponential.support = 'nonnegative'
-Gamma.support = 'nonnegative'
-InverseGamma.support = 'nonnegative'
-Laplace.support = 'real'
-Multinomial.support = 'onehot'
-MultivariateNormalDiag.support = 'multivariate_real'
-Normal.support = 'real'
-Poisson.support = 'countable'
-
-del absolute_import
-del division
-del print_function
diff --git a/edward/util/random_variables.py b/edward/util/random_variables.py
index b1e62073f..194ed7fb7 100644
--- a/edward/util/random_variables.py
+++ b/edward/util/random_variables.py
@@ -6,8 +6,8 @@
 import tensorflow as tf
 
 from copy import deepcopy
+from edward.models.core import TransformedDistribution
 from edward.models.random_variable import RandomVariable
-from edward.models.random_variables import TransformedDistribution
 from edward.util.graphs import random_variables
 from tensorflow.core.framework import attr_value_pb2
 from tensorflow.python.framework.ops import set_shapes_for_outputs
diff --git a/tests/models/random_variable_value_test.py b/tests/models/random_variable_value_test.py
index 988e86d7b..1be69ab1a 100644
--- a/tests/models/random_variable_value_test.py
+++ b/tests/models/random_variable_value_test.py
@@ -6,7 +6,6 @@
 import tensorflow as tf
 
 from edward.models import Bernoulli, Normal, Poisson, RandomVariable
-from edward.util import copy
 
 
 class test_random_variable_value_class(tf.test.TestCase):
@@ -19,13 +18,6 @@ def _test_sample(self, RV, value, *args, **kwargs):
     self.assertEqual(value_shape, expected_shape)
     self.assertEqual(rv.dtype, rv.value.dtype)
 
-  def _test_copy(self, RV, value, *args, **kwargs):
-    rv1 = RV(*args, value=value, **kwargs)
-    rv2 = copy(rv1)
-    value_shape1 = rv1.value.shape
-    value_shape2 = rv2.value.shape
-    self.assertEqual(value_shape1, value_shape2)
-
   def test_shape_and_dtype(self):
     with self.test_session():
       self._test_sample(Normal, 2, loc=0.5, scale=1.0)
@@ -45,11 +37,5 @@ def test_mismatch_raises(self):
       self.assertRaises(ValueError, self._test_sample, Normal,
                         np.zeros([10, 3]), loc=[0.5, 0.5], scale=[1.0, 1.0])
 
-  def test_copy(self):
-    with self.test_session():
-      self._test_copy(Normal, 2, loc=0.5, scale=1.0)
-      self._test_copy(Normal, [2], loc=[0.5], scale=[1.0])
-      self._test_copy(Poisson, 2, rate=0.5)
-
 if __name__ == '__main__':
   tf.test.main()
diff --git a/tests/models/trace_test.py b/tests/models/trace_test.py
new file mode 100644
index 000000000..13abd3826
--- /dev/null
+++ b/tests/models/trace_test.py
@@ -0,0 +1,35 @@
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import edward as ed
+import tensorflow as tf
+
+from edward.models import Normal, Poisson
+
+
+class test_trace_class(tf.test.TestCase):
+
+  def _test_intercept_value(self, RV, value, *args, **kwargs):
+    # Build "rv1" with a fixed value under a plain trace, then build "rv2"
+    # under an intercept that copies rv1's value out of the first trace.
+    def _intercept(f, *f_args, **f_kwargs):
+      if f_kwargs.get('name') == "rv2":
+        f_kwargs['value'] = rv1_trace["rv1"].value.value
+      return f(*f_args, **f_kwargs)
+    with ed.Trace() as rv1_trace:
+      rv1 = RV(*args, value=value, name="rv1", **kwargs)
+    with ed.Trace(intercept=_intercept):
+      rv2 = RV(*args, name="rv2", **kwargs)
+    self.assertEqual(rv1.value.shape, rv2.value.shape)
+
+  def test_intercept_value(self):
+    cases = [(Normal, 2, dict(loc=0.5, scale=1.0)),
+             (Normal, [2], dict(loc=[0.5], scale=[1.0])),
+             (Poisson, 2, dict(rate=0.5))]
+    with self.test_session():
+      for RV, value, params in cases:
+        self._test_intercept_value(RV, value, **params)
+
+if __name__ == '__main__':
+  tf.test.main()

From 34be6fefda55b8151963ad0993fd9c73aef12986 Mon Sep 17 00:00:00 2001
From: Dustin Tran <dustinviettran@gmail.com>
Date: Sun, 28 Jan 2018 04:19:55 -0800
Subject: [PATCH 15/27] move ed.random_variables to model/

---
 edward/__init__.py               |  6 ++++--
 edward/models/__init__.py        |  1 +
 edward/models/random_variable.py | 15 +++++++++++++++
 edward/util/__init__.py          |  2 --
 edward/util/graphs.py            | 22 ----------------------
 5 files changed, 20 insertions(+), 26 deletions(-)
 delete mode 100644 edward/util/graphs.py

diff --git a/edward/__init__.py b/edward/__init__.py
index 13b98df67..3bde7d7a6 100644
--- a/edward/__init__.py
+++ b/edward/__init__.py
@@ -26,12 +26,14 @@
     wake_sleep,
     wgan_inference)
 # from edward.inferences import MonteCarlo, HMC, MetropolisHastings, SGLD, SGHMC, Gibbs
-from edward.models import Trace
+from edward.models import (
+    Trace,
+    random_variables)
 from edward.util import (
     copy,
     get_ancestors, get_blanket, get_children, get_control_variate_coef,
     get_descendants, get_parents, get_siblings, get_variables,
-    is_independent, random_variables,
+    is_independent,
     transform)
 from edward.version import __version__, VERSION
 
diff --git a/edward/models/__init__.py b/edward/models/__init__.py
index 73e0bb608..f68b6ee0e 100644
--- a/edward/models/__init__.py
+++ b/edward/models/__init__.py
@@ -13,6 +13,7 @@
 _allowed_symbols = [
     'RandomVariable',
     'Trace',
+    'random_variables',
 ]
 for name in dir(_module):
   obj = getattr(_module, name)
diff --git a/edward/models/random_variable.py b/edward/models/random_variable.py
index 7a4142ea5..4a4bfb145 100644
--- a/edward/models/random_variable.py
+++ b/edward/models/random_variable.py
@@ -317,6 +317,21 @@ def numpy_text(tensor, is_repr=False):  # utility fn from TF Eager codebase
   return text
 
 
+def random_variables(graph=None):
+  """Return all random variables in the given TensorFlow graph.
+
+  Args:
+    graph: TensorFlow graph; if None, `tf.get_default_graph()` is used.
+
+  Returns:
+    list of RandomVariable.
+  """
+  # Fall back to the default graph when the caller does not name one.
+  target = tf.get_default_graph() if graph is None else graph
+
+  return _RANDOM_VARIABLE_COLLECTION[target]
+
+
 RandomVariable._overload_all_operators()
 
 register_session_run_conversion_functions(
diff --git a/edward/util/__init__.py b/edward/util/__init__.py
index c66e7e0ac..ec1d99ab3 100644
--- a/edward/util/__init__.py
+++ b/edward/util/__init__.py
@@ -4,7 +4,6 @@
 from __future__ import division
 from __future__ import print_function
 
-from edward.util.graphs import *
 from edward.util.random_variables import *
 from edward.util.tensorflow import *
 
@@ -21,7 +20,6 @@
     'get_siblings',
     'get_variables',
     'is_independent',
-    'random_variables',
     'transform',
 ]
 
diff --git a/edward/util/graphs.py b/edward/util/graphs.py
deleted file mode 100644
index 0ebbef80d..000000000
--- a/edward/util/graphs.py
+++ /dev/null
@@ -1,22 +0,0 @@
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import tensorflow as tf
-
-from edward.models.random_variable import _RANDOM_VARIABLE_COLLECTION
-
-
-def random_variables(graph=None):
-  """Return all random variables in the TensorFlow graph.
-
-  Args:
-    graph: TensorFlow graph.
-
-  Returns:
-    list of RandomVariable.
-  """
-  if graph is None:
-    graph = tf.get_default_graph()
-
-  return _RANDOM_VARIABLE_COLLECTION[graph]

From 9691a5bd018cad17208aa3c1e79c7fa256001f23 Mon Sep 17 00:00:00 2001
From: Dustin Tran <dustinviettran@gmail.com>
Date: Sun, 28 Jan 2018 04:31:57 -0800
Subject: [PATCH 16/27] move rv queries to models/

---
 edward/__init__.py                            |  12 +-
 edward/inferences/conjugacy/conjugacy.py      |   3 +-
 edward/inferences/laplace.py                  |   2 +-
 edward/models/__init__.py                     |   9 +
 edward/models/queries.py                      | 421 ++++++++++++++++++
 edward/models/random_variable.py              |  14 +-
 edward/util/__init__.py                       |   8 -
 edward/util/random_variables.py               | 411 -----------------
 tests/{util => models}/get_ancestors_test.py  |   3 +-
 tests/{util => models}/get_blanket_test.py    |   3 +-
 tests/{util => models}/get_children_test.py   |   3 +-
 .../{util => models}/get_descendants_test.py  |   3 +-
 tests/{util => models}/get_parents_test.py    |   3 +-
 tests/{util => models}/get_siblings_test.py   |   3 +-
 tests/{util => models}/get_variables_test.py  |   3 +-
 tests/{util => models}/is_independent_test.py |   3 +-
 16 files changed, 457 insertions(+), 447 deletions(-)
 create mode 100644 edward/models/queries.py
 rename tests/{util => models}/get_ancestors_test.py (97%)
 rename tests/{util => models}/get_blanket_test.py (91%)
 rename tests/{util => models}/get_children_test.py (97%)
 rename tests/{util => models}/get_descendants_test.py (97%)
 rename tests/{util => models}/get_parents_test.py (97%)
 rename tests/{util => models}/get_siblings_test.py (97%)
 rename tests/{util => models}/get_variables_test.py (97%)
 rename tests/{util => models}/is_independent_test.py (96%)

diff --git a/edward/__init__.py b/edward/__init__.py
index 3bde7d7a6..8c44017f8 100644
--- a/edward/__init__.py
+++ b/edward/__init__.py
@@ -28,12 +28,18 @@
 # from edward.inferences import MonteCarlo, HMC, MetropolisHastings, SGLD, SGHMC, Gibbs
 from edward.models import (
     Trace,
+    get_ancestors,
+    get_blanket,
+    get_children,
+    get_descendants,
+    get_parents,
+    get_siblings,
+    get_variables,
+    is_independent,
     random_variables)
 from edward.util import (
     copy,
-    get_ancestors, get_blanket, get_children, get_control_variate_coef,
-    get_descendants, get_parents, get_siblings, get_variables,
-    is_independent,
+    get_control_variate_coef,
     transform)
 from edward.version import __version__, VERSION
 
diff --git a/edward/inferences/conjugacy/conjugacy.py b/edward/inferences/conjugacy/conjugacy.py
index dea978627..118c430d2 100644
--- a/edward/inferences/conjugacy/conjugacy.py
+++ b/edward/inferences/conjugacy/conjugacy.py
@@ -11,8 +11,9 @@
 from collections import defaultdict
 from edward.inferences.conjugacy.simplify \
     import symbolic_suff_stat, full_simplify, expr_contains, reconstruct_expr
+from edward.models import get_blanket
 from edward.models.core import *
-from edward.util import copy, get_blanket
+from edward.util import copy
 
 
 def mvn_diag_from_natural_params(p1, p2):
diff --git a/edward/inferences/laplace.py b/edward/inferences/laplace.py
index 13de2659b..4970c8f5c 100644
--- a/edward/inferences/laplace.py
+++ b/edward/inferences/laplace.py
@@ -7,7 +7,7 @@
 
 from edward.inferences.map import map
 from edward.models import RandomVariable
-from edward.util import get_variables
+from edward.models.queries import get_variables
 from edward.util import copy, transform
 
 try:
diff --git a/edward/models/__init__.py b/edward/models/__init__.py
index f68b6ee0e..0060281c9 100644
--- a/edward/models/__init__.py
+++ b/edward/models/__init__.py
@@ -6,6 +6,7 @@
 
 from edward.models.core import *
 from edward.models.random_variable import *
+from edward.models.queries import *
 
 from tensorflow.python.util.all_util import remove_undocumented
 from edward.models import core as _module
@@ -13,6 +14,14 @@
 _allowed_symbols = [
     'RandomVariable',
     'Trace',
+    'get_ancestors',
+    'get_blanket',
+    'get_children',
+    'get_descendants',
+    'get_parents',
+    'get_siblings',
+    'get_variables',
+    'is_independent',
     'random_variables',
 ]
 for name in dir(_module):
diff --git a/edward/models/queries.py b/edward/models/queries.py
new file mode 100644
index 000000000..663532462
--- /dev/null
+++ b/edward/models/queries.py
@@ -0,0 +1,421 @@
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import six
+import tensorflow as tf
+
+from edward.models.random_variable import RandomVariable, random_variables
+
+
+def get_ancestors(x, collection=None):
+  """Get ancestor random variables of input.
+
+  Args:
+    x: RandomVariable or tf.Tensor.
+      Query node to find ancestors of.
+    collection: list of RandomVariable.
+      The collection of random variables to check with respect to;
+      defaults to all random variables in the graph.
+
+  Returns:
+    list of RandomVariable.
+    Ancestor random variables of x.
+
+  #### Examples
+  ```python
+  a = Normal(0.0, 1.0)
+  b = Normal(a, 1.0)
+  c = Normal(0.0, 1.0)
+  d = Normal(b * c, 1.0)
+  assert set(ed.get_ancestors(d)) == set([a, b, c])
+  ```
+  """
+  if collection is None:
+    collection = random_variables()
+
+  node_dict = {node.value: node for node in collection}
+
+  # Traverse the graph. Add each node to the set if it's in the collection.
+  output = set()
+  visited = set()
+  nodes = {x}
+  while nodes:
+    node = nodes.pop()
+
+    if node in visited:
+      continue
+    visited.add(node)
+
+    if isinstance(node, RandomVariable):
+      node = node.value
+
+    candidate_node = node_dict.get(node, None)
+    if candidate_node is not None and candidate_node != x:
+      output.add(candidate_node)
+
+    nodes.update(node.op.inputs)
+
+  return list(output)
+
+
+def get_blanket(x, collection=None):
+  """Get Markov blanket of input, which consists of its parents, its
+  children, and the other parents of its children.
+
+  Args:
+    x: RandomVariable or tf.Tensor.
+      Query node to find Markov blanket of.
+    collection: list of RandomVariable.
+      The collection of random variables to check with respect to;
+      defaults to all random variables in the graph.
+
+  Returns:
+    list of RandomVariable.
+    Markov blanket of x.
+
+  #### Examples
+
+  ```python
+  a = Normal(0.0, 1.0)
+  b = Normal(0.0, 1.0)
+  c = Normal(a * b, 1.0)
+  d = Normal(0.0, 1.0)
+  e = Normal(c * d, 1.0)
+  assert set(ed.get_blanket(c)) == set([a, b, d, e])
+  ```
+  """
+  output = set()
+  output.update(get_parents(x, collection))
+  children = get_children(x, collection)
+  output.update(children)
+  for child in children:
+    output.update(get_parents(child, collection))
+
+  output.discard(x)
+  return list(output)
+
+
+def get_children(x, collection=None):
+  """Get child random variables of input.
+
+  Args:
+    x: RandomVariable or tf.Tensor.
+      Query node to find children of.
+    collection: list of RandomVariable.
+      The collection of random variables to check with respect to;
+      defaults to all random variables in the graph.
+
+  Returns:
+    list of RandomVariable.
+    Child random variables of x.
+
+  #### Examples
+
+  ```python
+  a = Normal(0.0, 1.0)
+  b = Normal(a, 1.0)
+  c = Normal(a, 1.0)
+  d = Normal(c, 1.0)
+  assert set(ed.get_children(a)) == set([b, c])
+  ```
+  """
+  if collection is None:
+    collection = random_variables()
+
+  node_dict = {node.value: node for node in collection}
+
+  # Traverse the graph. Add each node to the set if it's in the collection.
+  output = set()
+  visited = set()
+  nodes = {x}
+  while nodes:
+    node = nodes.pop()
+
+    if node in visited:
+      continue
+    visited.add(node)
+
+    if isinstance(node, RandomVariable):
+      node = node.value
+
+    candidate_node = node_dict.get(node, None)
+    if candidate_node is not None and candidate_node != x:
+      output.add(candidate_node)
+    else:
+      for op in node.consumers():
+        nodes.update(op.outputs)
+
+  return list(output)
+
+
+def get_descendants(x, collection=None):
+  """Get descendant random variables of input.
+
+  Args:
+    x: RandomVariable or tf.Tensor.
+      Query node to find descendants of.
+    collection: list of RandomVariable.
+      The collection of random variables to check with respect to;
+      defaults to all random variables in the graph.
+
+  Returns:
+    list of RandomVariable.
+    Descendant random variables of x.
+
+  #### Examples
+
+  ```python
+  a = Normal(0.0, 1.0)
+  b = Normal(a, 1.0)
+  c = Normal(a, 1.0)
+  d = Normal(c, 1.0)
+  assert set(ed.get_descendants(a)) == set([b, c, d])
+  ```
+  """
+  if collection is None:
+    collection = random_variables()
+
+  node_dict = {node.value: node for node in collection}
+
+  # Traverse the graph. Add each node to the set if it's in the collection.
+  output = set()
+  visited = set()
+  nodes = {x}
+  while nodes:
+    node = nodes.pop()
+
+    if node in visited:
+      continue
+    visited.add(node)
+
+    if isinstance(node, RandomVariable):
+      node = node.value
+
+    candidate_node = node_dict.get(node, None)
+    if candidate_node is not None and candidate_node != x:
+      output.add(candidate_node)
+
+    for op in node.consumers():
+      nodes.update(op.outputs)
+
+  return list(output)
+
+
+def get_parents(x, collection=None):
+  """Get parent random variables of input.
+
+  Args:
+    x: RandomVariable or tf.Tensor.
+      Query node to find parents of.
+    collection: list of RandomVariable.
+      The collection of random variables to check with respect to;
+      defaults to all random variables in the graph.
+
+  Returns:
+    list of RandomVariable.
+    Parent random variables of x.
+
+  #### Examples
+
+  ```python
+  a = Normal(0.0, 1.0)
+  b = Normal(a, 1.0)
+  c = Normal(0.0, 1.0)
+  d = Normal(b * c, 1.0)
+  assert set(ed.get_parents(d)) == set([b, c])
+  ```
+  """
+  if collection is None:
+    collection = random_variables()
+
+  node_dict = {node.value: node for node in collection}
+
+  # Traverse the graph. Add each node to the set if it's in the collection.
+  output = set()
+  visited = set()
+  nodes = {x}
+  while nodes:
+    node = nodes.pop()
+
+    if node in visited:
+      continue
+    visited.add(node)
+
+    if isinstance(node, RandomVariable):
+      node = node.value
+
+    candidate_node = node_dict.get(node, None)
+    if candidate_node is not None and candidate_node != x:
+      output.add(candidate_node)
+    else:
+      nodes.update(node.op.inputs)
+
+  return list(output)
+
+
+def get_siblings(x, collection=None):
+  """Get sibling random variables of input.
+
+  Args:
+    x: RandomVariable or tf.Tensor.
+      Query node to find siblings of.
+    collection: list of RandomVariable.
+      The collection of random variables to check with respect to;
+      defaults to all random variables in the graph.
+
+  Returns:
+    list of RandomVariable.
+    Sibling random variables of x.
+
+  #### Examples
+
+  ```python
+  a = Normal(0.0, 1.0)
+  b = Normal(a, 1.0)
+  c = Normal(a, 1.0)
+  assert ed.get_siblings(b) == [c]
+  ```
+  """
+  parents = get_parents(x, collection)
+  siblings = set()
+  for parent in parents:
+    siblings.update(get_children(parent, collection))
+
+  siblings.discard(x)
+  return list(siblings)
+
+
+def get_variables(x, collection=None):
+  """Get parent TensorFlow variables of input.
+
+  Args:
+    x: RandomVariable or tf.Tensor.
+      Query node to find parents of.
+    collection: list of tf.Variable.
+      The collection of variables to check with respect to; defaults to
+      all variables in the graph.
+
+  Returns:
+    list of tf.Variable.
+    TensorFlow variables that x depends on.
+
+  #### Examples
+
+  ```python
+  a = tf.Variable(0.0)
+  b = tf.Variable(0.0)
+  c = Normal(a * b, 1.0)
+  assert set(ed.get_variables(c)) == set([a, b])
+  ```
+  """
+  if collection is None:
+    collection = tf.global_variables()
+
+  node_dict = {node.name: node for node in collection}
+
+  # Traverse the graph. Add each node to the set if it's in the collection.
+  output = set()
+  visited = set()
+  nodes = {x}
+  while nodes:
+    node = nodes.pop()
+
+    if node in visited:
+      continue
+    visited.add(node)
+
+    if isinstance(node, RandomVariable):
+      node = node.value
+
+    candidate_node = node_dict.get(node.name, None)
+    if candidate_node is not None and candidate_node != x:
+      output.add(candidate_node)
+
+    nodes.update(node.op.inputs)
+
+  return list(output)
+
+
+def is_independent(a, b, condition=None):
+  """Assess whether a is independent of b given the random variables in
+  condition.
+
+  Implemented using the Bayes-Ball algorithm [@schachter1998bayes].
+
+  Args:
+    a: RandomVariable or list of RandomVariable.
+       Query node(s).
+    b: RandomVariable or list of RandomVariable.
+       Query node(s).
+    condition: RandomVariable or list of RandomVariable.
+       Random variable(s) to condition on.
+
+  Returns:
+    bool.
+    True if a is independent of b given the random variables in condition.
+
+  #### Examples
+
+  ```python
+  a = Normal(0.0, 1.0)
+  b = Normal(a, 1.0)
+  c = Normal(a, 1.0)
+  assert ed.is_independent(b, c, condition=a)
+  ```
+  """
+  if condition is None:
+    condition = []
+  if not isinstance(a, list):
+    a = [a]
+  if not isinstance(b, list):
+    b = [b]
+  if not isinstance(condition, list):
+    condition = [condition]
+  A = set(a)
+  B = set(b)
+  condition = set(condition)
+
+  top_marked = set()
+  # The Bayes-Ball algorithm will traverse the belief network
+  # and add each node that is relevant to B given condition
+  # to the set bottom_marked. A and B are conditionally
+  # independent if no node in A is in bottom_marked.
+  bottom_marked = set()
+
+  schedule = [(node, "child") for node in B]
+  while schedule:
+    node, came_from = schedule.pop()
+
+    if node not in condition and came_from == "child":
+      if node not in top_marked:
+        top_marked.add(node)
+        for parent in get_parents(node):
+          schedule.append((parent, "child"))
+
+      # TODO
+      from edward.models import PointMass
+      if not isinstance(node, PointMass) and node not in bottom_marked:
+        bottom_marked.add(node)
+        if node in A:
+          return False  # node in A is relevant to B
+        for child in get_children(node):
+          schedule.append((child, "parent"))
+
+    elif came_from == "parent":
+      if node in condition and node not in top_marked:
+        top_marked.add(node)
+        for parent in get_parents(node):
+          schedule.append((parent, "child"))
+
+      elif node not in condition and node not in bottom_marked:
+        bottom_marked.add(node)
+        if node in A:
+          return False  # node in A is relevant to B
+        for child in get_children(node):
+          schedule.append((child, "parent"))
+
+  return True
+
+
+del random_variables
diff --git a/edward/models/random_variable.py b/edward/models/random_variable.py
index 4a4bfb145..3c2986ad0 100644
--- a/edward/models/random_variable.py
+++ b/edward/models/random_variable.py
@@ -216,37 +216,37 @@ def numpy(self):
 
   def get_ancestors(self, collection=None):
     """Get ancestor random variables."""
-    from edward.util.random_variables import get_ancestors
+    from edward.models.queries import get_ancestors
     return get_ancestors(self, collection)
 
   def get_blanket(self, collection=None):
     """Get the random variable's Markov blanket."""
-    from edward.util.random_variables import get_blanket
+    from edward.models.queries import get_blanket
     return get_blanket(self, collection)
 
   def get_children(self, collection=None):
     """Get child random variables."""
-    from edward.util.random_variables import get_children
+    from edward.models.queries import get_children
     return get_children(self, collection)
 
   def get_descendants(self, collection=None):
     """Get descendant random variables."""
-    from edward.util.random_variables import get_descendants
+    from edward.models.queries import get_descendants
     return get_descendants(self, collection)
 
   def get_parents(self, collection=None):
     """Get parent random variables."""
-    from edward.util.random_variables import get_parents
+    from edward.models.queries import get_parents
     return get_parents(self, collection)
 
   def get_siblings(self, collection=None):
     """Get sibling random variables."""
-    from edward.util.random_variables import get_siblings
+    from edward.models.queries import get_siblings
     return get_siblings(self, collection)
 
   def get_variables(self, collection=None):
     """Get TensorFlow variables that the random variable depends on."""
-    from edward.util.random_variables import get_variables
+    from edward.models.queries import get_variables
     return get_variables(self, collection)
 
   def get_shape(self):
diff --git a/edward/util/__init__.py b/edward/util/__init__.py
index ec1d99ab3..6dfba6160 100644
--- a/edward/util/__init__.py
+++ b/edward/util/__init__.py
@@ -11,15 +11,7 @@
 
 _allowed_symbols = [
     'copy',
-    'get_ancestors',
-    'get_blanket',
-    'get_children',
     'get_control_variate_coef',
-    'get_descendants',
-    'get_parents',
-    'get_siblings',
-    'get_variables',
-    'is_independent',
     'transform',
 ]
 
diff --git a/edward/util/random_variables.py b/edward/util/random_variables.py
index 194ed7fb7..f6770a6a0 100644
--- a/edward/util/random_variables.py
+++ b/edward/util/random_variables.py
@@ -2,7 +2,6 @@
 from __future__ import division
 from __future__ import print_function
 
-import six
 import tensorflow as tf
 
 from copy import deepcopy
@@ -378,416 +377,6 @@ def copy(org_instance, dict_swap=None, scope="copied",
     raise TypeError("Could not copy instance: " + str(org_instance))
 
 
-def get_ancestors(x, collection=None):
-  """Get ancestor random variables of input.
-
-  Args:
-    x: RandomVariable or tf.Tensor.
-      Query node to find ancestors of.
-    collection: list of RandomVariable.
-      The collection of random variables to check with respect to;
-      defaults to all random variables in the graph.
-
-  Returns:
-    list of RandomVariable.
-    Ancestor random variables of x.
-
-  #### Examples
-  ```python
-  a = Normal(0.0, 1.0)
-  b = Normal(a, 1.0)
-  c = Normal(0.0, 1.0)
-  d = Normal(b * c, 1.0)
-  assert set(ed.get_ancestors(d)) == set([a, b, c])
-  ```
-  """
-  if collection is None:
-    collection = random_variables()
-
-  node_dict = {node.value: node for node in collection}
-
-  # Traverse the graph. Add each node to the set if it's in the collection.
-  output = set()
-  visited = set()
-  nodes = {x}
-  while nodes:
-    node = nodes.pop()
-
-    if node in visited:
-      continue
-    visited.add(node)
-
-    if isinstance(node, RandomVariable):
-      node = node.value
-
-    candidate_node = node_dict.get(node, None)
-    if candidate_node is not None and candidate_node != x:
-      output.add(candidate_node)
-
-    nodes.update(node.op.inputs)
-
-  return list(output)
-
-
-def get_blanket(x, collection=None):
-  """Get Markov blanket of input, which consists of its parents, its
-  children, and the other parents of its children.
-
-  Args:
-    x: RandomVariable or tf.Tensor.
-      Query node to find Markov blanket of.
-    collection: list of RandomVariable.
-      The collection of random variables to check with respect to;
-      defaults to all random variables in the graph.
-
-  Returns:
-    list of RandomVariable.
-    Markov blanket of x.
-
-  #### Examples
-
-  ```python
-  a = Normal(0.0, 1.0)
-  b = Normal(0.0, 1.0)
-  c = Normal(a * b, 1.0)
-  d = Normal(0.0, 1.0)
-  e = Normal(c * d, 1.0)
-  assert set(ed.get_blanket(c)) == set([a, b, d, e])
-  ```
-  """
-  output = set()
-  output.update(get_parents(x, collection))
-  children = get_children(x, collection)
-  output.update(children)
-  for child in children:
-    output.update(get_parents(child, collection))
-
-  output.discard(x)
-  return list(output)
-
-
-def get_children(x, collection=None):
-  """Get child random variables of input.
-
-  Args:
-    x: RandomVariable or tf.Tensor>
-      Query node to find children of.
-    collection: list of RandomVariable.
-      The collection of random variables to check with respect to;
-      defaults to all random variables in the graph.
-
-  Returns:
-    list of RandomVariable.
-    Child random variables of x.
-
-  #### Examples
-
-  ```python
-  a = Normal(0.0, 1.0)
-  b = Normal(a, 1.0)
-  c = Normal(a, 1.0)
-  d = Normal(c, 1.0)
-  assert set(ed.get_children(a)) == set([b, c])
-  ```
-  """
-  if collection is None:
-    collection = random_variables()
-
-  node_dict = {node.value: node for node in collection}
-
-  # Traverse the graph. Add each node to the set if it's in the collection.
-  output = set()
-  visited = set()
-  nodes = {x}
-  while nodes:
-    node = nodes.pop()
-
-    if node in visited:
-      continue
-    visited.add(node)
-
-    if isinstance(node, RandomVariable):
-      node = node.value
-
-    candidate_node = node_dict.get(node, None)
-    if candidate_node is not None and candidate_node != x:
-      output.add(candidate_node)
-    else:
-      for op in node.consumers():
-        nodes.update(op.outputs)
-
-  return list(output)
-
-
-def get_descendants(x, collection=None):
-  """Get descendant random variables of input.
-
-  Args:
-    x: RandomVariable or tf.Tensor.
-      Query node to find descendants of.
-    collection: list of RandomVariable.
-      The collection of random variables to check with respect to;
-      defaults to all random variables in the graph.
-
-  Returns:
-    list of RandomVariable.
-    Descendant random variables of x.
-
-  #### Examples
-
-  ```python
-  a = Normal(0.0, 1.0)
-  b = Normal(a, 1.0)
-  c = Normal(a, 1.0)
-  d = Normal(c, 1.0)
-  assert set(ed.get_descendants(a)) == set([b, c, d])
-  ```
-  """
-  if collection is None:
-    collection = random_variables()
-
-  node_dict = {node.value: node for node in collection}
-
-  # Traverse the graph. Add each node to the set if it's in the collection.
-  output = set()
-  visited = set()
-  nodes = {x}
-  while nodes:
-    node = nodes.pop()
-
-    if node in visited:
-      continue
-    visited.add(node)
-
-    if isinstance(node, RandomVariable):
-      node = node.value
-
-    candidate_node = node_dict.get(node, None)
-    if candidate_node is not None and candidate_node != x:
-      output.add(candidate_node)
-
-    for op in node.consumers():
-      nodes.update(op.outputs)
-
-  return list(output)
-
-
-def get_parents(x, collection=None):
-  """Get parent random variables of input.
-
-  Args:
-    x: RandomVariable or tf.Tensor.
-      Query node to find parents of.
-    collection: list of RandomVariable.
-      The collection of random variables to check with respect to;
-      defaults to all random variables in the graph.
-
-  Returns:
-    list of RandomVariable.
-    Parent random variables of x.
-
-  #### Examples
-
-  ```python
-  a = Normal(0.0, 1.0)
-  b = Normal(a, 1.0)
-  c = Normal(0.0, 1.0)
-  d = Normal(b * c, 1.0)
-  assert set(ed.get_parents(d)) == set([b, c])
-  ```
-  """
-  if collection is None:
-    collection = random_variables()
-
-  node_dict = {node.value: node for node in collection}
-
-  # Traverse the graph. Add each node to the set if it's in the collection.
-  output = set()
-  visited = set()
-  nodes = {x}
-  while nodes:
-    node = nodes.pop()
-
-    if node in visited:
-      continue
-    visited.add(node)
-
-    if isinstance(node, RandomVariable):
-      node = node.value
-
-    candidate_node = node_dict.get(node, None)
-    if candidate_node is not None and candidate_node != x:
-      output.add(candidate_node)
-    else:
-      nodes.update(node.op.inputs)
-
-  return list(output)
-
-
-def get_siblings(x, collection=None):
-  """Get sibling random variables of input.
-
-  Args:
-    x: RandomVariable or tf.Tensor.
-      Query node to find siblings of.
-    collection: list of RandomVariable.
-      The collection of random variables to check with respect to;
-      defaults to all random variables in the graph.
-
-  Returns:
-    list of RandomVariable.
-    Sibling random variables of x.
-
-  #### Examples
-
-  ```python
-  a = Normal(0.0, 1.0)
-  b = Normal(a, 1.0)
-  c = Normal(a, 1.0)
-  assert ed.get_siblings(b) == [c]
-  ```
-  """
-  parents = get_parents(x, collection)
-  siblings = set()
-  for parent in parents:
-    siblings.update(get_children(parent, collection))
-
-  siblings.discard(x)
-  return list(siblings)
-
-
-def get_variables(x, collection=None):
-  """Get parent TensorFlow variables of input.
-
-  Args:
-    x: RandomVariable or tf.Tensor.
-      Query node to find parents of.
-    collection: list of tf.Variable.
-      The collection of variables to check with respect to; defaults to
-      all variables in the graph.
-
-  Returns:
-    list of tf.Variable.
-    TensorFlow variables that x depends on.
-
-  #### Examples
-
-  ```python
-  a = tf.Variable(0.0)
-  b = tf.Variable(0.0)
-  c = Normal(a * b, 1.0)
-  assert set(ed.get_variables(c)) == set([a, b])
-  ```
-  """
-  if collection is None:
-    collection = tf.global_variables()
-
-  node_dict = {node.name: node for node in collection}
-
-  # Traverse the graph. Add each node to the set if it's in the collection.
-  output = set()
-  visited = set()
-  nodes = {x}
-  while nodes:
-    node = nodes.pop()
-
-    if node in visited:
-      continue
-    visited.add(node)
-
-    if isinstance(node, RandomVariable):
-      node = node.value
-
-    candidate_node = node_dict.get(node.name, None)
-    if candidate_node is not None and candidate_node != x:
-      output.add(candidate_node)
-
-    nodes.update(node.op.inputs)
-
-  return list(output)
-
-
-def is_independent(a, b, condition=None):
-  """Assess whether a is independent of b given the random variables in
-  condition.
-
-  Implemented using the Bayes-Ball algorithm [@schachter1998bayes].
-
-  Args:
-    a: RandomVariable or list of RandomVariable.
-       Query node(s).
-    b: RandomVariable or list of RandomVariable.
-       Query node(s).
-    condition: RandomVariable or list of RandomVariable.
-       Random variable(s) to condition on.
-
-  Returns:
-    bool.
-    True if a is independent of b given the random variables in condition.
-
-  #### Examples
-
-  ```python
-  a = Normal(0.0, 1.0)
-  b = Normal(a, 1.0)
-  c = Normal(a, 1.0)
-  assert ed.is_independent(b, c, condition=a)
-  ```
-  """
-  if condition is None:
-    condition = []
-  if not isinstance(a, list):
-    a = [a]
-  if not isinstance(b, list):
-    b = [b]
-  if not isinstance(condition, list):
-    condition = [condition]
-  A = set(a)
-  B = set(b)
-  condition = set(condition)
-
-  top_marked = set()
-  # The Bayes-Ball algorithm will traverse the belief network
-  # and add each node that is relevant to B given condition
-  # to the set bottom_marked. A and B are conditionally
-  # independent if no node in A is in bottom_marked.
-  bottom_marked = set()
-
-  schedule = [(node, "child") for node in B]
-  while schedule:
-    node, came_from = schedule.pop()
-
-    if node not in condition and came_from == "child":
-      if node not in top_marked:
-        top_marked.add(node)
-        for parent in get_parents(node):
-          schedule.append((parent, "child"))
-
-      # TODO
-      from edward.models import PointMass
-      if not isinstance(node, PointMass) and node not in bottom_marked:
-        bottom_marked.add(node)
-        if node in A:
-          return False  # node in A is relevant to B
-        for child in get_children(node):
-          schedule.append((child, "parent"))
-
-    elif came_from == "parent":
-      if node in condition and node not in top_marked:
-        top_marked.add(node)
-        for parent in get_parents(node):
-          schedule.append((parent, "child"))
-
-      elif node not in condition and node not in bottom_marked:
-        bottom_marked.add(node)
-        if node in A:
-          return False  # node in A is relevant to B
-        for child in get_children(node):
-          schedule.append((child, "parent"))
-
-  return True
-
-
 def transform(x, *args, **kwargs):
   """Transform a continuous random variable to the unconstrained space.
 
diff --git a/tests/util/get_ancestors_test.py b/tests/models/get_ancestors_test.py
similarity index 97%
rename from tests/util/get_ancestors_test.py
rename to tests/models/get_ancestors_test.py
index eaaa67ad4..9b2852cf6 100644
--- a/tests/util/get_ancestors_test.py
+++ b/tests/models/get_ancestors_test.py
@@ -4,8 +4,7 @@
 
 import tensorflow as tf
 
-from edward.models import Bernoulli, Normal
-from edward.util import get_ancestors
+from edward.models import Bernoulli, Normal, get_ancestors
 
 
 class test_get_ancestors_class(tf.test.TestCase):
diff --git a/tests/util/get_blanket_test.py b/tests/models/get_blanket_test.py
similarity index 91%
rename from tests/util/get_blanket_test.py
rename to tests/models/get_blanket_test.py
index 21f32eec6..e1a6ed109 100644
--- a/tests/util/get_blanket_test.py
+++ b/tests/models/get_blanket_test.py
@@ -4,8 +4,7 @@
 
 import tensorflow as tf
 
-from edward.models import Bernoulli, Normal
-from edward.util import get_blanket
+from edward.models import Bernoulli, Normal, get_blanket
 
 
 class test_get_blanket_class(tf.test.TestCase):
diff --git a/tests/util/get_children_test.py b/tests/models/get_children_test.py
similarity index 97%
rename from tests/util/get_children_test.py
rename to tests/models/get_children_test.py
index bf6c05a99..345d4b500 100644
--- a/tests/util/get_children_test.py
+++ b/tests/models/get_children_test.py
@@ -4,8 +4,7 @@
 
 import tensorflow as tf
 
-from edward.models import Bernoulli, Normal
-from edward.util import get_children
+from edward.models import Bernoulli, Normal, get_children
 
 
 class test_get_children_class(tf.test.TestCase):
diff --git a/tests/util/get_descendants_test.py b/tests/models/get_descendants_test.py
similarity index 97%
rename from tests/util/get_descendants_test.py
rename to tests/models/get_descendants_test.py
index f70eeb6bd..4e5fbb72c 100644
--- a/tests/util/get_descendants_test.py
+++ b/tests/models/get_descendants_test.py
@@ -4,8 +4,7 @@
 
 import tensorflow as tf
 
-from edward.models import Bernoulli, Normal
-from edward.util import get_descendants
+from edward.models import Bernoulli, Normal, get_descendants
 
 
 class test_get_descendants_class(tf.test.TestCase):
diff --git a/tests/util/get_parents_test.py b/tests/models/get_parents_test.py
similarity index 97%
rename from tests/util/get_parents_test.py
rename to tests/models/get_parents_test.py
index eccb0782f..7fd40b619 100644
--- a/tests/util/get_parents_test.py
+++ b/tests/models/get_parents_test.py
@@ -4,8 +4,7 @@
 
 import tensorflow as tf
 
-from edward.models import Bernoulli, Normal
-from edward.util import get_parents
+from edward.models import Bernoulli, Normal, get_parents
 
 
 class test_get_parents_class(tf.test.TestCase):
diff --git a/tests/util/get_siblings_test.py b/tests/models/get_siblings_test.py
similarity index 97%
rename from tests/util/get_siblings_test.py
rename to tests/models/get_siblings_test.py
index b54543d6b..5a792468a 100644
--- a/tests/util/get_siblings_test.py
+++ b/tests/models/get_siblings_test.py
@@ -4,8 +4,7 @@
 
 import tensorflow as tf
 
-from edward.models import Bernoulli, Normal
-from edward.util import get_siblings
+from edward.models import Bernoulli, Normal, get_siblings
 
 
 class test_get_siblings_class(tf.test.TestCase):
diff --git a/tests/util/get_variables_test.py b/tests/models/get_variables_test.py
similarity index 97%
rename from tests/util/get_variables_test.py
rename to tests/models/get_variables_test.py
index cd5e91f78..c73a9bf96 100644
--- a/tests/util/get_variables_test.py
+++ b/tests/models/get_variables_test.py
@@ -4,8 +4,7 @@
 
 import tensorflow as tf
 
-from edward.models import Bernoulli, Normal
-from edward.util import get_variables
+from edward.models import Bernoulli, Normal, get_variables
 
 
 class test_get_variables_class(tf.test.TestCase):
diff --git a/tests/util/is_independent_test.py b/tests/models/is_independent_test.py
similarity index 96%
rename from tests/util/is_independent_test.py
rename to tests/models/is_independent_test.py
index adeaf9ab5..fec5a2324 100644
--- a/tests/util/is_independent_test.py
+++ b/tests/models/is_independent_test.py
@@ -4,8 +4,7 @@
 
 import tensorflow as tf
 
-from edward.models import Normal
-from edward.util import is_independent
+from edward.models import Normal, is_independent
 
 
 class test_is_independent_class(tf.test.TestCase):

From c92e369527f45c9b3fc5f57db32bc51d25cd9e23 Mon Sep 17 00:00:00 2001
From: Dustin Tran <dustinviettran@gmail.com>
Date: Sun, 14 Jan 2018 15:59:59 -0800
Subject: [PATCH 17/27] extend {klpq,klqp,laplace,map,wake_sleep}.py,gans to
 trace

---
 edward/__init__.py                            |   26 +-
 edward/inferences/__init__.py                 |   54 +-
 edward/inferences/bigan_inference.py          |   57 +-
 edward/inferences/conjugacy/conjugacy.py      |    1 -
 edward/inferences/gan_inference.py            |   38 +-
 edward/inferences/hmc.py                      |    1 -
 edward/inferences/inference.py                |  492 +++----
 edward/inferences/klpq.py                     |  100 +-
 edward/inferences/klqp.py                     | 1173 ++++-------------
 .../{implicit_klqp.py => klqp_implicit.py}    |  150 +--
 edward/inferences/laplace.py                  |  112 +-
 edward/inferences/map.py                      |   84 +-
 edward/inferences/metropolis_hastings.py      |    2 +-
 edward/inferences/sghmc.py                    |    1 -
 edward/inferences/sgld.py                     |    1 -
 edward/inferences/wake_sleep.py               |  117 +-
 edward/inferences/wgan_inference.py           |   34 +-
 edward/util/__init__.py                       |    1 -
 edward/util/random_variables.py               |  368 ------
 tests/util/copy_test.py                       |  248 ----
 20 files changed, 741 insertions(+), 2319 deletions(-)
 rename edward/inferences/{implicit_klqp.py => klqp_implicit.py} (59%)
 delete mode 100644 tests/util/copy_test.py

diff --git a/edward/__init__.py b/edward/__init__.py
index 8c44017f8..ae081073b 100644
--- a/edward/__init__.py
+++ b/edward/__init__.py
@@ -11,16 +11,12 @@
     bigan_inference,
     complete_conditional,
     gan_inference,
-    implicit_klqp,
     klpq,
     klqp,
-    reparameterization_klqp,
-    reparameterization_kl_klqp,
-    reparameterization_entropy_klqp,
-    score_klqp,
-    score_kl_klqp,
-    score_entropy_klqp,
-    score_rb_klqp,
+    klqp_implicit,
+    klqp_reparameterization,
+    klqp_reparameterization_kl,
+    klqp_score,
     laplace,
     map,
     wake_sleep,
@@ -38,7 +34,6 @@
     is_independent,
     random_variables)
 from edward.util import (
-    copy,
     get_control_variate_coef,
     transform)
 from edward.version import __version__, VERSION
@@ -53,7 +48,6 @@
     'bigan_inference',
     'complete_conditional',
     'gan_inference',
-    'implicit_klqp',
     'MonteCarlo',
     'HMC',
     'MetropolisHastings',
@@ -61,20 +55,16 @@
     'SGHMC',
     'klpq',
     'klqp',
-    'reparameterization_klqp',
-    'reparameterization_kl_klqp',
-    'reparameterization_entropy_klqp',
-    'score_klqp',
-    'score_kl_klqp',
-    'score_entropy_klqp',
-    'score_rb_klqp',
+    'klqp_implicit',
+    'klqp_reparameterization',
+    'klqp_reparameterization_kl',
+    'klqp_score',
     'laplace',
     'map',
     'wake_sleep',
     'wgan_inference',
     'Gibbs',
     'Trace',
-    'copy',
     'get_ancestors',
     'get_blanket',
     'get_children',
diff --git a/edward/inferences/__init__.py b/edward/inferences/__init__.py
index 10c664cb3..d11af8efc 100644
--- a/edward/inferences/__init__.py
+++ b/edward/inferences/__init__.py
@@ -1,4 +1,44 @@
 """
+There are two approaches to inference.
+
+1. Idiomatic TensorFlow
+  1. Build train_op (*).
+  2. Build summary file writer.
+  3. Build and run TensorFlow variable initializer ops.
+  4. Build progressbar (*).
+  5. Within a training loop:
+    + sess.run with infeeding and summary writers.
+    + Update progressbar (*).
+    + Check convergence (*).
+  6. Build and run post-training ops (*).
+2. Idiomatic TensorFlow Estimator
+  + Call train() (*). It is a higher-order function taking in the
+  model program, data, an optional inference function to build the
+  train_op, and various other things. As an inference engine, it
+  automates the process above.
+
+Inference provides functions for both approaches. In the first
+approach, it provides (*), namely: (1) inference algorithms to help
+produce the train_op (and low-level functions to build your own
+algorithms); (2) a progressbar to build and update; (3) convergence
+diagnostics; and (4) post-training ops for certain algorithms. In the
+second approach, it provides the fully automated train().
+
+Inference uses (unbinded) pure functions with TensorFlow idiomatic
+exceptions (e.g., mutable state via TensorFlow variables; side effect
+of adding to global collections and TF graph). It forgoes OO.
+
+This file is a collection of functions shared across inference
+algorithms, used for the following:
+
++ "call f up to args" (in `inferences/inference`)
++ a "make intercept" factory (in `inferences/inference`)
++ automated transforms (in `inferences/inference` and `util/random_variables`)
++ programmatic docstrings (in `inferences/docstrings`)
++ `train` (in `inferences/inference`)
+
+Specific inference files provide functions to help produce the train
+(and post-training) ops.
 """
 from __future__ import absolute_import
 from __future__ import division
@@ -9,10 +49,10 @@
 from edward.inferences.gan_inference import *
 # from edward.inferences.gibbs import *
 # from edward.inferences.hmc import *
-from edward.inferences.implicit_klqp import *
 from edward.inferences.inference import *
 from edward.inferences.klpq import *
 from edward.inferences.klqp import *
+from edward.inferences.klqp_implicit import *
 from edward.inferences.laplace import *
 from edward.inferences.map import *
 # from edward.inferences.metropolis_hastings import *
@@ -28,18 +68,14 @@
     'bigan_inference',
     'complete_conditional',
     'gan_inference',
-    'implicit_klqp',
     'Gibbs',
     'HMC',
     'klpq',
     'klqp',
-    'reparameterization_klqp',
-    'reparameterization_kl_klqp',
-    'reparameterization_entropy_klqp',
-    'score_klqp',
-    'score_kl_klqp',
-    'score_entropy_klqp',
-    'score_rb_klqp',
+    'klqp_implicit',
+    'klqp_reparameterization',
+    'klqp_reparameterization_kl',
+    'klqp_score',
     'laplace',
     'map',
     'MetropolisHastings',
diff --git a/edward/inferences/bigan_inference.py b/edward/inferences/bigan_inference.py
index 7a440a547..9a9effa75 100644
--- a/edward/inferences/bigan_inference.py
+++ b/edward/inferences/bigan_inference.py
@@ -5,13 +5,12 @@
 import six
 import tensorflow as tf
 
-from edward.inferences.inference import (check_and_maybe_build_data,
-    check_and_maybe_build_latent_vars, transform, check_and_maybe_build_dict, check_and_maybe_build_var_list)
+from edward.inferences.inference import call_function_up_to_args
+from edward.models.core import Trace
 
 
-def bigan_inference(latent_vars=None, data=None, discriminator=None,
-                    auto_transform=True, scale=None, var_list=None,
-                    collections=None):
+def bigan_inference(model, variational, discriminator, align_data,
+                    align_latent, collections=None, *args, **kwargs):
   """Adversarially Learned Inference [@dumuolin2017adversarially] or
   Bidirectional Generative Adversarial Networks [@donahue2017adversarial]
   for joint learning of generator and inference networks.
@@ -44,20 +43,27 @@ def bigan_inference(latent_vars=None, data=None, discriminator=None,
     zf = gen_latent(x_ph)
   inference = ed.BiGANInference({z_ph: zf}, {xf: x_ph}, discriminator)
   ```
-  """
-  if not callable(discriminator):
-    raise TypeError("discriminator must be a callable function.")
-  latent_vars = check_and_maybe_build_latent_vars(latent_vars)
-  data = check_and_maybe_build_data(data)
-  latent_vars, _ = transform(latent_vars, auto_transform)
-  scale = check_and_maybe_build_dict(scale)
-  var_list = check_and_maybe_build_var_list(var_list, latent_vars, data)
-
-  x_true = list(six.itervalues(self.data))[0]
-  x_fake = list(six.iterkeys(self.data))[0]
 
-  z_true = list(six.iterkeys(self.latent_vars))[0]
-  z_fake = list(six.itervalues(self.latent_vars))[0]
+  `align_latent` must only align one random variable in `model` and
+  `variational`. `model` must return the generated data.
+  """
+  with Trace() as posterior_trace:
+    call_function_up_to_args(variational, *args, **kwargs)
+  with Trace() as model_trace:
+    x_fake = call_function_up_to_args(model, *args, **kwargs)
+
+  key = align_data(x_fake.name.split(':')[0])
+  if isinstance(key, int):
+    x_true = args[key]
+  elif kwargs.get(key, None) is not None:
+    x_true = kwargs.get(key)
+
+  for name, node in six.iteritems(model_trace):
+    aligned = align_latent(name)
+    if aligned is not None:
+      z_true = node.value
+      z_fake = posterior_trace[aligned].value
+      break
 
   with tf.variable_scope("Disc"):
       # xtzf := x_true, z_fake
@@ -76,18 +82,9 @@ def bigan_inference(latent_vars=None, data=None, discriminator=None,
           labels=tf.ones_like(d_xtzf), logits=d_xtzf)
 
   reg_terms_d = tf.losses.get_regularization_losses(scope="Disc")
-  reg_terms = tf.losses.get_regularization_losses(scope="Gen")
+  reg_terms_all = tf.losses.get_regularization_losses()
+  reg_terms = [r for r in reg_terms_all if r not in reg_terms_d]
 
   loss_d = tf.reduce_mean(loss_d) + tf.reduce_sum(reg_terms_d)
   loss = tf.reduce_mean(loss) + tf.reduce_sum(reg_terms)
-
-  var_list_d = tf.get_collection(
-      tf.GraphKeys.TRAINABLE_VARIABLES, scope="Disc")
-  var_list = tf.get_collection(
-      tf.GraphKeys.TRAINABLE_VARIABLES, scope="Gen")
-
-  grads_d = tf.gradients(loss_d, var_list_d)
-  grads = tf.gradients(loss, var_list)
-  grads_and_vars_d = list(zip(grads_d, var_list_d))
-  grads_and_vars = list(zip(grads, var_list))
-  return loss, grads_and_vars, loss_d, grads_and_vars_d
+  return loss, loss_d
diff --git a/edward/inferences/conjugacy/conjugacy.py b/edward/inferences/conjugacy/conjugacy.py
index 118c430d2..1f805c53d 100644
--- a/edward/inferences/conjugacy/conjugacy.py
+++ b/edward/inferences/conjugacy/conjugacy.py
@@ -13,7 +13,6 @@
     import symbolic_suff_stat, full_simplify, expr_contains, reconstruct_expr
 from edward.models import get_blanket
 from edward.models.core import *
-from edward.util import copy
 
 
 def mvn_diag_from_natural_params(p1, p2):
diff --git a/edward/inferences/gan_inference.py b/edward/inferences/gan_inference.py
index a1b979733..0713f2813 100644
--- a/edward/inferences/gan_inference.py
+++ b/edward/inferences/gan_inference.py
@@ -5,12 +5,11 @@
 import six
 import tensorflow as tf
 
-from edward.inferences.inference import (check_and_maybe_build_data,
-    transform, check_and_maybe_build_dict, check_and_maybe_build_var_list)
+from edward.inferences.inference import call_function_up_to_args
 
 
-def gan_inference(data=None, discriminator=None,
-                  scale=None, var_list=None, collections=None):
+def gan_inference(model, discriminator, align_data,
+                  collections=None, *args, **kwargs):
   """Parameter estimation with GAN-style training
   [@goodfellow2014generative].
 
@@ -55,18 +54,15 @@ def gan_inference(data=None, discriminator=None,
       Function (with parameters) to discriminate samples. It should
       output logit probabilities (real-valued) and not probabilities
       in $[0, 1]$.
-    var_list: list of tf.Variable, optional.
-      List of TensorFlow variables to optimize over (in the generative
-      model). Default is all trainable variables that `data` depends on.
+
+  `model` must return the generated data.
   """
-  if not callable(discriminator):
-    raise TypeError("discriminator must be a callable function.")
-  data = check_and_maybe_build_data(data)
-  scale = check_and_maybe_build_dict(scale)
-  var_list = check_and_maybe_build_var_list(var_list, {}, data)
-
-  x_true = list(six.itervalues(data))[0]
-  x_fake = list(six.iterkeys(data))[0]
+  x_fake = call_function_up_to_args(model, *args, **kwargs)
+  key = align_data(x_fake.name.split(':')[0])
+  if isinstance(key, int):
+    x_true = args[key]
+  elif kwargs.get(key, None) is not None:
+    x_true = kwargs.get(key)
   with tf.variable_scope("Disc"):
     d_true = discriminator(x_true)
 
@@ -90,14 +86,4 @@ def gan_inference(data=None, discriminator=None,
       labels=tf.ones_like(d_fake), logits=d_fake)
   loss_d = tf.reduce_mean(loss_d) + tf.reduce_sum(reg_terms_d)
   loss = tf.reduce_mean(loss) + tf.reduce_sum(reg_terms)
-
-  var_list_d = tf.get_collection(
-      tf.GraphKeys.TRAINABLE_VARIABLES, scope="Disc")
-  if var_list is None:
-    var_list = [v for v in tf.trainable_variables() if v not in var_list_d]
-
-  grads_d = tf.gradients(loss_d, var_list_d)
-  grads = tf.gradients(loss, var_list)
-  grads_and_vars_d = list(zip(grads_d, var_list_d))
-  grads_and_vars = list(zip(grads, var_list))
-  return loss, grads_and_vars, loss_d, grads_and_vars_d
+  return loss, loss_d
diff --git a/edward/inferences/hmc.py b/edward/inferences/hmc.py
index 23b8a9875..32b8120fc 100644
--- a/edward/inferences/hmc.py
+++ b/edward/inferences/hmc.py
@@ -8,7 +8,6 @@
 from collections import OrderedDict
 from edward.inferences.monte_carlo import MonteCarlo
 from edward.models import RandomVariable
-from edward.util import copy
 
 
 class HMC(MonteCarlo):
diff --git a/edward/inferences/inference.py b/edward/inferences/inference.py
index 24f8238c4..92f14cc30 100644
--- a/edward/inferences/inference.py
+++ b/edward/inferences/inference.py
@@ -1,46 +1,3 @@
-"""
-There are two approaches to inference.
-
-1. Idiomatic TensorFlow
-  1. Build train_op (need functions).
-  2. Build summary file writer.
-  3. Build and run TensorFlow variable initializer ops.
-  4. Build progressbar (need functions).
-  5. Within a training loop:
-    + sess.run with infeeding and summary writers.
-    + Update progressbar (need functions).
-    + Check convergence (need functions).
-  6. Build and run post-training ops (need functions).
-2. Idiomatic TensorFlow Estimator
-  + Call train(). It is a higher-order function taking in the model
-    program, inference function to build the train_op, and various
-    other things.
-
-Inference provides functions for both approaches. In the first
-approach, it provides (1) inference algorithms to help produce the
-train_op (and low-level functions to build your own algorithms); (2) a
-progressbar to build and update; (3) convergence diagnostics; and (4)
-post-training ops for certain algorithms. In the second approach, it
-provides the fully automated train().
-
-Inference uses (unbinded) pure functions with TensorFlow idiomatic
-exceptions (e.g., mutable state via TensorFlow variables; side effect
-of adding to global collections and TF graph). It forgoes OO.
-
-This file is a collection of functions shared across inference
-algorithms, used for the following:
-
-+ input checking and default constructors
-+ programmatic docstrings
-+ automated transforms
-+ summaries
-+ variable scoping
-+ train()
-+ for a subset of algs, optimizer and Monte Carlo stuff (TBA).
-
-Other files provide functions to help produce the train (and
-post-training) ops.
-"""
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
@@ -52,266 +9,102 @@
 
 from datetime import datetime
 from edward.models import RandomVariable
-from edward.util import get_session, get_variables, Progbar
+from edward.util import get_variables, Progbar
 from edward.util import transform as _transform
 
-from tensorflow.contrib.distributions import bijectors
+tfb = tf.contrib.distributions.bijectors
 
 
-def check_and_maybe_build_data(data):
-  """Check that the data dictionary passed during inference and
-  criticism is valid.
-
-  Args:
-    data: dict.
-      Data dictionary which binds observed variables (of type
-      `RandomVariable` or `tf.Tensor`) to their realizations (of
-      type `tf.Tensor`). It can also bind placeholders (of type
-      `tf.Tensor`) used in the model to their realizations; and
-      prior latent variables (of type `RandomVariable`) to posterior
-      latent variables (of type `RandomVariable`).
-  """
-  sess = get_session()
-  if data is None:
-    data = {}
-  elif not isinstance(data, dict):
-    raise TypeError("data must have type dict.")
-
-  for key, value in six.iteritems(data):
-    if isinstance(key, tf.Tensor) and "Placeholder" in key.op.type:
-      if isinstance(value, RandomVariable):
-        raise TypeError("The value of a feed cannot be a ed.RandomVariable "
-                        "object. "
-                        "Acceptable feed values include Python scalars, "
-                        "strings, lists, numpy ndarrays, or TensorHandles.")
-      elif isinstance(value, tf.Tensor):
-        raise TypeError("The value of a feed cannot be a tf.Tensor object. "
-                        "Acceptable feed values include Python scalars, "
-                        "strings, lists, numpy ndarrays, or TensorHandles.")
-    elif isinstance(key, (RandomVariable, tf.Tensor)):
-      if isinstance(value, (RandomVariable, tf.Tensor)):
-        if not key.shape.is_compatible_with(value.shape):
-          raise TypeError("Key-value pair in data does not have same "
-                          "shape: {}, {}".format(key.shape, value.shape))
-        elif key.dtype != value.dtype:
-          raise TypeError("Key-value pair in data does not have same "
-                          "dtype: {}, {}".format(key.dtype, value.dtype))
-      elif isinstance(value, (float, list, int, np.ndarray, np.number, str)):
-        if not key.shape.is_compatible_with(np.shape(value)):
-          raise TypeError("Key-value pair in data does not have same "
-                          "shape: {}, {}".format(key.shape, np.shape(value)))
-        elif isinstance(value, (np.ndarray, np.number)) and \
-                not np.issubdtype(value.dtype, np.float) and \
-                not np.issubdtype(value.dtype, np.int) and \
-                not np.issubdtype(value.dtype, np.str):
-          raise TypeError("Data value has an invalid dtype: "
-                          "{}".format(value.dtype))
-      else:
-        raise TypeError("Data value has an invalid type: "
-                        "{}".format(type(value)))
-    else:
-      raise TypeError("Data key has an invalid type: {}".format(type(key)))
-
-  processed_data = {}
-  for key, value in six.iteritems(data):
-    if isinstance(key, tf.Tensor) and "Placeholder" in key.op.type:
-      processed_data[key] = value
-    elif isinstance(key, (RandomVariable, tf.Tensor)):
-      if isinstance(value, (RandomVariable, tf.Tensor)):
-        processed_data[key] = value
-      elif isinstance(value, (float, list, int, np.ndarray, np.number, str)):
-        # If value is a Python type, store it in the graph.
-        # Assign its placeholder with the key's data type.
-        with tf.variable_scope(None, default_name="data"):
-          ph = tf.placeholder(key.dtype, np.shape(value))
-          var = tf.Variable(ph, trainable=False, collections=[])
-          sess.run(var.initializer, {ph: value})
-          processed_data[key] = var
-  return processed_data
-
-
-def check_and_maybe_build_latent_vars(latent_vars):
-  """Check that the latent variable dictionary passed during inference and
-  criticism is valid.
-
-  Args:
-    latent_vars: dict.
-      Collection of latent variables (of type `RandomVariable` or
-      `tf.Tensor`) to perform inference on. Each random variable is
-      binded to another random variable; the latter will infer the
-      former conditional on data.
-  """
-  if latent_vars is None:
-    latent_vars = {}
-  elif not isinstance(latent_vars, dict):
-    raise TypeError("latent_vars must have type dict.")
-
-  for key, value in six.iteritems(latent_vars):
-    if not isinstance(key, (RandomVariable, tf.Tensor)):
-      raise TypeError("Latent variable key has an invalid type: "
-                      "{}".format(type(key)))
-    elif not isinstance(value, (RandomVariable, tf.Tensor)):
-      raise TypeError("Latent variable value has an invalid type: "
-                      "{}".format(type(value)))
-    elif not key.shape.is_compatible_with(value.shape):
-      raise TypeError("Key-value pair in latent_vars does not have same "
-                      "shape: {}, {}".format(key.shape, value.shape))
-    elif key.dtype != value.dtype:
-      raise TypeError("Key-value pair in latent_vars does not have same "
-                      "dtype: {}, {}".format(key.dtype, value.dtype))
-  return latent_vars
-
-
-def check_and_maybe_build_dict(x):
-  if x is None:
-    x = {}
-  elif not isinstance(x, dict):
-    raise TypeError("x must be dict; got {}".format(type(x).__name__))
-  return x
-
-
-def check_and_maybe_build_var_list(var_list, latent_vars, data):
-  """
-  Returns:
-    List of TensorFlow variables to optimize over. Default is all
-    trainable variables that `latent_vars` and `data` depend on,
-    excluding those that are only used in conditionals in `data`.
-  """
-  # Traverse random variable graphs to get default list of variables.
-  if var_list is None:
-    var_list = set()
-    trainables = tf.trainable_variables()
-    for z, qz in six.iteritems(latent_vars):
-      var_list.update(get_variables(z, collection=trainables))
-      var_list.update(get_variables(qz, collection=trainables))
-
-    for x, qx in six.iteritems(data):
-      if isinstance(x, RandomVariable) and \
-              not isinstance(qx, RandomVariable):
-        var_list.update(get_variables(x, collection=trainables))
-
-    var_list = list(var_list)
-  return var_list
-
-
-def transform(latent_vars, auto_transform=True):
-  """
-  Args:
-    auto_transform: bool, optional.
-      Whether to automatically transform continuous latent variables
-      of unequal support to be on the unconstrained space. It is
-      only applied if the argument is `True`, the latent variable
-      pair are `ed.RandomVariable`s with the `support` attribute,
-      the supports are both continuous and unequal.
-  """
-  # map from original latent vars to unconstrained versions
-  if auto_transform:
-    latent_vars_temp = latent_vars.copy()
-    # latent_vars maps original latent vars to constrained Q's.
-    # latent_vars_unconstrained maps unconstrained vars to unconstrained Q's.
-    latent_vars = {}
-    latent_vars_unconstrained = {}
-    for z, qz in six.iteritems(latent_vars_temp):
-      if hasattr(z, 'support') and hasattr(qz, 'support') and \
-            z.support != qz.support and qz.support != 'point':
-
-        # transform z to an unconstrained space
-        z_unconstrained = _transform(z)
-
-        # make sure we also have a qz that covers the unconstrained space
-        if qz.support == "points":
-          qz_unconstrained = qz
-        else:
-          qz_unconstrained = _transform(qz)
-        latent_vars_unconstrained[z_unconstrained] = qz_unconstrained
-
-        # additionally construct the transformation of qz
-        # back into the original constrained space
-        if z_unconstrained != z:
-          qz_constrained = _transform(
-            qz_unconstrained, bijectors.Invert(z_unconstrained.bijector))
-
-          try: # attempt to pushforward the params of Empirical distributions
-            qz_constrained.params = z_unconstrained.bijector.inverse(
-              qz_unconstrained.params)
-          except: # qz_unconstrained is not an Empirical distribution
-            pass
-
-        else:
-          qz_constrained = qz_unconstrained
-
-        latent_vars[z] = qz_constrained
-      else:
-        latent_vars[z] = qz
-        latent_vars_unconstrained[z] = qz
+def call_function_up_to_args(f, *args, **kwargs):
+  """Call f, removing any args/kwargs it doesn't take as input."""
+  import inspect
+  if hasattr(f, "_func"):  # tf.make_template()
+    argspec = inspect.getargspec(f._func)
   else:
-    latent_vars_unconstrained = None
-  return latent_vars, latent_vars_unconstrained
-
-
-def summary_variables(latent_vars=None, data=None, variables=None,
-                      *args, **kwargs):
-  # Note: to use summary_key, set
-  # collections=[tf.get_default_graph().unique_name("summaries")]
-  # TODO include in TensorBoard tutorial
-  """Log variables to TensorBoard.
-
-  For each variable in `variables`, forms a `tf.summary.scalar` if
-  the variable has scalar shape; otherwise forms a `tf.summary.histogram`.
-
-  Args:
-    variables: list, optional.
-      Specifies the list of variables to log after each `n_print`
-      steps. If None, will log all variables. If `[]`, no variables
-      will be logged.
+    argspec = inspect.getargspec(f)
+  fkwargs = {}
+  for k, v in six.iteritems(kwargs):
+    if k in argspec.args:
+      fkwargs[k] = v
+  num_args = len(argspec.args) - len(fkwargs)
+  if num_args > 0:
+    return f(*args[:num_args], **fkwargs)
+  elif len(fkwargs) > 0:
+    return f(**fkwargs)
+  return f()
+
+
+def make_intercept(trace, align_data, align_latent, args, kwargs):
+  def _intercept(f, *fargs, **fkwargs):
+    """Set model's sample values to variational distribution's and data."""
+    name = fkwargs.get('name', None)
+    key = align_data(name)
+    if isinstance(key, int):
+      fkwargs['value'] = args[key]
+    elif kwargs.get(key, None) is not None:
+      fkwargs['value'] = kwargs.get(key)
+    elif align_latent(name) is not None:
+      qz = trace[align_latent(name)].value
+      if isinstance(qz, RandomVariable):
+        value = qz.value
+      else:  # e.g. replacement is Tensor
+        value = tf.convert_to_tensor(qz)
+      fkwargs['value'] = value
+    # if auto_transform and 'qz' in locals():
+    #   # TODO for generation to work, must output original dist. to
+    #   keep around TD? must maintain another stack to write to as a
+    #   side-effect (or augment the original stack).
+    #   return transform(f, qz, *fargs, **fkwargs)
+    return f(*fargs, **fkwargs)
+  return _intercept
+
+
+def transform(f, qz, *args, **kwargs):
+  """Transform prior -> unconstrained -> q's constraint.
+
+  When using in VI, we keep variational distribution on its original
+  space (for sake of implementing only one intercepting function).
   """
-  if variables is None:
-    variables = []
-    for key in six.iterkeys(data):
-      variables += get_variables(key)
-
-    for key, value in six.iteritems(latent_vars):
-      variables += get_variables(key)
-      variables += get_variables(value)
-
-    variables = set(variables)
-
-  for var in variables:
-    # replace colons which are an invalid character
-    var_name = var.name.replace(':', '/')
-    # Log all scalars.
-    if len(var.shape) == 0:
-      tf.summary.scalar("parameter/{}".format(var_name),
-                        var, *args, **kwargs)
-    elif len(var.shape) == 1 and var.shape[0] == 1:
-      tf.summary.scalar("parameter/{}".format(var_name),
-                        var[0], *args, **kwargs)
-    else:
-      # If var is multi-dimensional, log a histogram of its values.
-      tf.summary.histogram("parameter/{}".format(var_name),
-                           var, *args, **kwargs)
+  # TODO deal with f or qz being 'point' or 'points'
+  if (not hasattr(f, 'support') or not hasattr(qz, 'support') or
+          f.support == qz.support):
+    return f(*args, **kwargs)
+  value = kwargs.pop('value')
+  kwargs['value'] = 0.0  # to avoid sampling; TODO follow sample shape
+  rv = f(*args, **kwargs)
+  # Take shortcuts in logic if p or q are already unconstrained.
+  if qz.support in ('real', 'multivariate_real'):
+    return _transform(rv, value=value)
+  if rv.support in ('real', 'multivariate_real'):
+    rv_unconstrained = rv
+  else:
+    rv_unconstrained = _transform(rv, value=0.0)
+  unconstrained_to_constrained = tfb.Invert(_transform(qz).bijector)
+  return _transform(rv_unconstrained,
+                    unconstrained_to_constrained,
+                    value=value)
 
 
-def train(train_op, summary_key=None, n_iter=1000, n_print=None,
+def train(model, inference=None,
+          summary_key=None, n_iter=1000, n_print=None,
           logdir=None, log_timestamp=True,
-          debug=False, variables=None,
-          use_coordinator=True, *args, **kwargs):
-  """A wrapper to run inference.
+          variables=None,
+          *args, **kwargs):
+  """An automated inference engine. It takes a model as input (and
+  optional args) and fully trains it until convergence given data to
+  return a posterior.
+
+  Given a defaulted inference algorithm (later, we might automate its
+  choice, or dynamically apply them), it performs the following steps:
 
   1. (Optional) Build a TensorFlow summary writer for TensorBoard.
   2. (Optional) Initialize TensorFlow variables.
-  3. (Optional) Start queue runners.
-  4. Run `update` for `n_iter` iterations.
-  5. Finalize algorithm via `finalize`.
-  6. (Optional) Stop queue runners.
-  + summary writer
-  + variable initialization
-  + update
-  + convergence diagnostics
-  + finalize
-
-  To customize the way inference is run, run these steps
-  individually.
+  3. while not converged: (for now, set by `n_iter` iterations)
+    3a. Run update ops.
+    3b. If within print window:
+      3bi. Print progress.
+      3bii. Run convergence diagnostics.
+  4. Run finalize (post-training) ops.
 
   Args:
     n_iter: int, optional.
@@ -329,29 +122,26 @@ def train(train_op, summary_key=None, n_iter=1000, n_print=None,
       If True (and `logdir` is specified), create a subdirectory of
       `logdir` to save the specific run results. The subdirectory's
       name is the current UTC timestamp with format 'YYYYMMDD_HHMMSS'.
-    debug: bool, optional.
-      If True, add checks for `NaN` and `Inf` to all computations
-      in the graph. May result in substantially slower execution
-      times.
     variables: list, optional.
       A list of TensorFlow variables to initialize during inference.
       Default is to initialize all variables (this includes
       reinitializing variables that were already initialized). To
       avoid initializing any variables, pass in an empty list.
-    use_coordinator: bool, optional.
-      Whether to start and stop queue runners during inference using a
-      TensorFlow coordinator. For example, queue runners are necessary
-      for batch training with file readers.
   """
   if n_print is None:
     n_print = int(n_iter / 100)
+  if inference in (bigan_inference, gan_inference, implicit_klqp):
+    _update = _gan_update
+  elif inference == wgan_inference:
+    _update = _wgan_update
+  else:
+    _update = _default_update
   progbar = Progbar(n_iter)
   t = tf.Variable(0, trainable=False, name="iteration")
   kwargs['t'] = t.assign_add(1)  # add to update()
 
   if summary_key is not None:
-    # TODO should run() also add summaries; or should user call
-    # summary_variables() manually?
+    # TODO _summary_variables()
     summarize = tf.summary.merge_all(key=summary_key)
     if log_timestamp:
       logdir = os.path.expanduser(logdir)
@@ -362,11 +152,6 @@ def train(train_op, summary_key=None, n_iter=1000, n_print=None,
     summarize = None
     train_writer = None
 
-  if debug:
-    op_check = tf.add_check_numerics_ops()
-  else:
-    op_check = None
-
   if variables is None:
     init = tf.global_variables_initializer()
   else:
@@ -374,17 +159,16 @@ def train(train_op, summary_key=None, n_iter=1000, n_print=None,
 
   # Feed placeholders in case initialization depends on them.
   feed_dict = kwargs.get('feed_dict', {})
+  # TODO use feed dict outside since static
+  # feed_dict = {}
+  for key, value in six.iteritems(data):
+    if isinstance(key, tf.Tensor) and "Placeholder" in key.op.type:
+      feed_dict[key] = value
   init.run(feed_dict)
 
-  if use_coordinator:
-    # Start input enqueue threads.
-    coord = tf.train.Coordinator()
-    threads = tf.train.start_queue_runners(coord=coord)
-
   for _ in range(n_iter):
-    info_dict = update(progbar, n_print, summarize,
-                       train_writer, debug, op_check,
-                       train_op, *args, **kwargs)
+    info_dict = _update(progbar, n_print, summarize,
+                        train_writer, train_op, *args, **kwargs)
 
   finalize = None
   if finalize is not None:
@@ -395,13 +179,52 @@ def train(train_op, summary_key=None, n_iter=1000, n_print=None,
     if summary_key is not None:
       train_writer.close()
 
-  if use_coordinator:
-    # Ask threads to stop.
-    coord.request_stop()
-    coord.join(threads)
 
-def optimize(loss, grads_and_vars, collections=None, var_list=None,
-             optimizer=None, use_prettytensor=False, global_step=None):
+def _summary_variables(latent_vars=None, data=None, variables=None,
+                       *args, **kwargs):
+  # Note: to use summary_key, set
+  # collections=[tf.get_default_graph().unique_name("summaries")]
+  # TODO include in TensorBoard tutorial
+  """Log variables to TensorBoard.
+
+  For each variable in `variables`, forms a `tf.summary.scalar` if
+  the variable has scalar shape; otherwise forms a `tf.summary.histogram`.
+
+  Args:
+    variables: list, optional.
+      Specifies the list of variables to log after each `n_print`
+      steps. If None, will log all variables. If `[]`, no variables
+      will be logged.
+  """
+  if variables is None:
+    variables = []
+    for key in six.iterkeys(data):
+      variables += get_variables(key)
+
+    for key, value in six.iteritems(latent_vars):
+      variables += get_variables(key)
+      variables += get_variables(value)
+
+    variables = set(variables)
+
+  for var in variables:
+    # replace colons which are an invalid character
+    var_name = var.name.replace(':', '/')
+    # Log all scalars.
+    if len(var.shape) == 0:
+      tf.summary.scalar("parameter/{}".format(var_name),
+                        var, *args, **kwargs)
+    elif len(var.shape) == 1 and var.shape[0] == 1:
+      tf.summary.scalar("parameter/{}".format(var_name),
+                        var[0], *args, **kwargs)
+    else:
+      # If var is multi-dimensional, log a histogram of its values.
+      tf.summary.histogram("parameter/{}".format(var_name),
+                           var, *args, **kwargs)
+
+
+def _optimize(loss, grads_and_vars, collections=None, var_list=None,
+              optimizer=None, use_prettytensor=False, global_step=None):
   """Build optimizer and its train op applied to loss or
   grads_and_vars.
 
@@ -481,8 +304,8 @@ def optimize(loss, grads_and_vars, collections=None, var_list=None,
   return train_op
 
 
-def update(progbar, n_print, summarize=None, train_writer=None,
-           debug=False, op_check=None, *args, **kwargs):
+def _default_update(progbar, n_print, summarize=None, train_writer=None,
+                    *args, **kwargs):
   """Run one iteration of optimization.
 
   Args:
@@ -497,19 +320,11 @@ def update(progbar, n_print, summarize=None, train_writer=None,
     Dictionary of algorithm-specific information. In this case, the
     loss function value after one iteration.
   """
-  # TODO use if more automated
-  # feed_dict = {}
-  # for key, value in six.iteritems(data):
-  #   if isinstance(key, tf.Tensor) and "Placeholder" in key.op.type:
-  #     feed_dict[key] = value
   sess = get_session()
   feed_dict = kwargs.pop('feed_dict', {})
   values = sess.run(list(args) + list(kwargs.values()), feed_dict)
   info_dict = dict(zip(kwargs.keys(), values[len(args):]))
 
-  if debug:
-    sess.run(op_check, feed_dict)
-
   if n_print != 0:
     t = info_dict['t']
     if t == 1 or t % n_print == 0:
@@ -527,10 +342,8 @@ def update(progbar, n_print, summarize=None, train_writer=None,
   return info_dict
 
 
-# TODO within run(), use this for gan_inference, wgan_inference,
-# implicit_klqp, bigan_inference
-def update(train_op, train_op_d, n_print, summarize=None, train_writer=None,
-           debug=False, op_check=None, variables=None, *args, **kwargs):
+def _gan_update(train_op, train_op_d, n_print, summarize=None,
+                train_writer=None, variables=None, *args, **kwargs):
   """Run one iteration of optimization.
 
   Args:
@@ -570,9 +383,6 @@ def update(train_op, train_op_d, n_print, summarize=None, train_writer=None,
   else:
     raise NotImplementedError("variables must be None, 'Gen', or 'Disc'.")
 
-  if debug:
-    sess.run(op_check, feed_dict)
-
   if summarize is not None and n_print != 0:
     if t == 1 or t % self.n_print == 0:
       summary = sess.run(summarize, feed_dict)
@@ -580,9 +390,11 @@ def update(train_op, train_op_d, n_print, summarize=None, train_writer=None,
 
   return dict(zip(kwargs_temp.keys(), values))
 
-# TODO within run(), use this for wgan_inference
-def update(clip_op, variables=None, *args, **kwargs):
-  info_dict = gan_inference.update(variables=variables, *args, **kwargs)
+
+def _wgan_update(clip_op, variables=None, *args, **kwargs):
+  # TODO make sure increment_t and clipping are called after the update
+  # (e.g., with control_dependencies, for monte carlo)
+  info_dict = gan_update(variables=variables, *args, **kwargs)
 
   sess = get_session()
   if clip_op is not None and variables in (None, "Disc"):
diff --git a/edward/inferences/klpq.py b/edward/inferences/klpq.py
index 797612a74..e595d1790 100644
--- a/edward/inferences/klpq.py
+++ b/edward/inferences/klpq.py
@@ -5,10 +5,9 @@
 import six
 import tensorflow as tf
 
-from edward.inferences.inference import (check_and_maybe_build_data,
-    check_and_maybe_build_latent_vars, transform, check_and_maybe_build_dict, check_and_maybe_build_var_list)
-from edward.models import RandomVariable
-from edward.util import copy, get_descendants
+from edward.inferences.inference import (
+    call_function_up_to_args, make_intercept)
+from edward.models.core import Trace
 
 try:
   from edward.models import Normal
@@ -16,8 +15,9 @@
   raise ImportError("{0}. Your TensorFlow version is not supported.".format(e))
 
 
-def klpq(latent_vars=None, data=None, n_samples=1,
-         auto_transform=True, scale=None, var_list=None, collections=None):
+def klpq(model, variational, align_latent, align_data,
+         scale=lambda name: 1.0, n_samples=1, auto_transform=True,
+         collections=None, *args, **kwargs):
   """Variational inference with the KL divergence
 
   $\\text{KL}( p(z \mid x) \| q(z) ).$
@@ -91,65 +91,30 @@ def klpq(latent_vars=None, data=None, n_samples=1,
   $- \sum_{s=1}^S [
     w_{\\text{norm}}(z^s; \lambda) \\nabla_{\lambda} \log q(z^s; \lambda) ].$
   """
-  if isinstance(latent_vars, list):
-    with tf.variable_scope(None, default_name="posterior"):
-      latent_vars_dict = {}
-      continuous = \
-          ('01', 'nonnegative', 'simplex', 'real', 'multivariate_real')
-      for z in latent_vars:
-        if not hasattr(z, 'support') or z.support not in continuous:
-          raise AttributeError(
-              "Random variable {} is not continuous or a random "
-              "variable with supported continuous support.".format(z))
-        batch_event_shape = z.batch_shape.concatenate(z.event_shape)
-        loc = tf.Variable(tf.random_normal(batch_event_shape))
-        scale = tf.nn.softplus(
-            tf.Variable(tf.random_normal(batch_event_shape)))
-        latent_vars_dict[z] = Normal(loc=loc, scale=scale)
-      latent_vars = latent_vars_dict
-      del latent_vars_dict
-  latent_vars = check_and_maybe_build_latent_vars(latent_vars)
-  data = check_and_maybe_build_data(data)
-  latent_vars, _ = transform(latent_vars, auto_transform)
-  scale = check_and_maybe_build_dict(scale)
-  var_list = check_and_maybe_build_var_list(var_list, latent_vars, data)
-
   p_log_prob = [0.0] * n_samples
   q_log_prob = [0.0] * n_samples
-  base_scope = tf.get_default_graph().unique_name("inference") + '/'
   for s in range(n_samples):
-    # Form dictionary in order to replace conditioning on prior or
-    # observed variable with conditioning on a specific value.
-    scope = base_scope + tf.get_default_graph().unique_name("sample")
-    dict_swap = {}
-    for x, qx in six.iteritems(data):
-      if isinstance(x, RandomVariable):
-        if isinstance(qx, RandomVariable):
-          qx_copy = copy(qx, scope=scope)
-          dict_swap[x] = qx_copy.value
-        else:
-          dict_swap[x] = qx
-
-    for z, qz in six.iteritems(latent_vars):
-      # Copy q(z) to obtain new set of posterior samples.
-      qz_copy = copy(qz, scope=scope)
-      dict_swap[z] = qz_copy.value
-      q_log_prob[s] += tf.reduce_sum(
-          qz_copy.log_prob(tf.stop_gradient(dict_swap[z])))
-
-    for z in six.iterkeys(latent_vars):
-      z_copy = copy(z, dict_swap, scope=scope)
-      p_log_prob[s] += tf.reduce_sum(z_copy.log_prob(dict_swap[z]))
-
-    for x in six.iterkeys(data):
-      if isinstance(x, RandomVariable):
-        x_copy = copy(x, dict_swap, scope=scope)
-        p_log_prob[s] += tf.reduce_sum(x_copy.log_prob(dict_swap[x]))
+    with Trace() as posterior_trace:
+      call_function_up_to_args(variational, *args, **kwargs)
+    intercept = make_intercept(
+        posterior_trace, align_data, align_latent, args, kwargs)
+    with Trace(intercept=intercept) as model_trace:
+      call_function_up_to_args(model, *args, **kwargs)
+
+    for name, node in six.iteritems(model_trace):
+      rv = node.value
+      scale_factor = scale(name)
+      if align_latent(name) is not None or align_data(name) is not None:
+        p_log_prob[s] += tf.reduce_sum(
+            scale_factor * rv.log_prob(tf.stop_gradient(rv.value)))
+      if align_latent(name) is not None:
+        qz = posterior_trace[align_latent(name)].value
+        q_log_prob[s] += tf.reduce_sum(
+            scale_factor * qz.log_prob(tf.stop_gradient(qz.value)))
 
   p_log_prob = tf.stack(p_log_prob)
   q_log_prob = tf.stack(q_log_prob)
   reg_penalty = tf.reduce_sum(tf.losses.get_regularization_losses())
-
   if collections is not None:
     tf.summary.scalar("loss/p_log_prob", tf.reduce_mean(p_log_prob),
                       collections=collections)
@@ -158,18 +123,11 @@ def klpq(latent_vars=None, data=None, n_samples=1,
     tf.summary.scalar("loss/reg_penalty", reg_penalty,
                       collections=collections)
 
-  log_w = p_log_prob - q_log_prob
+  log_w = p_log_prob - tf.stop_gradient(q_log_prob)
   log_w_norm = log_w - tf.reduce_logsumexp(log_w)
   w_norm = tf.exp(log_w_norm)
-  loss = tf.reduce_sum(w_norm * log_w) - reg_penalty
-
-  q_rvs = list(six.itervalues(latent_vars))
-  q_vars = [v for v in var_list
-            if len(get_descendants(tf.convert_to_tensor(v), q_rvs)) != 0]
-  q_grads = tf.gradients(
-      -(tf.reduce_sum(q_log_prob * tf.stop_gradient(w_norm)) - reg_penalty),
-      q_vars)
-  p_vars = [v for v in var_list if v not in q_vars]
-  p_grads = tf.gradients(-loss, p_vars)
-  grads_and_vars = list(zip(q_grads, q_vars)) + list(zip(p_grads, p_vars))
-  return loss, grads_and_vars
+  loss = -tf.reduce_sum(w_norm * log_w) + reg_penalty
+  # Model parameter gradients will backprop into loss. Variational
+  # parameter gradients will backprop into reg_penalty and last term.
+  surrogate_loss = loss + tf.reduce_sum(q_log_prob * tf.stop_gradient(w_norm))
+  return loss, surrogate_loss
diff --git a/edward/inferences/klqp.py b/edward/inferences/klqp.py
index 8bdf3a674..0e456d6ed 100644
--- a/edward/inferences/klqp.py
+++ b/edward/inferences/klqp.py
@@ -5,10 +5,9 @@
 import six
 import tensorflow as tf
 
-from edward.inferences.inference import (check_and_maybe_build_data,
-    check_and_maybe_build_latent_vars, transform, check_and_maybe_build_dict, check_and_maybe_build_var_list)
-from edward.models import RandomVariable
-from edward.util import copy, get_descendants
+from edward.inferences.inference import (
+    call_function_up_to_args, make_intercept)
+from edward.models.core import Trace
 
 try:
   from edward.models import Normal
@@ -16,9 +15,12 @@
 except Exception as e:
   raise ImportError("{0}. Your TensorFlow version is not supported.".format(e))
 
+tfd = tf.contrib.distributions
 
-def klqp(latent_vars=None, data=None, n_samples=1, kl_scaling=None,
-         auto_transform=True, scale=None, var_list=None, summary_key=None):
+
+def klqp(model, variational, align_latent, align_data,
+         scale=lambda name: 1.0, n_samples=1, kl_scaling=lambda name: 1.0,
+         auto_transform=True, collections=None, *args, **kwargs):
   """Variational inference with the KL divergence
 
   $\\text{KL}( q(z; \lambda) \| p(z \mid x) ).$
@@ -27,20 +29,38 @@ def klqp(latent_vars=None, data=None, n_samples=1, kl_scaling=None,
   variety of black box inference techniques.
 
   Args:
-    latent_vars: list of RandomVariable or
-                 dict of RandomVariable to RandomVariable.
-      Collection of random variables to perform inference on. If
-      list, each random variable will be implictly optimized using a
-      `Normal` random variable that is defined internally with a
+    model: function whose inputs are a subset of `args` (e.g., for
+      discriminative). Output is not used.
+      TODO auto_transform docstring
+      Collection of random variables to perform inference on.
+      If list, each random variable will be implicitly optimized using
+      a `Normal` random variable that is defined internally with a
       free parameter per location and scale and is initialized using
-      standard normal draws. The random variables to approximate
-      must be continuous.
+      standard normal draws. The random variables to approximate must
+      be continuous.
+    variational: function whose inputs are a subset of `args` (e.g.,
+      for amortized). Output is not used.
+    align_latent: function of string, aligning `model` latent
+      variables with `variational`. It takes a model variable's name
+      as input and returns a string, indexing `variational`'s trace;
+      else identity.
+    align_data: function of string, aligning `model` observed
+      variables with data. It takes a model variable's name as input
+      and returns an integer, indexing `args`; else identity.
+    scale: function of string, aligning `model` observed
+      variables with scale factors. It takes a model variable's name
+      as input and returns a scale factor; else 1.0. The scale
+      factor's shape must be broadcastable; it is multiplied
+      element-wise to the random variable. For example, this is useful
+      for mini-batch scaling when inferring global variables, or
+      applying masks on a random variable.
     n_samples: int, optional.
       Number of samples from variational model for calculating
       stochastic gradients.
-    kl_scaling: dict of RandomVariable to tf.Tensor, optional.
-      Provides option to scale terms when using ELBO with KL divergence.
-      If the KL divergence terms are
+    kl_scaling: function of string, aligning `model` latent
+      variables with KL scale factors. This provides option to scale
+      terms when using ELBO with KL divergence. If the KL divergence
+      terms are
 
       $\\alpha_p \mathbb{E}_{q(z\mid x, \lambda)} [
             \log q(z\mid x, \lambda) - \log p(z)],$
@@ -48,12 +68,8 @@ def klqp(latent_vars=None, data=None, n_samples=1, kl_scaling=None,
       then pass {$p(z)$: $\\alpha_p$} as `kl_scaling`,
       where $\\alpha_p$ is a tensor. Its shape must be broadcastable;
       it is multiplied element-wise to the batchwise KL terms.
-    scale: dict of RandomVariable to tf.Tensor, optional.
-      A tensor to dict computation for any random variable that it is
-      binded to. Its shape must be broadcastable; it is multiplied
-      element-wise to the random variable. For example, this is useful
-      for mini-batch scaling when inferring global variables, or
-      applying masks on a random variable.
+    args: data inputs. They are passed at compile time in Graph
+      mode or at runtime in Eager mode.
 
   #### Notes
 
@@ -102,470 +118,119 @@ def klqp(latent_vars=None, data=None, n_samples=1, kl_scaling=None,
   where the KL term is computed analytically [@kingma2014auto]. We
   compute this automatically when $p(z)$ and $q(z; \lambda)$ are
   Normal.
-  """
-  if isinstance(latent_vars, list):
-    with tf.variable_scope(None, default_name="posterior"):
-      latent_vars_dict = {}
-      continuous = \
-          ('01', 'nonnegative', 'simplex', 'real', 'multivariate_real')
-      for z in latent_vars:
-        if not hasattr(z, 'support') or z.support not in continuous:
-          raise AttributeError(
-              "Random variable {} is not continuous or a random "
-              "variable with supported continuous support.".format(z))
-        batch_event_shape = z.batch_shape.concatenate(z.event_shape)
-        loc = tf.Variable(tf.random_normal(batch_event_shape))
-        scale = tf.nn.softplus(
-            tf.Variable(tf.random_normal(batch_event_shape)))
-        latent_vars_dict[z] = Normal(loc=loc, scale=scale)
-      latent_vars = latent_vars_dict
-      del latent_vars_dict
-  latent_vars = check_and_maybe_build_latent_vars(latent_vars)
-  data = check_and_maybe_build_data(data)
-  latent_vars, _ = transform(latent_vars, auto_transform)
-  kl_scaling = check_and_maybe_build_dict(kl_scaling)
-  scale = check_and_maybe_build_dict(scale)
-  var_list = check_and_maybe_build_var_list(var_list, latent_vars, data)
-
-  is_reparameterizable = all([
-      rv.reparameterization_type ==
-      tf.contrib.distributions.FULLY_REPARAMETERIZED
-      for rv in six.itervalues(latent_vars)])
-  is_analytic_kl = all([isinstance(z, Normal) and isinstance(qz, Normal)
-                        for z, qz in six.iteritems(latent_vars)])
-  if not is_analytic_kl and kl_scaling:
-    raise TypeError("kl_scaling must be None when using non-analytic KL term")
-  if is_reparameterizable:
-    if is_analytic_kl:
-      return build_reparam_kl_loss_and_gradients(
-          latent_vars, data, var_list,
-          scale, n_samples, kl_scaling, summary_key)
-    # elif is_analytic_entropy:
-    #    return build_reparam_entropy_loss_and_gradients(...)
-    else:
-      return build_reparam_loss_and_gradients(
-          latent_vars, data, var_list,
-          scale, n_samples, summary_key)
-  else:
-    # Prefer Rao-Blackwellization over analytic KL. Unknown what
-    # would happen stability-wise if the two are combined.
-    # if is_analytic_kl:
-    #   return build_score_kl_loss_and_gradients(...)
-    # Analytic entropies may lead to problems around
-    # convergence; for now it is deactivated.
-    # elif is_analytic_entropy:
-    #    return build_score_entropy_loss_and_gradients(...)
-    # else:
-    return build_score_rb_loss_and_gradients(
-        latent_vars, data, var_list,
-        scale, n_samples, summary_key)
-
-
-def reparameterization_klqp(
-    latent_vars=None, data=None, n_samples=1,
-    auto_transform=True, scale=None, var_list=None, summary_key=None):
-  """Variational inference with the KL divergence
-
-  $\\text{KL}( q(z; \lambda) \| p(z \mid x) ).$
-
-  This class minimizes the objective using the reparameterization
-  gradient.
-
-  Args:
-    latent_vars: list of RandomVariable or
-                 dict of RandomVariable to RandomVariable.
-      Collection of random variables to perform inference on. If
-      list, each random variable will be implictly optimized using a
-      `Normal` random variable that is defined internally with a
-      free parameter per location and scale and is initialized using
-      standard normal draws. The random variables to approximate
-      must be continuous.
-    n_samples: int, optional.
-      Number of samples from variational model for calculating
-      stochastic gradients.
-
-  The objective function also adds to itself a summation over all
-  tensors in the `REGULARIZATION_LOSSES` collection.
-  """
-  if isinstance(latent_vars, list):
-    with tf.variable_scope(None, default_name="posterior"):
-      latent_vars_dict = {}
-      continuous = \
-          ('01', 'nonnegative', 'simplex', 'real', 'multivariate_real')
-      for z in latent_vars:
-        if not hasattr(z, 'support') or z.support not in continuous:
-          raise AttributeError(
-              "Random variable {} is not continuous or a random "
-              "variable with supported continuous support.".format(z))
-        batch_event_shape = z.batch_shape.concatenate(z.event_shape)
-        loc = tf.Variable(tf.random_normal(batch_event_shape))
-        scale = tf.nn.softplus(
-            tf.Variable(tf.random_normal(batch_event_shape)))
-        latent_vars_dict[z] = Normal(loc=loc, scale=scale)
-      latent_vars = latent_vars_dict
-      del latent_vars_dict
-  latent_vars = check_and_maybe_build_latent_vars(latent_vars)
-  data = check_and_maybe_build_data(data)
-  latent_vars, _ = transform(latent_vars, auto_transform)
-  scale = check_and_maybe_build_dict(scale)
-  var_list = check_and_maybe_build_var_list(var_list, latent_vars, data)
-
-  return build_reparam_loss_and_gradients(
-      latent_vars, data, var_list,
-      scale, n_samples, summary_key)
-
-
-def reparameterization_kl_klqp(
-    latent_vars=None, data=None, n_samples=1, kl_scaling=None,
-    auto_transform=True, scale=None, var_list=None, summary_key=None):
-  """Variational inference with the KL divergence
-
-  $\\text{KL}( q(z; \lambda) \| p(z \mid x) ).$
-
-  This class minimizes the objective using the reparameterization
-  gradient and an analytic KL term.
-
-  Args:
-    latent_vars: list of RandomVariable or
-                 dict of RandomVariable to RandomVariable.
-      Collection of random variables to perform inference on. If
-      list, each random variable will be implictly optimized using a
-      `Normal` random variable that is defined internally with a
-      free parameter per location and scale and is initialized using
-      standard normal draws. The random variables to approximate
-      must be continuous.
-    n_samples: int, optional.
-      Number of samples from variational model for calculating
-      stochastic gradients.
-    kl_scaling: dict of RandomVariable to tf.Tensor, optional.
-      Provides option to scale terms when using ELBO with KL divergence.
-      If the KL divergence terms are
 
-      $\\alpha_p \mathbb{E}_{q(z\mid x, \lambda)} [
-            \log q(z\mid x, \lambda) - \log p(z)],$
-
-      then pass {$p(z)$: $\\alpha_p$} as `kl_scaling`,
-      where $\\alpha_p$ is a tensor. Its shape must be broadcastable;
-      it is multiplied element-wise to the batchwise KL terms.
-
-  The objective function also adds to itself a summation over all
-  tensors in the `REGULARIZATION_LOSSES` collection.
-  """
-  if isinstance(latent_vars, list):
-    with tf.variable_scope(None, default_name="posterior"):
-      latent_vars_dict = {}
-      continuous = \
-          ('01', 'nonnegative', 'simplex', 'real', 'multivariate_real')
-      for z in latent_vars:
-        if not hasattr(z, 'support') or z.support not in continuous:
-          raise AttributeError(
-              "Random variable {} is not continuous or a random "
-              "variable with supported continuous support.".format(z))
-        batch_event_shape = z.batch_shape.concatenate(z.event_shape)
-        loc = tf.Variable(tf.random_normal(batch_event_shape))
-        scale = tf.nn.softplus(
-            tf.Variable(tf.random_normal(batch_event_shape)))
-        latent_vars_dict[z] = Normal(loc=loc, scale=scale)
-      latent_vars = latent_vars_dict
-      del latent_vars_dict
-  latent_vars = check_and_maybe_build_latent_vars(latent_vars)
-  data = check_and_maybe_build_data(data)
-  latent_vars, _ = transform(latent_vars, auto_transform)
-  kl_scaling = check_and_maybe_build_dict(kl_scaling)
-  scale = check_and_maybe_build_dict(scale)
-  var_list = check_and_maybe_build_var_list(var_list, latent_vars, data)
-
-  return build_reparam_kl_loss_and_gradients(
-      latent_vars, data, var_list,
-      scale, n_samples, kl_scaling, summary_key)
-
-
-def reparameterization_entropy_klqp(
-    latent_vars=None, data=None, n_samples=1,
-    auto_transform=True, scale=None, var_list=None, summary_key=None):
-  """Variational inference with the KL divergence
-
-  $\\text{KL}( q(z; \lambda) \| p(z \mid x) ).$
-
-  This class minimizes the objective using the reparameterization
-  gradient and an analytic entropy term.
-
-  Args:
-    latent_vars: list of RandomVariable or
-                 dict of RandomVariable to RandomVariable.
-      Collection of random variables to perform inference on. If
-      list, each random variable will be implictly optimized using a
-      `Normal` random variable that is defined internally with a
-      free parameter per location and scale and is initialized using
-      standard normal draws. The random variables to approximate
-      must be continuous.
-    n_samples: int, optional.
-      Number of samples from variational model for calculating
-      stochastic gradients.
+  This class minimizes the objective using the score function gradient
+  and Rao-Blackwellization [@ranganath2014black].
 
-  The objective function also adds to itself a summation over all
-  tensors in the `REGULARIZATION_LOSSES` collection.
-  """
-  if isinstance(latent_vars, list):
-    with tf.variable_scope(None, default_name="posterior"):
-      latent_vars_dict = {}
-      continuous = \
-          ('01', 'nonnegative', 'simplex', 'real', 'multivariate_real')
-      for z in latent_vars:
-        if not hasattr(z, 'support') or z.support not in continuous:
-          raise AttributeError(
-              "Random variable {} is not continuous or a random "
-              "variable with supported continuous support.".format(z))
-        batch_event_shape = z.batch_shape.concatenate(z.event_shape)
-        loc = tf.Variable(tf.random_normal(batch_event_shape))
-        scale = tf.nn.softplus(
-            tf.Variable(tf.random_normal(batch_event_shape)))
-        latent_vars_dict[z] = Normal(loc=loc, scale=scale)
-      latent_vars = latent_vars_dict
-      del latent_vars_dict
-  latent_vars = check_and_maybe_build_latent_vars(latent_vars)
-  data = check_and_maybe_build_data(data)
-  latent_vars, _ = transform(latent_vars, auto_transform)
-  scale = check_and_maybe_build_dict(scale)
-  var_list = check_and_maybe_build_var_list(var_list, latent_vars, data)
-
-  return build_reparam_entropy_loss_and_gradients(
-      latent_vars, data, var_list,
-      scale, n_samples, summary_key)
-
-
-def score_klqp(
-    latent_vars=None, data=None, n_samples=1,
-    auto_transform=True, scale=None, var_list=None, summary_key=None):
-  """Variational inference with the KL divergence
+  Computed by sampling from $q(z; \lambda)$ and evaluating the
+  expectation using Monte Carlo sampling and Rao-Blackwellization.
 
-  $\\text{KL}( q(z; \lambda) \| p(z \mid x) ).$
+  The implementation takes the surrogate loss approach. See
+  @schulman2015stochastic; @ruiz2016generalized; @ritchie2016deep.
 
-  This class minimizes the objective using the score function
-  gradient.
+  #### Notes
 
-  Args:
-    latent_vars: list of RandomVariable or
-                 dict of RandomVariable to RandomVariable.
-      Collection of random variables to perform inference on. If
-      list, each random variable will be implictly optimized using a
-      `Normal` random variable that is defined internally with a
-      free parameter per location and scale and is initialized using
-      standard normal draws. The random variables to approximate
-      must be continuous.
+  Current Rao-Blackwellization is limited to Rao-Blackwellizing across
+  stochastic nodes in the computation graph. It does not
+  Rao-Blackwellize within a node such as when a node represents
+  multiple random variables via non-scalar batch shape.
 
   The objective function also adds to itself a summation over all
   tensors in the `REGULARIZATION_LOSSES` collection.
   """
-  if isinstance(latent_vars, list):
-    with tf.variable_scope(None, default_name="posterior"):
-      latent_vars_dict = {}
-      continuous = \
-          ('01', 'nonnegative', 'simplex', 'real', 'multivariate_real')
-      for z in latent_vars:
-        if not hasattr(z, 'support') or z.support not in continuous:
-          raise AttributeError(
-              "Random variable {} is not continuous or a random "
-              "variable with supported continuous support.".format(z))
-        batch_event_shape = z.batch_shape.concatenate(z.event_shape)
-        loc = tf.Variable(tf.random_normal(batch_event_shape))
-        scale = tf.nn.softplus(
-            tf.Variable(tf.random_normal(batch_event_shape)))
-        latent_vars_dict[z] = Normal(loc=loc, scale=scale)
-      latent_vars = latent_vars_dict
-      del latent_vars_dict
-  latent_vars = check_and_maybe_build_latent_vars(latent_vars)
-  data = check_and_maybe_build_data(data)
-  latent_vars, _ = transform(latent_vars, auto_transform)
-  scale = check_and_maybe_build_dict(scale)
-  var_list = check_and_maybe_build_var_list(var_list, latent_vars, data)
-
-  return build_score_loss_and_gradients(
-      latent_vars, data, var_list,
-      scale, n_samples, summary_key)
-
-
-def score_kl_klqp(
-    latent_vars=None, data=None, n_samples=1, kl_scaling=None,
-    auto_transform=True, scale=None, var_list=None, summary_key=None):
-  """Variational inference with the KL divergence
+  # TODO control variates
+  # + baseline, learnable baseline
+  # + Ruiz+ 2016
+  # + Tucker+ 2017; Cremer+ 2017
+  # + Miller+ 2017
+  # TODO analytic stuff
+  # + Roeder+ 2017
+  p_log_prob = [None] * n_samples
+  q_log_prob = [None] * n_samples
+  surrogate_loss = [None] * n_samples
+  kl_penalty = 0.0
+  for s in range(n_samples):
+    with Trace() as posterior_trace:
+      call_function_up_to_args(variational, *args, **kwargs)
+    intercept = make_intercept(
+        posterior_trace, align_data, align_latent, args, kwargs)
+    with Trace(intercept=intercept) as model_trace:
+      call_function_up_to_args(model, *args, **kwargs)
+
+    # Collect key-value pairs of (rv, rv's (scaled) log prob).
+    p_dict = {}
+    q_dict = {}
+    inverse_align_latent = {}
+    for name, node in six.iteritems(model_trace):
+      rv = node.value
+      scale_factor = scale(name)
+      if align_data(name) is not None:
+        p_dict[rv] = tf.reduce_sum(scale_factor * rv.log_prob(rv.value))
+      if align_latent(name) is not None:
+        qz = posterior_trace[align_latent(name)].value
+        # For pairs with analytic KL, accumulate KL divergences for
+        # first iteration in loop.
+        if isinstance(rv, Normal) and isinstance(qz, Normal):
+          if s == 0:
+            kl_penalty += tf.reduce_sum(
+                kl_scaling(name) * kl_divergence(qz, rv))
+        else:
+          p_dict[rv] = tf.reduce_sum(scale_factor * rv.log_prob(rv.value))
+          q_dict[qz] = tf.reduce_sum(scale_factor * qz.log_prob(qz.value))
+          inverse_align_latent[qz] = rv
+
+    # Build surrogate loss.
+    scaled_q_log_prob = 0.0
+    for qz, log_prob in six.iteritems(q_dict):
+      if qz.reparameterization_type == tfd.FULLY_REPARAMETERIZED:
+        scale_factor = 1.0
+      else:
+        scale_factor = 0.0
+        for rv in qz.get_blanket(q_rvs) + [qz]:
+          scale_factor += q_dict[rv]
+          scale_factor -= p_dict[inverse_align_latent[qz]]
+      scaled_q_log_prob += scale_factor * log_prob
+
+    p_log_prob_s = tf.reduce_sum(list(six.itervalues(p_dict)))
+    p_log_prob[s] = p_log_prob_s
+    q_log_prob[s] = tf.reduce_sum(list(six.itervalues(q_dict)))
+    surrogate_loss[s] = scaled_q_log_prob - p_log_prob_s
 
-  $\\text{KL}( q(z; \lambda) \| p(z \mid x) ).$
+  p_log_prob = tf.reduce_mean(p_log_prob)
+  q_log_prob = tf.reduce_mean(q_log_prob)
+  surrogate_loss = tf.reduce_mean(surrogate_loss) + kl_penalty
 
-  This class minimizes the objective using the score function gradient
-  and an analytic KL term.
+  reg_penalty = tf.reduce_sum(tf.losses.get_regularization_losses())
+  surrogate_loss += reg_penalty
 
-  Args:
-    latent_vars: list of RandomVariable or
-                 dict of RandomVariable to RandomVariable.
-      Collection of random variables to perform inference on. If
-      list, each random variable will be implictly optimized using a
-      `Normal` random variable that is defined internally with a
-      free parameter per location and scale and is initialized using
-      standard normal draws. The random variables to approximate
-      must be continuous.
-    n_samples: int, optional.
-      Number of samples from variational model for calculating
-      stochastic gradients.
-    kl_scaling: dict of RandomVariable to tf.Tensor, optional.
-      Provides option to scale terms when using ELBO with KL divergence.
-      If the KL divergence terms are
+  if collections is not None:
+    tf.summary.scalar("loss/p_log_prob", p_log_prob,
+                      collections=collections)
+    tf.summary.scalar("loss/q_log_prob", q_log_prob,
+                      collections=collections)
+    tf.summary.scalar("loss/reg_penalty", reg_penalty,
+                      collections=collections)
 
-      $\\alpha_p \mathbb{E}_{q(z\mid x, \lambda)} [
-            \log q(z\mid x, \lambda) - \log p(z)],$
+  loss = q_log_prob - p_log_prob + kl_penalty + reg_penalty
+  return loss, surrogate_loss
 
-      then pass {$p(z)$: $\\alpha_p$} as `kl_scaling`,
-      where $\\alpha_p$ is a tensor. Its shape must be broadcastable;
-      it is multiplied element-wise to the batchwise KL terms.
 
-  The objective function also adds to itself a summation over all
-  tensors in the `REGULARIZATION_LOSSES` collection.
-  """
-  if isinstance(latent_vars, list):
-    with tf.variable_scope(None, default_name="posterior"):
-      latent_vars_dict = {}
-      continuous = \
-          ('01', 'nonnegative', 'simplex', 'real', 'multivariate_real')
-      for z in latent_vars:
-        if not hasattr(z, 'support') or z.support not in continuous:
-          raise AttributeError(
-              "Random variable {} is not continuous or a random "
-              "variable with supported continuous support.".format(z))
-        batch_event_shape = z.batch_shape.concatenate(z.event_shape)
-        loc = tf.Variable(tf.random_normal(batch_event_shape))
-        scale = tf.nn.softplus(
-            tf.Variable(tf.random_normal(batch_event_shape)))
-        latent_vars_dict[z] = Normal(loc=loc, scale=scale)
-      latent_vars = latent_vars_dict
-      del latent_vars_dict
-  latent_vars = check_and_maybe_build_latent_vars(latent_vars)
-  data = check_and_maybe_build_data(data)
-  latent_vars, _ = transform(latent_vars, auto_transform)
-  kl_scaling = check_and_maybe_build_dict(kl_scaling)
-  scale = check_and_maybe_build_dict(scale)
-  var_list = check_and_maybe_build_var_list(var_list, latent_vars, data)
-
-  return build_score_kl_loss_and_gradients(
-      latent_vars, data, var_list,
-      scale, n_samples, kl_scaling, summary_key)
-
-
-def score_entropy_klqp(
-    latent_vars=None, data=None, n_samples=1,
-    auto_transform=True, scale=None, var_list=None, summary_key=None):
+def klqp_reparameterization(model, variational, align_latent, align_data,
+                            scale=lambda name: 1.0, n_samples=1,
+                            auto_transform=True, collections=None,
+                            *args, **kwargs):
   """Variational inference with the KL divergence
 
   $\\text{KL}( q(z; \lambda) \| p(z \mid x) ).$
 
-  This class minimizes the objective using the score function gradient
-  and an analytic entropy term.
-
-  Args:
-    latent_vars: list of RandomVariable or
-                 dict of RandomVariable to RandomVariable.
-      Collection of random variables to perform inference on. If
-      list, each random variable will be implictly optimized using a
-      `Normal` random variable that is defined internally with a
-      free parameter per location and scale and is initialized using
-      standard normal draws. The random variables to approximate
-      must be continuous.
-    n_samples: int, optional.
-      Number of samples from variational model for calculating
-      stochastic gradients.
+  This function minimizes the objective using the reparameterization
+  gradient.
 
   The objective function also adds to itself a summation over all
   tensors in the `REGULARIZATION_LOSSES` collection.
-  """
-  if isinstance(latent_vars, list):
-    with tf.variable_scope(None, default_name="posterior"):
-      latent_vars_dict = {}
-      continuous = \
-          ('01', 'nonnegative', 'simplex', 'real', 'multivariate_real')
-      for z in latent_vars:
-        if not hasattr(z, 'support') or z.support not in continuous:
-          raise AttributeError(
-              "Random variable {} is not continuous or a random "
-              "variable with supported continuous support.".format(z))
-        batch_event_shape = z.batch_shape.concatenate(z.event_shape)
-        loc = tf.Variable(tf.random_normal(batch_event_shape))
-        scale = tf.nn.softplus(
-            tf.Variable(tf.random_normal(batch_event_shape)))
-        latent_vars_dict[z] = Normal(loc=loc, scale=scale)
-      latent_vars = latent_vars_dict
-      del latent_vars_dict
-  latent_vars = check_and_maybe_build_latent_vars(latent_vars)
-  data = check_and_maybe_build_data(data)
-  latent_vars, _ = transform(latent_vars, auto_transform)
-  scale = check_and_maybe_build_dict(scale)
-  var_list = check_and_maybe_build_var_list(var_list, latent_vars, data)
-
-  return build_score_entropy_loss_and_gradients(
-      latent_vars, data, var_list,
-      scale, n_samples, summary_key)
-
-
-def score_rb_klqp(
-    latent_vars=None, data=None, n_samples=1,
-    auto_transform=True, scale=None, var_list=None, summary_key=None):
-  """Variational inference with the KL divergence
-
-  $\\text{KL}( q(z; \lambda) \| p(z \mid x) ).$
 
-  This class minimizes the objective using the score function gradient
-  and Rao-Blackwellization.
-
-  Args:
-    latent_vars: list of RandomVariable or
-                 dict of RandomVariable to RandomVariable.
-      Collection of random variables to perform inference on. If
-      list, each random variable will be implictly optimized using a
-      `Normal` random variable that is defined internally with a
-      free parameter per location and scale and is initialized using
-      standard normal draws. The random variables to approximate
-      must be continuous.
-
-  #### Notes
-
-  Current Rao-Blackwellization is limited to Rao-Blackwellizing across
-  stochastic nodes in the computation graph. It does not
-  Rao-Blackwellize within a node such as when a node represents
-  multiple random variables via non-scalar batch shape.
-
-  The objective function also adds to itself a summation over all
-  tensors in the `REGULARIZATION_LOSSES` collection.
-  """
-  if isinstance(latent_vars, list):
-    with tf.variable_scope(None, default_name="posterior"):
-      latent_vars_dict = {}
-      continuous = \
-          ('01', 'nonnegative', 'simplex', 'real', 'multivariate_real')
-      for z in latent_vars:
-        if not hasattr(z, 'support') or z.support not in continuous:
-          raise AttributeError(
-              "Random variable {} is not continuous or a random "
-              "variable with supported continuous support.".format(z))
-        batch_event_shape = z.batch_shape.concatenate(z.event_shape)
-        loc = tf.Variable(tf.random_normal(batch_event_shape))
-        scale = tf.nn.softplus(
-            tf.Variable(tf.random_normal(batch_event_shape)))
-        latent_vars_dict[z] = Normal(loc=loc, scale=scale)
-      latent_vars = latent_vars_dict
-      del latent_vars_dict
-  latent_vars = check_and_maybe_build_latent_vars(latent_vars)
-  data = check_and_maybe_build_data(data)
-  latent_vars, _ = transform(latent_vars, auto_transform)
-  scale = check_and_maybe_build_dict(scale)
-  var_list = check_and_maybe_build_var_list(var_list, latent_vars, data)
-
-  return build_score_rb_loss_and_gradients(
-      latent_vars, data, var_list,
-      scale, n_samples, summary_key)
-
-
-def build_reparam_loss_and_gradients(
-    latent_vars, data, var_list, scale, n_samples, summary_key):
-  """Build loss function. Its automatic differentiation
-  is a stochastic gradient of
+  Build loss function equal to KL(q||p) up to a constant. Its
+  automatic differentiation is a stochastic gradient of
 
   $-\\text{ELBO} =
       -\mathbb{E}_{q(z; \lambda)} [ \log p(x, z) - \log q(z; \lambda) ]$
@@ -574,63 +239,77 @@ def build_reparam_loss_and_gradients(
 
   Computed by sampling from $q(z;\lambda)$ and evaluating the
   expectation using Monte Carlo sampling.
+
+  Note: if the user defines a constrained posterior, auto_transform
+  can do inference on the real-valued space, and at test time the user can
+  use the constrained one. If the user defines an unconstrained posterior,
+  how to work with the constrained one at test time is an open question;
+  for now, the user must manually write the bijectors according to transform.
   """
   p_log_prob = [0.0] * n_samples
   q_log_prob = [0.0] * n_samples
-  base_scope = tf.get_default_graph().unique_name("inference") + '/'
   for s in range(n_samples):
-    # Form dictionary in order to replace conditioning on prior or
-    # observed variable with conditioning on a specific value.
-    scope = base_scope + tf.get_default_graph().unique_name("sample")
-    dict_swap = {}
-    for x, qx in six.iteritems(data):
-      if isinstance(x, RandomVariable):
-        if isinstance(qx, RandomVariable):
-          qx_copy = copy(qx, scope=scope)
-          dict_swap[x] = qx_copy.value
-        else:
-          dict_swap[x] = qx
-
-    for z, qz in six.iteritems(latent_vars):
-      # Copy q(z) to obtain new set of posterior samples.
-      qz_copy = copy(qz, scope=scope)
-      dict_swap[z] = qz_copy.value
-      q_log_prob[s] += tf.reduce_sum(
-          scale.get(z, 1.0) * qz_copy.log_prob(dict_swap[z]))
-
-    for z in six.iterkeys(latent_vars):
-      z_copy = copy(z, dict_swap, scope=scope)
-      p_log_prob[s] += tf.reduce_sum(
-          scale.get(z, 1.0) * z_copy.log_prob(dict_swap[z]))
-
-    for x in six.iterkeys(data):
-      if isinstance(x, RandomVariable):
-        x_copy = copy(x, dict_swap, scope=scope)
-        p_log_prob[s] += tf.reduce_sum(
-            scale.get(x, 1.0) * x_copy.log_prob(dict_swap[x]))
+    with Trace() as posterior_trace:
+      call_function_up_to_args(variational, *args, **kwargs)
+    intercept = make_intercept(
+        posterior_trace, align_data, align_latent, args, kwargs)
+    with Trace(intercept=intercept) as model_trace:
+      call_function_up_to_args(model, *args, **kwargs)
+
+    for name, node in six.iteritems(model_trace):
+      rv = node.value
+      scale_factor = scale(name)
+      if align_latent(name) is not None or align_data(name) is not None:
+        p_log_prob[s] += tf.reduce_sum(scale_factor * rv.log_prob(rv.value))
+      if align_latent(name) is not None:
+        qz = posterior_trace[align_latent(name)].value
+        q_log_prob[s] += tf.reduce_sum(scale_factor * qz.log_prob(qz.value))
 
   p_log_prob = tf.reduce_mean(p_log_prob)
   q_log_prob = tf.reduce_mean(q_log_prob)
   reg_penalty = tf.reduce_sum(tf.losses.get_regularization_losses())
-
-  if summary_key is not None:
+  if collections is not None:
     tf.summary.scalar("loss/p_log_prob", p_log_prob,
-                      collections=[summary_key])
+                      collections=collections)
     tf.summary.scalar("loss/q_log_prob", q_log_prob,
-                      collections=[summary_key])
+                      collections=collections)
     tf.summary.scalar("loss/reg_penalty", reg_penalty,
-                      collections=[summary_key])
+                      collections=collections)
+  loss = q_log_prob - p_log_prob + reg_penalty
+  return loss
+
+
+@doc.set_doc(
+    args=(doc.arg_model +
+          doc.arg_variational +
+          doc.arg_align_latent +
+          doc.arg_align_data +
+          doc.arg_scale +
+          doc.arg_n_samples +
+          doc.arg_kl_scaling +
+          doc.arg_auto_transform +
+          doc.arg_collections +
+          doc.arg_args_kwargs)[:-1],
+    returns=doc.return_loss,
+    notes_model_parameters=doc.notes_model_parameters,
+    notes_conditional_inference=doc.notes_conditional_inference_samples,
+    notes_regularization_losses=doc.notes_regularization_losses)
+def klqp_reparameterization_kl(model, variational, align_latent, align_data,
+                               scale=lambda name: 1.0, n_samples=1,
+                               kl_scaling=lambda name: 1.0,
+                               auto_transform=True, collections=None,
+                               *args, **kwargs):
+  """Variational inference with the KL divergence
 
-  loss = -(p_log_prob - q_log_prob - reg_penalty)
+  $\\text{KL}( q(z; \lambda) \| p(z \mid x) ).$
 
-  grads = tf.gradients(loss, var_list)
-  grads_and_vars = list(zip(grads, var_list))
-  return loss, grads_and_vars
+  This function minimizes the objective using the reparameterization
+  gradient and an analytic KL term.
 
+  The objective function also adds to itself a summation over all
+  tensors in the `REGULARIZATION_LOSSES` collection.
 
-def build_reparam_kl_loss_and_gradients(
-    latent_vars, data, var_list, scale, n_samples, kl_scaling, summary_key):
-  """Build loss function. Its automatic differentiation
+  Build loss function. Its automatic differentiation
   is a stochastic gradient of
 
   .. math::
@@ -646,437 +325,107 @@ def build_reparam_kl_loss_and_gradients(
   expectation using Monte Carlo sampling.
   """
   p_log_lik = [0.0] * n_samples
-  base_scope = tf.get_default_graph().unique_name("inference") + '/'
   for s in range(n_samples):
-    # Form dictionary in order to replace conditioning on prior or
-    # observed variable with conditioning on a specific value.
-    scope = base_scope + tf.get_default_graph().unique_name("sample")
-    dict_swap = {}
-    for x, qx in six.iteritems(data):
-      if isinstance(x, RandomVariable):
-        if isinstance(qx, RandomVariable):
-          qx_copy = copy(qx, scope=scope)
-          dict_swap[x] = qx_copy.value
-        else:
-          dict_swap[x] = qx
-
-    for z, qz in six.iteritems(latent_vars):
-      # Copy q(z) to obtain new set of posterior samples.
-      qz_copy = copy(qz, scope=scope)
-      dict_swap[z] = qz_copy.value
-
-    for x in six.iterkeys(data):
-      if isinstance(x, RandomVariable):
-        x_copy = copy(x, dict_swap, scope=scope)
-        p_log_lik[s] += tf.reduce_sum(
-            scale.get(x, 1.0) * x_copy.log_prob(dict_swap[x]))
+    with Trace() as posterior_trace:
+      call_function_up_to_args(variational, *args, **kwargs)
+    intercept = make_intercept(
+        posterior_trace, align_data, align_latent, args, kwargs)
+    with Trace(intercept=intercept) as model_trace:
+      call_function_up_to_args(model, *args, **kwargs)
+
+    for name, node in six.iteritems(model_trace):
+      if align_data(name) is not None:
+        rv = node.value
+        scale_factor = scale(name)
+        p_log_lik[s] += tf.reduce_sum(scale_factor * rv.log_prob(rv.value))
 
   p_log_lik = tf.reduce_mean(p_log_lik)
 
-  kl_penalty = tf.reduce_sum([
-      tf.reduce_sum(kl_scaling.get(z, 1.0) * kl_divergence(qz, z))
-      for z, qz in six.iteritems(latent_vars)])
+  kl_penalty = 0.0
+  for name, node in six.iteritems(model_trace):
+    if align_latent(name) is not None:
+      rv = node.value
+      qz = posterior_trace[align_latent(name)].value
+      kl_penalty += tf.reduce_sum(kl_scaling(name) * kl_divergence(qz, rv))
 
   reg_penalty = tf.reduce_sum(tf.losses.get_regularization_losses())
-
-  if summary_key is not None:
+  if collections is not None:
     tf.summary.scalar("loss/p_log_lik", p_log_lik,
-                      collections=[summary_key])
+                      collections=collections)
     tf.summary.scalar("loss/kl_penalty", kl_penalty,
-                      collections=[summary_key])
-    tf.summary.scalar("loss/reg_penalty", reg_penalty,
-                      collections=[summary_key])
-
-  loss = -(p_log_lik - kl_penalty - reg_penalty)
-
-  grads = tf.gradients(loss, var_list)
-  grads_and_vars = list(zip(grads, var_list))
-  return loss, grads_and_vars
-
-
-def build_reparam_entropy_loss_and_gradients(
-    latent_vars, data, var_list, scale, n_samples, summary_key):
-  """Build loss function. Its automatic differentiation
-  is a stochastic gradient of
-
-  $-\\text{ELBO} =  -( \mathbb{E}_{q(z; \lambda)} [ \log p(x , z) ]
-          + \mathbb{H}(q(z; \lambda)) )$
-
-  based on the reparameterization trick [@kingma2014auto].
-
-  It assumes the entropy is analytic.
-
-  Computed by sampling from $q(z;\lambda)$ and evaluating the
-  expectation using Monte Carlo sampling.
-  """
-  p_log_prob = [0.0] * n_samples
-  base_scope = tf.get_default_graph().unique_name("inference") + '/'
-  for s in range(n_samples):
-    # Form dictionary in order to replace conditioning on prior or
-    # observed variable with conditioning on a specific value.
-    scope = base_scope + tf.get_default_graph().unique_name("sample")
-    dict_swap = {}
-    for x, qx in six.iteritems(data):
-      if isinstance(x, RandomVariable):
-        if isinstance(qx, RandomVariable):
-          qx_copy = copy(qx, scope=scope)
-          dict_swap[x] = qx_copy.value
-        else:
-          dict_swap[x] = qx
-
-    for z, qz in six.iteritems(latent_vars):
-      # Copy q(z) to obtain new set of posterior samples.
-      qz_copy = copy(qz, scope=scope)
-      dict_swap[z] = qz_copy.value
-
-    for z in six.iterkeys(latent_vars):
-      z_copy = copy(z, dict_swap, scope=scope)
-      p_log_prob[s] += tf.reduce_sum(
-          scale.get(z, 1.0) * z_copy.log_prob(dict_swap[z]))
-
-    for x in six.iterkeys(data):
-      if isinstance(x, RandomVariable):
-        x_copy = copy(x, dict_swap, scope=scope)
-        p_log_prob[s] += tf.reduce_sum(
-            scale.get(x, 1.0) * x_copy.log_prob(dict_swap[x]))
-
-  p_log_prob = tf.reduce_mean(p_log_prob)
-
-  q_entropy = tf.reduce_sum([
-      tf.reduce_sum(qz.entropy())
-      for z, qz in six.iteritems(latent_vars)])
-
-  reg_penalty = tf.reduce_sum(tf.losses.get_regularization_losses())
-
-  if summary_key is not None:
-    tf.summary.scalar("loss/p_log_prob", p_log_prob,
-                      collections=[summary_key])
-    tf.summary.scalar("loss/q_entropy", q_entropy,
-                      collections=[summary_key])
-    tf.summary.scalar("loss/reg_penalty", reg_penalty,
-                      collections=[summary_key])
-
-  loss = -(p_log_prob + q_entropy - reg_penalty)
-
-  grads = tf.gradients(loss, var_list)
-  grads_and_vars = list(zip(grads, var_list))
-  return loss, grads_and_vars
-
-
-def build_score_loss_and_gradients(
-    latent_vars, data, var_list, scale, n_samples, summary_key):
-  """Build loss function and gradients based on the score function
-  estimator [@paisley2012variational].
-
-  Computed by sampling from $q(z;\lambda)$ and evaluating the
-  expectation using Monte Carlo sampling.
-  """
-  p_log_prob = [0.0] * n_samples
-  q_log_prob = [0.0] * n_samples
-  base_scope = tf.get_default_graph().unique_name("inference") + '/'
-  for s in range(n_samples):
-    # Form dictionary in order to replace conditioning on prior or
-    # observed variable with conditioning on a specific value.
-    scope = base_scope + tf.get_default_graph().unique_name("sample")
-    dict_swap = {}
-    for x, qx in six.iteritems(data):
-      if isinstance(x, RandomVariable):
-        if isinstance(qx, RandomVariable):
-          qx_copy = copy(qx, scope=scope)
-          dict_swap[x] = qx_copy.value
-        else:
-          dict_swap[x] = qx
-
-    for z, qz in six.iteritems(latent_vars):
-      # Copy q(z) to obtain new set of posterior samples.
-      qz_copy = copy(qz, scope=scope)
-      dict_swap[z] = qz_copy.value
-      q_log_prob[s] += tf.reduce_sum(
-          scale.get(z, 1.0) *
-          qz_copy.log_prob(tf.stop_gradient(dict_swap[z])))
-
-    for z in six.iterkeys(latent_vars):
-      z_copy = copy(z, dict_swap, scope=scope)
-      p_log_prob[s] += tf.reduce_sum(
-          scale.get(z, 1.0) * z_copy.log_prob(dict_swap[z]))
-
-    for x in six.iterkeys(data):
-      if isinstance(x, RandomVariable):
-        x_copy = copy(x, dict_swap, scope=scope)
-        p_log_prob[s] += tf.reduce_sum(
-            scale.get(x, 1.0) * x_copy.log_prob(dict_swap[x]))
-
-  p_log_prob = tf.stack(p_log_prob)
-  q_log_prob = tf.stack(q_log_prob)
-  reg_penalty = tf.reduce_sum(tf.losses.get_regularization_losses())
-
-  if summary_key is not None:
-    tf.summary.scalar("loss/p_log_prob", tf.reduce_mean(p_log_prob),
-                      collections=[summary_key])
-    tf.summary.scalar("loss/q_log_prob", tf.reduce_mean(q_log_prob),
-                      collections=[summary_key])
+                      collections=collections)
     tf.summary.scalar("loss/reg_penalty", reg_penalty,
-                      collections=[summary_key])
-
-  losses = p_log_prob - q_log_prob
-  loss = -(tf.reduce_mean(losses) - reg_penalty)
-
-  q_rvs = list(six.itervalues(latent_vars))
-  q_vars = [v for v in var_list
-            if len(get_descendants(tf.convert_to_tensor(v), q_rvs)) != 0]
-  q_grads = tf.gradients(
-      -(tf.reduce_mean(q_log_prob * tf.stop_gradient(losses)) - reg_penalty),
-      q_vars)
-  p_vars = [v for v in var_list if v not in q_vars]
-  p_grads = tf.gradients(loss, p_vars)
-  grads_and_vars = list(zip(q_grads, q_vars)) + list(zip(p_grads, p_vars))
-  return loss, grads_and_vars
+                      collections=collections)
+  loss = -p_log_lik + kl_penalty + reg_penalty
+  return loss
+
+
+@doc.set_doc(
+    args=(doc.arg_model +
+          doc.arg_variational +
+          doc.arg_align_latent +
+          doc.arg_align_data +
+          doc.arg_scale +
+          doc.arg_n_samples +
+          doc.arg_auto_transform +
+          doc.arg_collections +
+          doc.arg_args_kwargs)[:-1],
+    returns=doc.return_loss_surrogate_loss,
+    notes_model_parameters=doc.notes_model_parameters,
+    notes_conditional_inference=doc.notes_conditional_inference_samples,
+    notes_regularization_losses=doc.notes_regularization_losses)
+def klqp_score(model, variational, align_latent, align_data,
+               scale=lambda name: 1.0, n_samples=1, auto_transform=True,
+               collections=None, *args, **kwargs):
+  """Variational inference with the KL divergence
 
+  $\\text{KL}( q(z; \lambda) \| p(z \mid x) ).$
 
-def build_score_kl_loss_and_gradients(
-    latent_vars, data, var_list, scale, n_samples, kl_scaling, summary_key):
-  """Build loss function and gradients based on the score function
-  estimator [@paisley2012variational].
+  This function minimizes the objective using the score function
+  gradient.
 
-  It assumes the KL is analytic.
+  Build loss function equal to KL(q||p) up to a constant. It
+  returns a surrogate loss function whose automatic differentiation
+  is based on the score function estimator [@paisley2012variational].
 
   Computed by sampling from $q(z;\lambda)$ and evaluating the
   expectation using Monte Carlo sampling.
-  """
-  p_log_lik = [0.0] * n_samples
-  q_log_prob = [0.0] * n_samples
-  base_scope = tf.get_default_graph().unique_name("inference") + '/'
-  for s in range(n_samples):
-    # Form dictionary in order to replace conditioning on prior or
-    # observed variable with conditioning on a specific value.
-    scope = base_scope + tf.get_default_graph().unique_name("sample")
-    dict_swap = {}
-    for x, qx in six.iteritems(data):
-      if isinstance(x, RandomVariable):
-        if isinstance(qx, RandomVariable):
-          qx_copy = copy(qx, scope=scope)
-          dict_swap[x] = qx_copy.value
-        else:
-          dict_swap[x] = qx
-
-    for z, qz in six.iteritems(latent_vars):
-      # Copy q(z) to obtain new set of posterior samples.
-      qz_copy = copy(qz, scope=scope)
-      dict_swap[z] = qz_copy.value
-      q_log_prob[s] += tf.reduce_sum(
-          scale.get(z, 1.0) *
-          qz_copy.log_prob(tf.stop_gradient(dict_swap[z])))
-
-    for x in six.iterkeys(data):
-      if isinstance(x, RandomVariable):
-        x_copy = copy(x, dict_swap, scope=scope)
-        p_log_lik[s] += tf.reduce_sum(
-            scale.get(x, 1.0) * x_copy.log_prob(dict_swap[x]))
-
-  p_log_lik = tf.stack(p_log_lik)
-  q_log_prob = tf.stack(q_log_prob)
-
-  kl_penalty = tf.reduce_sum([
-      tf.reduce_sum(kl_scaling.get(z, 1.0) * kl_divergence(qz, z))
-      for z, qz in six.iteritems(latent_vars)])
 
-  reg_penalty = tf.reduce_sum(tf.losses.get_regularization_losses())
-
-  if summary_key is not None:
-    tf.summary.scalar("loss/p_log_lik", tf.reduce_mean(p_log_lik),
-                      collections=[summary_key])
-    tf.summary.scalar("loss/kl_penalty", kl_penalty,
-                      collections=[summary_key])
-    tf.summary.scalar("loss/reg_penalty", reg_penalty,
-                      collections=[summary_key])
-
-  loss = -(tf.reduce_mean(p_log_lik) - kl_penalty - reg_penalty)
-
-  q_rvs = list(six.itervalues(latent_vars))
-  q_vars = [v for v in var_list
-            if len(get_descendants(tf.convert_to_tensor(v), q_rvs)) != 0]
-  q_grads = tf.gradients(
-      -(tf.reduce_mean(q_log_prob * tf.stop_gradient(p_log_lik)) - kl_penalty -
-          reg_penalty),
-      q_vars)
-  p_vars = [v for v in var_list if v not in q_vars]
-  p_grads = tf.gradients(loss, p_vars)
-  grads_and_vars = list(zip(q_grads, q_vars)) + list(zip(p_grads, p_vars))
-  return loss, grads_and_vars
-
-
-def build_score_entropy_loss_and_gradients(
-    latent_vars, data, var_list, scale, n_samples, summary_key):
-  """Build loss function and gradients based on the score function
-  estimator [@paisley2012variational].
-
-  It assumes the entropy is analytic.
-
-  Computed by sampling from $q(z;\lambda)$ and evaluating the
-  expectation using Monte Carlo sampling.
+  The objective function also adds to itself a summation over all
+  tensors in the `REGULARIZATION_LOSSES` collection.
   """
   p_log_prob = [0.0] * n_samples
   q_log_prob = [0.0] * n_samples
-  base_scope = tf.get_default_graph().unique_name("inference") + '/'
   for s in range(n_samples):
-    # Form dictionary in order to replace conditioning on prior or
-    # observed variable with conditioning on a specific value.
-    scope = base_scope + tf.get_default_graph().unique_name("sample")
-    dict_swap = {}
-    for x, qx in six.iteritems(data):
-      if isinstance(x, RandomVariable):
-        if isinstance(qx, RandomVariable):
-          qx_copy = copy(qx, scope=scope)
-          dict_swap[x] = qx_copy.value
-        else:
-          dict_swap[x] = qx
-
-    for z, qz in six.iteritems(latent_vars):
-      # Copy q(z) to obtain new set of posterior samples.
-      qz_copy = copy(qz, scope=scope)
-      dict_swap[z] = qz_copy.value
-      q_log_prob[s] += tf.reduce_sum(
-          scale.get(z, 1.0) *
-          qz_copy.log_prob(tf.stop_gradient(dict_swap[z])))
-
-    for z in six.iterkeys(latent_vars):
-      z_copy = copy(z, dict_swap, scope=scope)
-      p_log_prob[s] += tf.reduce_sum(
-          scale.get(z, 1.0) * z_copy.log_prob(dict_swap[z]))
-
-    for x in six.iterkeys(data):
-      if isinstance(x, RandomVariable):
-        x_copy = copy(x, dict_swap, scope=scope)
-        p_log_prob[s] += tf.reduce_sum(
-            scale.get(x, 1.0) * x_copy.log_prob(dict_swap[x]))
+    with Trace() as posterior_trace:
+      call_function_up_to_args(variational, *args, **kwargs)
+    intercept = make_intercept(
+        posterior_trace, align_data, align_latent, args, kwargs)
+    with Trace(intercept=intercept) as model_trace:
+      call_function_up_to_args(model, *args, **kwargs)
+
+    for name, node in six.iteritems(model_trace):
+      rv = node.value
+      scale_factor = scale(name)
+      if align_latent(name) is not None or align_data(name) is not None:
+        p_log_prob[s] += tf.reduce_sum(scale_factor * rv.log_prob(rv.value))
+      if align_latent(name) is not None:
+        qz = posterior_trace[align_latent(name)].value
+        q_log_prob[s] += tf.reduce_sum(
+            scale_factor * qz.log_prob(tf.stop_gradient(qz.value)))
 
   p_log_prob = tf.stack(p_log_prob)
   q_log_prob = tf.stack(q_log_prob)
-
-  q_entropy = tf.reduce_sum([
-      tf.reduce_sum(qz.entropy())
-      for z, qz in six.iteritems(latent_vars)])
-
   reg_penalty = tf.reduce_sum(tf.losses.get_regularization_losses())
-
-  if summary_key is not None:
+  if collections is not None:
     tf.summary.scalar("loss/p_log_prob", tf.reduce_mean(p_log_prob),
-                      collections=[summary_key])
+                      collections=collections)
     tf.summary.scalar("loss/q_log_prob", tf.reduce_mean(q_log_prob),
-                      collections=[summary_key])
-    tf.summary.scalar("loss/q_entropy", q_entropy,
-                      collections=[summary_key])
+                      collections=collections)
     tf.summary.scalar("loss/reg_penalty", reg_penalty,
-                      collections=[summary_key])
-
-  loss = -(tf.reduce_mean(p_log_prob) + q_entropy - reg_penalty)
-
-  q_rvs = list(six.itervalues(latent_vars))
-  q_vars = [v for v in var_list
-            if len(get_descendants(tf.convert_to_tensor(v), q_rvs)) != 0]
-  q_grads = tf.gradients(
-      -(tf.reduce_mean(q_log_prob * tf.stop_gradient(p_log_prob)) +
-          q_entropy - reg_penalty),
-      q_vars)
-  p_vars = [v for v in var_list if v not in q_vars]
-  p_grads = tf.gradients(loss, p_vars)
-  grads_and_vars = list(zip(q_grads, q_vars)) + list(zip(p_grads, p_vars))
-  return loss, grads_and_vars
-
-
-def build_score_rb_loss_and_gradients(
-    latent_vars, data, var_list, scale, n_samples, summary_key):
-  """Build loss function and gradients based on the score function
-  estimator [@paisley2012variational] and Rao-Blackwellization
-  [@ranganath2014black].
-
-  Computed by sampling from :math:`q(z;\lambda)` and evaluating the
-  expectation using Monte Carlo sampling and Rao-Blackwellization.
-  """
-  # Build tensors for loss and gradient calculations. There is one set
-  # for each sample from the variational distribution.
-  p_log_probs = [{}] * n_samples
-  q_log_probs = [{}] * n_samples
-  base_scope = tf.get_default_graph().unique_name("inference") + '/'
-  for s in range(n_samples):
-    # Form dictionary in order to replace conditioning on prior or
-    # observed variable with conditioning on a specific value.
-    scope = base_scope + tf.get_default_graph().unique_name("sample")
-    dict_swap = {}
-    for x, qx in six.iteritems(data):
-      if isinstance(x, RandomVariable):
-        if isinstance(qx, RandomVariable):
-          qx_copy = copy(qx, scope=scope)
-          dict_swap[x] = qx_copy.value
-        else:
-          dict_swap[x] = qx
-
-    for z, qz in six.iteritems(latent_vars):
-      # Copy q(z) to obtain new set of posterior samples.
-      qz_copy = copy(qz, scope=scope)
-      dict_swap[z] = qz_copy.value
-      q_log_probs[s][qz] = tf.reduce_sum(
-          scale.get(z, 1.0) *
-          qz_copy.log_prob(tf.stop_gradient(dict_swap[z])))
-
-    for z in six.iterkeys(latent_vars):
-      z_copy = copy(z, dict_swap, scope=scope)
-      p_log_probs[s][z] = tf.reduce_sum(
-          scale.get(z, 1.0) * z_copy.log_prob(dict_swap[z]))
-
-    for x in six.iterkeys(data):
-      if isinstance(x, RandomVariable):
-        x_copy = copy(x, dict_swap, scope=scope)
-        p_log_probs[s][x] = tf.reduce_sum(
-            scale.get(x, 1.0) * x_copy.log_prob(dict_swap[x]))
-
-  # Take gradients of Rao-Blackwellized loss for each variational parameter.
-  p_rvs = list(six.iterkeys(latent_vars)) + \
-      [x for x in six.iterkeys(data) if isinstance(x, RandomVariable)]
-  q_rvs = list(six.itervalues(latent_vars))
-  reverse_latent_vars = {v: k for k, v in six.iteritems(latent_vars)}
-  grads = []
-  grads_vars = []
-  for var in var_list:
-    # Get all variational factors depending on the parameter.
-    descendants = get_descendants(tf.convert_to_tensor(var), q_rvs)
-    if len(descendants) == 0:
-      continue  # skip if not a variational parameter
-    # Get p and q's Markov blanket wrt these latent variables.
-    var_p_rvs = set()
-    for qz in descendants:
-      z = reverse_latent_vars[qz]
-      var_p_rvs.update(z.get_blanket(p_rvs) + [z])
-
-    var_q_rvs = set()
-    for qz in descendants:
-      var_q_rvs.update(qz.get_blanket(q_rvs) + [qz])
-
-    pi_log_prob = [0.0] * n_samples
-    qi_log_prob = [0.0] * n_samples
-    for s in range(n_samples):
-      pi_log_prob[s] = tf.reduce_sum([p_log_probs[s][rv] for rv in var_p_rvs])
-      qi_log_prob[s] = tf.reduce_sum([q_log_probs[s][rv] for rv in var_q_rvs])
-
-    pi_log_prob = tf.stack(pi_log_prob)
-    qi_log_prob = tf.stack(qi_log_prob)
-    grad = tf.gradients(
-        -tf.reduce_mean(qi_log_prob *
-                        tf.stop_gradient(pi_log_prob - qi_log_prob)) +
-        tf.reduce_sum(tf.losses.get_regularization_losses()),
-        var)
-    grads.extend(grad)
-    grads_vars.append(var)
-
-  # Take gradients of total loss function for model parameters.
-  loss = -(tf.reduce_mean([tf.reduce_sum(list(six.itervalues(p_log_prob)))
-                           for p_log_prob in p_log_probs]) -
-           tf.reduce_mean([tf.reduce_sum(list(six.itervalues(q_log_prob)))
-                           for q_log_prob in q_log_probs]) -
-           tf.reduce_sum(tf.losses.get_regularization_losses()))
-  model_vars = [v for v in var_list if v not in grads_vars]
-  model_grads = tf.gradients(loss, model_vars)
-  grads.extend(model_grads)
-  grads_vars.extend(model_vars)
-  grads_and_vars = list(zip(grads, grads_vars))
-  return loss, grads_and_vars
+                      collections=collections)
+  losses = q_log_prob - p_log_prob
+  loss = tf.reduce_mean(losses) + reg_penalty
+  surrogate_loss = (tf.reduce_mean(q_log_prob * tf.stop_gradient(losses)) +
+                    reg_penalty)
+  return loss, surrogate_loss
diff --git a/edward/inferences/implicit_klqp.py b/edward/inferences/klqp_implicit.py
similarity index 59%
rename from edward/inferences/implicit_klqp.py
rename to edward/inferences/klqp_implicit.py
index 9dd5cb66a..305372678 100644
--- a/edward/inferences/implicit_klqp.py
+++ b/edward/inferences/klqp_implicit.py
@@ -5,15 +5,15 @@
 import six
 import tensorflow as tf
 
-from edward.inferences.inference import (check_and_maybe_build_data,
-    check_and_maybe_build_latent_vars, transform, check_and_maybe_build_dict, check_and_maybe_build_var_list)
-from edward.models import RandomVariable
-from edward.util import copy
+from edward.inferences.inference import (
+    call_function_up_to_args, make_intercept)
+from edward.models.core import Trace
 
 
-def implicit_klqp(latent_vars=None, data=None, discriminator=None,
-                  global_vars=None, ratio_loss='log',
-                  auto_transform=True, scale=None, var_list=None, collections=None):
+def klqp_implicit(model, variational, discriminator, align_latent,
+                  align_data, align_latent_global=lambda name: name,
+                  ratio_loss='log', scale=lambda name: 1.0,
+                  auto_transform=True, collections=None, *args, **kwargs):
   """Variational inference with implicit probabilistic models
   [@tran2017deep].
 
@@ -44,14 +44,6 @@ def implicit_klqp(latent_vars=None, data=None, discriminator=None,
   Note the type for `discriminator`'s output changes when one
   passes in the `scale` argument to `initialize()`.
 
-  + If `scale` has at most one item, then `discriminator`
-  outputs a tensor whose multiplication with that element is
-  broadcastable. (For example, the output is a tensor and the single
-  scale factor is a scalar.)
-  + If `scale` has more than one item, then in order to scale
-  its corresponding output, `discriminator` must output a
-  dictionary of same size and keys as `scale`.
-
   The objective function also adds to itself a summation over all
   tensors in the `REGULARIZATION_LOSSES` collection.
   """
@@ -121,106 +113,96 @@ def implicit_klqp(latent_vars=None, data=None, discriminator=None,
     function for q's as well, and an additional loop. we opt not to
     because it complicates the code;
   + analytic KL/swapping out the penalty term for the globals.
+
+  `align_latent` aligns all global and local latents;
+  `align_latent_global` only aligns global latents.
   """
-  if not callable(discriminator):
-    raise TypeError("discriminator must be a callable function.")
   if callable(ratio_loss):
     ratio_loss = ratio_loss
   elif ratio_loss == 'log':
-    ratio_loss = log_loss
+    ratio_loss = _log_loss
   elif ratio_loss == 'hinge':
-    ratio_loss = hinge_loss
+    ratio_loss = _hinge_loss
   else:
     raise ValueError('Ratio loss not found:', ratio_loss)
-  latent_vars = check_and_maybe_build_latent_vars(latent_vars)
-  data = check_and_maybe_build_data(data)
-  global_vars = check_and_maybe_build_latent_vars(global_vars)
-  latent_vars, _ = transform(latent_vars, auto_transform)
-  scale = check_and_maybe_build_dict(scale)
-  var_list = check_and_maybe_build_var_list(var_list, latent_vars, data)
+
+  with Trace() as posterior_trace:
+    call_function_up_to_args(variational, *args, **kwargs)
+  global_intercept = make_intercept(
+      posterior_trace, align_data, align_latent_global, args, kwargs)
+  with Trace(intercept=global_intercept) as model_trace:
+    # Intercept model's global latent variables and set to posterior
+    # samples (but not its locals).
+    call_function_up_to_args(model, *args, **kwargs)
 
   # Collect tensors used in calculation of losses.
-  scope = tf.get_default_graph().unique_name("inference")
-  qbeta_sample = {}
   pbeta_log_prob = 0.0
   qbeta_log_prob = 0.0
-  for beta, qbeta in six.iteritems(global_vars):
-    # Draw a sample beta' ~ q(beta) and calculate
-    # log p(beta') and log q(beta').
-    qbeta_sample[beta] = qbeta.value
-    pbeta_log_prob += tf.reduce_sum(beta.log_prob(qbeta_sample[beta]))
-    qbeta_log_prob += tf.reduce_sum(qbeta.log_prob(qbeta_sample[beta]))
-
+  qbeta_sample = {}
   pz_sample = {}
   qz_sample = {}
-  for z, qz in six.iteritems(latent_vars):
-    if z not in global_vars:
-      # Copy local variables p(z), q(z) to draw samples
-      # z' ~ p(z | beta'), z' ~ q(z | beta').
-      pz_copy = copy(z, dict_swap=qbeta_sample, scope=scope)
-      pz_sample[z] = pz_copy.value
-      qz_sample[z] = qz.value
-
-  # Collect x' ~ p(x | z', beta') and x' ~ q(x).
-  dict_swap = qbeta_sample.copy()
-  dict_swap.update(qz_sample)
   x_psample = {}
   x_qsample = {}
-  for x, x_data in six.iteritems(data):
-    if isinstance(x, tf.Tensor):
-      if "Placeholder" not in x.op.type:
-        # Copy p(x | z, beta) to get draw p(x | z', beta').
-        x_copy = copy(x, dict_swap=dict_swap, scope=scope)
-        x_psample[x] = x_copy
-        x_qsample[x] = x_data
-    elif isinstance(x, RandomVariable):
-      # Copy p(x | z, beta) to get draw p(x | z', beta').
-      x_copy = copy(x, dict_swap=dict_swap, scope=scope)
-      x_psample[x] = x_copy.value
-      x_qsample[x] = x_data
+  for name, node in six.iteritems(model_trace):
+    # Calculate log p(beta') and log q(beta').
+    if align_latent_global(name) is not None:
+      pbeta = node.value
+      qbeta = posterior_trace[align_latent_global(name)].value
+      scale_factor = scale(name)
+      pbeta_log_prob += tf.reduce_sum(
+          scale_factor * pbeta.log_prob(pbeta.value))
+      qbeta_log_prob += tf.reduce_sum(
+          scale_factor * qbeta.log_prob(qbeta.value))
+      qbeta_sample[name] = qbeta.value
+    else:
+      # TODO This assumes implicit variables are tf.Tensors existing
+      # on the Trace stack.
+      if align_latent(name) is not None:
+        pz = node.value
+        qz = posterior_trace[align_latent(name)].value
+        pz_sample[name] = pz
+        qz_sample[name] = qz
+      else:
+        key = align_data(name)
+        if isinstance(key, int):
+          data_node = args[key]
+        elif kwargs.get(key, None) is not None:
+          data_node = kwargs.get(key)
+        px = node.value
+        qx = data_node.value
+        x_psample[name] = px
+        x_qsample[name] = qx
 
+  # Collect x' ~ p(x | z', beta') and x' ~ q(x).
   with tf.variable_scope("Disc"):
+    # TODO For now, this assumes the discriminator automagically knows
+    # how to index the dictionaries and computes some forward pass on
+    # them (which can vary across executions). Dictionaries should be
+    # improved to be more idiomatic.
     r_psample = discriminator(x_psample, pz_sample, qbeta_sample)
 
   with tf.variable_scope("Disc", reuse=True):
     r_qsample = discriminator(x_qsample, qz_sample, qbeta_sample)
 
   # Form ratio loss and ratio estimator.
-  if len(scale) <= 1:
-    loss_d = tf.reduce_mean(ratio_loss(r_psample, r_qsample))
-    scale = list(six.itervalues(scale))
-    scale = scale[0] if scale else 1.0
-    scaled_ratio = tf.reduce_sum(scale * r_qsample)
-  else:
-    loss_d = [tf.reduce_mean(ratio_loss(r_psample[key], r_qsample[key]))
-              for key in six.iterkeys(scale)]
-    loss_d = tf.reduce_sum(loss_d)
-    scaled_ratio = [tf.reduce_sum(scale[key] * r_qsample[key])
-                    for key in six.iterkeys(scale)]
-    scaled_ratio = tf.reduce_sum(scaled_ratio)
+  loss_d = 0.0
+  scaled_ratio = 0.0
+  for key, value in six.iteritems(r_qsample):
+    loss_d += tf.reduce_mean(ratio_loss(r_psample[key], value))
+    scaled_ratio += tf.reduce_sum(scale(key) * value)
 
   reg_terms_d = tf.losses.get_regularization_losses(scope="Disc")
   reg_terms_all = tf.losses.get_regularization_losses()
   reg_terms = [r for r in reg_terms_all if r not in reg_terms_d]
 
   # Form variational objective.
-  loss = -(pbeta_log_prob - qbeta_log_prob + scaled_ratio -
-           tf.reduce_sum(reg_terms))
+  loss = (qbeta_log_prob - pbeta_log_prob - scaled_ratio +
+          tf.reduce_sum(reg_terms))
   loss_d = loss_d + tf.reduce_sum(reg_terms_d)
-
-  var_list_d = tf.get_collection(
-      tf.GraphKeys.TRAINABLE_VARIABLES, scope="Disc")
-  if var_list is None:
-    var_list = [v for v in tf.trainable_variables() if v not in var_list_d]
-
-  grads = tf.gradients(loss, var_list)
-  grads_d = tf.gradients(loss_d, var_list_d)
-  grads_and_vars = list(zip(grads, var_list))
-  grads_and_vars_d = list(zip(grads_d, var_list_d))
-  return loss, grads_and_vars, loss_d, grads_and_vars_d
+  return loss, loss_d
 
 
-def log_loss(psample, qsample):
+def _log_loss(psample, qsample):
   """Point-wise log loss."""
   loss = tf.nn.sigmoid_cross_entropy_with_logits(
       labels=tf.ones_like(psample), logits=psample) + \
@@ -229,7 +211,7 @@ def log_loss(psample, qsample):
   return loss
 
 
-def hinge_loss(psample, qsample):
+def _hinge_loss(psample, qsample):
   """Point-wise hinge loss."""
   loss = tf.nn.relu(1.0 - psample) + tf.nn.relu(1.0 + qsample)
   return loss
diff --git a/edward/inferences/laplace.py b/edward/inferences/laplace.py
index 4970c8f5c..3cb9a64a5 100644
--- a/edward/inferences/laplace.py
+++ b/edward/inferences/laplace.py
@@ -5,10 +5,11 @@
 import six
 import tensorflow as tf
 
+from edward.inferences.inference import call_function_up_to_args
+from edward.inferences import docstrings as doc
 from edward.inferences.map import map
-from edward.models import RandomVariable
+from edward.models.core import Trace
 from edward.models.queries import get_variables
-from edward.util import copy, transform
 
 try:
   from edward.models import \
@@ -17,8 +18,9 @@
   raise ImportError("{0}. Your TensorFlow version is not supported.".format(e))
 
 
-def laplace(latent_vars=None, data=None,
-            auto_transform=True, scale=None, var_list=None, collections=None):
+def laplace(model, variational, align_latent, align_data,
+            scale=lambda name: 1.0, auto_transform=True,
+            collections=None, *args, **kwargs):
   """Laplace approximation [@laplace1986memoir].
 
   It approximates the posterior distribution using a multivariate
@@ -75,59 +77,49 @@ def laplace(latent_vars=None, data=None,
       variable must be a `MultivariateNormalDiag`,
       `MultivariateNormalTriL`, or `Normal` random variable.
   """
-  if isinstance(latent_vars, list):
-    with tf.variable_scope(None, default_name="posterior"):
-      latent_vars_dict = {}
-      for z in latent_vars:
-        # Define location to have constrained support and
-        # unconstrained free parameters.
-        batch_event_shape = z.batch_shape.concatenate(z.event_shape)
-        loc = tf.Variable(tf.random_normal(batch_event_shape))
-        if hasattr(z, 'support'):
-          z_transform = transform(z)
-          if hasattr(z_transform, 'bijector'):
-            loc = z_transform.bijector.inverse(loc)
-        scale_tril = tf.Variable(tf.random_normal(
-            batch_event_shape.concatenate(batch_event_shape[-1])))
-        qz = MultivariateNormalTriL(loc=loc, scale_tril=scale_tril)
-        latent_vars_dict[z] = qz
-      latent_vars = latent_vars_dict
-      del latent_vars_dict
-  elif isinstance(latent_vars, dict):
-    for qz in six.itervalues(latent_vars):
-      if not isinstance(
-              qz, (MultivariateNormalDiag, MultivariateNormalTriL, Normal)):
-        raise TypeError("Posterior approximation must consist of only "
-                        "MultivariateNormalDiag, MultivariateTriL, or "
-                        "Normal random variables.")
-
-  # Store latent variables in a temporary object; MAP will
-  # optimize `PointMass` random variables, which subsequently
-  # optimizes location parameters of the normal approximations.
-  latent_vars_normal = latent_vars.copy()
-  latent_vars = {z: PointMass(params=qz.loc)
-                 for z, qz in six.iteritems(latent_vars_normal)}
-
-  loss, grads_and_vars = map(
-      latent_vars, data,
-      auto_transform, scale, var_list, collections)
-  def _finalize(loss, latent_vars, latent_vars_normal):
-    """Function to call after convergence.
-
-    Computes the Hessian at the mode.
-    """
-    hessians = tf.hessians(loss, list(six.itervalues(latent_vars)))
-    finalize_ops = []
-    for z, hessian in zip(six.iterkeys(latent_vars), hessians):
-      qz = latent_vars_normal[z]
-      if isinstance(qz, (MultivariateNormalDiag, Normal)):
-        scale_var = get_variables(qz.variance())[0]
-        scale = 1.0 / tf.diag_part(hessian)
-      else:  # qz is MultivariateNormalTriL
-        scale_var = get_variables(qz.covariance())[0]
-        scale = tf.matrix_inverse(tf.cholesky(hessian))
-
-      finalize_ops.append(scale_var.assign(scale))
-    return tf.group(*finalize_ops)
-  finalize_op = _finalize(loss, latent_vars, latent_vars_normal)
-  return loss, grads_and_vars, finalize_op
+  variational_pointmass = _make_variational_pointmass(
+      variational, *args, **kwargs)
+  loss = map(model, variational, align_latent, align_data,
+             scale, auto_transform, collections, *args, **kwargs)
+  finalize_op = _finalize(loss, variational, *args, **kwargs)
+  return loss, finalize_op
+
+
+def _finalize(loss, variational, *args, **kwargs):
+  """Build the op to run after convergence.
+
+  Assigns each posterior's scale variable from the Hessian of `loss`.
+  """
+  with Trace() as trace:
+    call_function_up_to_args(variational, *args, **kwargs)
+  qzs = [node.value for node in six.itervalues(trace)]
+  hessians = tf.hessians(loss, [qz.loc for qz in qzs])
+  finalize_ops = []
+  for qz, hessian in zip(qzs, hessians):
+    if isinstance(qz, (MultivariateNormalDiag, Normal)):
+      scale_var = get_variables(qz.variance())[0]
+      scale = 1.0 / tf.diag_part(hessian)
+    else:  # qz is MultivariateNormalTriL
+      scale_var = get_variables(qz.covariance())[0]
+      scale = tf.matrix_inverse(tf.cholesky(hessian))
+
+    finalize_ops.append(scale_var.assign(scale))
+  return tf.group(*finalize_ops)
+
+
+def _make_variational_pointmass(variational, *args, **kwargs):
+  """Take a variational program and build a new one that replaces all
+  random variables with point masses.
+
+  We assume all latent variables are traceable in one execution.
+  """
+  with Trace() as trace:
+    call_function_up_to_args(variational, *args, **kwargs)
+
+  def variational_pointmass(*args, **kwargs):
+    for name, node in six.iteritems(trace):
+      qz = node.value
+      qz_pointmass = PointMass(params=qz.loc,
+                               name=qz.name + "_pointmass",
+                               value=qz.loc)
+  return variational_pointmass
diff --git a/edward/inferences/map.py b/edward/inferences/map.py
index 593f3efbc..f1e457869 100644
--- a/edward/inferences/map.py
+++ b/edward/inferences/map.py
@@ -5,10 +5,9 @@
 import six
 import tensorflow as tf
 
-from edward.inferences.inference import (check_and_maybe_build_data,
-    check_and_maybe_build_latent_vars, transform, check_and_maybe_build_dict, check_and_maybe_build_var_list)
-from edward.models import RandomVariable, PointMass
-from edward.util import copy, transform
+from edward.inferences.inference import (
+    call_function_up_to_args, make_intercept)
+from edward.models.core import Trace
 
 try:
   from tensorflow.contrib.distributions import bijectors
@@ -16,8 +15,9 @@
   raise ImportError("{0}. Your TensorFlow version is not supported.".format(e))
 
 
-def map(latent_vars=None, data=None,
-        auto_transform=True, scale=None, var_list=None, collections=None):
+def map(model, variational, align_latent, align_data,
+        scale=lambda name: 1.0, auto_transform=True, collections=None,
+        *args, **kwargs):
   """Maximum a posteriori.
 
   This class implements gradient-based optimization to solve the
@@ -95,62 +95,26 @@ def map(latent_vars=None, data=None,
 
   $- \log p(x,z).$
   """
-  if isinstance(latent_vars, list):
-    with tf.variable_scope(None, default_name="posterior"):
-      latent_vars_dict = {}
-      for z in latent_vars:
-        # Define point masses to have constrained support and
-        # unconstrained free parameters.
-        batch_event_shape = z.batch_shape.concatenate(z.event_shape)
-        params = tf.Variable(tf.random_normal(batch_event_shape))
-        if hasattr(z, 'support'):
-          z_transform = transform(z)
-          if hasattr(z_transform, 'bijector'):
-            params = z_transform.bijector.inverse(params)
-        latent_vars_dict[z] = PointMass(params=params)
-      latent_vars = latent_vars_dict
-      del latent_vars_dict
-  elif isinstance(latent_vars, dict):
-    for qz in six.itervalues(latent_vars):
-      if not isinstance(qz, PointMass):
-        raise TypeError("Posterior approximation must consist of only "
-                        "PointMass random variables.")
-  latent_vars = check_and_maybe_build_latent_vars(latent_vars)
-  data = check_and_maybe_build_data(data)
-  latent_vars, _ = transform(latent_vars, auto_transform)
-  scale = check_and_maybe_build_dict(scale)
-  var_list = check_and_maybe_build_var_list(var_list, latent_vars, data)
-
-  # Form dictionary in order to replace conditioning on prior or
-  # observed variable with conditioning on a specific value.
-  scope = tf.get_default_graph().unique_name("inference")
-  dict_swap = {z: qz.value
-               for z, qz in six.iteritems(latent_vars)}
-  for x, qx in six.iteritems(data):
-    if isinstance(x, RandomVariable):
-      if isinstance(qx, RandomVariable):
-        dict_swap[x] = qx.value
-      else:
-        dict_swap[x] = qx
+  with Trace() as posterior_trace:
+    call_function_up_to_args(variational, *args, **kwargs)
+  intercept = make_intercept(
+      posterior_trace, align_data, align_latent, args, kwargs)
+  with Trace(intercept=intercept) as model_trace:
+    call_function_up_to_args(model, *args, **kwargs)
 
   p_log_prob = 0.0
-  for z in six.iterkeys(latent_vars):
-    z_copy = copy(z, dict_swap, scope=scope)
-    p_log_prob += tf.reduce_sum(
-        scale.get(z, 1.0) * z_copy.log_prob(dict_swap[z]))
-
-  for x in six.iterkeys(data):
-    if isinstance(x, RandomVariable):
-      if dict_swap:
-        x_copy = copy(x, dict_swap, scope=scope)
-      else:
-        x_copy = x
-      p_log_prob += tf.reduce_sum(
-          scale.get(x, 1.0) * x_copy.log_prob(dict_swap[x]))
+  for name, node in six.iteritems(model_trace):
+    if align_latent(name) is not None or align_data(name) is not None:
+      rv = node.value
+      scale_factor = scale(name)
+      p_log_prob += tf.reduce_sum(scale_factor * rv.log_prob(rv.value))
 
   reg_penalty = tf.reduce_sum(tf.losses.get_regularization_losses())
-  loss = -p_log_prob + reg_penalty
+  if collections is not None:
+    tf.summary.scalar("loss/p_log_prob", p_log_prob,
+                      collections=collections)
+    tf.summary.scalar("loss/reg_penalty", reg_penalty,
+                      collections=collections)
 
-  grads = tf.gradients(loss, var_list)
-  grads_and_vars = list(zip(grads, var_list))
-  return loss, grads_and_vars
+  loss = -p_log_prob + reg_penalty
+  return loss
diff --git a/edward/inferences/metropolis_hastings.py b/edward/inferences/metropolis_hastings.py
index 69c1a5307..3c6323131 100644
--- a/edward/inferences/metropolis_hastings.py
+++ b/edward/inferences/metropolis_hastings.py
@@ -8,7 +8,7 @@
 from collections import OrderedDict
 from edward.inferences.monte_carlo import MonteCarlo
 from edward.models import RandomVariable
-from edward.util import check_and_maybe_build_latent_vars, copy
+from edward.util import check_and_maybe_build_latent_vars
 
 
 class MetropolisHastings(MonteCarlo):
diff --git a/edward/inferences/sghmc.py b/edward/inferences/sghmc.py
index 50a744194..7cc7599fc 100644
--- a/edward/inferences/sghmc.py
+++ b/edward/inferences/sghmc.py
@@ -7,7 +7,6 @@
 
 from edward.inferences.monte_carlo import MonteCarlo
 from edward.models import RandomVariable, Empirical
-from edward.util import copy
 
 
 class SGHMC(MonteCarlo):
diff --git a/edward/inferences/sgld.py b/edward/inferences/sgld.py
index 82c5163a4..930eae583 100644
--- a/edward/inferences/sgld.py
+++ b/edward/inferences/sgld.py
@@ -7,7 +7,6 @@
 
 from edward.inferences.monte_carlo import MonteCarlo
 from edward.models import RandomVariable
-from edward.util import copy
 
 
 class SGLD(MonteCarlo):
diff --git a/edward/inferences/wake_sleep.py b/edward/inferences/wake_sleep.py
index 4e98fa897..2314d513a 100644
--- a/edward/inferences/wake_sleep.py
+++ b/edward/inferences/wake_sleep.py
@@ -5,14 +5,14 @@
 import six
 import tensorflow as tf
 
-from edward.inferences.inference import (check_and_maybe_build_data,
-    check_and_maybe_build_latent_vars, transform, check_and_maybe_build_dict, check_and_maybe_build_var_list)
-from edward.models import RandomVariable
-from edward.util import copy, get_descendants
+from edward.inferences.inference import (
+    call_function_up_to_args, make_intercept)
+from edward.models.core import Trace
 
 
-def wake_sleep(latent_vars=None, data=None, n_samples=1, phase_q='sleep',
-               auto_transform=True, scale=None, var_list=None, collections=None):
+def wake_sleep(model, variational, align_latent, align_data,
+               scale=lambda name: 1.0, n_samples=1, phase_q='sleep',
+               auto_transform=True, collections=None, *args, **kwargs):
   """Wake-Sleep algorithm [@hinton1995wake].
 
   Given a probability model $p(x, z; \\theta)$ and variational
@@ -68,69 +68,54 @@ def wake_sleep(latent_vars=None, data=None, n_samples=1, phase_q='sleep',
       (Unlike reparameterization gradients, the sample is held
       fixed.)
   """
-  latent_vars = check_and_maybe_build_latent_vars(latent_vars)
-  data = check_and_maybe_build_data(data)
-  latent_vars, _ = transform(latent_vars, auto_transform)
-  scale = check_and_maybe_build_dict(scale)
-  var_list = check_and_maybe_build_var_list(var_list, latent_vars, data)
-
   p_log_prob = [0.0] * n_samples
   q_log_prob = [0.0] * n_samples
-  base_scope = tf.get_default_graph().unique_name("inference") + '/'
   for s in range(n_samples):
-    # Form dictionary in order to replace conditioning on prior or
-    # observed variable with conditioning on a specific value.
-    scope = base_scope + tf.get_default_graph().unique_name("q_sample")
-    dict_swap = {}
-    for x, qx in six.iteritems(data):
-      if isinstance(x, RandomVariable):
-        if isinstance(qx, RandomVariable):
-          qx_copy = copy(qx, scope=scope)
-          dict_swap[x] = qx_copy.value
-        else:
-          dict_swap[x] = qx
-
-    # Sample z ~ q(z), then compute log p(x, z).
-    q_dict_swap = dict_swap.copy()
-    for z, qz in six.iteritems(latent_vars):
-      # Copy q(z) to obtain new set of posterior samples.
-      qz_copy = copy(qz, scope=scope)
-      q_dict_swap[z] = qz_copy.value
-      if phase_q != 'sleep':
+    with Trace() as posterior_trace:
+      call_function_up_to_args(variational, *args, **kwargs)
+    intercept = make_intercept(
+        posterior_trace, align_data, align_latent, args, kwargs)
+    with Trace(intercept=intercept) as model_trace:
+      call_function_up_to_args(model, *args, **kwargs)
+
+    for name, node in six.iteritems(model_trace):
+      rv = node.value
+      scale_factor = scale(name)
+      if align_data(name) is not None or align_latent(name) is not None:
+        p_log_prob[s] += tf.reduce_sum(scale_factor * rv.log_prob(rv.value))
+      if phase_q != 'sleep' and align_latent(name) is not None:
         # If not sleep phase, compute log q(z).
+        qz = posterior_trace[align_latent(name)].value
         q_log_prob[s] += tf.reduce_sum(
-            scale.get(z, 1.0) *
-            qz_copy.log_prob(tf.stop_gradient(q_dict_swap[z])))
-
-    for z in six.iterkeys(latent_vars):
-      z_copy = copy(z, q_dict_swap, scope=scope)
-      p_log_prob[s] += tf.reduce_sum(
-          scale.get(z, 1.0) * z_copy.log_prob(q_dict_swap[z]))
-
-    for x in six.iterkeys(data):
-      if isinstance(x, RandomVariable):
-        x_copy = copy(x, q_dict_swap, scope=scope)
-        p_log_prob[s] += tf.reduce_sum(
-            scale.get(x, 1.0) * x_copy.log_prob(q_dict_swap[x]))
+            scale_factor * qz.log_prob(tf.stop_gradient(qz.value)))
 
     if phase_q == 'sleep':
-      # Sample z ~ p(z), then compute log q(z).
-      scope = base_scope + tf.get_default_graph().unique_name("p_sample")
-      p_dict_swap = dict_swap.copy()
-      for z, qz in six.iteritems(latent_vars):
-        # Copy p(z) to obtain new set of prior samples.
-        z_copy = copy(z, scope=scope)
-        p_dict_swap[qz] = z_copy.value
-      for qz in six.itervalues(latent_vars):
-        qz_copy = copy(qz, p_dict_swap, scope=scope)
+      with Trace() as model_trace:
+        call_function_up_to_args(model, *args, **kwargs)
+      intercept = _make_sleep_intercept(
+          model_trace, align_data, align_latent, args, kwargs)
+      with Trace(intercept=intercept) as posterior_trace:
+        call_function_up_to_args(variational, *args, **kwargs)
+
+      # Build dictionary to return scale factor for a posterior
+      # variable via its corresponding prior. The implementation is
+      # naive.
+      scale_posterior = {}
+      for name, node in six.iteritems(model_trace):
+        # Key: aligned posterior variable; value: its prior's scale.
+        if align_latent(name) is not None:
+          qz = posterior_trace[align_latent(name)].value
+          scale_posterior[qz] = scale(name)
+
+      for name, node in six.iteritems(posterior_trace):
+        rv = node.value
+        scale_factor = scale_posterior[rv]
         q_log_prob[s] += tf.reduce_sum(
-            scale.get(z, 1.0) *
-            qz_copy.log_prob(tf.stop_gradient(p_dict_swap[qz])))
+            scale_factor * rv.log_prob(tf.stop_gradient(rv.value)))
 
   p_log_prob = tf.reduce_mean(p_log_prob)
   q_log_prob = tf.reduce_mean(q_log_prob)
   reg_penalty = tf.reduce_sum(tf.losses.get_regularization_losses())
-
   if collections is not None:
     tf.summary.scalar("loss/p_log_prob", p_log_prob,
                       collections=collections)
@@ -141,12 +126,14 @@ def wake_sleep(latent_vars=None, data=None, n_samples=1, phase_q='sleep',
 
   loss_p = -p_log_prob + reg_penalty
   loss_q = -q_log_prob + reg_penalty
+  return loss_p, loss_q
+
 
-  q_rvs = list(six.itervalues(latent_vars))
-  q_vars = [v for v in var_list
-            if len(get_descendants(tf.convert_to_tensor(v), q_rvs)) != 0]
-  q_grads = tf.gradients(loss_q, q_vars)
-  p_vars = [v for v in var_list if v not in q_vars]
-  p_grads = tf.gradients(loss_p, p_vars)
-  grads_and_vars = list(zip(q_grads, q_vars)) + list(zip(p_grads, p_vars))
-  return loss_p, grads_and_vars
+def _make_sleep_intercept(trace, align_data, align_latent, args, kwargs):
+  def _intercept(f, *fargs, **fkwargs):
+    """Set variational distribution's sample value to prior's."""
+    name = fkwargs.get('name', None)
+    z = trace[align_latent(name)].value
+    fkwargs['value'] = z.value
+    return f(*fargs, **fkwargs)
+  return _intercept
diff --git a/edward/inferences/wgan_inference.py b/edward/inferences/wgan_inference.py
index 8914339a2..fedc87d85 100644
--- a/edward/inferences/wgan_inference.py
+++ b/edward/inferences/wgan_inference.py
@@ -5,13 +5,11 @@
 import six
 import tensorflow as tf
 
-from edward.inferences.inference import (check_and_maybe_build_data,
-    transform, check_and_maybe_build_dict, check_and_maybe_build_var_list)
+from edward.inferences.inference import call_function_up_to_args
 
 
-def wgan_inference(data=None, discriminator=None,
-                   penalty=10.0,
-                   scale=None, var_list=None, collections=None):
+def wgan_inference(model, discriminator, align_data,
+                   penalty=10.0, collections=None, *args, **kwargs):
   """Parameter estimation with GAN-style training
   [@goodfellow2014generative], using the Wasserstein distance
   [@arjovsky2017wasserstein].
@@ -62,13 +60,15 @@ def wgan_inference(data=None, discriminator=None,
       None (or 0.0) if using no penalty.
     clip: float, optional.
       Value to clip weights by. Default is no clipping.
-  """
-  data = check_and_maybe_build_data(data)
-  scale = check_and_maybe_build_dict(scale)
-  var_list = check_and_maybe_build_var_list(var_list, {}, data)
 
-  x_true = list(six.itervalues(data))[0]
-  x_fake = list(six.iterkeys(data))[0]
+  `model` must return the generated data.
+  """
+  x_fake = call_function_up_to_args(model, *args, **kwargs)
+  key = align_data(x_fake.name.split(':')[0])
+  if isinstance(key, int):
+    x_true = args[key]
+  elif kwargs.get(key, None) is not None:
+    x_true = kwargs.get(key)
   with tf.variable_scope("Disc"):
     d_true = discriminator(x_true)
 
@@ -96,14 +96,4 @@ def wgan_inference(data=None, discriminator=None,
   mean_fake = tf.reduce_mean(d_fake)
   loss_d = mean_fake - mean_true + penalty + tf.reduce_sum(reg_terms_d)
   loss = -mean_fake + tf.reduce_sum(reg_terms)
-
-  var_list_d = tf.get_collection(
-      tf.GraphKeys.TRAINABLE_VARIABLES, scope="Disc")
-  if var_list is None:
-    var_list = [v for v in tf.trainable_variables() if v not in var_list_d]
-
-  grads_d = tf.gradients(loss_d, var_list_d)
-  grads = tf.gradients(loss, var_list)
-  grads_and_vars_d = list(zip(grads_d, var_list_d))
-  grads_and_vars = list(zip(grads, var_list))
-  return loss, grads_and_vars, loss_d, grads_and_vars_d
+  return loss, loss_d
diff --git a/edward/util/__init__.py b/edward/util/__init__.py
index 6dfba6160..fb8f8833c 100644
--- a/edward/util/__init__.py
+++ b/edward/util/__init__.py
@@ -10,7 +10,6 @@
 from tensorflow.python.util.all_util import remove_undocumented
 
 _allowed_symbols = [
-    'copy',
     'get_control_variate_coef',
     'transform',
 ]
diff --git a/edward/util/random_variables.py b/edward/util/random_variables.py
index f6770a6a0..080501a8a 100644
--- a/edward/util/random_variables.py
+++ b/edward/util/random_variables.py
@@ -4,379 +4,11 @@
 
 import tensorflow as tf
 
-from copy import deepcopy
 from edward.models.core import TransformedDistribution
-from edward.models.random_variable import RandomVariable
-from edward.util.graphs import random_variables
-from tensorflow.core.framework import attr_value_pb2
-from tensorflow.python.framework.ops import set_shapes_for_outputs
-from tensorflow.python.util import compat
 
 tfb = tf.contrib.distributions.bijectors
 
 
-def _get_context_copy(ctx, scope):
-    # contexts are stored in graph collections
-    # is there a more efficient way to do this?
-
-    graph = tf.get_default_graph()
-
-    for name, collection in six.iteritems(graph._collections):
-      if ctx in collection:
-        for item in collection:
-          if item.name == scope + ctx.name:
-            return item
-
-    return None
-
-
-def _copy_context(ctx, context_matches, dict_swap, scope, copy_q):
-  if ctx is None:
-    return None
-
-  # We'd normally check about returning early, but the context won't
-  # be copied until after all children are, so we check that first.
-
-  graph = tf.get_default_graph()
-
-  # copy all nodes within context
-  for tensorname in ctx._values:
-    tensor = graph.as_graph_element(tensorname)
-    copy(tensor, dict_swap, scope, True, copy_q)
-
-  # now make sure we haven't already copied the context we're currently
-  # trying to copy (in the course of copying another child)
-  ctx_copy = _get_context_copy(ctx, scope)
-  if ctx_copy:
-    return ctx_copy
-
-  ctx_copy = ctx.from_proto(ctx.to_proto(), scope[:-1])
-  outer_copy = _copy_context(ctx.outer_context, context_matches, dict_swap,
-                             scope, copy_q)
-  ctx_copy._outer_context = outer_copy
-
-  for name, collection in six.iteritems(graph._collections):
-      if ctx in collection:
-        graph.add_to_collection(name, ctx_copy)
-  return ctx_copy
-
-
-def _copy_default(x, *args, **kwargs):
-  if isinstance(x, (RandomVariable, tf.Operation, tf.Tensor, tf.Variable)):
-    x = copy(x, *args, **kwargs)
-
-  return x
-
-
-def copy(org_instance, dict_swap=None, scope="copied",
-         replace_itself=False, copy_q=False, copy_parent_rvs=True):
-  """Build a new node in the TensorFlow graph from `org_instance`,
-  where any of its ancestors existing in `dict_swap` are
-  replaced with `dict_swap`'s corresponding value.
-
-  Copying is done recursively. Any `Operation` whose output is
-  required to copy `org_instance` is also copied (if it isn't already
-  copied within the new scope).
-
-  `tf.Variable`s, `tf.placeholder`s, and nodes of type `Queue` are
-  always reused and not copied. In addition, `tf.Operation`s with
-  operation-level seeds are copied with a new operation-level seed.
-
-  Args:
-    org_instance: RandomVariable, tf.Operation, tf.Tensor, or tf.Variable.
-      Node to add in graph with replaced ancestors.
-    dict_swap: dict.
-      Random variables, variables, tensors, or operations to swap with.
-      Its keys are what `org_instance` may depend on, and its values are
-      the corresponding object (not necessarily of the same class
-      instance, but must have the same type, e.g., float32) that is used
-      in exchange.
-    scope: str.
-      A scope for the new node(s). This is used to avoid name
-      conflicts with the original node(s).
-    replace_itself: bool.
-      Whether to replace `org_instance` itself if it exists in
-      `dict_swap`. (This is used for the recursion.)
-    copy_q: bool.
-      Whether to copy the replaced tensors too (if not already
-      copied within the new scope). Otherwise will reuse them.
-    copy_parent_rvs:
-      Whether to copy parent random variables `org_instance` depends
-      on. Otherwise will copy only the sample tensors and not the
-      random variable class itself.
-
-  Returns:
-    RandomVariable, tf.Variable, tf.Tensor, or tf.Operation.
-    The copied node.
-
-  Raises:
-    TypeError.
-    If `org_instance` is not one of the above types.
-
-  #### Examples
-
-  ```python
-  x = tf.constant(2.0)
-  y = tf.constant(3.0)
-  z = x * y
-
-  qx = tf.constant(4.0)
-  # The TensorFlow graph is currently
-  # `x` -> `z` <- y`, `qx`
-
-  # This adds a subgraph with newly copied nodes,
-  # `qx` -> `copied/z` <- `copied/y`
-  z_new = ed.copy(z, {x: qx})
-
-  sess = tf.Session()
-  sess.run(z)
-  6.0
-  sess.run(z_new)
-  12.0
-  ```
-  """
-  if not isinstance(org_instance,
-                    (RandomVariable, tf.Operation, tf.Tensor, tf.Variable)):
-    raise TypeError("Could not copy instance: " + str(org_instance))
-
-  if dict_swap is None:
-    dict_swap = {}
-  if scope[-1] != '/':
-    scope += '/'
-
-  # Swap instance if in dictionary.
-  if org_instance in dict_swap and replace_itself:
-    org_instance = dict_swap[org_instance]
-    if not copy_q:
-      return org_instance
-  elif isinstance(org_instance, tf.Tensor) and replace_itself:
-    # Deal with case when `org_instance` is the associated tensor
-    # from the RandomVariable, e.g., `z.value`. If
-    # `dict_swap={z: qz}`, we aim to swap it with `qz.value`.
-    for key, value in six.iteritems(dict_swap):
-      if isinstance(key, RandomVariable):
-        if org_instance == key.value:
-          if isinstance(value, RandomVariable):
-            org_instance = value.value
-          else:
-            org_instance = value
-          if not copy_q:
-            return org_instance
-          break
-
-  # If instance is a tf.Variable, return it; do not copy any. Note we
-  # check variables via their name. If we get variables through an
-  # op's inputs, it has type tf.Tensor and not tf.Variable.
-  if isinstance(org_instance, (tf.Tensor, tf.Variable)):
-    for variable in tf.global_variables():
-      if org_instance.name == variable.name:
-        if variable in dict_swap and replace_itself:
-          # Deal with case when `org_instance` is the associated _ref
-          # tensor for a tf.Variable.
-          org_instance = dict_swap[variable]
-          if not copy_q or isinstance(org_instance, tf.Variable):
-            return org_instance
-          for variable in tf.global_variables():
-            if org_instance.name == variable.name:
-              return variable
-          break
-        else:
-          return variable
-
-  graph = tf.get_default_graph()
-  new_name = scope + org_instance.name
-
-  # If an instance of the same name exists, return it.
-  if isinstance(org_instance, RandomVariable):
-    for rv in random_variables():
-      if new_name == rv.name:
-        return rv
-  elif isinstance(org_instance, (tf.Tensor, tf.Operation)):
-    try:
-      return graph.as_graph_element(new_name,
-                                    allow_tensor=True,
-                                    allow_operation=True)
-    except:
-      pass
-
-  # Preserve ordering of random variables. Random variables are always
-  # copied first (from parent -> child) before any deterministic
-  # operations that depend on them.
-  if copy_parent_rvs and \
-          isinstance(org_instance, (RandomVariable, tf.Tensor, tf.Variable)):
-    for v in get_parents(org_instance):
-      copy(v, dict_swap, scope, True, copy_q, True)
-
-  if isinstance(org_instance, RandomVariable):
-    rv = org_instance
-
-    # If it has copiable arguments, copy them.
-    args = [_copy_default(arg, dict_swap, scope, True, copy_q, False)
-            for arg in rv._args]
-
-    kwargs = {}
-    for key, value in six.iteritems(rv._kwargs):
-      if isinstance(value, list):
-        kwargs[key] = [_copy_default(v, dict_swap, scope, True, copy_q, False)
-                       for v in value]
-      else:
-        kwargs[key] = _copy_default(
-            value, dict_swap, scope, True, copy_q, False)
-
-    kwargs['name'] = new_name
-    # Create new random variable with copied arguments.
-    try:
-      new_rv = type(rv)(*args, **kwargs)
-    except ValueError:
-      # Handle case where parameters are copied under absolute name
-      # scope. This can cause an error when creating a new random
-      # variable as tf.identity name ops are called on parameters ("op
-      # with name already exists"). To avoid remove absolute name scope.
-      kwargs['name'] = new_name[:-1]
-      new_rv = type(rv)(*args, **kwargs)
-    return new_rv
-  elif isinstance(org_instance, tf.Tensor):
-    tensor = org_instance
-
-    # Do not copy tf.placeholders.
-    if 'Placeholder' in tensor.op.type:
-      return tensor
-
-    # A tensor is one of the outputs of its underlying
-    # op. Therefore copy the op itself.
-    op = tensor.op
-    new_op = copy(op, dict_swap, scope, True, copy_q, False)
-
-    output_index = op.outputs.index(tensor)
-    new_tensor = new_op.outputs[output_index]
-
-    # Add copied tensor to collections that the original one is in.
-    for name, collection in six.iteritems(tensor.graph._collections):
-      if tensor in collection:
-        graph.add_to_collection(name, new_tensor)
-
-    return new_tensor
-  elif isinstance(org_instance, tf.Operation):
-    op = org_instance
-
-    # Do not copy queue operations.
-    if 'Queue' in op.type:
-      return op
-
-    # Copy the node def.
-    # It is unique to every Operation instance. Replace the name and
-    # its operation-level seed if it has one.
-    node_def = deepcopy(op.node_def)
-    node_def.name = new_name
-
-    # when copying control flow contexts,
-    # we need to make sure frame definitions are copied
-    if 'frame_name' in node_def.attr and node_def.attr['frame_name'].s != b'':
-      node_def.attr['frame_name'].s = (scope.encode('utf-8') +
-                                       node_def.attr['frame_name'].s)
-
-    if 'seed2' in node_def.attr and tf.get_seed(None)[1] is not None:
-      node_def.attr['seed2'].i = tf.get_seed(None)[1]
-
-    # Copy other arguments needed for initialization.
-    output_types = op._output_types[:]
-
-    # If it has an original op, copy it.
-    if op._original_op is not None:
-      original_op = copy(op._original_op, dict_swap, scope, True, copy_q, False)
-    else:
-      original_op = None
-
-    # Copy the op def.
-    # It is unique to every Operation type.
-    op_def = deepcopy(op.op_def)
-
-    new_op = tf.Operation(node_def,
-                          graph,
-                          [],  # inputs; will add them afterwards
-                          output_types,
-                          [],  # control inputs; will add them afterwards
-                          [],  # input types; will add them afterwards
-                          original_op,
-                          op_def)
-
-    # advertise op early to break recursions
-    graph._add_op(new_op)
-
-    # If it has control inputs, copy them.
-    control_inputs = []
-    for x in op.control_inputs:
-      elem = copy(x, dict_swap, scope, True, copy_q, False)
-      if not isinstance(elem, tf.Operation):
-        elem = tf.convert_to_tensor(elem)
-
-      control_inputs.append(elem)
-
-    new_op._add_control_inputs(control_inputs)
-
-    # If it has inputs, copy them.
-    for x in op.inputs:
-      elem = copy(x, dict_swap, scope, True, copy_q, False)
-      if not isinstance(elem, tf.Operation):
-        elem = tf.convert_to_tensor(elem)
-
-      new_op._add_input(elem)
-
-    # Copy the control flow context.
-    control_flow_context = _copy_context(op._get_control_flow_context(), {},
-                                         dict_swap, scope, copy_q)
-    new_op._set_control_flow_context(control_flow_context)
-
-    # Use Graph's private methods to add the op, following
-    # implementation of `tf.Graph().create_op()`.
-    compute_shapes = True
-    compute_device = True
-    op_type = new_name
-
-    if compute_shapes:
-      set_shapes_for_outputs(new_op)
-    graph._record_op_seen_by_control_dependencies(new_op)
-
-    if compute_device:
-      graph._apply_device_functions(new_op)
-
-    if graph._colocation_stack:
-      all_colocation_groups = []
-      for colocation_op in graph._colocation_stack:
-        all_colocation_groups.extend(colocation_op.colocation_groups())
-        if colocation_op.device:
-          # Make this device match the device of the colocated op, to
-          # provide consistency between the device and the colocation
-          # property.
-          if new_op.device and new_op.device != colocation_op.device:
-            logging.warning("Tried to colocate %s with an op %s that had "
-                            "a different device: %s vs %s. "
-                            "Ignoring colocation property.",
-                            name, colocation_op.name, new_op.device,
-                            colocation_op.device)
-
-      all_colocation_groups = sorted(set(all_colocation_groups))
-      new_op.node_def.attr["_class"].CopyFrom(attr_value_pb2.AttrValue(
-          list=attr_value_pb2.AttrValue.ListValue(s=all_colocation_groups)))
-
-    # Sets "container" attribute if
-    # (1) graph._container is not None
-    # (2) "is_stateful" is set in OpDef
-    # (3) "container" attribute is in OpDef
-    # (4) "container" attribute is None
-    if (graph._container and
-        op_type in graph._registered_ops and
-        graph._registered_ops[op_type].is_stateful and
-        "container" in new_op.node_def.attr and
-            not new_op.node_def.attr["container"].s):
-      new_op.node_def.attr["container"].CopyFrom(
-          attr_value_pb2.AttrValue(s=compat.as_bytes(graph._container)))
-
-    return new_op
-  else:
-    raise TypeError("Could not copy instance: " + str(org_instance))
-
-
 def transform(x, *args, **kwargs):
   """Transform a continuous random variable to the unconstrained space.
 
diff --git a/tests/util/copy_test.py b/tests/util/copy_test.py
deleted file mode 100644
index 7aa714f4d..000000000
--- a/tests/util/copy_test.py
+++ /dev/null
@@ -1,248 +0,0 @@
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import edward as ed
-import numpy as np
-import tensorflow as tf
-
-from edward.models import Bernoulli, Categorical, Mixture, Normal
-
-
-class test_copy_class(tf.test.TestCase):
-
-  def test_scope(self):
-    with self.test_session():
-      x = tf.constant(2.0)
-      x_new = ed.copy(x, scope='new_scope')
-      self.assertTrue(x_new.name.startswith('new_scope'))
-
-  def test_replace_itself(self):
-    with self.test_session():
-      x = tf.constant(2.0)
-      y = tf.constant(3.0)
-      x_new = ed.copy(x, {x: y}, replace_itself=False)
-      self.assertEqual(x_new.eval(), 2.0)
-      x_new = ed.copy(x, {x: y}, replace_itself=True)
-      self.assertEqual(x_new.eval(), 3.0)
-
-  def test_copy_q(self):
-    with self.test_session() as sess:
-      x = tf.constant(2.0)
-      y = tf.random_normal([])
-      x_new = ed.copy(x, {x: y}, replace_itself=True, copy_q=False)
-      x_new_val, y_val = sess.run([x_new, y])
-      self.assertEqual(x_new_val, y_val)
-      x_new = ed.copy(x, {x: y}, replace_itself=True, copy_q=True)
-      x_new_val, x_val, y_val = sess.run([x_new, x, y])
-      self.assertNotEqual(x_new_val, x_val)
-      self.assertNotEqual(x_new_val, y_val)
-
-  def test_copy_parent_rvs(self):
-    with self.test_session() as sess:
-      x = Normal(0.0, 1.0)
-      y = tf.constant(3.0)
-      z = x * y
-      z_new = ed.copy(z, scope='no_copy_parent_rvs', copy_parent_rvs=False)
-      self.assertEqual(len(ed.random_variables()), 1)
-      z_new = ed.copy(z, scope='copy_parent_rvs', copy_parent_rvs=True)
-      self.assertEqual(len(ed.random_variables()), 2)
-
-  def test_placeholder(self):
-    with self.test_session() as sess:
-      x = tf.placeholder(tf.float32, name="CustomName")
-      y = tf.constant(3.0)
-      z = x * y
-      z_new = ed.copy(z)
-      self.assertEqual(sess.run(z_new, feed_dict={x: 4.0}), 12.0)
-
-  def test_variable(self):
-    with self.test_session() as sess:
-      x = tf.Variable(2.0, name="CustomName")
-      y = tf.constant(3.0)
-      z = x * y
-      z_new = ed.copy(z)
-      tf.variables_initializer([x]).run()
-      self.assertEqual(z_new.eval(), 6.0)
-
-  def test_queue(self):
-    with self.test_session() as sess:
-      tensor = tf.constant([0.0, 1.0, 2.0, 3.0])
-      x = tf.train.batch([tensor], batch_size=2, enqueue_many=True,
-                         name='CustomName')
-      y = tf.constant(3.0)
-      z = x * y
-      z_new = ed.copy(z)
-      coord = tf.train.Coordinator()
-      threads = tf.train.start_queue_runners(coord=coord)
-      self.assertAllEqual(sess.run(z_new), np.array([0.0, 3.0]))
-      self.assertAllEqual(sess.run(z_new), np.array([6.0, 9.0]))
-      coord.request_stop()
-      coord.join(threads)
-
-  def test_list(self):
-    with self.test_session() as sess:
-      x = Normal(tf.constant(0.0), tf.constant(0.1))
-      y = Normal(tf.constant(10.0), tf.constant(0.1))
-      cat = Categorical(logits=tf.zeros(5))
-      components = [Normal(x, tf.constant(0.1))
-                    for _ in range(5)]
-      z = Mixture(cat=cat, components=components)
-      z_new = ed.copy(z, {x: y.value})
-      self.assertGreater(z_new.value.eval(), 5.0)
-
-  def test_random(self):
-    with self.test_session() as sess:
-      ed.set_seed(3742)
-      x = tf.random_normal([])
-      x_copy = ed.copy(x)
-
-      result_copy, result = sess.run([x_copy, x])
-      self.assertNotAlmostEquals(result_copy, result)
-
-  def test_scan(self):
-    with self.test_session() as sess:
-      ed.set_seed(42)
-      op = tf.scan(lambda a, x: a + x, tf.constant([2.0, 3.0, 1.0]))
-      copy_op = ed.copy(op)
-
-      result_copy, result = sess.run([copy_op, op])
-      self.assertAllClose(result_copy, [2.0, 5.0, 6.0])
-      self.assertAllClose(result, [2.0, 5.0, 6.0])
-
-  def test_scan_gradients(self):
-    with self.test_session() as sess:
-      a = tf.Variable([1.0, 2.0, 3.0])
-      op = tf.scan(lambda a, x: a + x, a)
-      copy_op = ed.copy(op)
-      gradient = tf.gradients(op, [a])[0]
-      copy_gradient = tf.gradients(copy_op, [a])[0]
-
-      tf.variables_initializer([a]).run()
-      result_copy, result = sess.run([copy_gradient, gradient])
-      self.assertAllClose(result, [3.0, 2.0, 1.0])
-      self.assertAllClose(result_copy, [3.0, 2.0, 1.0])
-
-  def test_nested_scan_gradients(self):
-    with self.test_session() as sess:
-      a = tf.Variable([1.0, 2.0, 3.0])
-      i = tf.constant(0.0)
-      tot = tf.constant([0.0, 0.0, 0.0])
-      op = tf.while_loop(lambda i, tot: i < 5,
-                         lambda i, tot: (i + 1,
-                                         tot + tf.scan(lambda x0, x:
-                                                       x0 + i * x, a, 0.0)),
-                         [i, tot])[1]
-      copy_op = ed.copy(op)
-      gradient = tf.gradients(op, [a])[0]
-      copy_gradient = tf.gradients(copy_op, [a])[0]
-
-      tf.variables_initializer([a]).run()
-      result_copy, result = sess.run([copy_gradient, gradient])
-      self.assertAllClose(result, [30.0, 20.0, 10.0])
-      self.assertAllClose(result_copy, [30.0, 20.0, 10.0])
-
-  def test_swap_tensor_tensor(self):
-    with self.test_session():
-      x = tf.constant(2.0)
-      y = tf.constant(3.0)
-      z = x * y
-      qx = tf.constant(4.0)
-      z_new = ed.copy(z, {x: qx})
-      self.assertEqual(z_new.eval(), 12.0)
-
-  def test_swap_placeholder_tensor(self):
-    with self.test_session():
-      x = tf.placeholder(tf.float32, name="CustomName")
-      y = tf.constant(3.0)
-      z = x * y
-      qx = tf.constant(4.0)
-      z_new = ed.copy(z, {x: qx})
-      self.assertEqual(z_new.eval(), 12.0)
-
-  def test_swap_tensor_placeholder(self):
-    with self.test_session() as sess:
-      x = tf.constant(2.0)
-      y = tf.constant(3.0)
-      z = x * y
-      qx = tf.placeholder(tf.float32, name="CustomName")
-      z_new = ed.copy(z, {x: qx})
-      self.assertEqual(sess.run(z_new, feed_dict={qx: 4.0}), 12.0)
-
-  def test_swap_variable_tensor(self):
-    with self.test_session():
-      x = tf.Variable(2.0, name="CustomName")
-      y = tf.constant(3.0)
-      z = x * y
-      qx = tf.constant(4.0)
-      z_new = ed.copy(z, {x: qx})
-      tf.variables_initializer([x]).run()
-      self.assertEqual(z_new.eval(), 12.0)
-
-  def test_swap_tensor_variable(self):
-    with self.test_session() as sess:
-      x = tf.constant(2.0)
-      y = tf.constant(3.0)
-      z = x * y
-      qx = tf.Variable(4.0, name="CustomName")
-      z_new = ed.copy(z, {x: qx})
-      tf.variables_initializer([qx]).run()
-      self.assertEqual(z_new.eval(), 12.0)
-
-  def test_swap_rv_rv(self):
-    with self.test_session():
-      ed.set_seed(325135)
-      x = Normal(0.0, 0.1)
-      y = tf.constant(1.0)
-      z = x * y
-      qx = Normal(10.0, 0.1)
-      z_new = ed.copy(z, {x: qx})
-      self.assertGreater(z_new.eval(), 5.0)
-
-  def test_swap_rv_tensor(self):
-    with self.test_session():
-      ed.set_seed(289362)
-      x = Normal(0.0, 0.1)
-      y = tf.constant(1.0)
-      z = x * y
-      qx = Normal(10.0, 0.1)
-      z_new = ed.copy(z, {x: qx.value})
-      self.assertGreater(z_new.eval(), 5.0)
-
-  def test_swap_tensor_rv(self):
-    with self.test_session():
-      ed.set_seed(95258)
-      x = Normal(0.0, 0.1)
-      y = tf.constant(1.0)
-      z = x * y
-      qx = Normal(10.0, 0.1)
-      z_new = ed.copy(z, {x.value: qx})
-      self.assertGreater(z_new.eval(), 5.0)
-
-  def test_ordering_rv_tensor(self):
-    # Check that random variables are copied correctly in dependency
-    # structure.
-    with self.test_session() as sess:
-      ed.set_seed(12432)
-      x = Bernoulli(logits=0.0)
-      y = tf.cast(x, tf.float32)
-      y_new = ed.copy(y)
-      x_new = ed.copy(x)
-      x_new_val, y_new_val = sess.run([x_new, y_new])
-      self.assertEqual(x_new_val, y_new_val)
-
-  def test_ordering_rv_rv(self):
-    # Check that random variables are copied correctly in dependency
-    # structure.
-    with self.test_session() as sess:
-      ed.set_seed(21782)
-      x = Normal(loc=0.0, scale=10.0)
-      x_abs = tf.abs(x)
-      y = Normal(loc=x_abs, scale=1e-8)
-      y_new = ed.copy(y)
-      x_new = ed.copy(x)
-      x_new_val, y_new_val = sess.run([x_new, y_new])
-      self.assertAllClose(abs(x_new_val), y_new_val)
-
-if __name__ == '__main__':
-  tf.test.main()

From 574fbacf8f7377b0b6100c87e3a105d6e7837758 Mon Sep 17 00:00:00 2001
From: Dustin Tran <dustinviettran@gmail.com>
Date: Fri, 19 Jan 2018 14:18:43 -0800
Subject: [PATCH 18/27] add programmable docstrings

---
 edward/inferences/bigan_inference.py |  72 ++++--
 edward/inferences/docstrings.py      | 182 +++++++++++++++
 edward/inferences/gan_inference.py   |  62 ++---
 edward/inferences/inference.py       |  22 +-
 edward/inferences/klpq.py            | 105 ++++-----
 edward/inferences/klqp.py            | 325 ++++++++++++++++-----------
 edward/inferences/klqp_implicit.py   |  96 +++++---
 edward/inferences/laplace.py         |  57 ++---
 edward/inferences/map.py             |  78 ++++---
 edward/inferences/wake_sleep.py      |  72 ++++--
 edward/inferences/wgan_inference.py  |  68 +++---
 11 files changed, 759 insertions(+), 380 deletions(-)
 create mode 100644 edward/inferences/docstrings.py

diff --git a/edward/inferences/bigan_inference.py b/edward/inferences/bigan_inference.py
index 9a9effa75..b9ac408c4 100644
--- a/edward/inferences/bigan_inference.py
+++ b/edward/inferences/bigan_inference.py
@@ -5,47 +5,77 @@
 import six
 import tensorflow as tf
 
+from edward.inferences import docstrings as doc
 from edward.inferences.inference import call_function_up_to_args
 from edward.models.core import Trace
 
 
-def bigan_inference(model, variational, discriminator, align_data,
-                    align_latent, collections=None, *args, **kwargs):
+@doc.set_doc(
+    args=(doc.arg_model +
+          doc.arg_variational +
+          doc.arg_discriminator +
+          doc.arg_align_latent +
+          doc.arg_align_data +
+          doc.arg_collections +
+          doc.arg_args_kwargs)[:-1],
+    returns=doc.return_loss_loss_d,
+    notes_discriminator_scope=doc.notes_discriminator_scope,
+    notes_regularization_losses=doc.notes_regularization_losses)
+def bigan_inference(model, variational, discriminator, align_latent,
+                    align_data, collections=None, *args, **kwargs):
   """Adversarially Learned Inference [@dumuolin2017adversarially] or
   Bidirectional Generative Adversarial Networks [@donahue2017adversarial]
   for joint learning of generator and inference networks.
 
+  The function matches a mapping from data to latent variables and a
+  mapping from latent variables to data through a joint discriminator.
+
   Works for the class of implicit (and differentiable) probabilistic
   models. These models do not require a tractable density and assume
   only a program that generates samples.
 
-  #### Notes
+  Args:
+  @{args}
+
+  `align_latent` must only align one random variable in `model` and
+  `variational`. `model` must return the generated data. `variational`
+  assumes a random variable output and not an implicit density (or at
+  least recorded on trace).
 
-  `BiGANInference` matches a mapping from data to latent variables and a
-  mapping from latent variables to data through a joint
-  discriminator.
+  Returns:
+  @{returns}
 
-  In building the computation graph for inference, the
-  discriminator's parameters can be accessed with the variable scope
-  "Disc".
-  In building the computation graph for inference, the
-  encoder and decoder parameters can be accessed with the variable scope
-  "Gen".
+  #### Notes
+
+  @{notes_discriminator_scope}
 
-  The objective function also adds to itself a summation over all tensors
-  in the `REGULARIZATION_LOSSES` collection.
+  @{notes_regularization_losses}
 
   #### Examples
 
   ```python
-  with tf.variable_scope("Gen"):
-    xf = gen_data(z_ph)
-    zf = gen_latent(x_ph)
-  inference = ed.BiGANInference({z_ph: zf}, {xf: x_ph}, discriminator)
+  def model():
+    z = Normal(loc=0.0, scale=1.0, sample_shape=[256, 25], name="z")
+    x = generative_network(z, name="x")
+    return x
+
+  def variational(x_data):
+    net = tf.layers.dense(x_data, 25 * 2)
+    qz = Normal(loc=net[:, :25],
+                scale=tf.nn.softplus(net[:, 25:]),
+                sample_shape=[256,],
+                name="qz")
+
+  def discriminator(x):
+    net = tf.layers.dense(x, 256, activation=tf.nn.relu)
+    return tf.layers.dense(net, 1, activation=tf.sigmoid)
+
+  loss, loss_d = ed.bigan_inference(
+      model, variational, discriminator,
+      align_latent=lambda name: "qz" if name == "z" else None,
+      align_data=lambda name: "x_data" if name == "x" else None,
+      x_data=x_data)
   ```
-
-  `align_latent` must only align one random variable in `model` and
-  `variational`. `model` must return the generated data.
   """
   with Trace() as posterior_trace:
     call_function_up_to_args(variational, *args, **kwargs)
diff --git a/edward/inferences/docstrings.py b/edward/inferences/docstrings.py
new file mode 100644
index 000000000..bad0f1b99
--- /dev/null
+++ b/edward/inferences/docstrings.py
@@ -0,0 +1,182 @@
+"""Programmable docstrings.
+
+The args below represent a global vocabulary of arguments shared
+across at least two inference algorithms. They are sorted
+alphabetically. They are also written with newlines at the end such
+that they can be easily added together. After composing args
+docstrings, remove the last newline.
+"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import re
+import six
+import sys
+
+
+def set_doc(**kwargs):
+  """Decorator replacing each `@{key}` in a docstring with `kwargs[key]`."""
+  def _update(cls_or_fn):
+    # Strip the docstring's indentation (via `trim`) before substituting.
+    doc = trim(cls_or_fn.__doc__)
+    for k, v in six.iteritems(kwargs):
+      # Capture each @{k} reference to replace with v.
+      # We wrap the replacement in a function so no backslash escapes
+      # are processed.
+      pattern = r'@\{' + str(k) + r'\}'
+      doc = re.sub(pattern, lambda match: v, doc)
+    cls_or_fn.__doc__ = doc
+    return cls_or_fn
+  return _update
+
+
+def trim(docstring):
+  """Trims docstring indentation per PEP 257; returns '' for empty input."""
+  if not docstring:
+    return ''
+  # Expand tabs to spaces and split into a list of lines:
+  lines = docstring.expandtabs().splitlines()
+  # Determine minimum indentation (first line doesn't count).
+  # sys.maxsize is the sentinel; sys.maxint exists only on Python 2.
+  indent = sys.maxsize
+  for line in lines[1:]:
+    stripped = line.lstrip()
+    if stripped:
+      indent = min(indent, len(line) - len(stripped))
+  # Remove indentation (first line is special):
+  trimmed = [lines[0].strip()]
+  if indent < sys.maxsize:
+    for line in lines[1:]:
+      trimmed.append(line[indent:].rstrip())
+  # Strip off trailing and leading blank lines:
+  while trimmed and not trimmed[-1]:
+    trimmed.pop()
+  while trimmed and not trimmed[0]:
+    trimmed.pop(0)
+  # Return a single string:
+  return '\n'.join(trimmed)
+
+
+arg_align_data = """
+  align_data: function of string, aligning `model` observed
+    variables with data. It takes a model variable's name as input
+    and returns an integer, indexing `args`, or key, indexing
+    `kwargs`. Other inputs must return None.
+"""[1:]
+arg_align_latent = """
+  align_latent: function of string, aligning `model` latent
+    variables with `variational`. It takes a model variable's name
+    as input and returns a string, indexing `variational`'s trace.
+    Other inputs must return None.
+"""[1:]
+arg_args_kwargs = """
+  args, kwargs: data inputs. `kwargs`' keys are directly the argument
+    keys in `model` (and if present, `variational`). Data inputs are
+    passed at compile-time in TF's Graph mode or runtime in TF's Eager
+    mode.
+"""[1:]
+arg_auto_transform = """
+  auto_transform:
+"""[1:]
+arg_collections = """
+  collections:
+"""[1:]
+arg_discriminator = """
+  discriminator: function.
+    Function (with parameters) to discriminate samples. It should
+    output logit probabilities (real-valued) and not probabilities
+    in $[0, 1]$.
+"""[1:]
+arg_kl_scaling = """
+  kl_scaling: function of string, aligning `model` latent
+    variables with KL scale factors. This provides option to scale
+    terms when using ELBO with KL divergence. If the KL divergence
+    terms are
+
+    $\\alpha_p \\mathbb{E}_{q(z\\mid x, \\lambda)} [
+          \\log q(z\\mid x, \\lambda) - \\log p(z)],$
+
+    then pass {$p(z)$: $\\alpha_p$} as `kl_scaling`,
+    where $\\alpha_p$ is a tensor. Its shape must be broadcastable;
+    it is multiplied element-wise to the batchwise KL terms.
+"""[1:]
+arg_model = """
+  model: function whose inputs are a subset of `args` (e.g., for
+    discriminative). Output is not used.
+    TODO auto_transform docstring
+    Collection of random variables to perform inference on.
+    If list, each random variable will be implicitly optimized using
+    a `Normal` random variable that is defined internally with a
+    free parameter per location and scale and is initialized using
+    standard normal draws. The random variables to approximate must
+    be continuous.
+    TODO note above only applicable to variational(?) inferences
+"""[1:]
+arg_n_samples = """
+  n_samples: int.
+    Number of samples from variational model for calculating
+    stochastic gradients.
+"""[1:]
+arg_scale = """
+  scale: function of string, aligning `model` observed
+    variables with scale factors. It takes a model variable's name
+    as input and returns a scale factor; else 1.0. The scale
+    factor's shape must be broadcastable; it is multiplied
+    element-wise to the random variable. For example, this is useful
+    for mini-batch scaling when inferring global variables, or
+    applying masks on a random variable.
+"""[1:]
+arg_variational = """
+  variational: function whose inputs are a subset of `args` (e.g.,
+    for amortized). Output is not used.
+"""[1:]
+notes_conditional_inference_samples = """
+In conditional inference, we infer $z$ in $p(z, \\beta
+\\mid x)$ while fixing inference over $\\beta$ using another
+distribution $q(\\beta)$. During gradient calculation, instead
+of using the model's density
+
+$\\log p(x, z^{(s)}), z^{(s)} \\sim q(z; \\lambda),$
+
+for each sample $s=1,\\ldots,S$, this function uses
+
+$\\log p(x, z^{(s)}, \\beta^{(s)}),$
+
+where $z^{(s)} \\sim q(z; \\lambda)$ and $\\beta^{(s)}
+\\sim q(\\beta)$.
+"""[1:-1]
+notes_discriminator_scope = """
+In building the computation graph for inference, the
+discriminator's parameters can be accessed with the variable scope
+"Disc".
+"""[1:-1]
+notes_model_parameters = """
+The function also enables optimizing model parameters $p(z \\mid x;
+\\theta)$. It does this by variational EM, maximizing
+
+$\\mathbb{E}_{q(z; \\lambda)} [ \\log p(x, z; \\theta) ]$
+
+with respect to $\\theta$.
+"""[1:-1]
+notes_regularization_losses = """
+The objective function also adds to itself a summation over all
+tensors in the `REGULARIZATION_LOSSES` collection.
+"""[1:-1]
+return_loss = """
+  Scalar tf.Tensor representing the loss. Its automatic
+  differentiation is the gradient to follow for optimization.
+"""[1:-1]
+return_loss_loss_d = """
+  Pair of scalar tf.Tensors, representing the generative loss and
+  discriminative loss respectively.
+"""[1:-1]
+return_loss_surrogate_loss = """
+  Pair of scalar tf.Tensors, representing the loss and surrogate loss
+  respectively. The surrogate loss' automatic differentiation is the
+  gradient to follow for optimization.
+"""[1:-1]
+return_surrogate_loss = """
+  Scalar tf.Tensor representing the surrogate loss. Its automatic
+  differentiation is the gradient to follow for optimization.
+"""[1:-1]
diff --git a/edward/inferences/gan_inference.py b/edward/inferences/gan_inference.py
index 0713f2813..ab4566f7c 100644
--- a/edward/inferences/gan_inference.py
+++ b/edward/inferences/gan_inference.py
@@ -5,9 +5,19 @@
 import six
 import tensorflow as tf
 
+from edward.inferences import docstrings as doc
 from edward.inferences.inference import call_function_up_to_args
 
 
+@doc.set_doc(
+    args=(doc.arg_model +
+          doc.arg_discriminator +
+          doc.arg_align_data +
+          doc.arg_collections +
+          doc.arg_args_kwargs)[:-1],
+    returns=doc.return_loss_loss_d,
+    notes_discriminator_scope=doc.notes_discriminator_scope,
+    notes_regularization_losses=doc.notes_regularization_losses)
 def gan_inference(model, discriminator, align_data,
                   collections=None, *args, **kwargs):
   """Parameter estimation with GAN-style training
@@ -17,46 +27,38 @@ def gan_inference(model, discriminator, align_data,
   models. These models do not require a tractable density and assume
   only a program that generates samples.
 
-  #### Notes
+  Args:
+  @{args}
+
+  `model` must return the generated data.
 
-  `GANInference` does not support latent variable inference. Note
-  that GAN-style training also samples from the prior: this does not
-  work well for latent variables that are shared across many data
-  points (global variables).
+  Returns:
+  @{returns}
 
-  In building the computation graph for inference, the
-  discriminator's parameters can be accessed with the variable scope
-  "Disc".
+  #### Notes
 
-  GANs also only work for one observed random variable in `data`.
+  @{notes_discriminator_scope}
 
-  The objective function also adds to itself a summation over all
-  tensors in the `REGULARIZATION_LOSSES` collection.
+  @{notes_regularization_losses}
 
   #### Examples
 
   ```python
-  z = Normal(loc=tf.zeros([100, 10]), scale=tf.ones([100, 10]))
-  x = generative_network(z)
-
-  inference = ed.GANInference({x: x_data}, discriminator)
+  def model():
+    z = Normal(loc=0.0, scale=1.0, sample_shape=[256, 25])
+    x = generative_network(z, name="x")
+    return x
+
+  def discriminator(x):
+    net = tf.layers.dense(x, 256, activation=tf.nn.relu)
+    return tf.layers.dense(net, 1, activation=tf.sigmoid)
+
+  loss, loss_d = ed.gan_inference(
+      model, discriminator,
+      align_data=lambda name: "x_data" if name == "x" else None,
+      x_data=x_data)
   ```
   """
-  """Create an inference algorithm.
-
-  Args:
-    data: dict.
-      Data dictionary which binds observed variables (of type
-      `RandomVariable` or `tf.Tensor`) to their realizations (of
-      type `tf.Tensor`).  It can also bind placeholders (of type
-      `tf.Tensor`) used in the model to their realizations.
-    discriminator: function.
-      Function (with parameters) to discriminate samples. It should
-      output logit probabilities (real-valued) and not probabilities
-      in $[0, 1]$.
-
-  `model` must return the generated data.
-  """
   x_fake = call_function_up_to_args(model, *args, **kwargs)
   key = align_data(x_fake.name.split(':')[0])
   if isinstance(key, int):
diff --git a/edward/inferences/inference.py b/edward/inferences/inference.py
index 92f14cc30..d74668fcb 100644
--- a/edward/inferences/inference.py
+++ b/edward/inferences/inference.py
@@ -107,22 +107,22 @@ def train(model, inference=None,
   4. Run finalize (post-training) ops.
 
   Args:
-    n_iter: int, optional.
+    n_iter: int.
       Number of iterations for algorithm when calling `run()`.
       Alternatively if controlling inference manually, it is the
       expected number of calls to `update()`; this number determines
       tracking information during the print progress.
-    n_print: int, optional.
+    n_print: int.
       Number of iterations for each print progress. To suppress print
       progress, then specify 0. Default is `int(n_iter / 100)`.
-    logdir: str, optional.
+    logdir: str.
       Directory where event file will be written. For details,
       see `tf.summary.FileWriter`. Default is to log nothing.
-    log_timestamp: bool, optional.
+    log_timestamp: bool.
       If True (and `logdir` is specified), create a subdirectory of
       `logdir` to save the specific run results. The subdirectory's
       name is the current UTC timestamp with format 'YYYYMMDD_HHMMSS'.
-    variables: list, optional.
+    variables: list.
       A list of TensorFlow variables to initialize during inference.
       Default is to initialize all variables (this includes
       reinitializing variables that were already initialized). To
@@ -191,7 +191,7 @@ def _summary_variables(latent_vars=None, data=None, variables=None,
   the variable has scalar shape; otherwise forms a `tf.summary.histogram`.
 
   Args:
-    variables: list, optional.
+    variables: list.
       Specifies the list of variables to log after each `n_print`
       steps. If None, will log all variables. If `[]`, no variables
       will be logged.
@@ -229,16 +229,16 @@ def _optimize(loss, grads_and_vars, collections=None, var_list=None,
   grads_and_vars.
 
   Args:
-    optimizer: str or tf.train.Optimizer, optional.
+    optimizer: str or tf.train.Optimizer.
       A TensorFlow optimizer, to use for optimizing the variational
       objective. Alternatively, one can pass in the name of a
       TensorFlow optimizer, and default parameters for the optimizer
       will be used.
-    use_prettytensor: bool, optional.
+    use_prettytensor: bool.
       `True` if aim to use PrettyTensor optimizer (when using
       PrettyTensor) or `False` if aim to use TensorFlow optimizer.
       Defaults to TensorFlow.
-    global_step: tf.Variable, optional.
+    global_step: tf.Variable.
       A TensorFlow variable to hold the global step.
   """
   if collections is not None:
@@ -311,7 +311,7 @@ def _default_update(progbar, n_print, summarize=None, train_writer=None,
   Args:
     args: things like `loss`
     kwargs: things like 'feed_dict'
-    feed_dict: dict, optional.
+    feed_dict: dict.
       Feed dictionary for a TensorFlow session run. It is used to feed
       placeholders that are not fed during initialization.
 
@@ -347,7 +347,7 @@ def _gan_update(train_op, train_op_d, n_print, summarize=None,
   """Run one iteration of optimization.
 
   Args:
-    variables: str, optional.
+    variables: str.
       Which set of variables to update. Either "Disc" or "Gen".
       Default is both.
 
diff --git a/edward/inferences/klpq.py b/edward/inferences/klpq.py
index e595d1790..965c35d56 100644
--- a/edward/inferences/klpq.py
+++ b/edward/inferences/klpq.py
@@ -5,6 +5,7 @@
 import six
 import tensorflow as tf
 
+from edward.inferences import docstrings as doc
 from edward.inferences.inference import (
     call_function_up_to_args, make_intercept)
 from edward.models.core import Trace
@@ -15,81 +16,85 @@
   raise ImportError("{0}. Your TensorFlow version is not supported.".format(e))
 
 
+@doc.set_doc(
+    args=(doc.arg_model +
+          doc.arg_variational +
+          doc.arg_align_latent +
+          doc.arg_align_data +
+          doc.arg_scale +
+          doc.arg_n_samples +
+          doc.arg_auto_transform +
+          doc.arg_collections +
+          doc.arg_args_kwargs)[:-1],  # drop the trailing newline
+    returns=doc.return_loss_surrogate_loss,
+    notes_model_parameters=doc.notes_model_parameters,
+    notes_conditional_inference=doc.notes_conditional_inference_samples,
+    notes_regularization_losses=doc.notes_regularization_losses)
 def klpq(model, variational, align_latent, align_data,
          scale=lambda name: 1.0, n_samples=1, auto_transform=True,
          collections=None, *args, **kwargs):
   """Variational inference with the KL divergence
 
-  $\\text{KL}( p(z \mid x) \| q(z) ).$
+  $\\text{KL}( p(z \mid x) \| q(z) )
+    = \mathbb{E}_{p(z \mid x)} [ \log p(z \mid x) - \log q(z; \lambda) ]$.
 
-  To perform the optimization, this class uses a technique from
+  To perform the optimization, this function uses a technique from
   adaptive importance sampling [@oh1992adaptive].
 
-  #### Notes
-
-  `KLpq` also optimizes any model parameters $p(z\mid x;
-  \\theta)$. It does this by variational EM, maximizing
-
-  $\mathbb{E}_{p(z \mid x; \lambda)} [ \log p(x, z; \\theta) ]$
+  The loss function can be estimated up to a constant as
 
-  with respect to $\\theta$.
-
-  In conditional inference, we infer $z` in $p(z, \\beta
-  \mid x)$ while fixing inference over $\\beta$ using another
-  distribution $q(\\beta)$. During gradient calculation, instead
-  of using the model's density
+  $\sum_{s=1}^S [
+    w_{\\text{norm}}(z^s; \lambda) (\log p(x, z^s) - \log q(z^s; \lambda)) ],$
 
-  $\log p(x, z^{(s)}), z^{(s)} \sim q(z; \lambda),$
+  where for $z^s \sim q(z; \lambda)$,
 
-  for each sample $s=1,\ldots,S$, `KLpq` uses
+  $w_{\\text{norm}}(z^s; \lambda) =
+        w(z^s; \lambda) / \sum_{s=1}^S w(z^s; \lambda)$
 
-  $\log p(x, z^{(s)}, \\beta^{(s)}),$
+  normalizes the importance weights, $w(z^s; \lambda) = p(x,
+  z^s) / q(z^s; \lambda)$.
 
-  where $z^{(s)} \sim q(z; \lambda)$ and$\\beta^{(s)}
-  \sim q(\\beta)$.
+  This provides a gradient,
 
-  The objective function also adds to itself a summation over all
-  tensors in the `REGULARIZATION_LOSSES` collection.
-  """
-  """Create an inference algorithm.
+  $- \sum_{s=1}^S [
+    w_{\\text{norm}}(z^s; \lambda) \\nabla_{\lambda} \log q(z^s; \lambda) ].$
 
   Args:
-    latent_vars: list of RandomVariable or
-                 dict of RandomVariable to RandomVariable.
-      Collection of random variables to perform inference on. If
-      list, each random variable will be implictly optimized using a
-      `Normal` random variable that is defined internally with a
-      free parameter per location and scale and is initialized using
-      standard normal draws. The random variables to approximate
-      must be continuous.
-    n_samples: int, optional.
-      Number of samples from variational model for calculating
-      stochastic gradients.
-  """
-  """Build loss function
+  @{args}
 
-  $\\text{KL}( p(z \mid x) \| q(z) )
-    = \mathbb{E}_{p(z \mid x)} [ \log p(z \mid x) - \log q(z; \lambda) ]$
+  Returns:
+  @{returns}
 
-  and stochastic gradients based on importance sampling.
+  #### Notes
 
-  The loss function can be estimated as
+  Probabilistic programs may have random variables which vary across
+  executions. The algorithm returns calculations following `n_samples`
+  executions of the model and variational programs.
 
-  $\sum_{s=1}^S [
-    w_{\\text{norm}}(z^s; \lambda) (\log p(x, z^s) - \log q(z^s; \lambda) ],$
+  @{notes_model_parameters}
 
-  where for $z^s \sim q(z; \lambda)$,
+  @{notes_conditional_inference}
 
-  $w_{\\text{norm}}(z^s; \lambda) =
-        w(z^s; \lambda) / \sum_{s=1}^S w(z^s; \lambda)$
+  @{notes_regularization_losses}
 
-  normalizes the importance weights, $w(z^s; \lambda) = p(x,
-  z^s) / q(z^s; \lambda)$.
+  #### Examples
 
-  This provides a gradient,
+  ```python
+  def model():
+    mu = Normal(loc=0.0, scale=1.0, name="mu")
+    x = Normal(loc=mu, scale=1.0, sample_shape=10, name="x")
 
-  $- \sum_{s=1}^S [
-    w_{\\text{norm}}(z^s; \lambda) \\nabla_{\lambda} \log q(z^s; \lambda) ].$
+  def variational():
+    qmu = Normal(loc=tf.get_variable("loc", []),
+                 scale=tf.nn.softplus(tf.get_variable("scale", [])),
+                 name="qmu")
+
+  loss, surrogate_loss = ed.klpq(
+      model, variational,
+      align_latent=lambda name: "qmu" if name == "mu" else None,
+      align_data=lambda name: "x" if name == "x" else None,
+      x=x_data)
+  ```
   """
   p_log_prob = [0.0] * n_samples
   q_log_prob = [0.0] * n_samples
diff --git a/edward/inferences/klqp.py b/edward/inferences/klqp.py
index 0e456d6ed..8267e824a 100644
--- a/edward/inferences/klqp.py
+++ b/edward/inferences/klqp.py
@@ -5,6 +5,7 @@
 import six
 import tensorflow as tf
 
+from edward.inferences import docstrings as doc
 from edward.inferences.inference import (
     call_function_up_to_args, make_intercept)
 from edward.models.core import Trace
@@ -18,6 +19,21 @@
 tfd = tf.contrib.distributions
 
 
+@doc.set_doc(
+    args=(doc.arg_model +
+          doc.arg_variational +
+          doc.arg_align_latent +
+          doc.arg_align_data +
+          doc.arg_scale +
+          doc.arg_n_samples +
+          doc.arg_kl_scaling +
+          doc.arg_auto_transform +
+          doc.arg_collections +
+          doc.arg_args_kwargs)[:-1],
+    returns=doc.return_loss_surrogate_loss,
+    notes_model_parameters=doc.notes_model_parameters,
+    notes_conditional_inference=doc.notes_conditional_inference_samples,
+    notes_regularization_losses=doc.notes_regularization_losses)
 def klqp(model, variational, align_latent, align_data,
          scale=lambda name: 1.0, n_samples=1, kl_scaling=lambda name: 1.0,
          auto_transform=True, collections=None, *args, **kwargs):
@@ -25,92 +41,17 @@ def klqp(model, variational, align_latent, align_data,
 
   $\\text{KL}( q(z; \lambda) \| p(z \mid x) ).$
 
-  This class minimizes the objective by automatically selecting from a
-  variety of black box inference techniques.
-
-  Args:
-    model: function whose inputs are a subset of `args` (e.g., for
-      discriminative). Output is not used.
-      TODO auto_transform docstring
-      Collection of random variables to perform inference on.
-      If list, each random variable will be implictly optimized using
-      a `Normal` random variable that is defined internally with a
-      free parameter per location and scale and is initialized using
-      standard normal draws. The random variables to approximate must
-      be continuous.
-    variational: function whose inputs are a subset of `args` (e.g.,
-      for amortized). Output is not used.
-    align_latent: function of string, aligning `model` latent
-      variables with `variational`. It takes a model variable's name
-      as input and returns a string, indexing `variational`'s trace;
-      else identity.
-    align_data: function of string, aligning `model` observed
-      variables with data. It takes a model variable's name as input
-      and returns an integer, indexing `args`; else identity.
-    scale: function of string, aligning `model` observed
-      variables with scale factors. It takes a model variable's name
-      as input and returns a scale factor; else 1.0. The scale
-      factor's shape must be broadcastable; it is multiplied
-      element-wise to the random variable. For example, this is useful
-      for mini-batch scaling when inferring global variables, or
-      applying masks on a random variable.
-    n_samples: int, optional.
-      Number of samples from variational model for calculating
-      stochastic gradients.
-    kl_scaling: function of string, aligning `model` latent
-      variables with KL scale factors. This provides option to scale
-      terms when using ELBO with KL divergence. If the KL divergence
-      terms are
-
-      $\\alpha_p \mathbb{E}_{q(z\mid x, \lambda)} [
-            \log q(z\mid x, \lambda) - \log p(z)],$
-
-      then pass {$p(z)$: $\\alpha_p$} as `kl_scaling`,
-      where $\\alpha_p$ is a tensor. Its shape must be broadcastable;
-      it is multiplied element-wise to the batchwise KL terms.
-    args: data inputs. It is passed at compile-time in Graph
-      mode or runtime in Eager mode.
-
-  #### Notes
-
-  `KLqp` also optimizes any model parameters $p(z \mid x;
-  \\theta)$. It does this by variational EM, maximizing
-
-  $\mathbb{E}_{q(z; \lambda)} [ \log p(x, z; \\theta) ]$
+  This function returns a loss and surrogate loss
+  [@schulman2015stochastic; @ruiz2016generalized; @ritchie2016deep].
+  The surrogate loss' autodiff automates selection of two black box
+  gradient estimators given a variational factor:
 
-  with respect to $\\theta$.
+  1. score function gradients [@paisley2012variational] with
+     Rao-Blackwellization [@ranganath2014black];
+  2. reparameterization gradients [@kingma2014auto].
 
-  In conditional inference, we infer $z$ in $p(z, \\beta
-  \mid x)$ while fixing inference over $\\beta$ using another
-  distribution $q(\\beta)$. During gradient calculation, instead
-  of using the model's density
-
-  $\log p(x, z^{(s)}), z^{(s)} \sim q(z; \lambda),$
-
-  for each sample $s=1,\ldots,S$, `KLqp` uses
-
-  $\log p(x, z^{(s)}, \\beta^{(s)}),$
-
-  where $z^{(s)} \sim q(z; \lambda)$ and $\\beta^{(s)}
-  \sim q(\\beta)$.
-
-  The objective function also adds to itself a summation over all
-  tensors in the `REGULARIZATION_LOSSES` collection.
-
-  ##
-
-  $-\\text{ELBO} =
-      -\mathbb{E}_{q(z; \lambda)} [ \log p(x, z) - \log q(z; \lambda) ]$
-
-  KLqp supports
-
-  1. score function gradients [@paisley2012variational]
-  2. reparameterization gradients [@kingma2014auto]
-
-  of the loss function.
-
-  If the KL divergence between the variational model and the prior
-  is tractable, then the loss function can be written as
+  If the KL divergence between a variational factor and its aligned
+  prior is tractable, then the loss function can be written as
 
   $-\mathbb{E}_{q(z; \lambda)}[\log p(x \mid z)] +
       \\text{KL}( q(z; \lambda) \| p(z) ),$
@@ -119,24 +60,48 @@ def klqp(model, variational, align_latent, align_data,
   compute this automatically when $p(z)$ and $q(z; \lambda)$ are
   Normal.
 
-  This class minimizes the objective using the score function gradient
-  and Rao-Blackwellization [@ranganath2014black].
+  Current Rao-Blackwellization is limited to Rao-Blackwellizing across
+  stochastic nodes in the computation graph. It does not
+  Rao-Blackwellize within a node such as when a node represents
+  multiple random variables via non-scalar batch shape.
+  Rao-Blackwellization is performed at runtime for each sample.
 
-  Computed by sampling from :math:`q(z;\lambda)` and evaluating the
-  expectation using Monte Carlo sampling and Rao-Blackwellization.
+  Args:
+  @{args}
 
-  The implementation takes the surrogate loss approach. See
-  @schulman2015stochastic; @ruiz2016generalized; @ritchie2016deep.
+  Returns:
+  @{returns}
 
   #### Notes
 
-  Current Rao-Blackwellization is limited to Rao-Blackwellizing across
-  stochastic nodes in the computation graph. It does not
-  Rao-Blackwellize within a node such as when a node represents
-  multiple random variables via non-scalar batch shape.
+  Probabilistic programs may have random variables which vary across
+  executions. The algorithm returns calculations following `n_samples`
+  executions of the model and variational programs.
+
+  @{notes_model_parameters}
+
+  @{notes_conditional_inference}
+
+  @{notes_regularization_losses}
+
+  #### Examples
+
+  ```python
+  def model():
+    mu = Normal(loc=0.0, scale=1.0, name="mu")
+    x = Normal(loc=mu, scale=1.0, sample_shape=10, name="x")
+
+  def variational():
+    qmu = Normal(loc=tf.get_variable("loc", []),
+                 scale=tf.nn.softplus(tf.get_variable("scale", [])),
+                 name="qmu")
 
-  The objective function also adds to itself a summation over all
-  tensors in the `REGULARIZATION_LOSSES` collection.
+  loss, surrogate_loss = ed.klqp(
+      model, variational,
+      align_latent=lambda name: "qmu" if name == "mu" else None,
+      align_data=lambda name: "x" if name == "x" else None,
+      x=x_data)
+  ```
   """
   # TODO control variates
   # + baseline, learnable baseline
@@ -215,6 +180,20 @@ def klqp(model, variational, align_latent, align_data,
   return loss, surrogate_loss
 
 
+@doc.set_doc(
+    args=(doc.arg_model +
+          doc.arg_variational +
+          doc.arg_align_latent +
+          doc.arg_align_data +
+          doc.arg_scale +
+          doc.arg_n_samples +
+          doc.arg_auto_transform +
+          doc.arg_collections +
+          doc.arg_args_kwargs)[:-1],
+    returns=doc.return_loss,
+    notes_model_parameters=doc.notes_model_parameters,
+    notes_conditional_inference=doc.notes_conditional_inference_samples,
+    notes_regularization_losses=doc.notes_regularization_losses)
 def klqp_reparameterization(model, variational, align_latent, align_data,
                             scale=lambda name: 1.0, n_samples=1,
                             auto_transform=True, collections=None,
@@ -223,14 +202,8 @@ def klqp_reparameterization(model, variational, align_latent, align_data,
 
   $\\text{KL}( q(z; \lambda) \| p(z \mid x) ).$
 
-  This class minimizes the objective using the reparameterization
-  gradient.
-
-  The objective function also adds to itself a summation over all
-  tensors in the `REGULARIZATION_LOSSES` collection.
-
-  Build loss function equal to KL(q||p) up to a constant. Its
-  automatic differentiation is a stochastic gradient of
+  This function builds a loss function equal to KL(q||p) up to a
+  constant. Its automatic differentiation is a stochastic gradient of
 
   $-\\text{ELBO} =
       -\mathbb{E}_{q(z; \lambda)} [ \log p(x, z) - \log q(z; \lambda) ]$
@@ -240,11 +213,42 @@ def klqp_reparameterization(model, variational, align_latent, align_data,
   Computed by sampling from $q(z;\lambda)$ and evaluating the
   expectation using Monte Carlo sampling.
 
-  Note if user defines constrained posterior, then auto_transform
-  can do inference on real-valued; then test time user can use
-  constrained. If user defines unconstrained posterior, then how to
-  work with constrained at test time? For now, user must manually
-  write the bijectors according to transform.
+  Args:
+  @{args}
+
+  Returns:
+  @{returns}
+
+  #### Notes
+
+  Probabilistic programs may have random variables which vary across
+  executions. The algorithm returns calculations following `n_samples`
+  executions of the model and variational programs.
+
+  @{notes_model_parameters}
+
+  @{notes_conditional_inference}
+
+  @{notes_regularization_losses}
+
+  #### Examples
+
+  ```python
+  def model():
+    mu = Normal(loc=0.0, scale=1.0, name="mu")
+    x = Normal(loc=mu, scale=1.0, sample_shape=10, name="x")
+
+  def variational():
+    qmu = Normal(loc=tf.get_variable("loc", []),
+                 scale=tf.nn.softplus(tf.get_variable("scale", [])),
+                 name="qmu")
+
+  loss = ed.klqp_reparameterization(
+      model, variational,
+      align_latent=lambda name: "qmu" if name == "mu" else None,
+      align_data=lambda name: "x" if name == "x" else None,
+      x=x_data)
+  ```
   """
   p_log_prob = [0.0] * n_samples
   q_log_prob = [0.0] * n_samples
@@ -303,19 +307,11 @@ def klqp_reparameterization_kl(model, variational, align_latent, align_data,
 
   $\\text{KL}( q(z; \lambda) \| p(z \mid x) ).$
 
-  This class minimizes the objective using the reparameterization
-  gradient and an analytic KL term.
-
-  The objective function also adds to itself a summation over all
-  tensors in the `REGULARIZATION_LOSSES` collection.
-
-  Build loss function. Its automatic differentiation
-  is a stochastic gradient of
-
-  .. math::
+  This function builds a loss function equal to KL(q||p) up to a
+  constant. Its automatic differentiation is a stochastic gradient of
 
-    -\\text{ELBO} =  - ( \mathbb{E}_{q(z; \lambda)} [ \log p(x \mid z) ]
-          + \\text{KL}(q(z; \lambda) \| p(z)) )
+  $-\\text{ELBO} =
+      -\mathbb{E}_{q(z; \lambda)} [ \log p(x, z) - \log q(z; \lambda) ]$
 
   based on the reparameterization trick [@kingma2014auto].
 
@@ -323,6 +319,43 @@ def klqp_reparameterization_kl(model, variational, align_latent, align_data,
 
   Computed by sampling from $q(z;\lambda)$ and evaluating the
   expectation using Monte Carlo sampling.
+
+  Args:
+  @{args}
+
+  Returns:
+  @{returns}
+
+  #### Notes
+
+  Probabilistic programs may have random variables which vary across
+  executions. The algorithm returns calculations following `n_samples`
+  executions of the model and variational programs.
+
+  @{notes_model_parameters}
+
+  @{notes_conditional_inference}
+
+  @{notes_regularization_losses}
+
+  #### Examples
+
+  ```python
+  def model():
+    mu = Normal(loc=0.0, scale=1.0, name="mu")
+    x = Normal(loc=mu, scale=1.0, sample_shape=10, name="x")
+
+  def variational():
+    qmu = Normal(loc=tf.get_variable("loc", []),
+                 scale=tf.nn.softplus(tf.get_variable("scale", [])),
+                 name="qmu")
+
+  loss = ed.klqp_reparameterization_kl(
+      model, variational,
+      align_latent=lambda name: "qmu" if name == "mu" else None,
+      align_data=lambda name: "x" if name == "x" else None,
+      x=x_data)
+  ```
   """
   p_log_lik = [0.0] * n_samples
   for s in range(n_samples):
@@ -381,18 +414,54 @@ def klqp_score(model, variational, align_latent, align_data,
 
   $\\text{KL}( q(z; \lambda) \| p(z \mid x) ).$
 
-  This class minimizes the objective using the score function
-  gradient.
+  This function builds a loss function equal to KL(q||p) up to a
+  constant. It also builds a surrogate loss whose automatic
+  differentiation is a stochastic gradient of
 
-  Build loss function equal to KL(q||p) up to a constant. It
-  returns an surrogate loss function whose automatic differentiation
-  is based on the score function estimator [@paisley2012variational].
+  $-\\text{ELBO} =
+      -\mathbb{E}_{q(z; \lambda)} [ \log p(x, z) - \log q(z; \lambda) ]$
+
+  based on the score function estimator [@paisley2012variational].
 
   Computed by sampling from $q(z;\lambda)$ and evaluating the
   expectation using Monte Carlo sampling.
 
-  The objective function also adds to itself a summation over all
-  tensors in the `REGULARIZATION_LOSSES` collection.
+  Args:
+  @{args}
+
+  Returns:
+  @{returns}
+
+  #### Notes
+
+  Probabilistic programs may have random variables which vary across
+  executions. The algorithm returns calculations following `n_samples`
+  executions of the model and variational programs.
+
+  @{notes_model_parameters}
+
+  @{notes_conditional_inference}
+
+  @{notes_regularization_losses}
+
+  #### Examples
+
+  ```python
+  def model():
+    mu = Normal(loc=0.0, scale=1.0, name="mu")
+    x = Normal(loc=mu, scale=1.0, sample_shape=10, name="x")
+
+  def variational():
+    qmu = Normal(loc=tf.get_variable("loc", []),
+                 scale=tf.nn.softplus(tf.get_variable("scale", [])),
+                 name="qmu")
+
+  loss, surrogate_loss = ed.klqp_score(
+      model, variational,
+      align_latent=lambda name: "qmu" if name == "mu" else None,
+      align_data=lambda name: "x" if name == "x" else None,
+      x=x_data)
+  ```
   """
   p_log_prob = [0.0] * n_samples
   q_log_prob = [0.0] * n_samples
diff --git a/edward/inferences/klqp_implicit.py b/edward/inferences/klqp_implicit.py
index 305372678..aeeb79fd7 100644
--- a/edward/inferences/klqp_implicit.py
+++ b/edward/inferences/klqp_implicit.py
@@ -5,11 +5,24 @@
 import six
 import tensorflow as tf
 
+from edward.inferences import docstrings as doc
 from edward.inferences.inference import (
     call_function_up_to_args, make_intercept)
 from edward.models.core import Trace
 
 
+@doc.set_doc(
+    args_part_one=(doc.arg_model +
+                   doc.arg_variational)[:-1],
+    args_part_two=(doc.arg_align_latent +
+                   doc.arg_align_data)[:-1],
+    args_part_three=(doc.arg_scale +
+                     doc.arg_auto_transform +
+                     doc.arg_collections +
+                     doc.arg_args_kwargs)[:-1],
+    returns=doc.return_loss_loss_d,
+    notes_discriminator_scope=doc.notes_discriminator_scope,
+    notes_regularization_losses=doc.notes_regularization_losses)
 def klqp_implicit(model, variational, discriminator, align_latent,
                   align_data, align_latent_global=lambda name: name,
                   ratio_loss='log', scale=lambda name: 1.0,
@@ -33,23 +46,8 @@ def klqp_implicit(model, variational, discriminator, align_latent,
   random variables (`rv`) satisfies `rv.is_reparameterized` and
   `rv.is_continuous`.
 
-  #### Notes
-
-  Unlike `GANInference`, `discriminator` takes dict's as input,
-  and must subset to the appropriate values through lexical scoping
-  from the previously defined model and latent variables. This is
-  necessary as the discriminator can take an arbitrary set of data,
-  latent, and global variables.
-
-  Note the type for `discriminator`'s output changes when one
-  passes in the `scale` argument to `initialize()`.
-
-  The objective function also adds to itself a summation over all
-  tensors in the `REGULARIZATION_LOSSES` collection.
-  """
-  """Create an inference algorithm.
-
   Args:
+  @{args_part_one}
     discriminator: function.
       Function (with parameters). Unlike `GANInference`, it is
       interpreted as a ratio estimator rather than a discriminator.
@@ -58,23 +56,41 @@ def klqp_implicit(model, variational, discriminator, align_latent,
       discriminators, it can take a batch of data points and local
       variables, of size $M$, and output a vector of length
       $M$.
-    global_vars: dict of RandomVariable to RandomVariable, optional.
+  @{args_part_two}
+    align_latent_global: function of string.
       Identifying which variables in `latent_vars` are global
       variables, shared across data points. These will not be
       encompassed in the ratio estimation problem, and will be
       estimated with tractable variational approximations.
-  """
-  """Initialize inference algorithm. It initializes hyperparameters
-  and builds ops for the algorithm's computation graph.
-
-  Args:
-    ratio_loss: str or fn, optional.
+    ratio_loss: str or fn.
       Loss function minimized to get the ratio estimator. 'log' or 'hinge'.
       Alternatively, one can pass in a function of two inputs,
       `psamples` and `qsamples`, and output a point-wise value
       with shape matching the shapes of the two inputs.
-  """
-  """Build loss function
+  @{args_part_three}
+
+  Unlike `GANInference`, `discriminator` takes dicts as input,
+  and must subset to the appropriate values through lexical scoping
+  from the previously defined model and latent variables. This is
+  necessary as the discriminator can take an arbitrary set of data,
+  latent, and global variables.
+
+  `align_latent` aligns all global and local latents;
+  `align_latent_global` only aligns global latents.
+
+  Returns:
+  @{returns}
+
+  #### Notes
+
+  Note the type for `discriminator`'s output changes when one
+  passes in the `scale` argument.
+
+  @{notes_discriminator_scope}
+
+  @{notes_regularization_losses}
+
+  Build loss function
 
   $-\Big(\mathbb{E}_{q(\\beta)} [\log p(\\beta) - \log q(\\beta) ] +
       \sum_{n=1}^N \mathbb{E}_{q(\\beta)q(z_n\mid\\beta)} [
@@ -97,8 +113,6 @@ def klqp_implicit(model, variational, discriminator, align_latent,
   Gradients are taken using the reparameterization trick
   [@kingma2014auto].
 
-  #### Notes
-
   This also includes model parameters $p(x, z, \\beta; \\theta)$
   and variational distributions with inference networks
   $q(z\mid x)$.
@@ -114,8 +128,32 @@ def klqp_implicit(model, variational, discriminator, align_latent,
     because it complicates the code;
   + analytic KL/swapping out the penalty term for the globals.
 
-  align_latent aligns all global and local latents;
-  align_global_latent only aligns global latents.
+  #### Examples
+
+  ```python
+  def model():
+    z = Normal(loc=0.0, scale=1.0, sample_shape=[256, 25], name="z")
+    x = generative_network(z, name="x")
+    return x
+
+  def variational(x):
+    net = tf.layers.dense(x, 25 * 2)
+    qz = Normal(loc=net[:, :25],
+                scale=tf.nn.softplus(net[:, 25:]),
+                name="qz")
+
+  def ratio_estimator(data, local_vars, global_vars):
+    # concatenated input has shape (batch_size, 28*28 + 25)
+    net = tf.concat([data["x"], local_vars["z"]], 1)
+    net = tf.layers.dense(net, 256, activation=tf.nn.relu)
+    return tf.layers.dense(net, 1, activation=tf.sigmoid)
+
+  loss, loss_d = ed.klqp_implicit(
+      model, variational, ratio_estimator,
+      align_latent=lambda name: "qz" if name == "z" else None,
+      align_data=lambda name: "x" if name == "x" else None,
+      x=x_data)
+  ```
   """
   if callable(ratio_loss):
     ratio_loss = ratio_loss
diff --git a/edward/inferences/laplace.py b/edward/inferences/laplace.py
index 3cb9a64a5..c2a6c70ab 100644
--- a/edward/inferences/laplace.py
+++ b/edward/inferences/laplace.py
@@ -5,6 +5,7 @@
 import six
 import tensorflow as tf
 
+from edward.inferences import docstrings as doc
 from edward.inferences.inference import call_function_up_to_args
 from edward.inferences import docstrings as doc
 from edward.inferences.map import map
@@ -18,6 +19,15 @@
   raise ImportError("{0}. Your TensorFlow version is not supported.".format(e))
 
 
+@doc.set_doc(
+    args=(doc.arg_model +
+          doc.arg_variational +
+          doc.arg_align_latent +
+          doc.arg_align_data +
+          doc.arg_scale +
+          doc.arg_auto_transform +
+          doc.arg_collections +
+          doc.arg_args_kwargs)[:-1])
 def laplace(model, variational, align_latent, align_data,
             scale=lambda name: 1.0, auto_transform=True,
             collections=None, *args, **kwargs):
@@ -26,11 +36,14 @@ def laplace(model, variational, align_latent, align_data,
   It approximates the posterior distribution using a multivariate
   normal distribution centered at the mode of the posterior.
 
-  We implement this by running `MAP` to find the posterior mode.
+  We implement this by running `ed.map` to find the posterior mode.
   This forms the mean of the normal approximation. We then compute the
   inverse Hessian at the mode of the posterior. This forms the
   covariance of the normal approximation.
 
+  Args:
+  @{args}
+
   #### Notes
 
   If `MultivariateNormalDiag` or `Normal` random variables are
@@ -42,8 +55,8 @@ def laplace(model, variational, align_latent, align_data,
   Random variables with both scalar batch and event shape are not
   supported as `tf.hessians` is currently not applicable to scalars.
 
-  Note that `Laplace` finds the location parameter of the normal
-  approximation using `MAP`, which is performed on the latent
+  Note that this function finds the location parameter of the normal
+  approximation using `ed.map`, which is performed on the latent
   variable's original (constrained) support. The scale parameter
   is calculated by evaluating the Hessian of $-\log p(x, z)$ in the
   constrained space and under the mode. This implies the Laplace
@@ -53,30 +66,24 @@ def laplace(model, variational, align_latent, align_data,
   #### Examples
 
   ```python
-  X = tf.placeholder(tf.float32, [N, D])
-  w = Normal(loc=tf.zeros(D), scale=tf.ones(D))
-  y = Normal(loc=ed.dot(X, w), scale=tf.ones(N))
-
-  qw = MultivariateNormalTriL(
-      loc=tf.Variable(tf.random_normal([D])),
-      scale_tril=tf.Variable(tf.random_normal([D, D])))
-
-  inference = ed.Laplace({w: qw}, data={X: X_train, y: y_train})
+  def model(X):
+    w = Normal(loc=tf.zeros(D), scale=tf.ones(D), name="w")
+    y = Normal(loc=tf.tensordot(X, w, [[1], [0]]), scale=tf.ones(N), name="y")
+
+  def variational():
+    qw = MultivariateNormalTriL(
+        loc=tf.Variable(tf.random_normal([D])),
+        scale_tril=tf.Variable(tf.random_normal([D, D])),
+        name="qw")
+
+  loss = ed.laplace(
+      model, variational,
+      align_latent=lambda name: "qw" if name == "w" else None,
+      align_data=lambda name: "y" if name == "y" else None,
+      X=X_data,
+      y=y_data)
   ```
   """
-  """Create an inference algorithm.
-
-  Args:
-    latent_vars: list of RandomVariable or
-                 dict of RandomVariable to RandomVariable.
-      Collection of random variables to perform inference on. If list,
-      each random variable will be implictly optimized using a
-      `MultivariateNormalTriL` random variable that is defined
-      internally with unconstrained support and is initialized using
-      standard normal draws. If dictionary, each random
-      variable must be a `MultivariateNormalDiag`,
-      `MultivariateNormalTriL`, or `Normal` random variable.
-  """
   variational_pointmass = _make_variational_pointmass(
       variational, *args, **kwargs)
   loss = map(model, variational, align_latent, align_data,
diff --git a/edward/inferences/map.py b/edward/inferences/map.py
index f1e457869..76610f4fa 100644
--- a/edward/inferences/map.py
+++ b/edward/inferences/map.py
@@ -5,6 +5,7 @@
 import six
 import tensorflow as tf
 
+from edward.inferences import docstrings as doc
 from edward.inferences.inference import (
     call_function_up_to_args, make_intercept)
 from edward.models.core import Trace
@@ -15,12 +16,23 @@
   raise ImportError("{0}. Your TensorFlow version is not supported.".format(e))
 
 
+@doc.set_doc(
+    args=(doc.arg_model +
+          doc.arg_variational +
+          doc.arg_align_latent +
+          doc.arg_align_data +
+          doc.arg_scale +
+          doc.arg_auto_transform +
+          doc.arg_collections +
+          doc.arg_args_kwargs)[:-1],
+    returns=doc.return_loss,
+    notes_regularization_losses=doc.notes_regularization_losses)
 def map(model, variational, align_latent, align_data,
         scale=lambda name: 1.0, auto_transform=True, collections=None,
         *args, **kwargs):
   """Maximum a posteriori.
 
-  This class implements gradient-based optimization to solve the
+  This function implements gradient-based optimization to solve the
   optimization problem,
 
   $\min_{z} - p(z \mid x).$
@@ -30,13 +42,23 @@ def map(model, variational, align_latent, align_data,
 
   $- \mathbb{E}_{q(z; \lambda)} [ \log p(x, z) ].$
 
+  Args:
+  @{args}
+
+  Returns:
+  @{returns}
+
   #### Notes
 
-  This class is currently restricted to optimization over
+  This function is currently restricted to optimization over
   differentiable latent variables. For example, it does not solve
   discrete optimization.
 
-  This class also minimizes the loss with respect to any model
+  Probabilistic programs may have random variables which vary across
+  executions. The algorithm returns calculations following one
+  execution of the model and variational programs.
+
+  This function also minimizes the loss with respect to any model
   parameters $p(z \mid x; \\theta)$.
 
   In conditional inference, we infer $z$ in $p(z, \\beta
@@ -48,52 +70,34 @@ def map(model, variational, align_latent, align_data,
   marginal density $\log p(x, z)$, and it is exact if
   $q(\\beta) = p(\\beta \mid x)$ (up to stochasticity).
 
+  @{notes_regularization_losses}
+
   #### Examples
 
-  Most explicitly, `MAP` is specified via a dictionary:
+  Most explicitly, this function is specified via a variational
+  program over point masses.
 
   ```python
-  qpi = PointMass(params=ed.to_simplex(tf.Variable(tf.zeros(K-1))))
-  qmu = PointMass(params=tf.Variable(tf.zeros(K*D)))
-  qsigma = PointMass(params=tf.nn.softplus(tf.Variable(tf.zeros(K*D))))
-  ed.MAP({pi: qpi, mu: qmu, sigma: qsigma}, data)
+  def variational():
+    qpi = PointMass(params=to_simplex(tf.Variable(tf.zeros(K-1))),
+                    name="qpi")
+    qmu = PointMass(params=tf.Variable(tf.zeros(K*D)),
+                    name="qmu")
+    qsigma = PointMass(params=tf.nn.softplus(tf.Variable(tf.zeros(K*D))),
+                       name="qsigma")
+
+  loss = ed.map(..., variational, ...)
   ```
 
-  We also automate the specification of `PointMass` distributions,
-  so one can pass in a list of latent variables instead:
-
-  ```python
-  ed.MAP([beta], data)
-  ed.MAP([pi, mu, sigma], data)
-  ```
+  Automatic specification of `PointMass` distributions, so that you
+  need not pass in `variational`, is planned. (TODO not implemented yet.)
 
-  Note that for `MAP` to optimize over latent variables with
+  Note that for this function to optimize over latent variables with
   constrained continuous support, the point mass must be constrained
   to have the same support while its free parameters are
   unconstrained; see, e.g., `qsigma` above. This is different than
   performing MAP on the unconstrained space: in general, the MAP of
   the transform is not the transform of the MAP.
-
-  The objective function also adds to itself a summation over all
-  tensors in the `REGULARIZATION_LOSSES` collection.
-  """
-  """Create an inference algorithm.
-
-  Args:
-    latent_vars: list of RandomVariable or
-                 dict of RandomVariable to RandomVariable.
-      Collection of random variables to perform inference on. If
-      list, each random variable will be implictly optimized using a
-      `PointMass` random variable that is defined internally with
-      constrained support, has unconstrained free parameters, and is
-      initialized using standard normal draws. If dictionary, each
-      value in the dictionary must be a `PointMass` random variable
-      with the same support as the key.
-  """
-  """Build loss function. Its automatic differentiation
-  is the gradient of
-
-  $- \log p(x,z).$
   """
   with Trace() as posterior_trace:
     call_function_up_to_args(variational, *args, **kwargs)
diff --git a/edward/inferences/wake_sleep.py b/edward/inferences/wake_sleep.py
index 2314d513a..132978f6b 100644
--- a/edward/inferences/wake_sleep.py
+++ b/edward/inferences/wake_sleep.py
@@ -5,11 +5,24 @@
 import six
 import tensorflow as tf
 
+from edward.inferences import docstrings as doc
 from edward.inferences.inference import (
     call_function_up_to_args, make_intercept)
 from edward.models.core import Trace
 
 
+@doc.set_doc(
+    args_part_one=(doc.arg_model +
+                   doc.arg_variational +
+                   doc.arg_align_latent +
+                   doc.arg_align_data +
+                   doc.arg_scale +
+                   doc.arg_n_samples)[:-1],
+    args_part_two=(doc.arg_auto_transform +
+                   doc.arg_collections +
+                   doc.arg_args_kwargs)[:-1],
+    notes_conditional_inference=doc.notes_conditional_inference_samples,
+    notes_regularization_losses=doc.notes_regularization_losses)
 def wake_sleep(model, variational, align_latent, align_data,
                scale=lambda name: 1.0, n_samples=1, phase_q='sleep',
                auto_transform=True, collections=None, *args, **kwargs):
@@ -38,35 +51,50 @@ def wake_sleep(model, variational, align_latent, align_data,
   corresponds to minimizing the reverse KL $\\text{KL}(p\|q)$ in
   expectation over the data distribution.
 
+  Args:
+  @{args_part_one}
+    phase_q: str.
+      Phase for updating parameters of q. If 'sleep', update using
+      a sample from p. If 'wake', update using a sample from q.
+      (Unlike reparameterization gradients, the sample is held
+      fixed.)
+  @{args_part_two}
+
+  Returns:
+    Pair of scalar tf.Tensors, representing losses for training p
+    and q respectively.
+
   #### Notes
 
-  In conditional inference, we infer $z$ in $p(z, \\beta
-  \mid x)$ while fixing inference over $\\beta$ using another
-  distribution $q(\\beta)$. During gradient calculation, instead
-  of using the model's density
+  Probabilistic programs may have random variables which vary across
+  executions. The algorithm returns calculations following `n_samples`
+  executions of the model and variational programs.
 
-  $\log p(x, z^{(s)}), z^{(s)} \sim q(z; \lambda),$
+  @{notes_conditional_inference}
 
-  for each sample $s=1,\ldots,S$, `WakeSleep` uses
+  @{notes_regularization_losses}
 
-  $\log p(x, z^{(s)}, \\beta^{(s)}),$
+  #### Examples
 
-  where $z^{(s)} \sim q(z; \lambda)$ and $\\beta^{(s)}
-  \sim q(\\beta)$.
+  ```python
+  def model():
+    z = Normal(loc=0.0, scale=1.0, sample_shape=[256, 25], name="z")
+    net = tf.layers.dense(z, 512, activation=tf.nn.relu)
+    net = tf.layers.dense(net, 28 * 28, activation=None)
+    x = Normal(loc=net, scale=1.0, name="x")
 
-  The objective function also adds to itself a summation over all
-  tensors in the `REGULARIZATION_LOSSES` collection.
-  """
-  """
-  Args:
-    n_samples: int, optional.
-      Number of samples for calculating stochastic gradients during
-      wake and sleep phases.
-    phase_q: str, optional.
-      Phase for updating parameters of q. If 'sleep', update using
-      a sample from p. If 'wake', update using a sample from q.
-      (Unlike reparameterization gradients, the sample is held
-      fixed.)
+  def variational(x):
+    net = tf.layers.dense(x, 25 * 2)
+    qz = Normal(loc=net[:, :25],
+                scale=tf.nn.softplus(net[:, 25:]),
+                name="qz")
+
+  loss_p, loss_q = ed.wake_sleep(
+      model, variational,
+      align_latent=lambda name: "qz" if name == "z" else None,
+      align_data=lambda name: "x" if name == "x" else None,
+      x=x_data)
+  ```
   """
   p_log_prob = [0.0] * n_samples
   q_log_prob = [0.0] * n_samples
diff --git a/edward/inferences/wgan_inference.py b/edward/inferences/wgan_inference.py
index fedc87d85..8ad6ad446 100644
--- a/edward/inferences/wgan_inference.py
+++ b/edward/inferences/wgan_inference.py
@@ -5,9 +5,19 @@
 import six
 import tensorflow as tf
 
+from edward.inferences import docstrings as doc
 from edward.inferences.inference import call_function_up_to_args
 
 
+@doc.set_doc(
+    args_part_one=(doc.arg_model +
+                   doc.arg_discriminator +
+                   doc.arg_align_data)[:-1],
+    args_part_two=(doc.arg_collections +
+                   doc.arg_args_kwargs)[:-1],
+    returns=doc.return_loss_loss_d,
+    notes_discriminator_scope=doc.notes_discriminator_scope,
+    notes_regularization_losses=doc.notes_regularization_losses)
 def wgan_inference(model, discriminator, align_data,
                    penalty=10.0, collections=None, *args, **kwargs):
   """Parameter estimation with GAN-style training
@@ -18,51 +28,55 @@ def wgan_inference(model, discriminator, align_data,
   models. These models do not require a tractable density and assume
   only a program that generates samples.
 
+  Args:
+  @{args_part_one}
+    penalty: float.
+      Scalar value to enforce gradient penalty that ensures the
+      gradients have norm equal to 1 [@gulrajani2017improved]. Set to
+      None (or 0.0) if using no penalty.
+  @{args_part_two}
+
+  `model` must return the generated data.
+
+  Returns:
+  @{returns}
+
+  #### Notes
+
   The original WGAN clips weight parameters of the discriminator as an
   approximation to the 1-Lipschitz constraint. To clip weights, one
   must manually add a clipping op and then call it after each gradient
   update during training. For example:
 
   ```python
-  ... = wgan_inference(data, discriminator, penalty=None)
+  ... = wgan_inference(..., penalty=None)
   var_list = tf.get_collection(
       tf.GraphKeys.TRAINABLE_VARIABLES, scope="Disc")
   clip_op = [w.assign(tf.clip_by_value(w, -0.1, 0.1)) for w in var_list]
   ```
 
-  #### Notes
+  @{notes_discriminator_scope}
 
-  Argument-wise, the only difference from `GANInference` is
-  conceptual: the `discriminator` is better described as a test
-  function or critic. `WGANInference` continues to use
-  `discriminator` only to share methods and attributes with
-  `GANInference`.
-
-  The objective function also adds to itself a summation over all
-  tensors in the `REGULARIZATION_LOSSES` collection.
+  @{notes_regularization_losses}
 
   #### Examples
 
   ```python
-  z = Normal(loc=tf.zeros([100, 10]), scale=tf.ones([100, 10]))
-  x = generative_network(z)
-
-  inference = ed.WGANInference({x: x_data}, discriminator)
+  def model():
+    z = Normal(loc=0.0, scale=1.0, sample_shape=[256, 25])
+    x = generative_network(z, name="x")
+    return x
+
+  def discriminator(x):
+    net = tf.layers.dense(x, 256, activation=tf.nn.relu)
+    return tf.layers.dense(net, 1, activation=tf.sigmoid)
+
+  loss, loss_d = ed.wgan_inference(
+      model, discriminator,
+      align_data=lambda name: "x_data" if name == "x" else None,
+      x_data=x_data)
   ```
   """
-  """Initialize inference algorithm. It initializes hyperparameters
-  and builds ops for the algorithm's computation graph.
-
-  Args:
-    penalty: float, optional.
-      Scalar value to enforce gradient penalty that ensures the
-      gradients have norm equal to 1 [@gulrajani2017improved]. Set to
-      None (or 0.0) if using no penalty.
-    clip: float, optional.
-      Value to clip weights by. Default is no clipping.
-
-  `model` must return the generated data.
-  """
   x_fake = call_function_up_to_args(model, *args, **kwargs)
   key = align_data(x_fake.name.split(':')[0])
   if isinstance(key, int):

From d3f80350a6b50afbce4d93c6e35605a1af297608 Mon Sep 17 00:00:00 2001
From: Dustin Tran <dustinviettran@gmail.com>
Date: Sat, 20 Jan 2018 12:31:06 -0800
Subject: [PATCH 19/27] extend {gibbs,hmc,metropolis_hastings,sghmc,sgld}.py to
 trace

---
 docs/tex/bib.bib                         |  14 +
 edward/__init__.py                       |  15 +-
 edward/inferences/__init__.py            |  20 +-
 edward/inferences/docstrings.py          |  85 +++++-
 edward/inferences/gibbs.py               | 152 ----------
 edward/inferences/hmc.py                 | 305 +++++++------------
 edward/inferences/inference.py           |  10 +
 edward/inferences/metropolis_hastings.py | 288 +++++++++---------
 edward/inferences/monte_carlo.py         | 102 -------
 edward/inferences/sghmc.py               | 345 ++++++++++++++-------
 edward/inferences/sgld.py                | 363 ++++++++++++++++-------
 11 files changed, 890 insertions(+), 809 deletions(-)
 delete mode 100644 edward/inferences/gibbs.py
 delete mode 100644 edward/inferences/monte_carlo.py

diff --git a/docs/tex/bib.bib b/docs/tex/bib.bib
index 50521ce66..b5ba0e0e8 100644
--- a/docs/tex/bib.bib
+++ b/docs/tex/bib.bib
@@ -484,6 +484,13 @@ @inproceedings{welling2011bayesian
 year = {2011}
 }
 
+@inproceedings{wingate2011lightweight,
+  title={Lightweight implementations of probabilistic programming languages via transformational compilation},
+  author={Wingate, David and Stuhlmueller, Andreas and Goodman, Noah},
+  booktitle={Artificial Intelligence and Statistics},
+  year={2011}
+}
+
 @inproceedings{goodman2012church,
 author = {Goodman, Noah and Mansinghka, Vikash and Roy, Daniel M and Bonawitz, Keith and Tenenbaum, Joshua B},
 title = {{Church: a language for generative models}},
@@ -720,6 +727,13 @@ @article{johnson2016composing
   year = {2016},
 }
 
+@inproceedings{li2016preconditioned,
+  title={Preconditioned Stochastic Gradient Langevin Dynamics for Deep Neural Networks},
+  author={Li, Chunyuan and Chen, Changyou and Carlson, David E and Carin, Lawrence},
+  booktitle={Association for the Advancement of Artificial Intelligence},
+  year={2016}
+}
+
 @article{mohamed2016learning,
 author = {Mohamed, Shakir and Lakshminarayanan, Balaji},
 title = {{Learning in Implicit Generative Models}},
diff --git a/edward/__init__.py b/edward/__init__.py
index ae081073b..cb512cd2c 100644
--- a/edward/__init__.py
+++ b/edward/__init__.py
@@ -11,6 +11,7 @@
     bigan_inference,
     complete_conditional,
     gan_inference,
+    hmc,
     klpq,
     klqp,
     klqp_implicit,
@@ -19,9 +20,11 @@
     klqp_score,
     laplace,
     map,
+    metropolis_hastings,
+    sghmc,
+    sgld,
     wake_sleep,
     wgan_inference)
-# from edward.inferences import MonteCarlo, HMC, MetropolisHastings, SGLD, SGHMC, Gibbs
 from edward.models import (
     Trace,
     get_ancestors,
@@ -48,11 +51,7 @@
     'bigan_inference',
     'complete_conditional',
     'gan_inference',
-    'MonteCarlo',
-    'HMC',
-    'MetropolisHastings',
-    'SGLD',
-    'SGHMC',
+    'hmc',
     'klpq',
     'klqp',
     'klqp_implicit',
@@ -61,9 +60,11 @@
     'klqp_score',
     'laplace',
     'map',
+    'metropolis_hastings',
+    'sghmc',
+    'sgld',
     'wake_sleep',
     'wgan_inference',
-    'Gibbs',
     'Trace',
     'get_ancestors',
     'get_blanket',
diff --git a/edward/inferences/__init__.py b/edward/inferences/__init__.py
index d11af8efc..f6949b483 100644
--- a/edward/inferences/__init__.py
+++ b/edward/inferences/__init__.py
@@ -47,18 +47,16 @@
 from edward.inferences.bigan_inference import *
 from edward.inferences.conjugacy import *
 from edward.inferences.gan_inference import *
-# from edward.inferences.gibbs import *
-# from edward.inferences.hmc import *
+from edward.inferences.hmc import *
 from edward.inferences.inference import *
 from edward.inferences.klpq import *
 from edward.inferences.klqp import *
 from edward.inferences.klqp_implicit import *
 from edward.inferences.laplace import *
 from edward.inferences.map import *
-# from edward.inferences.metropolis_hastings import *
-# from edward.inferences.monte_carlo import *
-# from edward.inferences.sgld import *
-# from edward.inferences.sghmc import *
+from edward.inferences.metropolis_hastings import *
+from edward.inferences.sgld import *
+from edward.inferences.sghmc import *
 from edward.inferences.wake_sleep import *
 from edward.inferences.wgan_inference import *
 
@@ -68,8 +66,7 @@
     'bigan_inference',
     'complete_conditional',
     'gan_inference',
-    'Gibbs',
-    'HMC',
+    'hmc',
     'klpq',
     'klqp',
     'klqp_implicit',
@@ -78,10 +75,9 @@
     'klqp_score',
     'laplace',
     'map',
-    'MetropolisHastings',
-    'MonteCarlo',
-    'SGLD',
-    'SGHMC',
+    'metropolis_hastings',
+    'sghmc',
+    'sgld',
     'wake_sleep',
     'wgan_inference',
 ]
diff --git a/edward/inferences/docstrings.py b/edward/inferences/docstrings.py
index bad0f1b99..c9fd1dfb6 100644
--- a/edward/inferences/docstrings.py
+++ b/edward/inferences/docstrings.py
@@ -18,7 +18,6 @@
 def set_doc(**kwargs):
   """Decorator to programmatically set the docstring."""
   def _update(cls_or_fn):
-    # Trim indenting level of current doc.
     doc = trim(cls_or_fn.__doc__)
     for k, v in six.iteritems(kwargs):
       # Capture each @{k} reference to replace with v.
@@ -70,6 +69,13 @@ def trim(docstring):
     as input and returns a string, indexing `variational`'s trace.
     Other inputs must return None.
 """[1:]
+arg_align_latent_monte_carlo = """
+  align_latent: function of string, aligning `model` latent
+    variables with posterior trace. It takes a model variable's name
+    as input and returns a string, which is used as that variable's
+    key in the returned dictionary of states. If it returns None,
+    inference is not performed over that variable.
+"""[1:]
 arg_args_kwargs = """
   args, kwargs: data inputs. `kwargs`' keys are directly the argument
     keys in `model` (and if present, `variational`). Data inputs are
@@ -88,6 +94,9 @@ def trim(docstring):
     output logit probabilities (real-valued) and not probabilities
     in $[0, 1]$.
 """[1:]
+arg_current_grads_target_log_prob = """
+  current_grads_target_log_prob:
+"""[1:]
 arg_kl_scaling = """
   kl_scaling: function of string, aligning `model` latent
     variables with KL scale factors. This provides option to scale
@@ -127,10 +136,39 @@ def trim(docstring):
     for mini-batch scaling when inferring global variables, or
     applying masks on a random variable.
 """[1:]
+arg_current_state = """
+  current_state: Tensor or list of Tensors. Each element is a
+    posterior variable whose name is its current state. If the model
+    encounters a latent variable not aligned with a key in `states`,
+    its state is a draw from the distribution. Default is None
+    (equivalent to empty dict).
+"""[1:]
+arg_step_size = """
+  step_size: float.
+    Step size of numerical integrator. The implementation may be
+    extended in the future to enable a step size per random variable
+    (`step_size` would be a callable).
+"""[1:]
+arg_current_target_log_prob = """
+  current_target_log_prob:
+"""[1:]
 arg_variational = """
   variational: function whose inputs are a subset of `args` (e.g.,
     for amortized). Output is not used.
 """[1:]
+notes_conditional_inference = """
+In conditional inference, we infer $z$ in $p(z, \\beta
+\mid x)$ while fixing inference over $\\beta$ using another
+distribution $q(\\beta)$. During calculations, this function uses an
+estimate of the marginal density,
+
+$\log p(x, z) = \log \mathbb{E}_{q(\\beta)} [ p(x, z, \\beta) ]
+              \\approx \log p(x, z, \\beta^*)$
+
+leveraging a single Monte Carlo sample, where $\\beta^* \sim
+q(\\beta)$. This is unbiased (and therefore asymptotically exact as a
+pseudo-marginal method) if $q(\\beta) = p(\\beta \mid x)$.
+"""[1:-1]
 notes_conditional_inference_samples = """
 In conditional inference, we infer $z$ in $p(z, \\beta
 \mid x)$ while fixing inference over $\\beta$ using another
@@ -151,6 +189,46 @@ def trim(docstring):
 discriminator's parameters can be accessed with the variable scope
 "Disc".
 """[1:-1]
+notes_mcmc_programs = """
+Probabilistic programs may have random variables which vary across
+executions. At each iteration, the MCMC algorithm transitions across
+the (finite) list of latent variables seen during one execution of
+the model. The previous state is read from `states`: if the
+execution encounters a latent variable not present in `states`, the
+previous state is a draw from the prior.
+
+We recommend updating `states` with the sampler's output after each
+iteration. For example, in Eager mode:
+```python
+states = {}
+for _ in range(10000):
+  new_states, ... = mcmc(..., states=states, ...)
+  states.update(new_states)
+```
+This caches previous states within the `states` dictionary. States
+are only updated when the associated latent variable is seen again
+in the model's execution. As long as every latent variable of
+interest appears in the execution with non-zero probability, the
+distribution of each state is guaranteed to converge to the target
+distribution.
+
+This idea can be seen as a joint version of single-site
+Metropolis-Hastings [@wingate2011lightweight], but note it does not
+rerun any part of the program. In fact, the newly transitioned states
+given old states may not actually be a valid output of the program.
+For example, consider
+```python
+def model():
+  x = Bernoulli(probs=0.5)
+  if tf.cast(x, tf.bool):
+    y = Normal(0.0, 1.0)
+  else:
+    y = Gamma(1.0, 1.0)
+  return x, y
+```
+Given a previous state from (Bernoulli, Normal), the proposal might
+generate (0, -0.3), which is not in the program's support.
+"""[1:-1]
 notes_model_parameters = """
 The function also enables optimizing model parameters $p(z \mid x;
 \\theta)$. It does this by variational EM, maximizing
@@ -176,6 +254,11 @@ def trim(docstring):
   respectively. The surrogate loss' automatic differentiation is the
   gradient to follow for optimization.
 """[1:-1]
+return_samples = """
+  Dict of tf.Tensor. The keys are according to the return values of
+  `align_latent`. The associated values are the transitioned states
+  from the Markov chain.
+"""[1:-1]
 return_surrogate_loss = """
   Scalar tf.Tensor representing the surrogate loss. Its automatic
   differentiation is the gradient to follow for optimization.
diff --git a/edward/inferences/gibbs.py b/edward/inferences/gibbs.py
deleted file mode 100644
index a5af7bd5d..000000000
--- a/edward/inferences/gibbs.py
+++ /dev/null
@@ -1,152 +0,0 @@
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import random
-import six
-import tensorflow as tf
-
-from collections import OrderedDict
-from edward.inferences.conjugacy import complete_conditional
-from edward.inferences.monte_carlo import MonteCarlo
-from edward.models import RandomVariable
-from edward.util import check_and_maybe_build_latent_vars, get_session
-
-
-class Gibbs(MonteCarlo):
-  """Gibbs sampling [@geman1984stochastic].
-
-  Note `Gibbs` assumes the proposal distribution has the same
-  support as the prior. The `auto_transform` attribute in
-  the method `initialize()` is not applicable.
-
-  #### Examples
-
-  ```python
-  x_data = np.array([0, 1, 0, 0, 0, 0, 0, 0, 0, 1])
-
-  p = Beta(1.0, 1.0)
-  x = Bernoulli(probs=p, sample_shape=10)
-
-  qp = Empirical(tf.Variable(tf.zeros(500)))
-  inference = ed.Gibbs({p: qp}, data={x: x_data})
-  ```
-  """
-  def __init__(self, latent_vars, proposal_vars=None, data=None):
-    """Create an inference algorithm.
-
-    Args:
-      proposal_vars: dict of RandomVariable to RandomVariable.
-        Collection of random variables to perform inference on; each is
-        binded to its complete conditionals which Gibbs cycles draws on.
-        If not specified, default is to use `ed.complete_conditional`.
-    """
-    if proposal_vars is None:
-      proposal_vars = {z: complete_conditional(z)
-                       for z in six.iterkeys(latent_vars)}
-    else:
-      proposal_vars = check_and_maybe_build_latent_vars(proposal_vars)
-
-    self.proposal_vars = proposal_vars
-    super(Gibbs, self).__init__(latent_vars, data)
-
-  def initialize(self, scan_order='random', *args, **kwargs):
-    """Initialize inference algorithm. It initializes hyperparameters
-    and builds ops for the algorithm's computation graph.
-
-    Args:
-      scan_order: list or str.
-        The scan order for each Gibbs update. If list, it is the
-        deterministic order of latent variables. An element in the list
-        can be a `RandomVariable` or itself a list of
-        `RandomVariable`s (this defines a blocked Gibbs sampler). If
-        'random', will use a random order at each update.
-    """
-    self.scan_order = scan_order
-    self.feed_dict = {}
-    kwargs['auto_transform'] = False
-    return super(Gibbs, self).initialize(*args, **kwargs)
-
-  def update(self, feed_dict=None):
-    """Run one iteration of sampling.
-
-    Args:
-      feed_dict: dict.
-        Feed dictionary for a TensorFlow session run. It is used to feed
-        placeholders that are not fed during initialization.
-
-    Returns:
-      dict.
-      Dictionary of algorithm-specific information. In this case, the
-      acceptance rate of samples since (and including) this iteration.
-    """
-    sess = get_session()
-    if not self.feed_dict:
-      # Initialize feed for all conditionals to be the draws at step 0.
-      samples = OrderedDict(self.latent_vars)
-      inits = sess.run([qz.params[0] for qz in six.itervalues(samples)])
-      for z, init in zip(six.iterkeys(samples), inits):
-        self.feed_dict[z] = init
-
-      for key, value in six.iteritems(self.data):
-        if isinstance(key, tf.Tensor) and "Placeholder" in key.op.type:
-          self.feed_dict[key] = value
-        elif isinstance(key, RandomVariable) and \
-                isinstance(value, (tf.Tensor, tf.Variable)):
-          self.feed_dict[key] = sess.run(value)
-
-    if feed_dict is None:
-      feed_dict = {}
-
-    self.feed_dict.update(feed_dict)
-
-    # Determine scan order.
-    if self.scan_order == 'random':
-      scan_order = list(six.iterkeys(self.latent_vars))
-      random.shuffle(scan_order)
-    else:  # list
-      scan_order = self.scan_order
-
-    # Fetch samples by iterating over complete conditional draws.
-    for z in scan_order:
-      if isinstance(z, RandomVariable):
-        draw = sess.run(self.proposal_vars[z], self.feed_dict)
-        self.feed_dict[z] = draw
-      else:  # list
-        draws = sess.run([self.proposal_vars[zz] for zz in z], self.feed_dict)
-        for zz, draw in zip(z, draws):
-          self.feed_dict[zz] = draw
-
-    # Assign the samples to the Empirical random variables.
-    _, accept_rate = sess.run(
-        [self.train, self.n_accept_over_t], self.feed_dict)
-    t = sess.run(self.increment_t)
-
-    if self.debug:
-      sess.run(self.op_check, self.feed_dict)
-
-    if self.logging and self.n_print != 0:
-      if t == 1 or t % self.n_print == 0:
-        summary = sess.run(self.summarize, self.feed_dict)
-        self.train_writer.add_summary(summary, t)
-
-    return {'t': t, 'accept_rate': accept_rate}
-
-  def build_update(self):
-    """
-    #### Notes
-
-    The updates assume each Empirical random variable is directly
-    parameterized by `tf.Variable`s.
-    """
-    # Update Empirical random variables according to the complete
-    # conditionals. We will feed the conditionals when calling `update()`.
-    assign_ops = []
-    for z, qz in six.iteritems(self.latent_vars):
-      variable = qz.get_variables()[0]
-      assign_ops.append(
-          tf.scatter_update(variable, self.t, self.proposal_vars[z]))
-
-    # Increment n_accept (if accepted).
-    assign_ops.append(self.n_accept.assign_add(1))
-    return tf.group(*assign_ops)
diff --git a/edward/inferences/hmc.py b/edward/inferences/hmc.py
index 32b8120fc..843ba5cd6 100644
--- a/edward/inferences/hmc.py
+++ b/edward/inferences/hmc.py
@@ -5,205 +5,130 @@
 import six
 import tensorflow as tf
 
-from collections import OrderedDict
-from edward.inferences.monte_carlo import MonteCarlo
-from edward.models import RandomVariable
-
-
-class HMC(MonteCarlo):
+from edward.inferences import docstrings as doc
+from edward.inferences.inference import (
+    call_function_up_to_args, make_intercept)
+from edward.models.core import Node, Trace
+
+tfp = tf.contrib.bayesflow
+
+
+@doc.set_doc(
+    args_part_one=(doc.arg_model +
+                   doc.arg_align_latent_monte_carlo +
+                   doc.arg_align_data +
+                   doc.arg_current_state)[:-1],
+    args_part_two=(doc.arg_step_size +
+                   doc.arg_current_target_log_prob +
+                   doc.arg_current_grads_target_log_prob +
+                   doc.arg_auto_transform +
+                   doc.arg_collections +
+                   doc.arg_args_kwargs)[:-1],
+    returns=doc.return_samples,
+    notes_mcmc_programs=doc.notes_mcmc_programs,
+    notes_conditional_inference=doc.notes_conditional_inference)
+def hmc(model,
+        align_latent,
+        align_data,
+        current_state=None,
+        num_leapfrog_steps=2,
+        step_size=0.25,
+        current_target_log_prob=None,
+        current_grads_target_log_prob=None,
+        auto_transform=True,
+        collections=None,
+        *args, **kwargs):
   """Hamiltonian Monte Carlo, also known as hybrid Monte Carlo
   [@duane1987hybrid; @neal2011mcmc].
 
-  #### Notes
+  HMC simulates Hamiltonian dynamics using a numerical integrator. The
+  integrator has a discretization error and is corrected with a
+  Metropolis accept-reject step.
+
+  Works for any probabilistic program whose latent variables of
+  interest are differentiable. If `auto_transform=True`, the latent
+  variables may exist on any constrained differentiable support.
+
+  Args:
+  @{args_part_one}
+    num_leapfrog_steps: int.
+      Number of steps of numerical integrator.
+  @{args_part_two}
+
+  Returns:
+  @{returns}
 
-  In conditional inference, we infer $z$ in $p(z, \\beta
-  \mid x)$ while fixing inference over $\\beta$ using another
-  distribution $q(\\beta)$.
-  `HMC` substitutes the model's log marginal density
+  #### Notes
 
-  $\log p(x, z) = \log \mathbb{E}_{q(\\beta)} [ p(x, z, \\beta) ]
-                \\approx \log p(x, z, \\beta^*)$
+  @{notes_mcmc_programs}
 
-  leveraging a single Monte Carlo sample, where $\\beta^* \sim
-  q(\\beta)$. This is unbiased (and therefore asymptotically exact as a
-  pseudo-marginal method) if $q(\\beta) = p(\\beta \mid x)$.
+  @{notes_conditional_inference}
 
   #### Examples
 
+  Consider the following setup.
   ```python
-  mu = Normal(loc=0.0, scale=1.0)
-  x = Normal(loc=mu, scale=1.0, sample_shape=10)
-
-  qmu = Empirical(tf.Variable(tf.zeros(500)))
-  inference = ed.HMC({mu: qmu}, {x: np.zeros(10, dtype=np.float32)})
+  def model():
+    mu = Normal(loc=0.0, scale=1.0, name="mu")
+    x = Normal(loc=mu, scale=1.0, sample_shape=10, name="x")
+  ```
+  In graph mode, build `tf.Variable`s which are updated via the Markov
+  chain. The update op is fetched at runtime over many iterations.
+  ```python
+  qmu = tf.get_variable("qmu", initializer=1.)
+  new_state, _, _ = ed.hmc(
+      model,
+      ...,
+      current_state=qmu,
+      align_latent=lambda name: "qmu" if name == "mu" else None,
+      align_data=lambda name: "x_data" if name == "x" else None,
+      x_data=x_data)
+  qmu_update = qmu.assign(new_state)
+  ```
+  In eager mode, call the function at runtime, updating its inputs
+  such as `current_state`.
+  ```python
+  qmu = 1.
+  new_log_prob = None
+  new_gradients = None
+  for _ in range(1000):
+    new_state, new_log_prob, new_gradients = ed.hmc(
+        model,
+        ...,
+        current_state=qmu,
+        align_latent=lambda name: "qmu" if name == "mu" else None,
+        align_data=lambda name: "x_data" if name == "x" else None,
+        current_target_log_prob=new_log_prob,
+        current_grads_target_log_prob=new_gradients,
+        x_data=x_data)
+    qmu = new_state
   ```
   """
-  def __init__(self, *args, **kwargs):
-    super(HMC, self).__init__(*args, **kwargs)
-
-  def initialize(self, step_size=0.25, n_steps=2, *args, **kwargs):
-    """Initialize inference algorithm. It initializes hyperparameters
-    and builds ops for the algorithm's computation graph.
-
-    Args:
-      step_size: float.
-        Step size of numerical integrator.
-      n_steps: int.
-        Number of steps of numerical integrator.
-    """
-    self.step_size = step_size
-    self.n_steps = n_steps
-    # store global scope for log joint calculations
-    self._scope = tf.get_default_graph().unique_name("inference") + '/'
-    return super(HMC, self).initialize(*args, **kwargs)
-
-  def build_update(self):
-    """Simulate Hamiltonian dynamics using a numerical integrator.
-    Correct for the integrator's discretization error using an
-    acceptance ratio.
-
-    #### Notes
-
-    The updates assume each Empirical random variable is directly
-    parameterized by `tf.Variable`s.
-    """
-
-    # Gather the initial state, transformed to unconstrained space.
-    try:
-      self.latent_vars_unconstrained
-    except:
-      raise ValueError("This implementation of HMC requires that all "
-                       "variables have unconstrained support. Please "
-                       "initialize with auto_transform=True to ensure "
-                       "this. (if your variables already have unconstrained "
-                       "support then doing this is a no-op).")
-    old_sample = {z_unconstrained:
-                  tf.gather(qz_unconstrained.params, tf.maximum(self.t - 1, 0))
-                  for z_unconstrained, qz_unconstrained in
-                  six.iteritems(self.latent_vars_unconstrained)}
-    old_sample = OrderedDict(old_sample)
-
-    # Sample momentum.
-    old_r_sample = OrderedDict()
-    for z, qz in six.iteritems(self.latent_vars_unconstrained):
-      event_shape = qz.event_shape
-      old_r_sample[z] = tf.random_normal(event_shape, dtype=qz.dtype)
-
-    # Simulate Hamiltonian dynamics.
-    new_sample, new_r_sample = leapfrog(old_sample, old_r_sample,
-                                        self.step_size,
-                                        self._log_joint_unconstrained,
-                                        self.n_steps)
-
-    # Calculate acceptance ratio.
-    ratio = tf.reduce_sum([0.5 * tf.reduce_sum(tf.square(r))
-                           for r in six.itervalues(old_r_sample)])
-    ratio -= tf.reduce_sum([0.5 * tf.reduce_sum(tf.square(r))
-                            for r in six.itervalues(new_r_sample)])
-    ratio += self._log_joint_unconstrained(new_sample)
-    ratio -= self._log_joint_unconstrained(old_sample)
-
-    # Accept or reject sample.
-    u = tf.random_uniform([], dtype=ratio.dtype)
-    accept = tf.log(u) < ratio
-    sample_values = tf.cond(accept, lambda: list(six.itervalues(new_sample)),
-                            lambda: list(six.itervalues(old_sample)))
-    if not isinstance(sample_values, list):
-      # `tf.cond` returns tf.Tensor if output is a list of size 1.
-      sample_values = [sample_values]
-
-    sample = {z_unconstrained: sample_value for
-              z_unconstrained, sample_value in
-              zip(six.iterkeys(new_sample), sample_values)}
-
-    # Update Empirical random variables.
-    assign_ops = []
-    for z_unconstrained, qz_unconstrained in six.iteritems(
-            self.latent_vars_unconstrained):
-      variable = qz_unconstrained.get_variables()[0]
-      assign_ops.append(tf.scatter_update(
-          variable, self.t, sample[z_unconstrained]))
-
-    # Increment n_accept (if accepted).
-    assign_ops.append(self.n_accept.assign_add(tf.where(accept, 1, 0)))
-    return tf.group(*assign_ops)
-
-  def _log_joint_unconstrained(self, z_sample):
-    """
-    Given a sample in unconstrained latent space, transform it back into
-    the original space, and compute the log joint density with appropriate
-    Jacobian correction.
-    """
-
-    unconstrained_to_z = {v: k for (k, v) in self.transformations.items()}
-
-    # transform all samples back into the original (potentially
-    # constrained) space.
-    z_sample_transformed = {}
-    log_det_jacobian = 0.0
-    for z_unconstrained, qz_unconstrained in z_sample.items():
-      z = (unconstrained_to_z[z_unconstrained]
-           if z_unconstrained in unconstrained_to_z
-           else z_unconstrained)
-
-      try:
-        bij = self.transformations[z].bijector
-        z_sample_transformed[z] = bij.inverse(qz_unconstrained)
-        log_det_jacobian += tf.reduce_sum(
-            bij.inverse_log_det_jacobian(qz_unconstrained))
-      except:  # if z not in self.transformations,
-               # or is not a TransformedDist w/ bijector
-        z_sample_transformed[z] = qz_unconstrained
-
-    return self._log_joint(z_sample_transformed) + log_det_jacobian
-
-  def _log_joint(self, z_sample):
-    """Utility function to calculate model's log joint density,
-    log p(x, z), for inputs z (and fixed data x).
-
-    Args:
-      z_sample: dict.
-        Latent variable keys to samples.
-    """
-    scope = self._scope + tf.get_default_graph().unique_name("sample")
-    # Form dictionary in order to replace conditioning on prior or
-    # observed variable with conditioning on a specific value.
-    dict_swap = z_sample.copy()
-
-    for x, qx in six.iteritems(self.data):
-      if isinstance(x, RandomVariable):
-        if isinstance(qx, RandomVariable):
-          qx_copy = copy(qx, scope=scope)
-          dict_swap[x] = qx_copy.value
-        else:
-          dict_swap[x] = qx
-
-    log_joint = 0.0
-    for z in six.iterkeys(self.latent_vars):
-      z_copy = copy(z, dict_swap, scope=scope)
-      log_joint += tf.reduce_sum(z_copy.log_prob(dict_swap[z]))
-
-    for x in six.iterkeys(self.data):
-      if isinstance(x, RandomVariable):
-        x_copy = copy(x, dict_swap, scope=scope)
-        log_joint += tf.reduce_sum(x_copy.log_prob(dict_swap[x]))
-
-    return log_joint
-
-
-def leapfrog(z_old, r_old, step_size, log_joint, n_steps):
-  z_new = z_old.copy()
-  r_new = r_old.copy()
-
-  grad_log_joint = tf.gradients(log_joint(z_new), list(six.itervalues(z_new)))
-  for _ in range(n_steps):
-    for i, key in enumerate(six.iterkeys(z_new)):
-      z, r = z_new[key], r_new[key]
-      r_new[key] = r + 0.5 * step_size * tf.convert_to_tensor(grad_log_joint[i])
-      z_new[key] = z + step_size * r_new[key]
-
-    grad_log_joint = tf.gradients(log_joint(z_new), list(six.itervalues(z_new)))
-    for i, key in enumerate(six.iterkeys(z_new)):
-      r_new[key] += 0.5 * step_size * tf.convert_to_tensor(grad_log_joint[i])
-
-  return z_new, r_new
+  def _target_log_prob_fn(*fargs):
+    """Target's unnormalized log-joint density as a function of states."""
+    posterior_trace = {state.name.split(':')[0]: Node(arg)
+                       for state, arg in zip(states, fargs)}
+    intercept = make_intercept(
+        posterior_trace, align_data, align_latent, args, kwargs)
+    with Trace(intercept=intercept) as model_trace:
+      call_function_up_to_args(model, *args, **kwargs)
+
+    p_log_prob = 0.0
+    for name, node in six.iteritems(model_trace):
+      if align_latent(name) is not None or align_data(name) is not None:
+        rv = node.value
+        p_log_prob += tf.reduce_sum(rv.log_prob(rv.value))
+    return p_log_prob
+
+  is_list_like = lambda x: isinstance(x, (tuple, list))
+  maybe_list = lambda x: list(x) if is_list_like(x) else [x]
+  states = maybe_list(current_state)
+
+  out = tfp.hmc.kernel(
+      target_log_prob_fn=_target_log_prob_fn,
+      current_state=current_state,
+      step_size=step_size,
+      num_leapfrog_steps=num_leapfrog_steps,
+      current_target_log_prob=current_target_log_prob,
+      current_grads_target_log_prob=current_grads_target_log_prob)
+  return out
diff --git a/edward/inferences/inference.py b/edward/inferences/inference.py
index d74668fcb..094a60cd1 100644
--- a/edward/inferences/inference.py
+++ b/edward/inferences/inference.py
@@ -401,3 +401,13 @@ def _wgan_update(clip_op, variables=None, *args, **kwargs):
     sess.run(clip_op)
 
   return info_dict
+
+
+def _build_n_accept(collections):
+  # TODO for Monte Carlo methods. NOTE(review): `t` is not defined locally.
+  n_accept = tf.Variable(0, trainable=False, name="n_accept")
+  n_accept_over_t = n_accept / t
+  if collections is not None:
+    tf.summary.scalar("n_accept", n_accept,
+                      collections=collections)
+  return n_accept_over_t
diff --git a/edward/inferences/metropolis_hastings.py b/edward/inferences/metropolis_hastings.py
index 3c6323131..7ad8633b4 100644
--- a/edward/inferences/metropolis_hastings.py
+++ b/edward/inferences/metropolis_hastings.py
@@ -5,152 +5,172 @@
 import six
 import tensorflow as tf
 
-from collections import OrderedDict
-from edward.inferences.monte_carlo import MonteCarlo
-from edward.models import RandomVariable
-from edward.util import check_and_maybe_build_latent_vars
+from edward.inferences import docstrings as doc
+from edward.inferences.inference import (
+    call_function_up_to_args, make_intercept)
+from edward.models.core import Node, Trace
+
+tfp = tf.contrib.bayesflow
+
+
+@doc.set_doc(
+    arg_model=doc.arg_model[:-1],
+    arg_align_latent=doc.arg_align_latent_monte_carlo[:-1],
+    args=(doc.arg_align_data +
+          doc.arg_current_state +
+          doc.arg_current_target_log_prob +
+          doc.arg_collections +
+          doc.arg_args_kwargs)[:-1],
+    returns=doc.return_samples,
+    notes_mcmc_programs=doc.notes_mcmc_programs,
+    notes_conditional_inference=doc.notes_conditional_inference)
+def metropolis_hastings(model,
+                        proposal,
+                        align_latent,
+                        align_proposal,
+                        align_data,
+                        current_state=None,
+                        current_target_log_prob=None,
+                        collections=None,
+                        *args, **kwargs):
+  """Metropolis-Hastings [@metropolis1953equation; @hastings1970monte].
 
+  MH draws a sample from `proposal` given the last sample. The
+  proposed sample is accepted with log-probability given by
 
-class MetropolisHastings(MonteCarlo):
-  """Metropolis-Hastings [@metropolis1953equation; @hastings1970monte].
+  $\\text{ratio} =
+        \log p(x, z^{\\text{new}}) - \log p(x, z^{\\text{old}}) -
+        \log g(z^{\\text{new}} \mid z^{\\text{old}}) +
+        \log g(z^{\\text{old}} \mid z^{\\text{new}})$
 
-  #### Notes
+  where $p$ is the model's joint density over observed and latent
+  variables, and $g$ is the proposal's density.
+
+  Args:
+  @{arg_model}
+    proposal: function whose inputs are each state. It returns a new
+      collection (Python list) of states given the inputs, $z'\sim
+      g(z' \mid z)$.
+  @{arg_align_latent}
+    align_proposal: function mapping `model` latent names to `proposal` names.
+  @{args}
 
-  In conditional inference, we infer $z$ in $p(z, \\beta
-  \mid x)$ while fixing inference over $\\beta$ using another
-  distribution $q(\\beta)$.
-  To calculate the acceptance ratio, `MetropolisHastings` uses an
-  estimate of the marginal density,
+  Returns:
+  @{returns}
 
-  $p(x, z) = \mathbb{E}_{q(\\beta)} [ p(x, z, \\beta) ]
-            \\approx p(x, z, \\beta^*)$
+  #### Notes
 
-  leveraging a single Monte Carlo sample, where $\\beta^* \sim
-  q(\\beta)$. This is unbiased (and therefore asymptotically exact as a
-  pseudo-marginal method) if $q(\\beta) = p(\\beta \mid x)$.
+  @{notes_mcmc_programs}
 
-  `MetropolisHastings` assumes the proposal distribution has the same
-  support as the prior. The `auto_transform` attribute in
-  the method `initialize()` is not applicable.
+  @{notes_conditional_inference}
 
   #### Examples
 
+  Consider the following setup.
   ```python
-  mu = Normal(loc=0.0, scale=1.0)
-  x = Normal(loc=mu, scale=1.0, sample_shape=10)
+  def model():
+    mu = Normal(loc=0.0, scale=1.0, name="mu")
+    x = Normal(loc=mu, scale=1.0, sample_shape=10, name="x")
 
-  qmu = Empirical(tf.Variable(tf.zeros(500)))
-  proposal_mu = Normal(loc=mu, scale=0.5)
-  inference = ed.MetropolisHastings({mu: qmu}, {mu: proposal_mu},
-                                    data={x: np.zeros(10, dtype=np.float32)})
+  def proposal(mu):
+    proposal_mu = Normal(loc=mu, scale=0.5, name="proposal/mu")
+  ```
+  In graph mode, build `tf.Variable`s which are updated via the Markov
+  chain. The update op is fetched at runtime over many iterations.
+  ```python
+  qmu = tf.get_variable("qmu", initializer=1.)
+  new_state, _ = ed.metropolis_hastings(
+      model, proposal,
+      current_state=qmu,
+      align_latent=lambda name: "qmu" if name == "mu" else None,
+      align_data=lambda name: "x_data" if name == "x" else None,
+      x_data=x_data)
+  qmu_update = qmu.assign(new_state)
+  ```
+  In eager mode, call the function at runtime, updating its inputs
+  such as `current_state`.
+  ```python
+  qmu = 1.
+  new_log_prob = None
+  for _ in range(1000):
+    new_state, new_log_prob = ed.metropolis_hastings(
+        model, proposal,
+        current_state=qmu,
+        align_latent=lambda name: "qmu" if name == "mu" else None,
+        align_proposal=lambda name: "proposal/mu" if name == "mu" else None,
+        align_data=lambda name: "x_data" if name == "x" else None,
+        current_target_log_prob=new_log_prob,
+        x_data=x_data)
+    qmu = new_state
   ```
   """
-  def __init__(self, latent_vars, proposal_vars, data=None):
-    """Create an inference algorithm.
-
-    Args:
-      proposal_vars: dict of RandomVariable to RandomVariable.
-        Collection of random variables to perform inference on; each is
-        binded to a proposal distribution $g(z' \mid z)$.
-    """
-    proposal_vars = check_and_maybe_build_latent_vars(proposal_vars)
-    self.proposal_vars = proposal_vars
-    super(MetropolisHastings, self).__init__(latent_vars, data)
-
-  def initialize(self, *args, **kwargs):
-    kwargs['auto_transform'] = False
-    return super(MetropolisHastings, self).initialize(*args, **kwargs)
-
-  def build_update(self):
-    """Draw sample from proposal conditional on last sample. Then
-    accept or reject the sample based on the ratio,
-
-    $\\text{ratio} =
-          \log p(x, z^{\\text{new}}) - \log p(x, z^{\\text{old}}) -
-          \log g(z^{\\text{new}} \mid z^{\\text{old}}) +
-          \log g(z^{\\text{old}} \mid z^{\\text{new}})$
-
-    #### Notes
-
-    The updates assume each Empirical random variable is directly
-    parameterized by `tf.Variable`s.
+  def _target_log_prob_fn(*fargs):
+    """Target's unnormalized log-joint density as a function of states."""
+    posterior_trace = {state.name.split(':')[0]: Node(arg)
+                       for state, arg in zip(states, fargs)}
+    intercept = make_intercept(
+        posterior_trace, align_data, align_latent, args, kwargs)
+    with Trace(intercept=intercept) as model_trace:
+      call_function_up_to_args(model, *args, **kwargs)
+
+    global inverse_align_latent
+    inverse_align_latent = {}
+    p_log_prob = 0.0
+    for name, node in six.iteritems(model_trace):
+      if align_latent(name) is not None or align_data(name) is not None:
+        if align_latent(name) is not None:
+          inverse_align_latent[align_latent(name)] = name
+        rv = node.value
+        p_log_prob += tf.reduce_sum(rv.log_prob(rv.value))
+    return p_log_prob
+
+  def _proposal_fn(*fargs):
+    """Takes inputted states and returns (proposed states, log Hastings ratio).
+
+    This implementation doesn't let `proposal` take *args, **kwargs as
+    input (i.e., it cannot be amortized). We also assume proposal
+    returns same size and order as inputted states.
     """
-    old_sample = {z: tf.gather(qz.params, tf.maximum(self.t - 1, 0))
-                  for z, qz in six.iteritems(self.latent_vars)}
-    old_sample = OrderedDict(old_sample)
-
-    # Form dictionary in order to replace conditioning on prior or
-    # observed variable with conditioning on a specific value.
-    dict_swap = {}
-    for x, qx in six.iteritems(self.data):
-      if isinstance(x, RandomVariable):
-        if isinstance(qx, RandomVariable):
-          qx_copy = copy(qx, scope='conditional')
-          dict_swap[x] = qx_copy.value
-        else:
-          dict_swap[x] = qx
-
-    dict_swap_old = dict_swap.copy()
-    dict_swap_old.update(old_sample)
-    base_scope = tf.get_default_graph().unique_name("inference") + '/'
-    scope_old = base_scope + 'old'
-    scope_new = base_scope + 'new'
-
-    # Draw proposed sample and calculate acceptance ratio.
-    new_sample = old_sample.copy()  # copy to ensure same order
-    ratio = 0.0
-    for z, proposal_z in six.iteritems(self.proposal_vars):
-      # Build proposal g(znew | zold).
-      proposal_znew = copy(proposal_z, dict_swap_old, scope=scope_old)
-      # Sample znew ~ g(znew | zold).
-      new_sample[z] = proposal_znew.value
-      # Increment ratio.
-      ratio -= tf.reduce_sum(proposal_znew.log_prob(new_sample[z]))
-
-    dict_swap_new = dict_swap.copy()
-    dict_swap_new.update(new_sample)
-
-    for z, proposal_z in six.iteritems(self.proposal_vars):
-      # Build proposal g(zold | znew).
-      proposal_zold = copy(proposal_z, dict_swap_new, scope=scope_new)
-      # Increment ratio.
-      ratio += tf.reduce_sum(proposal_zold.log_prob(dict_swap_old[z]))
-
-    for z in six.iterkeys(self.latent_vars):
-      # Build priors p(znew) and p(zold).
-      znew = copy(z, dict_swap_new, scope=scope_new)
-      zold = copy(z, dict_swap_old, scope=scope_old)
-      # Increment ratio.
-      ratio += tf.reduce_sum(znew.log_prob(dict_swap_new[z]))
-      ratio -= tf.reduce_sum(zold.log_prob(dict_swap_old[z]))
-
-    for x in six.iterkeys(self.data):
-      if isinstance(x, RandomVariable):
-        # Build likelihoods p(x | znew) and p(x | zold).
-        x_znew = copy(x, dict_swap_new, scope=scope_new)
-        x_zold = copy(x, dict_swap_old, scope=scope_old)
-        # Increment ratio.
-        ratio += tf.reduce_sum(x_znew.log_prob(dict_swap[x]))
-        ratio -= tf.reduce_sum(x_zold.log_prob(dict_swap[x]))
-
-    # Accept or reject sample.
-    u = tf.random_uniform([], dtype=ratio.dtype)
-    accept = tf.log(u) < ratio
-    sample_values = tf.cond(accept, lambda: list(six.itervalues(new_sample)),
-                            lambda: list(six.itervalues(old_sample)))
-    if not isinstance(sample_values, list):
-      # `tf.cond` returns tf.Tensor if output is a list of size 1.
-      sample_values = [sample_values]
-
-    sample = {z: sample_value for z, sample_value in
-              zip(six.iterkeys(new_sample), sample_values)}
-
-    # Update Empirical random variables.
-    assign_ops = []
-    for z, qz in six.iteritems(self.latent_vars):
-      variable = qz.get_variables()[0]
-      assign_ops.append(tf.scatter_update(variable, self.t, sample[z]))
-
-    # Increment n_accept (if accepted).
-    assign_ops.append(self.n_accept.assign_add(tf.where(accept, 1, 0)))
-    return tf.group(*assign_ops)
+    global inverse_align_latent
+    with Trace() as new_trace:
+      # Build g(new | old): new states are drawn given old states as input.
+      call_function_up_to_args(proposal, *fargs)
+    new_states = []
+    old_proposal_trace = {}
+    for state, farg in zip(states, fargs):
+      name = state.name.split(':')[0]
+      new_state = new_trace[align_proposal(inverse_align_latent[name])].value
+      new_state_name = new_state.name.split(':')[0]
+      old_proposal_trace[new_state_name] = Node(farg)
+      new_states.append(new_state)
+    align_latent = lambda name: name if name in old_proposal_trace else None
+    intercept = make_intercept(
+        old_proposal_trace, align_data, align_latent, args, kwargs)
+    with Trace(intercept=intercept) as old_trace:
+      # Build g(old | new): `value`s set to old states; new states are input.
+      call_function_up_to_args(proposal, *new_states)
+    old_states = [old_trace[align_proposal(
+                    inverse_align_latent[state.name.split(':')[0]])].value
+                  for state in states]
+    old_states = []
+    for state, farg in zip(states, fargs):
+      name = state.name.split(':')[0]
+      old_state = old_trace[align_proposal(inverse_align_latent[name])].value
+      old_states.append(old_state)
+    # Compute log g(old | new) - log g(new | old).
+    log_hastings_ratio = 0.0
+    for old_state, new_state in zip(old_states, new_states):
+      log_hastings_ratio += tf.reduce_sum(old_state.log_prob(old_state.value))
+      log_hastings_ratio -= tf.reduce_sum(new_state.log_prob(new_state.value))
+    return new_states, log_hastings_ratio
+
+  is_list_like = lambda x: isinstance(x, (tuple, list))
+  maybe_list = lambda x: list(x) if is_list_like(x) else [x]
+  states = maybe_list(current_state)
+
+  out = tfp.metropolis_hastings.kernel(
+      target_log_prob_fn=_target_log_prob_fn,
+      proposal_fn=_proposal_fn,
+      current_state=current_state,
+      current_target_log_prob=current_target_log_prob)
+  return out
diff --git a/edward/inferences/monte_carlo.py b/edward/inferences/monte_carlo.py
deleted file mode 100644
index d5ef6a2fe..000000000
--- a/edward/inferences/monte_carlo.py
+++ /dev/null
@@ -1,102 +0,0 @@
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import numpy as np
-import six
-import tensorflow as tf
-
-from edward.models import Empirical, RandomVariable
-from edward.util import get_session
-
-
-class MonteCarlo(Inference):
-  """Abstract base class for Monte Carlo. Specific Monte Carlo methods
-  inherit from `MonteCarlo`, sharing methods in this class.
-
-  To build an algorithm inheriting from `MonteCarlo`, one must at the
-  minimum implement `build_update`: it determines how to assign
-  the samples in the `Empirical` approximations.
-
-  #### Notes
-
-  The number of Monte Carlo iterations is set according to the
-  minimum of all `Empirical` sizes.
-
-  Initialization is assumed from `params[0, :]`. This generalizes
-  initializing randomly and initializing from user input. Updates
-  are along this outer dimension, where iteration t updates
-  `params[t, :]` in each `Empirical` random variable.
-
-  No warm-up is implemented. Users must run MCMC for a long period
-  of time, then manually burn in the Empirical random variable.
-
-  #### Examples
-
-  Most explicitly, `MonteCarlo` is specified via a dictionary:
-
-  ```python
-  qpi = Empirical(params=tf.Variable(tf.zeros([T, K-1])))
-  qmu = Empirical(params=tf.Variable(tf.zeros([T, K*D])))
-  qsigma = Empirical(params=tf.Variable(tf.zeros([T, K*D])))
-  ed.MonteCarlo({pi: qpi, mu: qmu, sigma: qsigma}, data)
-  ```
-
-  The inferred posterior is comprised of `Empirical` random
-  variables with `T` samples. We also automate the specification
-  of `Empirical` random variables. One can pass in a list of
-  latent variables instead:
-
-  ```python
-  ed.MonteCarlo([beta], data)
-  ed.MonteCarlo([pi, mu, sigma], data)
-  ```
-
-  It defaults to `Empirical` random variables with 10,000 samples for
-  each dimension.
-  """
-  """Create an inference algorithm.
-
-  Args:
-    latent_vars: list or dict, optional.
-      Collection of random variables (of type `RandomVariable` or
-      `tf.Tensor`) to perform inference on. If list, each random
-      variable will be approximated using a `Empirical` random
-      variable that is defined internally (with unconstrained
-      support). If dictionary, each value in the dictionary must be a
-      `Empirical` random variable.
-    data: dict, optional.
-      Data dictionary which binds observed variables (of type
-      `RandomVariable` or `tf.Tensor`) to their realizations (of
-      type `tf.Tensor`). It can also bind placeholders (of type
-      `tf.Tensor`) used in the model to their realizations.
-  """
-  if isinstance(latent_vars, list):
-    with tf.variable_scope(None, default_name="posterior"):
-      latent_vars = {z: Empirical(params=tf.Variable(tf.zeros(
-          [1e4] + z.batch_shape.concatenate(z.event_shape).as_list())))
-          for z in latent_vars}
-  elif isinstance(latent_vars, dict):
-    for qz in six.itervalues(latent_vars):
-      if not isinstance(qz, Empirical):
-        raise TypeError("Posterior approximation must consist of only "
-                        "Empirical random variables.")
-      elif len(qz.sample_shape) != 0:
-        raise ValueError("Empirical posterior approximations must have "
-                         "a scalar sample shape.")
-
-  def initialize(self, *args, **kwargs):
-    kwargs['n_iter'] = np.amin([qz.params.shape.as_list()[0] for
-                                qz in six.itervalues(self.latent_vars)])
-    super(MonteCarlo, self).initialize(*args, **kwargs)
-
-    self.n_accept = tf.Variable(0, trainable=False, name="n_accept")
-    self.n_accept_over_t = self.n_accept / self.t
-    self.train = self.build_update()
-
-    self.reset.append(tf.variables_initializer([self.n_accept]))
-
-    if self.logging:
-      tf.summary.scalar("n_accept", self.n_accept,
-                        collections=[self._summary_key])
-      self.summarize = tf.summary.merge_all(key=self._summary_key)
diff --git a/edward/inferences/sghmc.py b/edward/inferences/sghmc.py
index 7cc7599fc..0672578ac 100644
--- a/edward/inferences/sghmc.py
+++ b/edward/inferences/sghmc.py
@@ -5,125 +5,248 @@
 import six
 import tensorflow as tf
 
-from edward.inferences.monte_carlo import MonteCarlo
-from edward.models import RandomVariable, Empirical
-
-
-class SGHMC(MonteCarlo):
+from edward.inferences import docstrings as doc
+from edward.inferences.inference import (
+    call_function_up_to_args, make_intercept)
+from edward.models.core import Node, Trace
+
+
+@doc.set_doc(
+    args_part_one=(doc.arg_model +
+                   doc.arg_align_latent_monte_carlo +
+                   doc.arg_align_data +
+                   doc.arg_current_state +
+                   doc.arg_step_size)[:-1],
+    args_part_two=(doc.arg_current_target_log_prob +
+                   doc.arg_current_grads_target_log_prob +
+                   doc.arg_auto_transform +
+                   doc.arg_collections +
+                   doc.arg_args_kwargs)[:-1],
+    returns=doc.return_samples,
+    notes_mcmc_programs=doc.notes_mcmc_programs,
+    notes_conditional_inference=doc.notes_conditional_inference)
+def sghmc(model,
+          align_latent,
+          align_data,
+          # current_state=None,  # TODO kwarg before arg
+          current_state,
+          momentum,
+          momentum_state,
+          learning_rate,
+          friction=0.1,
+          preconditioner_decay_rate=0.95,
+          num_pseudo_batches=1,
+          diagonal_bias=1e-8,
+          target_log_prob=None,
+          grads_target_log_prob=None,
+          auto_transform=True,
+          collections=None,
+          *args, **kwargs):
   """Stochastic gradient Hamiltonian Monte Carlo [@chen2014stochastic].
 
-  #### Notes
+  SGHMC simulates Hamiltonian dynamics with friction using a discretized
+  integrator. Its discretization error goes to zero as the learning
+  rate decreases. Namely, it implements the update equations from (15)
+  of @chen2014stochastic.
+
+  This function implements an adaptive mass matrix using RMSProp.
+  Namely, it uses the update from pre-conditioned SGLD
+  [@li2016preconditioned] extended to second-order Langevin dynamics
+  (SGHMC): the preconditioner is equal to the inverse of the mass
+  matrix [@chen2014stochastic].
+
+  Works for any probabilistic program whose latent variables of
+  interest are differentiable. If `auto_transform=True`, the latent
+  variables may exist on any constrained differentiable support.
+
+  Args:
+  @{args_part_one}
+    friction: float.
+      Constant scale on the friction term in the Hamiltonian system.
+      The implementation may be extended in the future to enable a
+      friction per random variable (`friction` would be a callable).
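+    momentum: Exponentially weighted moving average of each squared
+      gradient with respect to a state (the RMSProp preconditioner).
+    momentum_state: Auxiliary momentum for each state.
+    learning_rate: Base learning rate of the discretized update.
+    preconditioner_decay_rate: Decay rate of the RMSProp moving average.
+    num_pseudo_batches: Effective number of minibatches in the data set.
+    diagonal_bias: Term added to `momentum` before taking the rsqrt.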
+  @{args_part_two}
+
+  Returns:
+  @{returns}
 
-  In conditional inference, we infer $z$ in $p(z, \\beta
-  \mid x)$ while fixing inference over $\\beta$ using another
-  distribution $q(\\beta)$.
-  `SGHMC` substitutes the model's log marginal density
+  #### Notes
 
-  $\log p(x, z) = \log \mathbb{E}_{q(\\beta)} [ p(x, z, \\beta) ]
-                \\approx \log p(x, z, \\beta^*)$
+  @{notes_mcmc_programs}
 
-  leveraging a single Monte Carlo sample, where $\\beta^* \sim
-  q(\\beta)$. This is unbiased (and therefore asymptotically exact as a
-  pseudo-marginal method) if $q(\\beta) = p(\\beta \mid x)$.
+  @{notes_conditional_inference}
 
   #### Examples
 
+  Consider the following setup.
   ```python
-  mu = Normal(loc=0.0, scale=1.0)
-  x = Normal(loc=mu, scale=1.0, sample_shape=10)
-
-  qmu = Empirical(tf.Variable(tf.zeros(500)))
-  inference = ed.SGHMC({mu: qmu}, {x: np.zeros(10, dtype=np.float32)})
+  def model():
+    mu = Normal(loc=0.0, scale=1.0, name="mu")
+    x = Normal(loc=mu, scale=1.0, sample_shape=10, name="x")
   ```
+  In graph mode, build `tf.Variable`s which are updated via the Markov
+  chain. The update op is fetched at runtime over many iterations.
+  ```python
+  qmu = tf.get_variable("qmu", initializer=1.)
+  qmu_mom = tf.get_variable("qmu_mom", initializer=0.)
+  qmu_mom_state = tf.get_variable("qmu_mom_state", initializer=0.)
+  new_state, new_momentum_state, new_momentum = ed.sghmc(
+      model,
+      ...,
+      current_state=qmu,
+      momentum=qmu_mom,
+      momentum_state=qmu_mom_state,
+      align_latent=lambda name: "qmu" if name == "mu" else None,
+      align_data=lambda name: "x_data" if name == "x" else None,
+      x_data=x_data)
+  qmu_update = qmu.assign(new_state)
+  qmu_mom_update = qmu_mom.assign(new_momentum)
+  qmu_mom_state_update = qmu_mom_state.assign(new_momentum_state)
+  ```
+  In eager mode, call the function at runtime, updating its inputs
+  such as `state`.
+  ```python
+  qmu = 1.
+  qmu_mom = None
+  qmu_mom_state = None
+  for _ in range(1000):
+    new_state, new_momentum_state, new_momentum = ed.sghmc(
+        model,
+        ...,
+        current_state=qmu,
+        momentum=qmu_mom,
+        momentum_state=qmu_mom_state,
+        align_latent=lambda name: "qmu" if name == "mu" else None,
+        align_data=lambda name: "x_data" if name == "x" else None,
+        x_data=x_data)
+    qmu = new_state
+    qmu_mom = new_momentum
+    qmu_mom_state = new_momentum_state
+  ```
+  """
+  def _target_log_prob_fn(*fargs):
+    """Target's unnormalized log-joint density as a function of states."""
+    posterior_trace = {state.name.split(':')[0]: Node(arg)
+                       for state, arg in zip(states, fargs)}
+    intercept = make_intercept(
+        posterior_trace, align_data, align_latent, args, kwargs)
+    with Trace(intercept=intercept) as model_trace:
+      call_function_up_to_args(model, *args, **kwargs)
+
+    p_log_prob = 0.0
+    for name, node in six.iteritems(model_trace):
+      if align_latent(name) is not None or align_data(name) is not None:
+        rv = node.value
+        p_log_prob += tf.reduce_sum(rv.log_prob(rv.value))
+    return p_log_prob
+
+  is_list_like = lambda x: isinstance(x, (tuple, list))
+  maybe_list = lambda x: list(x) if is_list_like(x) else [x]
+  states = maybe_list(current_state)
+
+  out = kernel(
+      target_log_prob_fn=_target_log_prob_fn,
+      current_state=current_state,
+      momentum=momentum,
+      momentum_state=momentum_state,
+      learning_rate=learning_rate,
+      frictions=friction,
+      preconditioner_decay_rate=preconditioner_decay_rate,
+      num_pseudo_batches=num_pseudo_batches,
+      diagonal_bias=diagonal_bias,
+      current_target_log_prob=target_log_prob,
+      current_grads_target_log_prob=grads_target_log_prob)
+  return out
+
+
+def kernel(target_log_prob_fn,
+           current_state,
+           momentum,
+           momentum_state,
+           learning_rate,
+           frictions=0.1,
+           preconditioner_decay_rate=0.95,
+           num_pseudo_batches=1,
+           diagonal_bias=1e-8,
+           current_target_log_prob=None,
+           current_grads_target_log_prob=None,
+           name=None):
+  """Pre-conditioned SGHMC.
+
+  Args:
+    ...
+    momentum:
+    momentum_state: Auxiliary momentums for states (the other is
+      momentum for the preconditioner RMSProp.)
+    learning_rate: From tf.contrib.bayesflow.SGLDOptimizer.
+    frictions:
+    preconditioner_decay_rate: From tf.contrib.bayesflow.SGLDOptimizer.
+    num_pseudo_batches: From tf.contrib.bayesflow.SGLDOptimizer.
+    diagonal_bias: From tf.contrib.bayesflow.SGLDOptimizer.
+    ...
   """
-  def __init__(self, *args, **kwargs):
-    super(SGHMC, self).__init__(*args, **kwargs)
-
-  def initialize(self, step_size=0.25, friction=0.1, *args, **kwargs):
-    """Initialize inference algorithm.
-
-    Args:
-      step_size: float.
-        Constant scale factor of learning rate.
-      friction: float.
-        Constant scale on the friction term in the Hamiltonian system.
-    """
-    self.step_size = step_size
-    self.friction = friction
-    self.v = {z: tf.Variable(tf.zeros(qz.params.shape[1:], dtype=qz.dtype))
-              for z, qz in six.iteritems(self.latent_vars)}
-    return super(SGHMC, self).initialize(*args, **kwargs)
-
-  def build_update(self):
-    """Simulate Hamiltonian dynamics with friction using a discretized
-    integrator. Its discretization error goes to zero as the learning
-    rate decreases.
-
-    Implements the update equations from (15) of @chen2014stochastic.
-    """
-    old_sample = {z: tf.gather(qz.params, tf.maximum(self.t - 1, 0))
-                  for z, qz in six.iteritems(self.latent_vars)}
-    old_v_sample = {z: v for z, v in six.iteritems(self.v)}
-
-    # Simulate Hamiltonian dynamics with friction.
-    learning_rate = self.step_size * 0.01
-    grad_log_joint = tf.gradients(self._log_joint(old_sample),
-                                  list(six.itervalues(old_sample)))
-
-    # v_sample is so named b/c it represents a velocity rather than momentum.
-    sample = {}
-    v_sample = {}
-    for z, grad_log_p in zip(six.iterkeys(old_sample), grad_log_joint):
-      qz = self.latent_vars[z]
-      event_shape = qz.event_shape
-      stddev = tf.sqrt(tf.cast(learning_rate * self.friction, qz.dtype))
-      normal = tf.random_normal(event_shape, dtype=qz.dtype)
-      sample[z] = old_sample[z] + old_v_sample[z]
-      v_sample[z] = ((1.0 - 0.5 * self.friction) * old_v_sample[z] +
-                     learning_rate * tf.convert_to_tensor(grad_log_p) +
-                     stddev * normal)
-
-    # Update Empirical random variables.
-    assign_ops = []
-    for z, qz in six.iteritems(self.latent_vars):
-      variable = qz.get_variables()[0]
-      assign_ops.append(tf.scatter_update(variable, self.t, sample[z]))
-      assign_ops.append(tf.assign(self.v[z], v_sample[z]).op)
-
-    # Increment n_accept.
-    assign_ops.append(self.n_accept.assign_add(1))
-    return tf.group(*assign_ops)
-
-  def _log_joint(self, z_sample):
-    """Utility function to calculate model's log joint density,
-    log p(x, z), for inputs z (and fixed data x).
-
-    Args:
-      z_sample: dict.
-        Latent variable keys to samples.
-    """
-    scope = tf.get_default_graph().unique_name("inference")
-    # Form dictionary in order to replace conditioning on prior or
-    # observed variable with conditioning on a specific value.
-    dict_swap = z_sample.copy()
-    for x, qx in six.iteritems(self.data):
-      if isinstance(x, RandomVariable):
-        if isinstance(qx, RandomVariable):
-          qx_copy = copy(qx, scope=scope)
-          dict_swap[x] = qx_copy.value
-        else:
-          dict_swap[x] = qx
-
-    log_joint = 0.0
-    for z in six.iterkeys(self.latent_vars):
-      z_copy = copy(z, dict_swap, scope=scope)
-      log_joint += tf.reduce_sum(
-          self.scale.get(z, 1.0) * z_copy.log_prob(dict_swap[z]))
-
-    for x in six.iterkeys(self.data):
-      if isinstance(x, RandomVariable):
-        x_copy = copy(x, dict_swap, scope=scope)
-        log_joint += tf.reduce_sum(
-            self.scale.get(x, 1.0) * x_copy.log_prob(dict_swap[x]))
-
-    return log_joint
+  is_list_like = lambda x: isinstance(x, (tuple, list))
+  maybe_list = lambda x: list(x) if is_list_like(x) else [x]
+  states = maybe_list(current_state)
+  momentums = maybe_list(momentum)
+  momentums_states = maybe_list(momentum_state)
+  with tf.name_scope(name, "sghmc_kernel", states):
+    with tf.name_scope("initialize"):
+      if current_target_log_prob is None:
+        current_target_log_prob = target_log_prob_fn(*states)
+      if current_grads_target_log_prob is None:
+        current_grads_target_log_prob = tf.gradients(
+            current_target_log_prob, states)
+
+    # Collect results in fresh lists; appending to `momentums` while
+    # zipping over it would also leak the inputs into the output.
+    next_states = []
+    next_momentums_states = []
+    next_momentums = []
+    for state, mom, mom_state, grad in zip(
+        states, momentums, momentums_states, current_grads_target_log_prob):
+      state_update, mom_state_update = _apply_noisy_update(
+          mom, grad, learning_rate, frictions, mom_state,
+          diagonal_bias, num_pseudo_batches)
+      # TODO doesn't this scale the noise incorrectly by additional
+      # learning_rate during the update? (same in sgld_optimizer)
+      next_states.append(state + learning_rate * state_update)
+      next_momentums_states.append(
+          mom_state + learning_rate * mom_state_update)
+      next_momentums.append(
+          mom + (1.0 - preconditioner_decay_rate) * (tf.square(grad) - mom))
+
+    # Unwrap only when the caller passed a bare tensor, not a list.
+    maybe_flatten = lambda x: x if is_list_like(current_state) else x[0]
+    return [
+        maybe_flatten(next_states),
+        maybe_flatten(next_momentums_states),
+        maybe_flatten(next_momentums),
+    ]
+
+
+def _apply_noisy_update(mom, grad, learning_rate,
+                        friction, mom_state,
+                        diagonal_bias, num_pseudo_batches):
+  """Adapted from tf.contrib.bayesflow.SGLDOptimizer._apply_noisy_update."""
+  from tensorflow.python.ops import array_ops
+  from tensorflow.python.ops import math_ops
+  from tensorflow.python.ops import random_ops
+  stddev = math_ops.cast(math_ops.rsqrt(2 * learning_rate * friction), grad.dtype)
+  preconditioner = math_ops.rsqrt(
+      mom + math_ops.cast(diagonal_bias, grad.dtype))
+  # The preconditioner is diagonal, so it is applied elementwise.
+  state_update = preconditioner * mom_state
+  mom_state_update = (
+      -grad * math_ops.cast(num_pseudo_batches, grad.dtype) +
+      friction * state_update +
+      random_ops.random_normal(array_ops.shape(grad), 1.0, dtype=grad.dtype) *
+      stddev)
+  return state_update, mom_state_update
diff --git a/edward/inferences/sgld.py b/edward/inferences/sgld.py
index 930eae583..39bfef0bc 100644
--- a/edward/inferences/sgld.py
+++ b/edward/inferences/sgld.py
@@ -5,116 +5,279 @@
 import six
 import tensorflow as tf
 
-from edward.inferences.monte_carlo import MonteCarlo
-from edward.models import RandomVariable
+from edward.inferences import docstrings as doc
+from edward.inferences.inference import (
+    call_function_up_to_args, make_intercept)
+from edward.models.core import Node, Trace
 
+tfp = tf.contrib.bayesflow
 
-class SGLD(MonteCarlo):
+
+@doc.set_doc(
+    args_part_one=(doc.arg_model +
+                   doc.arg_align_latent_monte_carlo +
+                   doc.arg_align_data +
+                   doc.arg_current_state)[:-1],
+    args_part_two=(doc.arg_current_target_log_prob +
+                   doc.arg_current_grads_target_log_prob +
+                   doc.arg_auto_transform +
+                   doc.arg_collections +
+                   doc.arg_args_kwargs)[:-1],
+    returns=doc.return_samples,
+    notes_mcmc_programs=doc.notes_mcmc_programs,
+    notes_conditional_inference=doc.notes_conditional_inference)
+def sgld(model,
+         align_latent,
+         align_data,
+         # current_state=None,  # TODO kwarg before arg
+         current_state,
+         momentum,
+         learning_rate,
+         preconditioner_decay_rate=0.95,
+         num_pseudo_batches=1,
+         diagonal_bias=1e-8,
+         target_log_prob=None,
+         grads_target_log_prob=None,
+         auto_transform=True,
+         collections=None,
+         *args, **kwargs):
   """Stochastic gradient Langevin dynamics [@welling2011bayesian].
 
-  #### Notes
+  SGLD simulates Langevin dynamics using a discretized integrator. Its
+  discretization error goes to zero as the learning rate decreases.
+
+  This function implements an adaptive preconditioner using RMSProp
+  [@li2016preconditioned].
+
+  Works for any probabilistic program whose latent variables of
+  interest are differentiable. If `auto_transform=True`, the latent
+  variables may exist on any constrained differentiable support.
 
-  In conditional inference, we infer $z$ in $p(z, \\beta
-  \mid x)$ while fixing inference over $\\beta$ using another
-  distribution $q(\\beta)$.
-  `SGLD` substitutes the model's log marginal density
+  Args:
+  @{args_part_one}
+    momentum:
+    learning_rate:
+    preconditioner_decay_rate:
+    num_pseudo_batches:
+    diagonal_bias:
+  @{args_part_two}
+
+  Returns:
+  @{returns}
+
+  #### Notes
 
-  $\log p(x, z) = \log \mathbb{E}_{q(\\beta)} [ p(x, z, \\beta) ]
-                \\approx \log p(x, z, \\beta^*)$
+  @{notes_mcmc_programs}
 
-  leveraging a single Monte Carlo sample, where $\\beta^* \sim
-  q(\\beta)$. This is unbiased (and therefore asymptotically exact as a
-  pseudo-marginal method) if $q(\\beta) = p(\\beta \mid x)$.
+  @{notes_conditional_inference}
 
   #### Examples
 
+  Consider the following setup.
   ```python
-  mu = Normal(loc=0.0, scale=1.0)
-  x = Normal(loc=mu, scale=1.0, sample_shape=10)
-
-  qmu = Empirical(tf.Variable(tf.zeros(500)))
-  inference = ed.SGLD({mu: qmu}, {x: np.zeros(10, dtype=np.float32)})
+  def model():
+    mu = Normal(loc=0.0, scale=1.0, name="mu")
+    x = Normal(loc=mu, scale=1.0, sample_shape=10, name="x")
+  ```
+  In graph mode, build `tf.Variable`s which are updated via the Markov
+  chain. The update op is fetched at runtime over many iterations.
+  ```python
+  qmu = tf.get_variable("qmu", initializer=1.)
+  qmu_mom = tf.get_variable("qmu_mom", initializer=0.)
+  new_state, new_momentum = ed.sgld(
+      model,
+      ...,
+      current_state=qmu,
+      momentum=qmu_mom,
+      align_latent=lambda name: "qmu" if name == "mu" else None,
+      align_data=lambda name: "x_data" if name == "x" else None,
+      x_data=x_data)
+  qmu_update = qmu.assign(new_state)
+  qmu_mom_update = qmu_mom.assign(new_momentum)
   ```
+  In eager mode, call the function at runtime, updating its inputs
+  such as `state`.
+  ```python
+  qmu = 1.
+  qmu_mom = None
+  for _ in range(1000):
+    new_state, new_momentum = ed.sgld(
+        model,
+        ...,
+        current_state=qmu,
+        momentum=qmu_mom,
+        align_latent=lambda name: "qmu" if name == "mu" else None,
+        align_data=lambda name: "x_data" if name == "x" else None,
+        x_data=x_data)
+    qmu = new_state
+    qmu_mom = new_momentum
+  ```
+  """
+  def _target_log_prob_fn(*fargs):
+    """Target's unnormalized log-joint density as a function of states."""
+    posterior_trace = {state.name.split(':')[0]: Node(arg)
+                       for state, arg in zip(states, fargs)}
+    intercept = make_intercept(
+        posterior_trace, align_data, align_latent, args, kwargs)
+    with Trace(intercept=intercept) as model_trace:
+      call_function_up_to_args(model, *args, **kwargs)
+
+    p_log_prob = 0.0
+    for name, node in six.iteritems(model_trace):
+      if align_latent(name) is not None or align_data(name) is not None:
+        rv = node.value
+        p_log_prob += tf.reduce_sum(rv.log_prob(rv.value))
+    return p_log_prob
+
+  is_list_like = lambda x: isinstance(x, (tuple, list))
+  maybe_list = lambda x: list(x) if is_list_like(x) else [x]
+  # The closure above reads `states`; bind it before calling the kernel.
+  states = maybe_list(current_state)
+  out = tfp.sgld.kernel(
+      target_log_prob_fn=_target_log_prob_fn,
+      current_state=current_state,
+      momentum=momentum,
+      learning_rate=learning_rate,
+      preconditioner_decay_rate=preconditioner_decay_rate,
+      num_pseudo_batches=num_pseudo_batches,
+      diagonal_bias=diagonal_bias,
+      current_target_log_prob=target_log_prob,
+      current_grads_target_log_prob=grads_target_log_prob)
+  return out
+
+
+def kernel(target_log_prob_fn,
+           current_state,
+           momentum,
+           learning_rate,
+           preconditioner_decay_rate=0.95,
+           num_pseudo_batches=1,
+           diagonal_bias=1e-8,
+           current_target_log_prob=None,
+           current_grads_target_log_prob=None,
+           name=None):
+  """Runs the stochastic gradient Langevin dynamics transition kernel.
+
+  This implements the preconditioned Stochastic Gradient Langevin Dynamics
+  optimizer [1]. The optimization variable is regarded as a sample from the
+  posterior under Stochastic Gradient Langevin Dynamics with noise rescaled in
+  each dimension according to RMSProp [2].
+
+  Note: If a prior is included in the loss, it should be scaled by
+  `1/num_pseudo_batches`, where num_pseudo_batches is the number of minibatches
+  in the data.  I.e., it should be divided by the `num_pseudo_batches` term
+  described below.
+
+  This function can update multiple chains in parallel. It assumes that all
+  leftmost dimensions of `current_state` index independent chain states (and are
+  therefore updated independently). The output of `target_log_prob_fn()` should
+  sum log-probabilities across all event dimensions. Slices along the rightmost
+  dimensions may have different target distributions; for example,
+  `current_state[0, :]` could have a different target distribution from
+  `current_state[1, :]`. This is up to `target_log_prob_fn()`. (The number of
+  independent chains is `tf.size(target_log_prob_fn(*current_state))`.)
+
+  [1]: "Preconditioned Stochastic Gradient Langevin Dynamics for Deep Neural
+       Networks." Chunyuan Li, Changyou Chen, David Carlson, Lawrence Carin.
+       ArXiv:1512.07666, 2015. https://arxiv.org/abs/1512.07666
+  [2]: http://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf
+
+  Args:
+    target_log_prob_fn: Python callable which takes an argument like
+      `current_state` (or `*current_state` if it's a list) and returns its
+      (possibly unnormalized) log-density under the target distribution.
+    current_state: `Tensor` or Python `list` of `Tensor`s representing the
+      current state(s) of the Markov chain(s). The first `r` dimensions index
+      independent chains, `r = tf.rank(target_log_prob_fn(*current_state))`.
+    momentum: Tensor or List of Tensors, representing exponentially
+      weighted moving average of each squared gradient with respect to a
+      state. It is recommended to initialize it with tf.ones.
+    learning_rate: Scalar `float`-like `Tensor`. The base learning rate for the
+      optimizer. Must be tuned to the specific function being minimized.
+    preconditioner_decay_rate: Scalar `float`-like `Tensor`. The exponential
+      decay rate of the rescaling of the preconditioner (RMSprop). (This is
+      "alpha" in [1]). Should be smaller than but nearly `1` to approximate
+      sampling from the posterior. (Default: `0.95`)
+    num_pseudo_batches: Scalar `int`-like `Tensor`. The effective number of
+      minibatches in the data set.  Trades off noise and prior with the SGD
+      likelihood term. Note: Assumes the loss is taken as the mean over a
+      minibatch. Otherwise if the sum was taken, divide this number by the
+      batch size.  (Default: `1`)
+    burnin: Scalar `int`-like `Tensor`. The number of iterations to collect
+      gradient statistics to update the preconditioner before starting to draw
+      noisy samples. (Default: `25`)
+    diagonal_bias: Scalar `float`-like `Tensor`. Term added to the diagonal of
+      the preconditioner to prevent the preconditioner from degenerating.
+      (Default: `1e-8`)
+    seed: Python integer to seed the random number generator.
+    current_target_log_prob: (Optional) `Tensor` representing the value of
+      `target_log_prob_fn` at the `current_state`. The only reason to
+      specify this argument is to reduce TF graph size.
+      Default value: `None` (i.e., compute as needed).
+    current_grads_target_log_prob: (Optional) Python list of `Tensor`s
+      representing gradient of `current_target_log_prob` at the `current_state`
+      and wrt the `current_state`. Must have same shape as `current_state`. The
+      only reason to specify this argument is to reduce TF graph size.
+      Default value: `None` (i.e., compute as needed).
+    name: Python `str` name prefixed to Ops created by this function.
+      Default value: `None` (i.e., "sgld_kernel").
+
+  Returns:
+    accepted_states: Tensor or Python list of `Tensor`s representing the
+      state(s) of the Markov chain(s) at each result step. Has same shape as
+      input `current_state` but with a prepended `num_results`-size dimension.
+    kernel_results: `collections.namedtuple` of internal calculations used to
+      advance the chain.
   """
-  def __init__(self, *args, **kwargs):
-    super(SGLD, self).__init__(*args, **kwargs)
-
-  def initialize(self, step_size=0.25, *args, **kwargs):
-    """
-    Args:
-      step_size: float.
-        Constant scale factor of learning rate.
-    """
-    self.step_size = step_size
-    return super(SGLD, self).initialize(*args, **kwargs)
-
-  def build_update(self):
-    """Simulate Langevin dynamics using a discretized integrator. Its
-    discretization error goes to zero as the learning rate decreases.
-
-    #### Notes
-
-    The updates assume each Empirical random variable is directly
-    parameterized by `tf.Variable`s.
-    """
-    old_sample = {z: tf.gather(qz.params, tf.maximum(self.t - 1, 0))
-                  for z, qz in six.iteritems(self.latent_vars)}
-
-    # Simulate Langevin dynamics.
-    learning_rate = self.step_size / tf.pow(
-        tf.cast(self.t + 1, list(six.iterkeys(old_sample))[0].dtype), 0.55)
-    grad_log_joint = tf.gradients(self._log_joint(old_sample),
-                                  list(six.itervalues(old_sample)))
-    sample = {}
-    for z, grad_log_p in zip(six.iterkeys(old_sample), grad_log_joint):
-      qz = self.latent_vars[z]
-      event_shape = qz.event_shape
-      stddev = tf.sqrt(tf.cast(learning_rate, qz.dtype))
-      normal = tf.random_normal(event_shape, dtype=qz.dtype)
-      sample[z] = (old_sample[z] +
-                   0.5 * learning_rate * tf.convert_to_tensor(grad_log_p) +
-                   stddev * normal)
-
-    # Update Empirical random variables.
-    assign_ops = []
-    for z, qz in six.iteritems(self.latent_vars):
-      variable = qz.get_variables()[0]
-      assign_ops.append(tf.scatter_update(variable, self.t, sample[z]))
-
-    # Increment n_accept.
-    assign_ops.append(self.n_accept.assign_add(1))
-    return tf.group(*assign_ops)
-
-  def _log_joint(self, z_sample):
-    """Utility function to calculate model's log joint density,
-    log p(x, z), for inputs z (and fixed data x).
-
-    Args:
-      z_sample: dict.
-        Latent variable keys to samples.
-    """
-    scope = tf.get_default_graph().unique_name("inference")
-    # Form dictionary in order to replace conditioning on prior or
-    # observed variable with conditioning on a specific value.
-    dict_swap = z_sample.copy()
-    for x, qx in six.iteritems(self.data):
-      if isinstance(x, RandomVariable):
-        if isinstance(qx, RandomVariable):
-          qx_copy = copy(qx, scope=scope)
-          dict_swap[x] = qx_copy.value
-        else:
-          dict_swap[x] = qx
-
-    log_joint = 0.0
-    for z in six.iterkeys(self.latent_vars):
-      z_copy = copy(z, dict_swap, scope=scope)
-      log_joint += tf.reduce_sum(
-          self.scale.get(z, 1.0) * z_copy.log_prob(dict_swap[z]))
-
-    for x in six.iterkeys(self.data):
-      if isinstance(x, RandomVariable):
-        x_copy = copy(x, dict_swap, scope=scope)
-        log_joint += tf.reduce_sum(
-            self.scale.get(x, 1.0) * x_copy.log_prob(dict_swap[x]))
-
-    return log_joint
+  from tensorflow.python.ops import array_ops
+  from tensorflow.python.ops import math_ops
+  from tensorflow.python.ops import random_ops
+  is_list_like = lambda x: isinstance(x, (tuple, list))
+  maybe_list = lambda x: list(x) if is_list_like(x) else [x]
+  states = maybe_list(current_state)
+  momentums = maybe_list(momentum)
+  with tf.name_scope(name, "sgld_kernel", states):
+    with tf.name_scope("initialize"):
+      if current_target_log_prob is None:
+        current_target_log_prob = target_log_prob_fn(*states)
+      if current_grads_target_log_prob is None:
+        current_grads_target_log_prob = tf.gradients(current_target_log_prob, states)
+
+    # TODO doesn't this scale the noise incorrectly by additional
+    # learning_rate during the update? (same in sgld_optimizer)
+    next_states = []
+    momentums = []
+    for state, mom, grad in zip(states, momentums, current_grads_target_log_prob):
+      next_state = (
+          state + learning_rate *
+          _apply_noisy_update(mom, grad, learning_rate, diagonal_bias,
+                              num_pseudo_batches, seed))
+      momentum = (
+          mom + (1.0 - preconditioner_decay_rate) * (tf.square(grad) - mom))
+      next_states.append(next_state)
+      momentums.append(momentum)
+
+    maybe_flatten = lambda x: x if is_list_like(state) else x[0]
+    next_state = maybe_flatten(next_states)
+    momentum = maybe_flatten(momentums)
+    return [
+        next_state,
+        momentum,
+    ]
+
+
+def _apply_noisy_update(mom, grad, learning_rate, diagonal_bias,
+                        num_pseudo_batches, seed):
+  # Compute and apply the gradient update following
+  # preconditioned Langevin dynamics
+  stddev = math_ops.cast(math_ops.rsqrt(learning_rate), grad.dtype)
+  preconditioner = math_ops.rsqrt(
+      mom + math_ops.cast(diagonal_bias, grad.dtype))
+  return (
+      0.5 * preconditioner * grad * math_ops.cast(num_pseudo_batches,
+                                                  grad.dtype) +
+      random_ops.random_normal(array_ops.shape(grad),
+                               1.0,
+                               dtype=grad.dtype,
+                               seed=seed) *
+      stddev * math_ops.sqrt(preconditioner))

From 219c81cc16f7df554b35775f987f8c65c9116c71 Mon Sep 17 00:00:00 2001
From: Dustin Tran <dustinviettran@gmail.com>
Date: Wed, 14 Feb 2018 11:52:34 -0800
Subject: [PATCH 20/27] move MCMC kernels upstream to bayesflow

---
 edward/inferences/sgld.py | 137 --------------------------------------
 1 file changed, 137 deletions(-)

diff --git a/edward/inferences/sgld.py b/edward/inferences/sgld.py
index 39bfef0bc..93ac533e3 100644
--- a/edward/inferences/sgld.py
+++ b/edward/inferences/sgld.py
@@ -144,140 +144,3 @@ def _target_log_prob_fn(*fargs):
       current_target_log_prob=target_log_prob,
       current_grads_target_log_prob=grads_target_log_prob)
   return out
-
-
-def kernel(target_log_prob_fn,
-           current_state,
-           momentum,
-           learning_rate,
-           preconditioner_decay_rate=0.95,
-           num_pseudo_batches=1,
-           diagonal_bias=1e-8,
-           current_target_log_prob=None,
-           current_grads_target_log_prob=None,
-           name=None):
-  """Runs the stochastic gradient Langevin dynamics transition kernel.
-
-  This implements the preconditioned Stochastic Gradient Langevin Dynamics
-  optimizer [1]. The optimization variable is regarded as a sample from the
-  posterior under Stochastic Gradient Langevin Dynamics with noise rescaled in
-  each dimension according to RMSProp [2].
-
-  Note: If a prior is included in the loss, it should be scaled by
-  `1/num_pseudo_batches`, where num_pseudo_batches is the number of minibatches
-  in the data.  I.e., it should be divided by the `num_pseudo_batches` term
-  described below.
-
-  This function can update multiple chains in parallel. It assumes that all
-  leftmost dimensions of `current_state` index independent chain states (and are
-  therefore updated independently). The output of `target_log_prob_fn()` should
-  sum log-probabilities across all event dimensions. Slices along the rightmost
-  dimensions may have different target distributions; for example,
-  `current_state[0, :]` could have a different target distribution from
-  `current_state[1, :]`. This is up to `target_log_prob_fn()`. (The number of
-  independent chains is `tf.size(target_log_prob_fn(*current_state))`.)
-
-  [1]: "Preconditioned Stochastic Gradient Langevin Dynamics for Deep Neural
-       Networks." Chunyuan Li, Changyou Chen, David Carlson, Lawrence Carin.
-       ArXiv:1512.07666, 2015. https://arxiv.org/abs/1512.07666
-  [2]: http://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf
-
-  Args:
-    target_log_prob_fn: Python callable which takes an argument like
-      `current_state` (or `*current_state` if it's a list) and returns its
-      (possibly unnormalized) log-density under the target distribution.
-    current_state: `Tensor` or Python `list` of `Tensor`s representing the
-      current state(s) of the Markov chain(s). The first `r` dimensions index
-      independent chains, `r = tf.rank(target_log_prob_fn(*current_state))`.
-    momentum: Tensor or List of Tensors, representing exponentially
-      weighted moving average of each squared gradient with respect to a
-      state. It is recommended to initialize it with tf.ones.
-    learning_rate: Scalar `float`-like `Tensor`. The base learning rate for the
-      optimizer. Must be tuned to the specific function being minimized.
-    preconditioner_decay_rate: Scalar `float`-like `Tensor`. The exponential
-      decay rate of the rescaling of the preconditioner (RMSprop). (This is
-      "alpha" in [1]). Should be smaller than but nearly `1` to approximate
-      sampling from the posterior. (Default: `0.95`)
-    num_pseudo_batches: Scalar `int`-like `Tensor`. The effective number of
-      minibatches in the data set.  Trades off noise and prior with the SGD
-      likelihood term. Note: Assumes the loss is taken as the mean over a
-      minibatch. Otherwise if the sum was taken, divide this number by the
-      batch size.  (Default: `1`)
-    burnin: Scalar `int`-like `Tensor`. The number of iterations to collect
-      gradient statistics to update the preconditioner before starting to draw
-      noisy samples. (Default: `25`)
-    diagonal_bias: Scalar `float`-like `Tensor`. Term added to the diagonal of
-      the preconditioner to prevent the preconditioner from degenerating.
-      (Default: `1e-8`)
-    seed: Python integer to seed the random number generator.
-    current_target_log_prob: (Optional) `Tensor` representing the value of
-      `target_log_prob_fn` at the `current_state`. The only reason to
-      specify this argument is to reduce TF graph size.
-      Default value: `None` (i.e., compute as needed).
-    current_grads_target_log_prob: (Optional) Python list of `Tensor`s
-      representing gradient of `current_target_log_prob` at the `current_state`
-      and wrt the `current_state`. Must have same shape as `current_state`. The
-      only reason to specify this argument is to reduce TF graph size.
-      Default value: `None` (i.e., compute as needed).
-    name: Python `str` name prefixed to Ops created by this function.
-      Default value: `None` (i.e., "sgld_kernel").
-
-  Returns:
-    accepted_states: Tensor or Python list of `Tensor`s representing the
-      state(s) of the Markov chain(s) at each result step. Has same shape as
-      input `current_state` but with a prepended `num_results`-size dimension.
-    kernel_results: `collections.namedtuple` of internal calculations used to
-      advance the chain.
-  """
-  from tensorflow.python.ops import array_ops
-  from tensorflow.python.ops import math_ops
-  from tensorflow.python.ops import random_ops
-  is_list_like = lambda x: isinstance(x, (tuple, list))
-  maybe_list = lambda x: list(x) if is_list_like(x) else [x]
-  states = maybe_list(current_state)
-  momentums = maybe_list(momentum)
-  with tf.name_scope(name, "sgld_kernel", states):
-    with tf.name_scope("initialize"):
-      if current_target_log_prob is None:
-        current_target_log_prob = target_log_prob_fn(*states)
-      if current_grads_target_log_prob is None:
-        current_grads_target_log_prob = tf.gradients(current_target_log_prob, states)
-
-    # TODO doesn't this scale the noise incorrectly by additional
-    # learning_rate during the update? (same in sgld_optimizer)
-    next_states = []
-    momentums = []
-    for state, mom, grad in zip(states, momentums, current_grads_target_log_prob):
-      next_state = (
-          state + learning_rate *
-          _apply_noisy_update(mom, grad, learning_rate, diagonal_bias,
-                              num_pseudo_batches, seed))
-      momentum = (
-          mom + (1.0 - preconditioner_decay_rate) * (tf.square(grad) - mom))
-      next_states.append(next_state)
-      momentums.append(momentum)
-
-    maybe_flatten = lambda x: x if is_list_like(state) else x[0]
-    next_state = maybe_flatten(next_states)
-    momentum = maybe_flatten(momentums)
-    return [
-        next_state,
-        momentum,
-    ]
-
-
-def _apply_noisy_update(mom, grad, learning_rate, diagonal_bias,
-                        num_pseudo_batches, seed):
-  # Compute and apply the gradient update following
-  # preconditioned Langevin dynamics
-  stddev = math_ops.cast(math_ops.rsqrt(learning_rate), grad.dtype)
-  preconditioner = math_ops.rsqrt(
-      mom + math_ops.cast(diagonal_bias, grad.dtype))
-  return (
-      0.5 * preconditioner * grad * math_ops.cast(num_pseudo_batches,
-                                                  grad.dtype) +
-      random_ops.random_normal(array_ops.shape(grad),
-                               1.0,
-                               dtype=grad.dtype,
-                               seed=seed) *
-      stddev * math_ops.sqrt(preconditioner))

From 825128663de6677a94b72a65e277ad04b106ebfe Mon Sep 17 00:00:00 2001
From: Dustin Tran <dustinviettran@gmail.com>
Date: Sun, 28 Jan 2018 03:21:13 -0800
Subject: [PATCH 21/27] appeal to Estimator API; move transform to
 inferences/util.py

---
 docker/Dockerfile                            |   2 +-
 docker/Dockerfile-gpu                        |   2 +-
 edward/__init__.py                           |   4 +-
 edward/inferences/__init__.py                |  33 +-
 edward/inferences/bigan_inference.py         |   2 +-
 edward/inferences/gan_inference.py           |   2 +-
 edward/inferences/hmc.py                     |   3 +-
 edward/inferences/inference.py               | 413 -------------------
 edward/inferences/klpq.py                    |   3 +-
 edward/inferences/klqp.py                    |   3 +-
 edward/inferences/klqp_implicit.py           |   3 +-
 edward/inferences/laplace.py                 |   3 +-
 edward/inferences/map.py                     |   3 +-
 edward/inferences/metropolis_hastings.py     |   3 +-
 edward/inferences/sghmc.py                   |   3 +-
 edward/inferences/sgld.py                    |   3 +-
 edward/inferences/util.py                    | 145 +++++++
 edward/inferences/wake_sleep.py              |   3 +-
 edward/inferences/wgan_inference.py          |   2 +-
 edward/util/__init__.py                      |   2 -
 edward/util/random_variables.py              |  73 ----
 setup.py                                     |   1 -
 tests/{util => inferences}/transform_test.py |  19 +-
 23 files changed, 182 insertions(+), 548 deletions(-)
 delete mode 100644 edward/inferences/inference.py
 create mode 100644 edward/inferences/util.py
 delete mode 100644 edward/util/random_variables.py
 rename tests/{util => inferences}/transform_test.py (89%)

diff --git a/docker/Dockerfile b/docker/Dockerfile
index 53a11ba25..d442dc3fb 100644
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -42,7 +42,7 @@ USER edward
 ARG python_version=3.5.3-0
 ARG python_qt_version=4
 RUN conda install -y python=${python_version} && \
-    pip install observations numpy six tensorflow keras prettytensor && \
+    pip install observations numpy six tensorflow keras && \
     pip install ipdb pytest pytest-cov python-coveralls coverage==3.7.1 pytest-xdist pep8 pytest-pep8 pydot_ng && \
     conda install Pillow scikit-learn matplotlib notebook pandas seaborn pyyaml h5py && \
     conda install -y pyqt=${python_qt_version} && \
diff --git a/docker/Dockerfile-gpu b/docker/Dockerfile-gpu
index a32bed4e2..90d74c9c5 100644
--- a/docker/Dockerfile-gpu
+++ b/docker/Dockerfile-gpu
@@ -42,7 +42,7 @@ USER edward
 # Python
 ARG python_version=3.5.3-0
 RUN conda install -y python=${python_version} && \
-    pip install observations numpy six tensorflow-gpu keras prettytensor && \
+    pip install observations numpy six tensorflow-gpu keras && \
     pip install ipdb pytest pytest-cov python-coveralls coverage==3.7.1 pytest-xdist pep8 pytest-pep8 pydot_ng && \
     conda install Pillow scikit-learn matplotlib notebook pandas seaborn pyyaml h5py && \
     pip install edward && \
diff --git a/edward/__init__.py b/edward/__init__.py
index cb512cd2c..5d7e7b8ba 100644
--- a/edward/__init__.py
+++ b/edward/__init__.py
@@ -37,8 +37,7 @@
     is_independent,
     random_variables)
 from edward.util import (
-    get_control_variate_coef,
-    transform)
+    get_control_variate_coef)
 from edward.version import __version__, VERSION
 
 from tensorflow.python.util.all_util import remove_undocumented
@@ -76,7 +75,6 @@
     'get_variables',
     'is_independent',
     'random_variables',
-    'transform',
     '__version__',
     'VERSION',
 ]
diff --git a/edward/inferences/__init__.py b/edward/inferences/__init__.py
index f6949b483..0fe41b1f7 100644
--- a/edward/inferences/__init__.py
+++ b/edward/inferences/__init__.py
@@ -5,38 +5,28 @@
   1. Build train_op (*).
   2. Build summary file writer.
   3. Build and run TensorFlow variable initializer ops.
-  4. Build progressbar (*).
-  5. Within a training loop:
+  4. Within a training loop:
     + sess.run with infeeding and summary writers.
-    + Update progressbar (*).
+    + Log progress by writing to files and/or printing.
     + Check convergence (*).
-  6. Build and run post-training ops (*).
+  5. Build and run post-training ops (*).
 2. Idiomatic TensorFlow Estimator
-  + Call train() (*). It is a higher-order function taking in the
-  model program, data, an optional inference function to build the
-  train_op, and various other things. As an inference engine, it
-  automates the process above.
+  + Build `model_fn` by writing a probabilistic program and calling an
+  inference algorithm to produce train ops. Use the Estimator API
+  workflow of `train`, `evaluate`, and `predict` alongside an
+  `input_fn` data pipeline.
 
-Inference provides functions for both approaches. In the first
+Inference provides utilities for both approaches. In the first
 approach, it provides (*), namely: (1) inference algorithms to help
 produce the train_op (and low-level functions to build your own
-algorithms); (2) a progressbar to build and update; (3) convergence
-diagnostics; and (4) post-training ops for certain algorithms. In the
-second approach, it provides the fully automated train().
+algorithms; sometimes post-training ops); and (2) convergence
+diagnostics. In the second approach, these functions build up a
+`model_fn` to form a TensorFlow Estimator.
 
 Inference uses (unbinded) pure functions with TensorFlow idiomatic
 exceptions (e.g., mutable state via TensorFlow variables; side effect
 of adding to global collections and TF graph). It forgoes OO.
 
-This file is a collection of functions shared across inference
-algorithms, used for the following:
-
-+ "call f up to args" (in `inferences/inference`)
-+ a "make intercept" factory (in `inferences/inference`)
-+ automated transforms (in `inferences/inference` and `util/random_variables`)
-+ programmatic docstrings (in `inferences/docstrings`)
-+ `train` (in `inferences/inference`)
-
 Specific inference files provide functions to help produce the train
 (and post-training) ops.
 """
@@ -48,7 +38,6 @@
 from edward.inferences.conjugacy import *
 from edward.inferences.gan_inference import *
 from edward.inferences.hmc import *
-from edward.inferences.inference import *
 from edward.inferences.klpq import *
 from edward.inferences.klqp import *
 from edward.inferences.klqp_implicit import *
diff --git a/edward/inferences/bigan_inference.py b/edward/inferences/bigan_inference.py
index b9ac408c4..2e415c353 100644
--- a/edward/inferences/bigan_inference.py
+++ b/edward/inferences/bigan_inference.py
@@ -6,7 +6,7 @@
 import tensorflow as tf
 
 from edward.inferences import docstrings as doc
-from edward.inferences.inference import call_function_up_to_args
+from edward.inferences.util import call_function_up_to_args
 from edward.models.core import Trace
 
 
diff --git a/edward/inferences/gan_inference.py b/edward/inferences/gan_inference.py
index ab4566f7c..9236cba8f 100644
--- a/edward/inferences/gan_inference.py
+++ b/edward/inferences/gan_inference.py
@@ -6,7 +6,7 @@
 import tensorflow as tf
 
 from edward.inferences import docstrings as doc
-from edward.inferences.inference import call_function_up_to_args
+from edward.inferences.util import call_function_up_to_args
 
 
 @doc.set_doc(
diff --git a/edward/inferences/hmc.py b/edward/inferences/hmc.py
index 843ba5cd6..94c490782 100644
--- a/edward/inferences/hmc.py
+++ b/edward/inferences/hmc.py
@@ -6,8 +6,7 @@
 import tensorflow as tf
 
 from edward.inferences import docstrings as doc
-from edward.inferences.inference import (
-    call_function_up_to_args, make_intercept)
+from edward.inferences.util import call_function_up_to_args, make_intercept
 from edward.models.core import Node, Trace
 
 tfp = tf.contrib.bayesflow
diff --git a/edward/inferences/inference.py b/edward/inferences/inference.py
deleted file mode 100644
index 094a60cd1..000000000
--- a/edward/inferences/inference.py
+++ /dev/null
@@ -1,413 +0,0 @@
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import numpy as np
-import six
-import tensorflow as tf
-import os
-
-from datetime import datetime
-from edward.models import RandomVariable
-from edward.util import get_variables, Progbar
-from edward.util import transform as _transform
-
-tfb = tf.contrib.distributions.bijectors
-
-
-def call_function_up_to_args(f, *args, **kwargs):
-  """Call f, removing any args/kwargs it doesn't take as input."""
-  import inspect
-  if hasattr(f, "_func"):  # tf.make_template()
-    argspec = inspect.getargspec(f._func)
-  else:
-    argspec = inspect.getargspec(f)
-  fkwargs = {}
-  for k, v in six.iteritems(kwargs):
-    if k in argspec.args:
-      fkwargs[k] = v
-  num_args = len(argspec.args) - len(fkwargs)
-  if num_args > 0:
-    return f(*args[:num_args], **fkwargs)
-  elif len(fkwargs) > 0:
-    return f(**fkwargs)
-  return f()
-
-
-def make_intercept(trace, align_data, align_latent, args, kwargs):
-  def _intercept(f, *fargs, **fkwargs):
-    """Set model's sample values to variational distribution's and data."""
-    name = fkwargs.get('name', None)
-    key = align_data(name)
-    if isinstance(key, int):
-      fkwargs['value'] = args[key]
-    elif kwargs.get(key, None) is not None:
-      fkwargs['value'] = kwargs.get(key)
-    elif align_latent(name) is not None:
-      qz = trace[align_latent(name)].value
-      if isinstance(qz, RandomVariable):
-        value = qz.value
-      else:  # e.g. replacement is Tensor
-        value = tf.convert_to_tensor(qz)
-      fkwargs['value'] = value
-    # if auto_transform and 'qz' in locals():
-    #   # TODO for generation to work, must output original dist. to
-    #   keep around TD? must maintain another stack to write to as a
-    #   side-effect (or augment the original stack).
-    #   return transform(f, qz, *fargs, **fkwargs)
-    return f(*fargs, **fkwargs)
-  return _intercept
-
-
-def transform(f, qz, *args, **kwargs):
-  """Transform prior -> unconstrained -> q's constraint.
-
-  When using in VI, we keep variational distribution on its original
-  space (for sake of implementing only one intercepting function).
-  """
-  # TODO deal with f or qz being 'point' or 'points'
-  if (not hasattr(f, 'support') or not hasattr(qz, 'support') or
-          f.support == qz.support):
-    return f(*args, **kwargs)
-  value = kwargs.pop('value')
-  kwargs['value'] = 0.0  # to avoid sampling; TODO follow sample shape
-  rv = f(*args, **kwargs)
-  # Take shortcuts in logic if p or q are already unconstrained.
-  if qz.support in ('real', 'multivariate_real'):
-    return _transform(rv, value=value)
-  if rv.support in ('real', 'multivariate_real'):
-    rv_unconstrained = rv
-  else:
-    rv_unconstrained = _transform(rv, value=0.0)
-  unconstrained_to_constrained = tfb.Invert(_transform(qz).bijector)
-  return _transform(rv_unconstrained,
-                    unconstrained_to_constrained,
-                    value=value)
-
-
-def train(model, inference=None,
-          summary_key=None, n_iter=1000, n_print=None,
-          logdir=None, log_timestamp=True,
-          variables=None,
-          *args, **kwargs):
-  """An automated inference engine. It takes a model as input (and
-  optional args) and fully trains it until convergence given data to
-  return a posterior.
-
-  Given a defaulted inference algorithm (later, we might automate its
-  choice, or dynamically apply them), it performs the following steps:
-
-  1. (Optional) Build a TensorFlow summary writer for TensorBoard.
-  2. (Optional) Initialize TensorFlow variables.
-  3. while not converged: (for now, set by `n_iter` iterations)
-    3a. Run update ops.
-    3b. If within print window:
-      3bi. Print progress.
-      3bii. Run convergence diagnostics.
-  4. Run finalize (post-training) ops.
-
-  Args:
-    n_iter: int.
-      Number of iterations for algorithm when calling `run()`.
-      Alternatively if controlling inference manually, it is the
-      expected number of calls to `update()`; this number determines
-      tracking information during the print progress.
-    n_print: int.
-      Number of iterations for each print progress. To suppress print
-      progress, then specify 0. Default is `int(n_iter / 100)`.
-    logdir: str.
-      Directory where event file will be written. For details,
-      see `tf.summary.FileWriter`. Default is to log nothing.
-    log_timestamp: bool.
-      If True (and `logdir` is specified), create a subdirectory of
-      `logdir` to save the specific run results. The subdirectory's
-      name is the current UTC timestamp with format 'YYYYMMDD_HHMMSS'.
-    variables: list.
-      A list of TensorFlow variables to initialize during inference.
-      Default is to initialize all variables (this includes
-      reinitializing variables that were already initialized). To
-      avoid initializing any variables, pass in an empty list.
-  """
-  if n_print is None:
-    n_print = int(n_iter / 100)
-  if inference in (bigan_inference, gan_inference, implicit_klqp):
-    _update = _gan_update
-  elif inference == wgan_inference:
-    _update = _wgan_update
-  else:
-    _update = _default_update
-  progbar = Progbar(n_iter)
-  t = tf.Variable(0, trainable=False, name="iteration")
-  kwargs['t'] = t.assign_add(1)  # add to update()
-
-  if summary_key is not None:
-    # TODO _summary_variables()
-    summarize = tf.summary.merge_all(key=summary_key)
-    if log_timestamp:
-      logdir = os.path.expanduser(logdir)
-      logdir = os.path.join(
-          logdir, datetime.strftime(datetime.utcnow(), "%Y%m%d_%H%M%S"))
-    train_writer = tf.summary.FileWriter(logdir, tf.get_default_graph())
-  else:
-    summarize = None
-    train_writer = None
-
-  if variables is None:
-    init = tf.global_variables_initializer()
-  else:
-    init = tf.variables_initializer(variables)
-
-  # Feed placeholders in case initialization depends on them.
-  feed_dict = kwargs.get('feed_dict', {})
-  # TODO use feed dict outside since static
-  # feed_dict = {}
-  for key, value in six.iteritems(data):
-    if isinstance(key, tf.Tensor) and "Placeholder" in key.op.type:
-      feed_dict[key] = value
-  init.run(feed_dict)
-
-  for _ in range(n_iter):
-    info_dict = _update(progbar, n_print, summarize,
-                        train_writer, train_op, *args, **kwargs)
-
-  finalize = None
-  if finalize is not None:
-    finalize_ops = finalize()
-    sess = get_session()
-    sess.run(finalize_op, feed_dict)
-  else:
-    if summary_key is not None:
-      train_writer.close()
-
-
-def _summary_variables(latent_vars=None, data=None, variables=None,
-                       *args, **kwargs):
-  # Note: to use summary_key, set
-  # collections=[tf.get_default_graph().unique_name("summaries")]
-  # TODO include in TensorBoard tutorial
-  """Log variables to TensorBoard.
-
-  For each variable in `variables`, forms a `tf.summary.scalar` if
-  the variable has scalar shape; otherwise forms a `tf.summary.histogram`.
-
-  Args:
-    variables: list.
-      Specifies the list of variables to log after each `n_print`
-      steps. If None, will log all variables. If `[]`, no variables
-      will be logged.
-  """
-  if variables is None:
-    variables = []
-    for key in six.iterkeys(data):
-      variables += get_variables(key)
-
-    for key, value in six.iteritems(latent_vars):
-      variables += get_variables(key)
-      variables += get_variables(value)
-
-    variables = set(variables)
-
-  for var in variables:
-    # replace colons which are an invalid character
-    var_name = var.name.replace(':', '/')
-    # Log all scalars.
-    if len(var.shape) == 0:
-      tf.summary.scalar("parameter/{}".format(var_name),
-                        var, *args, **kwargs)
-    elif len(var.shape) == 1 and var.shape[0] == 1:
-      tf.summary.scalar("parameter/{}".format(var_name),
-                        var[0], *args, **kwargs)
-    else:
-      # If var is multi-dimensional, log a histogram of its values.
-      tf.summary.histogram("parameter/{}".format(var_name),
-                           var, *args, **kwargs)
-
-
-def _optimize(loss, grads_and_vars, collections=None, var_list=None,
-              optimizer=None, use_prettytensor=False, global_step=None):
-  """Build optimizer and its train op applied to loss or
-  grads_and_vars.
-
-  Args:
-    optimizer: str or tf.train.Optimizer.
-      A TensorFlow optimizer, to use for optimizing the variational
-      objective. Alternatively, one can pass in the name of a
-      TensorFlow optimizer, and default parameters for the optimizer
-      will be used.
-    use_prettytensor: bool.
-      `True` if aim to use PrettyTensor optimizer (when using
-      PrettyTensor) or `False` if aim to use TensorFlow optimizer.
-      Defaults to TensorFlow.
-    global_step: tf.Variable.
-      A TensorFlow variable to hold the global step.
-  """
-  if collections is not None:
-    # TODO when users call this, this duplicates for GANs
-    # train = optimize(loss, grads_and_vars, summary_key)
-    # train_d = optimize(loss_d, grads_and_vars_d, summary_key)
-    tf.summary.scalar("loss", loss, collections=collections)
-    for grad, var in grads_and_vars:
-      # replace colons which are an invalid character
-      tf.summary.histogram("gradient/" +
-                           var.name.replace(':', '/'),
-                           grad, collections=collections)
-      tf.summary.scalar("gradient_norm/" +
-                        var.name.replace(':', '/'),
-                        tf.norm(grad), collections=collections)
-
-  if optimizer is None and global_step is None:
-    # Default optimizer always uses a global step variable.
-    global_step = tf.Variable(0, trainable=False, name="global_step")
-
-  if isinstance(global_step, tf.Variable):
-    starter_learning_rate = 0.1
-    learning_rate = tf.train.exponential_decay(starter_learning_rate,
-                                               global_step,
-                                               100, 0.9, staircase=True)
-  else:
-    learning_rate = 0.01
-
-  # Build optimizer.
-  if optimizer is None:
-    optimizer = tf.train.AdamOptimizer(learning_rate)
-  elif isinstance(optimizer, str):
-    if optimizer == 'gradientdescent':
-      optimizer = tf.train.GradientDescentOptimizer(learning_rate)
-    elif optimizer == 'adadelta':
-      optimizer = tf.train.AdadeltaOptimizer(learning_rate)
-    elif optimizer == 'adagrad':
-      optimizer = tf.train.AdagradOptimizer(learning_rate)
-    elif optimizer == 'momentum':
-      optimizer = tf.train.MomentumOptimizer(learning_rate, 0.9)
-    elif optimizer == 'adam':
-      optimizer = tf.train.AdamOptimizer(learning_rate)
-    elif optimizer == 'ftrl':
-      optimizer = tf.train.FtrlOptimizer(learning_rate)
-    elif optimizer == 'rmsprop':
-      optimizer = tf.train.RMSPropOptimizer(learning_rate)
-    else:
-      raise ValueError('Optimizer class not found:', optimizer)
-  elif not isinstance(optimizer, tf.train.Optimizer):
-    raise TypeError("Optimizer must be str, tf.train.Optimizer, or None.")
-
-  with tf.variable_scope(None, default_name="optimizer") as scope:
-    if not use_prettytensor:
-      train_op = optimizer.apply_gradients(grads_and_vars,
-                                           global_step=global_step)
-    else:
-      import prettytensor as pt
-      # Note PrettyTensor optimizer does not accept manual updates;
-      # it autodiffs the loss directly.
-      train_op = pt.apply_optimizer(optimizer, losses=[loss],
-                                    global_step=global_step,
-                                    var_list=var_list)
-  return train_op
-
-
-def _default_update(progbar, n_print, summarize=None, train_writer=None,
-                    *args, **kwargs):
-  """Run one iteration of optimization.
-
-  Args:
-    args: things like `loss`
-    kwargs: things like 'feed_dict'
-    feed_dict: dict.
-      Feed dictionary for a TensorFlow session run. It is used to feed
-      placeholders that are not fed during initialization.
-
-  Returns:
-    dict.
-    Dictionary of algorithm-specific information. In this case, the
-    loss function value after one iteration.
-  """
-  sess = get_session()
-  feed_dict = kwargs.pop('feed_dict', {})
-  values = sess.run(list(args) + list(kwargs.values()), feed_dict)
-  info_dict = dict(zip(kwargs.keys(), values[len(args):]))
-
-  if n_print != 0:
-    t = info_dict['t']
-    if t == 1 or t % n_print == 0:
-      # TODO do we want specific key names? User can specify whatever
-      # in kwargs during run(...).
-      # progbar.update(t, {'Loss': info_dict['loss']})
-      # progbar.update(t, {'Gen Loss': info_dict['loss'],
-      #                    'Disc Loss': info_dict['loss_d']})
-      progbar.update(t, {k: v for k, v in six.iteritems(info_dict)
-                         if k != 't'})
-      if summarize is not None:
-        summary = sess.run(summarize, feed_dict)
-        train_writer.add_summary(summary, t)
-
-  return info_dict
-
-
-def _gan_update(train_op, train_op_d, n_print, summarize=None,
-                train_writer=None, variables=None, *args, **kwargs):
-  """Run one iteration of optimization.
-
-  Args:
-    variables: str.
-      Which set of variables to update. Either "Disc" or "Gen".
-      Default is both.
-
-  Returns:
-    dict.
-    Dictionary of algorithm-specific information. In this case, the
-    iteration number and generative and discriminative losses.
-
-  #### Notes
-
-  The outputted iteration number is the total number of calls to
-  `update`. Each update may include updating only a subset of
-  parameters.
-  """
-  # if feed_dict is None:
-  #   feed_dict = {}
-  # for key, value in six.iteritems(self.data):
-  #   if isinstance(key, tf.Tensor) and "Placeholder" in key.op.type:
-  #     feed_dict[key] = value
-  sess = get_session()
-  feed_dict = kwargs.pop('feed_dict', {})
-  if variables is None:
-    values = sess.run([train_op, train_op_d] + list(kwargs.values()), feed_dict)
-    values = values[2:]
-  elif variables == "Gen":
-    kwargs['loss_d'] = 0.0
-    values = sess.run([train_op] + list(kwargs_temp.values()), feed_dict)
-    values = values[1:]
-  elif variables == "Disc":
-    kwargs['loss'] = 0.0
-    values = sess.run([train_op_d] + list(kwargs_temp.values()), feed_dict)
-    values = values[1:]
-  else:
-    raise NotImplementedError("variables must be None, 'Gen', or 'Disc'.")
-
-  if summarize is not None and n_print != 0:
-    if t == 1 or t % self.n_print == 0:
-      summary = sess.run(summarize, feed_dict)
-      train_writer.add_summary(summary, t)
-
-  return dict(zip(kwargs_temp.keys(), values))
-
-
-def _wgan_update(clip_op, variables=None, *args, **kwargs):
-  # TODO make sure increment_t and clipping is called after the update
-  # (e.g., with control_dependencies, for monte carlo)
-  info_dict = gan_update(variables=variables, *args, **kwargs)
-
-  sess = get_session()
-  if clip_op is not None and variables in (None, "Disc"):
-    sess.run(clip_op)
-
-  return info_dict
-
-
-def _build_n_accept(collections):
-  # TODO for monte carlo methods
-  n_accept = tf.Variable(0, trainable=False, name="n_accept")
-  n_accept_over_t = n_accept / t
-  if collections is not None:
-    tf.summary.scalar("n_accept", n_accept,
-                      collections=collections)
-  return n_accept_over_t
diff --git a/edward/inferences/klpq.py b/edward/inferences/klpq.py
index 965c35d56..620afecf5 100644
--- a/edward/inferences/klpq.py
+++ b/edward/inferences/klpq.py
@@ -6,8 +6,7 @@
 import tensorflow as tf
 
 from edward.inferences import docstrings as doc
-from edward.inferences.inference import (
-    call_function_up_to_args, make_intercept)
+from edward.inferences.util import call_function_up_to_args, make_intercept
 from edward.models.core import Trace
 
 try:
diff --git a/edward/inferences/klqp.py b/edward/inferences/klqp.py
index 8267e824a..02e3b5f68 100644
--- a/edward/inferences/klqp.py
+++ b/edward/inferences/klqp.py
@@ -6,8 +6,7 @@
 import tensorflow as tf
 
 from edward.inferences import docstrings as doc
-from edward.inferences.inference import (
-    call_function_up_to_args, make_intercept)
+from edward.inferences.util import call_function_up_to_args, make_intercept
 from edward.models.core import Trace
 
 try:
diff --git a/edward/inferences/klqp_implicit.py b/edward/inferences/klqp_implicit.py
index aeeb79fd7..1fcf1eb0f 100644
--- a/edward/inferences/klqp_implicit.py
+++ b/edward/inferences/klqp_implicit.py
@@ -6,8 +6,7 @@
 import tensorflow as tf
 
 from edward.inferences import docstrings as doc
-from edward.inferences.inference import (
-    call_function_up_to_args, make_intercept)
+from edward.inferences.util import call_function_up_to_args, make_intercept
 from edward.models.core import Trace
 
 
diff --git a/edward/inferences/laplace.py b/edward/inferences/laplace.py
index c2a6c70ab..bc061fda1 100644
--- a/edward/inferences/laplace.py
+++ b/edward/inferences/laplace.py
@@ -5,10 +5,9 @@
 import six
 import tensorflow as tf
 
-from edward.inferences import docstrings as doc
-from edward.inferences.inference import call_function_up_to_args
 from edward.inferences import docstrings as doc
 from edward.inferences.map import map
+from edward.inferences.util import call_function_up_to_args
 from edward.models.core import Trace
 from edward.models.queries import get_variables
 
diff --git a/edward/inferences/map.py b/edward/inferences/map.py
index 76610f4fa..8e1cda2a9 100644
--- a/edward/inferences/map.py
+++ b/edward/inferences/map.py
@@ -6,8 +6,7 @@
 import tensorflow as tf
 
 from edward.inferences import docstrings as doc
-from edward.inferences.inference import (
-    call_function_up_to_args, make_intercept)
+from edward.inferences.util import call_function_up_to_args, make_intercept
 from edward.models.core import Trace
 
 try:
diff --git a/edward/inferences/metropolis_hastings.py b/edward/inferences/metropolis_hastings.py
index 7ad8633b4..5fa9ffd5e 100644
--- a/edward/inferences/metropolis_hastings.py
+++ b/edward/inferences/metropolis_hastings.py
@@ -6,8 +6,7 @@
 import tensorflow as tf
 
 from edward.inferences import docstrings as doc
-from edward.inferences.inference import (
-    call_function_up_to_args, make_intercept)
+from edward.inferences.util import call_function_up_to_args, make_intercept
 from edward.models.core import Node, Trace
 
 tfp = tf.contrib.bayesflow
diff --git a/edward/inferences/sghmc.py b/edward/inferences/sghmc.py
index 0672578ac..43f7cfb5d 100644
--- a/edward/inferences/sghmc.py
+++ b/edward/inferences/sghmc.py
@@ -6,8 +6,7 @@
 import tensorflow as tf
 
 from edward.inferences import docstrings as doc
-from edward.inferences.inference import (
-    call_function_up_to_args, make_intercept)
+from edward.inferences.util import call_function_up_to_args, make_intercept
 from edward.models.core import Node, Trace
 
 
diff --git a/edward/inferences/sgld.py b/edward/inferences/sgld.py
index 93ac533e3..17af4fdb4 100644
--- a/edward/inferences/sgld.py
+++ b/edward/inferences/sgld.py
@@ -6,8 +6,7 @@
 import tensorflow as tf
 
 from edward.inferences import docstrings as doc
-from edward.inferences.inference import (
-    call_function_up_to_args, make_intercept)
+from edward.inferences.util import call_function_up_to_args, make_intercept
 from edward.models.core import Node, Trace
 
 tfp = tf.contrib.bayesflow
diff --git a/edward/inferences/util.py b/edward/inferences/util.py
new file mode 100644
index 000000000..9481a8d43
--- /dev/null
+++ b/edward/inferences/util.py
@@ -0,0 +1,145 @@
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import inspect
+import six
+import tensorflow as tf
+
+from edward.models.random_variable import RandomVariable
+from edward.models.core import TransformedDistribution
+
+tfb = tf.contrib.distributions.bijectors
+
+
+def call_function_up_to_args(f, *args, **kwargs):
+  """Call f, removing any args/kwargs it doesn't take as input.
+
+  Keyword arguments are kept only if `f` names them as parameters; the
+  remaining parameter slots are filled from the front of `args`.
+  """
+  # getargspec rejects annotated functions on Python 3 and is removed in
+  # 3.11; fall back to it only where getfullargspec is missing (Python 2).
+  getargspec = getattr(inspect, "getfullargspec", None) or inspect.getargspec
+  # For tf.make_template() objects, inspect the wrapped `_func` instead.
+  argspec = getargspec(getattr(f, "_func", f))
+  fkwargs = {k: v for k, v in six.iteritems(kwargs) if k in argspec.args}
+  num_args = len(argspec.args) - len(fkwargs)
+  if num_args > 0:
+    return f(*args[:num_args], **fkwargs)
+  return f(**fkwargs)
+
+
+def make_intercept(trace, align_data, align_latent, args, kwargs):
+  def _intercept(f, *fargs, **fkwargs):
+    """Set model's sample values to variational distribution's and data."""
+    name = fkwargs.get('name', None)
+    key = align_data(name)  # int: index into args; otherwise a kwargs key
+    if isinstance(key, int):
+      fkwargs['value'] = args[key]
+    elif kwargs.get(key, None) is not None:  # data passed by keyword
+      fkwargs['value'] = kwargs.get(key)
+    elif align_latent(name) is not None:  # name of the matching trace entry
+      qz = trace[align_latent(name)].value
+      if isinstance(qz, RandomVariable):
+        value = qz.value
+      else:  # e.g. replacement is Tensor
+        value = tf.convert_to_tensor(qz)
+      fkwargs['value'] = value
+    # if auto_transform and 'qz' in locals():
+    #   # TODO for generation to work, must output original dist. to
+    #   keep around TD? must maintain another stack to write to as a
+    #   side-effect (or augment the original stack).
+    #   return transform(f, qz, *fargs, **fkwargs)
+    return f(*fargs, **fkwargs)  # no match above: call f unchanged
+  return _intercept
+
+
+def transform(f, qz, *args, **kwargs):
+  """Transform prior -> unconstrained -> q's constraint.
+
+  In VI, q stays on its original space (so one intercept function suffices).
+  NOTE(review): rebound by the later `transform(x, ...)`; `_transform` unbound.
+  """
+  # TODO deal with f or qz being 'point' or 'points'
+  if (not hasattr(f, 'support') or not hasattr(qz, 'support') or
+          f.support == qz.support):
+    return f(*args, **kwargs)
+  value = kwargs.pop('value')
+  kwargs['value'] = 0.0  # to avoid sampling; TODO follow sample shape
+  rv = f(*args, **kwargs)
+  # Take shortcuts in logic if p or q are already unconstrained.
+  if qz.support in ('real', 'multivariate_real'):
+    return _transform(rv, value=value)
+  if rv.support in ('real', 'multivariate_real'):
+    rv_unconstrained = rv
+  else:
+    rv_unconstrained = _transform(rv, value=0.0)
+  unconstrained_to_constrained = tfb.Invert(_transform(qz).bijector)
+  return _transform(rv_unconstrained,
+                    unconstrained_to_constrained,
+                    value=value)
+
+
+def transform(x, *args, **kwargs):
+  """Transform a continuous random variable to the unconstrained space.
+
+  `transform` selects among a number of default transformations which
+  depend on the support of the provided random variable:
+
+  + $[0, 1]$ (e.g., Beta): Inverse of sigmoid.
+  + $[0, \\infty)$ (e.g., Gamma): Inverse of softplus.
+  + Simplex (e.g., Dirichlet): Inverse of softmax-centered.
+  + $(-\\infty, \\infty)$ (e.g., Normal, MultivariateNormalTriL): None.
+
+  Args:
+    x: RandomVariable.
+      Continuous random variable to transform.
+    *args, **kwargs:
+      Arguments to overwrite when forming the `TransformedDistribution`.
+      For example, manually specify the transformation by passing in
+      the `bijector` argument.
+
+  Returns:
+    RandomVariable.
+    A `TransformedDistribution` random variable, or the provided random
+    variable if no transformation was applied.
+
+  #### Examples
+
+  ```python
+  x = Gamma(1.0, 1.0)
+  y = transform(x)
+  sess = tf.Session()
+  sess.run(y)
+  -2.2279539
+  ```
+  """
+  if len(args) != 0 or kwargs.get('bijector', None) is not None:
+    return TransformedDistribution(x, *args, **kwargs)
+
+  try:
+    support = x.support
+  except AttributeError as e:
+    msg = """'{}' object has no 'support'
+             so cannot be transformed.""".format(type(x).__name__)
+    raise AttributeError(msg)
+
+  if support == '01':
+    bij = tfb.Invert(tfb.Sigmoid())
+    new_support = 'real'
+  elif support == 'nonnegative':
+    bij = tfb.Invert(tfb.Softplus())
+    new_support = 'real'
+  elif support == 'simplex':
+    bij = tfb.Invert(tfb.SoftmaxCentered(event_ndims=1))
+    new_support = 'multivariate_real'
+  elif support in ('real', 'multivariate_real'):
+    return x
+  else:
+    msg = "'transform' does not handle supports of type '{}'".format(support)
+    raise ValueError(msg)
+
+  new_x = TransformedDistribution(x, bij, *args, **kwargs)
+  new_x.support = new_support
+  return new_x
diff --git a/edward/inferences/wake_sleep.py b/edward/inferences/wake_sleep.py
index 132978f6b..5c8437f59 100644
--- a/edward/inferences/wake_sleep.py
+++ b/edward/inferences/wake_sleep.py
@@ -6,8 +6,7 @@
 import tensorflow as tf
 
 from edward.inferences import docstrings as doc
-from edward.inferences.inference import (
-    call_function_up_to_args, make_intercept)
+from edward.inferences.util import call_function_up_to_args, make_intercept
 from edward.models.core import Trace
 
 
diff --git a/edward/inferences/wgan_inference.py b/edward/inferences/wgan_inference.py
index 8ad6ad446..8a86af146 100644
--- a/edward/inferences/wgan_inference.py
+++ b/edward/inferences/wgan_inference.py
@@ -6,7 +6,7 @@
 import tensorflow as tf
 
 from edward.inferences import docstrings as doc
-from edward.inferences.inference import call_function_up_to_args
+from edward.inferences.util import call_function_up_to_args
 
 
 @doc.set_doc(
diff --git a/edward/util/__init__.py b/edward/util/__init__.py
index fb8f8833c..911aeb043 100644
--- a/edward/util/__init__.py
+++ b/edward/util/__init__.py
@@ -4,14 +4,12 @@
 from __future__ import division
 from __future__ import print_function
 
-from edward.util.random_variables import *
 from edward.util.tensorflow import *
 
 from tensorflow.python.util.all_util import remove_undocumented
 
 _allowed_symbols = [
     'get_control_variate_coef',
-    'transform',
 ]
 
 remove_undocumented(__name__, allowed_exception_list=_allowed_symbols)
diff --git a/edward/util/random_variables.py b/edward/util/random_variables.py
deleted file mode 100644
index 080501a8a..000000000
--- a/edward/util/random_variables.py
+++ /dev/null
@@ -1,73 +0,0 @@
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import tensorflow as tf
-
-from edward.models.core import TransformedDistribution
-
-tfb = tf.contrib.distributions.bijectors
-
-
-def transform(x, *args, **kwargs):
-  """Transform a continuous random variable to the unconstrained space.
-
-  `transform` selects among a number of default transformations which
-  depend on the support of the provided random variable:
-
-  + $[0, 1]$ (e.g., Beta): Inverse of sigmoid.
-  + $[0, \infty)$ (e.g., Gamma): Inverse of softplus.
-  + Simplex (e.g., Dirichlet): Inverse of softmax-centered.
-  + $(-\infty, \infty)$ (e.g., Normal, MultivariateNormalTriL): None.
-
-  Args:
-    x: RandomVariable.
-      Continuous random variable to transform.
-    *args, **kwargs:
-      Arguments to overwrite when forming the `TransformedDistribution`.
-      For example, manually specify the transformation by passing in
-      the `bijector` argument.
-
-  Returns:
-    RandomVariable.
-    A `TransformedDistribution` random variable, or the provided random
-    variable if no transformation was applied.
-
-  #### Examples
-
-  ```python
-  x = Gamma(1.0, 1.0)
-  y = ed.transform(x)
-  sess = tf.Session()
-  sess.run(y)
-  -2.2279539
-  ```
-  """
-  if len(args) != 0 or kwargs.get('bijector', None) is not None:
-    return TransformedDistribution(x, *args, **kwargs)
-
-  try:
-    support = x.support
-  except AttributeError as e:
-    msg = """'{}' object has no 'support'
-             so cannot be transformed.""".format(type(x).__name__)
-    raise AttributeError(msg)
-
-  if support == '01':
-    bij = tfb.Invert(tfb.Sigmoid())
-    new_support = 'real'
-  elif support == 'nonnegative':
-    bij = tfb.Invert(tfb.Softplus())
-    new_support = 'real'
-  elif support == 'simplex':
-    bij = tfb.Invert(tfb.SoftmaxCentered(event_ndims=1))
-    new_support = 'multivariate_real'
-  elif support in ('real', 'multivariate_real'):
-    return x
-  else:
-    msg = "'transform' does not handle supports of type '{}'".format(support)
-    raise ValueError(msg)
-
-  new_x = TransformedDistribution(x, bij, *args, **kwargs)
-  new_x.support = new_support
-  return new_x
diff --git a/setup.py b/setup.py
index 45bd5742f..3878d1bcd 100644
--- a/setup.py
+++ b/setup.py
@@ -17,7 +17,6 @@
     extras_require={
         'tensorflow': ['tensorflow>=1.2.0rc0'],
         'tensorflow with gpu': ['tensorflow-gpu>=1.2.0rc0'],
-        'neural networks': ['keras>=2.0.0', 'prettytensor>=0.7.4'],
         'datasets': ['observations>=0.1.2'],
         'notebooks': ['jupyter>=1.0.0'],
         'visualization': ['matplotlib>=1.3',
diff --git a/tests/util/transform_test.py b/tests/inferences/transform_test.py
similarity index 89%
rename from tests/util/transform_test.py
rename to tests/inferences/transform_test.py
index f541f5008..903f0bd81 100644
--- a/tests/util/transform_test.py
+++ b/tests/inferences/transform_test.py
@@ -7,6 +7,7 @@
 import tensorflow as tf
 
 from collections import namedtuple
+from edward.inferences.util import transform
 from edward.models import (
     Beta, Dirichlet, DirichletProcess, Gamma, MultivariateNormalDiag,
     Normal, Poisson, TransformedDistribution)
@@ -24,21 +25,21 @@ def assertSamplePosNeg(self, sample):
   def test_args(self):
     with self.test_session():
       x = Normal(-100.0, 1.0)
-      y = ed.transform(x, bijectors.Softplus())
+      y = transform(x, bijectors.Softplus())
       sample = y.sample(10).eval()
       self.assertTrue((sample >= 0.0).all())
 
   def test_kwargs(self):
     with self.test_session():
       x = Normal(-100.0, 1.0)
-      y = ed.transform(x, bijector=bijectors.Softplus())
+      y = transform(x, bijector=bijectors.Softplus())
       sample = y.sample(10).eval()
       self.assertTrue((sample >= 0.0).all())
 
   def test_01(self):
     with self.test_session():
       x = Beta(1.0, 1.0)
-      y = ed.transform(x)
+      y = transform(x)
       self.assertIsInstance(y, TransformedDistribution)
       sample = y.sample(10, seed=1).eval()
       self.assertSamplePosNeg(sample)
@@ -46,7 +47,7 @@ def test_01(self):
   def test_nonnegative(self):
     with self.test_session():
       x = Gamma(1.0, 1.0)
-      y = ed.transform(x)
+      y = transform(x)
       self.assertIsInstance(y, TransformedDistribution)
       sample = y.sample(10, seed=1).eval()
       self.assertSamplePosNeg(sample)
@@ -54,7 +55,7 @@ def test_nonnegative(self):
   def test_simplex(self):
     with self.test_session():
       x = Dirichlet([1.1, 1.2, 1.3, 1.4])
-      y = ed.transform(x)
+      y = transform(x)
       self.assertIsInstance(y, TransformedDistribution)
       sample = y.sample(10, seed=1).eval()
       self.assertSamplePosNeg(sample)
@@ -62,7 +63,7 @@ def test_simplex(self):
   def test_real(self):
     with self.test_session():
       x = Normal(0.0, 1.0)
-      y = ed.transform(x)
+      y = transform(x)
       self.assertIsInstance(y, Normal)
       sample = y.sample(10, seed=1).eval()
       self.assertSamplePosNeg(sample)
@@ -70,7 +71,7 @@ def test_real(self):
   def test_multivariate_real(self):
     with self.test_session():
       x = MultivariateNormalDiag(tf.zeros(2), tf.ones(2))
-      y = ed.transform(x)
+      y = transform(x)
       sample = y.sample(10, seed=1).eval()
       self.assertSamplePosNeg(sample)
 
@@ -78,14 +79,14 @@ def test_no_support(self):
     with self.test_session():
       x = DirichletProcess(1.0, Normal(0.0, 1.0))
       with self.assertRaises(AttributeError):
-        y = ed.transform(x)
+        y = transform(x)
 
   def test_unhandled_support(self):
     with self.test_session():
       FakeRV = namedtuple('FakeRV', ['support'])
       x = FakeRV(support='rational')
       with self.assertRaises(ValueError):
-        y = ed.transform(x)
+        y = transform(x)
 
 if __name__ == '__main__':
   tf.test.main()

From e07789b24641b4438e3593afa80806aac41d9738 Mon Sep 17 00:00:00 2001
From: Dustin Tran <dustinviettran@gmail.com>
Date: Sun, 4 Feb 2018 15:04:38 -0800
Subject: [PATCH 22/27] move get_control_variate_coef, last of edward.util

---
 edward/__init__.py                            |  7 +--
 edward/inferences/util.py                     | 38 ++++++++++++++++
 edward/util/__init__.py                       | 15 -------
 edward/util/tensorflow.py                     | 43 -------------------
 .../get_control_variate_coef_test.py          |  2 +-
 5 files changed, 40 insertions(+), 65 deletions(-)
 delete mode 100644 edward/util/__init__.py
 delete mode 100644 edward/util/tensorflow.py
 rename tests/{util => inferences}/get_control_variate_coef_test.py (89%)

diff --git a/edward/__init__.py b/edward/__init__.py
index 5d7e7b8ba..2b0122937 100644
--- a/edward/__init__.py
+++ b/edward/__init__.py
@@ -4,7 +4,6 @@
 
 from edward import inferences
 from edward import models
-from edward import util
 
 # Direct imports for convenience
 from edward.inferences import (
@@ -36,8 +35,6 @@
     get_variables,
     is_independent,
     random_variables)
-from edward.util import (
-    get_control_variate_coef)
 from edward.version import __version__, VERSION
 
 from tensorflow.python.util.all_util import remove_undocumented
@@ -46,7 +43,6 @@
 _allowed_symbols = [
     'inferences',
     'models',
-    'util',
     'bigan_inference',
     'complete_conditional',
     'gan_inference',
@@ -68,7 +64,6 @@
     'get_ancestors',
     'get_blanket',
     'get_children',
-    'get_control_variate_coef',
     'get_descendants',
     'get_parents',
     'get_siblings',
@@ -82,5 +77,5 @@
 # Remove all extra symbols that don't have a docstring or are not explicitly
 # referenced in the whitelist.
 remove_undocumented(__name__, _allowed_symbols, [
-    inferences, models, util
+    inferences, models
 ])
diff --git a/edward/inferences/util.py b/edward/inferences/util.py
index 9481a8d43..02e426059 100644
--- a/edward/inferences/util.py
+++ b/edward/inferences/util.py
@@ -143,3 +143,41 @@ def transform(x, *args, **kwargs):
   new_x = TransformedDistribution(x, bij, *args, **kwargs)
   new_x.support = new_support
   return new_x
+
+
+def get_control_variate_coef(f, h):
+  """Returns scalar used by control variates method for variance reduction in
+  Monte Carlo methods.
+
+  If we have a statistic $m$ as an unbiased estimator of $\\mu$
+  and another statistic $t$ which is an unbiased estimator of
+  $\\tau$ then $m^* = m - c(t - \\tau)$ is also an unbiased
+  estimator of $\\mu$ for any coefficient $c$.
+
+  This function calculates the optimal coefficient
+
+  $c^* = \\frac{\\text{Cov}(m,t)}{\\text{Var}(t)}$
+
+  for minimizing the variance of $m^*$, with `f` as $m$ and `h` as $t$.
+
+  Args:
+    f: tf.Tensor.
+      A 1-D tensor; its static first dimension is used as the sample size.
+    h: tf.Tensor.
+      A 1-D tensor.
+
+  Returns:
+    tf.Tensor.
+    A 0 rank tensor
+  """
+  f_mu = tf.reduce_mean(f)
+  h_mu = tf.reduce_mean(h)
+
+  n = f.shape[0].value  # static sample size
+
+  cov_fh = tf.reduce_sum((f - f_mu) * (h - h_mu)) / (n - 1)
+  var_h = tf.reduce_sum(tf.square(h - h_mu)) / (n - 1)
+
+  a = cov_fh / var_h  # Cov(f, h) / Var(h)
+
+  return a
diff --git a/edward/util/__init__.py b/edward/util/__init__.py
deleted file mode 100644
index 911aeb043..000000000
--- a/edward/util/__init__.py
+++ /dev/null
@@ -1,15 +0,0 @@
-"""
-"""
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from edward.util.tensorflow import *
-
-from tensorflow.python.util.all_util import remove_undocumented
-
-_allowed_symbols = [
-    'get_control_variate_coef',
-]
-
-remove_undocumented(__name__, allowed_exception_list=_allowed_symbols)
diff --git a/edward/util/tensorflow.py b/edward/util/tensorflow.py
deleted file mode 100644
index 03a8c9d41..000000000
--- a/edward/util/tensorflow.py
+++ /dev/null
@@ -1,43 +0,0 @@
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import tensorflow as tf
-
-
-def get_control_variate_coef(f, h):
-  """Returns scalar used by control variates method for variance reduction in
-  Monte Carlo methods.
-
-  If we have a statistic $m$ as an unbiased estimator of $\mu$ and
-  and another statistic $t$ which is an unbiased estimator of
-  $\\tau$ then $m^* = m + c(t - \\tau)$ is also an unbiased
-  estimator of $\mu$ for any coefficient $c$.
-
-  This function calculates the optimal coefficient
-
-  $c^* = \\frac{\\text{Cov}(m,t)}{\\text{Var}(t)}$
-
-  for minimizing the variance of $m^*$.
-
-  Args:
-    f: tf.Tensor.
-      A 1-D tensor.
-    h: tf.Tensor.
-      A 1-D tensor.
-
-  Returns:
-    tf.Tensor.
-    A 0 rank tensor
-  """
-  f_mu = tf.reduce_mean(f)
-  h_mu = tf.reduce_mean(h)
-
-  n = f.shape[0].value
-
-  cov_fh = tf.reduce_sum((f - f_mu) * (h - h_mu)) / (n - 1)
-  var_h = tf.reduce_sum(tf.square(h - h_mu)) / (n - 1)
-
-  a = cov_fh / var_h
-
-  return a
diff --git a/tests/util/get_control_variate_coef_test.py b/tests/inferences/get_control_variate_coef_test.py
similarity index 89%
rename from tests/util/get_control_variate_coef_test.py
rename to tests/inferences/get_control_variate_coef_test.py
index 5138f20e8..a502b7b28 100644
--- a/tests/util/get_control_variate_coef_test.py
+++ b/tests/inferences/get_control_variate_coef_test.py
@@ -4,7 +4,7 @@
 
 import tensorflow as tf
 
-from edward.util.tensorflow import get_control_variate_coef
+from edward.inferences.util import get_control_variate_coef
 
 
 class test_get_control_variate_coef(tf.test.TestCase):

From 04f649b46a0544c50c00c3c727693ff9c0dd220d Mon Sep 17 00:00:00 2001
From: Dustin Tran <dustinviettran@gmail.com>
Date: Thu, 1 Feb 2018 18:02:25 -0800
Subject: [PATCH 23/27] add core with trace v2

---
 edward/__init__.py                       |   6 +-
 edward/inferences/bigan_inference.py     |   6 +-
 edward/inferences/hmc.py                 |   7 +-
 edward/inferences/klpq.py                |  10 +-
 edward/inferences/klqp.py                |  29 ++--
 edward/inferences/klqp_implicit.py       |  14 +-
 edward/inferences/laplace.py             |  16 +-
 edward/inferences/map.py                 |  10 +-
 edward/inferences/metropolis_hastings.py |  17 +--
 edward/inferences/sghmc.py               |   7 +-
 edward/inferences/sgld.py                |   7 +-
 edward/inferences/wake_sleep.py          |  16 +-
 edward/models/__init__.py                |   4 +-
 edward/models/core.py                    | 183 ++++++++---------------
 14 files changed, 123 insertions(+), 209 deletions(-)

diff --git a/edward/__init__.py b/edward/__init__.py
index 2b0122937..661f83722 100644
--- a/edward/__init__.py
+++ b/edward/__init__.py
@@ -25,7 +25,6 @@
     wake_sleep,
     wgan_inference)
 from edward.models import (
-    Trace,
     get_ancestors,
     get_blanket,
     get_children,
@@ -34,7 +33,8 @@
     get_siblings,
     get_variables,
     is_independent,
-    random_variables)
+    random_variables,
+    trace)
 from edward.version import __version__, VERSION
 
 from tensorflow.python.util.all_util import remove_undocumented
@@ -60,7 +60,6 @@
     'sgld',
     'wake_sleep',
     'wgan_inference',
-    'Trace',
     'get_ancestors',
     'get_blanket',
     'get_children',
@@ -70,6 +69,7 @@
     'get_variables',
     'is_independent',
     'random_variables',
+    'trace',
     '__version__',
     'VERSION',
 ]
diff --git a/edward/inferences/bigan_inference.py b/edward/inferences/bigan_inference.py
index 2e415c353..96fe636bb 100644
--- a/edward/inferences/bigan_inference.py
+++ b/edward/inferences/bigan_inference.py
@@ -7,7 +7,7 @@
 
 from edward.inferences import docstrings as doc
 from edward.inferences.util import call_function_up_to_args
-from edward.models.core import Trace
+from edward.models.core import trace
 
 
 @doc.set_doc(
@@ -77,8 +77,8 @@ def discriminator(x):
       x_data=x_data)
   ```
   """
-  with Trace() as posterior_trace:
-    call_function_up_to_args(variational, *args, **kwargs)
+  posterior_trace = trace(variational, *args, **kwargs)
+  # TODO: port the block below to trace(); Trace is no longer imported here.
   with Trace() as model_trace:
     x_fake = call_function_up_to_args(model, *args, **kwargs)
 
diff --git a/edward/inferences/hmc.py b/edward/inferences/hmc.py
index 94c490782..4f0e760b7 100644
--- a/edward/inferences/hmc.py
+++ b/edward/inferences/hmc.py
@@ -6,8 +6,8 @@
 import tensorflow as tf
 
 from edward.inferences import docstrings as doc
-from edward.inferences.util import call_function_up_to_args, make_intercept
-from edward.models.core import Node, Trace
+from edward.inferences.util import make_intercept
+from edward.models.core import Node, trace
 
 tfp = tf.contrib.bayesflow
 
@@ -109,8 +109,7 @@ def _target_log_prob_fn(*fargs):
                        for state, arg in zip(states, fargs)}
     intercept = make_intercept(
         posterior_trace, align_data, align_latent, args, kwargs)
-    with Trace(intercept=intercept) as model_trace:
-      call_function_up_to_args(model, *args, **kwargs)
+    model_trace = trace(model, intercept=intercept, *args, **kwargs)
 
     p_log_prob = 0.0
     for name, node in six.iteritems(model_trace):
diff --git a/edward/inferences/klpq.py b/edward/inferences/klpq.py
index 620afecf5..0072b6dd1 100644
--- a/edward/inferences/klpq.py
+++ b/edward/inferences/klpq.py
@@ -6,8 +6,8 @@
 import tensorflow as tf
 
 from edward.inferences import docstrings as doc
-from edward.inferences.util import call_function_up_to_args, make_intercept
-from edward.models.core import Trace
+from edward.inferences.util import make_intercept
+from edward.models.core import trace
 
 try:
   from edward.models import Normal
@@ -98,12 +98,10 @@ def variational():
   p_log_prob = [0.0] * n_samples
   q_log_prob = [0.0] * n_samples
   for s in range(n_samples):
-    with Trace() as posterior_trace:
-      call_function_up_to_args(variational, *args, **kwargs)
+    posterior_trace = trace(variational, *args, **kwargs)
     intercept = make_intercept(
         posterior_trace, align_data, align_latent, args, kwargs)
-    with Trace(intercept=intercept) as model_trace:
-      call_function_up_to_args(model, *args, **kwargs)
+    model_trace = trace(model, intercept=intercept, *args, **kwargs)
 
     for name, node in six.iteritems(model_trace):
       rv = node.value
diff --git a/edward/inferences/klqp.py b/edward/inferences/klqp.py
index 02e3b5f68..5d9de07f4 100644
--- a/edward/inferences/klqp.py
+++ b/edward/inferences/klqp.py
@@ -6,8 +6,8 @@
 import tensorflow as tf
 
 from edward.inferences import docstrings as doc
-from edward.inferences.util import call_function_up_to_args, make_intercept
-from edward.models.core import Trace
+from edward.inferences.util import make_intercept
+from edward.models.core import trace
 
 try:
   from edward.models import Normal
@@ -114,12 +114,10 @@ def variational():
   surrogate_loss = [None] * n_samples
   kl_penalty = 0.0
   for s in range(n_samples):
-    with Trace() as posterior_trace:
-      call_function_up_to_args(variational, *args, **kwargs)
+    posterior_trace = trace(variational, *args, **kwargs)
     intercept = make_intercept(
         posterior_trace, align_data, align_latent, args, kwargs)
-    with Trace(intercept=intercept) as model_trace:
-      call_function_up_to_args(model, *args, **kwargs)
+    model_trace = trace(model, intercept=intercept, *args, **kwargs)
 
     # Collect key-value pairs of (rv, rv's (scaled) log prob).
     p_dict = {}
@@ -252,13 +250,10 @@ def variational():
   p_log_prob = [0.0] * n_samples
   q_log_prob = [0.0] * n_samples
   for s in range(n_samples):
-    with Trace() as posterior_trace:
-      call_function_up_to_args(variational, *args, **kwargs)
+    posterior_trace = trace(variational, *args, **kwargs)
     intercept = make_intercept(
         posterior_trace, align_data, align_latent, args, kwargs)
-    with Trace(intercept=intercept) as model_trace:
-      call_function_up_to_args(model, *args, **kwargs)
-
+    model_trace = trace(model, intercept=intercept, *args, **kwargs)
     for name, node in six.iteritems(model_trace):
       rv = node.value
       scale_factor = scale(name)
@@ -358,12 +353,10 @@ def variational():
   """
   p_log_lik = [0.0] * n_samples
   for s in range(n_samples):
-    with Trace() as posterior_trace:
-      call_function_up_to_args(variational, *args, **kwargs)
+    posterior_trace = trace(variational, *args, **kwargs)
     intercept = make_intercept(
         posterior_trace, align_data, align_latent, args, kwargs)
-    with Trace(intercept=intercept) as model_trace:
-      call_function_up_to_args(model, *args, **kwargs)
+    model_trace = trace(model, intercept=intercept, *args, **kwargs)
 
     for name, node in six.iteritems(model_trace):
       if align_data(name) is not None:
@@ -465,12 +458,10 @@ def variational():
   p_log_prob = [0.0] * n_samples
   q_log_prob = [0.0] * n_samples
   for s in range(n_samples):
-    with Trace() as posterior_trace:
-      call_function_up_to_args(variational, *args, **kwargs)
+    posterior_trace = trace(variational, *args, **kwargs)
     intercept = make_intercept(
         posterior_trace, align_data, align_latent, args, kwargs)
-    with Trace(intercept=intercept) as model_trace:
-      call_function_up_to_args(model, *args, **kwargs)
+    model_trace = trace(model, intercept=intercept, *args, **kwargs)
 
     for name, node in six.iteritems(model_trace):
       rv = node.value
diff --git a/edward/inferences/klqp_implicit.py b/edward/inferences/klqp_implicit.py
index 1fcf1eb0f..7dbe1cb47 100644
--- a/edward/inferences/klqp_implicit.py
+++ b/edward/inferences/klqp_implicit.py
@@ -6,8 +6,8 @@
 import tensorflow as tf
 
 from edward.inferences import docstrings as doc
-from edward.inferences.util import call_function_up_to_args, make_intercept
-from edward.models.core import Trace
+from edward.inferences.util import make_intercept
+from edward.models.core import trace
 
 
 @doc.set_doc(
@@ -163,14 +163,12 @@ def ratio_estimator(data, local_vars, global_vars):
   else:
     raise ValueError('Ratio loss not found:', ratio_loss)
 
-  with Trace() as posterior_trace:
-    call_function_up_to_args(variational, *args, **kwargs)
+  posterior_trace = trace(variational, *args, **kwargs)
+  # Intercept model's global latent variables and set to posterior
+  # samples (but not its locals).
   global_intercept = make_intercept(
       posterior_trace, align_data, align_latent_global, args, kwargs)
-  with Trace(intercept=global_intercept) as model_trace:
-    # Intercept model's global latent variables and set to posterior
-    # samples (but not its locals).
-    call_function_up_to_args(model, *args, **kwargs)
+  model_trace = trace(model, intercept=global_intercept, *args, **kwargs)
 
   # Collect tensors used in calculation of losses.
   pbeta_log_prob = 0.0
diff --git a/edward/inferences/laplace.py b/edward/inferences/laplace.py
index bc061fda1..6661ddd2a 100644
--- a/edward/inferences/laplace.py
+++ b/edward/inferences/laplace.py
@@ -7,8 +7,7 @@
 
 from edward.inferences import docstrings as doc
 from edward.inferences.map import map
-from edward.inferences.util import call_function_up_to_args
-from edward.models.core import Trace
+from edward.models.core import trace
 from edward.models.queries import get_variables
 
 try:
@@ -96,12 +95,11 @@ def _finalize(loss, variational):
 
   Computes the Hessian at the mode.
   """
-  with Trace() as trace:
-    call_function_up_to_args(variational, *args, **kwargs)
+  posterior_trace = trace(variational, *args, **kwargs)
   hessians = tf.hessians(
-      loss, [node.value.loc for node in six.itervalues(trace)])
+      loss, [node.value.loc for node in six.itervalues(posterior_trace)])
   finalize_ops = []
-  for qz, hessian in zip(six.itervalues(trace), hessians):
+  for qz, hessian in zip(six.itervalues(posterior_trace), hessians):
     if isinstance(qz, (MultivariateNormalDiag, Normal)):
       scale_var = get_variables(qz.variance())[0]
       scale = 1.0 / tf.diag_part(hessian)
@@ -119,11 +117,9 @@ def _make_variational_pointmass(variational, *args, **kwargs):
 
   We assume all latent variables are traceable in one execution.
   """
-  with Trace() as trace:
-    call_function_up_to_args(variational, *args, **kwargs)
-
+  posterior_trace = trace(variational, *args, **kwargs)
   def variational_pointmass(*args, **kwargs):
-    for name, node in six.iteritems(trace):
+    for name, node in six.iteritems(posterior_trace):
       qz = node.value
       qz_pointmass = PointMass(params=qz.loc,
                                name=qz.name + "_pointmass",
diff --git a/edward/inferences/map.py b/edward/inferences/map.py
index 8e1cda2a9..7b3b5f17b 100644
--- a/edward/inferences/map.py
+++ b/edward/inferences/map.py
@@ -6,8 +6,8 @@
 import tensorflow as tf
 
 from edward.inferences import docstrings as doc
-from edward.inferences.util import call_function_up_to_args, make_intercept
-from edward.models.core import Trace
+from edward.inferences.util import make_intercept
+from edward.models.core import trace
 
 try:
   from tensorflow.contrib.distributions import bijectors
@@ -98,12 +98,10 @@ def variational():
   performing MAP on the unconstrained space: in general, the MAP of
   the transform is not the transform of the MAP.
   """
-  with Trace() as posterior_trace:
-    call_function_up_to_args(variational, *args, **kwargs)
+  posterior_trace = trace(variational, *args, **kwargs)
   intercept = make_intercept(
       posterior_trace, align_data, align_latent, args, kwargs)
-  with Trace(intercept=intercept) as model_trace:
-    call_function_up_to_args(model, *args, **kwargs)
+  model_trace = trace(model, intercept=intercept, *args, **kwargs)
 
   p_log_prob = 0.0
   for name, node in six.iteritems(model_trace):
diff --git a/edward/inferences/metropolis_hastings.py b/edward/inferences/metropolis_hastings.py
index 5fa9ffd5e..d2821fb13 100644
--- a/edward/inferences/metropolis_hastings.py
+++ b/edward/inferences/metropolis_hastings.py
@@ -6,8 +6,8 @@
 import tensorflow as tf
 
 from edward.inferences import docstrings as doc
-from edward.inferences.util import call_function_up_to_args, make_intercept
-from edward.models.core import Node, Trace
+from edward.inferences.util import make_intercept
+from edward.models.core import Node, trace
 
 tfp = tf.contrib.bayesflow
 
@@ -109,8 +109,7 @@ def _target_log_prob_fn(*fargs):
                        for state, arg in zip(states, fargs)}
     intercept = make_intercept(
         posterior_trace, align_data, align_latent, args, kwargs)
-    with Trace(intercept=intercept) as model_trace:
-      call_function_up_to_args(model, *args, **kwargs)
+    model_trace = trace(model, intercept=intercept, *args, **kwargs)
 
     global inverse_align_latent
     inverse_align_latent = {}
@@ -131,9 +130,8 @@ def _proposal_fn(*fargs):
     returns same size and order as inputted states.
     """
     global inverse_align_latent
-    with Trace() as new_trace:
-      # Build g(new | old): new states are drawn given old states as input.
-      call_function_up_to_args(proposal, *fargs)
+    # Build g(new | old): new states are drawn given old states as input.
+    new_trace = trace(proposal, *fargs)
     new_states = []
     old_proposal_trace = {}
     for state, farg in zip(states, fargs):
@@ -145,9 +143,8 @@ def _proposal_fn(*fargs):
     align_latent = lambda name: name if name in old_proposal_trace else None
     intercept = make_intercept(
         old_proposal_trace, align_data, align_latent, args, kwargs)
-    with Trace(intercept=intercept) as old_trace:
-      # Build g(old | new): `value`s set to old states; new states are input.
-      call_function_up_to_args(proposal, *new_states)
+    # Build g(old | new): `value`s set to old states; new states are input.
+    old_trace = trace(proposal, intercept=intercept, *new_states)
     old_states = [old_trace[align_proposal(
                     inverse_align_latent[state.name.split(':')[0]])].value
                   for state in states]
diff --git a/edward/inferences/sghmc.py b/edward/inferences/sghmc.py
index 43f7cfb5d..bb1ca8df4 100644
--- a/edward/inferences/sghmc.py
+++ b/edward/inferences/sghmc.py
@@ -6,8 +6,8 @@
 import tensorflow as tf
 
 from edward.inferences import docstrings as doc
-from edward.inferences.util import call_function_up_to_args, make_intercept
-from edward.models.core import Node, Trace
+from edward.inferences.util import make_intercept
+from edward.models.core import Node, trace
 
 
 @doc.set_doc(
@@ -136,8 +136,7 @@ def _target_log_prob_fn(*fargs):
                        for state, arg in zip(states, fargs)}
     intercept = make_intercept(
         posterior_trace, align_data, align_latent, args, kwargs)
-    with Trace(intercept=intercept) as model_trace:
-      call_function_up_to_args(model, *args, **kwargs)
+    model_trace = trace(model, intercept=intercept, *args, **kwargs)
 
     p_log_prob = 0.0
     for name, node in six.iteritems(model_trace):
diff --git a/edward/inferences/sgld.py b/edward/inferences/sgld.py
index 17af4fdb4..c897aa443 100644
--- a/edward/inferences/sgld.py
+++ b/edward/inferences/sgld.py
@@ -6,8 +6,8 @@
 import tensorflow as tf
 
 from edward.inferences import docstrings as doc
-from edward.inferences.util import call_function_up_to_args, make_intercept
-from edward.models.core import Node, Trace
+from edward.inferences.util import make_intercept
+from edward.models.core import Node, trace
 
 tfp = tf.contrib.bayesflow
 
@@ -118,8 +118,7 @@ def _target_log_prob_fn(*fargs):
                        for state, arg in zip(states, fargs)}
     intercept = make_intercept(
         posterior_trace, align_data, align_latent, args, kwargs)
-    with Trace(intercept=intercept) as model_trace:
-      call_function_up_to_args(model, *args, **kwargs)
+    model_trace = trace(model, intercept=intercept, *args, **kwargs)
 
     p_log_prob = 0.0
     for name, node in six.iteritems(model_trace):
diff --git a/edward/inferences/wake_sleep.py b/edward/inferences/wake_sleep.py
index 5c8437f59..22bac13c7 100644
--- a/edward/inferences/wake_sleep.py
+++ b/edward/inferences/wake_sleep.py
@@ -6,8 +6,8 @@
 import tensorflow as tf
 
 from edward.inferences import docstrings as doc
-from edward.inferences.util import call_function_up_to_args, make_intercept
-from edward.models.core import Trace
+from edward.inferences.util import make_intercept
+from edward.models.core import trace
 
 
 @doc.set_doc(
@@ -98,12 +98,10 @@ def variational(x):
   p_log_prob = [0.0] * n_samples
   q_log_prob = [0.0] * n_samples
   for s in range(n_samples):
-    with Trace() as posterior_trace:
-      call_function_up_to_args(variational, *args, **kwargs)
+    posterior_trace = trace(variational, *args, **kwargs)
     intercept = make_intercept(
         posterior_trace, align_data, align_latent, args, kwargs)
-    with Trace(intercept=intercept) as model_trace:
-      call_function_up_to_args(model, *args, **kwargs)
+    model_trace = trace(model, intercept=intercept, *args, **kwargs)
 
     for name, node in six.iteritems(model_trace):
       rv = node.value
@@ -117,12 +115,10 @@ def variational(x):
             scale_factor * qz.log_prob(tf.stop_gradient(qz.value)))
 
     if phase_q == 'sleep':
-      with Trace() as model_trace:
-        call_function_up_to_args(model, *args, **kwargs)
+      model_trace = trace(model, *args, **kwargs)
       intercept = _make_sleep_intercept(
           model_trace, align_data, align_latent, args, kwargs)
-      with Trace(intercept=intercept) as posterior_trace:
-        call_function_up_to_args(variational, *args, **kwargs)
+      posterior_trace = trace(variational, intercept=intercept, *args, **kwargs)
 
       # Build dictionary to return scale factor for a posterior
       # variable via its corresponding prior. The implementation is
diff --git a/edward/models/__init__.py b/edward/models/__init__.py
index 0060281c9..6605275f3 100644
--- a/edward/models/__init__.py
+++ b/edward/models/__init__.py
@@ -5,15 +5,14 @@
 from __future__ import print_function
 
 from edward.models.core import *
-from edward.models.random_variable import *
 from edward.models.queries import *
+from edward.models.random_variable import *
 
 from tensorflow.python.util.all_util import remove_undocumented
 from edward.models import core as _module
 
 _allowed_symbols = [
     'RandomVariable',
-    'Trace',
     'get_ancestors',
     'get_blanket',
     'get_children',
@@ -23,6 +22,7 @@
     'get_variables',
     'is_independent',
     'random_variables',
+    'trace',
 ]
 for name in dir(_module):
   obj = getattr(_module, name)
diff --git a/edward/models/core.py b/edward/models/core.py
index e3c7e5886..9c7151ea3 100644
--- a/edward/models/core.py
+++ b/edward/models/core.py
@@ -9,87 +9,11 @@
 from tensorflow.contrib import distributions as _distributions
 
 
-class Trace(object):
-  """Context manager with two objects:
-
-  + The trace stack stores executions from each primitive fn.
-  + (Optional) The intercept callable intercepts the continuation of a function.
-
-  Optionally, the trace stack stores the function call, its inputs, and
-  its parent primitives. This lets us trace the continuation
-  structure. Storing inputs can be memory-intensive as it prevents
-  garbage collection; hence it's optional.
-  """
-  def __init__(self, intercept=None, trace_continuation=False):
-    self._intercept = intercept
-    self._trace_continuation = trace_continuation
-    # We use OrderedDict. It is essentially a stack where each element is a node
-    # (value) and its name (key); the name is a pointer to the node.
-    self._trace_stack = _collections.OrderedDict({})
-
-  def __enter__(self):
-    # Note if Trace's are nested, global vars are set
-    # to the innermost context's variables.
-    if self._intercept is not None:
-      global _INTERCEPT
-      _INTERCEPT = self._intercept
-    global _TRACE_CONTINUATION, _TRACE_STACK
-    _TRACE_CONTINUATION = self._trace_continuation
-    _TRACE_STACK = self._trace_stack
-    return self
-
-  def __exit__(self, t, v, tb):
-    global _INTERCEPT, _TRACE_CONTINUATION, _TRACE_STACK
-    try:
-      del _INTERCEPT
-    except:
-      pass
-    del _TRACE_CONTINUATION
-    del _TRACE_STACK
-
-  # operator-overloading for convenience
-  def __repr__(self):
-    return self._trace_stack.__repr__()
-
-  def __str__(self):
-    return self._trace_stack.__str__()
-
-  def __delitem__(self, key):
-    del self._trace_stack[key]
-
-  def __getitem__(self, key):
-    return self._trace_stack[key]
-
-  def __setitem__(self, key, value):
-    self._trace_stack[key] = value
-
-  def get(self, key, value=None):
-    return self._trace_stack.get(key, value)
-
-  def iteritems(self):
-    return self._trace_stack.items()
-
-  def iterkeys(self):
-    return self._trace_stack.keys()
-
-  def itervalues(self):
-    return self._trace_stack.values()
-
-  def items(self):
-    return self._trace_stack.items()
-
-  def keys(self):
-    return self._trace_stack.keys()
-
-  def values(self):
-    return self._trace_stack.values()
-
-
 class Node(object):
-  """Node in trace stack. Collection of nodes forms a directed acyclic graph."""
+  """Node in execution trace. A trace's nodes form a directed acyclic graph."""
   __slots__ = ['value', 'f', 'args', 'kwargs', 'parents']
 
-  def __init__(self, value, f=None, args=None, kwargs=None, parents=None):
+  def __init__(self, value, f, args, kwargs, parents):
     self.value = value
     self.f = f
     self.args = args
@@ -97,52 +21,71 @@ def __init__(self, value, f=None, args=None, kwargs=None, parents=None):
     self.parents = parents
 
 
-def primitive(fn):
-  """Wraps function so its continuation can be intercepted
-  and its execution can be written to a stack.
-
-  Apply this to decorate primitive functions.
-  """
-  def wrapped_fn(*args, **kwargs):
-    global _INTERCEPT, _TRACE_CONTINUATION, _TRACE_STACK
-    if '_INTERCEPT' in globals():
-      out = _INTERCEPT(fn, *args, **kwargs)
+def primitive(cls_init):
+  """Wraps class __init__ for recording and intercepting."""
+  def __init__(self, *args, **kwargs):
+    global _INTERCEPT, _STORE_ARGS, _TRACE_STACK
+    if '_INTERCEPT' in globals() and callable(_INTERCEPT):
+      _INTERCEPT(cls_init, self, *args, **kwargs)
     else:
-      out = fn(*args, **kwargs)
-    if '_TRACE_CONTINUATION' in globals() and '_TRACE_STACK' in globals():
-      if _TRACE_CONTINUATION:
+      cls_init(self, *args, **kwargs)
+    if '_STORE_ARGS' in globals() and '_TRACE_STACK' in globals():
+      if _STORE_ARGS:
         parents = [v for v in list(args) + kwargs.values()
                    if hasattr(v, "name") and v.name in _TRACE_STACK]
-        _TRACE_STACK[out.name] = Node(out, fn, args, kwargs, parents)
+        _TRACE_STACK[self.name] = Node(self, cls_init, args, kwargs, parents)
       else:
-        _TRACE_STACK[out.name] = Node(out)
-    return out
-  return wrapped_fn
-
-
-# TODO(trandustin): wrapping via init, not primitive() so wrapped
-# class still belongs in RandomVariable. Is this distinction
-# necessary?
-def _primitive_cls(__init__):
-  """Wraps class' __init__ so its continuation can be intercepted
-  and its execution can be written to a stack.
-
-  Apply this to decorate primitive classes.
+        _TRACE_STACK[self.name] = Node(self, None, None, None, None)
+  return __init__
+
+
+def trace(f, *args, **kwargs):
+  """Traces the function `f(*args, **kwargs)`.
+
+  Args:
+    f: Function to trace.
+    intercept: Function to intercept primitives. It takes a primitive
+      function `f`, inputs `args, kwargs`, and may return any value and/or
+      add side-effects. Default is `None`, equivalent to `f(*args, **kwargs)`.
+    store_args: Boolean for whether `Node`s store their inputs and parent
+      primitives. Default is `False`.
+    args, kwargs: (Possible) inputs to function.
+
+  Returns:
+    The execution trace of `f`, collecting any `primitive` operations that the
+    function executed. It is reified as a stack (`OrderedDict`), and each
+    executed primitive is a `Node` on the stack indexed by its string name.
+
+  #### Examples
+
+  ```python
+  def f(x):
+    y = Poisson(rate=x, name="y")
+
+  def intercept(f, *args, **kwargs):
+    if kwargs.get("name") == "y":
+      kwargs["value"] = 42
+    return f(*args, **kwargs)
+
+  trace_stack = ed.trace(f, 1.5, intercept=intercept)
+  print(trace_stack)
+  ## OrderedDict([('y', <edward.models.core.Node object at 0x118c1ce10>)])
+
+  rv = trace_stack["y"].value
+  with tf.Session() as sess:
+    assert sess.run(rv.value) == 42
+  ```
   """
-  def wrapped_fn(self, *args, **kwargs):
-    global _INTERCEPT, _TRACE_CONTINUATION, _TRACE_STACK
-    if '_INTERCEPT' in globals():
-      _INTERCEPT(__init__, self, *args, **kwargs)
-    else:
-      __init__(self, *args, **kwargs)
-    if '_TRACE_CONTINUATION' in globals() and '_TRACE_STACK' in globals():
-      if _TRACE_CONTINUATION:
-        parents = [v for v in list(args) + kwargs.values()
-                   if hasattr(v, "name") and v.name in _TRACE_STACK]
-        _TRACE_STACK[self.name] = Node(self, __init__, args, kwargs, parents)
-      else:
-        _TRACE_STACK[self.name] = Node(self)
-  return wrapped_fn
+  # TODO move call_function_up_to_args
+  from edward.inferences.util import call_function_up_to_args
+  global _INTERCEPT, _STORE_ARGS, _TRACE_STACK
+  _INTERCEPT = kwargs.pop("intercept", None)
+  _STORE_ARGS = kwargs.pop("store_args", False)
+  _TRACE_STACK = _collections.OrderedDict({})
+  call_function_up_to_args(f, *args, **kwargs)
+  output = _TRACE_STACK
+  del _INTERCEPT, _STORE_ARGS, _TRACE_STACK
+  return output
 
 
 # Automatically generate random variable classes from classes in
@@ -156,7 +99,7 @@ def wrapped_fn(self, *args, **kwargs):
 
     # write a new __init__ method in order to decorate class as primitive
     # and share _candidate's docstring
-    @_primitive_cls
+    @primitive
     def __init__(self, *args, **kwargs):
       _RandomVariable.__init__(self, *args, **kwargs)
     __init__.__doc__ = _candidate.__init__.__doc__

From a13b765993a7a1a5c27632a000760ce520dbd51e Mon Sep 17 00:00:00 2001
From: Dustin Tran <dustinviettran@gmail.com>
Date: Sun, 4 Feb 2018 11:22:39 -0800
Subject: [PATCH 24/27] add core with trace v3

---
 edward/__init__.py                            |   6 +-
 edward/inferences/bigan_inference.py          |  20 +--
 edward/inferences/gan_inference.py            |   5 +-
 edward/inferences/hmc.py                      |  43 ++----
 edward/inferences/klpq.py                     |  25 ++--
 edward/inferences/klqp.py                     |  84 ++++++------
 edward/inferences/klqp_implicit.py            |  44 +++---
 edward/inferences/laplace.py                  |  14 +-
 edward/inferences/map.py                      |  20 ++-
 edward/inferences/metropolis_hastings.py      |  82 ++++++------
 edward/inferences/sghmc.py                    |  43 ++----
 edward/inferences/sgld.py                     |  41 ++----
 edward/inferences/util.py                     | 126 ++++++++++++++----
 edward/inferences/wake_sleep.py               |  53 +++-----
 edward/inferences/wgan_inference.py           |   5 +-
 edward/models/__init__.py                     |   2 +-
 edward/models/core.py                         |  82 ++++--------
 edward/models/queries.py                      |  31 ++++-
 edward/models/random_variable.py              |  13 +-
 ...e_test.py => call_with_manipulate_test.py} |  13 +-
 20 files changed, 369 insertions(+), 383 deletions(-)
 rename tests/models/{trace_test.py => call_with_manipulate_test.py} (69%)

diff --git a/edward/__init__.py b/edward/__init__.py
index 661f83722..16e10475b 100644
--- a/edward/__init__.py
+++ b/edward/__init__.py
@@ -25,6 +25,7 @@
     wake_sleep,
     wgan_inference)
 from edward.models import (
+    call_with_manipulate,
     get_ancestors,
     get_blanket,
     get_children,
@@ -33,8 +34,7 @@
     get_siblings,
     get_variables,
     is_independent,
-    random_variables,
-    trace)
+    random_variables)
 from edward.version import __version__, VERSION
 
 from tensorflow.python.util.all_util import remove_undocumented
@@ -44,6 +44,7 @@
     'inferences',
     'models',
     'bigan_inference',
+    'call_with_manipulate',
     'complete_conditional',
     'gan_inference',
     'hmc',
@@ -69,7 +70,6 @@
     'get_variables',
     'is_independent',
     'random_variables',
-    'trace',
     '__version__',
     'VERSION',
 ]
diff --git a/edward/inferences/bigan_inference.py b/edward/inferences/bigan_inference.py
index 96fe636bb..1c6e6187c 100644
--- a/edward/inferences/bigan_inference.py
+++ b/edward/inferences/bigan_inference.py
@@ -6,8 +6,8 @@
 import tensorflow as tf
 
 from edward.inferences import docstrings as doc
-from edward.inferences.util import call_function_up_to_args
-from edward.models.core import trace
+from edward.inferences.util import (
+    call_with_trace, make_optional_inputs, toposort)
 
 
 @doc.set_doc(
@@ -65,6 +65,7 @@ def variational(x_data):
                 scale=tf.nn.softplus(net[:, 25:]),
                 sample_shape=[256,],
                 name="qz")
+    return qz
 
   def discriminator(x):
     net = tf.layers.dense(x, 256, activation=tf.nn.relu)
@@ -77,10 +78,9 @@ def discriminator(x):
       x_data=x_data)
   ```
   """
-  posterior_trace = trace(variational, *args, **kwargs)
-  # TODO
-  with Trace() as model_trace:
-    x_fake = call_function_up_to_args(model, *args, **kwargs)
+  q_trace = call_with_trace(variational, *args, **kwargs)
+  model = make_optional_inputs(model)
+  x_fake = model(*args, **kwargs)
 
   key = align_data(x_fake.name.split(':')[0])
   if isinstance(key, int):
@@ -88,11 +88,11 @@ def discriminator(x):
   elif kwargs.get(key, None) is not None:
     x_true = kwargs.get(key)
 
-  for name, node in six.iteritems(model_trace):
-    aligned = align_latent(name)
+  for rv in toposort(x_fake):
+    aligned = align_latent(rv.name)
     if aligned is not None:
-      z_true = node.value
-      z_fake = posterior_trace[aligned].value
+      z_true = rv
+      z_fake = q_trace[aligned]
       break
 
   with tf.variable_scope("Disc"):
diff --git a/edward/inferences/gan_inference.py b/edward/inferences/gan_inference.py
index 9236cba8f..50a6c410a 100644
--- a/edward/inferences/gan_inference.py
+++ b/edward/inferences/gan_inference.py
@@ -6,7 +6,7 @@
 import tensorflow as tf
 
 from edward.inferences import docstrings as doc
-from edward.inferences.util import call_function_up_to_args
+from edward.inferences.util import make_optional_inputs
 
 
 @doc.set_doc(
@@ -59,7 +59,8 @@ def discriminator(x):
       x_data=x_data)
   ```
   """
-  x_fake = call_function_up_to_args(model, *args, **kwargs)
+  model = make_optional_inputs(model)
+  x_fake = model(*args, **kwargs)
   key = align_data(x_fake.name.split(':')[0])
   if isinstance(key, int):
     x_true = args[key]
diff --git a/edward/inferences/hmc.py b/edward/inferences/hmc.py
index 4f0e760b7..9dba22669 100644
--- a/edward/inferences/hmc.py
+++ b/edward/inferences/hmc.py
@@ -6,8 +6,7 @@
 import tensorflow as tf
 
 from edward.inferences import docstrings as doc
-from edward.inferences.util import make_intercept
-from edward.models.core import Node, trace
+from edward.inferences.util import make_log_joint
 
 tfp = tf.contrib.bayesflow
 
@@ -70,60 +69,42 @@ def hmc(model,
   def model():
     mu = Normal(loc=0.0, scale=1.0, name="mu")
     x = Normal(loc=mu, scale=1.0, sample_shape=10, name="x")
+    return x
   ```
   In graph mode, build `tf.Variable`s which are updated via the Markov
   chain. The update op is fetched at runtime over many iterations.
   ```python
   qmu = tf.get_variable("qmu", initializer=1.)
-  new_state, _, _ = ed.hmc(
+  next_state, _, _ = ed.hmc(
       model,
       ...,
       current_state=qmu,
       align_latent=lambda name: "qmu" if name == "mu" else None,
       align_data=lambda name: "x_data" if name == "x" else None,
       x_data=x_data)
-  qmu_update = qmu.assign(new_state)
+  qmu_update = qmu.assign(next_state)
   ```
   In eager mode, call the function at runtime, updating its inputs
-  such as `state`.
+  such as `current_state`.
   ```python
   qmu = 1.
-  new_log_prob = None
-  new_gradients = None
+  next_log_prob = None
+  next_gradients = None
   for _ in range(1000):
-    new_state, new_log_prob, new_gradients = ed.hmc(
+    next_state, next_log_prob, next_gradients = ed.hmc(
         model,
         ...,
         current_state=qmu,
         align_latent=lambda name: "qmu" if name == "mu" else None,
         align_data=lambda name: "x_data" if name == "x" else None,
-        current_target_log_prob=new_log_prob,
-        current_grads_target_log_prob=new_gradients,
+        current_target_log_prob=next_log_prob,
+        current_grads_target_log_prob=next_gradients,
         x_data=x_data)
-    qmu = new_state
+    qmu = next_state
   ```
   """
-  def _target_log_prob_fn(*fargs):
-    """Target's unnormalized log-joint density as a function of states."""
-    posterior_trace = {state.name.split(':')[0]: Node(arg)
-                       for state, arg in zip(states, fargs)}
-    intercept = make_intercept(
-        posterior_trace, align_data, align_latent, args, kwargs)
-    model_trace = trace(model, intercept=intercept, *args, **kwargs)
-
-    p_log_prob = 0.0
-    for name, node in six.iteritems(model_trace):
-      if align_latent(name) is not None or align_data(name) is not None:
-        rv = node.value
-        p_log_prob += tf.reduce_sum(rv.log_prob(rv.value))
-    return p_log_prob
-
-  is_list_like = lambda x: isinstance(x, (tuple, list))
-  maybe_list = lambda x: list(x) if is_list_like(x) else [x]
-  states = maybe_list(current_state)
-
   out = tfp.hmc.kernel(
-      target_log_prob_fn=_target_log_prob_fn,
+      target_log_prob_fn=make_log_joint(model, current_state),
       current_state=current_state,
       step_size=step_size,
       num_leapfrog_steps=num_leapfrog_steps,
diff --git a/edward/inferences/klpq.py b/edward/inferences/klpq.py
index 0072b6dd1..b1924398a 100644
--- a/edward/inferences/klpq.py
+++ b/edward/inferences/klpq.py
@@ -6,8 +6,8 @@
 import tensorflow as tf
 
 from edward.inferences import docstrings as doc
-from edward.inferences.util import make_intercept
-from edward.models.core import trace
+from edward.inferences.util import (
+    call_with_intercept, call_with_trace, toposort)
 
 try:
   from edward.models import Normal
@@ -82,11 +82,13 @@ def klpq(model, variational, align_latent, align_data,
   def model():
     mu = Normal(loc=0.0, scale=1.0, name="mu")
     x = Normal(loc=mu, scale=1.0, sample_shape=10, name="x")
+    return x
 
   def variational():
     qmu = Normal(loc=tf.get_variable("loc", []),
                  scale=tf.nn.softplus(tf.get_variable("shape", [])),
                  name="qmu")
+    return qmu
 
   loss, surrogate_loss = ed.klpq(
       model, variational,
@@ -98,19 +100,16 @@ def variational():
   p_log_prob = [0.0] * n_samples
   q_log_prob = [0.0] * n_samples
   for s in range(n_samples):
-    posterior_trace = trace(variational, *args, **kwargs)
-    intercept = make_intercept(
-        posterior_trace, align_data, align_latent, args, kwargs)
-    model_trace = trace(model, intercept=intercept, *args, **kwargs)
-
-    for name, node in six.iteritems(model_trace):
-      rv = node.value
-      scale_factor = scale(name)
-      if align_latent(name) is not None or align_data(name) is not None:
+    q_trace = call_with_trace(variational, *args, **kwargs)
+    x = call_with_intercept(model, q_trace, align_data, align_latent,
+                            *args, **kwargs)
+    for rv in toposort(x):
+      scale_factor = scale(rv.name)
+      if align_latent(rv.name) is not None or align_data(rv.name) is not None:
         p_log_prob[s] += tf.reduce_sum(
             scale_factor * rv.log_prob(tf.stop_gradient(rv.value)))
-      if align_latent(name) is not None:
-        qz = posterior_trace[align_latent(name)].value
+      if align_latent(rv.name) is not None:
+        qz = q_trace[align_latent(rv.name)]
         q_log_prob[s] += tf.reduce_sum(
             scale_factor * qz.log_prob(tf.stop_gradient(qz.value)))
 
diff --git a/edward/inferences/klqp.py b/edward/inferences/klqp.py
index 5d9de07f4..4610eeac2 100644
--- a/edward/inferences/klqp.py
+++ b/edward/inferences/klqp.py
@@ -6,8 +6,8 @@
 import tensorflow as tf
 
 from edward.inferences import docstrings as doc
-from edward.inferences.util import make_intercept
-from edward.models.core import trace
+from edward.inferences.util import (
+    call_with_intercept, call_with_trace, toposort)
 
 try:
   from edward.models import Normal
@@ -89,11 +89,13 @@ def klqp(model, variational, align_latent, align_data,
   def model():
     mu = Normal(loc=0.0, scale=1.0, name="mu")
     x = Normal(loc=mu, scale=1.0, sample_shape=10, name="x")
+    return x
 
   def variational():
     qmu = Normal(loc=tf.get_variable("loc", []),
                  scale=tf.nn.softplus(tf.get_variable("shape", [])),
                  name="qmu")
+    return qmu
 
   loss, surrogate_loss = ed.klqp(
       model, variational,
@@ -114,28 +116,26 @@ def variational():
   surrogate_loss = [None] * n_samples
   kl_penalty = 0.0
   for s in range(n_samples):
-    posterior_trace = trace(variational, *args, **kwargs)
-    intercept = make_intercept(
-        posterior_trace, align_data, align_latent, args, kwargs)
-    model_trace = trace(model, intercept=intercept, *args, **kwargs)
+    q_trace = call_with_trace(variational, *args, **kwargs)
+    x = call_with_intercept(model, q_trace, align_data, align_latent,
+                            *args, **kwargs)
 
     # Collect key-value pairs of (rv, rv's (scaled) log prob).
     p_dict = {}
     q_dict = {}
     inverse_align_latent = {}
-    for name, node in six.iteritems(model_trace):
-      rv = node.value
-      scale_factor = scale(name)
-      if align_data(name) is not None:
+    for rv in toposort(x):
+      scale_factor = scale(rv.name)
+      if align_data(rv.name) is not None:
         p_dict[rv] = tf.reduce_sum(scale_factor * rv.log_prob(rv.value))
-      if align_latent(name) is not None:
-        qz = posterior_trace[align_latent(name)].value
+      if align_latent(rv.name) is not None:
+        qz = q_trace[align_latent(rv.name)]
         # For pairs with analytic KL, accumulate KL divergences for
         # first iteration in loop.
         if isinstance(rv, Normal) and isinstance(qz, Normal):
           if s == 0:
             kl_penalty += tf.reduce_sum(
-                kl_scaling(name) * kl_divergence(qz, rv))
+                kl_scaling(rv.name) * kl_divergence(qz, rv))
         else:
           p_dict[rv] = tf.reduce_sum(scale_factor * rv.log_prob(rv.value))
           q_dict[qz] = tf.reduce_sum(scale_factor * qz.log_prob(qz.value))
@@ -234,11 +234,13 @@ def klqp_reparameterization(model, variational, align_latent, align_data,
   def model():
     mu = Normal(loc=0.0, scale=1.0, name="mu")
     x = Normal(loc=mu, scale=1.0, sample_shape=10, name="x")
+    return x
 
   def variational():
     qmu = Normal(loc=tf.get_variable("loc", []),
                  scale=tf.nn.softplus(tf.get_variable("shape", [])),
                  name="qmu")
+    return qmu
 
   loss = ed.klqp_reparameterization(
       model, variational,
@@ -250,17 +252,15 @@ def variational():
   p_log_prob = [0.0] * n_samples
   q_log_prob = [0.0] * n_samples
   for s in range(n_samples):
-    posterior_trace = trace(variational, *args, **kwargs)
-    intercept = make_intercept(
-        posterior_trace, align_data, align_latent, args, kwargs)
-    model_trace = trace(model, intercept=intercept, *args, **kwargs)
-    for name, node in six.iteritems(model_trace):
-      rv = node.value
-      scale_factor = scale(name)
-      if align_latent(name) is not None or align_data(name) is not None:
+    q_trace = call_with_trace(variational, *args, **kwargs)
+    x = call_with_intercept(model, q_trace, align_data, align_latent,
+                            *args, **kwargs)
+    for rv in toposort(x):
+      scale_factor = scale(rv.name)
+      if align_latent(rv.name) is not None or align_data(rv.name) is not None:
         p_log_prob[s] += tf.reduce_sum(scale_factor * rv.log_prob(rv.value))
-      if align_latent(name) is not None:
-        qz = posterior_trace[align_latent(name)].value
+      if align_latent(rv.name) is not None:
+        qz = q_trace[align_latent(rv.name)]
         q_log_prob[s] += tf.reduce_sum(scale_factor * qz.log_prob(qz.value))
 
   p_log_prob = tf.reduce_mean(p_log_prob)
@@ -338,11 +338,13 @@ def klqp_reparameterization_kl(model, variational, align_latent, align_data,
   def model():
     mu = Normal(loc=0.0, scale=1.0, name="mu")
     x = Normal(loc=mu, scale=1.0, sample_shape=10, name="x")
+    return x
 
   def variational():
     qmu = Normal(loc=tf.get_variable("loc", []),
                  scale=tf.nn.softplus(tf.get_variable("shape", [])),
                  name="qmu")
+    return qmu
 
   loss = ed.klqp_reparameterization_kl(
       model, variational,
@@ -353,15 +355,12 @@ def variational():
   """
   p_log_lik = [0.0] * n_samples
   for s in range(n_samples):
-    posterior_trace = trace(variational, *args, **kwargs)
-    intercept = make_intercept(
-        posterior_trace, align_data, align_latent, args, kwargs)
-    model_trace = trace(model, intercept=intercept, *args, **kwargs)
-
-    for name, node in six.iteritems(model_trace):
-      if align_data(name) is not None:
-        rv = node.value
-        scale_factor = scale(name)
+    q_trace = call_with_trace(variational, *args, **kwargs)
+    x = call_with_intercept(model, q_trace, align_data, align_latent,
+                            *args, **kwargs)
+    for rv in toposort(x):
+      if align_data(rv.name) is not None:
+        scale_factor = scale(rv.name)
         p_log_lik[s] += tf.reduce_sum(scale_factor * rv.log_prob(rv.value))
 
   p_log_lik = tf.reduce_mean(p_log_lik)
@@ -442,11 +441,13 @@ def klqp_score(model, variational, align_latent, align_data,
   def model():
     mu = Normal(loc=0.0, scale=1.0, name="mu")
     x = Normal(loc=mu, scale=1.0, sample_shape=10, name="x")
+    return x
 
   def variational():
     qmu = Normal(loc=tf.get_variable("loc", []),
                  scale=tf.nn.softplus(tf.get_variable("shape", [])),
                  name="qmu")
+    return qmu
 
   loss, surrogate_loss = ed.klqp_score(
       model, variational,
@@ -458,18 +459,15 @@ def variational():
   p_log_prob = [0.0] * n_samples
   q_log_prob = [0.0] * n_samples
   for s in range(n_samples):
-    posterior_trace = trace(variational, *args, **kwargs)
-    intercept = make_intercept(
-        posterior_trace, align_data, align_latent, args, kwargs)
-    model_trace = trace(model, intercept=intercept, *args, **kwargs)
-
-    for name, node in six.iteritems(model_trace):
-      rv = node.value
-      scale_factor = scale(name)
-      if align_latent(name) is not None or align_data(name) is not None:
+    q_trace = call_with_trace(variational, *args, **kwargs)
+    x = call_with_intercept(model, q_trace, align_data, align_latent,
+                            *args, **kwargs)
+    for rv in toposort(x):
+      scale_factor = scale(rv.name)
+      if align_latent(rv.name) is not None or align_data(rv.name) is not None:
         p_log_prob[s] += tf.reduce_sum(scale_factor * rv.log_prob(rv.value))
-      if align_latent(name) is not None:
-        qz = posterior_trace[align_latent(name)].value
+      if align_latent(rv.name) is not None:
+        qz = q_trace[align_latent(rv.name)]
         q_log_prob[s] += tf.reduce_sum(
             scale_factor * qz.log_prob(tf.stop_gradient(qz.value)))
 
diff --git a/edward/inferences/klqp_implicit.py b/edward/inferences/klqp_implicit.py
index 7dbe1cb47..243b996ed 100644
--- a/edward/inferences/klqp_implicit.py
+++ b/edward/inferences/klqp_implicit.py
@@ -6,8 +6,8 @@
 import tensorflow as tf
 
 from edward.inferences import docstrings as doc
-from edward.inferences.util import make_intercept
-from edward.models.core import trace
+from edward.inferences.util import (
+    call_with_intercept, call_with_trace, toposort)
 
 
 @doc.set_doc(
@@ -140,6 +140,7 @@ def variational(x):
     qz = Normal(loc=net[:, :25],
                 scale=tf.nn.softplus(net[:, 25:]),
                 name="qz")
+    return qz
 
   def ratio_estimator(data, local_vars, global_vars):
     # concatenated input has shape (batch_size, 28*28 + 25)
@@ -163,12 +164,11 @@ def ratio_estimator(data, local_vars, global_vars):
   else:
     raise ValueError('Ratio loss not found:', ratio_loss)
 
-  posterior_trace = trace(variational, *args, **kwargs)
+  q_trace = call_with_trace(variational, *args, **kwargs)
   # Intercept model's global latent variables and set to posterior
   # samples (but not its locals).
-  global_intercept = make_intercept(
-      posterior_trace, align_data, align_latent_global, args, kwargs)
-  model_trace = trace(model, intercept=global_intercept, *args, **kwargs)
+  x = call_with_intercept(model, q_trace, align_data, align_latent_global,
+                          *args, **kwargs)
 
   # Collect tensors used in calculation of losses.
   pbeta_log_prob = 0.0
@@ -178,35 +178,35 @@ def ratio_estimator(data, local_vars, global_vars):
   qz_sample = {}
   x_psample = {}
   x_qsample = {}
-  for name, node in six.iteritems(model_trace):
+  for rv in toposort(x):
     # Calculate log p(beta') and log q(beta').
-    if align_latent_global(name) is not None:
-      pbeta = node.value
-      qbeta = posterior_trace[align_latent_global(name)].value
-      scale_factor = scale(name)
+    if align_latent_global(rv.name) is not None:
+      pbeta = rv
+      qbeta = q_trace[align_latent_global(rv.name)]
+      scale_factor = scale(rv.name)
       pbeta_log_prob += tf.reduce_sum(
           scale_factor * pbeta.log_prob(pbeta.value))
       qbeta_log_prob += tf.reduce_sum(
           scale_factor * qbeta.log_prob(qbeta.value))
-      qbeta_sample[name] = qbeta.value
+      qbeta_sample[rv.name] = qbeta.value
     else:
       # TODO This assumes implicit variables are tf.Tensors existing
       # on the Trace stack.
-      if align_latent(name) is not None:
-        pz = node.value
-        qz = posterior_trace[align_latent(Name)].value
-        pz_sample[name] = pz
-        qz_sample[name] = qz
+      if align_latent(rv.name) is not None:
+        pz = rv
+        qz = q_trace[align_latent(rv.name)]
+        pz_sample[rv.name] = pz
+        qz_sample[rv.name] = qz
       else:
-        key = align_data(name)
+        key = align_data(rv.name)
         if isinstance(key, int):
           data_node = args[key]
         elif kwargs.get(key, None) is not None:
           data_node = kwargs.get(key)
-        px = node.value
-        qx = data_node.value
-        x_psample[name] = px
-        x_qsample[name] = qx
+        px = rv
+        qx = data_node
+        x_psample[rv.name] = px
+        x_qsample[rv.name] = qx
 
   # Collect x' ~ p(x | z', beta') and x' ~ q(x).
   with tf.variable_scope("Disc"):
diff --git a/edward/inferences/laplace.py b/edward/inferences/laplace.py
index 6661ddd2a..d74241909 100644
--- a/edward/inferences/laplace.py
+++ b/edward/inferences/laplace.py
@@ -7,7 +7,7 @@
 
 from edward.inferences import docstrings as doc
 from edward.inferences.map import map
-from edward.models.core import trace
+from edward.inferences.util import call_with_trace
 from edward.models.queries import get_variables
 
 try:
@@ -67,12 +67,14 @@ def laplace(model, variational, align_latent, align_data,
   def model(X):
     w = Normal(loc=tf.zeros(D), scale=tf.ones(D), name="w")
     y = Normal(loc=tf.tensordot(X, w, [[1], [0]]), scale=tf.ones(N), name="y")
+    return y
 
   def variational():
     qw = MultivariateNormalTriL(
         loc=tf.Variable(tf.random_normal([D])),
         scale_tril=tf.Variable(tf.random_normal([D, D])),
         name="qw")
+    return qw
 
   loss = ed.laplace(
       model, variational,
@@ -95,11 +97,11 @@ def _finalize(loss, variational):
 
   Computes the Hessian at the mode.
   """
-  posterior_trace = trace(variational, *args, **kwargs)
+  q_trace = call_with_trace(variational, *args, **kwargs)
   hessians = tf.hessians(
-      loss, [node.value.loc for node in six.itervalues(posterior_trace)])
+      loss, [node.value.loc for node in six.itervalues(q_trace)])
   finalize_ops = []
-  for qz, hessian in zip(six.itervalues(posterior_trace), hessians):
+  for qz, hessian in zip(six.itervalues(q_trace), hessians):
     if isinstance(qz, (MultivariateNormalDiag, Normal)):
       scale_var = get_variables(qz.variance())[0]
       scale = 1.0 / tf.diag_part(hessian)
@@ -117,9 +119,9 @@ def _make_variational_pointmass(variational, *args, **kwargs):
 
   We assume all latent variables are traceable in one execution.
   """
-  posterior_trace = trace(variational, *args, **kwargs)
+  q_trace = call_with_trace(variational, *args, **kwargs)
   def variational_pointmass(*args, **kwargs):
-    for name, node in six.iteritems(posterior_trace):
+    for name, node in six.iteritems(q_trace):
       qz = node.value
       qz_pointmass = PointMass(params=qz.loc,
                                name=qz.name + "_pointmass",
diff --git a/edward/inferences/map.py b/edward/inferences/map.py
index 7b3b5f17b..3c9eb3995 100644
--- a/edward/inferences/map.py
+++ b/edward/inferences/map.py
@@ -6,8 +6,8 @@
 import tensorflow as tf
 
 from edward.inferences import docstrings as doc
-from edward.inferences.util import make_intercept
-from edward.models.core import trace
+from edward.inferences.util import (
+    call_with_intercept, call_with_trace, toposort)
 
 try:
   from tensorflow.contrib.distributions import bijectors
@@ -84,6 +84,7 @@ def variational():
                     name="qmu")
     qsigma = PointMass(params=tf.nn.softplus(tf.Variable(tf.zeros(K*D))),
                        name="qsigma")
+    return qpi, qmu, qsigma
 
   loss = ed.map(..., variational, ...)
   ```
@@ -98,16 +99,13 @@ def variational():
   performing MAP on the unconstrained space: in general, the MAP of
   the transform is not the transform of the MAP.
   """
-  posterior_trace = trace(variational, *args, **kwargs)
-  intercept = make_intercept(
-      posterior_trace, align_data, align_latent, args, kwargs)
-  model_trace = trace(model, intercept=intercept, *args, **kwargs)
-
+  q_trace = call_with_trace(variational, *args, **kwargs)
+  x = call_with_intercept(model, q_trace, align_data, align_latent,
+                          *args, **kwargs)
   p_log_prob = 0.0
-  for name, node in six.iteritems(model_trace):
-    if align_latent(name) is not None or align_data(name) is not None:
-      rv = node.value
-      scale_factor = scale(name)
+  for rv in toposort(x):
+    if align_latent(rv.name) is not None or align_data(rv.name) is not None:
+      scale_factor = scale(rv.name)
       p_log_prob += tf.reduce_sum(scale_factor * rv.log_prob(rv.value))
 
   reg_penalty = tf.reduce_sum(tf.losses.get_regularization_losses())
diff --git a/edward/inferences/metropolis_hastings.py b/edward/inferences/metropolis_hastings.py
index d2821fb13..939c1b60e 100644
--- a/edward/inferences/metropolis_hastings.py
+++ b/edward/inferences/metropolis_hastings.py
@@ -2,12 +2,15 @@
 from __future__ import division
 from __future__ import print_function
 
+import collections
 import six
 import tensorflow as tf
 
 from edward.inferences import docstrings as doc
-from edward.inferences.util import make_intercept
-from edward.models.core import Node, trace
+from edward.inferences.util import (
+    call_with_trace, make_optional_inputs, make_log_joint)
+from edward.models.core import call_with_manipulate
+from edward.models.random_variable import RandomVariable
 
 tfp = tf.contrib.bayesflow
 
@@ -70,58 +73,41 @@ def metropolis_hastings(model,
   def model():
     mu = Normal(loc=0.0, scale=1.0, name="mu")
     x = Normal(loc=mu, scale=1.0, sample_shape=10, name="x")
+    return x
 
   def proposal(mu):
     proposal_mu = Normal(loc=mu, scale=0.5, name="proposal/mu")
+    return proposal_mu
   ```
   In graph mode, build `tf.Variable`s which are updated via the Markov
   chain. The update op is fetched at runtime over many iterations.
   ```python
   qmu = tf.get_variable("qmu", initializer=1.)
-  new_state, _ = ed.metropolis_hastings(
+  next_state, _ = ed.metropolis_hastings(
       model, proposal,
       current_state=qmu,
       align_latent=lambda name: "qmu" if name == "mu" else None,
       align_data=lambda name: "x_data" if name == "x" else None,
       x_data=x_data)
-  qmu_update = qmu.assign(new_state)
+  qmu_update = qmu.assign(next_state)
   ```
   In eager mode, call the function at runtime, updating its inputs
   such as `state`.
   ```python
   qmu = 1.
-  new_log_prob = None
+  next_log_prob = None
   for _ in range(1000):
-    new_state, new_log_prob = ed.metropolis_hastings(
+    next_state, next_log_prob = ed.metropolis_hastings(
         model, proposal,
         current_state=qmu,
         align_latent=lambda name: "qmu" if name == "mu" else None,
         align_proposal=lambda name: "proposal/mu" if name == "mu" else None,
         align_data=lambda name: "x_data" if name == "x" else None,
-        current_target_log_prob=new_log_prob,
+        current_target_log_prob=next_log_prob,
         x_data=x_data)
-    qmu = new_state
+    qmu = next_state
   ```
   """
-  def _target_log_prob_fn(*fargs):
-    """Target's unnormalized log-joint density as a function of states."""
-    posterior_trace = {state.name.split(':')[0]: Node(arg)
-                       for state, arg in zip(states, fargs)}
-    intercept = make_intercept(
-        posterior_trace, align_data, align_latent, args, kwargs)
-    model_trace = trace(model, intercept=intercept, *args, **kwargs)
-
-    global inverse_align_latent
-    inverse_align_latent = {}
-    p_log_prob = 0.0
-    for name, node in six.iteritems(model_trace):
-      if align_latent(name) is not None or align_data(name) is not None:
-        if align_latent(name) is not None:
-          inverse_align_latent[align_latent(name)] = name
-        rv = node.value
-        p_log_prob += tf.reduce_sum(rv.log_prob(rv.value))
-    return p_log_prob
-
   def _proposal_fn(*fargs):
     """Takes inputted states and returns (proposed states, log Hastings ratio).
 
@@ -131,27 +117,24 @@ def _proposal_fn(*fargs):
     """
     global inverse_align_latent
     # Build g(new | old): new states are drawn given old states as input.
-    new_trace = trace(proposal, *fargs)
+    new_trace = call_with_trace(proposal, *fargs)
     new_states = []
     old_proposal_trace = {}
     for state, farg in zip(states, fargs):
       name = state.name.split(':')[0]
-      new_state = new_trace[align_proposal(inverse_align_latent[name])].value
-      new_state_name = new_state.name.split(':')[0]
-      old_proposal_trace[new_state_name] = Node(farg)
+      new_state = new_trace[align_proposal(inverse_align_latent[name])]
+      old_proposal_trace[new_state.name.split(':')[0]] = farg
       new_states.append(new_state)
-    align_latent = lambda name: name if name in old_proposal_trace else None
-    intercept = make_intercept(
-        old_proposal_trace, align_data, align_latent, args, kwargs)
     # Build g(old | new): `value`s set to old states; new states are input.
-    old_trace = trace(proposal, intercept=intercept, *new_states)
-    old_states = [old_trace[align_proposal(
-                    inverse_align_latent[state.name.split(':')[0]])].value
-                  for state in states]
+    old_trace = call_with_trace_and_intercept(
+        proposal,
+        old_proposal_trace,
+        lambda name: name if name in old_proposal_trace else None,
+        *new_states)
     old_states = []
     for state, farg in zip(states, fargs):
       name = state.name.split(':')[0]
-      old_state = old_trace[align_proposal(inverse_align_latent[name])].value
+      old_state = old_trace[align_proposal(inverse_align_latent[name])]
       old_states.append(old_state)
     # Compute log p(old | new) - log p(new | old).
     log_hastings_ratio = 0.0
@@ -160,13 +143,26 @@ def _proposal_fn(*fargs):
       log_hastings_ratio -= tf.reduce_sum(new_state.log_prob(new_state.value))
     return new_states, log_hastings_ratio
 
-  is_list_like = lambda x: isinstance(x, (tuple, list))
-  maybe_list = lambda x: list(x) if is_list_like(x) else [x]
+  maybe_list = lambda x: list(x) if isinstance(x, (tuple, list)) else [x]
   states = maybe_list(current_state)
-
   out = tfp.metropolis_hastings.kernel(
-      target_log_prob_fn=_target_log_prob_fn,
+      target_log_prob_fn=make_log_joint(model, current_state),
       proposal_fn=_proposal_fn,
       current_state=current_state,
       current_target_log_prob=current_target_log_prob)
   return out
+
+
+def call_with_trace_and_intercept(f, trace, align_latent, *args, **kwargs):
+  """Calls function and both writes to a stack and intercepts sample value."""
+  def manipulate(cls_init, self, *fargs, **fkwargs):
+    name = fkwargs.get('name', None)
+    key = align_latent(name)
+    if trace.get(key, None) is not None:
+      fkwargs['value'] = tf.convert_to_tensor(trace[key])
+    cls_init(self, *fargs, **fkwargs)
+    stack[name] = self
+  stack = collections.OrderedDict({})
+  f = make_optional_inputs(f)
+  call_with_manipulate(f, manipulate, *args, **kwargs)
+  return stack
diff --git a/edward/inferences/sghmc.py b/edward/inferences/sghmc.py
index bb1ca8df4..801ab054e 100644
--- a/edward/inferences/sghmc.py
+++ b/edward/inferences/sghmc.py
@@ -6,8 +6,7 @@
 import tensorflow as tf
 
 from edward.inferences import docstrings as doc
-from edward.inferences.util import make_intercept
-from edward.models.core import Node, trace
+from edward.inferences.util import make_log_joint
 
 
 @doc.set_doc(
@@ -89,6 +88,7 @@ def sghmc(model,
   def model():
     mu = Normal(loc=0.0, scale=1.0, name="mu")
     x = Normal(loc=mu, scale=1.0, sample_shape=10, name="x")
+    return x
   ```
   In graph mode, build `tf.Variable`s which are updated via the Markov
   chain. The update op is fetched at runtime over many iterations.
@@ -96,7 +96,7 @@ def model():
   qmu = tf.get_variable("qmu", initializer=1.)
   qmu_mom = tf.get_variable("qmu_mom", initializer=0.)
   qmu_mom_state = tf.get_variable("qmu_mom_state", initializer=0.)
-  new_state, new_momentum, new_momentum_state = ed.sghmc(
+  next_state, next_momentum, next_momentum_state = ed.sghmc(
       model,
       ...,
       current_state=qmu,
@@ -105,18 +105,18 @@ def model():
       align_latent=lambda name: "qmu" if name == "mu" else None,
       align_data=lambda name: "x_data" if name == "x" else None,
       x_data=x_data)
-  qmu_update = qmu.assign(new_state)
-  qmu_mom_update = qmu_mom.assign(new_momentum)
-  qmu_mom_state_update = qmu_mom_state.assign(new_momentum_state)
+  qmu_update = qmu.assign(next_state)
+  qmu_mom_update = qmu_mom.assign(next_momentum)
+  qmu_mom_state_update = qmu_mom_state.assign(next_momentum_state)
   ```
   In eager mode, call the function at runtime, updating its inputs
-  such as `state`.
+  such as `current_state`.
   ```python
   qmu = 1.
   qmu_mom = None
   qmu_mom_state = None
   for _ in range(1000):
-    new_state, new_momentum, new_momentum_state = ed.sghmc(
+    next_state, next_momentum, next_momentum_state = ed.sghmc(
         model,
         ...,
         current_state=qmu,
@@ -125,32 +125,13 @@ def model():
         align_latent=lambda name: "qmu" if name == "mu" else None,
         align_data=lambda name: "x_data" if name == "x" else None,
         x_data=x_data)
-    qmu = new_state
-    qmu_mom = new_momentum
-    qmu_mom_state = new_momentum_state
+    qmu = next_state
+    qmu_mom = next_momentum
+    qmu_mom_state = next_momentum_state
   ```
   """
-  def _target_log_prob_fn(*fargs):
-    """Target's unnormalized log-joint density as a function of states."""
-    posterior_trace = {state.name.split(':')[0]: Node(arg)
-                       for state, arg in zip(states, fargs)}
-    intercept = make_intercept(
-        posterior_trace, align_data, align_latent, args, kwargs)
-    model_trace = trace(model, intercept=intercept, *args, **kwargs)
-
-    p_log_prob = 0.0
-    for name, node in six.iteritems(model_trace):
-      if align_latent(name) is not None or align_data(name) is not None:
-        rv = node.value
-        p_log_prob += tf.reduce_sum(rv.log_prob(rv.value))
-    return p_log_prob
-
-  is_list_like = lambda x: isinstance(x, (tuple, list))
-  maybe_list = lambda x: list(x) if is_list_like(x) else [x]
-  states = maybe_list(current_state)
-
   out = kernel(
-      target_log_prob_fn=_target_log_prob_fn,
+      target_log_prob_fn=make_log_joint(model, current_state),
       current_state=current_state,
       momentum=momentum,
       momentum_state=momentum_state,
diff --git a/edward/inferences/sgld.py b/edward/inferences/sgld.py
index c897aa443..5e7140a5f 100644
--- a/edward/inferences/sgld.py
+++ b/edward/inferences/sgld.py
@@ -6,8 +6,7 @@
 import tensorflow as tf
 
 from edward.inferences import docstrings as doc
-from edward.inferences.util import make_intercept
-from edward.models.core import Node, trace
+from edward.inferences.util import make_log_joint
 
 tfp = tf.contrib.bayesflow
 
@@ -77,13 +76,14 @@ def sgld(model,
   def model():
     mu = Normal(loc=0.0, scale=1.0, name="mu")
     x = Normal(loc=mu, scale=1.0, sample_shape=10, name="x")
+    return x
   ```
   In graph mode, build `tf.Variable`s which are updated via the Markov
   chain. The update op is fetched at runtime over many iterations.
   ```python
   qmu = tf.get_variable("qmu", initializer=1.)
   qmu_mom = tf.get_variable("qmu_mom", initializer=0.)
-  new_state, new_momentum = ed.sgld(
+  next_state, next_momentum = ed.sgld(
       model,
       ...,
       current_state=qmu,
@@ -91,16 +91,16 @@ def model():
       align_latent=lambda name: "qmu" if name == "mu" else None,
       align_data=lambda name: "x_data" if name == "x" else None,
       x_data=x_data)
-  qmu_update = qmu.assign(new_state)
-  qmu_mom_update = qmu_mom.assign(new_momentum)
+  qmu_update = qmu.assign(next_state)
+  qmu_mom_update = qmu_mom.assign(next_momentum)
   ```
   In eager mode, call the function at runtime, updating its inputs
-  such as `state`.
+  such as `current_state`.
   ```python
   qmu = 1.
   qmu_mom = None
   for _ in range(1000):
-    new_state, momentum = ed.sgld(
+    next_state, next_momentum = ed.sgld(
         model,
         ...,
         current_state=qmu,
@@ -108,32 +108,13 @@ def model():
         align_latent=lambda name: "qmu" if name == "mu" else None,
         align_data=lambda name: "x_data" if name == "x" else None,
         x_data=x_data)
-    qmu = new_state
-    qmu_mom = new_momentum
+    qmu = next_state
+    qmu_mom = next_momentum
   ```
   """
-  def _target_log_prob_fn(*fargs):
-    """Target's unnormalized log-joint density as a function of states."""
-    posterior_trace = {state.name.split(':')[0]: Node(arg)
-                       for state, arg in zip(states, fargs)}
-    intercept = make_intercept(
-        posterior_trace, align_data, align_latent, args, kwargs)
-    model_trace = trace(model, intercept=intercept, *args, **kwargs)
-
-    p_log_prob = 0.0
-    for name, node in six.iteritems(model_trace):
-      if align_latent(name) is not None or align_data(name) is not None:
-        rv = node.value
-        p_log_prob += tf.reduce_sum(rv.log_prob(rv.value))
-    return p_log_prob
-
-  is_list_like = lambda x: isinstance(x, (tuple, list))
-  maybe_list = lambda x: list(x) if is_list_like(x) else [x]
-  states = maybe_list(state)
-
   out = tfp.sgld.kernel(
-      target_log_prob_fn=_target_log_prob_fn,
-      current_state=state,
+      target_log_prob_fn=make_log_joint(model, current_state),
+      current_state=current_state,
       momentum=momentum,
       learning_rate=learning_rate,
       preconditioner_decay_rate=preconditioner_decay_rate,
diff --git a/edward/inferences/util.py b/edward/inferences/util.py
index 02e426059..a92e7f7a3 100644
--- a/edward/inferences/util.py
+++ b/edward/inferences/util.py
@@ -2,36 +2,34 @@
 from __future__ import division
 from __future__ import print_function
 
+import collections
 import inspect
+import operator
 import six
 import tensorflow as tf
 
-from edward.models.random_variable import RandomVariable
+from edward.models.core import call_with_manipulate
 from edward.models.core import TransformedDistribution
+from edward.models.random_variable import RandomVariable
 
 tfb = tf.contrib.distributions.bijectors
 
 
-def call_function_up_to_args(f, *args, **kwargs):
-  """Call f, removing any args/kwargs it doesn't take as input."""
-  if hasattr(f, "_func"):  # tf.make_template()
-    argspec = inspect.getargspec(f._func)
-  else:
-    argspec = inspect.getargspec(f)
-  fkwargs = {}
-  for k, v in six.iteritems(kwargs):
-    if k in argspec.args:
-      fkwargs[k] = v
-  num_args = len(argspec.args) - len(fkwargs)
-  if num_args > 0:
-    return f(*args[:num_args], **fkwargs)
-  elif len(fkwargs) > 0:
-    return f(**fkwargs)
-  return f()
-
-
-def make_intercept(trace, align_data, align_latent, args, kwargs):
-  def _intercept(f, *fargs, **fkwargs):
+def call_with_trace(f, *args, **kwargs):
+  """Calls function and writes to a stack to expose its execution trace."""
+  def manipulate(cls_init, self, *fargs, **fkwargs):
+    cls_init(self, *fargs, **fkwargs)
+    stack[self.name] = self
+  stack = collections.OrderedDict({})
+  f = make_optional_inputs(f)
+  call_with_manipulate(f, manipulate, *args, **kwargs)
+  return stack
+
+
+def call_with_intercept(f, trace, align_data, align_latent,
+                        *args, **kwargs):
+  """Calls function and intercepts its primitive ops' sample values."""
+  def manipulate(f, *fargs, **fkwargs):
     """Set model's sample values to variational distribution's and data."""
     name = fkwargs.get('name', None)
     key = align_data(name)
@@ -40,19 +38,91 @@ def _intercept(f, *fargs, **fkwargs):
     elif kwargs.get(key, None) is not None:
       fkwargs['value'] = kwargs.get(key)
     elif align_latent(name) is not None:
-      qz = trace[align_latent(name)].value
-      if isinstance(qz, RandomVariable):
-        value = qz.value
-      else:  # e.g. replacement is Tensor
-        value = tf.convert_to_tensor(qz)
-      fkwargs['value'] = value
+      fkwargs['value'] = tf.convert_to_tensor(trace[align_latent(name)])
     # if auto_transform and 'qz' in locals():
     #   # TODO for generation to work, must output original dist. to
     #   keep around TD? must maintain another stack to write to as a
     #   side-effect (or augment the original stack).
     #   return transform(f, qz, *fargs, **fkwargs)
     return f(*fargs, **fkwargs)
-  return _intercept
+  f = make_optional_inputs(f)
+  return call_with_manipulate(f, manipulate, *args, **kwargs)
+
+
+def make_log_joint(model, states):
+  """Factory to make a log-joint probability function.
+
+  It takes a model and transition states as input. It returns its log-joint
+  probability as a function of the states. (This is applied in Markov chain
+  Monte Carlo algorithms.)
+  """
+  maybe_list = lambda x: list(x) if isinstance(x, (tuple, list)) else [x]
+  states = maybe_list(states)
+  def log_joint(*fargs):
+    """Target's unnormalized log-joint density as a function of states."""
+    q_trace = {state.name.split(':')[0]: arg
+               for state, arg in zip(states, fargs)}
+    x = call_with_intercept(model, q_trace, align_data, align_latent,
+                            *args, **kwargs)
+    p_log_prob = 0.0
+    for rv in toposort(x):
+      if align_latent(rv.name) is not None or align_data(rv.name) is not None:
+        p_log_prob += tf.reduce_sum(rv.log_prob(rv.value))
+    return p_log_prob
+  return log_joint
+
+
+def make_optional_inputs(f):
+  """Wraps function to take in optional, unused args/kwargs."""
+  def f_wrapped(*args, **kwargs):
+    if hasattr(f, "_func"):  # tf.make_template()
+      argspec = inspect.getargspec(f._func)
+    else:
+      argspec = inspect.getargspec(f)
+    fkwargs = {}
+    for k, v in six.iteritems(kwargs):
+      if k in argspec.args:
+        fkwargs[k] = v
+    num_args = len(argspec.args) - len(fkwargs)
+    if num_args > 0:
+      return f(*args[:num_args], **fkwargs)
+    elif len(fkwargs) > 0:
+      return f(**fkwargs)
+    return f()
+  f_wrapped.__name__ = getattr(f, '__name__', '[unknown name]')
+  f_wrapped.__doc__ = getattr(f, '__doc__' , '')
+  return f_wrapped
+
+
+def toposort(end_node, parents=operator.methodcaller('get_parents')):
+  """Generate nodes in DAG's reverse topological order.
+
+  For any edge U -> V, the function visits V before visiting U. It traces
+  using a backward pass, i.e., the "pull" dataflow model.
+
+  Args:
+    end_node: Input or list of inputs.
+  """
+  child_counts = {}
+  maybe_list = lambda x: list(x) if isinstance(x, (list, tuple)) else [x]
+  stack = maybe_list(end_node)
+  while stack:
+    node = stack.pop()
+    if node in child_counts:
+      child_counts[node] += 1
+    else:
+      child_counts[node] = 1
+      stack.extend(parents(node))
+
+  childless_nodes = maybe_list(end_node)
+  while childless_nodes:
+    node = childless_nodes.pop()
+    yield node
+    for parent in parents(node):
+      if child_counts[parent] == 1:
+        childless_nodes.append(parent)
+      else:
+        child_counts[parent] -= 1
 
 
 def transform(f, qz, *args, **kwargs):
diff --git a/edward/inferences/wake_sleep.py b/edward/inferences/wake_sleep.py
index 22bac13c7..fad6e835a 100644
--- a/edward/inferences/wake_sleep.py
+++ b/edward/inferences/wake_sleep.py
@@ -6,8 +6,8 @@
 import tensorflow as tf
 
 from edward.inferences import docstrings as doc
-from edward.inferences.util import make_intercept
-from edward.models.core import trace
+from edward.inferences.util import (
+    call_with_intercept, call_with_trace, toposort)
 
 
 @doc.set_doc(
@@ -81,12 +81,14 @@ def model():
     net = tf.layers.dense(z, 512, activation=tf.nn.relu)
     net = tf.layers.dense(net, 28 * 28, activation=None)
     x = Normal(loc=net, scale=1.0, name="x")
+    return x
 
   def variational(x):
     net = tf.layers.dense(x, 25 * 2)
     qz = Normal(loc=net[:, :25],
                 scale=tf.nn.softplus(net[:, 25:]),
                 name="qz")
+    return qz
 
   loss_p, loss_q = ed.wake_sleep(
       model, variational,
@@ -98,40 +100,35 @@ def variational(x):
   p_log_prob = [0.0] * n_samples
   q_log_prob = [0.0] * n_samples
   for s in range(n_samples):
-    posterior_trace = trace(variational, *args, **kwargs)
-    intercept = make_intercept(
-        posterior_trace, align_data, align_latent, args, kwargs)
-    model_trace = trace(model, intercept=intercept, *args, **kwargs)
-
-    for name, node in six.iteritems(model_trace):
-      rv = node.value
-      scale_factor = scale(name)
-      if align_data(name) is not None or align_latent(name) is not None:
+    q_trace = call_with_trace(variational, *args, **kwargs)
+    x = call_with_intercept(model, q_trace, align_data, align_latent,
+                            *args, **kwargs)
+    for rv in toposort(x):
+      scale_factor = scale(rv.name)
+      if align_data(rv.name) is not None or align_latent(rv.name) is not None:
         p_log_prob[s] += tf.reduce_sum(scale_factor * rv.log_prob(rv.value))
-      if phase_q != 'sleep' and align_latent(name) is not None:
+      if phase_q != 'sleep' and align_latent(rv.name) is not None:
         # If not sleep phase, compute log q(z).
-        qz = posterior_trace[align_latent(name)].value
+        qz = q_trace[align_latent(rv.name)]
         q_log_prob[s] += tf.reduce_sum(
             scale_factor * qz.log_prob(tf.stop_gradient(qz.value)))
 
     if phase_q == 'sleep':
-      model_trace = trace(model, *args, **kwargs)
-      intercept = _make_sleep_intercept(
-          model_trace, align_data, align_latent, args, kwargs)
-      posterior_trace = trace(variational, intercept=intercept, *args, **kwargs)
-
+      p_trace = call_with_trace(model, *args, **kwargs)
+      qz = call_with_intercept(variational, p_trace,
+                               align_data=lambda name: None,
+                               align_latent=align_latent,
+                               *args, **kwargs)
       # Build dictionary to return scale factor for a posterior
       # variable via its corresponding prior. The implementation is
       # naive.
       scale_posterior = {}
-      for name, node in six.iteritems(model_trace):
-        rv = node.value
+      for name, rv in six.iteritems(p_trace):
         if align_latent(name) is not None:
-          qz = posterior_trace[align_latent(name)].value
+          qz = q_trace[align_latent(name)]
           scale_posterior[qz] = rv
 
-      for name, node in six.iteritems(posterior_trace):
-        rv = node.value
+      for rv in toposort(qz):
         scale_factor = scale_posterior[rv]
         q_log_prob[s] += tf.reduce_sum(
             scale_factor * rv.log_prob(tf.stop_gradient(rv.value)))
@@ -150,13 +147,3 @@ def variational(x):
   loss_p = -p_log_prob + reg_penalty
   loss_q = -q_log_prob + reg_penalty
   return loss_p, loss_q
-
-
-def _make_sleep_intercept(trace, align_data, align_latent, args, kwargs):
-  def _intercept(f, *fargs, **fkwargs):
-    """Set variational distribution's sample value to prior's."""
-    name = fkwargs.get('name', None)
-    z = trace[align_latent(name)].value
-    fkwargs['value'] = z.value
-    return f(*fargs, **fkwargs)
-  return _intercept
diff --git a/edward/inferences/wgan_inference.py b/edward/inferences/wgan_inference.py
index 8a86af146..dc5609c17 100644
--- a/edward/inferences/wgan_inference.py
+++ b/edward/inferences/wgan_inference.py
@@ -6,7 +6,7 @@
 import tensorflow as tf
 
 from edward.inferences import docstrings as doc
-from edward.inferences.util import call_function_up_to_args
+from edward.inferences.util import make_optional_inputs
 
 
 @doc.set_doc(
@@ -77,7 +77,8 @@ def discriminator(x):
       x_data=x_data)
   ```
   """
-  x_fake = call_function_up_to_args(model, *args, **kwargs)
+  model = make_optional_inputs(model)
+  x_fake = model(*args, **kwargs)
   key = align_data(x_fake.name.split(':')[0])
   if isinstance(key, int):
     x_true = args[key]
diff --git a/edward/models/__init__.py b/edward/models/__init__.py
index 6605275f3..fb39afb2c 100644
--- a/edward/models/__init__.py
+++ b/edward/models/__init__.py
@@ -13,6 +13,7 @@
 
 _allowed_symbols = [
     'RandomVariable',
+    'call_with_manipulate',
     'get_ancestors',
     'get_blanket',
     'get_children',
@@ -22,7 +23,6 @@
     'get_variables',
     'is_independent',
     'random_variables',
-    'trace',
 ]
 for name in dir(_module):
   obj = getattr(_module, name)
diff --git a/edward/models/core.py b/edward/models/core.py
index 9c7151ea3..91c893b70 100644
--- a/edward/models/core.py
+++ b/edward/models/core.py
@@ -2,92 +2,62 @@
 from __future__ import division
 from __future__ import print_function
 
-import collections as _collections
 import inspect as _inspect
 
 from edward.models.random_variable import RandomVariable as _RandomVariable
 from tensorflow.contrib import distributions as _distributions
 
+TRACE_STACK = [lambda f, *args, **kwargs: f(*args, **kwargs)]
 
-class Node(object):
-  """Node in execution trace. A trace's nodes form a directed acyclic graph."""
-  __slots__ = ['value', 'f', 'args', 'kwargs', 'parents']
 
-  def __init__(self, value, f, args, kwargs, parents):
-    self.value = value
-    self.f = f
-    self.args = args
-    self.kwargs = kwargs
-    self.parents = parents
-
-
-def primitive(cls_init):
-  """Wraps class __init__ for recording and intercepting."""
-  def __init__(self, *args, **kwargs):
-    global _INTERCEPT, _STORE_ARGS, _TRACE_STACK
-    if '_INTERCEPT' in globals() and callable(_INTERCEPT):
-      _INTERCEPT(cls_init, self, *args, **kwargs)
-    else:
-      cls_init(self, *args, **kwargs)
-    if '_STORE_ARGS' in globals() and '_TRACE_STACK' in globals():
-      if _STORE_ARGS:
-        parents = [v for v in list(args) + kwargs.values()
-                   if hasattr(v, "name") and v.name in _TRACE_STACK]
-        _TRACE_STACK[self.name] = Node(self, cls_init, args, kwargs, parents)
-      else:
-        _TRACE_STACK[self.name] = Node(self, None, None, None, None)
-  return __init__
-
-
-def trace(f, *args, **kwargs):
-  """Traces the function `f(*args, **kwargs)`.
+def call_with_manipulate(f, manipulate, *args, **kwargs):
+  """Calls function `f(*args, **kwargs)` with manipulation.
 
   Args:
-    f: Function to trace.
-    intercept: Function to intercept primitives. It takes a primitive
-      function `f`, inputs `args, kwargs`, and may return any value and/or
-      add side-effects. Default is `None`, equivalent to `f(*args, **kwargs)`.
-    store_args: Boolean for whether `Node`s store their inputs and parent
-      primitives. Default is `False`.
-    args, kwargs: (Possible) inputs to function.
+    f: Function to call.
+    manipulate: Function to intercept primitives. It takes each primitive
+      function `f`, inputs `args, kwargs`, and may return any value and/or add
+      side-effects.
+    args, kwargs: Inputs to function.
 
   Returns:
-    The execution trace of `f`, collecting any `primitive` operations that the
-    function executed. It is reified as a stack (`OrderedDict`), and each
-    executed primitive is a `Node` on the stack indexed by its string name.
+    The output of `f`. Any calls to `primitive` operations are replaced by
+    calls to `manipulate`.
 
   #### Examples
 
   ```python
   def f(x):
     y = Poisson(rate=x, name="y")
+    return y
 
-  def intercept(f, *args, **kwargs):
+  def manipulate(f, *args, **kwargs):
     if kwargs.get("name") == "y":
       kwargs["value"] = 42
     return f(*args, **kwargs)
 
-  trace_stack = ed.trace(f, 1.5, intercept=intercept)
-  print(trace_stack)
-  ## OrderedDict([('y', <edward.models.core.Node object at 0x118c1ce10>)])
-
-  rv = trace_stack["y"].value
+  y = ed.call_with_manipulate(f, manipulate, 1.5)
   with tf.Session() as sess:
-    assert sess.run(rv.value) == 42
+    assert sess.run(y.value) == 42
   ```
   """
-  # TODO move call_function_up_to_args
-  from edward.inferences.util import call_function_up_to_args
-  global _INTERCEPT, _STORE_ARGS, _TRACE_STACK
-  _INTERCEPT = kwargs.pop("intercept", None)
-  _STORE_ARGS = kwargs.pop("store_args", False)
-  _TRACE_STACK = _collections.OrderedDict({})
-  call_function_up_to_args(f, *args, **kwargs)
-  output = _TRACE_STACK
-  del _INTERCEPT, _STORE_ARGS, _TRACE_STACK
+  TRACE_STACK.append(manipulate)
+  try:
+    output = f(*args, **kwargs)
+  finally:
+    # Always uninstall `manipulate`, even if `f` raises, so a failed call
+    # cannot leave later primitives intercepted.
+    TRACE_STACK.pop()
   return output
 
 
+def primitive(cls_init):
+  """Wraps class __init__ for manipulating its continuation."""
+  def __init__(self, *args, **kwargs):
+    TRACE_STACK[-1](cls_init, self, *args, **kwargs)
+  return __init__
+
+
 # Automatically generate random variable classes from classes in
 # tf.contrib.distributions.
 _globals = globals()
diff --git a/edward/models/queries.py b/edward/models/queries.py
index 663532462..307ef0e00 100644
--- a/edward/models/queries.py
+++ b/edward/models/queries.py
@@ -216,13 +216,22 @@ def get_parents(x, collection=None):
     list of RandomVariable.
     Parent random variables of x.
 
+  #### Notes
+
+  We implement this using `tf.gradients`. This is potentially inefficient
+  compared to traversing the graph and stopping after reaching all ancestors
+  which are root nodes and/or rvs. Note that the user can also use
+  `tf.stop_gradient` to stop graph traversal of a node.
+
+  TODO how to extend to eager with its gradients function?
+
   #### Examples
 
   ```python
-  a = Normal(0.0, 1.0)
-  b = Normal(a, 1.0)
-  c = Normal(0.0, 1.0)
-  d = Normal(b * c, 1.0)
+  a = Normal(0.0, 1.0, name="a")
+  b = Normal(a, 1.0, name="b")
+  c = Normal(0.0, 1.0, name="c")
+  d = Normal(b * c, 1.0, name="d")
   assert set(ed.get_parents(d)) == set([b, c])
   ```
   """
@@ -252,6 +261,17 @@ def get_parents(x, collection=None):
       nodes.update(node.op.inputs)
 
   return list(output)
+  # TODO this gets ancestors
+  parents = []
+  if collection is None:
+    collection = random_variables()
+  if isinstance(x,
+                (tf.Variable, tf.SparseTensor, tf.Tensor, RandomVariable)):
+    for g, v in zip(tf.gradients(node, collection), collection):
+      if g is not None:
+        parents.append(v)
+  parents.remove(node)
+  return set(parents)
 
 
 def get_siblings(x, collection=None):
@@ -416,6 +436,3 @@ def is_independent(a, b, condition=None):
           schedule.append((child, "parent"))
 
   return True
-
-
-del random_variables
diff --git a/edward/models/random_variable.py b/edward/models/random_variable.py
index 3c2986ad0..4bcd4373e 100644
--- a/edward/models/random_variable.py
+++ b/edward/models/random_variable.py
@@ -5,6 +5,7 @@
 import tensorflow as tf
 
 from collections import defaultdict
+import six
 
 try:
   from tensorflow.python.client.session import \
@@ -237,7 +238,17 @@ def get_descendants(self, collection=None):
   def get_parents(self, collection=None):
     """Get parent random variables."""
     from edward.models.queries import get_parents
-    return get_parents(self, collection)
+    # The backward pass requires TF graph traversal. In general, consider
+    # primitive -> black box function (TF ops) -> primitive. To go to parent
+    # primitive, we traverse black box function.
+    parents = []
+    for node in six.itervalues(self.parameters):
+      if isinstance(node, RandomVariable):
+        # A random variable passed directly as a parameter is itself a parent.
+        parents.append(node)
+      elif isinstance(node, (tf.Variable, tf.SparseTensor, tf.Tensor)):
+        parents.extend(get_parents(node, collection))
+    return parents
 
   def get_siblings(self, collection=None):
     """Get sibling random variables."""
diff --git a/tests/models/trace_test.py b/tests/models/call_with_manipulate_test.py
similarity index 69%
rename from tests/models/trace_test.py
rename to tests/models/call_with_manipulate_test.py
index 13abd3826..2f53e381c 100644
--- a/tests/models/trace_test.py
+++ b/tests/models/call_with_manipulate_test.py
@@ -8,19 +8,16 @@
 from edward.models import Normal, Poisson
 
 
-class test_trace_class(tf.test.TestCase):
+class test_call_with_manipulate_class(tf.test.TestCase):
 
   def _test_intercept_value(self, RV, value, *args, **kwargs):
-    def _intercept(f, *args, **kwargs):
-      name = kwargs.get('name', None)
-      if name == "rv2":
-        rv1 = rv1_trace["rv1"].value
-        kwargs['value'] = rv1.value
-      return f(*args, **kwargs)
-    with ed.Trace() as rv1_trace:
-      rv1 = RV(*args, value=value, name="rv1", **kwargs)
-    with ed.Trace(intercept=_intercept) as rv2_trace:
-      rv2 = RV(*args, name="rv2", **kwargs)
+    def manipulate(f, *fargs, **fkwargs):
+      name = fkwargs.get('name', None)  # kwargs of the intercepted call
+      if name == "rv2":
+        fkwargs['value'] = rv1.value
+      return f(*fargs, **fkwargs)
+    rv1 = RV(*args, value=value, name="rv1", **kwargs)
+    rv2 = ed.call_with_manipulate(RV, manipulate, *args, name="rv2", **kwargs)
     value_shape1 = rv1.value.shape
     value_shape2 = rv2.value.shape
     self.assertEqual(value_shape1, value_shape2)

From 949eefdd6a1a5c5b1dc4dc04400b084d0f515469 Mon Sep 17 00:00:00 2001
From: Dustin Tran <dustinviettran@gmail.com>
Date: Wed, 24 Jan 2018 22:56:01 -0800
Subject: [PATCH 25/27] update examples/

---
 examples/bayesian_linear_regression.py   | 179 ++++++++++++++++-------
 examples/bayesian_logistic_regression.py |  77 +++++-----
 examples/beta_bernoulli.py               |  65 +++++---
 examples/normal_normal.py                |  51 ++++---
 examples/normal_sgld.py                  |  52 +++++--
 5 files changed, 283 insertions(+), 141 deletions(-)

diff --git a/examples/bayesian_linear_regression.py b/examples/bayesian_linear_regression.py
index c23390fc0..de1fb7909 100644
--- a/examples/bayesian_linear_regression.py
+++ b/examples/bayesian_linear_regression.py
@@ -30,78 +30,151 @@
 FLAGS = tf.flags.FLAGS
 
 
-def build_toy_dataset(N, noise_std=0.5):
-  X = np.concatenate([np.linspace(0, 2, num=N / 2),
-                      np.linspace(6, 8, num=N / 2)])
-  y = 2.0 * X + 10 * np.random.normal(0, noise_std, size=N)
-  X = X.reshape((N, 1))
-  return X, y
+def get_input_fn():
+  """Returns `input_fn` for train and eval."""
+  def build_toy_dataset(N, noise_std=0.5):
+    X = np.concatenate([np.linspace(0, 2, num=N // 2),
+                        np.linspace(6, 8, num=N // 2)])
+    y = 2.0 * X + 10 * np.random.normal(0, noise_std, size=N)
+    X = X.reshape((N, 1))
+    return X, y
+  features, labels = build_toy_dataset(FLAGS.N)  # dataset size from flags
+  def input_fn(params):
+    """A simple input_fn using the experimental input pipeline."""
+    batch_size = params["batch_size"]
+    # TODO
+    dataset = tf.data.TFRecordDataset(filename, buffer_size=None)
+    dataset = dataset.cache().repeat()
+    features, labels = dataset.make_one_shot_iterator().get_next()
+    return features, labels
+  return input_fn
+
+
+def model(X):
+  w = Normal(loc=tf.zeros(FLAGS.D), scale=tf.ones(FLAGS.D))
+  b = Normal(loc=tf.zeros(1), scale=tf.ones(1))
+  y = Normal(loc=tf.tensordot(X, w, [[1], [0]]) + b,
+             scale=tf.ones(FLAGS.N))
+  return y
+
+
+def model_fn(features, labels, mode, params):
+  """Model fn which runs on TPU.
+
+  Args:
+    features: [None, 784]
+    labels: [None, 10]
+    mode: tf.estimator.ModeKeys.*
+    params: dict of hyperparams.
+  """
+  qw = tf.get_variable("qw", [FLAGS.D])
+  qb = tf.get_variable("qb", [])
+  counter = tf.get_variable("counter", initializer=0.)
+  qw_mom = tf.get_variable("qw_mom", [FLAGS.D],
+                           initializer=tf.zeros_initializer())
+  qb_mom = tf.get_variable("qb_mom", [], initializer=tf.zeros_initializer())
+
+  new_states, new_counter, _, new_momentums = ed.sghmc(
+      model,
+      current_state=[qw, qb],
+      counter=counter,
+      momentums=[qw_mom, qb_mom],
+      learning_rate=1e-3,
+      align_latent=lambda name: {"w": "qw", "b": "qb"}.get(name),
+      align_data=lambda name: {"y": "y"}.get(name),
+      X=features,
+      y=labels)
+
+  if mode == tf.estimator.ModeKeys.PREDICT:
+    predicted_classes = tf.argmax(logits, 1)
+    predictions = {
+        "class_ids": predicted_classes[:, tf.newaxis],
+        "probabilities": tf.nn.softmax(logits),
+    }
+    return tf.estimator.EstimatorSpec(mode, loss=None, predictions=predictions)
+
+  predictions = tf.argmax(logits, 1)
+  accuracy = tf.metrics.accuracy(labels=labels, predictions=predictions)
+
+  n_accept = tf.get_variable("n_accept", initializer=0, trainable=False)
+  n_accept_over_t = n_accept / t
+
+  tf.summary.scalar("accuracy", accuracy[1])
+  tf.summary.scalar("n_accept", n_accept)
+
+  if mode == tf.estimator.ModeKeys.EVAL:
+    return tpu_estimator.TPUEstimatorSpec(
+        mode=mode,
+        loss=None,
+        eval_metrics={"accuracy": accuracy,
+                      "n_accept": n_accept,})
+
+  train_op = []
+  train_op.append(qw.assign(new_states[0]))
+  train_op.append(qb.assign(new_states[1]))
+  train_op.append(counter.assign(new_counter))
+  train_op.append(qw_mom.assign(new_momentums[0]))
+  train_op.append(qb_mom.assign(new_momentums[1]))
+  train_op = tf.group(*train_op)
+  return tpu_estimator.TPUEstimatorSpec(mode=mode, loss=None, train_op=train_op)
 
 
 def main(_):
-  ed.set_seed(42)
-
-  # DATA
-  X_train, y_train = build_toy_dataset(FLAGS.N)
-  X_test, y_test = build_toy_dataset(FLAGS.N)
-
-  # MODEL
-  X = tf.placeholder(tf.float32, [FLAGS.N, FLAGS.D])
-  w = Normal(loc=tf.zeros(FLAGS.D), scale=tf.ones(FLAGS.D))
-  b = Normal(loc=tf.zeros(1), scale=tf.ones(1))
-  y = Normal(loc=tf.tensordot(X, w, [[1], [0]]) + b, scale=tf.ones(N))
+  tf.set_random_seed(42)
 
-  # INFERENCE
-  qw = Empirical(params=tf.get_variable("qw/params", [FLAGS.T, FLAGS.D]))
-  qb = Empirical(params=tf.get_variable("qb/params", [FLAGS.T, 1]))
+  train_input_fn = get_input_fn()
+  eval_input_fn = get_input_fn()
 
-  inference = ed.SGHMC({w: qw, b: qb}, data={X: X_train, y: y_train})
-  inference.run(step_size=1e-3)
+  estimator = tf.Estimator(model_fn=model_fn)
+  estimator.train(input_fn=train_input_fn,
+                  max_steps=FLAGS.train_steps)
 
-  # CRITICISM
+  eval_result = estimator.evaluate(input_fn=eval_input_fn)
+  print('\nTest set accuracy: {accuracy:0.3f}\n'.format(**eval_result))
 
-  # Plot posterior samples.
-  sns.jointplot(qb.params.eval()[FLAGS.nburn:FLAGS.T:FLAGS.stride],
-                qw.params.eval()[FLAGS.nburn:FLAGS.T:FLAGS.stride])
-  plt.show()
+  # # Plot posterior samples.
+  # sns.jointplot(qb.params.eval()[FLAGS.nburn:FLAGS.T:FLAGS.stride],
+  #               qw.params.eval()[FLAGS.nburn:FLAGS.T:FLAGS.stride])
+  # plt.show()
 
-  # Posterior predictive checks.
-  y_post = ed.copy(y, {w: qw, b: qb})
-  # This is equivalent to
-  # y_post = Normal(loc=tf.tensordot(X, qw, [[1], [0]]) + qb, scale=tf.ones(N))
+  # # Posterior predictive checks.
+  # y_post = ed.copy(y, {w: qw, b: qb})
+  # # This is equivalent to
+  # # y_post = Normal(loc=tf.tensordot(X, qw, [[1], [0]]) + qb,
+  #                   scale=tf.ones(FLAGS.N))
 
-  print("Mean squared error on test data:")
-  print(ed.evaluate('mean_squared_error', data={X: X_test, y_post: y_test}))
+  # print("Mean squared error on test data:")
+  # print(ed.evaluate('mean_squared_error', data={X: X_test, y_post: y_test}))
 
-  print("Displaying prior predictive samples.")
-  n_prior_samples = 10
+  # print("Displaying prior predictive samples.")
+  # n_prior_samples = 10
 
-  w_prior = w.sample(n_prior_samples).eval()
-  b_prior = b.sample(n_prior_samples).eval()
+  # w_prior = w.sample(n_prior_samples).eval()
+  # b_prior = b.sample(n_prior_samples).eval()
 
-  plt.scatter(X_train, y_train)
+  # plt.scatter(X_train, y_train)
 
-  inputs = np.linspace(-1, 10, num=400)
-  for ns in range(n_prior_samples):
-      output = inputs * w_prior[ns] + b_prior[ns]
-      plt.plot(inputs, output)
+  # inputs = np.linspace(-1, 10, num=400)
+  # for ns in range(n_prior_samples):
+  #     output = inputs * w_prior[ns] + b_prior[ns]
+  #     plt.plot(inputs, output)
 
-  plt.show()
+  # plt.show()
 
-  print("Displaying posterior predictive samples.")
-  n_posterior_samples = 10
+  # print("Displaying posterior predictive samples.")
+  # n_posterior_samples = 10
 
-  w_post = qw.sample(n_posterior_samples).eval()
-  b_post = qb.sample(n_posterior_samples).eval()
+  # w_post = qw.sample(n_posterior_samples).eval()
+  # b_post = qb.sample(n_posterior_samples).eval()
 
-  plt.scatter(X_train, y_train)
+  # plt.scatter(X_train, y_train)
 
-  inputs = np.linspace(-1, 10, num=400)
-  for ns in range(n_posterior_samples):
-      output = inputs * w_post[ns] + b_post[ns]
-      plt.plot(inputs, output)
+  # inputs = np.linspace(-1, 10, num=400)
+  # for ns in range(n_posterior_samples):
+  #     output = inputs * w_post[ns] + b_post[ns]
+  #     plt.plot(inputs, output)
 
-  plt.show()
+  # plt.show()
 
 if __name__ == "__main__":
   tf.app.run()
diff --git a/examples/bayesian_logistic_regression.py b/examples/bayesian_logistic_regression.py
index a08c83c3a..fb56ce23f 100644
--- a/examples/bayesian_logistic_regression.py
+++ b/examples/bayesian_logistic_regression.py
@@ -11,7 +11,7 @@
 import numpy as np
 import tensorflow as tf
 
-from edward.models import Bernoulli, Normal, Empirical
+from edward.models import Bernoulli, Normal
 
 tf.flags.DEFINE_integer("N", default=40, help="Number of data points.")
 tf.flags.DEFINE_integer("D", default=1, help="Number of features.")
@@ -27,42 +27,53 @@ def build_toy_dataset(N, noise_std=0.1):
   y[y < 0.5] = 0
   y[y >= 0.5] = 1
   X = (X - 4.0) / 4.0
-  X = X.reshape((N, D))
+  X = X.reshape((N, D)).astype(np.float32)
+  y = y.astype(np.float32)
   return X, y
 
 
+def model(X):
+  w = Normal(loc=tf.zeros(FLAGS.D), scale=3.0 * tf.ones(FLAGS.D), name="w")
+  b = Normal(loc=tf.zeros([]), scale=3.0 * tf.ones([]), name="b")
+  y = Bernoulli(logits=tf.tensordot(X, w, [[1], [0]]) + b, name="y")
+  return y
+
+
 def main(_):
-  ed.set_seed(42)
+  tf.set_random_seed(42)
 
-  # DATA
   X_train, y_train = build_toy_dataset(FLAGS.N)
 
-  # MODEL
-  X = tf.placeholder(tf.float32, [FLAGS.N, FLAGS.D])
-  w = Normal(loc=tf.zeros(FLAGS.D), scale=3.0 * tf.ones(FLAGS.D))
-  b = Normal(loc=tf.zeros([]), scale=3.0 * tf.ones([]))
-  y = Bernoulli(logits=tf.tensordot(X, w, [[1], [0]]) + b)
+  qw = tf.get_variable("qw", [FLAGS.D])
+  qb = tf.get_variable("qb", [])
 
-  # INFERENCE
-  qw = Empirical(params=tf.get_variable("qw/params", [FLAGS.T, FLAGS.D]))
-  qb = Empirical(params=tf.get_variable("qb/params", [FLAGS.T]))
+  new_state, _, _ = ed.hmc(
+      model,
+      step_size=0.6,
+      current_state=[qw, qb],
+      align_latent=lambda name: {"w": "qw", "b": "qb"}.get(name),
+      align_data=lambda name: {"y": "y"}.get(name),
+      X=X_train,
+      y=y_train)
 
-  inference = ed.HMC({w: qw, b: qb}, data={X: X_train, y: y_train})
-  inference.initialize(n_print=10, step_size=0.6)
+  qw_update = qw.assign(new_state[0])
+  qb_update = qb.assign(new_state[1])
 
   # Alternatively, use variational inference.
-  # qw_loc = tf.get_variable("qw_loc", [FLAGS.D])
-  # qw_scale = tf.nn.softplus(tf.get_variable("qw_scale", [FLAGS.D]))
-  # qb_loc = tf.get_variable("qb_loc", []) + 10.0
-  # qb_scale = tf.nn.softplus(tf.get_variable("qb_scale", []))
-
-  # qw = Normal(loc=qw_loc, scale=qw_scale)
-  # qb = Normal(loc=qb_loc, scale=qb_scale)
-
-  # inference = ed.KLqp({w: qw, b: qb}, data={X: X_train, y: y_train})
-  # inference.initialize(n_print=10, n_iter=600)
-
-  tf.global_variables_initializer().run()
+  # def variational():
+  #   qw_loc = tf.get_variable("qw_loc", [FLAGS.D])
+  #   qw_scale = tf.nn.softplus(tf.get_variable("qw_scale", [FLAGS.D]))
+  #   qb_loc = tf.get_variable("qb_loc", []) + 10.0
+  #   qb_scale = tf.nn.softplus(tf.get_variable("qb_scale", []))
+  #   qw = Normal(loc=qw_loc, scale=qw_scale, name="qw")
+  #   qb = Normal(loc=qb_loc, scale=qb_scale, name="qb")
+  #   return qw, qb
+  #
+  # loss, surrogate_loss = ed.klqp(...)
+  # train_op = tf.train.AdamOptimizer().minimize(surrogate_loss)
+
+  sess = tf.Session()
+  sess.run(tf.global_variables_initializer())
 
   # Set up figure.
   fig = plt.figure(figsize=(8, 8), facecolor='white')
@@ -73,16 +84,14 @@ def main(_):
   # Build samples from inferred posterior.
   n_samples = 50
   inputs = np.linspace(-5, 3, num=400, dtype=np.float32).reshape((400, 1))
-  probs = tf.stack([tf.sigmoid(tf.tensordot(inputs, qw.sample(), [[1], [0]]) +
-                               qb.sample())
+  # TODO n_samples; will need to store and use last X posterior samples
+  probs = tf.stack([tf.sigmoid(tf.tensordot(inputs, qw, [[1], [0]]) + qb)
                     for _ in range(n_samples)])
 
-  for t in range(inference.n_iter):
-    info_dict = inference.update()
-    inference.print_progress(info_dict)
-
-    if t % inference.n_print == 0:
-      outputs = probs.eval()
+  for t in range(5000):
+    sess.run([qw_update, qb_update])
+    if t % 10 == 0:
+      outputs = sess.run(probs)
 
       # Plot data and functions
       plt.cla()
diff --git a/examples/beta_bernoulli.py b/examples/beta_bernoulli.py
index c3a674091..77b8b2e85 100644
--- a/examples/beta_bernoulli.py
+++ b/examples/beta_bernoulli.py
@@ -9,44 +9,65 @@
 import numpy as np
 import tensorflow as tf
 
-from edward.models import Bernoulli, Beta, Empirical
+from edward.models import Bernoulli, Beta
+
+
+def model():
+  p = Beta(1.0, 1.0, name="p")
+  x = Bernoulli(probs=p, sample_shape=10, name="x")
+  return x
+
+
+def proposal(p):
+  proposal_p = Beta(3.0, 9.0, name="proposal/p")
+  return proposal_p
 
 
 def main(_):
-  ed.set_seed(42)
+  tf.set_random_seed(42)
 
-  # DATA
   x_data = np.array([0, 1, 0, 0, 0, 0, 0, 0, 0, 1])
 
-  # MODEL
-  p = Beta(1.0, 1.0)
-  x = Bernoulli(probs=p, sample_shape=10)
+  qp = tf.get_variable("qp", initializer=0.5)
+  new_state, is_accepted, _, _ = ed.metropolis_hastings(
+      model, proposal,
+      current_state=qp,
+      align_latent=lambda name: {"p": "qp"}.get(name),
+      align_proposal=lambda name: {"p": "proposal/p"}.get(name),
+      align_data=lambda name: {"x": "x_data"}.get(name),
+      x_data=x_data)
+  qp_update = qp.assign(new_state)
 
-  # INFERENCE
-  qp = Empirical(params=tf.get_variable(
-      "qp/params", [1000], initializer=tf.constant_initializer(0.5)))
+  sess = tf.Session()
+  sess.run(tf.global_variables_initializer())
 
-  proposal_p = Beta(3.0, 9.0)
+  samples = []
+  num_accept = 0
+  for t in range(2500):
+    sample, accept = sess.run([qp_update, is_accepted])
+    samples.append(sample)
+    num_accept += float(accept)
+    if t % 100 == 0:  # t + 1 transitions have been run at this point
+      print("Step {}, Acceptance Rate {:.3}".format(t, num_accept / (t + 1)))
 
-  inference = ed.MetropolisHastings({p: qp}, {p: proposal_p}, data={x: x_data})
-  inference.run()
+  samples = samples[500:]
 
-  # CRITICISM
   # exact posterior has mean 0.25 and std 0.12
-  sess = ed.get_session()
-  mean, stddev = sess.run([qp.mean(), qp.stddev()])
+  mean = np.mean(samples)
+  stddev = np.std(samples)
   print("Inferred posterior mean:")
   print(mean)
   print("Inferred posterior stddev:")
   print(stddev)
 
-  x_post = ed.copy(x, {p: qp})
-  tx_rep, tx = ed.ppc(
-      lambda xs, zs: tf.reduce_mean(tf.cast(xs[x_post], tf.float32)),
-      data={x_post: x_data})
-  ed.ppc_stat_hist_plot(
-      tx[0], tx_rep, stat_name=r'$T \equiv$mean', bins=10)
-  plt.show()
+  # TODO
+  # x_post = ed.copy(x, {p: qp})
+  # tx_rep, tx = ed.ppc(
+  #     lambda xs, zs: tf.reduce_mean(tf.cast(xs[x_post], tf.float32)),
+  #     data={x_post: x_data})
+  # ed.ppc_stat_hist_plot(
+  #     tx[0], tx_rep, stat_name=r'$T \equiv$mean', bins=10)
+  # plt.show()
 
 if __name__ == "__main__":
   tf.app.run()
diff --git a/examples/normal_normal.py b/examples/normal_normal.py
index caad1ac31..4928a4138 100644
--- a/examples/normal_normal.py
+++ b/examples/normal_normal.py
@@ -8,38 +8,53 @@
 import numpy as np
 import tensorflow as tf
 
-from edward.models import Empirical, Normal
+from edward.models import Normal
+
+
+def model():
+  mu = Normal(loc=0.0, scale=1.0, name="mu")
+  x = Normal(loc=mu, scale=1.0, sample_shape=50, name="x")
+  return x
 
 
 def main(_):
-  ed.set_seed(42)
+  tf.set_random_seed(42)
 
-  # DATA
   x_data = np.array([0.0] * 50)
 
-  # MODEL: Normal-Normal with known variance
-  mu = Normal(loc=0.0, scale=1.0)
-  x = Normal(loc=mu, scale=1.0, sample_shape=50)
+  # analytic solution: N(loc=0.0, scale=\sqrt{1/51}=0.140)
+  qmu = tf.get_variable("qmu", [])
+  new_state, kernel_results = ed.hmc(
+      model,
+      step_size=0.2,
+      current_state=qmu,
+      align_latent=lambda name: {"mu" : "qmu"}.get(name),
+      align_data=lambda name: {"x": "x"}.get(name),
+      x=x_data)
 
-  # INFERENCE
-  qmu = Empirical(params=tf.get_variable("qmu/params", [1000],
-                                         initializer=tf.zeros_initializer()))
+  qmu_update = qmu.assign(new_state)
 
-  # analytic solution: N(loc=0.0, scale=\sqrt{1/51}=0.140)
-  inference = ed.HMC({mu: qmu}, data={x: x_data})
-  inference.run()
+  sess = tf.Session()
+  sess.run(tf.global_variables_initializer())
 
-  # CRITICISM
-  sess = ed.get_session()
-  mean, stddev = sess.run([qmu.mean(), qmu.stddev()])
+  samples = []
+  num_accept = 0
+  for t in range(2500):
+    sample, accept = sess.run([qmu_update, kernel_results.is_accepted])
+    samples.append(sample)
+    num_accept += float(accept)
+    if t % 100 == 0:  # t + 1 transitions have been run at this point
+      print("Step {}, Acceptance Rate {:.3}".format(t, num_accept / (t + 1)))
+
+  samples = samples[500:]
+
+  mean = np.mean(samples)
+  stddev = np.std(samples)
   print("Inferred posterior mean:")
   print(mean)
   print("Inferred posterior stddev:")
   print(stddev)
 
-  # Check convergence with visual diagnostics.
-  samples = sess.run(qmu.params)
-
   # Plot histogram.
   plt.hist(samples, bins='auto')
   plt.show()
diff --git a/examples/normal_sgld.py b/examples/normal_sgld.py
index 7ea554391..fb226acfa 100644
--- a/examples/normal_sgld.py
+++ b/examples/normal_sgld.py
@@ -6,28 +6,52 @@
 from __future__ import print_function
 
 import edward as ed
+import numpy as np
 import tensorflow as tf
 
-from edward.models import Empirical, MultivariateNormalTriL
+from edward.models import MultivariateNormalTriL
 
 
-def main(_):
-  ed.set_seed(42)
-
-  # MODEL
+def model():
   z = MultivariateNormalTriL(
       loc=tf.ones(2),
-      scale_tril=tf.cholesky(tf.constant([[1.0, 0.8], [0.8, 1.0]])))
+      scale_tril=tf.cholesky(tf.constant([[1.0, 0.8], [0.8, 1.0]])),
+      name="z")
+  return z
 
-  # INFERENCE
-  qz = Empirical(params=tf.get_variable("qz/params", [2000, 2]))
 
-  inference = ed.SGLD({z: qz})
-  inference.run(step_size=5.0)
-
-  # CRITICISM
-  sess = ed.get_session()
-  mean, stddev = sess.run([qz.mean(), qz.stddev()])
+def main(_):
+  tf.set_random_seed(42)
+
+  qz = tf.get_variable("qz", [2])
+  counter = tf.get_variable("counter", initializer=0.)
+  qz_mom = tf.get_variable("qz_mom", [2], initializer=tf.zeros_initializer())
+  # TODO what's up with the samples?
+  new_state, new_counter, new_momentum = ed.sgld(
+      model,
+      state=qz,
+      counter=counter,
+      momentum=qz_mom,
+      learning_rate=1e-3,
+      align_latent=lambda name: "qz" if name == "z" else None,
+      align_data=lambda name: None)
+  qz_update = qz.assign(new_state)
+  counter_update = counter.assign(new_counter)
+  qz_mom_update = qz_mom.assign(new_momentum)
+
+  sess = tf.Session()
+  sess.run(tf.global_variables_initializer())
+  samples = []
+  for t in range(2500):
+    sample, _, _ = sess.run([qz_update, counter_update, qz_mom_update])
+    samples.append(sample)
+    if t % 100 == 0:
+      print("Step {}".format(t))
+
+  samples = samples[500:]
+
+  mean = np.mean(samples, axis=0)  # per-dimension, like the old qz.mean()
+  stddev = np.std(samples, axis=0)  # per-dimension, like the old qz.stddev()
   print("Inferred posterior mean:")
   print(mean)
   print("Inferred posterior stddev:")

From 836eec279b95ad06aff1f07e885107f51d5f42fe Mon Sep 17 00:00:00 2001
From: Dustin Tran <dustinviettran@gmail.com>
Date: Wed, 7 Feb 2018 16:40:06 -0800
Subject: [PATCH 26/27] add for cgs

---
 examples/eager.py               | 59 +++++++++++++++++++++++++++++++++
 examples/normal_normal_eager.py | 54 ++++++++++++++++++++++++++++++
 2 files changed, 113 insertions(+)
 create mode 100644 examples/eager.py
 create mode 100644 examples/normal_normal_eager.py

diff --git a/examples/eager.py b/examples/eager.py
new file mode 100644
index 000000000..91c7a1ae5
--- /dev/null
+++ b/examples/eager.py
@@ -0,0 +1,59 @@
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import edward as ed
+import numpy as np
+import tensorflow as tf
+
+from edward.models import Gamma, Normal
+
+import tensorflow.contrib.eager as tfe
+tfe.enable_eager_execution()
+
+def model():
+  z = Normal(loc=0., scale=1., name='z')
+  x = Gamma(tf.nn.softplus(z), 1., sample_shape=1000, name='x')
+  return x
+
+def variational():
+  qz = Normal(loc=tf.get_variable("loc", shape=[]),
+              scale=tf.nn.softplus(tf.get_variable("scale", shape=[])), name='qz')
+  return qz
+
+variational = tf.make_template("variational", variational)
+
+x_data = np.random.gamma(5.2, 1.2, size=1000).astype(np.float32)
+
+optimizer = tf.train.AdamOptimizer(1e-2)
+
+# loss, surrogate_loss = ed.klqp(
+#     model,
+#     variational,
+#     align_latent=lambda name: {'z': 'qz'}.get(name),
+#     align_data=lambda name: {'x': 'x'}.get(name),
+#     x=x_data)
+# grads_and_vars = optimizer.compute_gradients(surrogate_loss)
+# train_op = optimizer.apply_gradients(grads_and_vars)
+
+# sess = tf.Session()
+# sess.run(tf.global_variables_initializer())
+# for _ in range(2000):
+#   sess.run(train_op)
+
+loss_fn = lambda *args: ed.klqp(
+    model,
+    variational,
+    lambda name: {'z': 'qz'}.get(name),
+    lambda name: {'x': 'x'}.get(name),
+    *args)[1]
+
+value_and_gradients_fn = tfe.implicit_value_and_gradients(loss_fn)
+
+for _ in range(100):
+  loss, gradients_and_variables = value_and_gradients_fn(x_data)
+  optimizer.apply_gradients(gradients_and_variables)
+
+qz = variational()
+print("Posterior mean: {}".format(qz.loc))
+print("Posterior stddev: {}".format(qz.scale))
diff --git a/examples/normal_normal_eager.py b/examples/normal_normal_eager.py
new file mode 100644
index 000000000..316d74ba6
--- /dev/null
+++ b/examples/normal_normal_eager.py
@@ -0,0 +1,54 @@
+"""Normal-normal model using variational inference (KLqp)."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import edward as ed
+import numpy as np
+import tensorflow as tf
+
+from edward.models import Normal
+
+
+def model():
+  """Normal-Normal with known variance."""
+  loc = Normal(loc=0.0, scale=1.0, name="loc")
+  x = Normal(loc=loc, scale=1.0, sample_shape=50, name="x")
+  return x
+
+
+def variational():
+  qloc = Normal(loc=tf.get_variable("loc", []),
+                scale=tf.nn.softplus(tf.get_variable("shape", [])),
+                name="qloc")
+  return qloc
+
+
+variational = tf.make_template("variational", variational)
+
+tf.set_random_seed(42)
+x_data = np.array([0.0] * 50)
+
+# analytic solution: N(loc=0.0, scale=\sqrt{1/51}=0.140)
+loss, surrogate_loss = ed.klqp(
+    model,
+    variational,
+    align_latent=lambda name: 'qloc' if name == 'loc' else None,
+    align_data=lambda name: 'x_data' if name == 'x' else None,
+    x_data=x_data)
+
+optimizer = tf.train.AdamOptimizer(1e-2)
+grads_and_vars = optimizer.compute_gradients(surrogate_loss)
+train_op = optimizer.apply_gradients(grads_and_vars)
+
+qloc = variational()
+sess = tf.Session()
+
+sess.run(tf.global_variables_initializer())
+for t in range(1, 5001):
+  loss_val, _ = sess.run([loss, train_op])
+  if t % 50 == 0:
+    mean, stddev = sess.run([qloc.mean(), qloc.stddev()])
+    print({"Loss": loss_val,
+           "Posterior mean": mean,
+           "Posterior stddev": stddev})

From d627b753984bb26e2a92dfae9df604c6015c0bd2 Mon Sep 17 00:00:00 2001
From: Dustin Tran <dustinviettran@gmail.com>
Date: Thu, 8 Feb 2018 11:32:45 -0800
Subject: [PATCH 27/27] version 2.0

---
 edward/version.py | 2 +-
 setup.py          | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/edward/version.py b/edward/version.py
index 8832a6a84..cab57cf58 100644
--- a/edward/version.py
+++ b/edward/version.py
@@ -1,2 +1,2 @@
-__version__ = '1.3.5'
+__version__ = '2.0.0'
 VERSION = __version__
diff --git a/setup.py b/setup.py
index 3878d1bcd..76e95542b 100644
--- a/setup.py
+++ b/setup.py
@@ -15,8 +15,8 @@
     install_requires=['numpy>=1.7',
                       'six>=1.10.0'],
     extras_require={
-        'tensorflow': ['tensorflow>=1.2.0rc0'],
-        'tensorflow with gpu': ['tensorflow-gpu>=1.2.0rc0'],
+        'tensorflow': ['tensorflow>=1.6.0'],
+        'tensorflow with gpu': ['tensorflow-gpu>=1.6.0'],
         'datasets': ['observations>=0.1.2'],
         'notebooks': ['jupyter>=1.0.0'],
         'visualization': ['matplotlib>=1.3',