diff --git a/hyperoptimization/__init__.py b/hyperoptimization/__init__.py
new file mode 100644
index 0000000..4eca69c
--- /dev/null
+++ b/hyperoptimization/__init__.py
@@ -0,0 +1,6 @@
+from .neural_models import FCDeConvNetSigOut, FCDeConvNet
+from .workers import TVAEWorker
+from .explore import print_best
+from .runs import local_sequential
+
+__all__ = ["FCDeConvNet", "FCDeConvNetSigOut", "TVAEWorker", "print_best", "local_sequential"]
diff --git a/hyperoptimization/explore.py b/hyperoptimization/explore.py
new file mode 100644
index 0000000..958aa16
--- /dev/null
+++ b/hyperoptimization/explore.py
@@ -0,0 +1,197 @@
+import hpbandster.core.result as hpres
+import hpbandster.visualization as hpvis
+import json
+import matplotlib.pyplot as plt
+from collections import Counter
+import numpy as np
+import os
+
+
+class ValidFreeEnergy:
+    """
+    Callable that extracts a free energy value from an hpbandster run.
+    Returns None for runs that did not produce the requested value (broken runs).
+    For the key "loss" (i.e. training), the negated loss is returned.
+    """
+
+    def __init__(self, key):
+        """
+        :param key: The key of the free energy to extract: either "loss" or
+        the name of an entry of the run's info (e.g. a validation value).
+        """
+        self.key = key
+
+    def __call__(self, run):
+        """
+        :param run: a run supporting item access; non-loss keys also need vars(run)
+        :return: the requested value, or None if the run does not provide it
+        """
+        if self.key == "loss":
+            loss = run[self.key]
+            # hpbandster minimizes, but we report -loglikelihood
+            return None if loss is None else -loss
+
+        # a missing or empty info marks a broken run
+        if "info" not in vars(run) or run["info"] is None:
+            return None
+        # a None entry marks a silently broken run; it is passed through as None
+        return run["info"][self.key]
+
+
+def result_and_runs(path):
+    """
+    Load a logged hpbandster result from disk.
+    :param path: directory of the results and config json files
+    :return: tuple of the result object and the list of all its runs
+    """
+    logged = hpres.logged_results_to_HBS_result(path)
+    return logged, logged.get_all_runs()
+
+
+def sorted_by_value(runs, key="loss"):
+    """
+    Drop invalid runs and order the remaining ones by their free energy, ascending.
+    :param runs: output of an hpbandster.core.result method
+    :param key: the key by which to sort the results
+    :return: a sorted list with only valid results
+    """
+    value_of = ValidFreeEnergy(key)
+    return sorted(filter(lambda run: value_of(run) is not None, runs), key=value_of)
+
+
+def print_best(path="", printable="loss", criterion="loss", show_config=True, top_n=10):
+    """
+    Print the top_n valid runs of a logged hpbandster result, best run first.
+    :param path: directory of the results and config json files
+    :param printable: value to print ("loss" or a key of the run's info)
+    :param criterion: value to sort by for the top N selection
+    :param show_config: Bool prints model config
+    :param top_n: number of models to show; fewer are shown if fewer valid runs exist
+    :return: None
+    """
+    result, all_runs = result_and_runs(path)
+    id2conf = result.get_id2config_mapping()
+
+    # sorted_by_value sorts ascending; reverse once so that index 0 is the best run
+    best_first = sorted_by_value(all_runs, key=criterion)[::-1]
+
+    print("Good confs as judged by {}: ".format(criterion))
+    for i, run in enumerate(best_first[: max(top_n, 0)]):
+        # the sign flip belongs to the printed value, not to the sorting criterion
+        if printable == "loss":
+            value = -run[printable]  # fix minus from Hpbandster minimization
+        else:
+            value = run["info"][printable]
+
+        # get config id
+        config_id = "".join([(str(id_).rjust(3, " ")) for id_ in run["config_id"]])
+
+        # print result
+        print(
+            "{}. with {}/free energy= {} |id ({})".format(
+                str(i + 1).rjust(2), printable, str(round(value, 6)).ljust(12), config_id
+            )
+        )
+
+        if show_config:
+            # show the config of the run printed above (not of the i-th worst run)
+            print(json.dumps(id2conf[run["config_id"]], indent=4))
+
+
+def print_error_configs(path, top_n_broken=10):
+    """
+    This function picks out and prints the hyperparameters that are most frequent
+    in configs that had an interrupted run (i.e. runs whose info is None).
+    Hyperparameters that are present in every run are not counted.
+    :param path: directory of the results and config json files
+    :param top_n_broken: number of hyperparameters to print
+    :return: None
+    """
+    result, all_runs = result_and_runs(path)
+    id2conf = result.get_id2config_mapping()
+
+    all_confs = [id2conf[run["config_id"]] for run in all_runs]
+    broken_confs = [id2conf[run["config_id"]] for run in all_runs if run["info"] is None]
+
+    all_hyperparamters_by_usage = Counter(
+        [key for conf in all_confs for key in conf["config"].keys()]
+    )
+    # hyperparameters present in every run cannot explain why only some runs broke
+    constantly_used = {
+        key
+        for key in all_hyperparamters_by_usage
+        if all_hyperparamters_by_usage[key] == len(all_runs)
+    }
+    broken_hyperparams_used = [
+        key for conf in broken_confs for key in conf["config"].keys() if key not in constantly_used
+    ]
+
+    # default=0 keeps max() from raising when there is nothing to report
+    max_len = max((len(key) for key in broken_hyperparams_used), default=0)
+
+    print("{} broken runs found".format(len(broken_confs)))
+    print("Top {} hyperparams by frequency:".format(top_n_broken))
+    ranking = Counter(broken_hyperparams_used).most_common(top_n_broken)
+    print("\n".join("{}: {}".format(count, key.ljust(max_len)) for key, count in ranking))
+
+
+def visualize(path):
+    """
+    This function visualizes the behaviour of an hpbandster run
+    :param path: directory of the results and config json files
+    :return: None
+    """
+    # get results
+    result, all_runs = result_and_runs(path)
+
+    # plot:
+
+    # losses by budget
+    hpvis.losses_over_time(all_runs)
+
+    # concurrent runs over time
+    hpvis.concurrent_runs_over_time(all_runs)
+
+    # finished runs over time
+    hpvis.finished_runs_over_time(all_runs)
+
+    # spearman rank correlation over budgets
+    hpvis.correlation_across_budgets(result)
+
+    # model based configs vs random search
+    hpvis.performance_histogram_model_vs_random(all_runs, result.get_id2config_mapping(), show=True)
+
+    plt.show()
+
+
+if __name__ == "__main__":
+    # results directory, resolved relative to the location of this file
+    path = os.path.abspath(
+        os.path.join(os.path.abspath(__file__), "../../dynamically_binarized_mnist/results_2")
+    )
+    print_error_configs(path)
+    print("\n")
+    print_best(
+        path,
+        printable="validation accuracy",
+        criterion="train accuracy",
+        show_config=False,
+        top_n=5,
+    )
+    print("\n")
+    print_best(
+        path,
+        printable="validation accuracy",
+        criterion="validation accuracy",
+        show_config=True,
+        top_n=5,
+    )
+    print("\n")
+    print_best(
+        path,
+        printable="test accuracy",
+        criterion="test accuracy",
+        show_config=False,
+        top_n=5,
+    )
+    visualize(path)
diff --git a/hyperoptimization/hyperoptimize_tvae_bars_test.py b/hyperoptimization/hyperoptimize_tvae_bars_test.py
new file mode 100644
index 0000000..7bc07ed
--- /dev/null
+++ b/hyperoptimization/hyperoptimize_tvae_bars_test.py
@@ -0,0 +1,52 @@
+from argparse import ArgumentParser as Parser
+
+from typing import Tuple
+
+from hyperoptimization.workers import TVAEWorker
+from hyperoptimization.utils import parse_hyperopt_args as hyperopt
+from hyperoptimization.runs import local_sequential as run
+import logging
+
+logging.basicConfig(level=logging.WARNING)
+
+
+def experiment(parser):
+    """Add the command line arguments of the experiment to `parser` and return it."""
+    parser.add_argument("dataset", help="HD5 file as expected in input by tvo.Training")
+    parser.add_argument("--Ksize", type=int, default=3, help="size of each K^n set")
+    parser.add_argument("--epochs", type=int, default=40, help="number of training epochs")
+    parser.add_argument(
+        "--net-shape",
+        required=True,
+        type=parse_net_shape,
+        help="column-separated list of layer sizes",
+    )
+    parser.add_argument("--min_lr", type=float, help="MLP min learning rate", required=True)
+    parser.add_argument("--max_lr", type=float, help="MLP max learning rate", required=True)
+    parser.add_argument("--batch-size", type=int, required=True)
+    parser.add_argument("--output", help="output file for train log", required=True)
+    parser.add_argument(
+        "--seed",
+        type=int,
+        help="seed value for random number generators. default is a random seed",
+    )
+    return parser
+
+
+def parse_net_shape(net_shape: str) -> Tuple[int, ...]:
+    """
+    Convert the textual description of a TVAE shape into a tuple of layer sizes.
+
+    :param net_shape: colon-separated list of integers, e.g. `"10:10:2"`
+    :returns: a tuple with the shape as integers, e.g. `(10,10,2)`
+    """
+    return tuple(int(layer_size) for layer_size in net_shape.split(":"))
+
+
+parser = experiment(hyperopt(Parser()))
+parsed_args = parser.parse_args()
+# the worker class is handed over uninstantiated; `run` creates the instance
+worker = TVAEWorker
+pr = None  # no previous result is handed to the optimizer
+# pr = result.logged_results_to_HBS_result("results")
+run(worker=worker, parsed_args=parsed_args, previous_run=pr)
diff --git a/hyperoptimization/neural_models.py b/hyperoptimization/neural_models.py
new file mode 100644
index 0000000..09b5961
--- /dev/null
+++ b/hyperoptimization/neural_models.py
@@ -0,0 +1,439 @@
+import warnings
+import numpy as np
+from warnings import warn
+from typing import List
+import torch as to
+import torch.nn as nn
+
+
+class FCDeConvNet(to.nn.Module):
+    def __init__(
+        self,
+        n_deconv_layers: int,
+        n_fc_layers: int,
+        W_shapes: List[int],
+        fc_activations: List,
+        dc_activations: List,
+        n_kernels: List[int],
+        dropouts: List[bool],
+        batch_norms: List[bool],
+        output_shape: int,
+        input_size,
+        dropout_rate=0.25,
+        filters_from_fc=1,
+        kernels=None,
+        paddings=None,
+        sanity_checks=False,
+        dtype=to.double,
+    ):
+        """
+        Adjustable deconvolutional network class. It builds an optionally deconvolutional
+         generative model with a fully connected base. Both options are set in blocks of
+         [(fully connected/deconv) layer,(dropout/batchnorm) regularizer, nonlinearity].
+         # Todo make fc base completely optional?
+
+        :param n_deconv_layers: number of transposed convolutions applied after the fc layers
+        :param n_fc_layers: number of fully connected layers to be applied to S
+        :param W_shapes: output sizes of the fully connected layers, one per layer. The last
+         entry is overwritten in place with output_shape if no deconv layers are used.
+        :param fc_activations: activation names of the fc layers; FCnet evaluates them with eval
+        :param dc_activations: activation names of the deconv layers, evaluated by Deconvnet
+        :param n_kernels: number of filters per deconv layer. The last entry is set to 1 in place.
+        :param dropouts: List of dropout booleans. Only applied to fc layers.
+        :param batch_norms: List of batch norm booleans. Only applied to deconv blocks.
+        :param output_shape: X.shape[-1]
+        :param input_size: S.shape
+        :param dropout_rate: global dropout rate # todo enable local dropout?
+        :param filters_from_fc: number of filters used for the hidden representation of the fc stack
+        :param kernels, paddings: kernel size and padding per deconv layer, handed to Deconvnet
+        :param dtype: currently unused, both stacks are always built with to.double
+        :param sanity_checks: BOOL, decides whether sanity checks are run at init.
+        """
+        super().__init__()
+
+        if sanity_checks:
+            fc_sanitize = (W_shapes, n_fc_layers, dropouts)
+            dc_sanitize = (n_deconv_layers, n_kernels, batch_norms, filters_from_fc)
+            self.test_sanity(input_size, fc_sanitize, dc_sanitize)
+
+        self.shape = [input_size]
+
+        self.n_deconv_layers = n_deconv_layers
+        # NOTE: n_kernels and W_shapes are modified in place below, visible to the caller
+        if n_kernels and n_kernels[-1] != 1:
+            warnings.warn(
+                "Final number of kernels exceeds expected dimensionality. Setting manually to 1."
+            )
+            n_kernels[-1] = 1
+
+        if n_fc_layers:
+            if not n_deconv_layers:
+                warnings.warn(
+                    "Using fully connected network, output layer set to to input shape manually"
+                )
+                W_shapes[-1] = output_shape
+            self.fc_stack = FCnet(
+                input_size, W_shapes, fc_activations, dropouts, dropout_rate, dtype=to.double
+            )
+
+        else:
+            self.fc_stack = nn.Sequential()
+
+        if n_deconv_layers:
+            if n_fc_layers:
+                input_size = W_shapes[-1]  # plug on top of fc
+            self.deconv_stack = Deconvnet(
+                input_size,
+                filters_from_fc,
+                kernels,
+                output_shape,
+                n_deconv_layers,
+                n_kernels,
+                dc_activations,
+                paddings,
+                batch_norms,
+                dtype=to.double,
+            )
+            self.deconv_stack.output_shape = output_shape
+
+        else:
+            self.deconv_stack = nn.Sequential()
+
+    def forward(self, x):
+        x = x.double()
+        h = self.fc_stack(x)
+        out = self.deconv_stack(h)
+        return out
+
+    def number_of_parameters(self):
+        return sum(p.numel() for p in self.parameters() if p.requires_grad)
+
+    @staticmethod  # static to make mypy happy
+    def test_sanity(input_size, fc_sanitize, dc_sanitize):
+        """Assert that the descriptions of the fc and deconv layers are consistent."""
+        # make sanity checks
+        # Todo: decide whether and which sanity checks can be removed
+        W_shapes, n_fc_layers, dropouts = fc_sanitize
+        n_deconv_layers, n_kernels, batch_norms, filters_from_fc = dc_sanitize
+
+        # fc sanity
+        assert (
+            len(W_shapes) == n_fc_layers == len(dropouts)
+        ), "add information for all fc layers (dropout can be 0)"
+
+        # dc sanity
+        assert (
+            len(n_kernels) == n_deconv_layers == len(batch_norms)
+        ), "add information for all deconv layers"
+
+        # fc+dc sanity
+        if n_kernels and n_fc_layers:
+            initial_deconv_dim = int(np.sqrt(W_shapes[-1] / n_kernels[0]))
+            assert n_kernels[0] == W_shapes[-1] / initial_deconv_dim**2, (
+                "the output of the final fully connected layer should be "
+                "a product of squares, where the product is the number "
+                "of filters and the square is the shape of the filters. "
+            )
+        # dc sanity
+        elif n_kernels:
+            initial_deconv_dim = int(np.sqrt(input_size / n_kernels[0]))
+            assert initial_deconv_dim == np.sqrt(
+                input_size / n_kernels[0]
+            ), "pure deconvnet can only be used if the input size is a product of squares"
+
+        assert (
+            filters_from_fc == int(filters_from_fc) and filters_from_fc > 0
+        ), "filters need to be positive"
+
+
+class FCnet(to.nn.Module):
+    def __init__(
+        self,
+        input_size,
+        W_shapes: List[int],
+        fc_activations: List,
+        dropouts: List[bool],
+        dropout_rate=0.25,
+        dtype=to.double,
+    ):
+        """Build one [linear, optional dropout, activation] block per zipped layer description."""
+        super().__init__()
+
+        if not hasattr(self, "shape"):
+            self.shape = [input_size]
+
+        if not hasattr(self, "fc_stack"):
+            self.fc_stack = nn.Sequential()
+
+        # setup fully connected blocks
+        in_features = input_size
+        # build fc blocks; every activation is a string that is evaluated with eval and
+        # then called without arguments, so only trusted configurations must be passed in
+        for i, (n_hidden, activation, dropout) in enumerate(
+            zip(W_shapes, fc_activations, dropouts)
+        ):
+            self.shape.append(n_hidden)  # store shape for TVEM
+            self.fc_stack.add_module(
+                "linear_{}".format(i),
+                nn.Linear(in_features, out_features=n_hidden, dtype=dtype),
+            )
+            # add dropout to layer
+            if dropout:
+                self.fc_stack.add_module("dropout_layer{}".format(i), nn.Dropout(dropout_rate))
+            self.fc_stack.add_module("activation_{}".format(i), eval(activation)())
+            in_features = n_hidden
+
+        self.dropout = nn.Dropout(p=dropout_rate)  # not part of fc_stack, unused in forward
+
+    def forward(self, x):
+        out = self.fc_stack(x)
+        return out
+
+
+class Deconvnet(to.nn.Module):
+    def __init__(
+        self,
+        in_features,
+        filters_from_fc,
+        kernels,
+        output_shape,
+        n_deconv_layers,
+        n_kernels,
+        dc_activations,
+        paddings=None,
+        batch_norms=None,
+        dtype=to.double,
+    ):
+        """
+        Stack of transposed convolution blocks [deconv, optional batch norm, activation].
+        kernels: kernel sizes, e.g. =[3] results in kernels of 3x3; computed if not given
+        n_kernels: number of kernels. e.g.  =2 results in 2 3x3 kernels
+        dc_activations: strings that are evaluated with eval, pass trusted values only
+        output_shape: number of output values; must equal the size of the final feature maps
+        """
+
+        super().__init__()
+        self.output_shape = output_shape  # read by forward()
+        if not hasattr(self, "shape"):
+            self.shape = [in_features]
+
+        if not hasattr(self, "deconv_stack"):
+            self.deconv_stack = nn.Sequential()
+
+        # transposed convolution blocks
+        input_len = int(np.sqrt(in_features))
+        input_shape = (input_len, input_len, filters_from_fc)
+
+        if not kernels:
+            # calculate total increase in dimensionality
+            total_upsampling = int(np.sqrt(output_shape) - np.sqrt(input_shape[0] * input_shape[1]))
+            assert total_upsampling == np.sqrt(output_shape) - np.sqrt(
+                input_shape[0] * input_shape[1]
+            )
+
+            if total_upsampling < 0:
+                warn("Transposed convolution used for downsampling")
+
+            # calculate kernel sizes and paddings, such as the outputs match the
+            # dimensionality of the output
+            kernels, paddings = self.deconvolution_hypers_from_upsampling(
+                upsampling=total_upsampling, min_kernel=3, n_layers=n_deconv_layers
+            )
+
+        if not paddings:
+            paddings = [0] * len(kernels)
+
+        if not batch_norms:
+            batch_norms = [0] * len(kernels)
+
+        # add the transposed convolution blocks
+        hypers = zip(batch_norms, n_kernels, kernels, paddings, dc_activations)
+        for i, (batch_norm, n_kernels_, kernel_size, padding, activation) in enumerate(hypers):
+            # store the number of weights per input channel (n_kernels * kernel area)
+            self.shape.append(n_kernels_ * kernel_size**2)
+            self.deconv_stack.add_module(
+                "conv_transpose_{}".format(i),
+                nn.ConvTranspose2d(
+                    in_channels=input_shape[-1],
+                    out_channels=n_kernels_,
+                    kernel_size=kernel_size,
+                    padding=padding,
+                    dtype=dtype,
+                ),
+            )
+
+            if batch_norm:
+                # the batch norm has to share the dtype of the convolutions
+                batch_norm_layer = nn.BatchNorm2d(n_kernels_, dtype=dtype)
+                self.deconv_stack.add_module("batch_norm_{}".format(i), batch_norm_layer)
+
+            self.deconv_stack.add_module("deconv_activation_{}".format(i), eval(activation)())
+
+            input_shape = self.deconv_output_shape(
+                input_len=input_shape[0],
+                filters=n_kernels_,
+                kernel=kernel_size,
+                padding=padding,
+            )
+
+        assert input_shape[0] == input_shape[1]
+        assert output_shape == np.prod(
+            input_shape
+        ), "output ({}) not equal to product of input ({})".format(output_shape, input_shape)
+
+        # todo: change self.shape functionality appropriately after the TVAE changes
+        self.shape.append(output_shape)
+
+    def forward(self, x):
+        """Map x, assumed to be (n, S, H) with H a square number, to an (n, S, D) tensor."""
+        n, S_kn, D = x.shape[0], x.shape[1], self.output_shape
+        h = x.reshape(n, S_kn, int(np.sqrt(x.shape[-1])), int(np.sqrt(x.shape[-1])))
+        out = to.empty(size=(n, S_kn, D), device=h.device, dtype=h.dtype)
+        for s in range(S_kn):
+            # every state s is passed through the stack as a batch of single-channel images
+            h_s = self.deconv_stack(h[:, s, :, :].unsqueeze(axis=1))
+            # todo force last filter to match the dimensionality?
+            out[:, s, :] = to.reshape(h_s, (n, D))
+
+        return out
+
+    @staticmethod
+    def deconvolution_hypers_from_upsampling(upsampling: int, min_kernel=3, n_layers=1):
+        """
+        :param upsampling: dimentionality needed to upsample image
+        :param min_kernel: minimum kernel size
+        :param n_layers: number of layers on which to spread the upsampling on
+        :return: the kernel size and padding for each layer such that the upsampling is obeyed.
+        """
+
+        # todo: decide how to treat bad n_layers input
+        if n_layers <= 0:
+            return [], []
+        # n_layers is positive from here on
+
+        if upsampling == 0:
+            return [3] * n_layers, [1] * n_layers
+
+        # each layer gets the same amount of upsampling
+        layer_upsampling = upsampling // n_layers
+
+        # any remaining upsampling goes to the final layer to reduce compute
+        last_layer = layer_upsampling + upsampling % n_layers
+
+        assert layer_upsampling * n_layers + upsampling % n_layers == upsampling
+
+        # assign upsampling to layers
+        layers_upsampling = [layer_upsampling for _ in range(n_layers)]
+        layers_upsampling[-1] = last_layer
+        kernels, paddings = [], []
+
+        # compute kernel/padding combination for each upsampling value
+        for upsampling_ in layers_upsampling:
+            # padding adds 2
+            # kernel adds -1, with kernel=1 -> upsampling=0
+            kernel = min_kernel
+            padding = (min_kernel - upsampling_ - 1) / 2
+
+            # padding uneven
+            if padding != int(padding):
+                kernel -= 1 * np.sign(padding)
+                padding = int(padding)
+
+            # worked examples with min_kernel=3:
+            #  upsampling_=1 -> padding=0.5 -> kernel=2, padding=0
+            #  upsampling_=5 -> padding=-1.5 -> kernel=4, padding=-1 (uneven case above)
+            #                -> kernel=6, padding=0 (negative case below)
+
+            if padding < 0:
+                kernel += 2 * abs(padding)
+                padding = 0
+
+            # print('upsampling for layer: kernels={}, padding={}'.format(kernel, padding))
+            assert (
+                -2 * padding + (kernel - 1) == upsampling_
+            ), "Logical error in upsampling: padding = {}, kernel = {}, upsamplings = {}".format(
+                padding, kernel, layers_upsampling
+            )
+            # todo decide whether to allow negative padding
+            assert padding >= 0, "upsampling {} with negative padding={}".format(
+                upsampling_, padding
+            )
+            kernels.append(int(kernel))
+            paddings.append(int(padding))
+
+        actual = np.sum(np.array(kernels) - 1) - 2 * np.sum(np.array(paddings))
+        diff = actual - upsampling
+
+        assert (
+            diff == 0
+        ), "Unexpected diff={} between expected result ({})" "and actual ({})".format(
+            diff, upsampling, actual
+        )
+        return kernels, paddings
+
+    def deconv_output_shape(
+        self,
+        input_len,
+        filters,
+        kernel,
+        stride=1,
+        padding=0,
+        dilation=1,
+        output_padding=0,
+    ):
+        """
+        returns the output shape of a transposed convolutional layer
+        :param input_len: length of input: for 3x28x28 coloured MNIST it's 28
+        :param filters: number of filters
+        :param kernel: 1D length of kernel: for a 3x3 kernel it's a 3
+        :param stride: stride of the filter
+        :param padding: 1D size of padding around the input.
+        :return: the output shape of a transposed convolutional layer
+        """
+        assert type(input_len + filters + stride + kernel + dilation + output_padding) is int
+        out = int(
+            stride * (input_len - 1) - 2 * padding + dilation * (kernel - 1) + output_padding + 1
+        )
+        return (out, out, filters)
+
+    def conv_output_shape(self, input_len, filters, kernel, stride=1, padding=0, dilation=1):
+        """
+        returns the output shape of a convolutional layer
+        :param input_len: length of input: for 3x28x28 coloured MNIST it's 28
+        :param filters: number of filters
+        :param kernel: 1D length of kernel: for a 3x3 kernel it's a 3
+        :param stride: stride of the filter
+        :param padding: 1D size of padding around the input.
+        :return: the output shape of a convolutional layer
+        """
+        assert type(input_len + filters + stride + kernel) is int
+        out = (input_len + 2 * padding - dilation * (kernel - 1) - 1) / stride + 1
+        out = int(out)
+        return (out, out, filters)
+
+
+# added in neural_models.py
+
+
+class FCDeConvNetSigOut(FCDeConvNet):
+    def forward(self, x):  # same network as FCDeConvNet, with the output squashed by a sigmoid
+        return super().forward(x).sigmoid()
+
+
+# this function computes the feature map of a convolutional layer.
+def feature_map(w, h, d, n_kernels, kernel_size):
+    """
+    Output volume and weight count (biases not included) of a convolution
+    with stride 1 and no padding, applied to an input of size w x h x d.
+    """
+    volume = (w - kernel_size + 1, h - kernel_size + 1, n_kernels)
+    return volume, kernel_size**2 * d * n_kernels
+
+
+def deconv_2_l(n_kernels):
+    """Summed output volumes and weight counts of two stacked `feature_map` layers
+    (15x15 kernels, 32x32x3 input; 3 kernels in the first layer, `n_kernels` in the second)."""
+    first, weights1 = feature_map(32, 32, 3, 3, 15)
+    second, weights2 = feature_map(*first, n_kernels, 15)
+    volumes = [side_a * side_b * depth for side_a, side_b, depth in (first, second)]
+    return volumes[0] + volumes[1], weights2 + weights1
diff --git a/hyperoptimization/run_best_config.py b/hyperoptimization/run_best_config.py
new file mode 100644
index 0000000..af8431f
--- /dev/null
+++ b/hyperoptimization/run_best_config.py
@@ -0,0 +1,72 @@
+import hpbandster.core.result as hpres
+from hyperoptimization.runs import from_config as run
+from hyperoptimization.workers import TVAEWorker
+from hyperoptimization.utils import parse_hyperopt_args as hyperopt
+from hyperoptimization.explore import sorted_by_value
+from argparse import ArgumentParser as Parser
+from typing import Tuple
+
+
+def experiment(parser):
+    """Add the command line arguments of the experiment to `parser` and return it."""
+    parser.add_argument("dataset", help="HD5 file as expected in input by tvo.Training")
+    parser.add_argument("--Ksize", type=int, default=3, help="size of each K^n set")
+    parser.add_argument("--epochs", type=int, default=40, help="number of training epochs")
+    parser.add_argument(
+        "--net-shape",
+        required=True,
+        type=parse_net_shape,
+        help="column-separated list of layer sizes",
+    )
+    parser.add_argument("--min_lr", type=float, help="MLP min learning rate", required=True)
+    parser.add_argument("--max_lr", type=float, help="MLP max learning rate", required=True)
+    parser.add_argument("--batch-size", type=int, required=True)
+    parser.add_argument("--output", help="output file for train log", required=True)
+    parser.add_argument(
+        "--seed",
+        type=int,
+        help="seed value for random number generators. default is a random seed",
+    )
+    return parser
+
+
+def parse_net_shape(net_shape: str) -> Tuple[int, ...]:
+    """
+    Convert the textual description of a TVAE shape into a tuple of layer sizes.
+
+    :param net_shape: colon-separated list of integers, e.g. `"10:10:2"`
+    :returns: a tuple with the shape as integers, e.g. `(10,10,2)`
+    """
+    return tuple(int(layer_size) for layer_size in net_shape.split(":"))
+
+
+parser = experiment(hyperopt(Parser()))
+parsed_args = parser.parse_args()
+
+worker = TVAEWorker
+
+path = ""
+result = hpres.logged_results_to_HBS_result(path)
+all_runs = result.get_all_runs()
+id2conf = result.get_id2config_mapping()
+# sorted_by_value sorts ascending by -loss; reverse so that the lowest loss comes first
+ordered_by_loss = sorted_by_value(all_runs, key="loss")[::-1]
+
+best_n_configs = 5
+if best_n_configs:
+    raise Exception("modify to take arbitrary sort key")
+for best_run in ordered_by_loss[:best_n_configs]:
+    config_id = best_run["config_id"]
+    config = id2conf[config_id]["config"]
+    # config['lr']*=10
+    print("Running long experiment with config:")
+    print("{:<20} {:<20} ".format("hyperparameter", "value"))
+    for key, value in config.items():
+        print("{:<20} {:<20} ".format(key, value))
+
+    run(
+        config,
+        budget=parsed_args.epochs,
+        worker=worker,
+        parsed_args=parsed_args,
+    )
diff --git a/hyperoptimization/runs.py b/hyperoptimization/runs.py
new file mode 100644
index 0000000..a4b72bd
--- /dev/null
+++ b/hyperoptimization/runs.py
@@ -0,0 +1,130 @@
+import os
+import pickle
+
+import hpbandster.core.nameserver as hpns
+import hpbandster.core.result as hpres
+from hpbandster.optimizers import BOHB
+
+
+def from_config(config, budget, worker, parsed_args, *args, **kwargs):
+    """Compute one config on a local worker and pickle the result to results.pkl."""
+    host = hpns.nic_name_to_host(parsed_args.nic_name)
+    # Start a nameserver:
+    NS = hpns.NameServer(
+        run_id=parsed_args.run_id,
+        host=host,
+        port=None,
+        working_directory=parsed_args.shared_directory,
+    )
+    ns_host, ns_port = NS.start()
+    try:  # the nameserver is shut down even if the worker or the computation fails
+        # Start local worker
+        w = worker(
+            run_id=parsed_args.run_id,
+            host=host,
+            nameserver=ns_host,
+            nameserver_port=ns_port,
+            timeout=120,
+            parsed_args=parsed_args,
+            *args,
+            **kwargs
+        )
+        w.run(background=True)
+        cwd = os.getcwd()
+        res = w.compute(config=config, budget=budget, working_directory=cwd, *args, **kwargs)
+        with open(os.path.join(parsed_args.shared_directory, "results.pkl"), "wb") as fh:
+            pickle.dump(res, fh)
+    finally:
+        NS.shutdown()
+
+
+def local_sequential(worker, parsed_args, previous_run=None, *args, **kwargs):
+    """Run BOHB with a single local worker and pickle the result to results.pkl."""
+    # get hostname
+    host = hpns.nic_name_to_host(parsed_args.nic_name)
+
+    # log results
+    result_logger = hpres.json_result_logger(directory=parsed_args.shared_directory, overwrite=True)
+
+    # Start a nameserver:
+    NS = hpns.NameServer(
+        run_id=parsed_args.run_id,
+        host=host,
+        port=None,
+        working_directory=parsed_args.shared_directory,
+    )
+    ns_host, ns_port = NS.start()
+    bohb = None  # stays None if the run fails before the optimizer exists
+    try:
+        # Start local worker
+        w = worker(
+            run_id=parsed_args.run_id,
+            host=host,
+            nameserver=ns_host,
+            nameserver_port=ns_port,
+            timeout=120,
+            parsed_args=parsed_args,
+            *args,
+            **kwargs
+        )
+        w.run(background=True)
+
+        # Run an optimizer
+        bohb = BOHB(
+            configspace=w.get_configspace(),
+            run_id=parsed_args.run_id,
+            host=host,
+            nameserver=ns_host,
+            nameserver_port=ns_port,
+            result_logger=result_logger,
+            min_budget=parsed_args.min_budget,
+            max_budget=parsed_args.max_budget,
+            previous_result=previous_run,
+        )
+        res = bohb.run(n_iterations=parsed_args.n_iterations)
+
+        # store results
+        with open(os.path.join(parsed_args.shared_directory, "results.pkl"), "wb") as fh:
+            pickle.dump(res, fh)
+    finally:
+        # shutdown, also after a failure, so that no optimizer or nameserver is left behind
+        if bohb is not None:
+            bohb.shutdown(shutdown_workers=True)
+        NS.shutdown()
+
+
+def on_the_cluster(worker, parsed_args, previous_run=None, *args, **kwargs):
+    """Run BOHB, waiting for parsed_args.n_workers workers, and pickle the result."""
+    host = hpns.nic_name_to_host(parsed_args.nic_name)
+    NS = hpns.NameServer(
+        run_id=parsed_args.run_id, host=host, port=0, working_directory=parsed_args.shared_directory
+    )
+    ns_host, ns_port = NS.start()
+    bohb = None  # stays None if the run fails before the optimizer exists
+    try:
+        w = worker(
+            sleep_interval=0.5,
+            run_id=parsed_args.run_id,
+            host=host,
+            nameserver=ns_host,
+            nameserver_port=ns_port,
+            parsed_args=parsed_args,
+        )
+        w.run(background=True)
+        bohb = BOHB(
+            configspace=w.get_configspace(),
+            run_id=parsed_args.run_id,
+            host=host,
+            nameserver=ns_host,
+            nameserver_port=ns_port,
+            min_budget=parsed_args.min_budget,
+            max_budget=parsed_args.max_budget,
+            previous_result=previous_run,
+        )
+        res = bohb.run(n_iterations=parsed_args.n_iterations, min_n_workers=parsed_args.n_workers)
+        with open(os.path.join(parsed_args.shared_directory, "results.pkl"), "wb") as fh:
+            pickle.dump(res, fh)
+    finally:
+        if bohb is not None:
+            bohb.shutdown(shutdown_workers=True)
+        NS.shutdown()
diff --git a/hyperoptimization/test_neural_models.py b/hyperoptimization/test_neural_models.py
new file mode 100644
index 0000000..94765bc
--- /dev/null
+++ b/hyperoptimization/test_neural_models.py
@@ -0,0 +1,207 @@
+import matplotlib.pyplot as plt
+import torch as to
+import torchvision.datasets as datasets
+import torchvision.transforms as transforms
+from neural_models import FCnet, Deconvnet
+from itertools import chain
+import argparse
+import logging
+
+# silence logging calls up to the default threshold of logging.disable()
+logging.disable()
+
+# command line interface of the script
+parser = argparse.ArgumentParser()
+
+parser.add_argument("--n_epochs", default=5, type=int)
+parser.add_argument(
+    "--model",
+    default="ConvDeConv",
+    choices=["ConvDeConv", "AE"],
+    help="Model can be ConvDeConv or AE",
+)
+parser.add_argument("--lr", default=0.0001, type=float)
+
+# NOTE: the arguments are parsed at module level, i.e. as soon as this file is executed
+args = parser.parse_args()
+
+# Script made with torch==1.7.0
+# NOTE(review): to.nn.LazyLinear and the dtype keyword passed to the layers further down
+# may require a newer torch release than the one stated here -- confirm
+
+# declare processor: convert images to tensors, then normalize with mean 0.5 and std 0.5
+transform = transforms.Compose(
+    [
+        transforms.ToTensor(),
+        transforms.Normalize((0.5,), (0.5,)),
+    ]
+)
+
+# prepare data (MNIST is downloaded to ./data if necessary); batches of 1000, shuffled
+mnist_trainset = datasets.MNIST(root="./data", train=True, download=True, transform=transform)
+train_loader = to.utils.data.DataLoader(mnist_trainset, batch_size=1000, shuffle=True)
+mnist_testset = datasets.MNIST(root="./data", train=False, download=True, transform=transform)
+test_loader = to.utils.data.DataLoader(mnist_testset, batch_size=1000, shuffle=True)
+# define loss function
+loss_fn = to.nn.MSELoss()
+
+# define optimizer (the class itself; the models below instantiate it with their parameters)
+optimizer = to.optim.Adam
+
+
+class AutoEncoder(object):
+    """
+    Minimal fully connected auto-encoder built from four bias-free weight matrices.
+
+    Hidden layers use leaky ReLU, the output layer uses tanh. The class relies on the
+    module-level ``optimizer``, ``loss_fn`` and ``train_loader`` objects of this script.
+    """
+
+    def __init__(self, shape=None, lr=0.003):
+        """
+        :param shape: five layer sizes (input, hidden, code, hidden, output);
+            defaults to (28**2, 256, 64, 256, 28**2)
+        :param lr: learning rate handed to the optimizer
+        """
+        if shape is None:
+            shape = (28**2, 256, 64, 256, 28**2)
+
+        # define weight initialization
+        w_init = to.nn.init.xavier_normal_
+
+        # initialize linear layers in double precision: train() feeds double batches and
+        # a matrix product needs both operands to share one dtype
+        self.W0 = w_init(to.empty(shape[0], shape[1], dtype=to.double, requires_grad=True))
+        self.W1 = w_init(to.empty(shape[1], shape[2], dtype=to.double, requires_grad=True))
+        self.W2 = w_init(to.empty(shape[2], shape[3], dtype=to.double, requires_grad=True))
+        self.W3 = w_init(to.empty(shape[3], shape[4], dtype=to.double, requires_grad=True))
+
+        # load layers to optimizer
+        self.optimizer = optimizer([self.W0, self.W1, self.W2, self.W3], lr=lr)
+
+    def forward(self, x):
+        """
+        :param x: batch of flattened inputs, one row per datapoint
+        :return: reconstruction of x, in the dtype of the weights
+        """
+        # define activation function
+        f = to.nn.functional.leaky_relu
+
+        # accept inputs of any floating dtype (e.g. the float32 test batch)
+        x = x.to(self.W0.dtype)
+
+        # forward pass
+        h0 = f(x @ self.W0)
+        h1 = f(h0 @ self.W1)
+        h2 = f(h1 @ self.W2)
+        rec = to.tanh(h2 @ self.W3)
+
+        return rec
+
+    def train(self, epochs):
+        """
+        Train on the digits 0 and 1 of ``train_loader``.
+
+        :param epochs: number of passes over the training data
+        :return: list with the average per-datapoint loss of every epoch
+        """
+
+        losses = []
+
+        for n_epoch in range(epochs):
+
+            loss_sum = 0.0
+            no_datapoints = 0
+
+            for batch, target in train_loader:
+
+                # get 0s and 1s
+                batch_ = (
+                    batch[to.logical_or(target == 0, target == 1)].flatten(start_dim=1).double()
+                )
+                if batch_.shape[0] == 0:
+                    continue
+
+                # train
+                self.optimizer.zero_grad()
+                reconstruction = self.forward(batch_).reshape(batch_.shape)
+                loss = loss_fn(reconstruction, batch_)
+                loss.backward()
+                self.optimizer.step()
+
+                # log: loss_fn yields a batch mean, so weight it by the batch size to be
+                # able to form a true average over all datapoints of the epoch
+                loss_sum += loss.item() * len(batch_)
+                no_datapoints += len(batch_)
+
+            # guard against an epoch without any matching datapoint
+            avg_train_loss = loss_sum / no_datapoints if no_datapoints else float("nan")
+
+            print(
+                "Epoch {} finished with average loss of {}".format(
+                    n_epoch, round(avg_train_loss, 6)
+                )
+            )
+            losses.append(avg_train_loss)
+
+        print("Training is complete.")
+        return losses
+
+
+class FCAE(AutoEncoder):
+    """Auto-encoder whose forward pass is delegated to an ``FCnet`` from neural_models."""
+
+    def __init__(self, shape=None, lr=0.003):
+        """
+        :param shape: layer sizes, input size first; defaults to (28**2, 256, 64, 256, 28**2)
+        :param lr: learning rate of the optimizer; the default equals the value that was
+            previously hard-coded
+        """
+        # resolve the default here as well: len(shape) below cannot handle None
+        if shape is None:
+            shape = (28**2, 256, 64, 256, 28**2)
+        super().__init__(shape, lr=lr)
+        n_stacks = len(shape) - 1
+        self.model = FCnet(
+            input_size=shape[0],
+            W_shapes=shape[1:],
+            fc_activations=["to.nn.LeakyReLU"] * n_stacks,
+            dropouts=[0] * n_stacks,
+            dropout_rate=0.25,
+        )
+        # replace the optimizer set by the base class so that the FCnet parameters are trained
+        self.optimizer = optimizer(self.model.parameters(), lr=lr)
+
+    def forward(self, x):
+        """Return the output of the wrapped FCnet for the batch x."""
+        return self.model.forward(x)
+
+
+class ConvDeconv(AutoEncoder):
+    """
+    Auto-encoder with a convolutional encoder and a ``Deconvnet`` decoder from neural_models.
+
+    The code size is the middle entry of ``shape``.
+    """
+
+    def __init__(self, shape=None, dtype=to.double, lr=0.003):
+        """
+        :param shape: layer sizes; only the middle entry (code size) is used by this model.
+            Defaults to (28**2, 256, 64, 256, 28**2)
+        :param dtype: dtype of the encoder layers
+        :param lr: learning rate of the optimizer
+        """
+        # resolve the default here as well: len(shape) below cannot handle None
+        if shape is None:
+            shape = (28**2, 256, 64, 256, 28**2)
+        super().__init__(shape, lr=lr)
+        n_stacks = len(shape) - 1
+        half_stacks = n_stacks // 2
+        self.encoder = to.nn.Sequential(
+            to.nn.Conv2d(in_channels=1, out_channels=6, kernel_size=5, dtype=dtype),
+            to.nn.Flatten(),
+            to.nn.LeakyReLU(),
+            to.nn.LazyLinear(out_features=shape[half_stacks], dtype=dtype),
+            to.nn.LeakyReLU(),
+        )
+
+        self.decoder = Deconvnet(
+            in_features=shape[half_stacks],
+            filters_from_fc=1,
+            kernels=None,
+            output_shape=28**2,
+            n_deconv_layers=2,
+            n_kernels=[5, 1],
+            batch_norms=None,
+            dc_activations=["to.nn.Tanh"] * half_stacks,
+        )
+        self.decoder.D = 28**2
+        self.decoder.double()
+
+        # replace the optimizer set by the base class; use the requested learning rate
+        # (it used to be fixed to 0.003 here, which made the lr argument ineffective)
+        parameters = chain(self.encoder.parameters(), self.decoder.parameters())
+        self.optimizer = optimizer(parameters, lr=lr)
+
+    def forward(self, x):
+        """
+        :param x: batch of flattened 28x28 images
+        :return: output of the decoder for the encoded batch
+        """
+        z = self.encoder(x.double().reshape(x.shape[0], 1, 28, 28))
+        # set S_K(n) to 1 because we are reusing Deconvnet
+        z = z.reshape(z.shape[0], 1, z.shape[-1])
+        x_hat = self.decoder(z)
+
+        return x_hat
+
+
+# turn interactive plotting off; the figures are only saved to disk below
+plt.ioff()
+
+# define model
+if args.model == "ConvDeConv":
+    model = ConvDeconv(shape=(28**2, 256, 64, 256, 28**2), lr=args.lr)
+elif args.model == "AE":
+    model = AutoEncoder(shape=(28**2, 256, 64, 256, 28**2), lr=args.lr)  # type: ignore
+# train model; train() returns one loss value per epoch
+avg_train_loss = model.train(epochs=args.n_epochs)
+
+# get 0s and 1s for testing: keep the first test batch that contains at least one of them
+for batch, target in test_loader:
+    batch_ = batch[to.logical_or(target == 0, target == 1)].flatten(start_dim=1)
+    if batch_.shape[0] != 0:
+        break
+
+# reconstruct
+reconstruction = model.forward(batch_)
+
+# plot originals against reconstruction: originals in the top row, reconstructions below
+n_show = 7
+fig, axs = plt.subplots(2, n_show)
+for i in range(n_show):
+    axs[0, i].imshow(batch_[i].reshape(28, 28))
+    axs[0, i].axis("off")
+    axs[1, i].imshow(reconstruction[i].reshape(28, 28).detach().numpy())
+    axs[1, i].axis("off")
+    # put the row titles on the middle column only
+    if round((n_show - 1) / 2) == i:
+        axs[0, i].title.set_text("Original")
+        axs[1, i].title.set_text("Reconstruction")
+plt.savefig("reconstruction")
+
+# plot training loss (one point per epoch)
+plt.figure()
+plt.plot(avg_train_loss)
+plt.title("loss")
+plt.savefig("loss")
diff --git a/hyperoptimization/utils.py b/hyperoptimization/utils.py
new file mode 100644
index 0000000..345dd58
--- /dev/null
+++ b/hyperoptimization/utils.py
@@ -0,0 +1,51 @@
+def parse_hyperopt_args(parser):
+    """
+    Register the hpbandster-related command line options on an argument parser.
+
+    :param parser: an ArgumentParser object from argparse; the options are added in place
+    :return: the same parser, with the options registered but nothing parsed yet
+    """
+    # one (flag, add_argument keywords) entry per option, in registration order
+    option_table = (
+        (
+            "--min_budget",
+            {"type": float, "help": "Minimum number of epochs for training.", "default": 1},
+        ),
+        (
+            "--max_budget",
+            {"type": float, "help": "Maximum number of epochs for training.", "default": 5},
+        ),
+        (
+            "--n_iterations",
+            {
+                "type": int,
+                "help": "Number of iterations performed by the optimizer",
+                "default": 16,
+            },
+        ),
+        (
+            "--worker",
+            {
+                "help": "Flag to turn this into a worker process",
+                "action": "store_true",
+                "default": False,
+            },
+        ),
+        (
+            "--run_id",
+            {
+                "type": str,
+                "help": "A unique run id for this optimization run. An easy option is "
+                "to use the job id of the clusters scheduler.",
+                "default": "derp",
+            },
+        ),
+        (
+            "--nic_name",
+            {
+                "type": str,
+                "help": "Which network interface to use for communication.",
+                "default": "lo",
+            },
+        ),
+        (
+            "--shared_directory",
+            {
+                "type": str,
+                "help": "A directory that is accessible for all processes, e.g. a NFS share.",
+                "default": ".",
+            },
+        ),
+    )
+
+    for flag, settings in option_table:
+        parser.add_argument(flag, **settings)
+
+    return parser
diff --git a/hyperoptimization/workers.py b/hyperoptimization/workers.py
new file mode 100644
index 0000000..6c4307a
--- /dev/null
+++ b/hyperoptimization/workers.py
@@ -0,0 +1,627 @@
+import h5py
+import numpy as np
+import torch as to
+
+import tvo
+from tvo.utils import get
+from tvo.models import BernoulliTVAE as TVAE
+from tvo.exp import EVOConfig, ExpConfig, Training, Testing
+
+from hyperoptimization.neural_models import FCDeConvNetSigOut as FCDeConvNet
+
+import ConfigSpace as CS
+import ConfigSpace.hyperparameters as CSH
+from hpbandster.core.worker import Worker
+
+import logging
+
+logging.basicConfig(level=logging.INFO)
+
+
+class BaseWorker(Worker):
+    """Worker that accepts additional keyword arguments besides those of ``Worker``."""
+
+    def __init__(self, **kwargs):
+        worker_kwargs = self.extract_worker_args(**kwargs)
+        super().__init__(**worker_kwargs)
+
+    def extract_worker_args(self, **kwargs):
+        """
+        Reduce arbitrary keyword arguments to the ones handed to the Worker constructor.
+
+        Downstream classes (e.g. the TVAE worker) may receive extra keywords; those are
+        dropped here. Accepted keywords that were not supplied are passed on as None.
+        :param kwargs: any **kwargs; must contain run_id
+        :return: inputs accepted by Worker class
+        """
+        assert "run_id" in kwargs, "run_id is necessary"
+        accepted = (
+            "run_id",
+            "nameserver",
+            "nameserver_port",
+            "logger",
+            "host",
+            "id",
+            "timeout",
+        )
+        return {name: kwargs.get(name) for name in accepted}
+
+
+# TODO: See if it is useful to make a cleaner separation between model and worker
+class TVAEWorker(BaseWorker):
+    def __init__(self, parsed_args, **kwargs):
+        """
+        Set up the worker from the parsed script arguments.
+
+        :param parsed_args: namespace of arguments passed to the script. The attributes
+         read here are:
+        - Ksize: number of states to be kept for truncated inference; a falsy value turns
+          S into a hyperparameter
+        - dataset: name of the dataset file to be used
+        - batch_size: number of samples per batch
+        - output: name of the output file
+        - min_lr: minimum learning rate for the cyclic learning rate scheduler
+        - max_lr: maximum learning rate for the cyclic learning rate scheduler
+        - net_shape (optional): shape of the network; its last entry is used as H
+        - H: size of the first generative layer, read when net_shape is absent; a falsy
+          value turns H into a hyperparameter
+        - cyclic_lr (optional): whether to use a cyclic learning rate; False when absent
+        :param kwargs: Additional arguments to be passed to the underlying Worker class.
+        """
+
+        # call base class constructor
+        super().__init__(**kwargs)
+
+        # plain copies of the script arguments
+        self.S = parsed_args.Ksize
+        self.data_fname = parsed_args.dataset
+        self.epochs_per_half_cycle = 1
+        self.batch_size = parsed_args.batch_size
+        self.output = parsed_args.output
+        self.min_lr = parsed_args.min_lr
+        self.max_lr = parsed_args.max_lr
+
+        # H is the last entry of net_shape if that argument exists. TODO: phase net_shape out
+        try:
+            self.H = parsed_args.net_shape[-1]
+        except AttributeError:
+            self.H = parsed_args.H
+
+        self.cyclic_lr = getattr(parsed_args, "cyclic_lr", False)
+
+        # a falsy S or H means that the value has to come from the sampled configuration
+        self.is_hyperparameter_S = not self.S
+        self.is_hyperparameter_H = not self.H
+        self.is_hyperparameter_EEM = False
+
+        # loads data, sets N and D
+        self.handle_data()
+
+        # set the config space
+        self.set_configspace()
+
+        # print out dataset information; the ground truth is optional in the data file
+        print(f"\ninput file: {parsed_args.dataset}")
+        try:
+            true_logL = self.data_file["ground_truth"]["logL"][...]
+        except KeyError:
+            pass
+        else:
+            print(f"true logL: {true_logL}")
+
+    def compute(self, config, budget, working_directory, *args, **kwargs):
+        """
+        Train and test one TVAE for a sampled configuration and report its free energies.
+
+        :param config: a config in hpbandster style that contains the model hyperparameters.
+        :param budget: amount of epochs to run the model (truncated to an integer)
+        :param working_directory: arg used by hpbandster
+        :param args: other args
+        :param kwargs: other keyworded args
+        :return: dict with the entries "loss" (negative training free energy) and "info"
+                 (test, train and validation values plus the number of model parameters)
+        """
+
+        # extract S and H if they are hyperparameters
+        self.extract_hypers_from_config(config)
+
+        # define the model
+        model = self.get_external_model(config)
+
+        # setup optimizer
+        if config["optimizer"] == "SGD":
+            optimizer = to.optim.SGD(
+                model.parameters(),
+                lr=config["lr"],
+                momentum=config["sgd_momentum"],
+            )
+        elif config["optimizer"] == "Adam":
+            optimizer = to.optim.Adam(model.parameters(), lr=config["lr"])
+        else:
+            raise NotImplementedError("Currently we support only SGD with momentum and Adam")
+
+        # move the network to the device selected by tvo and remember it on the model
+        model.to(tvo.get_device())
+        model.device = tvo.get_device()
+
+        # setup TVAE
+        # number of batches per pass over the N datapoints, times the epochs per half cycle
+        cycliclr_half_step_size = np.ceil(self.N / self.batch_size) * self.epochs_per_half_cycle
+
+        # without cyclic learning rate both bounds collapse to the sampled learning rate
+        # (note that this overwrites the min_lr/max_lr attributes of the worker)
+        if not self.cyclic_lr:
+            self.min_lr = self.max_lr = config["lr"]
+
+        # from here on `model` is the TVAE wrapping the external network
+        model = TVAE(
+            external_model=model,
+            shape=None,
+            min_lr=self.min_lr,
+            max_lr=self.max_lr,
+            cycliclr_step_size_up=cycliclr_half_step_size,
+            optimizer=optimizer,
+            precision=to.double,
+        )
+
+        exp_conf = ExpConfig(
+            batch_size=self.batch_size,
+            output=self.output,
+            data_transform=self.data_transform,
+        )
+        estep_conf = self.get_EEM_conf(config)
+
+        # setup training
+        data_fname = self.data_fname
+
+        # valid_fname is None when the data file has no "val_data" entry (see handle_data)
+        training = Training(exp_conf, estep_conf, model, data_fname, self.valid_fname)
+        testing = Testing(exp_conf, estep_conf, model, data_fname)
+        print("\nlearning...")
+        training_results = []
+        for train_log in training.run(int(budget)):
+            train_log.print()
+            training_results.append(train_log._results)
+
+        testing_results = []
+        for test_log in testing.run(1):
+            # test_log.print()
+            testing_results.append(test_log._results)
+
+        # only the entries of the last log are reported; `subs` is not used afterwards
+        train_F, subs = get(training_results[-1], "train_F", "train_subs")
+        # NOTE(review): the validation value is read from the "test_F" entry of the training
+        # log -- presumably Training reports validation results under that key; confirm
+        valid_F, subs = get(training_results[-1], "test_F", "test_subs")
+
+        test_F, subs = get(testing_results[-1], "test_F", "test_subs")
+
+        # optimizable = -train_F  # HpBandSter always minimizes
+        #
+        # if self.valid_fname:
+        #     optimizable = -valid_F  # HpBandSter always minimizes
+
+        # NOTE(review): the loss falls back to NaN only when -train_F is falsy (i.e. 0);
+        # a missing (None) train_F would raise on the negation -- confirm the intent
+        return {
+            "loss": -train_F if -train_F else np.nan,
+            "info": {
+                "test accuracy": test_F,
+                "train accuracy": train_F,
+                "validation accuracy": valid_F,
+                "number of parameters": model._external_model.number_of_parameters(),
+            },
+        }
+
+    def get_external_model(self, config):
+        """
+        Build the FCDeConvNet described by an hpbandster configuration.
+
+        :param config: sampled configuration; translated by model_args_from_
+        :return: the network, converted to double, with the attributes H0 and D attached
+        """
+        # translate the configuration into constructor arguments
+        (
+            n_dc,
+            n_fc,
+            widths,
+            fc_acts,
+            dropout_flags,
+            dc_acts,
+            filters,
+            bn_flags,
+            p_drop,
+            kernel_sizes,
+        ) = self.model_args_from_(config, sanity_checks=False)
+
+        # collect the keyword arguments first, then build the network in one call
+        net_kwargs = {
+            "n_deconv_layers": n_dc,
+            "n_fc_layers": n_fc,
+            "W_shapes": widths,
+            "fc_activations": fc_acts,
+            "dc_activations": dc_acts,
+            "n_kernels": filters,
+            "batch_norms": bn_flags,
+            "dropouts": dropout_flags,
+            "dropout_rate": p_drop,
+            "input_size": self.H,
+            "output_shape": self.D,
+            "filters_from_fc": 1,
+            "kernels": kernel_sizes,
+        }
+        net = FCDeConvNet(**net_kwargs)
+
+        net.H0 = net.shape[0]
+        net.D = self.D
+        net.double()
+        return net
+
+    def get_EEM_conf(self, config):
+        """
+        Create the E-step configuration of the run.
+
+        :param config: sampled configuration; only read when the E-step settings are
+                       hyperparameters
+        :return: EVOConfig with one generation and crossover disabled
+        """
+        if self.is_hyperparameter_EEM:
+            # all three sizes come from the sampled configuration
+            n_states = config["S"]
+            n_parents = config["n_parents"]
+            n_children = config["n_children"]
+        else:
+            # fixed sizes, capped by the number of states
+            n_states = self.S
+            n_parents = min(3, self.S)
+            n_children = min(2, self.S)
+
+        return EVOConfig(
+            n_states=n_states,
+            n_parents=n_parents,
+            n_children=n_children,
+            n_generations=1,
+            crossover=False,
+        )
+
+    def set_configspace(self):
+        """
+        Install ``self.get_configspace``, a function returning the configuration space.
+
+        The space always contains the FC/deconv hyperparameters. The E-step hyperparameters
+        and H are appended when the corresponding is_hyperparameter_* flag is set at the
+        time this method is called.
+        """
+        # capture everything the closure needs now, so later attribute changes have no effect
+        with_EEM = bool(self.is_hyperparameter_EEM)
+        with_H = bool(self.is_hyperparameter_H)
+        extend_with_EEM = self.add_EEM
+        extend_with_fc_deconv = self.add_FCDeconv
+
+        def custom_configspace():
+            space = extend_with_fc_deconv(CS.ConfigurationSpace())
+            if with_EEM:
+                space = extend_with_EEM(space)
+            if with_H:
+                n_hidden = CSH.UniformIntegerHyperparameter(name="H", lower=1, upper=10)
+                space.add_hyperparameters([n_hidden])
+            return space
+
+        self.get_configspace = custom_configspace  # .__get__(custom_configspace)
+
+    @staticmethod
+    def add_EEM(cs):
+        """
+        Add the E-step hyperparameters S, n_parents and n_children to a configuration space.
+
+        :param cs: ConfigurationSpace to extend in place
+        :return: the same ConfigurationSpace
+        """
+        n_states = CSH.UniformIntegerHyperparameter(name="S", lower=1, upper=6)
+        # range bounds have to be plain numbers, so the integer upper bound of the parent
+        # range is used instead of the hyperparameter object itself
+        n_parents = CSH.UniformIntegerHyperparameter(
+            name="n_parents", lower=1, upper=n_states.upper
+        )
+        n_children = CSH.UniformIntegerHyperparameter(
+            name="n_children", lower=1, upper=max(n_parents.upper, 2)
+        )
+        # NOTE(review): nothing here ties a sampled n_parents/n_children to the sampled S;
+        # if the E-step requires n_parents <= S this has to be enforced separately -- confirm
+        cs.add_hyperparameters([n_states, n_parents, n_children])
+        return cs
+
+    @staticmethod
+    def add_FCDeconv(cs):
+        """
+        Add the optimizer and network hyperparameters of the FC + deconvolution model.
+
+        The space holds the learning rate, the optimizer choice (with a momentum that is
+        only active for SGD), the number of linear and deconvolution layers, per-layer
+        settings (dropout flag, activation, filter count, batch-norm flag) that are only
+        active if the corresponding layer exists, and a global dropout rate.
+
+        :param cs: ConfigurationSpace to extend in place
+        :return: ConfigurationsSpace-Object
+        """
+
+        lr = CSH.UniformFloatHyperparameter(
+            "lr", lower=1e-6, upper=1e-1, default_value=1e-2, log=True
+        )
+
+        # setup optimizers
+        optimizer = CSH.CategoricalHyperparameter("optimizer", ["Adam", "SGD"])
+        sgd_momentum = CSH.UniformFloatHyperparameter(
+            "sgd_momentum", lower=0.0, upper=0.99, default_value=0.9, log=False
+        )
+        cs.add_hyperparameters([lr, optimizer, sgd_momentum])
+
+        # The hyperparameter sgd_momentum will be used,if the configuration
+        # contains 'SGD' as optimizer.
+        cs.add_condition(CS.EqualsCondition(sgd_momentum, optimizer, "SGD"))
+
+        # set general block length
+        max_block_length = 5  # maximum block size for deconv or linear stack
+        layers = range(1, max_block_length + 1)
+
+        # the first layer of each stack is fixed to Tanh; deeper layers currently only
+        # offer LeakyReLU ("nn.Tanh" and "nn.Sigmoid" are excluded from the search)
+        first_layer_activations = ["nn.Tanh"]
+        deeper_layer_activations = ["nn.LeakyReLU"]
+
+        def binary_flag(name):
+            # integer hyperparameter restricted to {0, 1}, off by default
+            return CSH.UniformIntegerHyperparameter(
+                name, lower=0, upper=1, default_value=0, log=False
+            )
+
+        def activation(name, layer):
+            choices = first_layer_activations if layer == 1 else deeper_layer_activations
+            return CSH.CategoricalHyperparameter(name, list(choices))
+
+        def filter_count(layer):
+            # TODO: take filter dimensionality from x for the final filter
+            # TODO: remove last filter from hyperparameters
+            name = "num_filters_{}".format(layer)
+            if layer == 1:
+                return CSH.CategoricalHyperparameter(name, [1])
+            return CSH.UniformIntegerHyperparameter(
+                name, lower=1, upper=4, default_value=4, log=True
+            )
+
+        # define the linear blocks
+        num_linear_layers = CSH.UniformIntegerHyperparameter(
+            "num_linear_layers",
+            lower=1,
+            upper=max_block_length,
+            default_value=2,
+            log=False,
+        )
+        root_W_shapes = CSH.UniformIntegerHyperparameter(
+            "root_W_shapes", lower=7, upper=14, default_value=8, log=False
+        )  # squared in model_args_from_
+
+        # per-layer hyperparameters; entry i - 1 of each list belongs to layer i
+        has_dropout = [binary_flag("dropout_{}".format(i)) for i in layers]
+        fc_activation = [activation("fc_activation_{}".format(i), i) for i in layers]
+
+        # define the deconv blocks
+        num_deconv_layers = CSH.UniformIntegerHyperparameter(
+            "num_deconv_layers",
+            lower=0,
+            upper=max_block_length,
+            default_value=2,
+        )
+        num_filters = [filter_count(i) for i in layers]
+        has_batch_norm = [binary_flag("batch_norm_{}".format(i)) for i in layers]
+        dc_activation = [activation("dc_activation_{}".format(i), i) for i in layers]
+
+        # add fc hyperparams
+        cs.add_hyperparameters(
+            [num_linear_layers, root_W_shapes] + has_dropout + fc_activation
+        )
+        # add dc hyperparameters
+        cs.add_hyperparameters(
+            [num_deconv_layers] + num_filters + has_batch_norm + dc_activation
+        )
+
+        # Add conditions to hyperparameters.
+        # Activate deeper hyperparameters only if their corresponding layer is present
+
+        # fully connected stack: the first layer always exists, so it stays unconditional
+        for i in range(2, max_block_length + 1):
+            for child in (has_dropout[i - 1], fc_activation[i - 1]):
+                cs.add_condition(CS.GreaterThanCondition(child, num_linear_layers, i - 1))
+
+        # deconv stack: it may be empty, so every layer is conditional
+        for i in range(1, max_block_length + 1):
+            for child in (has_batch_norm[i - 1], dc_activation[i - 1], num_filters[i - 1]):
+                cs.add_condition(CS.GreaterThanCondition(child, num_deconv_layers, i - 1))
+
+        # set global dropout rate
+        dropout_rate = CSH.UniformFloatHyperparameter(
+            "dropout_rate", lower=0.0, upper=0.9, default_value=0.5, log=False
+        )
+        cs.add_hyperparameters([dropout_rate])
+
+        return cs
+
+    def model_args_from_(self, config, sanity_checks=False):
+        """
+        Translate an hpbandster configuration into the constructor arguments of the network.
+
+        S and H are copied onto the worker first if they are hyperparameters.
+
+        :param config: mapping from hyperparameter names to the sampled values
+        :param sanity_checks: if True, assert length and type of the extracted arguments
+        :return: tuple (n_deconv_layers, n_fc_layers, W_shapes, fc_activations, dropouts,
+                 dc_activations, n_filters, batch_norms, dropout_rate, kernels)
+        """
+
+        # unpack values from hpbandster config
+        if self.is_hyperparameter_S:
+            self.S = config["S"]
+        if self.is_hyperparameter_H:
+            self.H = config["H"]
+
+        n_deconv_layers = config["num_deconv_layers"]
+        n_fc_layers = config["num_linear_layers"]
+
+        def per_layer(prefix, n_layers):
+            # values stored under "<prefix>1" ... "<prefix><n_layers>", in layer order
+            return [config[prefix + str(layer)] for layer in range(1, n_layers + 1)]
+
+        # every fully connected layer gets the same width: the square of root_W_shapes
+        W_shapes = config["root_W_shapes"] ** 2
+        if type(W_shapes) is not list:
+            assert type(W_shapes) is int
+            W_shapes = [W_shapes for _ in range(n_fc_layers)]
+
+        fc_activations = per_layer("fc_activation_", n_fc_layers)
+        dropouts = per_layer("dropout_", n_fc_layers)
+        dc_activations = per_layer("dc_activation_", n_deconv_layers)
+        n_filters = per_layer("num_filters_", n_deconv_layers)
+
+        # batch norm flags default to 0 if the configuration carries none
+        if "batch_norm_1" in config.keys():
+            batch_norms = per_layer("batch_norm_", n_deconv_layers)
+        else:
+            batch_norms = [0 for _ in range(n_deconv_layers)]
+
+        dropout_rate = config["dropout_rate"]
+
+        # kernel sizes are optional; without them the list stays empty
+        if "num_kernels_1" in config.keys():
+            kernels = per_layer("num_kernels_", n_deconv_layers)
+        else:
+            kernels = []
+
+        if sanity_checks:
+            # todo: use in testing
+
+            def n_entries(matches):
+                # number of configuration entries whose key satisfies `matches`
+                return len([config[key] for key in config if matches(key)])
+
+            # check expected argument length
+            assert len(fc_activations) == n_entries(lambda key: "fc_activation_" in key)
+            assert len(dropouts) == n_entries(
+                lambda key: "dropout_" in key and "rate" not in key
+            )
+            assert len(dc_activations) == n_entries(lambda key: "dc_activation_" in key)
+            assert len(n_filters) == n_entries(lambda key: "num_filters_" in key)
+            assert len(batch_norms) == n_entries(lambda key: "batch_norm_" in key)
+
+            # check expected argument type
+            assert type(n_fc_layers) is int
+            assert type(n_deconv_layers) is int
+
+            assert all(type(act) is str for act in fc_activations)
+            assert all(flag in (0, 1) for flag in dropouts)
+            assert all(type(act) is str for act in dc_activations)
+            assert all(type(count) is int for count in n_filters)
+            assert all(flag in (0, 1) for flag in batch_norms)
+
+            assert 0 <= dropout_rate <= 1
+
+        return (
+            n_deconv_layers,
+            n_fc_layers,
+            W_shapes,
+            fc_activations,
+            dropouts,
+            dc_activations,
+            n_filters,
+            batch_norms,
+            dropout_rate,
+            kernels,
+        )
+
+    def handle_data(self, **kwargs):
+        """
+        Open the dataset file and derive the data dependent attributes of the worker.
+
+        Sets data_file (the file stays open, it is read again by the constructor),
+        valid_fname, N, D and data_transform.
+
+        :param kwargs: may contain "data_transform"; the transform is None otherwise
+        """
+        self.data_file = h5py.File(self.data_fname, "r")
+
+        # the training set is stored under "train_data", with "data" as fallback
+        try:
+            train_set = self.data_file["train_data"]
+        except KeyError:
+            train_set = self.data_file["data"]
+
+        # the data file doubles as validation file if it holds a "val_data" entry
+        has_validation = "val_data" in self.data_file.keys()
+        self.valid_fname = self.data_fname if has_validation else None
+
+        # infer data dimensionalities
+        self.N, self.D = train_set.shape
+
+        # set data transform
+        self.data_transform = kwargs.get("data_transform")
+
+    def extract_hypers_from_config(self, config):
+        """
+        Copy S and/or H from the configuration onto the worker if they are hyperparameters.
+
+        :param config: mapping that has to contain "S" / "H" for every flagged hyperparameter
+        :raises KeyError: if a flagged hyperparameter is missing from the configuration
+        """
+        # (flag, name of both the config key and the attribute, error message)
+        lookups = (
+            (
+                self.is_hyperparameter_S,
+                "S",
+                "Number of states is not a hyperparameter, and none was passed " "to the init",
+            ),
+            (
+                self.is_hyperparameter_H,
+                "H",
+                "Number of initial hidden units is not a hyperparameter, and none "
+                "was passed to the init",
+            ),
+        )
+
+        for is_hyperparameter, name, message in lookups:
+            if not is_hyperparameter:
+                continue
+            try:
+                value = config[name]
+            except KeyError:
+                raise KeyError(message)
+            setattr(self, name, value)
diff --git a/tvo/models/__init__.py b/tvo/models/__init__.py
index 610f0df..1076a6b 100644
--- a/tvo/models/__init__.py
+++ b/tvo/models/__init__.py
@@ -1,8 +1,8 @@
 from .noisyor import NoisyOR
 from .bsc import BSC
-from .sssc import SSSC
 from .tvae import GaussianTVAE, BernoulliTVAE
 from .gmm import GMM
 from .pmm import PMM
+from .sssc import SSSC
 
 __all__ = ["NoisyOR", "BSC", "SSSC", "GaussianTVAE", "BernoulliTVAE", "GMM", "PMM"]