Source code for fast.sampling.core

# Author: Maxwell I. Zimmerman <mizimmer@wustl.edu>
# Contributors:
# Copywright (C) 2017, Washington University in St. Louis
# All rights reserved.
# Unauthorized copying of this file, via any medium, is strictly prohibited
# Proprietary and confidential


#######################################################################
# imports
#######################################################################


import glob
import itertools
import logging
import mdtraj as md
import numpy as np
import os
import pickle
import scipy.io
import subprocess as sp
import time
from . import rankings
from .. import tools
from ..base import base
from ..exception import DataInvalid, MissingData
from ..msm_gen import SaveWrap
from ..submissions import slurm_subs
from ..submissions import lsf_subs
from enspara.msm import builders, MSM
from enspara.util import array as ra
from functools import partial


logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)


#######################################################################
# code
#######################################################################


def _setup_directories(output_dir):
    """Setup adaptive sampling directory structure"""
    # if output dir already exists raise an error
    if os.path.exists(output_dir):
        raise DataInvalid('output directory already exists!')
    msm_dir = output_dir + "/msm"
    cmd1 = 'mkdir ' + output_dir
    cmd2 = 'mkdir ' + msm_dir
    cmd3 = 'mkdir ' + msm_dir + '/data'
    cmd4 = 'mkdir ' + msm_dir + '/rankings'
    cmd5 = 'mkdir ' + msm_dir + '/trajectories'
    cmd6 = 'mkdir ' + msm_dir + '/trajectories_full'
    cmd7 = 'mkdir ' + msm_dir + '/centers_masses'
    cmd8 = 'mkdir ' + msm_dir + '/centers_restarts'
    cmd9 = 'mkdir ' + msm_dir + '/submissions'
    cmds = [cmd1, cmd2, cmd3, cmd4, cmd5, cmd6, cmd7, cmd8, cmd9]
    out = tools.run_commands(cmds)
    return msm_dir


def _gen_initial_sims(
        base_dir, initial_struct, trj_obj, n_kids, q_check_obj):
    """Runs the first round of adaptive sampling. Currently runs
    simulations from a single structure.

    Inputs
    ----------
    base_dir : str,
        The base adaptive sampling directory that will contain gen
        directories and the msm directory.
    initial_struct : str or md.Trajectory,
        The initial structure to start a swarn of sims from.
    trj_obj : object,
        Simulation object used for simulations. See Gromax.
    n_kids : int,
        Number of children gen will have.
    q_check_obj : object,
        Queueing system wrapper to determine if simulations are
        still running.
    """
    t0 = time.time()
    # generate initial gen directory
    gen0_dir = base_dir + '/gen0'
    cmd = 'mkdir ' + gen0_dir
    _ = tools.run_commands(cmd)
    # Spawn simulations
    pids = []
    for kid in range(n_kids):
        # generate kid directory
        kid_dir = gen0_dir + '/kid'+str(kid)
        cmd = 'mkdir ' + kid_dir
        _ = tools.run_commands(cmd)
        # submit job based on trj_obj and retain pid
        pid = trj_obj.run(initial_struct, kid_dir)
        pids.append(pid)
        # wait for a job to finish if maximum number of simulations are
        #still running
        q_check_obj.wait_for_pids(np.array(pids))
    # gather all pids and wait for every simulation to finish
    pids = np.array(pids)
    q_check_obj.wait_for_pids(pids, wait_for_all=True)
    t1 = time.time()
    logging.info("simulations took %0.4f seconds" % (t1-t0))
    return pids


def _prop_sims(base_dir, trj_obj, gen_num, q_check_obj, new_states):
    """Runs the subsequent rounds of adaptive sampling. Looks for
    states in specified directory structure and uses them for
    respawning simulations.

    Inputs
    ----------
    base_dir : str,
        The base adaptive sampling directory that will contain gen
        directories and the msm directory.
    trj_obj : object,
        Simulation object used for simulations. See Gromax.
    gen_num : int,
        The generation of sampling to propagate new simulations.
    q_check_obj : object,
        Queueing system wrapper to determine if simulations are
        still running.
    new_states : list,
        List of state numbers to respawn simulations from
    """
    t0 = time.time()
    # generate gen directory
    gen_dir = base_dir + '/gen' + str(gen_num)
    cmd = 'mkdir ' + gen_dir
    _ = tools.run_commands(cmd)
    pids = []
    # propagate simulations
    for kid in range(len(new_states)):
        # generate kid directory
        kid_dir = gen_dir + '/kid'+str(kid)
        cmd = 'mkdir ' + kid_dir
        _ = tools.run_commands(cmd)
        # run simulation and gather pid
        filename = base_dir + '/msm/centers_restarts/state' + \
            ('%06d' % new_states[kid]) + '-00.gro'
        pid = trj_obj.run(filename, kid_dir)
        pids.append(pid)
        # wait for a job to finish if maximum number of simulations are
        #still running
        q_check_obj.wait_for_pids(np.array(pids))
    # gather all pids and wait for every simulation to finish
    pids = np.array(pids)
    q_check_obj.wait_for_pids(pids, wait_for_all=True)
    t1 = time.time()
    logging.info("simulations took %0.4f seconds" % (t1 - t0))
    return pids
    

def _move_trjs(gen_dir, msm_dir, gen_num, n_kids):
    """Move finished trajectories to MSM directory.

    Inputs
    ----------
    gen_dir : str,
        Generation directory containing simulations to move.
    msm_dir : str,
        MSM directory where analysis is performed.
    gen_num : int,
        Generation number.
    n_kids : int,
        Number of kids expected in gen directory.
    """
    # iterate over potential kids
    for kid in range(n_kids):
        # trj to move trajectories
        try:
            # specify directroy and file names
            kid_dir = gen_dir + "/kid" + str(kid)
            output_full = msm_dir + '/trajectories_full' + '/trj_gen' + \
                ('%03d' % gen_num) + '_kid' + ('%03d' % kid) + '.xtc'
            output_masses = msm_dir + '/trajectories' + '/trj_gen' + \
                ('%03d' % gen_num) + '_kid' + ('%03d' % kid) + '.xtc'
            cmds = []
            if os.path.exists(output_full):
                logging.info("file '%s' exists! skipping move." % output_full)
            else:
                cmds.append(
                    'mv ' + kid_dir + '/frame0_aligned.xtc ' + output_full)
            if os.path.exists(output_masses):
                logging.info("file '%s' exists! skipping move." % output_masses)
            else:
                cmds.append(
                    'mv ' + kid_dir + '/frame0_masses.xtc ' +output_masses)
            if len(cmds) > 0:
                out = tools.run_commands(cmds)
        # give sad-face expression
        except:
            raise MissingData(
                'trajectory from gen %03d and kid %03d was not found.' %
                (gen_num, kid),
                'Simulation may have crahsed!')
    return


def _pickle_submit(
        msm_dir, base_obj, sub_obj, q_check_obj, gen_num, base_name):
    """Helper function for pickling an object and submitting it to run.

    Inputs
    ----------
    msm_dir : str,
        MSM directory where analysis is performed.
    base_obj : object,
        The object to pickle and submit. Must have `run` as a class
        function.
    sub_obj : object,
        Submission wrapper object.
    q_check_obj : object,
        Queueing system wrapper to determine if submission is still
        running.
    gen_num : int,
        Generation number.
    base_name : str,
        The base name of the job being submitted. Used to name the
        output files.
    """
    # pickle object
    base_pickle = msm_dir + "/" + base_name + ".pkl"
    pickle.dump(base_obj, open(base_pickle, "wb"))
    # determine home dir and switch to msm dir
    home_dir = os.path.abspath("./")
    os.chdir(msm_dir)
    # write python file to open and run pickle object
    f = open(base_name +".py", "w")
    f.write(
        'import pickle\n\n' + \
        'c = pickle.load(open("' + base_pickle + '", "rb"))\n' + \
        'c.run()')
    f.close()
    # generate python run commands for submission
    cmd0 = 'sync\n'
    cmd1 = 'python ' + base_name + '.py'
    cmds = [cmd0, cmd1]
    base_submission = base_name + '_submission'
    # submit and wait for job to finish
    pid = sub_obj.run(cmds, output_name=base_submission)
    q_check_obj.wait_for_pids([pid], wait_for_all=True)
    # clean up submission
    sub_script_name = q_check_obj.get_submission_names(pid)[0]
    sub_output = 'submissions/' + base_name + '_gen' + \
        ('%03d' % gen_num) + '.out'
    base_pickle_output = 'submissions/' + base_name + '_gen' + \
        ('%03d' % gen_num) + '.pkl'
    base_python_output = 'submissions/' + base_name + '_gen' + \
        ('%03d' % gen_num) + '.py'
    base_sub_output = 'submissions/' + base_submission + '_gen' + \
        ('%03d' % gen_num)
    cmd1 = 'mv ' + sub_script_name + ' ' + sub_output + ' --backup=numbered'
    cmd2 = 'mv ' + base_pickle + ' ' + base_pickle_output + \
        ' --backup=numbered'
    cmd3 = 'mv ' + base_name + ".py" + ' ' + base_python_output + \
        ' --backup=numbered'
    cmd4 = 'mv ' + base_submission + ' ' + base_sub_output + \
        ' --backup=numbered'
    cmds = [cmd1, cmd2, cmd3, cmd4]
    _ = tools.run_commands(cmds)
    # change directory back to original
    os.chdir(home_dir)
    return


def _determine_gen(output_dir, ignore_error=False):
    """Determines the current generation number."""
    # determine number of gen folders
    n_gen_folders = len(glob.glob(output_dir+'/gen*'))
    # gets completed sims in msm dir
    trj_names = glob.glob(output_dir + '/msm/trajectories/*.xtc')
    # sorts by unique gen number
    trj_gen_nums = np.array(
        [int(n.split("gen")[-1].split("_")[0]) for n in trj_names])
    # error check that completed sims match number of gen folders
    n_trj_gens = len(np.unique(trj_gen_nums))
    if (n_gen_folders != n_trj_gens) and not ignore_error:
        raise DataInvalid(
            'The number of generations are not consistent with the number ' + \
            'of trajectories. Maybe a simulation crashed?')
    # subtract 1 for 0 based counting
    gen_num = int(np.max([n_trj_gens, n_gen_folders]) - 1)
    return gen_num


def _prop_msm(msm_dir, msm_obj):
    """Propagate MSM files."""
    t0 = time.time()
    # load assignments and build MSM
    assignments = ra.load(msm_dir + '/data/assignments.h5')
    msm_obj.fit(assignments)
    # write counts, probs, and popoulations (if applicable)
    scipy.io.mmwrite(msm_dir + '/data/tcounts.mtx', msm_obj.tcounts_)
    scipy.io.mmwrite(msm_dir + '/data/tprobs.mtx', msm_obj.tprobs_)
    if msm_obj.eq_probs_ is not None:
        np.save(msm_dir + '/data/populations.npy', msm_obj.eq_probs_)
    t1 = time.time()
    logging.info("building MSM took %0.4f seconds" % (t1-t0))
    return msm_obj


def _move_cluster_data(msm_dir, rebuild_num, analysis_obj=None):
    """Move current cluster data to old directory. For rebuilding whole
    MSM and analysis

    Inputs
    ----------
    msm_dir : str,
        MSM directory where analysis is performed.
    rebuild_num : int,
        The rebuild number, i.e. If this is gen 6 and MSMs are being
        rebuild every 2 gens, this would be rebuild number 2 (0 based).
    analysis_obj : object,
        The object used for analysis. This object may contain
        information of a folder that also needs to be moved.
    """
    # define old directory to move into. mkdir if first rebuild.
    old_dir = msm_dir + '/old'
    if rebuild_num == 0:
        try:
            cmd = 'mkdir ' + old_dir
            _ = tools.run_commands(cmd)
        except:
            pass
    # move data and centers
    backup = ' --backup=numbered'
    cmd1 = 'mv ' + msm_dir + '/data ' + old_dir + '/data' + \
        str(rebuild_num) + backup
    cmd2 = 'mv ' + msm_dir + '/centers_masses ' + old_dir + \
        '/centers_masses' + str(rebuild_num) + backup
    cmd3 = 'mv ' + msm_dir + '/centers_restarts ' + old_dir + \
        '/centers_restarts' + str(rebuild_num) + backup
    # rebuild directories
    cmd4 = 'mkdir ' + msm_dir + '/data'
    cmd5 = 'mkdir ' + msm_dir + '/centers_masses'
    cmd6 = 'mkdir ' + msm_dir + '/centers_restarts'
    cmds = [cmd1, cmd2, cmd3, cmd4, cmd5, cmd6]
    # if applicable, move analysis folder
    if hasattr(analysis_obj, "output_folder"):
        base_folder = analysis_obj.output_folder.split("/")[-1]
        cmd = 'mv ' + analysis_obj.output_folder + ' ' + old_dir + \
            "/" + base_folder + str(rebuild_num)
        cmds.append(cmd)
    # run move commands
    _ = tools.run_commands(cmds)
    return


def _perform_analysis(
        analysis_obj, msm_dir, gen_num, sub_obj, q_check_obj, update_data):
    """Performs analysis of cluster centers.

    Inputs
    ----------
    analysis_obj : object,
        The object used for analysis.
    msm_dir : str,
        MSM directory where analysis is performed.
    gen_num : int,
        Generation number.
    sub_obj : object,
        Submission wrapper object.
    q_check_obj : object,
        Queueing system wrapper to determine if submission is still
        running.
    update_data : bool,
        Flag for rebuilding whole analysis or analyzing a subset of
        structures.
    """
    t0 = time.time()
    # determine if there is an analysis object
    if analysis_obj is None:
        state_rankings = None
    else:
        # set the objects output
        analysis_obj.set_output(msm_dir, gen_num)
        # optionally set rebuild or continue analysis
        if hasattr(analysis_obj, 'build_full'):
            analysis_obj.build_full = update_data
        # if the output doesn't exists, pickle submit analysis
        if not os.path.exists(analysis_obj.output_name):
            _pickle_submit(
                msm_dir, analysis_obj, sub_obj,
                q_check_obj, gen_num, 'analysis')
        # get rankings
        state_rankings = analysis_obj.state_rankings
        # check that everything went well
        # number of state rankings should be equal to number of state
        # in the assignments
        n_states_ranked = len(state_rankings)
        n_states = len(np.unique(ra.load(msm_dir + '/data/assignments.h5')))
        if n_states_ranked != n_states:
            raise DataInvalid(
                'The number of state rankings does not match the number ' + \
                'of states in the assignments! Analysis may have failed!')
    t1 = time.time()
    logging.info("analysis took %0.4f seconds" %(t1-t0))
    return state_rankings


[docs]def push_forward(s, num=0):
    s_out = s.split("\n")
    s_pushed = "\n".join(
        ["".join(itertools.repeat(" ", num)) + l for l in s_out])
    return s_pushed


[docs]class AdaptiveSampling(base):
    """Performs adaptive sampling

    Parameters
    ----------
    initial_state : str or MDTraj object,
        The starting structure for adaptive sampling.
    n_gens : int, default=1,
        The number of generations of sampling to perform.
    n_kids : int, default=1,
        The number of simulations per generation of adaptive sampling.
    sim_obj : object, default=None,
        An object that can run simulations. Currently supported within
        this package are Gromacs and Upside wrappers.
    cluster_obj : object, default=None,
        A cluster wrapper that dictates how simulations are clustered.
    save_state_obj : object, default=None,
        Can optionally provide an object that dictates how states are saved.
    msm_obj : enspara.msm.MSM object
        An enspara MSM object. This is used to fit assignments at each
        generation of sampling.
    analysis_obj : object, default=None,
        Type of analysis to perform on each cluster center. Can be used in
        state rankings.
    ranking_obj : rankings object
        This is an object with at least two functions: __init__(**args)
        and select_states(msm, n_clones). The output of this object is
        a list of states to simulate.
    spreading_func : func, default=None,
        Optionally spread state selection by minimizing similarity penalty,
        calculated using the provided metric for calculating state-distances.
        i.e. md.rmsd.
    update_freq : int, default=np.inf,
        The number of generations between a full reclustering of states and
        analysis of cluster centers. Defaults to never reclustering
        (continually adds new cluster centers without changing previously
        discovered centers).
    continue_prev : bool, default=False,
        Flag to indicate if sampling is continuing from a previous run.
        Avoids accidentally overwritting a previous run of sampling.
    sub_obj : object, default=None,
        A submission object that handles submitting clustering, MSM,
        analysis, and save_state routines. Wrappers are available for
        Slurm queueing systems as well as local machines (subprocess calls).
    q_check_obj : object, default=None,
        An object that handles checking queueing system for jobs that are
        still running.
    q_check_obj_sim : object, default=None,
        An object that handles checking queueing system for jobs that are
        still running.
    output_dir : str, default='adaptive_sampling',
        The output directory name for adaptive sampling run.
    """

    def __init__(
            self, initial_state, n_gens=1, n_kids=1, sim_obj=None,
            cluster_obj=None, save_state_obj=None, msm_obj=None,
            analysis_obj=None, ranking_obj=None, spreading_func=None,
            update_freq=np.inf, continue_prev=False, sub_obj=None,
            q_check_obj=None, q_check_obj_sim=None,
            output_dir='adaptive_sampling', verbose=True):
        # Initialize class variables
        self.sim_obj = sim_obj
        self.initial_state = initial_state
        if type(self.initial_state) is str:
            self.initial_state_md = md.load(self.initial_state)
        else:
            self.initial_state_md = self.initial_state
        self.n_gens = n_gens
        self.n_kids = n_kids
        self.cluster_obj = cluster_obj
        if save_state_obj is None:
            self.save_state_obj = SaveWrap()
        else:
            self.save_state_obj = save_state_obj
        self.save_restart_obj = SaveWrap(
            centers='restarts', save_routine='restarts')
        self.analysis_obj = analysis_obj
        # msm obj default is normalize without eq_probs
        if msm_obj is None:
            b = partial(builders.normalize, calculate_eq_probs=False)
            self.msm_obj = MSM(lag_time=1, method=b)
        else:
            self.msm_obj = msm_obj
        # ranking obj default is evens
        if ranking_obj is None:
            self.ranking_obj = rankings.evens()
        else:
            self.ranking_obj = ranking_obj
        self.spreading_func = spreading_func
        self.update_freq = update_freq
        self.continue_prev = continue_prev
        if sub_obj is None:
            self.sub_obj = lsf_subs.LSFSub(
                'bowman', n_tasks=128, R='"model=AMDEPYC_7742"')
        else:
            self.sub_obj = sub_obj
        if q_check_obj is None:
            self.q_check_obj = lsf_subs.LSFWrap()
        else:
            self.q_check_obj = q_check_obj
        if q_check_obj_sim is None:
            self.q_check_obj_sim = lsf_subs.LSFWrap()
        else:
            self.q_check_obj_sim = q_check_obj_sim
        self.output_dir = os.path.abspath(output_dir)
        self.msm_dir = self.output_dir + '/msm'
        self.verbose = verbose

    @property
    def class_name(self):
        return "AdaptiveSampling"

    @property
    def config(self):
        return {
            'initial_state': self.initial_state,
            'n_gens': self.n_gens,
            'n_kids': self.n_kids,
            'sim_obj': self.sim_obj,
            'cluster_obj': self.cluster_obj,
            'msm_obj': self.msm_obj,
            'analysis_obj': self.analysis_obj,
            'ranking_obj': self.ranking_obj,
            'update_freq': self.update_freq,
            'continue_prev': self.continue_prev,
            'sub_obj': self.sub_obj,
            'q_check_obj': self.q_check_obj,
            'q_check_obj_sim': self.q_check_obj_sim,
            'output_dir': self.output_dir,
            'verbose': self.verbose,
        }

[docs]    def print_parameters(self):
        print(
            "\n\n#########################################################" + \
            "####################")
        print(
            "                               adaptive sampling!             ")
        print(
            "###########################################################" + \
            "##################")
        if self.continue_prev:
            print("\ncontinuing sampling from a previous run!")
        print("\noutput directory:\n    " + str(self.output_dir))
        print("\nstarting state:\n    " + str(self.initial_state))
        print("\nnumber of gens:\n    " + str(self.n_gens))
        print("\nnumber of kids:\n    " + str(self.n_kids))
        print(
            "\nupdating clustering and analysis every:\n    " + \
            str(self.update_freq) + " gens")
        print("\nsimulation object:\n" + push_forward(str(self.sim_obj), 4))
        print(
            "\nclustering object:\n" + push_forward(str(self.cluster_obj), 4))
        print(
            "\nsave states object:\n" + \
            push_forward(str(self.save_state_obj), 4))
        print("\nanalysis object:\n" + push_forward(str(self.analysis_obj), 4))
#        print("\nMSM object:\n" + push_forward(str(self.msm_obj), 4))
        print("\nranking object:\n" + push_forward(str(self.ranking_obj), 4))
        print("\nsubmission object:\n" + push_forward(str(self.sub_obj), 4))
        print(
            "\nqueue checking object:\n" + \
            push_forward(str(self.q_check_obj), 4))
        print(
            "\nsim queue checking object:\n" + \
            push_forward(str(self.q_check_obj_sim), 4))
        print(
            "\n###########################################################" + \
            "##################\n")
        return

[docs]    def run(self):
        # after setup, adaptive sampling proceeds with the following steps:
        # 1) simulate, process trajectories, and move them to the
        #    msm directory
        # 2) cluster the conformations and save assignments and distances
        # 3) optionally save cluster centers
        # 4) analyze the cluster centers and save the results
        #    of the analysis
        # 5) build the MSM and save transition matrix
        # 6) rank states for reselection based on structural analysis
        #    (optional) and MSM statistics
        # 
        # If restarting an adaptive sampling run, attempts to move simulations
        # from the last gen and recluster
        self.print_parameters()
        # set msmdir
        msm_dir = self.output_dir + '/msm'
        # timeit
        t0 = time.time()
        # initilize adaptive sampling if not continuing a previous run
        # builds directories, generates first run of sampling, and
        # clusters data


        ###########################################################
        #               First generation of sampling              #
        ###########################################################

        if not self.continue_prev:
            # build initial directory structure
            logging.info('building directories')
            gen_num = 0
            gen_dir = self.output_dir + '/gen' + str(gen_num)
            _setup_directories(self.output_dir)
            # save starting state in msm directory. Will be used to load
            # and save trajectories for restarting simulations
            try:
                self.initial_state_md.save_gro(msm_dir + '/restart.gro')
            except:
                logging.warning(
                    "Could not save initial state. Initial state is not pdb or gro?")
                self.cluster_obj.base_struct_md.save_gro(msm_dir + '/restart.gro')

            ###########################################################
            #                  STEP 1 (simulations)                   #
            ###########################################################

            # initialize first run of sampling
            logging.info('starting initial simulations')
            _gen_initial_sims(
                self.output_dir, self.initial_state_md, self.sim_obj,
                self.n_kids, self.q_check_obj_sim)
            # move trajectories after sampling
            logging.info('moving trajectories')
            _move_trjs(gen_dir, self.msm_dir, gen_num, self.n_kids)
            # wait for nfs to catch up
            time.sleep(65)

            ###########################################################
            #                  STEP 2 (clustering)                    #
            ###########################################################

            # submit clustering job
            logging.info('clustering simulation data')
            t_pre = time.time()
            # since its the first round of sampling, build full msm
            self.cluster_obj.build_full = True
            self.cluster_obj.set_filenames(self.msm_dir)
            _pickle_submit(
                self.msm_dir, self.cluster_obj, self.sub_obj,
                self.q_check_obj, gen_num, 'clusterer')
            # check that clustering went well
            correct_clust =  self.cluster_obj.check_clustering(
                self.msm_dir, gen_num, self.n_kids)
            if not correct_clust:
                raise MissingData('clustering job failed!')
            # log clustering time
            t_post = time.time()
            logging.info("clustering took %0.4f seconds" % (t_post - t_pre))

            ###########################################################
            #                 STEP 3 (saving states)                  #
            ###########################################################

            if self.save_state_obj is not None:
                logging.info('saving states')
                t_pre = time.time()
                _pickle_submit(
                    self.msm_dir, self.save_state_obj, self.sub_obj,
                    self.q_check_obj, gen_num, 'save_states')
                correct_save = self.save_state_obj.check_save_states(
                    self.msm_dir)
                if not correct_save:
                    raise MissingData('Saving states failed!')
                t_post = time.time()
                logging.info(
                    'saving states took %0.4f seconds' % (t_post - t_pre))


        ###########################################################
        #               restarting adaptive sampling              #
        ###########################################################

        # if continuing from a previous run, determines gen, and
        # attempts to move trajectories, cluster data, build MSM,
        # rank states, and restart simulations
        else:
            # check for valid path to restart from
            if not os.path.exists(self.output_dir):
                raise DataInvalid(
                    "Can't continue run from output directory that doesn't" + \
                    " exist!")
            # determine where adaptive sampling left off (looks at
            # trajectories and folder numbers). Allows for a discrepancy.
            gen_num = _determine_gen(self.output_dir, ignore_error=True)
            gen_dir = self.output_dir + '/gen' + str(gen_num)
            logging.info('continuing adaptive sampling from run %d' % gen_num)
            # try to move trajectories from current gen and initiate clustering
            try:
                # move trajectories
                logging.info('moving trajectories')
                _move_trjs(gen_dir, self.msm_dir, gen_num, self.n_kids)
                # wait for nfs to catch up
                time.sleep(65)
            except:
                pass
            # error check for consistent number of trajectories and
            # gen folders
            gen_num_test = _determine_gen(self.output_dir)
            assert gen_num == gen_num_test

            ###########################################################
            #                  STEP 2 (clustering)                    #
            ###########################################################

            # determine if clustering was completed
            if os.path.exists(self.msm_dir+"/data/assignments.h5"):
                # check if clustering was successful
                correct_clust =  self.cluster_obj.check_clustering(
                    self.msm_dir, gen_num, self.n_kids)
            else:
                correct_clust = False
            # if not, recluster
            if not correct_clust:
                # submit clustering job
                logging.info('clustering simulation data')
                logging.info('updating all cluster centers')
                rebuild_num = int(gen_num / self.update_freq) - 1
                # if restarting from first gen, might not need to move
                # cluster data
                if gen_num == 0:
                    try:
                        _move_cluster_data(
                            self.msm_dir, rebuild_num, self.analysis_obj)
                    except:
                        pass
                else:
                    _move_cluster_data(
                        self.msm_dir, rebuild_num, self.analysis_obj)
                t_pre = time.time()
                # built in rebuild everything if restarting sims
                self.cluster_obj.build_full = True
                self.cluster_obj.set_filenames(self.msm_dir)
                _pickle_submit(
                    self.msm_dir, self.cluster_obj, self.sub_obj,
                    self.q_check_obj, gen_num, 'clusterer')
                correct_clust =  self.cluster_obj.check_clustering(
                    self.msm_dir, gen_num, self.n_kids)
                # if still wrong, raise error
                if not correct_clust:
                    raise MissingData('clustering job failed!')
                t_post = time.time()
                logging.info("clustering took %0.4f seconds" % (t_post - t_pre))

            ###########################################################
            #                 STEP 3 (saving states)                  #
            ###########################################################

            if self.save_state_obj is not None:
                correct_save = self.save_state_obj.check_save_states(
                    self.msm_dir)
                if not correct_save: 
                    logging.info('saving states')
                    t_pre = time.time()
                    _pickle_submit(
                        self.msm_dir, self.save_state_obj, self.sub_obj,
                        self.q_check_obj, gen_num, 'save_states')
                    correct_save = self.save_state_obj.check_save_states(
                        self.msm_dir)
                    if not correct_save:
                        raise MissingData('Saving states failed!')
                    t_post = time.time()
                    logging.info(
                        'saving states took %0.4f seconds' % (t_post - t_pre))

        # determine if updating data
        if int(gen_num % self.update_freq) == 0:
            update_data = True
        else:
            update_data = False

        ###########################################################
        #               STEP 4 (analysis of centers)              #
        ###########################################################

        # run analysis object routine
        logging.info('analyzing cluster data')
        state_rankings = _perform_analysis(
            self.analysis_obj, self.msm_dir, gen_num, self.sub_obj,
            self.q_check_obj, update_data)

        ###########################################################
        #                  STEP 5 (MSM generation)                #
        ###########################################################

        # build msm
        logging.info('building MSM')
        self.msm_obj = _prop_msm(
            self.msm_dir, self.msm_obj)

        ###########################################################
        #                   STEP 6 (rank states)                  #
        ###########################################################

        # if ranking  uses analysis from state centers, update
        # the ranking object
        if hasattr(self.ranking_obj, 'state_rankings'):
            self.ranking_obj.state_rankings = state_rankings
        # if the ranking object uses rmsd information, load cluster
        # centers and update the ranking object
        if hasattr(self.ranking_obj, 'distance_metric'):
            if self.ranking_obj.distance_metric is not None:
                logging.info('loading centers for spreading')
                self.ranking_obj.state_centers = md.load(
                    self.msm_dir+'/data/full_centers.xtc',
                    top=self.msm_dir+'/prot_masses.pdb')
        logging.info('ranking states\n')
        # rank states
        new_states = self.ranking_obj.select_states(self.msm_obj, self.n_kids)
        np.save(
            self.msm_dir + '/rankings/states_to_simulate_gen' + \
                str(gen_num) + '.npy', new_states)

        if (self.save_state_obj.save_routine == 'masses') or \
                (self.save_state_obj.centers == 'none'):
            self.save_restart_obj.gen_num = gen_num
            self.save_restart_obj.run(self.msm_dir)

        ################################################################
        #                 main adaptive sampling loop                  #
        ################################################################

        # iterate adaptive sampling until gen reaches n_gens
        for gen_num in np.arange(gen_num + 1, self.n_gens):

            ###########################################################
            #                  STEP 1 (simulations)                   #
            ###########################################################

            logging.info('STARTING GEN NUM: %d' % gen_num)
            gen_dir = self.output_dir + '/gen' + str(gen_num)
            # propagate trajectories
            str_states = ", ".join([str(state) for state in new_states])
            logging.info('starting simulations for states: ' + str_states)
            _prop_sims(
                self.output_dir, self.sim_obj, gen_num, self.q_check_obj_sim,
                new_states)
            # move trajectories
            logging.info('moving trajectories')
            _move_trjs(gen_dir, self.msm_dir, gen_num, self.n_kids)
            # ensure proper trajectories
            gen_num_test = _determine_gen(self.output_dir)
            assert gen_num == gen_num_test
            # wait for nfs to catch up
            time.sleep(65)

            ###########################################################
            #                  STEP 2 (clustering)                    #
            ###########################################################

            # determine if updating data
            if int(gen_num % self.update_freq) == 0:
                logging.info('updating all cluster centers')
                rebuild_num = int(gen_num / self.update_freq) - 1 
                _move_cluster_data(
                    self.msm_dir, rebuild_num, self.analysis_obj)
                update_data = True
            else:
                update_data = False
            # submit clustering job
            logging.info('clustering simulation data')
            t_pre = time.time()
            self.cluster_obj.build_full = update_data

            # touch trajectories...
            cmd = 'touch %s/trajectories/*.xtc' % self.msm_dir
            tools.run_commands(cmd)

            self.cluster_obj.set_filenames(self.msm_dir)
            _pickle_submit(
                self.msm_dir, self.cluster_obj, self.sub_obj,
                self.q_check_obj, gen_num, 'clusterer')
            correct_clust =  self.cluster_obj.check_clustering(
                self.msm_dir, gen_num, self.n_kids)
            if not correct_clust:
                raise MissingData('clustering job failed!')
            t_post = time.time()
            logging.info("clustering took %0.4f seconds" % (t_post - t_pre))


            ###########################################################
            #                 STEP 3 (saving states)                  #
            ###########################################################

            if self.save_state_obj is not None:
                logging.info('saving states')
                t_pre = time.time()
                _pickle_submit(
                    self.msm_dir, self.save_state_obj, self.sub_obj,
                    self.q_check_obj, gen_num, 'save_states')
                correct_save = self.save_state_obj.check_save_states(
                    self.msm_dir)
                if not correct_save:
                    raise MissingData('Saving states failed!')
                t_post = time.time()
                logging.info(
                    'saving states took %0.4f seconds' % (t_post - t_pre))

            
            ###########################################################
            #               STEP 4 (analysis of centers)              #
            ###########################################################

            # analysis
            logging.info('analyzing cluster data')
            state_rankings = _perform_analysis(
                self.analysis_obj, self.msm_dir, gen_num, self.sub_obj,
                self.q_check_obj, update_data)

            ###########################################################
            #                  STEP 5 (MSM generation)                #
            ###########################################################

            # build msm
            logging.info('building MSM')
            self.msm_obj = _prop_msm(
                self.msm_dir, self.msm_obj)

            ###########################################################
            #                   STEP 6 (rank states)                  #
            ###########################################################

            # rank states and get new 
            if hasattr(self.ranking_obj, 'state_rankings'):
                self.ranking_obj.state_rankings = state_rankings
            if hasattr(self.ranking_obj, 'distance_metric'):
                if self.ranking_obj.distance_metric is not None:
                    logging.info('loading centers for spreading')
                    self.ranking_obj.state_centers = md.load(
                        self.msm_dir+'/data/full_centers.xtc',
                        top=self.msm_dir+'/prot_masses.pdb')
            logging.info('ranking states\n')
            new_states = self.ranking_obj.select_states(self.msm_obj, self.n_kids)
            np.save(
                self.msm_dir + '/rankings/states_to_simulate_gen' + \
                    str(gen_num) + '.npy', new_states)

            # save restarts if not saved previously
            if (self.save_state_obj.save_routine == 'masses') or \
                    (self.save_state_obj.centers == 'none'):
                self.save_restart_obj.gen_num = gen_num
                self.save_restart_obj.run(self.msm_dir)

        t1 = time.time()
        logging.info("Total time took %0.4f seconds" % (t1 - t0))