Source code for fast.sampling.core

# Author: Maxwell I. Zimmerman <mizimmer@wustl.edu>
# Contributors:
# Copywright (C) 2017, Washington University in St. Louis
# All rights reserved.
# Unauthorized copying of this file, via any medium, is strictly prohibited
# Proprietary and confidential


#######################################################################
# imports
#######################################################################


import glob
import itertools
import logging
import mdtraj as md
import numpy as np
import os
import pickle
import scipy.io
import subprocess as sp
import time
from . import rankings
from .. import tools
from ..base import base
from ..exception import DataInvalid, MissingData
from ..msm_gen import SaveWrap
from ..submissions import slurm_subs
from ..submissions import lsf_subs
from enspara.msm import builders, MSM
from enspara.util import array as ra
from functools import partial


logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)


#######################################################################
# code
#######################################################################


def _setup_directories(output_dir):
    """Setup adaptive sampling directory structure"""
    # if output dir already exists raise an error
    if os.path.exists(output_dir):
        raise DataInvalid('output directory already exists!')
    msm_dir = output_dir + "/msm"
    cmd1 = 'mkdir ' + output_dir
    cmd2 = 'mkdir ' + msm_dir
    cmd3 = 'mkdir ' + msm_dir + '/data'
    cmd4 = 'mkdir ' + msm_dir + '/rankings'
    cmd5 = 'mkdir ' + msm_dir + '/trajectories'
    cmd6 = 'mkdir ' + msm_dir + '/trajectories_full'
    cmd7 = 'mkdir ' + msm_dir + '/centers_masses'
    cmd8 = 'mkdir ' + msm_dir + '/centers_restarts'
    cmd9 = 'mkdir ' + msm_dir + '/submissions'
    cmds = [cmd1, cmd2, cmd3, cmd4, cmd5, cmd6, cmd7, cmd8, cmd9]
    out = tools.run_commands(cmds)
    return msm_dir


def _gen_initial_sims(
        base_dir, initial_struct, trj_obj, n_kids, q_check_obj):
    """Runs the first round of adaptive sampling. Currently runs
    simulations from a single structure.

    Inputs
    ----------
    base_dir : str,
        The base adaptive sampling directory that will contain gen
        directories and the msm directory.
    initial_struct : str or md.Trajectory,
        The initial structure to start a swarn of sims from.
    trj_obj : object,
        Simulation object used for simulations. See Gromax.
    n_kids : int,
        Number of children gen will have.
    q_check_obj : object,
        Queueing system wrapper to determine if simulations are
        still running.
    """
    t0 = time.time()
    # generate initial gen directory
    gen0_dir = base_dir + '/gen0'
    cmd = 'mkdir ' + gen0_dir
    _ = tools.run_commands(cmd)
    # Spawn simulations
    pids = []
    for kid in range(n_kids):
        # generate kid directory
        kid_dir = gen0_dir + '/kid'+str(kid)
        cmd = 'mkdir ' + kid_dir
        _ = tools.run_commands(cmd)
        # submit job based on trj_obj and retain pid
        pid = trj_obj.run(initial_struct, kid_dir)
        pids.append(pid)
        # wait for a job to finish if maximum number of simulations are
        #still running
        q_check_obj.wait_for_pids(np.array(pids))
    # gather all pids and wait for every simulation to finish
    pids = np.array(pids)
    q_check_obj.wait_for_pids(pids, wait_for_all=True)
    t1 = time.time()
    logging.info("simulations took %0.4f seconds" % (t1-t0))
    return pids


def _prop_sims(base_dir, trj_obj, gen_num, q_check_obj, new_states):
    """Runs the subsequent rounds of adaptive sampling. Looks for
    states in specified directory structure and uses them for
    respawning simulations.

    Inputs
    ----------
    base_dir : str,
        The base adaptive sampling directory that will contain gen
        directories and the msm directory.
    trj_obj : object,
        Simulation object used for simulations. See Gromax.
    gen_num : int,
        The generation of sampling to propagate new simulations.
    q_check_obj : object,
        Queueing system wrapper to determine if simulations are
        still running.
    new_states : list,
        List of state numbers to respawn simulations from
    """
    t0 = time.time()
    # generate gen directory
    gen_dir = base_dir + '/gen' + str(gen_num)
    cmd = 'mkdir ' + gen_dir
    _ = tools.run_commands(cmd)
    pids = []
    # propagate simulations
    for kid in range(len(new_states)):
        # generate kid directory
        kid_dir = gen_dir + '/kid'+str(kid)
        cmd = 'mkdir ' + kid_dir
        _ = tools.run_commands(cmd)
        # run simulation and gather pid
        filename = base_dir + '/msm/centers_restarts/state' + \
            ('%06d' % new_states[kid]) + '-00.gro'
        pid = trj_obj.run(filename, kid_dir)
        pids.append(pid)
        # wait for a job to finish if maximum number of simulations are
        #still running
        q_check_obj.wait_for_pids(np.array(pids))
    # gather all pids and wait for every simulation to finish
    pids = np.array(pids)
    q_check_obj.wait_for_pids(pids, wait_for_all=True)
    t1 = time.time()
    logging.info("simulations took %0.4f seconds" % (t1 - t0))
    return pids
    

def _move_trjs(gen_dir, msm_dir, gen_num, n_kids):
    """Move finished trajectories to MSM directory.

    Inputs
    ----------
    gen_dir : str,
        Generation directory containing simulations to move.
    msm_dir : str,
        MSM directory where analysis is performed.
    gen_num : int,
        Generation number.
    n_kids : int,
        Number of kids expected in gen directory.
    """
    # iterate over potential kids
    for kid in range(n_kids):
        # trj to move trajectories
        try:
            # specify directroy and file names
            kid_dir = gen_dir + "/kid" + str(kid)
            output_full = msm_dir + '/trajectories_full' + '/trj_gen' + \
                ('%03d' % gen_num) + '_kid' + ('%03d' % kid) + '.xtc'
            output_masses = msm_dir + '/trajectories' + '/trj_gen' + \
                ('%03d' % gen_num) + '_kid' + ('%03d' % kid) + '.xtc'
            cmds = []
            if os.path.exists(output_full):
                logging.info("file '%s' exists! skipping move." % output_full)
            else:
                cmds.append(
                    'mv ' + kid_dir + '/frame0_aligned.xtc ' + output_full)
            if os.path.exists(output_masses):
                logging.info("file '%s' exists! skipping move." % output_masses)
            else:
                cmds.append(
                    'mv ' + kid_dir + '/frame0_masses.xtc ' +output_masses)
            if len(cmds) > 0:
                out = tools.run_commands(cmds)
        # give sad-face expression
        except:
            raise MissingData(
                'trajectory from gen %03d and kid %03d was not found.' %
                (gen_num, kid),
                'Simulation may have crahsed!')
    return


def _pickle_submit(
        msm_dir, base_obj, sub_obj, q_check_obj, gen_num, base_name):
    """Helper function for pickling an object and submitting it to run.

    Inputs
    ----------
    msm_dir : str,
        MSM directory where analysis is performed.
    base_obj : object,
        The object to pickle and submit. Must have `run` as a class
        function.
    sub_obj : object,
        Submission wrapper object.
    q_check_obj : object,
        Queueing system wrapper to determine if submission is still
        running.
    gen_num : int,
        Generation number.
    base_name : str,
        The base name of the job being submitted. Used to name the
        output files.
    """
    # pickle object
    base_pickle = msm_dir + "/" + base_name + ".pkl"
    pickle.dump(base_obj, open(base_pickle, "wb"))
    # determine home dir and switch to msm dir
    home_dir = os.path.abspath("./")
    os.chdir(msm_dir)
    # write python file to open and run pickle object
    f = open(base_name +".py", "w")
    f.write(
        'import pickle\n\n' + \
        'c = pickle.load(open("' + base_pickle + '", "rb"))\n' + \
        'c.run()')
    f.close()
    # generate python run commands for submission
    cmd0 = 'sync\n'
    cmd1 = 'python ' + base_name + '.py'
    cmds = [cmd0, cmd1]
    base_submission = base_name + '_submission'
    # submit and wait for job to finish
    pid = sub_obj.run(cmds, output_name=base_submission)
    q_check_obj.wait_for_pids([pid], wait_for_all=True)
    # clean up submission
    sub_script_name = q_check_obj.get_submission_names(pid)[0]
    sub_output = 'submissions/' + base_name + '_gen' + \
        ('%03d' % gen_num) + '.out'
    base_pickle_output = 'submissions/' + base_name + '_gen' + \
        ('%03d' % gen_num) + '.pkl'
    base_python_output = 'submissions/' + base_name + '_gen' + \
        ('%03d' % gen_num) + '.py'
    base_sub_output = 'submissions/' + base_submission + '_gen' + \
        ('%03d' % gen_num)
    cmd1 = 'mv ' + sub_script_name + ' ' + sub_output + ' --backup=numbered'
    cmd2 = 'mv ' + base_pickle + ' ' + base_pickle_output + \
        ' --backup=numbered'
    cmd3 = 'mv ' + base_name + ".py" + ' ' + base_python_output + \
        ' --backup=numbered'
    cmd4 = 'mv ' + base_submission + ' ' + base_sub_output + \
        ' --backup=numbered'
    cmds = [cmd1, cmd2, cmd3, cmd4]
    _ = tools.run_commands(cmds)
    # change directory back to original
    os.chdir(home_dir)
    return


def _determine_gen(output_dir, ignore_error=False):
    """Determines the current generation number."""
    # determine number of gen folders
    n_gen_folders = len(glob.glob(output_dir+'/gen*'))
    # gets completed sims in msm dir
    trj_names = glob.glob(output_dir + '/msm/trajectories/*.xtc')
    # sorts by unique gen number
    trj_gen_nums = np.array(
        [int(n.split("gen")[-1].split("_")[0]) for n in trj_names])
    # error check that completed sims match number of gen folders
    n_trj_gens = len(np.unique(trj_gen_nums))
    if (n_gen_folders != n_trj_gens) and not ignore_error:
        raise DataInvalid(
            'The number of generations are not consistent with the number ' + \
            'of trajectories. Maybe a simulation crashed?')
    # subtract 1 for 0 based counting
    gen_num = int(np.max([n_trj_gens, n_gen_folders]) - 1)
    return gen_num


def _prop_msm(msm_dir, msm_obj):
    """Propagate MSM files."""
    t0 = time.time()
    # load assignments and build MSM
    assignments = ra.load(msm_dir + '/data/assignments.h5')
    msm_obj.fit(assignments)
    # write counts, probs, and popoulations (if applicable)
    scipy.io.mmwrite(msm_dir + '/data/tcounts.mtx', msm_obj.tcounts_)
    scipy.io.mmwrite(msm_dir + '/data/tprobs.mtx', msm_obj.tprobs_)
    if msm_obj.eq_probs_ is not None:
        np.save(msm_dir + '/data/populations.npy', msm_obj.eq_probs_)
    t1 = time.time()
    logging.info("building MSM took %0.4f seconds" % (t1-t0))
    return msm_obj


def _move_cluster_data(msm_dir, rebuild_num, analysis_obj=None):
    """Move current cluster data to old directory. For rebuilding whole
    MSM and analysis

    Inputs
    ----------
    msm_dir : str,
        MSM directory where analysis is performed.
    rebuild_num : int,
        The rebuild number, i.e. If this is gen 6 and MSMs are being
        rebuild every 2 gens, this would be rebuild number 2 (0 based).
    analysis_obj : object,
        The object used for analysis. This object may contain
        information of a folder that also needs to be moved.
    """
    # define old directory to move into. mkdir if first rebuild.
    old_dir = msm_dir + '/old'
    if rebuild_num == 0:
        try:
            cmd = 'mkdir ' + old_dir
            _ = tools.run_commands(cmd)
        except:
            pass
    # move data and centers
    backup = ' --backup=numbered'
    cmd1 = 'mv ' + msm_dir + '/data ' + old_dir + '/data' + \
        str(rebuild_num) + backup
    cmd2 = 'mv ' + msm_dir + '/centers_masses ' + old_dir + \
        '/centers_masses' + str(rebuild_num) + backup
    cmd3 = 'mv ' + msm_dir + '/centers_restarts ' + old_dir + \
        '/centers_restarts' + str(rebuild_num) + backup
    # rebuild directories
    cmd4 = 'mkdir ' + msm_dir + '/data'
    cmd5 = 'mkdir ' + msm_dir + '/centers_masses'
    cmd6 = 'mkdir ' + msm_dir + '/centers_restarts'
    cmds = [cmd1, cmd2, cmd3, cmd4, cmd5, cmd6]
    # if applicable, move analysis folder
    if hasattr(analysis_obj, "output_folder"):
        base_folder = analysis_obj.output_folder.split("/")[-1]
        cmd = 'mv ' + analysis_obj.output_folder + ' ' + old_dir + \
            "/" + base_folder + str(rebuild_num)
        cmds.append(cmd)
    # run move commands
    _ = tools.run_commands(cmds)
    return


def _perform_analysis(
        analysis_obj, msm_dir, gen_num, sub_obj, q_check_obj, update_data):
    """Performs analysis of cluster centers.

    Inputs
    ----------
    analysis_obj : object,
        The object used for analysis.
    msm_dir : str,
        MSM directory where analysis is performed.
    gen_num : int,
        Generation number.
    sub_obj : object,
        Submission wrapper object.
    q_check_obj : object,
        Queueing system wrapper to determine if submission is still
        running.
    update_data : bool,
        Flag for rebuilding whole analysis or analyzing a subset of
        structures.
    """
    t0 = time.time()
    # determine if there is an analysis object
    if analysis_obj is None:
        state_rankings = None
    else:
        # set the objects output
        analysis_obj.set_output(msm_dir, gen_num)
        # optionally set rebuild or continue analysis
        if hasattr(analysis_obj, 'build_full'):
            analysis_obj.build_full = update_data
        # if the output doesn't exists, pickle submit analysis
        if not os.path.exists(analysis_obj.output_name):
            _pickle_submit(
                msm_dir, analysis_obj, sub_obj,
                q_check_obj, gen_num, 'analysis')
        # get rankings
        state_rankings = analysis_obj.state_rankings
        # check that everything went well
        # number of state rankings should be equal to number of state
        # in the assignments
        n_states_ranked = len(state_rankings)
        n_states = len(np.unique(ra.load(msm_dir + '/data/assignments.h5')))
        if n_states_ranked != n_states:
            raise DataInvalid(
                'The number of state rankings does not match the number ' + \
                'of states in the assignments! Analysis may have failed!')
    t1 = time.time()
    logging.info("analysis took %0.4f seconds" %(t1-t0))
    return state_rankings


[docs]def push_forward(s, num=0): s_out = s.split("\n") s_pushed = "\n".join( ["".join(itertools.repeat(" ", num)) + l for l in s_out]) return s_pushed
[docs]class AdaptiveSampling(base): """Performs adaptive sampling Parameters ---------- initial_state : str or MDTraj object, The starting structure for adaptive sampling. n_gens : int, default=1, The number of generations of sampling to perform. n_kids : int, default=1, The number of simulations per generation of adaptive sampling. sim_obj : object, default=None, An object that can run simulations. Currently supported within this package are Gromacs and Upside wrappers. cluster_obj : object, default=None, A cluster wrapper that dictates how simulations are clustered. save_state_obj : object, default=None, Can optionally provide an object that dictates how states are saved. msm_obj : enspara.msm.MSM object An enspara MSM object. This is used to fit assignments at each generation of sampling. analysis_obj : object, default=None, Type of analysis to perform on each cluster center. Can be used in state rankings. ranking_obj : rankings object This is an object with at least two functions: __init__(**args) and select_states(msm, n_clones). The output of this object is a list of states to simulate. spreading_func : func, default=None, Optionally spread state selection by minimizing similarity penalty, calculated using the provided metric for calculating state-distances. i.e. md.rmsd. update_freq : int, default=np.inf, The number of generations between a full reclustering of states and analysis of cluster centers. Defaults to never reclustering (continually adds new cluster centers without changing previously discovered centers). continue_prev : bool, default=False, Flag to indicate if sampling is continuing from a previous run. Avoids accidentally overwritting a previous run of sampling. sub_obj : object, default=None, A submission object that handles submitting clustering, MSM, analysis, and save_state routines. Wrappers are available for Slurm queueing systems as well as local machines (subprocess calls). q_check_obj : object, default=None, An object that handles checking queueing system for jobs that are still running. q_check_obj_sim : object, default=None, An object that handles checking queueing system for jobs that are still running. output_dir : str, default='adaptive_sampling', The output directory name for adaptive sampling run. """ def __init__( self, initial_state, n_gens=1, n_kids=1, sim_obj=None, cluster_obj=None, save_state_obj=None, msm_obj=None, analysis_obj=None, ranking_obj=None, spreading_func=None, update_freq=np.inf, continue_prev=False, sub_obj=None, q_check_obj=None, q_check_obj_sim=None, output_dir='adaptive_sampling', verbose=True): # Initialize class variables self.sim_obj = sim_obj self.initial_state = initial_state if type(self.initial_state) is str: self.initial_state_md = md.load(self.initial_state) else: self.initial_state_md = self.initial_state self.n_gens = n_gens self.n_kids = n_kids self.cluster_obj = cluster_obj if save_state_obj is None: self.save_state_obj = SaveWrap() else: self.save_state_obj = save_state_obj self.save_restart_obj = SaveWrap( centers='restarts', save_routine='restarts') self.analysis_obj = analysis_obj # msm obj default is normalize without eq_probs if msm_obj is None: b = partial(builders.normalize, calculate_eq_probs=False) self.msm_obj = MSM(lag_time=1, method=b) else: self.msm_obj = msm_obj # ranking obj default is evens if ranking_obj is None: self.ranking_obj = rankings.evens() else: self.ranking_obj = ranking_obj self.spreading_func = spreading_func self.update_freq = update_freq self.continue_prev = continue_prev if sub_obj is None: self.sub_obj = lsf_subs.LSFSub( 'bowman', n_tasks=128, R='"model=AMDEPYC_7742"') else: self.sub_obj = sub_obj if q_check_obj is None: self.q_check_obj = lsf_subs.LSFWrap() else: self.q_check_obj = q_check_obj if q_check_obj_sim is None: self.q_check_obj_sim = lsf_subs.LSFWrap() else: self.q_check_obj_sim = q_check_obj_sim self.output_dir = os.path.abspath(output_dir) self.msm_dir = self.output_dir + '/msm' self.verbose = verbose @property def class_name(self): return "AdaptiveSampling" @property def config(self): return { 'initial_state': self.initial_state, 'n_gens': self.n_gens, 'n_kids': self.n_kids, 'sim_obj': self.sim_obj, 'cluster_obj': self.cluster_obj, 'msm_obj': self.msm_obj, 'analysis_obj': self.analysis_obj, 'ranking_obj': self.ranking_obj, 'update_freq': self.update_freq, 'continue_prev': self.continue_prev, 'sub_obj': self.sub_obj, 'q_check_obj': self.q_check_obj, 'q_check_obj_sim': self.q_check_obj_sim, 'output_dir': self.output_dir, 'verbose': self.verbose, }
[docs] def print_parameters(self): print( "\n\n#########################################################" + \ "####################") print( " adaptive sampling! ") print( "###########################################################" + \ "##################") if self.continue_prev: print("\ncontinuing sampling from a previous run!") print("\noutput directory:\n " + str(self.output_dir)) print("\nstarting state:\n " + str(self.initial_state)) print("\nnumber of gens:\n " + str(self.n_gens)) print("\nnumber of kids:\n " + str(self.n_kids)) print( "\nupdating clustering and analysis every:\n " + \ str(self.update_freq) + " gens") print("\nsimulation object:\n" + push_forward(str(self.sim_obj), 4)) print( "\nclustering object:\n" + push_forward(str(self.cluster_obj), 4)) print( "\nsave states object:\n" + \ push_forward(str(self.save_state_obj), 4)) print("\nanalysis object:\n" + push_forward(str(self.analysis_obj), 4)) # print("\nMSM object:\n" + push_forward(str(self.msm_obj), 4)) print("\nranking object:\n" + push_forward(str(self.ranking_obj), 4)) print("\nsubmission object:\n" + push_forward(str(self.sub_obj), 4)) print( "\nqueue checking object:\n" + \ push_forward(str(self.q_check_obj), 4)) print( "\nsim queue checking object:\n" + \ push_forward(str(self.q_check_obj_sim), 4)) print( "\n###########################################################" + \ "##################\n") return
[docs] def run(self): # after setup, adaptive sampling proceeds with the following steps: # 1) simulate, process trajectories, and move them to the # msm directory # 2) cluster the conformations and save assignments and distances # 3) optionally save cluster centers # 4) analyze the cluster centers and save the results # of the analysis # 5) build the MSM and save transition matrix # 6) rank states for reselection based on structural analysis # (optional) and MSM statistics # # If restarting an adaptive sampling run, attempts to move simulations # from the last gen and recluster self.print_parameters() # set msmdir msm_dir = self.output_dir + '/msm' # timeit t0 = time.time() # initilize adaptive sampling if not continuing a previous run # builds directories, generates first run of sampling, and # clusters data ########################################################### # First generation of sampling # ########################################################### if not self.continue_prev: # build initial directory structure logging.info('building directories') gen_num = 0 gen_dir = self.output_dir + '/gen' + str(gen_num) _setup_directories(self.output_dir) # save starting state in msm directory. Will be used to load # and save trajectories for restarting simulations try: self.initial_state_md.save_gro(msm_dir + '/restart.gro') except: logging.warning( "Could not save initial state. Initial state is not pdb or gro?") self.cluster_obj.base_struct_md.save_gro(msm_dir + '/restart.gro') ########################################################### # STEP 1 (simulations) # ########################################################### # initialize first run of sampling logging.info('starting initial simulations') _gen_initial_sims( self.output_dir, self.initial_state_md, self.sim_obj, self.n_kids, self.q_check_obj_sim) # move trajectories after sampling logging.info('moving trajectories') _move_trjs(gen_dir, self.msm_dir, gen_num, self.n_kids) # wait for nfs to catch up time.sleep(65) ########################################################### # STEP 2 (clustering) # ########################################################### # submit clustering job logging.info('clustering simulation data') t_pre = time.time() # since its the first round of sampling, build full msm self.cluster_obj.build_full = True self.cluster_obj.set_filenames(self.msm_dir) _pickle_submit( self.msm_dir, self.cluster_obj, self.sub_obj, self.q_check_obj, gen_num, 'clusterer') # check that clustering went well correct_clust = self.cluster_obj.check_clustering( self.msm_dir, gen_num, self.n_kids) if not correct_clust: raise MissingData('clustering job failed!') # log clustering time t_post = time.time() logging.info("clustering took %0.4f seconds" % (t_post - t_pre)) ########################################################### # STEP 3 (saving states) # ########################################################### if self.save_state_obj is not None: logging.info('saving states') t_pre = time.time() _pickle_submit( self.msm_dir, self.save_state_obj, self.sub_obj, self.q_check_obj, gen_num, 'save_states') correct_save = self.save_state_obj.check_save_states( self.msm_dir) if not correct_save: raise MissingData('Saving states failed!') t_post = time.time() logging.info( 'saving states took %0.4f seconds' % (t_post - t_pre)) ########################################################### # restarting adaptive sampling # ########################################################### # if continuing from a previous run, determines gen, and # attempts to move trajectories, cluster data, build MSM, # rank states, and restart simulations else: # check for valid path to restart from if not os.path.exists(self.output_dir): raise DataInvalid( "Can't continue run from output directory that doesn't" + \ " exist!") # determine where adaptive sampling left off (looks at # trajectories and folder numbers). Allows for a discrepancy. gen_num = _determine_gen(self.output_dir, ignore_error=True) gen_dir = self.output_dir + '/gen' + str(gen_num) logging.info('continuing adaptive sampling from run %d' % gen_num) # try to move trajectories from current gen and initiate clustering try: # move trajectories logging.info('moving trajectories') _move_trjs(gen_dir, self.msm_dir, gen_num, self.n_kids) # wait for nfs to catch up time.sleep(65) except: pass # error check for consistent number of trajectories and # gen folders gen_num_test = _determine_gen(self.output_dir) assert gen_num == gen_num_test ########################################################### # STEP 2 (clustering) # ########################################################### # determine if clustering was completed if os.path.exists(self.msm_dir+"/data/assignments.h5"): # check if clustering was successful correct_clust = self.cluster_obj.check_clustering( self.msm_dir, gen_num, self.n_kids) else: correct_clust = False # if not, recluster if not correct_clust: # submit clustering job logging.info('clustering simulation data') logging.info('updating all cluster centers') rebuild_num = int(gen_num / self.update_freq) - 1 # if restarting from first gen, might not need to move # cluster data if gen_num == 0: try: _move_cluster_data( self.msm_dir, rebuild_num, self.analysis_obj) except: pass else: _move_cluster_data( self.msm_dir, rebuild_num, self.analysis_obj) t_pre = time.time() # built in rebuild everything if restarting sims self.cluster_obj.build_full = True self.cluster_obj.set_filenames(self.msm_dir) _pickle_submit( self.msm_dir, self.cluster_obj, self.sub_obj, self.q_check_obj, gen_num, 'clusterer') correct_clust = self.cluster_obj.check_clustering( self.msm_dir, gen_num, self.n_kids) # if still wrong, raise error if not correct_clust: raise MissingData('clustering job failed!') t_post = time.time() logging.info("clustering took %0.4f seconds" % (t_post - t_pre)) ########################################################### # STEP 3 (saving states) # ########################################################### if self.save_state_obj is not None: correct_save = self.save_state_obj.check_save_states( self.msm_dir) if not correct_save: logging.info('saving states') t_pre = time.time() _pickle_submit( self.msm_dir, self.save_state_obj, self.sub_obj, self.q_check_obj, gen_num, 'save_states') correct_save = self.save_state_obj.check_save_states( self.msm_dir) if not correct_save: raise MissingData('Saving states failed!') t_post = time.time() logging.info( 'saving states took %0.4f seconds' % (t_post - t_pre)) # determine if updating data if int(gen_num % self.update_freq) == 0: update_data = True else: update_data = False ########################################################### # STEP 4 (analysis of centers) # ########################################################### # run analysis object routine logging.info('analyzing cluster data') state_rankings = _perform_analysis( self.analysis_obj, self.msm_dir, gen_num, self.sub_obj, self.q_check_obj, update_data) ########################################################### # STEP 5 (MSM generation) # ########################################################### # build msm logging.info('building MSM') self.msm_obj = _prop_msm( self.msm_dir, self.msm_obj) ########################################################### # STEP 6 (rank states) # ########################################################### # if ranking uses analysis from state centers, update # the ranking object if hasattr(self.ranking_obj, 'state_rankings'): self.ranking_obj.state_rankings = state_rankings # if the ranking object uses rmsd information, load cluster # centers and update the ranking object if hasattr(self.ranking_obj, 'distance_metric'): if self.ranking_obj.distance_metric is not None: logging.info('loading centers for spreading') self.ranking_obj.state_centers = md.load( self.msm_dir+'/data/full_centers.xtc', top=self.msm_dir+'/prot_masses.pdb') logging.info('ranking states\n') # rank states new_states = self.ranking_obj.select_states(self.msm_obj, self.n_kids) np.save( self.msm_dir + '/rankings/states_to_simulate_gen' + \ str(gen_num) + '.npy', new_states) if (self.save_state_obj.save_routine == 'masses') or \ (self.save_state_obj.centers == 'none'): self.save_restart_obj.gen_num = gen_num self.save_restart_obj.run(self.msm_dir) ################################################################ # main adaptive sampling loop # ################################################################ # iterate adaptive sampling until gen reaches n_gens for gen_num in np.arange(gen_num + 1, self.n_gens): ########################################################### # STEP 1 (simulations) # ########################################################### logging.info('STARTING GEN NUM: %d' % gen_num) gen_dir = self.output_dir + '/gen' + str(gen_num) # propagate trajectories str_states = ", ".join([str(state) for state in new_states]) logging.info('starting simulations for states: ' + str_states) _prop_sims( self.output_dir, self.sim_obj, gen_num, self.q_check_obj_sim, new_states) # move trajectories logging.info('moving trajectories') _move_trjs(gen_dir, self.msm_dir, gen_num, self.n_kids) # ensure proper trajectories gen_num_test = _determine_gen(self.output_dir) assert gen_num == gen_num_test # wait for nfs to catch up time.sleep(65) ########################################################### # STEP 2 (clustering) # ########################################################### # determine if updating data if int(gen_num % self.update_freq) == 0: logging.info('updating all cluster centers') rebuild_num = int(gen_num / self.update_freq) - 1 _move_cluster_data( self.msm_dir, rebuild_num, self.analysis_obj) update_data = True else: update_data = False # submit clustering job logging.info('clustering simulation data') t_pre = time.time() self.cluster_obj.build_full = update_data # touch trajectories... cmd = 'touch %s/trajectories/*.xtc' % self.msm_dir tools.run_commands(cmd) self.cluster_obj.set_filenames(self.msm_dir) _pickle_submit( self.msm_dir, self.cluster_obj, self.sub_obj, self.q_check_obj, gen_num, 'clusterer') correct_clust = self.cluster_obj.check_clustering( self.msm_dir, gen_num, self.n_kids) if not correct_clust: raise MissingData('clustering job failed!') t_post = time.time() logging.info("clustering took %0.4f seconds" % (t_post - t_pre)) ########################################################### # STEP 3 (saving states) # ########################################################### if self.save_state_obj is not None: logging.info('saving states') t_pre = time.time() _pickle_submit( self.msm_dir, self.save_state_obj, self.sub_obj, self.q_check_obj, gen_num, 'save_states') correct_save = self.save_state_obj.check_save_states( self.msm_dir) if not correct_save: raise MissingData('Saving states failed!') t_post = time.time() logging.info( 'saving states took %0.4f seconds' % (t_post - t_pre)) ########################################################### # STEP 4 (analysis of centers) # ########################################################### # analysis logging.info('analyzing cluster data') state_rankings = _perform_analysis( self.analysis_obj, self.msm_dir, gen_num, self.sub_obj, self.q_check_obj, update_data) ########################################################### # STEP 5 (MSM generation) # ########################################################### # build msm logging.info('building MSM') self.msm_obj = _prop_msm( self.msm_dir, self.msm_obj) ########################################################### # STEP 6 (rank states) # ########################################################### # rank states and get new if hasattr(self.ranking_obj, 'state_rankings'): self.ranking_obj.state_rankings = state_rankings if hasattr(self.ranking_obj, 'distance_metric'): if self.ranking_obj.distance_metric is not None: logging.info('loading centers for spreading') self.ranking_obj.state_centers = md.load( self.msm_dir+'/data/full_centers.xtc', top=self.msm_dir+'/prot_masses.pdb') logging.info('ranking states\n') new_states = self.ranking_obj.select_states(self.msm_obj, self.n_kids) np.save( self.msm_dir + '/rankings/states_to_simulate_gen' + \ str(gen_num) + '.npy', new_states) # save restarts if not saved previously if (self.save_state_obj.save_routine == 'masses') or \ (self.save_state_obj.centers == 'none'): self.save_restart_obj.gen_num = gen_num self.save_restart_obj.run(self.msm_dir) t1 = time.time() logging.info("Total time took %0.4f seconds" % (t1 - t0))