Source code for fast.msm_gen.save_states

# Author: Maxwell I. Zimmerman <mizimmer@wustl.edu>
# Contributors:
# Copywright (C) 2017, Washington University in St. Louis
# All rights reserved.
# Unauthorized copying of this file, via any medium, is strictly prohibited
# Proprietary and confidential


#######################################################################
# imports
#######################################################################


import glob
import logging
import mdtraj as md
import numpy as np
from enspara.util import array as ra
from enspara.util.load import load_as_concatenated
from multiprocessing import Pool
from ..base import base


#######################################################################
# code
#######################################################################


[docs]def chunks(lst, n): """Yield successive n-sized chunks from lst.""" for i in range(0, len(lst), n): yield lst[i:i + n]
def _save_states(centers_info): """Save centers found within a single trajectory""" # get state, conf, and frame info. Also the filename and topology. states = centers_info['state'] confs = centers_info['conf'] frames = centers_info['frame'] trj_filename = centers_info['trj_filename'][0] save_routine = centers_info['save_routine'][0] msm_dir = centers_info['msm_dir'][0] if save_routine == 'full': save_masses = True save_restarts = True elif save_routine == 'masses': save_masses = True save_restarts = False elif save_routine == 'restarts': save_masses = False save_restarts = True else: raise # load structs trajectories if save_masses: top = md.load(msm_dir+"/prot_masses.pdb") trj = md.load( msm_dir + '/trajectories/' + trj_filename, top=top) if save_restarts: trj_full = md.load( msm_dir + '/trajectories_full/' + trj_filename, top=msm_dir + "/restart.gro") for num in range(len(states)): if save_masses: # save center after processing pdb_filename = msm_dir + \ "/centers_masses/state%06d-%02d.pdb" % \ (states[num], confs[num]) center = trj[frames[num]] center.superpose(top) center.save_pdb(pdb_filename) if save_restarts: # save center for restarting simulations pdb_filename = msm_dir + \ "/centers_restarts/state%06d-%02d.gro" % \ (states[num], confs[num]) center = trj_full[frames[num]] center.save_gro(pdb_filename) if save_masses: del trj if save_restarts: del trj_full return
[docs]def save_states( assignments, distances, state_nums=None, save_routine='full', largest_center=np.inf, n_confs=1, n_procs=1, msm_dir='.'): """Saves specified state-numbers by searching through the assignments and distances and pulling single frames from trajectories. This is a special tailored helper function that has a directory structure hard-coded in. Can specify a largest distance to a cluster center to save computational time searching for min distances. If multiple conformations are saved, the center is saved as conf-0 and new conformations are sampled from the cluster. Inputs ---------- assignments : array, shape=(n_trajectories, n_frames), Assigned cluster for each frame in each trajectory. distances : array, shape=(n_trajectories, n_frames), The distance to the cluster center for each frame in each trajectory. state_nums : array, shape=(n_states, ), default=None, The specific state numbers for saving. If None are supplied, will save every state. save_routine : str, default='full', The routine for saving states, either 'full', 'masses', or 'restarts'. 'masses' will only save the processed cluster centers, 'restarts' will only save the full system centers, and 'full' will save both. largest_center : float, default=np.inf, The largest expected distance from any frame to a cluster center. Specifying a small number can save in computational time. Defaults to np.inf, which will consider every frame when finding cluster centers. n_confs : int, default=1, The number of representative conformations to save of each cluster center. The first conformation is always the cluster center, and subsequent conformations are sampled randomly from frames clustered. n_procs : int, default=1, The number of processes to use when saving states. msm_dir : str, default='.', Location of the msm directory containing trajectories and folders for saving states. """ if state_nums is None: try: state_nums = np.unique(assignments) except: state_nums = np.unique(np.concatenate(assignments)) trj_filenames = np.sort( np.array( [ s.split("/")[-1] for s in glob.glob(msm_dir + "/trajectories/*.xtc")])) topology = "prot_masses.pdb" # reduce the number of conformations to search through reduced_iis = ra.where((distances > -0.1)*(distances < largest_center)) reduced_assignments = assignments[reduced_iis] reduced_distances = distances[reduced_iis] centers_location = [] for state in state_nums: state_iis = np.where(reduced_assignments == state) nconfs_in_state = len(state_iis[0]) if nconfs_in_state >= n_confs: center_picks = np.array([0]) if n_confs > 1: center_picks = np.append( center_picks, np.random.choice( range(1, nconfs_in_state), n_confs-1, replace=False)) else: center_picks = np.array([0]) center_picks = np.append( center_picks, np.random.choice(nconfs_in_state, n_confs - 1)) state_centers = np.argsort(reduced_distances[state_iis])[center_picks] # Obtain information on conformation locations within trajectories trj_locations = reduced_iis[0][state_iis[0][state_centers]] frame_nums = reduced_iis[1][state_iis[0][state_centers]] for conf_num in range(n_confs): trj_num = trj_locations[conf_num] centers_location.append( ( state, conf_num, trj_num, frame_nums[conf_num], trj_filenames[trj_num], save_routine, msm_dir)) if type(topology) == str: centers_location = np.array( centers_location, dtype=[ ('state', 'int'), ('conf', 'int'), ('trj_num', 'int'), ('frame', 'int'), ('trj_filename', np.str_, 800), ('save_routine', np.str_, 10), ('msm_dir', np.str_, 800)]) unique_trjs = np.unique(centers_location['trj_num']) partitioned_centers_info = [] for trj in unique_trjs: partitioned_centers_info.append( centers_location[np.where(centers_location['trj_num'] == trj)]) if n_procs == 1: for pci in partitioned_centers_info: _save_states(pci) else: with Pool(processes=n_procs) as pool: pool.map(_save_states, partitioned_centers_info) # pool = Pool(processes=n_procs) # pool.map(_save_states, partitioned_centers_info) # pool.terminate() return
[docs]class SaveWrap(base): """Save states wrapping object Parameters ---------- save_routine : str, default='full', The type of states to save. Three options: 1) 'masses' saves only in the centers_masses, 2) 'restarts' saves only the restarts, and 3) 'full' saves both. centers : str, default='auto', The indicator for the set of centers to save. Four options: 1) 'all' will save every center, 2) 'none' will not save any centers, 3) 'restarts' will only save the centers to use for restarting simulations, and 4) 'auto' will only save new states that were discovered in previous round of sampling. gen_num : int, default=0, The generation number of adaptive sampling. Only used if centers is set to 'restarts'. largest_center : float, default=np.inf, The largest distance to a cluster center expected. Can be used to speed up searching for cluster centers. A reasonable value if the distance cutoff used for clustering. save_xtc_centers : bool, default=False, Optionally save centers as an xtc in data. n_procs : int, default=1, The number of processes to use when saving states. """ def __init__( self, save_routine='full', centers='auto', gen_num=0, largest_center=np.inf, save_xtc_centers=False, n_procs=1): self.save_routine = save_routine self.centers = centers self.gen_num = gen_num self.largest_center = largest_center self.save_xtc_centers = save_xtc_centers self.n_procs = n_procs @property def class_name(self): return "SaveWrap" @property def config(self): return { 'save_routine': self.save_routine, 'centers': self.centers, 'gen_num': self.gen_num, 'largest_center': self.largest_center, 'n_procs': self.n_procs, 'save_xtc_centers': self.save_xtc_centers, }
[docs] def check_save_states(self, msm_dir): assigns = ra.load(msm_dir + '/data/assignments.h5') unique_states = np.unique(assigns) n_states = unique_states.shape[0] correct_save = True save_masses = False save_restarts = False if (self.save_routine == 'masses') or (self.save_routine == 'full'): save_masses = True if (self.save_routine == 'restarts') or (self.save_routine == 'full'): save_restarts = True if (self.centers == 'none') or (self.centers == 'restarts'): pass else: if save_masses: n_masses = len(glob.glob(msm_dir + '/centers_masses/*.pdb')) if n_masses != n_states: correct_save = False if save_restarts: n_restarts = len(glob.glob(msm_dir + '/centers_restarts/*.gro')) if n_restarts != n_states: correct_save = False return correct_save
[docs] def run(self, msm_dir='.'): if self.centers != 'none': assignments = ra.load(msm_dir + "/data/assignments.h5") distances = ra.load(msm_dir + "/data/distances.h5") if self.centers == 'auto': state_nums = np.load(msm_dir + "/data/unique_states.npy") elif self.centers == 'all': state_nums = None elif self.centers == 'restarts': states_to_simulate_file = \ msm_dir + "/rankings/states_to_simulate_gen" + \ str(self.gen_num) + ".npy" state_nums = np.load(states_to_simulate_file) save_states( assignments, distances, state_nums=state_nums, n_procs=self.n_procs, largest_center=self.largest_center, save_routine=self.save_routine, msm_dir=msm_dir) if self.save_xtc_centers: center_filenames = np.sort(glob.glob("%s/centers_masses/*.pdb" % msm_dir)) trj_lengths, xyzs = load_as_concatenated(center_filenames, processes=self.n_procs) centers = md.Trajectory(xyzs, topology=md.load("%s/prot_masses.pdb" % msm_dir).top) centers.save_xtc("%s/data/full_centers.xtc" % msm_dir)