Source code for analyze_foldamers.ensembles.cluster_torsion

import os

import matplotlib.cm as cm
import matplotlib.pyplot as plt
import mdtraj as md
import numpy as np
from analyze_foldamers.ensembles.cluster import *
from analyze_foldamers.parameters.angle_distributions import *
from cg_openmm.cg_model.cgmodel import CGModel
from cg_openmm.utilities.iotools import write_pdbfile_without_topology
from openmm import unit
from scipy.optimize import minimize
from scipy.spatial.distance import pdist, squareform
from sklearn.cluster import DBSCAN, OPTICS, KMeans
from sklearn.metrics import silhouette_samples, silhouette_score
from sklearn_extra.cluster import KMedoids


[docs]def cluster_torsions_KMedoids( file_list, cgmodel, n_clusters=2, frame_start=0, frame_stride=1, frame_end=-1, output_format="pdb", output_dir="cluster_output", backbone_torsion_type="bb_bb_bb_bb", filter=False, filter_ratio=0.25, plot_silhouette=True, plot_distance_hist=True): """ Given PDB or DCD trajectory files and coarse grained model as input, this function performs K-medoids clustering on the poses in trajectory, and returns a list of the coordinates for the medoid pose of each cluster. :param file_list: A list of PDB or DCD files to read and concatenate :type file_list: List( str ) :param cgmodel: A CGModel() class object :type cgmodel: class :param n_clusters: The number of clusters for K-medoids algorithm. :type n_clusters: int :param frame_start: First frame in pdb trajectory file to use for clustering. :type frame_start: int :param frame_stride: Advance by this many frames when reading pdb trajectories. :type frame_stride: int :param frame_end: Last frame in trajectory file to use for clustering. :type frame_end: int :param output_format: file format extension to write medoid coordinates to (default="pdb"), dcd also supported :type output_format: str :param output_dir: path to which cluster medoid structures and silhouette plots will be saved :type output_dir: str :param backbone_torsion_type: particle sequence of the backbone torsions (default="bb_bb_bb_bb") - for now only single sequence permitted :type backbone_torsion_type: str :param filter: option to apply neighborhood radius filtering to remove low-density data (default=False) :type filter: boolean :param filter_ratio: fraction of data points which pass through the neighborhood radius filter (default=0.05) :type filter_ratio: float :param plot_silhouette: option to create silhouette plot of clustering results (default=True) :type plot_silhouette: boolean :param plot_torsion_hist: option to plot a histogram of torsion euclidean distances (post-filtering) :type plot_torsion_hist: boolean :returns: - medoid_positions ( np.array( float * unit.angstrom ( n_clusters x num_particles x 3 ) ) ) - A 3D numpy array of poses corresponding to the medoids of all trajectory clusters. - medoid torsions ( np.array ( float * unit.degrees ( n_clusters x n_torsion ) - A 2D numpy array of the backbone torsion angles for each cluster medoid - cluster_sizes ( List ( int ) ) - A list of number of members in each cluster - cluster_rmsd( np.array ( float ) ) - A 1D numpy array of rmsd (in cluster distance space) of samples to cluster centers - silhouette_avg - ( float ) - average silhouette score across all clusters """ if not os.path.exists(output_dir): os.mkdir(output_dir) torsion_val_array, traj_all = get_torsion_matrix(file_list, cgmodel, frame_start, frame_stride, frame_end, backbone_torsion_type) # We need to precompute the euclidean distance matrix, accounting for periodic boundaries total = 0 angle_range = np.full(torsion_val_array.shape[1],360) powers = np.full(torsion_val_array.shape[1],2) torsion_distances = np.zeros((torsion_val_array.shape[0], torsion_val_array.shape[0])) for i in range(torsion_val_array.shape[0]): for j in range(torsion_val_array.shape[0]): delta = np.abs(torsion_val_array[i,:]-torsion_val_array[j,:]) delta = np.where(delta > 0.5*angle_range, delta-angle_range, delta) torsion_distances[i,j] = np.sqrt(np.power(delta,powers).sum()) if filter: # Filter distances: torsion_distances, dense_indices, filter_ratio_actual = \ filter_distances(torsion_distances, filter_ratio=filter_ratio) traj_all = traj_all[dense_indices] if plot_distance_hist: distances_row = np.reshape(torsion_distances, (torsion_distances.shape[0]*torsion_distances.shape[1],1)) # Remove the diagonal 0 elements: distances_row = distances_row[distances_row != 0] figure = plt.figure() n_out, bin_edges_out, patch = plt.hist( distances_row, bins=1000,density=True) plt.xlabel('rmsd') plt.ylabel('probability density') plt.savefig(f'{output_dir}/torsion_distances_hist.pdf') plt.close() # Cluster with sklearn-extra KMedoids kmedoids = KMedoids(n_clusters=n_clusters,metric='precomputed').fit(torsion_distances) # Get labels labels = kmedoids.labels_ # Get medoid indices medoid_indices = kmedoids.medoid_indices_ # Get medoid coordinates: medoid_xyz = np.zeros([n_clusters,traj_all.n_atoms,3]) for k in range(n_clusters): medoid_xyz[k,:,:] = traj_all[medoid_indices[k]].xyz[0] # Get medoid torsions: medoid_torsions = np.zeros([n_clusters, torsion_val_array.shape[1]]) for k in range(n_clusters): medoid_torsions[k,:] = torsion_val_array[medoid_indices[k],:] # Write medoids to file write_medoids_to_file(cgmodel, medoid_xyz, output_dir, output_format) medoid_positions = medoid_xyz * unit.nanometer # Get indices of frames in each cluster: cluster_indices = {} cluster_sizes = [] for k in range(n_clusters): cluster_indices[k] = np.argwhere(labels==k)[:,0] cluster_sizes.append(len(cluster_indices[k])) # Assign intra-cluster distances to medoids dist_to_medoids = {} for k in range(n_clusters): dist_to_medoids[k] = torsion_distances[cluster_indices[k],medoid_indices[k]] # Compute cluster rmsd of samples to medoid within each cluster cluster_rmsd = np.zeros(n_clusters) for k in range(n_clusters): for i in range(len(cluster_indices[k])): cluster_rmsd[k] += np.power(dist_to_medoids[k][i],2) cluster_rmsd[k] /= len(cluster_indices[k]) cluster_rmsd[k] = np.sqrt(cluster_rmsd[k]) # Get silhouette scores silhouette_avg = silhouette_score(torsion_distances, kmedoids.labels_) silhouette_sample_values = silhouette_samples(torsion_distances, kmedoids.labels_) if plot_silhouette: # Plot silhouette analysis plotfile=f"{output_dir}/silhouette_kmedoids_ncluster_{n_clusters}.pdf" make_silhouette_plot( kmedoids, silhouette_sample_values, silhouette_avg, n_clusters, cluster_rmsd, cluster_sizes, plotfile ) return (medoid_positions, medoid_torsions, cluster_sizes, cluster_rmsd, silhouette_avg)
[docs]def cluster_torsions_DBSCAN( file_list, cgmodel, min_samples=5, eps=0.5, frame_start=0, frame_stride=1, frame_end=-1, output_format="pdb", output_dir="cluster_output", backbone_torsion_type="bb_bb_bb_bb", core_points_only=True, filter=True, filter_ratio=0.25, plot_silhouette=True, plot_distance_hist=True): """ Given PDB or DCD trajectory files and coarse grained model as input, this function performs DBSCAN clustering on the poses in the trajectory, and returns a list of the coordinates for the medoid pose of each cluster. :param file_list: A list of PDB or DCD files to read and concatenate :type file_list: List( str ) :param cgmodel: A CGModel() class object :type cgmodel: class :param min_samples: minimum of number of samples in neighborhood of a point to be considered a core point (includes point itself) :type min_samples: int :param eps: DBSCAN parameter neighborhood distance cutoff :type eps: float :param frame_start: First frame in trajectory file to use for clustering. :type frame_start: int :param frame_stride: Advance by this many frames when reading trajectories. :type frame_stride: int :param frame_end: Last frame in trajectory file to use for clustering. :type frame_end: int :param output_format: file format extension to write medoid coordinates to (default="pdb"), dcd also supported :type output_format: str :param output_dir: directory to write clustering medoid and plot files :type output_dir: str :param backbone_torsion_type: particle sequence of the backbone torsions (default="bb_bb_bb_bb") - for now only single sequence permitted :type backbone_torsion_type: str :param core_points_only: use only core points to calculate medoid structures (default=True) :type core_points_only: boolean :param filter: option to apply neighborhood radius filtering to remove low-density data (default=True) :type filter: boolean :param filter_ratio: fraction of data points which pass through the neighborhood radius filter (default=0.05) :type filter_ratio: float :param plot_silhouette: option to create silhouette plot of clustering results (default=True) :type plot_silhouette: boolean :param plot_torsion_hist: option to plot a histogram of torsion euclidean distances (post-filtering) :type plot_torsion_hist: boolean :returns: - medoid_positions ( np.array( float * unit.angstrom ( n_clusters x num_particles x 3 ) ) ) - A 3D numpy array of poses corresponding to the medoids of all trajectory clusters. - medoid torsions ( np.array ( float * unit.degrees ( n_clusters x n_torsion ) - A 2D numpy array of the backbone torsion angles for each cluster medoid - cluster_sizes ( List ( int ) ) - A list of number of members in each cluster - cluster_rmsd( np.array ( float ) ) - A 1D numpy array of rmsd (in cluster distance space) of samples to cluster centers - n_noise ( int ) - number of points classified as noise - silhouette_avg - ( float ) - average silhouette score across all clusters """ if not os.path.exists(output_dir): os.mkdir(output_dir) torsion_val_array, traj_all = get_torsion_matrix(file_list, cgmodel, frame_start, frame_stride, frame_end, backbone_torsion_type) # We need to precompute the euclidean distance matrix, accounting for periodic boundaries total = 0 angle_range = np.full(torsion_val_array.shape[1],360) powers = np.full(torsion_val_array.shape[1],2) torsion_distances = np.zeros((torsion_val_array.shape[0], torsion_val_array.shape[0])) for i in range(torsion_val_array.shape[0]): for j in range(torsion_val_array.shape[0]): delta = np.abs(torsion_val_array[i,:]-torsion_val_array[j,:]) delta = np.where(delta > 0.5*angle_range, delta-angle_range, delta) torsion_distances[i,j] = np.sqrt(np.power(delta,powers).sum()) if filter: # Filter distances: torsion_distances, dense_indices, filter_ratio_actual = \ filter_distances(torsion_distances, filter_ratio=filter_ratio) traj_all = traj_all[dense_indices] if plot_distance_hist: distances_row = np.reshape(torsion_distances, (torsion_distances.shape[0]*torsion_distances.shape[1],1)) # Remove the diagonal 0 elements: distances_row = distances_row[distances_row != 0] figure = plt.figure() n_out, bin_edges_out, patch = plt.hist( distances_row, bins=1000,density=True) plt.xlabel('rmsd') plt.ylabel('probability density') plt.savefig(f'{output_dir}/torsion_distances_hist.pdf') plt.close() # Cluster with sklearn DBSCAN dbscan = DBSCAN(min_samples=min_samples,eps=eps,metric='precomputed').fit(torsion_distances) # The produces a cluster labels from 0 to n_clusters-1, and assigns -1 to noise points # Get labels labels = dbscan.labels_ # Get core sample indices: core_sample_indices = dbscan.core_sample_indices_ # Number of clusters: n_clusters = len(set(labels)) - (1 if -1 in labels else 0) # Number of noise points: n_noise = list(labels).count(-1) # Get indices of frames in each cluster: cluster_indices = {} cluster_indices_core = {} cluster_sizes = [] cluster_sizes_core = [] for k in range(n_clusters): cluster_indices[k] = np.argwhere(labels==k)[:,0] cluster_indices_core[k] = [] for elem in cluster_indices[k]: if elem in core_sample_indices: cluster_indices_core[k].append(elem) cluster_sizes.append(len(cluster_indices[k])) cluster_sizes_core.append(len(cluster_indices_core[k])) # Get indices of frames classified as noise: noise_indices = np.argwhere(labels==-1)[:,0] # Find the structure closest to each center (medoid): # OPTICS/DBSCAN does not have a built-in function to transform to cluster-distance space, # as the centroids of the clusters are not physically meaningful in general. However, as # RMSD between structures is our only clustering feature, the cluster centers (regions of # high density) will likely be representative structures of each cluster. # Following the protocol outlined in MDTraj example: # http://mdtraj.org/1.9.3/examples/centroids.html # Create distance matrices within each cluster: torsion_distances_k = {} if core_points_only: for k in range(n_clusters): torsion_distances_k[k] = np.zeros((cluster_sizes_core[k],cluster_sizes_core[k])) for i in range(cluster_sizes_core[k]): for j in range(cluster_sizes_core[k]): torsion_distances_k[k][i,j] = torsion_distances[cluster_indices_core[k][i],cluster_indices_core[k][j]] # Compute medoid based on similarity scores: medoid_index = [] # Global index intra_cluster_medoid_index = [] # Index within cluster for k in range(n_clusters): intra_cluster_medoid_index.append( np.exp(-torsion_distances_k[k]/torsion_distances_k[k].std()).sum(axis=1).argmax() ) # Here we need to use the global sample index to find the medoid structure: medoid_index.append(cluster_indices_core[k][intra_cluster_medoid_index[k]]) else: for k in range(n_clusters): torsion_distances_k[k] = np.zeros((cluster_sizes[k],cluster_sizes[k])) for i in range(cluster_sizes[k]): for j in range(cluster_sizes[k]): torsion_distances_k[k][i,j] = torsion_distances[cluster_indices[k][i],cluster_indices[k][j]] # Compute medoid based on similarity scores: medoid_index = [] # Global index intra_cluster_medoid_index = [] # Index within cluster for k in range(n_clusters): intra_cluster_medoid_index.append( np.exp(-torsion_distances_k[k]/torsion_distances_k[k].std()).sum(axis=1).argmax() ) # Here we need to use the global sample index to find the medoid structure: medoid_index.append(cluster_indices[k][intra_cluster_medoid_index[k]]) medoid_xyz = np.zeros([n_clusters,traj_all.n_atoms,3]) for k in range(n_clusters): medoid_xyz[k,:,:] = traj_all[medoid_index[k]].xyz[0] # Write medoids to file write_medoids_to_file(cgmodel, medoid_xyz, output_dir, output_format) medoid_positions = medoid_xyz * unit.nanometer # Get medoid torsions: medoid_torsions = np.zeros([n_clusters, torsion_val_array.shape[1]]) for k in range(n_clusters): medoid_torsions[k,:] = torsion_val_array[medoid_index[k],:] # Compute intra-cluster rmsd of samples to medoid based on structure rmsd cluster_rmsd = np.zeros(n_clusters) for k in range(n_clusters): cluster_rmsd[k] = np.sqrt(((torsion_distances_k[k][intra_cluster_medoid_index[k]]**2).sum())/len(cluster_indices[k])) # Get silhouette scores try: silhouette_sample_values = silhouette_samples(torsion_distances, labels) silhouette_avg = np.mean(silhouette_sample_values[labels!=-1]) if plot_silhouette: # Plot silhouette analysis plotfile = f"{output_dir}/silhouette_dbscan_min_sample_{min_samples}_eps_{eps}.pdf" make_silhouette_plot( dbscan, silhouette_sample_values, silhouette_avg, n_clusters, cluster_rmsd, cluster_sizes, plotfile ) except ValueError: print("There are either no clusters, or no noise points identified. Try adjusting DBSCAN min_samples, eps parameters.") silhouette_avg = None return medoid_positions, medoid_torsions, cluster_sizes, cluster_rmsd, n_noise, silhouette_avg
[docs]def get_torsion_matrix(file_list, cgmodel, frame_start, frame_stride, frame_end, backbone_torsion_type): """Internal function for reading trajectory files and computing torsions""" # Load files as {replica number: replica trajectory} rep_traj = {} for i in range(len(file_list)): if file_list[0][-3:] == 'dcd': rep_traj[i] = md.load(file_list[i],top=md.Topology.from_openmm(cgmodel.topology)) else: rep_traj[i] = md.load(file_list[i]) # Combine all trajectories, selecting specified frames if frame_end == -1: frame_end = rep_traj[0].n_frames if frame_start == -1: frame_start == frame_end traj_all = rep_traj[0][frame_start:frame_end:frame_stride] for i in range(len(file_list)-1): traj_all = traj_all.join(rep_traj[i+1][frame_start:frame_end:frame_stride]) # Get torsion list: torsion_list = CGModel.get_torsion_list(cgmodel) # Assign torsion types: torsion_types, torsion_array, torsion_sub_arrays, n_i, i_torsion_type, torsion_dict, inv_torsion_dict = \ assign_torsion_types(cgmodel, torsion_list) # Compute specified torsion angles over all frames: for i in range(i_torsion_type): if inv_torsion_dict[str(i+1)] == backbone_torsion_type: # Compute all torsion values in trajectory # This returns an [nframes x n_torsions] array torsion_val_array = md.compute_dihedrals( traj_all,torsion_sub_arrays[str(i+1)]) # Convert to degrees: torsion_val_array = (180/np.pi)*torsion_val_array return torsion_val_array, traj_all