diff --git a/dataloader/cath_dataset.py b/dataloader/cath_dataset.py index 741df0f..ff9e296 100644 --- a/dataloader/cath_dataset.py +++ b/dataloader/cath_dataset.py @@ -9,6 +9,7 @@ import sys import math import numpy as np +from numpy import pi from torch_geometric.data import ( Data, Dataset, @@ -397,35 +398,77 @@ def rec_residue_featurizer(self, rec, one_hot=True, add_feature=None): def get_node_features(self, n_coords, c_coords, c_alpha_coords, coord_mask, with_coord_mask=True, use_angle=False, use_omega=False): num_res = n_coords.shape[0] + assert len(c_coords) == len(n_coords) + + # Infer peptide-bond connectivity using the C_i--N_{i+1} distance. + # Biopython's PPBuilder uses a 1.8 Å C--N distance cutoff for polypeptide construction. + bond = np.linalg.norm(c_coords[:-1] - n_coords[1:], axis=1) < 1.8 + if use_omega: num_angle_type = 3 angles = np.zeros((num_res, num_angle_type)) - for i in range(num_res - 1): - # These angles are called φ (phi) which involves the backbone atoms C-N-Cα-C - angles[i, 0] = dihedral( - c_coords[i], n_coords[i], c_alpha_coords[i], n_coords[i + 1]) - # psi involves the backbone atoms N-Cα-C-N. - angles[i, 1] = dihedral( - n_coords[i], c_alpha_coords[i], c_coords[i], n_coords[i + 1]) - angles[i, 2] = dihedral( - c_alpha_coords[i], c_coords[i], n_coords[i + 1], c_alpha_coords[i + 1]) + for i in range(num_res): + # Terminal residues or chain breaks have undefined backbone torsions. + # They are kept as NaN here and later encoded as (sin, cos) = (0, 0). + if i != 0: + if bond[i-1]: + # Phi for residue i: C_{i-1}-N_i-CA_i-C_i. + angles[i, 0] = dihedral( + c_coords[i-1], n_coords[i], c_alpha_coords[i], c_coords[i]) + else: + angles[i, 0] = np.nan + else: + angles[i, 0] = np.nan + + if i < num_res-1: + if bond[i]: + # Psi for residue i: N_i-CA_i-C_i-N_{i+1}. + angles[i, 1] = dihedral( + n_coords[i], c_alpha_coords[i], c_coords[i], n_coords[i + 1]) + # Omega for residue i: CA_i-C_i-N_{i+1}-CA_{i+1}. + angles[i, 2] = dihedral( + c_alpha_coords[i], c_coords[i], n_coords[i + 1], c_alpha_coords[i + 1]) + else: + angles[i, 1] = np.nan + angles[i, 2] = np.nan + else: + angles[i, 1] = np.nan + angles[i, 2] = np.nan + else: num_angle_type = 2 angles = np.zeros((num_res, num_angle_type)) - for i in range(num_res - 1): - # These angles are called φ (phi) which involves the backbone atoms C-N-Cα-C - angles[i, 0] = dihedral( - c_coords[i], n_coords[i], c_alpha_coords[i], n_coords[i + 1]) - # psi involves the backbone atoms N-Cα-C-N. - angles[i, 1] = dihedral( - n_coords[i], c_alpha_coords[i], c_coords[i], n_coords[i + 1]) + for i in range(num_res): + if i != 0: + if bond[i-1]: + angles[i, 0] = dihedral( + c_coords[i-1], n_coords[i], c_alpha_coords[i], c_coords[i]) + else: + angles[i, 0] = np.nan + else: + angles[i, 0] = np.nan + + if i