Skip to content

Commit f5a22e7

Browse files
authored
Merge pull request #3 from UCLCheminformatics/simple_functions
Added Simple functions
2 parents 4f4f2ef + 85cddfc commit f5a22e7

6 files changed

Lines changed: 178 additions & 5 deletions

File tree

scaffoldgraph/core/graph.py

Lines changed: 102 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -8,13 +8,15 @@
88
from collections import Counter
99

1010
import tqdm
11+
import networkx as nx
12+
1113
from loguru import logger
12-
from networkx import DiGraph
1314
from rdkit import RDLogger
1415
from rdkit.Chem import rdMolHash, MolToSmiles, rdmolops
1516
from rdkit.Chem.rdMolDescriptors import CalcNumRings
1617

1718
from scaffoldgraph.io import *
19+
from scaffoldgraph.utils import canonize_smiles
1820
from .fragment import get_murcko_scaffold, get_annotated_murcko_scaffold
1921
from .scaffold import Scaffold
2022

@@ -28,7 +30,7 @@ def init_molecule_name(mol):
2830
mol.SetProp('_Name', n)
2931

3032

31-
class ScaffoldGraph(DiGraph, ABC):
33+
class ScaffoldGraph(nx.DiGraph, ABC):
3234
"""Abstract base class for ScaffoldGraphs"""
3335

3436
def __init__(self, graph=None, fragmenter=None):
@@ -92,29 +94,125 @@ def _recursive_constructor(self, child):
9294
@property
9395
def num_scaffold_nodes(self):
9496
"""Return the number of scaffolds in the graph"""
95-
return len(list(self.get_scaffold_nodes()))
97+
count = 0
98+
for _ in self.get_scaffold_nodes():
99+
count += 1
100+
return count
96101

97102
@property
98103
def num_molecule_nodes(self):
99104
"""Return the number of molecules in the graph"""
100-
return len(list(self.get_molecule_nodes()))
105+
count = 0
106+
for _ in self.get_molecule_nodes():
107+
count += 1
108+
return count
101109

102110
def get_scaffold_nodes(self, data=False):
111+
"""Return a generator of all scaffold nodes in the graph"""
103112
if data is True:
104113
return ((n, self.nodes[n]) for n, d in self.nodes(data='type') if d == 'scaffold')
105114
else:
106115
return (n for n, d in self.nodes(data='type') if d == 'scaffold')
107116

108117
def get_molecule_nodes(self, data=False):
118+
"""Return a generator of all molecule nodes in the graph"""
109119
if data is True:
110120
return ((n, self.nodes[n]) for n, d in self.nodes(data='type') if d == 'molecule')
111121
else:
112122
return (n for n, d in self.nodes(data='type') if d == 'molecule')
113123

114124
def get_hierarchy_sizes(self):
125+
"""Return a collections.Counter object indicating the number of scaffolds
126+
within each hierarchy level"""
115127
hierarchy = (d['hierarchy'] for _, d in self.get_scaffold_nodes(data=True))
116128
return Counter(hierarchy)
117129

130+
def max_hierarchy(self):
131+
"""Return the largest hierarchy level"""
132+
return max(self.get_hierarchy_sizes())
133+
134+
def min_hierarchy(self):
135+
"""Return the smallest hierarchy level"""
136+
return min(self.get_hierarchy_sizes())
137+
138+
def get_scaffolds_in_hierarchy(self, hierarchy):
139+
"""Return a generator of all scaffolds within a specified hierarchy"""
140+
for s, d in self.get_scaffold_nodes(data=True):
141+
if d['hierarchy'] == int(hierarchy):
142+
yield s
143+
144+
def scaffold_in_graph(self, scaffold_smiles):
145+
"""Returns True if specified scaffold SMILES is in the scaffold graph
146+
147+
Parameters
148+
----------
149+
scaffold_smiles : (str) SMILES of query scaffold.
150+
"""
151+
result = scaffold_smiles in self
152+
if result is not True:
153+
scaffold_smiles = canonize_smiles(scaffold_smiles, failsafe=True)
154+
result = scaffold_smiles in self
155+
return result
156+
157+
def molecule_in_graph(self, molecule_id):
158+
"""Returns True if specified molecule ID is in the scaffold graph
159+
160+
Parameters
161+
----------
162+
molecule_id: (str) ID of query molecule.
163+
"""
164+
return str(molecule_id) in self
165+
166+
def get_molecules_for_scaffold(self, scaffold_smiles):
167+
"""Return a list of molecule IDs which are represented by a scaffold in the graph.
168+
169+
Note: This is determined by traversing the graph. In the case of a scaffold tree
170+
the results represent the rules used to prioritize the scaffolds.
171+
172+
Parameters
173+
----------
174+
scaffold_smiles : (str) SMILES of query scaffold.
175+
"""
176+
molecules = []
177+
if scaffold_smiles not in self:
178+
scaffold_smiles = canonize_smiles(scaffold_smiles, failsafe=True)
179+
if scaffold_smiles not in self:
180+
return molecules
181+
for succ in nx.bfs_tree(self, scaffold_smiles, reverse=False):
182+
if self.nodes[succ]['type'] == 'molecule':
183+
molecules.append(succ)
184+
return molecules
185+
186+
def get_scaffolds_for_molecule(self, molecule_id):
187+
"""Return a list of scaffold SMILES connected to a query molecule ID
188+
189+
Parameters
190+
----------
191+
molecule_id: (str) ID of query molecule.
192+
"""
193+
scaffolds = []
194+
if molecule_id not in self:
195+
return scaffolds
196+
for succ in nx.bfs_tree(self, molecule_id, reverse=True):
197+
if self.nodes[succ]['type'] == 'scaffold':
198+
scaffolds.append(succ)
199+
return scaffolds
200+
201+
def separate_disconnected_components(self, sort=False):
202+
"""Separate disconnected components into distinct ScaffoldGraph objects.
203+
204+
Parameters
205+
----------
206+
sort: if True sort components in descending order according
207+
to the number of nodes in the subgraph.
208+
"""
209+
components = []
210+
for c in nx.weakly_connected_components(self):
211+
components.append(self.subgraph(c).copy())
212+
if sort:
213+
return sorted(components, key=len, reverse=True)
214+
return components
215+
118216
def add_molecule_node(self, molecule, **attr):
119217
name = molecule.GetProp('_Name')
120218
default_attr = dict(type='molecule', smiles=MolToSmiles(molecule))

scaffoldgraph/utils/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,8 +2,10 @@
22
scaffoldgraph.utils
33
"""
44

5+
from .misc import canonize_smiles
56
from .aggregate import aggregate
67

78
__all__ = [
9+
'canonize_smiles',
810
'aggregate'
911
]

scaffoldgraph/utils/misc.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
"""
2+
scaffoldgraph.utils.misc
3+
4+
Defines miscellaneous functions used within scaffoldgraph
5+
"""
6+
7+
from rdkit import Chem
8+
9+
10+
def canonize_smiles(smiles, failsafe=True):
11+
"""Canonize a SMILES string (with failsafe)"""
12+
mol = Chem.MolFromSmiles(smiles)
13+
if mol is None and failsafe:
14+
return smiles
15+
return Chem.MolToSmiles(mol)

tests/core/test_graph.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,5 +17,5 @@ def test_init_molecule_name():
1717

1818

1919
def test_graph_subclass():
20-
assert issubclass(ScaffoldGraph, DiGraph)
20+
assert issubclass(ScaffoldGraph, nx.DiGraph)
2121
assert issubclass(ScaffoldGraph, ABC)

tests/test_network.py

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,12 @@ def test_network(sdf_file):
1515
return network
1616

1717

18+
@pytest.fixture(name='network')
19+
def long_test_network():
20+
network = sg.ScaffoldNetwork.from_smiles_file('tests/test_smiles.smi')
21+
return network
22+
23+
1824
def test_network_from_sdf(sdf_file):
1925
network = sg.ScaffoldNetwork.from_sdf(sdf_file)
2026
assert network.num_scaffold_nodes == 8
@@ -33,5 +39,47 @@ def test_hiers(sdf_file):
3339
assert network.num_molecule_nodes == 2
3440

3541

42+
def test_hierarchy_functions(network):
43+
hierarchy_sizes = network.get_hierarchy_sizes()
44+
assert hierarchy_sizes[1] == 7
45+
assert hierarchy_sizes[2] == 10
46+
assert hierarchy_sizes[3] == 7
47+
assert hierarchy_sizes[4] == 1
48+
assert network.max_hierarchy() == 4
49+
assert network.min_hierarchy() == 1
50+
s_in_h2 = {
51+
'C1=Cn2cnnc2CN=C1', 'O=C1CN=C(c2ccccn2)C=CN1', 'O=C1CN=Cc2ccccc2N1',
52+
'C1=CC(c2ccccc2)=[NH+]CC=N1', 'C1=Nc2ccccc2C=[NH+]C1',
53+
'O=C1CC(=O)N(c2ccccc2)C=CN1', 'O=C1CC(=O)Nc2ccccc2N1',
54+
'O=C1CN=C(c2ccccc2)C=CN1', 'O=C1C[NH+]=Cc2ccccc2N1',
55+
'O=C1C[NH+]=C(c2ccccc2)C=CN1'
56+
}
57+
assert s_in_h2 == set(network.get_scaffolds_in_hierarchy(2))
58+
59+
60+
def test_simple_functions(network):
61+
assert network.scaffold_in_graph('C1=Cn2cnnc2CN=C1') is True
62+
# Below is the non-canonical SMILES of the above
63+
assert network.scaffold_in_graph('C1=C-n2:c:n:n:c:2-C-N=C-1') is True
64+
assert network.scaffold_in_graph('c1ccccc1CCNc2ccccc2') is False
65+
assert network.molecule_in_graph('Adinazolam') is True
66+
assert network.molecule_in_graph('Citalopram') is False
67+
68+
69+
def test_traversal(network):
70+
s_for_adinazolam = {
71+
'c1ccc(C2=NCc3nncn3-c3ccccc32)cc1', 'C1=NCc2nncn2-c2ccccc21',
72+
'C1=Cn2cnnc2CN=C1c1ccccc1', 'C1=Cn2cnnc2CN=C1', 'c1nnc[nH]1'
73+
}
74+
assert set(network.get_scaffolds_for_molecule('Adinazolam')) == s_for_adinazolam
75+
m_for_scaffold = {'Adinazolam', 'Alprazolam'}
76+
assert set(network.get_molecules_for_scaffold('c1nnc[nH]1')) == m_for_scaffold
77+
78+
79+
def test_separate_disconnected(network):
80+
assert len(network.separate_disconnected_components()) == 2
81+
assert type(network.separate_disconnected_components()[0]) == type(network)
82+
83+
3684
def test_repr(test_net):
3785
assert repr(test_net) == '<ScaffoldNetwork at {}>'.format(hex(id(test_net)))

tests/test_smiles.smi

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
CN(C)Cc1n-2c(nn1)CN=C(c1ccccc1)c1cc(Cl)ccc12 Adinazolam
2+
Cc1n-2c(nn1)CN=C(c1ccccc1)c1cc(Cl)ccc12 Alprazolam
3+
Brc1cc2c(cc1)NC(=O)CN=C2c1ncccc1 Bromazepam
4+
CNC1=Nc2c(cc(Cl)cc2)C(c2ccccc2)=[N+]([O-])C1 Chlordiazepoxide
5+
CN1c2ccc(Cl)cc2N(c2ccccc2)C(=O)CC1=O Clobazam
6+
[O-][N+](=O)c1cc2c(cc1)NC(=O)CN=C2c1c(Cl)cccc1 Clonazepam
7+
OC(=O)C1N=C(c2ccccc2)c2cc(Cl)ccc2NC1=O Clorazepate
8+
Clc1cc2c(cc1)NC(=O)CN=C2c1c(Cl)cccc1 Delorazepam
9+
[O-][N+]1=C(c2ccccc2)c2cc(Cl)ccc2NC(=O)C1 Demoxepam
10+
Clc1cc2c(cc1)NC(=O)CC(=O)N2c1ccccc1 Desmethylclobazam

0 commit comments

Comments
 (0)