Skip to content

Commit 38f9432

Browse files
authored
Added diversity picker for scaffold classes. (#15)
1 parent aa9fd95 commit 38f9432

2 files changed

Lines changed: 110 additions & 0 deletions

File tree

scaffoldgraph/analysis/__init__.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,8 @@
77
from .representation import calc_average_pairwise_similarity, get_over_represented_scaffold_classes
88
from .enrichment import calc_scaffold_enrichment, compound_set_enrichment
99
from .general import get_virtual_scaffolds, get_singleton_scaffolds
10+
from .diversity import diversity_pick_for_scaffold_class
11+
1012

1113
__all__ = [
1214
'calc_average_pairwise_similarity',
@@ -15,4 +17,5 @@
1517
'compound_set_enrichment',
1618
'get_virtual_scaffolds',
1719
'get_singleton_scaffolds',
20+
'diversity_pick_for_scaffold_class',
1821
]
Lines changed: 107 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,107 @@
1+
"""
2+
scaffoldgraph.analysis.diversity
3+
"""
4+
5+
from rdkit.SimDivFilters.rdSimDivPickers import LeaderPicker
6+
from rdkit.Chem.AllChem import GetMorganFingerprintAsBitVect
7+
from rdkit.Chem import MolFromSmiles
8+
9+
from functools import partial
10+
11+
12+
def _form_dist_func(dist_func, fps):
13+
"""function: create a partial dist_func."""
14+
if dist_func.__code__.co_argcount != 3:
15+
raise ValueError('dist_func must have three arguments: i, j, fps')
16+
if dist_func.__code__.co_varnames[2] != 'fps':
17+
raise ValueError('dist_func third argument name must be: fps')
18+
formed_dist_func = partial(dist_func, fps=fps)
19+
return formed_dist_func
20+
21+
22+
def _make_diversity_pick(pool, threshold, pick_size, dist_func=None):
    """Run the RDKit LeaderPicker over a pool of fingerprints.

    Parameters
    ----------
    pool : sequence
        Fingerprints to pick from; its length is used as the pool size.
    threshold : float
        Distance threshold forwarded to the picker.
    pick_size : int
        Requested number of picks; capped at the pool size.
    dist_func : callable, optional
        Custom distance function ``(i, j, fps)``. When omitted the picker's
        bit-vector based ``LazyBitVectorPick`` is used on the pool directly.

    Returns
    -------
    iterable
        The picker's result, used by callers as indices into ``pool``.
    """
    leader = LeaderPicker()
    n_pool = len(pool)
    # Never ask for more picks than there are entries in the pool.
    n_pick = min(pick_size, n_pool)
    if dist_func is not None:
        # Bind the pool so the picker can call the function with (i, j) only.
        bound_dist = _form_dist_func(dist_func, pool)
        return leader.LazyPick(bound_dist, n_pool, threshold, n_pick)
    return leader.LazyBitVectorPick(pool, n_pool, threshold, n_pick)
34+
35+
36+
def _create_pool(scaffold, graph, radius, bits):
37+
"""tuple : create molecule pool (ids, mols, fps)."""
38+
mol_ids, smiles = zip(*graph.get_molecules_for_scaffold(scaffold, 'smiles'))
39+
mols = list(map(MolFromSmiles, smiles))
40+
fps = list(map(lambda x: GetMorganFingerprintAsBitVect(x, radius, nBits=bits), mols))
41+
if len(fps) == 0:
42+
raise ValueError(f'No molecules for scaffold class: {scaffold}')
43+
return mol_ids, mols, fps
44+
45+
46+
def diversity_pick_for_scaffold_class(
    scaffold,
    graph,
    threshold=0.65,
    pick_size=0,
    fp_radius=2,
    fp_bits=1024,
    dist_func=None
):
    """
    Select a diverse subset of the molecules belonging to a scaffold
    class, using the RDKit LeaderPicker on Morgan fingerprints.

    Parameters
    ----------
    scaffold : str
        Scaffold class name i.e. scaffold SMILES.
    graph : ScaffoldGraph
        ScaffoldGraph to pick molecules from.
    threshold : float, optional
        Stop picking when the distance goes below this value.
        The default is 0.65 i.e. similarity = 0.35.
    pick_size : int, optional
        Number of items to pick from the molecule pool. If the
        pick size is greater than the pool size, the pool size is
        used instead. The default is 0, which is handed to the
        picker unchanged (NOTE(review): presumably RDKit treats 0
        as "no limit" - confirm against the LeaderPicker docs).
    fp_radius : int, optional
        Radius of the Morgan fingerprint. The default is 2.
    fp_bits : int, optional
        Number of bits in the Morgan fingerprint. The default
        is 1024.
    dist_func : function, optional
        A function for calculating the distance between a pair of
        fingerprints. It must take two indices (i, j) and a list of
        fingerprints named ``fps`` and return the distance between
        those two points.

    Returns
    -------
    tuple ((ids), (mols), (fps))
        A tuple of three tuples: the first contains the picked
        molecule ids, the second contains the picked RDMols and the
        third contains the fingerprints of the picked molecules.

    Raises
    ------
    ValueError
        If the scaffold class has no molecules, or if ``dist_func``
        does not have the required ``(i, j, fps)`` arguments.

    Notes
    -----
    If performing diversity picks on a large scale, a custom implementation
    should probably be used where fingerprints can be cached.

    Examples
    --------
    Diversity pick for benzene scaffold.

    >>> ids, mols, fps = diversity_pick_for_scaffold_class('c1ccccc1', graph, pick_size=10)

    """
    pool_ids, pool_mols, pool_fps = _create_pool(scaffold, graph, fp_radius, fp_bits)
    chosen = _make_diversity_pick(pool_fps, threshold, pick_size, dist_func)
    # Gather one (id, mol, fp) row per picked index, then transpose the rows
    # into three parallel tuples.
    rows = ((pool_ids[idx], pool_mols[idx], pool_fps[idx]) for idx in chosen)
    ids, mols, fps = zip(*rows)
    return ids, mols, fps

0 commit comments

Comments
 (0)