|
| 1 | +""" |
| 2 | +scaffoldgraph.analysis.diversity |
| 3 | +""" |
| 4 | + |
| 5 | +from rdkit.SimDivFilters.rdSimDivPickers import LeaderPicker |
| 6 | +from rdkit.Chem.AllChem import GetMorganFingerprintAsBitVect |
| 7 | +from rdkit.Chem import MolFromSmiles |
| 8 | + |
| 9 | +from functools import partial |
| 10 | + |
| 11 | + |
def _form_dist_func(dist_func, fps):
    """function: bind a fingerprint pool to a user-supplied dist_func.

    Parameters
    ----------
    dist_func : callable
        Distance callable taking exactly three positional arguments
        ``(i, j, fps)``. The third argument must be named ``fps``
        because the pool is bound to it by keyword.
    fps : list
        Fingerprint pool to bind to the ``fps`` argument.

    Returns
    -------
    functools.partial
        A callable taking ``(i, j)`` with ``fps`` already bound.

    Raises
    ------
    ValueError
        If the signature of ``dist_func`` cannot be inspected, if it
        does not take exactly three positional arguments, or if the
        third positional argument is not named ``fps``.

    """
    # Function-scope import keeps the block self-contained.
    import inspect

    # inspect.signature (rather than dist_func.__code__) also handles
    # functools.partial objects and callable instances, and it drops the
    # bound ``self`` of methods so only caller-supplied arguments count.
    try:
        parameters = inspect.signature(dist_func).parameters.values()
    except (TypeError, ValueError) as err:
        raise ValueError(
            'dist_func must have three arguments: i, j, fps'
        ) from err

    # Keyword-only and variadic parameters are ignored, mirroring what
    # ``co_argcount`` counts for a plain function.
    positional_kinds = (
        inspect.Parameter.POSITIONAL_ONLY,
        inspect.Parameter.POSITIONAL_OR_KEYWORD,
    )
    positional = [p.name for p in parameters if p.kind in positional_kinds]

    if len(positional) != 3:
        raise ValueError('dist_func must have three arguments: i, j, fps')
    if positional[2] != 'fps':
        raise ValueError('dist_func third argument name must be: fps')
    return partial(dist_func, fps=fps)
| 20 | + |
| 21 | + |
def _make_diversity_pick(pool, threshold, pick_size, dist_func=None):
    """iterable: run the LeaderPicker over a pool of fingerprints.

    ``pick_size`` is capped at the size of the pool. Without a
    ``dist_func`` the picker's ``LazyBitVectorPick`` is used directly on
    the pool; otherwise ``dist_func`` is bound to the pool and handed to
    ``LazyPick``.
    """
    leader_picker = LeaderPicker()
    n_items = len(pool)
    # Never ask for more items than the pool holds.
    n_wanted = min(pick_size, n_items)

    if dist_func is not None:
        bound_dist = _form_dist_func(dist_func, pool)
        return leader_picker.LazyPick(bound_dist, n_items, threshold, n_wanted)
    return leader_picker.LazyBitVectorPick(pool, n_items, threshold, n_wanted)
| 34 | + |
| 35 | + |
def _create_pool(scaffold, graph, radius, bits):
    """tuple : create molecule pool (ids, mols, fps).

    Parameters
    ----------
    scaffold : str
        Scaffold class to collect molecules for.
    graph : ScaffoldGraph
        Graph queried through ``get_molecules_for_scaffold(scaffold,
        'smiles')``, which is expected to yield ``(id, smiles)`` pairs.
    radius : int
        Morgan fingerprint radius.
    bits : int
        Number of bits in each Morgan fingerprint.

    Returns
    -------
    tuple
        ``(mol_ids, mols, fps)`` where ``mol_ids`` is a tuple of molecule
        ids and ``mols`` / ``fps`` are lists aligned with it.

    Raises
    ------
    ValueError
        If the scaffold class contains no molecules.

    """
    molecules = list(graph.get_molecules_for_scaffold(scaffold, 'smiles'))
    # Check for an empty class *before* unpacking: ``zip(*[])`` yields
    # nothing, so the two-name unpack below would otherwise fail with an
    # unrelated "not enough values to unpack" error.
    if not molecules:
        raise ValueError(f'No molecules for scaffold class: {scaffold}')
    mol_ids, smiles = zip(*molecules)
    mols = [MolFromSmiles(smi) for smi in smiles]
    fps = [
        GetMorganFingerprintAsBitVect(mol, radius, nBits=bits)
        for mol in mols
    ]
    return mol_ids, mols, fps
| 44 | + |
| 45 | + |
def diversity_pick_for_scaffold_class(
    scaffold,
    graph,
    threshold=0.65,
    pick_size=0,
    fp_radius=2,
    fp_bits=1024,
    dist_func=None
):
    """
    Select a diverse subset of the molecules in a scaffold class.

    Molecules belonging to the scaffold are converted to Morgan
    fingerprints and passed to the RDKit LeaderPicker.

    Parameters
    ----------
    scaffold : str
        Scaffold class name i.e. scaffold SMILES.
    graph : ScaffoldGraph
        ScaffoldGraph to pick molecules from.
    threshold : float, optional
        Stop picking when the distance goes below this value.
        The default is 0.65 i.e. similarity = 0.35.
    pick_size : int, optional
        Number of items to pick from the molecule pool. A pick
        size larger than the pool is reduced to the size of the
        pool. The default is 0, which is passed to the picker
        unchanged.
    fp_radius : int, optional
        Radius of the Morgan fingerprint. The default is 2.
    fp_bits : int, optional
        Number of bits in the Morgan fingerprint. The
        default is 1024.
    dist_func : function, optional
        A function for calculating the distance between a pair
        of fingerprints. The function should take two indices
        (i, j) and a list of fingerprints (fps) and return
        the distance between these points.

    Returns
    -------
    tuple ((ids), (mols), (fps))
        A tuple of tuples: the first contains the ids of the picked
        molecules, the second contains the picked molecules as RDMols
        and the third contains their fingerprints.

    Raises
    ------
    ValueError
        If the scaffold class contains no molecules.

    Examples
    --------
    Diversity pick for benzene scaffold.

    >>> ids, mols, fps = diversity_pick_for_scaffold_class('c1ccccc1', graph, pick_size=10)

    Notes
    -----
    If performing diversity picks on a large scale, a custom implementation
    should probably be used where fingerprints can be cached.

    """
    pool_ids, pool_mols, pool_fps = _create_pool(
        scaffold, graph, fp_radius, fp_bits
    )
    chosen = _make_diversity_pick(pool_fps, threshold, pick_size, dist_func)
    # Gather (id, mol, fp) per picked index, then transpose the rows into
    # three parallel tuples.
    rows = ((pool_ids[i], pool_mols[i], pool_fps[i]) for i in chosen)
    picked_ids, picked_mols, picked_fps = zip(*rows)
    return picked_ids, picked_mols, picked_fps
0 commit comments