Rdkit hlogp batch.py
Jump to navigation
Jump to search
########################### # Written by Jennifer Young # Last revised April 2020 # Version 1.0 ########################### from __future__ import print_function import pdb import itertools import sys import shlex import subprocess from rdkit.Chem import ( MolFromSmiles, MolToSmiles, AddHs, ) from rdkit.Chem.SaltRemover import ( SaltRemover ) from rdkit.Chem.Descriptors import MolLogP, MolWt def scale_logp_value(logp): if logp < -9.0: logp = -9.0 elif logp > 9.0: logp = 9.0 elif logp < 0 or logp >= 5.0: logp = 100*int(logp) elif logp >= 0.0 or logp < 1: logp = 10*int(10*logp) elif logp >= 1.0 or logp < 4: logp = int(100*logp) elif logp >=4 or logp < 5: logp = 10*int(10*logp) return logp if __name__ == '__main__': if len(sys.argv) != 3: print('Usage: python rdkit_hlogp_batch.py <smiles> <batch_size>') exit() BATCH_SIZE = int(sys.argv[2]) hlogp_list = list() with open(sys.argv[1]) as smiles_file: file_lines = smiles_file.readlines() for line in file_lines: if line.strip(): smiles, cid = str(line).strip().split()[:2] mol = MolFromSmiles(smiles) remover = SaltRemover() res, deleted = remover.StripMolWithDeleted(mol) if res is not None: res.SetProp('_Name', cid) logp = MolLogP(res) num_heavy_atoms = res.GetNumHeavyAtoms() if num_heavy_atoms > 99: num_heavy_atoms = 99 scaled_logp = scale_logp_value(logp) if logp < 0.0: sign = 'M' #remove the minus sign so it's not printed scaled_logp = scaled_logp * -1 else: sign = 'P' key_string = 'H{:02}{}{:03}'.format(num_heavy_atoms, sign, scaled_logp) #store in list up to batch size, then write out to new file final_string = '{0} {1} {2}\n'.format(smiles, cid, key_string) hlogp_list.append(final_string) #write the key string to the file if len(hlogp_list) >= BATCH_SIZE: with open('{0}_hlogp'.format(sys.argv[1]), 'a+') as category_file: for entry in hlogp_list: category_file.write('{0}'.format(entry)) #clear list because we already wrote these hlogp_list = list() #write remaining molecules to file for entry in hlogp_list: with open('{0}_hlogp'.format(sys.argv[1]), 'a+') as category_file: for entry in hlogp_list: category_file.write('{0}'.format(entry))