#!/usr/bin/env python
import itertools
from operator import itemgetter
import pandas as pd
import pystow
from rdkit import Chem
from rdkit.Chem.MolStandardize import rdMolStandardize
from rdkit.Chem.rdMolDescriptors import CalcNumRings
from tqdm.auto import tqdm
[docs]
class RingSystemFinder:
"""A class to identify ring systems """
def __init__(self):
"""Initialize susbstructure search objects to identify key functionality
"""
self.ring_db_pat = Chem.MolFromSmarts("[#6R,#16R]=[OR0,SR0,CR0,NR0]")
self.ring_atom_pat = Chem.MolFromSmarts("[R]")
[docs]
def tag_bonds_to_preserve(self, mol):
"""Assign the property "protected" to all ring carbonyls, etc.
:param mol: input molecule
:return: None
"""
for bnd in mol.GetBonds():
bnd.SetBoolProp("protected", False)
for match in mol.GetSubstructMatches(self.ring_db_pat):
bgn, end = match
bnd = mol.GetBondBetweenAtoms(bgn, end)
bnd.SetBoolProp("protected", True)
[docs]
@staticmethod
def cleave_linker_bonds(mol):
"""Cleave bonds that are not in rings and not protected
:param mol: input molecule
:return: None
"""
frag_bond_list = []
for bnd in mol.GetBonds():
if not bnd.IsInRing() and not bnd.GetBoolProp("protected") and bnd.GetBondType() == Chem.BondType.SINGLE:
frag_bond_list.append(bnd.GetIdx())
if len(frag_bond_list):
frag_mol = Chem.FragmentOnBonds(mol, frag_bond_list)
Chem.SanitizeMol(frag_mol)
# Chem.AssignStereochemistry(frag_mol, cleanIt=True, force=True)
return frag_mol
else:
return mol
[docs]
def cleanup_fragments(self, mol, keep_dummy=False):
"""Split a molecule containing multiple ring systems into individual ring systems
:param keep_dummy: retain dummy atoms
:param mol: input molecule
:return: a list of SMILES corresponding to individual ring systems
"""
frag_list = Chem.GetMolFrags(mol, asMols=True)
ring_system_list = []
for frag in frag_list:
if frag.HasSubstructMatch(self.ring_atom_pat):
for atm in frag.GetAtoms():
if atm.GetAtomicNum() == 0:
if keep_dummy:
atm.SetProp("atomLabel", "R")
else:
atm.SetAtomicNum(1)
atm.SetIsotope(0)
# Convert explict Hs to implicit
frag = Chem.RemoveAllHs(frag)
frag = self.fix_bond_stereo(frag)
ring_system_list.append(frag)
return ring_system_list
[docs]
@staticmethod
def fix_bond_stereo(mol):
"""Loop over double bonds and change stereo specification for double bonds that don't have stereo
:param mol: input RDKit molecule
:return: output RDKit molecule
"""
for bnd in mol.GetBonds():
if bnd.GetBondType() == Chem.BondType.DOUBLE:
begin_atm = bnd.GetBeginAtom()
end_atm = bnd.GetEndAtom()
# Look for double bond atoms with two attached hydrogens
if begin_atm.GetDegree() == 1 or end_atm.GetDegree() == 1:
bnd.SetStereo(Chem.BondStereo.STEREONONE)
return mol
[docs]
def find_ring_systems(self, mol, keep_dummy=False, as_mols=False):
"""Find the ring systems for an RDKit molecule
:param as_mols: return results a molecules (otherwise return SMILES)
:param keep_dummy: retain dummy atoms
:param mol: input molecule
:return: a list of SMILES corresponding to individual ring systems
"""
self.tag_bonds_to_preserve(mol)
frag_mol = self.cleave_linker_bonds(mol)
output_list = self.cleanup_fragments(frag_mol, keep_dummy=keep_dummy)
if not as_mols:
output_list = [Chem.MolToSmiles(x) for x in output_list]
return output_list
[docs]
def test_ring_system_finder():
"""A quick test for the RingSystemFinder class
:return: None
"""
mol = Chem.MolFromSmiles(r"CC(=O)[O-].CCn1c(=O)/c(=C2\Sc3ccccc3N2C)s/c1=C\C1CCC[n+]2c1sc1ccccc12")
ring_system_finder = RingSystemFinder()
ring_system_finder.find_ring_systems(mol)
[docs]
def create_ring_dictionary(input_chemreps, output_csv):
"""Read the ChEMBL chemreps.txt file, extract ring systems, write out ring systems and frequency
:param input_chemreps: ChEMBL chemreps file
:param output_csv: output csv file
:return: None
"""
ring_system_finder = RingSystemFinder()
df = pd.read_csv(input_chemreps, sep="\t")
ring_system_output_list = []
inchi_smi_dict = {}
for smi in tqdm(df.canonical_smiles):
mol = Chem.MolFromSmiles(smi)
if mol is None:
continue
ring_system_mols = ring_system_finder.find_ring_systems(mol, as_mols=True)
ring_system_inchi_list = [Chem.MolToInchiKey(x) for x in ring_system_mols]
ring_system_smiles_list = [Chem.MolToSmiles(x) for x in ring_system_mols]
for inchi_val, smi_val in zip(ring_system_inchi_list, ring_system_smiles_list):
inchi_smi_dict[inchi_val] = smi_val
ring_system_output_list += ring_system_inchi_list
df_out = pd.DataFrame(pd.Series(ring_system_output_list).value_counts())
df_out.index.name = "InChI"
df_out.columns = ['Count']
df_out = df_out.reset_index()
df_out['SMILES'] = [inchi_smi_dict.get(x) for x in df_out.InChI]
df_out[['SMILES', 'InChI', 'Count']].to_csv(output_csv, index=False)
df_no_stereo_out = generate_no_stereo_ring_systems(df_out)
df_no_stereo_out.to_csv(output_csv.replace(".csv", "_no_stereo.csv"), index=False)
[docs]
def build_ring_systems_dataframe(res: list) -> pd.DataFrame:
"""
Build a DataFrame with ring system information and minimum frequency.
:param res: List of ring system results for each molecule.
:return: DataFrame with columns for ring systems, min ring, and min frequency.
"""
res_df = pd.DataFrame()
res_df["ring_systems"] = res
freq_data = res_df.ring_systems.apply(get_min_ring_frequency)
res_df["min_ring"] = [x[0] for x in freq_data]
res_df["min_freq"] = [x[1] for x in freq_data]
return res_df
[docs]
class RingSystemLookup:
"""Lookup ring systems from a dictionary of rings and frequencies"""
def __init__(self, ring_file=None, ignore_stereo=False):
ring_csv_name = "chembl_ring_systems.csv"
if ignore_stereo:
ring_csv_name = ring_csv_name.replace(".csv", "_no_stereo.csv")
if ring_file is None:
url = f'https://raw.githubusercontent.com/PatWalters/useful_rdkit_utils/refs/heads/master/data/{ring_csv_name}'
self.rule_path = pystow.ensure('useful_rdkit_utils', 'data', url=url)
else:
self.rule_path = ring_file
self.ignore_stereo = ignore_stereo
self.ring_df = pd.read_csv(self.rule_path)
self.ring_dict = dict(self.ring_df[["InChI", "Count"]].values)
self.ring_system_finder = RingSystemFinder()
self.enumerator = rdMolStandardize.TautomerEnumerator()
[docs]
def process_mol(self, mol_in):
"""
find ring systems in an RDKit molecule
:param mol_in: input molecule
:return: list of SMILES for ring systems
"""
#mol = self.enumerator.Canonicalize(mol_in)
mol = mol_in
# mol = mol_in
output_ring_list = []
if mol:
if self.ignore_stereo:
Chem.RemoveStereochemistry(mol)
ring_system_list = self.ring_system_finder.find_ring_systems(mol, as_mols=True)
for ring in ring_system_list:
smiles = Chem.MolToSmiles(ring)
inchi = Chem.MolToInchiKey(ring)
count = self.ring_dict.get(inchi) or 0
output_ring_list.append((smiles, count))
return output_ring_list
[docs]
def process_smiles(self, smi):
"""
find ring systems from a SMILES
:param smi: input SMILES
:return: list of SMILES for ring systems
"""
res = []
mol = Chem.MolFromSmiles(smi)
if mol:
res = self.process_mol(mol)
return res
[docs]
def pandas_smiles(self, smiles_list):
"""
find ring systems from a list of SMILES
:param smiles_list: list of SMILES
:return: dataframe with ring information
"""
res = []
for smi in tqdm(smiles_list):
res.append(self.process_smiles(smi))
return build_ring_systems_dataframe(res)
[docs]
def pandas_mols(self, mol_list):
"""
find ring systems from a list of SMILES
:param smiles_list: list of SMILES
:return: dataframe with ring information
"""
res = []
for mol in tqdm(mol_list):
res.append(self.process_mol(mol))
return build_ring_systems_dataframe(res)
[docs]
def test_ring_system_lookup(input_filename, output_filename):
"""Test for RingSystemLookup
:param input_filename: input smiles file
:param output_filename: output csv file
:return: None
"""
df = pd.read_csv(input_filename, sep=" ", names=["SMILES", "Name"])
ring_system_lookup = RingSystemLookup.default()
min_freq_list = []
for smi in tqdm(df.SMILES):
freq_list = ring_system_lookup.process_smiles(smi)
min_freq_list.append(get_min_ring_frequency(freq_list))
df['min_ring'] = [x[0] for x in min_freq_list]
df['min_freq'] = [x[1] for x in min_freq_list]
df.to_csv(output_filename, index=False)
[docs]
def get_min_ring_frequency(ring_list):
"""Get minimum frequency from RingSystemLookup.process_smiles
:param ring_list: output from RingSystemLookup.process_smiles
:return: [ring_with minimum frequency, minimum frequency], acyclic molecules return ["",-1]
"""
ring_list.sort(key=itemgetter(1))
if len(ring_list):
return ring_list[0]
else:
return ["", -1]
def remove_stereo_from_smiles(smi_in):
mol = Chem.MolFromSmiles(smi_in)
smi_out = None
inchi_key = None
if mol:
Chem.RemoveStereochemistry(mol)
smi_out = Chem.MolToSmiles(mol)
inchi_key = Chem.MolToInchiKey(mol)
return smi_out, inchi_key
def generate_no_stereo_ring_systems(df):
tqdm.pandas()
df[["no_stereo_smiles", "no_stereo_inchi"]] = df.SMILES.progress_apply(remove_stereo_from_smiles).to_list()
res = []
for k, v in tqdm(df.groupby("no_stereo_inchi")):
res.append([v.no_stereo_smiles.values[0], k, v.Count.sum()])
no_stereo_ring_df = pd.DataFrame(res, columns=["SMILES", "InChI", "Count"])
no_stereo_ring_df.sort_values("Count", ascending=False, inplace=True)
df.drop(["no_stereo_smiles", "no_stereo_inchi"], axis=1, inplace=True)
return no_stereo_ring_df
# ----------- Ring stats
[docs]
def get_spiro_atoms(mol):
"""Get atoms that are part of a spiro fusion
:param mol: input RDKit molecule
:return: a list of atom numbers for atoms that are the centers of spiro fusions
"""
info = mol.GetRingInfo()
ring_sets = [set(x) for x in info.AtomRings()]
spiro_atoms = []
for i, j in itertools.combinations(ring_sets, 2):
i_and_j = (i.intersection(j))
if len(i_and_j) == 1:
spiro_atoms += list(i_and_j)
return spiro_atoms
[docs]
def max_ring_size(mol):
"""Get the size of the largest ring in a molecule
:param mol: input_molecule
:return: size of the largest ring or 0 for an acyclic molecule
"""
ri = mol.GetRingInfo()
atom_rings = ri.AtomRings()
if len(atom_rings) == 0:
return 0
else:
return max([len(x) for x in ri.AtomRings()])
[docs]
def ring_stats(mol):
"""Get some simple statistics for rings
:param mol: RDKit molecule
:return: number of rings, maximum ring size
"""
max_size = max_ring_size(mol)
num_rings = CalcNumRings(mol)
return num_rings, max_size