import itertools
import sys
from glob import glob
from typing import Tuple, List
import numpy as np
import pandas as pd
from rdkit import Chem
from rdkit.Chem.rdMMPA import FragmentMol
from rdkit.Chem.rdRGroupDecomposition import RGroupDecompose
from rdkit.Chem.rdchem import Mol
from tqdm.auto import tqdm
from .misc_utils import get_largest_fragment
[docs]
def cleanup_fragment(mol: Mol) -> Tuple[Mol, int]:
"""
Replace atom map numbers with Hydrogens
:param mol: input molecule
:return: modified molecule, number of R-groups
"""
rgroup_count = 0
for atm in mol.GetAtoms():
atm.SetAtomMapNum(0)
if atm.GetAtomicNum() == 0:
rgroup_count += 1
atm.SetAtomicNum(1)
mol = Chem.RemoveAllHs(mol)
return mol, rgroup_count
[docs]
def generate_fragments(mol: Mol) -> pd.DataFrame:
"""
Generate fragments using the RDKit
:param mol: RDKit molecule
:return: a Pandas dataframe with Scaffold SMILES, Number of Atoms, Number of R-Groups
"""
# Generate molecule fragments
frag_list = FragmentMol(mol)
# Flatten the output into a single list
flat_frag_list = [x for x in itertools.chain(*frag_list) if x]
# The output of Fragment mol is contained in single molecules. Extract the largest fragment from each molecule
flat_frag_list = [get_largest_fragment(x) for x in flat_frag_list]
# Keep fragments where the number of atoms in the fragment is at least 2/3 of the number fragments in
# input molecule
num_mol_atoms = mol.GetNumAtoms()
flat_frag_list = [x for x in flat_frag_list if x.GetNumAtoms() / num_mol_atoms > 0.67]
# remove atom map numbers from the fragments
flat_frag_list = [cleanup_fragment(x) for x in flat_frag_list]
# Convert fragments to SMILES
frag_smiles_list = [[Chem.MolToSmiles(x), x.GetNumAtoms(), y] for (x, y) in flat_frag_list]
# Add the input molecule to the fragment list
frag_smiles_list.append([Chem.MolToSmiles(mol), mol.GetNumAtoms(), 1])
# Put the results into a Pandas dataframe
frag_df = pd.DataFrame(frag_smiles_list, columns=["Scaffold", "NumAtoms", "NumRgroupgs"])
# Remove duplicate fragments
frag_df = frag_df.drop_duplicates("Scaffold")
return frag_df
[docs]
def find_scaffolds(df_in: pd.DataFrame, smiles_col="SMILES", name_col="Name",disable_progress=False) -> Tuple[pd.DataFrame, pd.DataFrame]:
"""
Generate scaffolds for a set of molecules
:param df_in: Pandas dataframe with [SMILES, Name, RDKit molecule] columns
:return: dataframe with molecules and scaffolds, dataframe with unique scaffolds
"""
# Loop over molecules and generate fragments, fragments for each molecule are returned as a Pandas dataframe
df_list = []
for smiles, name, mol in tqdm(df_in[[smiles_col, name_col, "mol"]].values, disable=disable_progress):
tmp_df = generate_fragments(mol).copy()
tmp_df['Name'] = name
tmp_df['SMILES'] = smiles
df_list.append(tmp_df)
# Combine the list of dataframes into a single dataframe
mol_df = pd.concat(df_list)
# Collect scaffolds
scaffold_list = []
for k, v in mol_df.groupby("Scaffold"):
scaffold_list.append([k, len(v.Name.unique()), v.NumAtoms.values[0]])
scaffold_df = pd.DataFrame(scaffold_list, columns=["Scaffold", "Count", "NumAtoms"])
# Any fragment that occurs more times than the number of fragments can't be a scaffold
num_df_rows = len(df_in)
scaffold_df = scaffold_df.query("Count <= @num_df_rows")
# Sort scaffolds by frequency
scaffold_df = scaffold_df.sort_values(["Count", "NumAtoms"], ascending=[False, False])
return mol_df, scaffold_df
[docs]
def get_molecules_with_scaffold(
scaffold: str,
mol_df: pd.DataFrame,
activity_df: pd.DataFrame,
smiles_col: str = "SMILES",
name_col: str = "Name",
activity_col: str = "pIC50",
extra_cols: List[str] = []
) -> Tuple[List[str], pd.DataFrame]:
"""
Find molecules in a DataFrame that match a given scaffold and return their core structures and activity data.
:param scaffold: SMILES string of the scaffold to match.
:param mol_df: DataFrame containing molecule information and fragments.
:param activity_df: DataFrame containing activity data for molecules.
:param smiles_col: Name of the column containing SMILES strings.
:param name_col: Name of the column containing molecule names.
:param activity_col: Name of the column containing activity values.
:param extra_cols: Additional columns to include in the output DataFrame.
:return: A tuple with a list of unique core SMILES and a DataFrame of matching molecules with activity data.
"""
match_df = mol_df.query("Scaffold == @scaffold").copy()
# Generate RDKit molecules if not already present in activity_df
if "mol" not in activity_df.columns:
match_df['mol'] = match_df.SMILES.apply(Chem.MolFromSmiles)
merge_df = match_df.merge(activity_df, left_on=["SMILES","Name"],
right_on=[smiles_col, name_col], how="left")
scaffold_mol = Chem.MolFromSmiles(scaffold)
Chem.RemoveStereochemistry(scaffold_mol)
rgroup_match, rgroup_miss = RGroupDecompose(
scaffold_mol, merge_df.mol, asSmiles=True
)
output_cols = [smiles_col, name_col, activity_col] + extra_cols
if len(rgroup_match):
rgroup_df = pd.DataFrame(rgroup_match)
return rgroup_df.Core.unique(), merge_df[output_cols]
else:
return [], merge_df[output_cols]
[docs]
def main():
"""
Read all SMILES files in the current directory, generate scaffolds, report stats for each scaffold
:return: None
"""
filename_list = glob("CHEMBL237.smi")
out_list = []
for filename in filename_list:
print(filename)
input_df = pd.read_csv(filename, names=["SMILES", "Name", "pIC50"])
input_df['mol'] = input_df.SMILES.apply(Chem.MolFromSmiles)
input_df['SMILES'] = input_df.mol.apply(Chem.MolToSmiles)
mol_df, scaffold_df = find_scaffolds(input_df)
scaffold_1 = scaffold_df.Scaffold.values[0]
scaffold_list, scaffold_mol_df = get_molecules_with_scaffold(scaffold_1, mol_df, input_df)
ic50_list = scaffold_mol_df.pIC50
if len(scaffold_list):
print(scaffold_list[0], len(scaffold_mol_df), max(ic50_list) - min(ic50_list), np.std(ic50_list))
out_list.append([filename, scaffold_list[0], len(scaffold_mol_df), max(ic50_list) - min(ic50_list),
np.std(ic50_list)])
else:
print(f"Could not find a scaffold for {filename}", file=sys.stderr)
out_df = pd.DataFrame(out_list, columns=["Filename", "Scaffold", "Count", "Range", "Std"])
out_df.to_csv("scaffold_stats.csv", index=False)
if __name__ == "__main__":
main()