Source code for useful_rdkit_utils.reactions

from typing import List, Union

from rdkit.Chem.rdChemReactions import ChemicalReaction
from rdkit.Chem import AllChem
from rdkit.Chem.rdchem import Mol
from rdkit import Chem
from itertools import product
import numpy as np
import pandas as pd
from tqdm.auto import tqdm



def enumerate_library(rxn_mol: ChemicalReaction, reagent_lol: List[List[Mol]]) -> List[List[str]]:
    """
    Enumerate a library of products from a given reaction and list of reagents.

    Every combination of one reagent per inner list is run through the reaction.
    Combinations that yield no product, or whose first product fails
    sanitization, are skipped rather than aborting the whole enumeration.

    :param rxn_mol: A chemical reaction represented as an RDKit ChemicalReaction object.
    :param reagent_lol: A list of lists, where each inner list represents a set of reagents.
        Each reagent is an RDKit Mol object. The molecule object must have a "_Name"
        property that contains a string identifier.
    :return: A list of lists, where each inner list represents a product. Each product is
        represented as a list containing a SMILES string of the product and a string
        identifier formed by joining the identifiers of the reagents with "_".
    """
    sanitize_ok = Chem.rdmolops.SanitizeFlags.SANITIZE_NONE
    prod_list = []
    # itertools.product generates all combinations of reactants
    for reagents in product(*reagent_lol):
        # Build the name first so a reagent without "_Name" is reported
        # regardless of whether the combination reacts.
        name = "_".join(x.GetProp("_Name") for x in reagents)
        prod = rxn_mol.RunReactants(reagents)
        if not prod:
            continue
        # Only the first product of the first product set is kept.
        product_mol = prod[0][0]
        # catchErrors=True returns the failing operation instead of raising,
        # so one bad product does not discard the rest of the library.
        if Chem.SanitizeMol(product_mol, catchErrors=True) != sanitize_ok:
            continue
        prod_list.append([Chem.MolToSmiles(product_mol), name])
    return prod_list
def enumerate_library_sample(rxn: ChemicalReaction, reagent_lol: List[List[Mol]], num_to_generate: int) -> pd.DataFrame:
    """
    Enumerate a random sample of products from a given reaction and list of reagents.

    Reagent combinations are drawn uniformly at random without repetition. A
    combination contributes a row only if the reaction yields a product and that
    product sanitizes cleanly. Sampling stops once ``num_to_generate`` products
    have been collected or every possible combination has been tried, so the
    result may contain fewer rows than requested.

    :param rxn: A chemical reaction represented as an RDKit ChemicalReaction object.
    :param reagent_lol: A list of lists, where each inner list represents a set of reagents.
        Each reagent is an RDKit Mol object. The molecule object must have a "_Name"
        property that contains a string identifier.
    :param num_to_generate: The maximum number of products to generate. Values less than
        or equal to zero produce an empty DataFrame.
    :return: A pandas DataFrame with columns "SMILES" and "Name". Each row contains the
        SMILES string of a product and the "_"-joined identifiers of its reagents.
    """
    # Total number of distinct reagent combinations; used to guarantee termination.
    max_combinations = 1
    for reagents in reagent_lol:
        max_combinations *= len(reagents)

    sanitize_ok = Chem.rdmolops.SanitizeFlags.SANITIZE_NONE
    tried = set()  # index tuples already drawn
    used = set()  # joined names already seen (names may repeat across reagents)
    prod_list = []
    with tqdm(total=max(num_to_generate, 0)) as pbar:
        while len(prod_list) < num_to_generate and len(tried) < max_combinations:
            # Draw indices rather than objects so the draw can be tracked exactly.
            indices = tuple(int(np.random.randint(len(x))) for x in reagent_lol)
            if indices in tried:
                continue
            tried.add(indices)

            mol_list = [x[i] for x, i in zip(reagent_lol, indices)]
            mol_name = "_".join(x.GetProp("_Name") for x in mol_list)
            if mol_name in used:
                continue
            used.add(mol_name)

            prod = rxn.RunReactants(mol_list)
            if not prod:
                continue
            # Only the first product of the first product set is kept.
            prod_mol = prod[0][0]
            # catchErrors=True makes SanitizeMol report failure through its
            # return value instead of raising, so bad products are skipped.
            res = Chem.SanitizeMol(prod_mol, catchErrors=True)
            if res != sanitize_ok:
                continue
            prod_list.append([Chem.MolToSmiles(prod_mol), mol_name])
            pbar.update(1)
    sample_df = pd.DataFrame(prod_list, columns=["SMILES", "Name"])
    return sample_df
def add_molecule_name(mol_series, name_series):
    """
    Store each name on the corresponding molecule as its "_Name" property.

    The two inputs are paired positionally through their ``values`` attributes;
    pairing stops at the shorter of the two. Names are converted with ``str``.

    :param mol_series: Object exposing ``values``, an iterable of molecules that
        support ``SetProp``.
    :param name_series: Object exposing ``values``, an iterable of identifiers.
    :return: None. The molecules are modified in place.
    """
    pairs = zip(mol_series.values, name_series.values)
    for molecule, label in pairs:
        molecule.SetProp("_Name", str(label))


def reaction_demo():
    """
    Run both enumeration functions on the bundled reagent files and print the
    number of products each one returns.

    Three space-separated SMILES/Name files are read from ``../data``, each
    SMILES is parsed into a molecule, and the molecule is tagged with its name.
    A random sample of 1000 products is requested from the full reagent sets,
    and an exhaustive enumeration is run on the first 10 reagents of each set.
    """
    rxn_smarts = "N[c:4][c:3]C(O)=O.[#6:1][NH2].[#6:2]C(=O)[OH]>>[C:2]c1n[c:4][c:3]c(=O)n1[C:1]"
    rxn = AllChem.ReactionFromSmarts(rxn_smarts)

    def _load_reagents(filename):
        # Read one reagent file, parse its SMILES, and name the molecules.
        frame = pd.read_csv(f"../data/{filename}_100.smi", names=["SMILES", "Name"], sep=" ", header=None)
        frame["mol"] = frame["SMILES"].apply(Chem.MolFromSmiles)
        add_molecule_name(frame["mol"], frame["Name"])
        return frame

    reagent_frames = [
        _load_reagents(stem)
        for stem in ["aminobenzoic", "primary_amines", "carboxylic_acids"]
    ]

    full_sets = [frame["mol"].values for frame in reagent_frames]
    sampled = enumerate_library_sample(rxn, full_sets, 1000)

    small_sets = [frame["mol"].values[:10] for frame in reagent_frames]
    enumerated = enumerate_library(rxn, small_sets)

    print(len(sampled), len(enumerated))


if __name__ == "__main__":
    reaction_demo()