import pandas as pd
from itertools import combinations
from rdkit import Chem
from rdkit.Chem import DataStructs, rdFingerprintGenerator
from rdkit.Chem.Draw import MolsToGridImage
import useful_rdkit_utils as uru
import numpy as np
from tqdm.auto import tqdm
# Constants for fingerprint generation and SALI calculation.
# Radius handed to the Morgan fingerprint generator (``radius=`` argument).
MORGAN_RADIUS = 2
# Fingerprint size handed to the Morgan fingerprint generator (``fpSize=`` argument).
MORGAN_FP_SIZE = 2048
# Added to the SALI denominator ``1 - similarity`` so that a pair with
# similarity == 1 does not cause a division by zero.
SIMILARITY_EPSILON = 0.001
[docs]
def calculate_sali(data_frame: pd.DataFrame, smiles_col: str = 'SMILES',
                   activity_col: str = 'Activity') -> pd.DataFrame:
    """
    Calculate the Structure-Activity Landscape Index (SALI) for every pair of molecules.

    For each unordered pair ``(i, j)`` the value is
    ``|activity_i - activity_j| / (1 - tanimoto_ij + SIMILARITY_EPSILON)``, where the
    Tanimoto similarity is computed between Morgan fingerprints of the two molecules.

    :param data_frame: DataFrame with columns for SMILES and Activity.
    :param smiles_col: The name of the column containing SMILES strings.
    :param activity_col: The name of the column containing activity values.
    :return: DataFrame with one row per pair and the columns ``SMILES_1``,
        ``<activity_col>_1``, ``SMILES_2``, ``<activity_col>_2``, ``Delta_Activity``,
        ``Tanimoto_Similarity`` and ``SALI``. The SMILES columns are always named
        ``SMILES_1`` / ``SMILES_2``, regardless of ``smiles_col``. When the input has
        fewer than two rows the result is empty but still carries these columns.
    :raises ValueError: If one or more SMILES strings cannot be parsed.
    """
    result_columns = [
        'SMILES_1',
        f'{activity_col}_1',
        'SMILES_2',
        f'{activity_col}_2',
        'Delta_Activity',
        'Tanimoto_Similarity',
        'SALI',
    ]

    # Prepare data lists
    smiles_strings = data_frame[smiles_col].tolist()
    activities = data_frame[activity_col].tolist()
    molecules = [Chem.MolFromSmiles(smi) for smi in smiles_strings]

    # MolFromSmiles signals a parse failure by returning None; fail early with a
    # readable message instead of letting the fingerprint generator choke on None.
    invalid_smiles = [smi for smi, mol in zip(smiles_strings, molecules) if mol is None]
    if invalid_smiles:
        preview = ', '.join(repr(smi) for smi in invalid_smiles[:5])
        raise ValueError(
            f"{len(invalid_smiles)} SMILES string(s) could not be parsed, e.g. {preview}"
        )

    num_molecules = len(molecules)
    if num_molecules < 2:
        # No pairs exist; keep the schema so downstream column lookups still work.
        return pd.DataFrame(columns=result_columns)

    # Generate fingerprints
    fingerprint_generator = rdFingerprintGenerator.GetMorganGenerator(
        radius=MORGAN_RADIUS, fpSize=MORGAN_FP_SIZE
    )
    fingerprints = [fingerprint_generator.GetFingerprint(mol) for mol in molecules]

    # Total number of pairs for the progress bar: n * (n - 1) / 2
    total_pairs = (num_molecules * (num_molecules - 1)) // 2

    # Use combinations so each unordered pair is visited exactly once
    sali_records = []
    progress_bar = tqdm(combinations(range(num_molecules), 2), total=total_pairs,
                        desc="Calculating SALI")
    for i, j in progress_bar:
        similarity = DataStructs.TanimotoSimilarity(fingerprints[i], fingerprints[j])
        delta_activity = abs(activities[i] - activities[j])
        # The epsilon keeps the denominator non-zero when similarity == 1
        sali_value = delta_activity / (1 - similarity + SIMILARITY_EPSILON)
        sali_records.append({
            'SMILES_1': smiles_strings[i],
            f'{activity_col}_1': activities[i],
            'SMILES_2': smiles_strings[j],
            f'{activity_col}_2': activities[j],
            'Delta_Activity': delta_activity,
            'Tanimoto_Similarity': similarity,
            'SALI': sali_value
        })
    return pd.DataFrame(sali_records)
[docs]
def plot_sali_pairs(data_frame: pd.DataFrame,
                    smiles_col: str = 'SMILES',
                    activity_col: str = 'Activity',
                    similarity_col: str = 'Tanimoto_Similarity',
                    delta_activity_col: str = 'Delta_Activity',
                    sali_col: str = 'SALI',
                    similarity_cutoff: float = 0.5,
                    delta_activity_cutoff: float = 1.0,
                    mols_per_row: int = 4,
                    pairs_to_show: int = 10) -> object:
    """
    Filter a SALI results DataFrame and draw the top-scoring molecule pairs in a grid.

    Rows are kept when both the activity difference and the similarity are strictly
    greater than their cutoffs, then sorted by SALI (descending) and truncated to
    ``pairs_to_show``. Within each pair the compound with the larger activity value
    is placed first, so it is drawn on the left.

    :param data_frame: DataFrame containing pairwise SALI results.
    :param smiles_col: Base name of the SMILES columns (``<smiles_col>_1`` / ``_2``).
    :param activity_col: Base name of the activity columns (``<activity_col>_1`` / ``_2``).
    :param similarity_col: Column name for Tanimoto similarity.
    :param delta_activity_col: Column name for the activity difference.
    :param sali_col: Column name for the SALI index.
    :param similarity_cutoff: Similarity threshold; rows must exceed it (exclusive).
    :param delta_activity_cutoff: Activity-difference threshold; rows must exceed it (exclusive).
    :param mols_per_row: Number of molecules to display per row in the grid image.
    :param pairs_to_show: Maximum number of top SALI pairs to visualize.
    :return: The grid image returned by RDKit's ``MolsToGridImage``, with each molecule
        labelled by its activity formatted to two decimals.
    """
    # Define paired column names
    smi_1, smi_2 = f"{smiles_col}_1", f"{smiles_col}_2"
    act_1, act_2 = f"{activity_col}_1", f"{activity_col}_2"

    # Filter with boolean masks rather than an interpolated query string, so column
    # names that are not valid Python identifiers (spaces, dashes, ...) still work.
    passes_cutoffs = (
        (data_frame[delta_activity_col] > delta_activity_cutoff)
        & (data_frame[similarity_col] > similarity_cutoff)
    )
    filtered_df = (
        data_frame.loc[passes_cutoffs]
        .sort_values(sali_col, ascending=False)
        .head(pairs_to_show)
        .copy()
    )

    # Ensure the more active compound is always in the first position (left side)
    needs_swap = filtered_df[act_2] > filtered_df[act_1]
    if needs_swap.any():
        # .values drops the column labels so the assignment is positional (a real swap)
        filtered_df.loc[needs_swap, [smi_1, smi_2]] = filtered_df.loc[needs_swap, [smi_2, smi_1]].values
        filtered_df.loc[needs_swap, [act_1, act_2]] = filtered_df.loc[needs_swap, [act_2, act_1]].values

    # Align molecules to their Maximum Common Substructure (MCS), one pair at a time
    aligned_mols = []
    for s1, s2 in zip(filtered_df[smi_1], filtered_df[smi_2]):
        aligned_mols.extend(uru.mcs_align([s1, s2]))

    # Row-major flatten yields act_1, act_2 per pair, matching the molecule order above
    flattened_activities = filtered_df[[act_1, act_2]].values.flatten()
    activity_legends = [f"{val:.2f}" for val in flattened_activities]

    # Apply global RDKit drawing settings
    uru.rd_make_structures_pretty()
    return MolsToGridImage(aligned_mols, molsPerRow=mols_per_row, legends=activity_legends)