import pandas as pd
import useful_rdkit_utils
# from https://stackoverflow.com/questions/47136436/python-pandas-convert-value-counts-output-to-dataframe
def value_counts_df(df_in, col_in):
    """Return the value counts of one column as a two-column DataFrame.

    :param df_in: Dataframe on which to run value_counts(), must have column `col_in`.
    :param col_in: Name of the column in `df_in` for which to generate counts
    :return: A new dataframe with a fresh integer index and two columns: one named `col_in`
        holding each unique value of df_in[col_in], and one named "count" holding the number of
        rows with that value. Rows are in the order produced by value_counts() with its
        default arguments.
    """
    counts = df_in[col_in].value_counts()
    # Name the index explicitly so the label column is called `col_in` regardless of how the
    # installed pandas version names the result of value_counts().
    counts.index.name = col_in
    return counts.reset_index(name="count")
def add_molecule_and_errors(df_in, smiles_col='SMILES', mol_col_name='ROMol', error_col_name="Error"):
    """Add a molecule column and another column with associated errors to a Pandas dataframe

    The dataframe is modified in place: each value in `smiles_col` is passed to
    useful_rdkit_utils.smi2mol_with_errors and the two items it returns are stored in
    `mol_col_name` and `error_col_name` respectively.

    :param df_in: input dataframe
    :param smiles_col: name for the input SMILES column
    :param mol_col_name: name for the output molecule column
    :param error_col_name: name for the output errors column
    :return: None
    """
    parsed = df_in[smiles_col].apply(useful_rdkit_utils.smi2mol_with_errors).to_list()
    if not parsed:
        # With no rows there is nothing to unpack into two columns, and assigning an empty
        # list to a two-column key is rejected by pandas; add empty columns instead.
        for new_col in (mol_col_name, error_col_name):
            df_in[new_col] = pd.Series(index=df_in.index, dtype=object)
        return
    df_in[[mol_col_name, error_col_name]] = parsed
def split_dataframe(df, chunk_size=10000):
    """Split a dataframe into consecutive row-wise chunks

    :param df: input dataframe
    :param chunk_size: maximum number of rows per chunk, must be a positive integer
    :return: list of dataframes that together contain all rows of `df` in their original order.
        Every chunk has `chunk_size` rows except possibly the last one, which holds the
        remainder. An empty `df` yields a list with a single empty chunk.
    :raises ValueError: if `chunk_size` is smaller than 1
    """
    if chunk_size < 1:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size!r}")
    # Stepping through the start offsets avoids the trailing empty chunk that
    # `len(df) // chunk_size + 1` produces when len(df) is an exact multiple of chunk_size.
    # `.iloc` makes the slicing positional regardless of the index type.
    chunks = [df.iloc[start:start + chunk_size] for start in range(0, len(df), chunk_size)]
    if not chunks:
        # Keep at least one (empty) chunk so that pd.concat() on the result still works.
        chunks.append(df.iloc[0:0])
    return chunks
def get_dataframe_nans(df):
    """Select the rows of a dataframe that contain at least one missing value

    :param df: input dataframe
    :return: dataframe with only the rows of `df` in which any column is null
    """
    row_has_missing = df.isna().any(axis="columns")
    return df[row_has_missing]