Test UNIQUE with Random Forest LogD model#
In this notebook, we showcase how to use UNIQUE to assess uncertainty quantification (UQ) methods for a random forest (RF) regressor trained to predict LogD from public data automatically downloaded from the ChEMBL database (query: document_chembl_id = CHEMBL3301361, standard_type = LogD7.4).
As a model-based uncertainty quantification method, we derive the variance of the predictions across 100 trees of the RF; a short self-contained sketch of this computation follows the list below.
Regression Task

- UQ metrics:
  - Ensemble Variance (from the property model output)
  - TanimotoDistance (from Morgan fingerprints)
- Error model:
  - UniqueRandomForestRegressor
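To make the model-based UQ metric concrete, here is a minimal, self-contained sketch of how the ensemble variance is obtained from the individual trees of a random forest. It uses a toy regression dataset, and the variable names are illustrative only; they do not come from this notebook.

import numpy as np
from sklearn.datasets import make_regression
from sklearn.ensemble import RandomForestRegressor

# Toy data and model, for illustration only
X_toy, y_toy = make_regression(n_samples=200, n_features=16, noise=0.5, random_state=0)
rf_toy = RandomForestRegressor(n_estimators=100, random_state=0).fit(X_toy, y_toy)

# Stack the per-tree predictions (shape: n_trees x n_samples) and take the
# variance across trees as a per-sample uncertainty estimate
per_tree_preds = np.stack([tree.predict(X_toy) for tree in rf_toy.estimators_])
ensemble_variance = per_tree_preds.var(axis=0)
print(ensemble_variance[:5])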
import os
from pathlib import Path
from typing import List
import numpy as np
import pandas as pd
import yaml
from rdkit.Chem import DataStructs, MolFromSmiles, Mol, rdFingerprintGenerator
from sklearn.ensemble import RandomForestRegressor
from unique.pipeline import Pipeline
# Set the project's directory
PROJECT_PATH = os.environ.get("PROJECT_PATH", os.path.abspath("")) # ALTERNATIVELY, REPLACE `os.path.abspath("")` WITH YOUR PATH TO THIS EXAMPLE'S FOLDER (notebooks/LogD_public)
%cd $PROJECT_PATH
/home/runner/work/UNIQUE/UNIQUE/notebooks/LogD_public
Download data from ChEMBL and prepare UNIQUE input#
try:
    from chembl_webresource_client.new_client import new_client
except ImportError as e:
    %pip install chembl-webresource-client
    from chembl_webresource_client.new_client import new_client
Collecting chembl-webresource-client
Downloading chembl_webresource_client-0.10.9-py3-none-any.whl.metadata (1.4 kB)
Requirement already satisfied: urllib3 in /home/runner/work/UNIQUE/UNIQUE/.conda/unique/lib/python3.12/site-packages (from chembl-webresource-client) (2.2.3)
Requirement already satisfied: requests>=2.18.4 in /home/runner/work/UNIQUE/UNIQUE/.conda/unique/lib/python3.12/site-packages (from chembl-webresource-client) (2.32.3)
Collecting requests-cache~=1.2 (from chembl-webresource-client)
Downloading requests_cache-1.2.1-py3-none-any.whl.metadata (9.9 kB)
Collecting easydict (from chembl-webresource-client)
Downloading easydict-1.13-py3-none-any.whl.metadata (4.2 kB)
Requirement already satisfied: charset-normalizer<4,>=2 in /home/runner/work/UNIQUE/UNIQUE/.conda/unique/lib/python3.12/site-packages (from requests>=2.18.4->chembl-webresource-client) (3.3.2)
Requirement already satisfied: idna<4,>=2.5 in /home/runner/work/UNIQUE/UNIQUE/.conda/unique/lib/python3.12/site-packages (from requests>=2.18.4->chembl-webresource-client) (3.10)
Requirement already satisfied: certifi>=2017.4.17 in /home/runner/work/UNIQUE/UNIQUE/.conda/unique/lib/python3.12/site-packages (from requests>=2.18.4->chembl-webresource-client) (2024.8.30)
Requirement already satisfied: attrs>=21.2 in /home/runner/work/UNIQUE/UNIQUE/.conda/unique/lib/python3.12/site-packages (from requests-cache~=1.2->chembl-webresource-client) (24.2.0)
Collecting cattrs>=22.2 (from requests-cache~=1.2->chembl-webresource-client)
Downloading cattrs-24.1.2-py3-none-any.whl.metadata (8.4 kB)
Requirement already satisfied: platformdirs>=2.5 in /home/runner/work/UNIQUE/UNIQUE/.conda/unique/lib/python3.12/site-packages (from requests-cache~=1.2->chembl-webresource-client) (4.3.6)
Collecting url-normalize>=1.4 (from requests-cache~=1.2->chembl-webresource-client)
Downloading url_normalize-1.4.3-py2.py3-none-any.whl.metadata (3.1 kB)
Requirement already satisfied: six in /home/runner/work/UNIQUE/UNIQUE/.conda/unique/lib/python3.12/site-packages (from url-normalize>=1.4->requests-cache~=1.2->chembl-webresource-client) (1.16.0)
Downloading chembl_webresource_client-0.10.9-py3-none-any.whl (55 kB)
Downloading requests_cache-1.2.1-py3-none-any.whl (61 kB)
Downloading easydict-1.13-py3-none-any.whl (6.8 kB)
Downloading cattrs-24.1.2-py3-none-any.whl (66 kB)
Downloading url_normalize-1.4.3-py2.py3-none-any.whl (6.8 kB)
Installing collected packages: easydict, url-normalize, cattrs, requests-cache, chembl-webresource-client
Successfully installed cattrs-24.1.2 chembl-webresource-client-0.10.9 easydict-1.13 requests-cache-1.2.1 url-normalize-1.4.3
Note: you may need to restart the kernel to use updated packages.
# Download data from ChEMBL
data_path = Path('./CHEMBL3301361.csv')
if data_path.is_file():
    df = pd.read_csv(data_path)
else:
    activity = new_client.activity
    bioactivities = activity.filter(
        document_chembl_id="CHEMBL3301361",
        standard_type="LogD7.4",
    ).only('molecule_chembl_id', 'canonical_smiles', 'standard_value')
    df = pd.DataFrame.from_records(bioactivities)
    # Save locally
    df.to_csv(data_path, index=False)

df.head()
|   | canonical_smiles | molecule_chembl_id | standard_value | value |
|---|---|---|---|---|
| 0 | COc1cc(F)ccc1-c1cncc(CNCC2CC2)n1 | CHEMBL1778865 | 1.8 | 1.8 |
| 1 | COc1cc(F)ccc1-c1cncc(CNCC2CC2)n1 | CHEMBL1778865 | 1.8 | 1.8 |
| 2 | CN(C(=O)Cc1ccc(-n2cnnn2)cc1)[C@@H]1CCN(Cc2ccc(... | CHEMBL2010849 | 1.8 | 1.8 |
| 3 | COc1cc2c(c(OC)c1OC)-c1ccc(O)cc1[C@@H](NC(C)=O)CC2 | CHEMBL486504 | 1.81 | 1.81 |
| 4 | N#Cc1cccc(S(=O)(=O)NC(=O)N2CCC(N3CCC(Oc4ccc(Cl... | CHEMBL2171044 | 1.81 | 1.81 |
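The raw ChEMBL export can contain missing SMILES, values stored as strings, and repeated measurements of the same compound (e.g., rows 0 and 1 above). The sketch below shows one optional way to clean it; it is not applied in this notebook, and the choice of aggregating replicates with the median is an assumption.

# Optional cleaning sketch (not applied below): drop missing SMILES, coerce values
# to float, and aggregate replicate measurements per compound with the median
df_clean = df.dropna(subset=['canonical_smiles']).copy()
df_clean['standard_value'] = pd.to_numeric(df_clean['standard_value'], errors='coerce')
df_clean = (
    df_clean.dropna(subset=['standard_value'])
            .groupby(['molecule_chembl_id', 'canonical_smiles'], as_index=False)['standard_value']
            .median()
)
df_clean.head()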
Generate fingerprints#
def generate_fingerprints(molecules_list: List[Mol]) -> List:
    """Compute the corresponding Morgan fingerprints representation for the input molecules.

    Args:
        molecules_list (List[Mol]):
            A list of `rdkit.Chem.Mol` objects (e.g., obtained from rdkit-compatible SMILES codes).

    Returns:
        A list of the corresponding Morgan fingerprints as binary numpy arrays.
    """
    fps = []
    # Check if we have any molecule objects
    if not molecules_list:
        return fps
    # Generate count fingerprints and convert them to numpy arrays for sklearn
    for fp in rdFingerprintGenerator.GetCountFPs(molecules_list, fpType=rdFingerprintGenerator.MorganFP):
        cv = np.zeros((0,), np.int8)
        DataStructs.ConvertToNumpyArray(fp, cv)
        fps.append(cv)
    # Binarize the counts to obtain 0/1 fingerprints
    fps = [x.astype(bool).astype(int) for x in fps]
    return fps
# generate fingerprints
mols = [MolFromSmiles(i) for i in df.canonical_smiles]
fp = generate_fingerprints(mols)
df['fingerprints'] = [i.tolist() for i in fp]
df = df.drop(columns='value')
df.head()
|   | canonical_smiles | molecule_chembl_id | standard_value | fingerprints |
|---|---|---|---|---|
| 0 | COc1cc(F)ccc1-c1cncc(CNCC2CC2)n1 | CHEMBL1778865 | 1.8 | [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... |
| 1 | COc1cc(F)ccc1-c1cncc(CNCC2CC2)n1 | CHEMBL1778865 | 1.8 | [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... |
| 2 | CN(C(=O)Cc1ccc(-n2cnnn2)cc1)[C@@H]1CCN(Cc2ccc(... | CHEMBL2010849 | 1.8 | [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... |
| 3 | COc1cc2c(c(OC)c1OC)-c1ccc(O)cc1[C@@H](NC(C)=O)CC2 | CHEMBL486504 | 1.81 | [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... |
| 4 | N#Cc1cccc(S(=O)(=O)NC(=O)N2CCC(N3CCC(Oc4ccc(Cl... | CHEMBL2171044 | 1.81 | [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... |
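As an optional sanity check, the fingerprints should all share the generator's default Morgan length (2048 bits, unless the generator size is changed) and be strictly binary after the conversion above:

# Optional sanity check on the fingerprints
print(len(fp), len(fp[0]))   # number of molecules, fingerprint length (2048 by default)
print(np.unique(fp[0]))      # values should only be 0 and 1 after binarization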
Create training, calibration, and test sets#
import random
seed = 1
x = list(range(df.shape[0]))
random.Random(seed).shuffle(x)
perc_train = 0.5
perc_calib = 0.3
limit_train = int(len(x) * perc_train)
limit_calib = int(len(x) * perc_calib)
index_train = x[:limit_train]
index_calib = x[limit_train:(limit_train + limit_calib)]
index_test = x[(limit_train + limit_calib):]
df['which_set'] = None
df.loc[df.index[index_train], 'which_set'] = 'TRAIN'
df.loc[df.index[index_calib], 'which_set'] = 'CALIBRATION'
df.loc[df.index[index_test], 'which_set'] = 'TEST'
df.which_set.value_counts() / df.which_set.shape[0] * 100
which_set
TRAIN 49.988098
CALIBRATION 29.992859
TEST 20.019043
Name: count, dtype: float64
X_train = np.stack(df[df['which_set'] == 'TRAIN']['fingerprints'].tolist())
Y_train = df[df['which_set'] == 'TRAIN']['standard_value'].values
X_calib = np.stack(df[df['which_set'] == 'CALIBRATION']['fingerprints'].tolist())
Y_calib = df[df['which_set'] == 'CALIBRATION']['standard_value'].values
X_test = np.stack(df[df['which_set'] == 'TEST']['fingerprints'].tolist())
Y_test = df[df['which_set'] == 'TEST']['standard_value'].values
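A quick, optional check that the three splits are disjoint and that the resulting feature matrices have the expected shapes:

# Optional sanity check on the split
assert set(index_train).isdisjoint(index_calib)
assert set(index_train).isdisjoint(index_test)
assert set(index_calib).isdisjoint(index_test)
print(X_train.shape, X_calib.shape, X_test.shape)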
Define RF Regressor and fit with training set#
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

def evaluate(model, test_features, test_labels):
    predictions = model.predict(test_features)
    rmse = mean_squared_error(test_labels, predictions, squared=False)
    print(f'MAE: {mean_absolute_error(test_labels, predictions):.4f}')
    print(f'RMSE: {rmse:.4f}')
    print(f'R2: {r2_score(test_labels, predictions):.4f}')
# Fitting takes roughly 1 minute
best_parameter_set = {
    'n_estimators': 200,
    'min_samples_split': 2,
    'min_samples_leaf': 2,
    'max_depth': None,
    'random_state': 42,
}
regr = RandomForestRegressor(**best_parameter_set)
regr.fit(X_train, Y_train)
print('Evaluation on calibration set:')
evaluate(regr, X_calib, Y_calib)
print('Evaluation on test set:')
evaluate(regr, X_test, Y_test)
Evaluation on calibration set:
MAE: 0.6667
RMSE: 0.8729
R2: 0.4693
Evaluation on test set:
MAE: 0.6586
RMSE: 0.8740
R2: 0.4527
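For context (optional, not part of the original workflow), these errors can be compared against a naive baseline that always predicts the mean LogD of the training set:

# Optional naive baseline: always predict the training-set mean
baseline = np.full(len(Y_test), np.mean(Y_train.astype(float)))
print(f'Baseline RMSE (test): {np.sqrt(mean_squared_error(Y_test.astype(float), baseline)):.4f}')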
Predict on the entire data set#
Xall = np.stack(df['fingerprints'].tolist())
predictions = regr.predict(Xall)
df['predictions'] = predictions
df.head()
|   | canonical_smiles | molecule_chembl_id | standard_value | fingerprints | which_set | predictions |
|---|---|---|---|---|---|---|
| 0 | COc1cc(F)ccc1-c1cncc(CNCC2CC2)n1 | CHEMBL1778865 | 1.8 | [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... | TRAIN | 1.889113 |
| 1 | COc1cc(F)ccc1-c1cncc(CNCC2CC2)n1 | CHEMBL1778865 | 1.8 | [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... | TRAIN | 1.889113 |
| 2 | CN(C(=O)Cc1ccc(-n2cnnn2)cc1)[C@@H]1CCN(Cc2ccc(... | CHEMBL2010849 | 1.8 | [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... | CALIBRATION | 2.633078 |
| 3 | COc1cc2c(c(OC)c1OC)-c1ccc(O)cc1[C@@H](NC(C)=O)CC2 | CHEMBL486504 | 1.81 | [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... | TEST | 0.788626 |
| 4 | N#Cc1cccc(S(=O)(=O)NC(=O)N2CCC(N3CCC(Oc4ccc(Cl... | CHEMBL2171044 | 1.81 | [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... | TRAIN | 2.029591 |
Collect predictions for variance estimation#
# collect predictions
numberTrees = 100
collect = []
for tree in range(numberTrees):
    collect.append(regr.estimators_[tree].predict(Xall))

# reshape predictions in order to obtain a vector of predictions for each sample
collect_all = []
for sample in range(df.shape[0]):
    predictions_trees = []
    for tree in collect:
        predictions_trees.append(tree[sample])
    collect_all.append(predictions_trees)
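The same per-sample vectors of tree predictions can also be obtained in a single vectorized step; this is an equivalent alternative to the loops above, shown here only for reference:

# Equivalent vectorized version: rows are samples, columns are trees
collect_all_vec = np.stack([est.predict(Xall) for est in regr.estimators_[:numberTrees]], axis=1)
assert np.allclose(collect_all_vec, np.array(collect_all))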
Prepare UNIQUE input#
# store it in the original dataframe
df['ensemble_input_type'] = collect_all
df = df.rename(columns={'ensemble_input_type': 'rf_predictions'})
df['variance'] = df['rf_predictions'].map(lambda x: np.var(x))
df.to_csv('./LogD_unique_input_data.csv')
df = pd.read_csv("./LogD_unique_input_data.csv")
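Optionally, take a quick look at the reloaded input file; note that after the CSV round-trip, list-valued columns such as rf_predictions are read back as strings:

# Optional: inspect the columns written to the UNIQUE input file
print(df.columns.tolist())
print(type(df.loc[0, 'rf_predictions']))  # lists become strings after the CSV round-trip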
UNIQUE Pipeline#
To evaluate the UQ methods of interest, including the additional UQ methods generated by the UNIQUE pipeline itself, run the pipeline's fit() method. This assesses their performance using three main UQ evaluation types: ranking, proper scoring rules, and calibration curves.
The summary tables report, for each UQ method, the scores of the UQ evaluation metrics associated with each evaluation type. The UQ method with the highest score is highlighted in green as the best-performing method.
Following the summary tables, individual plots showcase the performance of the best UQ methods, and summary plots for all evaluated UQ methods provide a comprehensive overview of their performance.
def overwrite_paths(yaml_file: str, project_path: str, input_data_file: str = "LogD_unique_input_data.csv"):
    """Given a yaml UNIQUE config file, overwrite the `data_path` and `output_path` fields."""
    # Use ruamel.yaml to preserve comments
    from ruamel.yaml import YAML
    yaml = YAML()

    # Read
    with open(yaml_file, "r") as f:
        # If you want the equivalent of yaml.safe_load use `typ="safe"`
        config = yaml.load(f)  # defaults to the `typ="rt"` (round-trip) argument

    # Overwrite
    config["data_path"] = os.path.join(project_path, input_data_file)
    config["output_path"] = os.path.join(project_path, "output")

    # Save
    with open(yaml_file, "w") as f:
        yaml.dump(config, f)
config_file = f'{PROJECT_PATH}/logD_RFmfp_unique_input_config.yaml'

# Replace `data_path` and `output_path` to be able to run the notebook automatically
# overwrite_paths(config_file, PROJECT_PATH) # UNCOMMENT TO ENABLE OVERWRITING

pipeline = Pipeline.from_config(config_file)

# Compute UQ metrics, train error models (if any), and evaluate the UQ metrics
output, eval_results = pipeline.fit()
# Optionally save the computed UQ metrics
pd.DataFrame.from_dict(output).to_csv(pipeline.output_path / "uq_metrics_values.csv", index=False)
# `eval_results` is a dict containing the evaluation data used to generate the plots