Test UNIQUE with Random Forest LogD model#
In this notebook, we showcase how to use UNIQUE to assess uncertainty quantification (UQ) methods for a random forest (RF) regressor trained to predict LogD from public data automatically downloaded from the ChEMBL database (query: document_chembl_id = CHEMBL3301361, standard_type = LogD7.4).
As a model-based uncertainty quantification method, we derive the variance of the predictions across 100 trees of the RF; a short self-contained sketch of this computation follows the list below.
Regression Task

- UQ metrics:
  - Ensemble Variance (from the property model output)
  - TanimotoDistance (from Morgan fingerprints)
- Error model:
  - UniqueRandomForestRegressor
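To make the model-based UQ metric concrete, here is a minimal, self-contained sketch of how the ensemble variance is obtained from the individual trees of a random forest. It uses a toy regression dataset, and the variable names are illustrative only; they do not come from this notebook.

import numpy as np
from sklearn.datasets import make_regression
from sklearn.ensemble import RandomForestRegressor

# Toy data and model, for illustration only
X_toy, y_toy = make_regression(n_samples=200, n_features=16, noise=0.5, random_state=0)
rf_toy = RandomForestRegressor(n_estimators=100, random_state=0).fit(X_toy, y_toy)

# Stack the per-tree predictions (shape: n_trees x n_samples) and take the
# variance across trees as a per-sample uncertainty estimate
per_tree_preds = np.stack([tree.predict(X_toy) for tree in rf_toy.estimators_])
ensemble_variance = per_tree_preds.var(axis=0)
print(ensemble_variance[:5])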
import os
from pathlib import Path
from typing import List
import numpy as np
import pandas as pd
import yaml
from rdkit.Chem import DataStructs, MolFromSmiles, Mol, rdFingerprintGenerator
from sklearn.ensemble import RandomForestRegressor
from unique.pipeline import Pipeline
# Set the project's directory
PROJECT_PATH = os.environ.get("PROJECT_PATH", os.path.abspath("")) # ALTERNATIVELY, REPLACE `os.path.abspath("")` WITH YOUR PATH TO THIS EXAMPLE'S FOLDER (notebooks/LogD_public)
%cd $PROJECT_PATH
/home/runner/work/UNIQUE/UNIQUE/notebooks/LogD_public
Download data from ChEMBL and prepare UNIQUE input#
try:
    from chembl_webresource_client.new_client import new_client
except ImportError as e:
    %pip install chembl-webresource-client
    from chembl_webresource_client.new_client import new_client
Collecting chembl-webresource-client
Downloading chembl_webresource_client-0.10.9-py3-none-any.whl.metadata (1.4 kB)
Requirement already satisfied: urllib3 in /home/runner/work/UNIQUE/UNIQUE/.conda/unique/lib/python3.12/site-packages (from chembl-webresource-client) (2.2.3)
Requirement already satisfied: requests>=2.18.4 in /home/runner/work/UNIQUE/UNIQUE/.conda/unique/lib/python3.12/site-packages (from chembl-webresource-client) (2.32.3)
Collecting requests-cache~=1.2 (from chembl-webresource-client)
Downloading requests_cache-1.2.1-py3-none-any.whl.metadata (9.9 kB)
Collecting easydict (from chembl-webresource-client)
Downloading easydict-1.13-py3-none-any.whl.metadata (4.2 kB)
Requirement already satisfied: charset-normalizer<4,>=2 in /home/runner/work/UNIQUE/UNIQUE/.conda/unique/lib/python3.12/site-packages (from requests>=2.18.4->chembl-webresource-client) (3.3.2)
Requirement already satisfied: idna<4,>=2.5 in /home/runner/work/UNIQUE/UNIQUE/.conda/unique/lib/python3.12/site-packages (from requests>=2.18.4->chembl-webresource-client) (3.10)
Requirement already satisfied: certifi>=2017.4.17 in /home/runner/work/UNIQUE/UNIQUE/.conda/unique/lib/python3.12/site-packages (from requests>=2.18.4->chembl-webresource-client) (2024.8.30)
Requirement already satisfied: attrs>=21.2 in /home/runner/work/UNIQUE/UNIQUE/.conda/unique/lib/python3.12/site-packages (from requests-cache~=1.2->chembl-webresource-client) (24.2.0)
Collecting cattrs>=22.2 (from requests-cache~=1.2->chembl-webresource-client)
Downloading cattrs-24.1.2-py3-none-any.whl.metadata (8.4 kB)
Requirement already satisfied: platformdirs>=2.5 in /home/runner/work/UNIQUE/UNIQUE/.conda/unique/lib/python3.12/site-packages (from requests-cache~=1.2->chembl-webresource-client) (4.3.6)
Collecting url-normalize>=1.4 (from requests-cache~=1.2->chembl-webresource-client)
Downloading url_normalize-1.4.3-py2.py3-none-any.whl.metadata (3.1 kB)
Requirement already satisfied: six in /home/runner/work/UNIQUE/UNIQUE/.conda/unique/lib/python3.12/site-packages (from url-normalize>=1.4->requests-cache~=1.2->chembl-webresource-client) (1.16.0)
Downloading chembl_webresource_client-0.10.9-py3-none-any.whl (55 kB)
Downloading requests_cache-1.2.1-py3-none-any.whl (61 kB)
Downloading easydict-1.13-py3-none-any.whl (6.8 kB)
Downloading cattrs-24.1.2-py3-none-any.whl (66 kB)
Downloading url_normalize-1.4.3-py2.py3-none-any.whl (6.8 kB)
Installing collected packages: easydict, url-normalize, cattrs, requests-cache, chembl-webresource-client
Successfully installed cattrs-24.1.2 chembl-webresource-client-0.10.9 easydict-1.13 requests-cache-1.2.1 url-normalize-1.4.3
Note: you may need to restart the kernel to use updated packages.
# Download data from ChEMBL
data_path = Path('./CHEMBL3301361.csv')
if data_path.is_file():
    df = pd.read_csv(data_path)
else:
    activity = new_client.activity
    bioactivities = activity.filter(
        document_chembl_id="CHEMBL3301361",
        standard_type="LogD7.4",
    ).only('molecule_chembl_id', 'canonical_smiles', 'standard_value')
    df = pd.DataFrame.from_records(bioactivities)
    # Save locally
    df.to_csv(data_path, index=False)

df.head()
|   | canonical_smiles | molecule_chembl_id | standard_value | value |
|---|---|---|---|---|
| 0 | COc1cc(F)ccc1-c1cncc(CNCC2CC2)n1 | CHEMBL1778865 | 1.8 | 1.8 |
| 1 | COc1cc(F)ccc1-c1cncc(CNCC2CC2)n1 | CHEMBL1778865 | 1.8 | 1.8 |
| 2 | CN(C(=O)Cc1ccc(-n2cnnn2)cc1)[C@@H]1CCN(Cc2ccc(... | CHEMBL2010849 | 1.8 | 1.8 |
| 3 | COc1cc2c(c(OC)c1OC)-c1ccc(O)cc1[C@@H](NC(C)=O)CC2 | CHEMBL486504 | 1.81 | 1.81 |
| 4 | N#Cc1cccc(S(=O)(=O)NC(=O)N2CCC(N3CCC(Oc4ccc(Cl... | CHEMBL2171044 | 1.81 | 1.81 |
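The raw ChEMBL export can contain missing SMILES, values stored as strings, and repeated measurements of the same compound (e.g., rows 0 and 1 above). The sketch below shows one optional way to clean it; it is not applied in this notebook, and the choice of aggregating replicates with the median is an assumption.

# Optional cleaning sketch (not applied below): drop missing SMILES, coerce values
# to float, and aggregate replicate measurements per compound with the median
df_clean = df.dropna(subset=['canonical_smiles']).copy()
df_clean['standard_value'] = pd.to_numeric(df_clean['standard_value'], errors='coerce')
df_clean = (
    df_clean.dropna(subset=['standard_value'])
            .groupby(['molecule_chembl_id', 'canonical_smiles'], as_index=False)['standard_value']
            .median()
)
df_clean.head()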
Generate fingerprints#
def generate_fingerprints(molecules_list: List[Mol]) -> List:
    """Compute the corresponding Morgan fingerprints representation for the input molecules.

    Args:
        molecules_list (List[Mol]):
            A list of `rdkit.Chem.Mol` objects (e.g., obtained from rdkit-compatible SMILES codes).

    Returns:
        A list of the corresponding Morgan fingerprints as binary numpy arrays.
    """
    fps = []
    # Check if we have any molecule objects
    if not molecules_list:
        return fps
    # Generate count fingerprints and convert them to numpy arrays for sklearn
    for fp in rdFingerprintGenerator.GetCountFPs(molecules_list, fpType=rdFingerprintGenerator.MorganFP):
        cv = np.zeros((0,), np.int8)
        DataStructs.ConvertToNumpyArray(fp, cv)
        fps.append(cv)
    # Binarize the counts to obtain 0/1 fingerprints
    fps = [x.astype(bool).astype(int) for x in fps]
    return fps
# generate fingerprints
mols = [MolFromSmiles(i) for i in df.canonical_smiles]
fp = generate_fingerprints(mols)
df['fingerprints'] = [i.tolist() for i in fp]
df = df.drop(columns='value')
df.head()
|   | canonical_smiles | molecule_chembl_id | standard_value | fingerprints |
|---|---|---|---|---|
| 0 | COc1cc(F)ccc1-c1cncc(CNCC2CC2)n1 | CHEMBL1778865 | 1.8 | [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... |
| 1 | COc1cc(F)ccc1-c1cncc(CNCC2CC2)n1 | CHEMBL1778865 | 1.8 | [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... |
| 2 | CN(C(=O)Cc1ccc(-n2cnnn2)cc1)[C@@H]1CCN(Cc2ccc(... | CHEMBL2010849 | 1.8 | [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... |
| 3 | COc1cc2c(c(OC)c1OC)-c1ccc(O)cc1[C@@H](NC(C)=O)CC2 | CHEMBL486504 | 1.81 | [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... |
| 4 | N#Cc1cccc(S(=O)(=O)NC(=O)N2CCC(N3CCC(Oc4ccc(Cl... | CHEMBL2171044 | 1.81 | [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... |
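As an optional sanity check, the fingerprints should all share the generator's default Morgan length (2048 bits, unless the generator size is changed) and be strictly binary after the conversion above:

# Optional sanity check on the fingerprints
print(len(fp), len(fp[0]))   # number of molecules, fingerprint length (2048 by default)
print(np.unique(fp[0]))      # values should only be 0 and 1 after binarization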
Create training, calibration, and test sets#
import random
seed = 1
x = list(range(df.shape[0]))
random.Random(seed).shuffle(x)
perc_train = 0.5
perc_calib = 0.3
limit_train = int(len(x) * perc_train)
limit_calib = int(len(x) * perc_calib)
index_train = x[:limit_train]
index_calib = x[limit_train:(limit_train + limit_calib)]
index_test = x[(limit_train + limit_calib):]
df['which_set'] = None
df.loc[df.index[index_train], 'which_set'] = 'TRAIN'
df.loc[df.index[index_calib], 'which_set'] = 'CALIBRATION'
df.loc[df.index[index_test], 'which_set'] = 'TEST'
df.which_set.value_counts() / df.which_set.shape[0] * 100
which_set
TRAIN 49.988098
CALIBRATION 29.992859
TEST 20.019043
Name: count, dtype: float64
X_train = np.stack(df[df['which_set'] == 'TRAIN']['fingerprints'].tolist())
Y_train = df[df['which_set'] == 'TRAIN']['standard_value'].values
X_calib = np.stack(df[df['which_set'] == 'CALIBRATION']['fingerprints'].tolist())
Y_calib = df[df['which_set'] == 'CALIBRATION']['standard_value'].values
X_test = np.stack(df[df['which_set'] == 'TEST']['fingerprints'].tolist())
Y_test = df[df['which_set'] == 'TEST']['standard_value'].values
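A quick, optional check that the three splits are disjoint and that the resulting feature matrices have the expected shapes:

# Optional sanity check on the split
assert set(index_train).isdisjoint(index_calib)
assert set(index_train).isdisjoint(index_test)
assert set(index_calib).isdisjoint(index_test)
print(X_train.shape, X_calib.shape, X_test.shape)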
Define RF Regressor and fit with training set#
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

def evaluate(model, test_features, test_labels):
    predictions = model.predict(test_features)
    rmse = mean_squared_error(test_labels, predictions, squared=False)
    print(f'MAE: {mean_absolute_error(test_labels, predictions):.4f}')
    print(f'RMSE: {rmse:.4f}')
    print(f'R2: {r2_score(test_labels, predictions):.4f}')
# Fitting takes roughly 1 minute
best_parameter_set = {
    'n_estimators': 200,
    'min_samples_split': 2,
    'min_samples_leaf': 2,
    'max_depth': None,
    'random_state': 42,
}
regr = RandomForestRegressor(**best_parameter_set)
regr.fit(X_train, Y_train)
print('Evaluation on calibration set:')
evaluate(regr, X_calib, Y_calib)
print('Evaluation on test set:')
evaluate(regr, X_test, Y_test)
Evaluation on calibration set:
MAE: 0.6667
RMSE: 0.8729
R2: 0.4693
Evaluation on test set:
MAE: 0.6586
RMSE: 0.8740
R2: 0.4527
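For context (optional, not part of the original workflow), these errors can be compared against a naive baseline that always predicts the mean LogD of the training set:

# Optional naive baseline: always predict the training-set mean
baseline = np.full(len(Y_test), np.mean(Y_train.astype(float)))
print(f'Baseline RMSE (test): {np.sqrt(mean_squared_error(Y_test.astype(float), baseline)):.4f}')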
Predict on the entire data set#
Xall = np.stack(df['fingerprints'].tolist())
predictions = regr.predict(Xall)
df['predictions'] = predictions
df.head()
|   | canonical_smiles | molecule_chembl_id | standard_value | fingerprints | which_set | predictions |
|---|---|---|---|---|---|---|
| 0 | COc1cc(F)ccc1-c1cncc(CNCC2CC2)n1 | CHEMBL1778865 | 1.8 | [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... | TRAIN | 1.889113 |
| 1 | COc1cc(F)ccc1-c1cncc(CNCC2CC2)n1 | CHEMBL1778865 | 1.8 | [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... | TRAIN | 1.889113 |
| 2 | CN(C(=O)Cc1ccc(-n2cnnn2)cc1)[C@@H]1CCN(Cc2ccc(... | CHEMBL2010849 | 1.8 | [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... | CALIBRATION | 2.633078 |
| 3 | COc1cc2c(c(OC)c1OC)-c1ccc(O)cc1[C@@H](NC(C)=O)CC2 | CHEMBL486504 | 1.81 | [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... | TEST | 0.788626 |
| 4 | N#Cc1cccc(S(=O)(=O)NC(=O)N2CCC(N3CCC(Oc4ccc(Cl... | CHEMBL2171044 | 1.81 | [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... | TRAIN | 2.029591 |
Collect predictions for variance estimation#
# collect predictions
numberTrees = 100
collect = []
for tree in range(numberTrees):
    collect.append(regr.estimators_[tree].predict(Xall))

# reshape predictions in order to obtain a vector of predictions for each sample
collect_all = []
for sample in range(df.shape[0]):
    predictions_trees = []
    for tree in collect:
        predictions_trees.append(tree[sample])
    collect_all.append(predictions_trees)
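The same per-sample vectors of tree predictions can also be obtained in a single vectorized step; this is an equivalent alternative to the loops above, shown here only for reference:

# Equivalent vectorized version: rows are samples, columns are trees
collect_all_vec = np.stack([est.predict(Xall) for est in regr.estimators_[:numberTrees]], axis=1)
assert np.allclose(collect_all_vec, np.array(collect_all))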
Prepare UNIQUE input#
# store it in the original dataframe
df['ensemble_input_type'] = collect_all
df = df.rename(columns={'ensemble_input_type': 'rf_predictions'})
df['variance'] = df['rf_predictions'].map(lambda x: np.var(x))
df.to_csv('./LogD_unique_input_data.csv')
df = pd.read_csv("./LogD_unique_input_data.csv")
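Optionally, take a quick look at the reloaded input file; note that after the CSV round-trip, list-valued columns such as rf_predictions are read back as strings:

# Optional: inspect the columns written to the UNIQUE input file
print(df.columns.tolist())
print(type(df.loc[0, 'rf_predictions']))  # lists become strings after the CSV round-trip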
UNIQUE Pipeline#
To evaluate the UQ methods of interest, including the additional UQ methods generated by the UNIQUE pipeline itself, run the pipeline's fit() method. This assesses their performance using three main UQ evaluation types: ranking, proper scoring rules, and calibration curves.
The summary tables report, for each UQ method, the scores of the UQ evaluation metrics associated with each evaluation type. The UQ method with the highest score is highlighted in green as the best-performing method.
Following the summary tables, individual plots showcase the performance of the best UQ methods, and summary plots for all evaluated UQ methods provide a comprehensive overview of their performance.
def overwrite_paths(yaml_file: str, project_path: str, input_data_file: str = "LogD_unique_input_data.csv"):
    """Given a yaml UNIQUE config file, overwrite the `data_path` and `output_path` fields."""
    # Use ruamel.yaml to preserve comments
    from ruamel.yaml import YAML
    yaml = YAML()

    # Read
    with open(yaml_file, "r") as f:
        # If you want the equivalent of yaml.safe_load use `typ="safe"`
        config = yaml.load(f)  # defaults to the `typ="rt"` (round-trip) argument

    # Overwrite
    config["data_path"] = os.path.join(project_path, input_data_file)
    config["output_path"] = os.path.join(project_path, "output")

    # Save
    with open(yaml_file, "w") as f:
        yaml.dump(config, f)
config_file = f'{PROJECT_PATH}/logD_RFmfp_unique_input_config.yaml'

# Replace `data_path` and `output_path` to be able to run the notebook automatically
# overwrite_paths(config_file, PROJECT_PATH) # UNCOMMENT TO ENABLE OVERWRITING

pipeline = Pipeline.from_config(config_file)

# Compute UQ metrics, train error models (if any), and evaluate the UQ metrics
output, eval_results = pipeline.fit()
# Optionally save the computed UQ metrics
pd.DataFrame.from_dict(output).to_csv(pipeline.output_path / "uq_metrics_values.csv", index=False)
# `eval_results` is a dict containing the evaluation data used to generate the plots