This guide walks you through creating a complete molecular optimization pipeline using ChemLactica. You’ll learn how to define an oracle function, configure the optimization algorithm, and generate molecules optimized for specific properties.
The oracle is responsible for evaluating molecules and assigning scores. Here’s a complete example that maximizes TPSA (Topological Polar Surface Area) under two constraints: a molecule scores 0 if its molecular weight is 350 or more, or if it has fewer than 2 rings:
from typing import List

import numpy as np
from rdkit.Chem import rdMolDescriptors

from chemlactica.mol_opt.utils import MoleculeEntry


class TPSA_Weight_Oracle:
    """Oracle that scores molecules by TPSA under weight and ring constraints.

    A molecule's score is its topological polar surface area (TPSA). The
    score is forced to 0 when the exact molecular weight is 350 or more,
    when the molecule has fewer than 2 rings, or when descriptor calculation
    raises. Every unique SMILES is scored once and cached in ``mol_buffer``.
    """

    def __init__(self, max_oracle_calls: int):
        """Create an oracle with a budget of ``max_oracle_calls`` unique molecules."""
        # Maximum number of oracle calls to make
        self.max_oracle_calls = max_oracle_calls

        # The frequency with which to log
        self.freq_log = 100

        # The buffer to keep track of all unique molecules generated.
        # Maps SMILES -> [oracle_score, 1-based insertion index].
        self.mol_buffer = {}

        # The maximum possible oracle score or an upper bound
        self.max_possible_oracle_score = 800

        # If True the __call__ function takes list of MoleculeEntry objects
        # If False (or unspecified) the __call__ function takes list of SMILES strings
        self.takes_entry = True

    def __call__(self, molecules: List[MoleculeEntry]):
        """Evaluate and return the oracle scores for molecules.

        Scores are returned in the same order as ``molecules``. A molecule
        whose SMILES was seen before gets its cached score and does not
        consume budget. Intermediate results are logged each time the number
        of unique molecules reaches a multiple of ``freq_log``.
        """
        oracle_scores = []
        for molecule in molecules:
            cached = self.mol_buffer.get(molecule.smiles)
            if cached is not None:
                # The cached entry is [score, index]; the score is a scalar,
                # so it is returned as-is (wrapping it in sum() would raise).
                oracle_scores.append(cached[0])
                continue

            try:
                tpsa = rdMolDescriptors.CalcTPSA(molecule.mol)
                oracle_score = tpsa
                weight = rdMolDescriptors.CalcExactMolWt(molecule.mol)
                num_rings = rdMolDescriptors.CalcNumRings(molecule.mol)

                # Apply constraints
                if weight >= 350:
                    oracle_score = 0
                if num_rings < 2:
                    oracle_score = 0
            except Exception as e:
                # Best effort: a molecule that cannot be scored gets 0
                # instead of aborting the whole batch.
                print(e)
                oracle_score = 0

            self.mol_buffer[molecule.smiles] = [oracle_score, len(self.mol_buffer) + 1]
            # Honor the configured logging frequency rather than a literal 100.
            if len(self.mol_buffer) % self.freq_log == 0:
                self.log_intermediate()
            oracle_scores.append(oracle_score)
        return oracle_scores

    def log_intermediate(self):
        """Print the best, top-10 mean and top-100 mean scores seen so far."""
        # Only the most recent ``max_oracle_calls`` entries are considered.
        scores = [v[0] for v in self.mol_buffer.values()][-self.max_oracle_calls:]
        scores_sorted = sorted(scores, reverse=True)[:100]
        n_calls = len(self.mol_buffer)

        score_avg_top1 = np.max(scores_sorted)
        score_avg_top10 = np.mean(scores_sorted[:10])
        score_avg_top100 = np.mean(scores_sorted)

        print(f"{n_calls}/{self.max_oracle_calls} | ",
              f'avg_top1: {score_avg_top1:.3f} | '
              f'avg_top10: {score_avg_top10:.3f} | '
              f'avg_top100: {score_avg_top100:.3f}')

    def __len__(self):
        """Return the number of unique molecules scored so far."""
        return len(self.mol_buffer)

    @property
    def budget(self):
        """The maximum number of oracle calls allowed."""
        return self.max_oracle_calls

    @property
    def finish(self):
        """The stopping condition for the optimization process"""
        return len(self.mol_buffer) >= self.max_oracle_calls
The oracle’s __call__ method receives a list of MoleculeEntry objects and must return a list of scores. Set takes_entry = True to receive MoleculeEntry objects instead of SMILES strings.
pool_size: Number of top molecules to maintain in the pool
num_similars: Number of similar molecules to include in prompts
num_gens_per_iter: Number of molecules to generate per iteration
generation_temperature: Sampling temperature for diversity
strategy: Use [rej-sample-v2] for fine-tuning during optimization, or [default] for generation-only
Using strategy: [rej-sample-v2] enables rejection sampling with fine-tuning during optimization, which improves results but requires more computational resources.