Skip to main content

Overview

This example demonstrates how to use Buildr for machine learning workloads. You’ll learn how to containerize training scripts, manage dependencies, and configure resource requirements like GPU access.

Complete Example

Here’s a complete ML training pipeline that’s automatically containerized:
import numpy as np
import tensorflow as tf
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from metaparticle_pkg import Containerize
import os

# Configuration for the training job and the container image.
model_name = 'sentiment-classifier'  # container image name; also the saved-model filename stem
repository = 'docker.io/mlteam'  # Docker registry the built image is pushed to
epochs = 10  # number of passes over the training data in model.fit
batch_size = 32  # samples per gradient update in model.fit


def create_model(input_dim, num_classes):
    """Build and compile a small dense classifier.

    Architecture: two ReLU hidden layers (128 and 64 units), each followed
    by 30% dropout for regularization, ending in a softmax output layer.

    Args:
        input_dim: Number of input features per sample.
        num_classes: Width of the softmax output (number of classes).

    Returns:
        A compiled ``tf.keras.Sequential`` model using the Adam optimizer
        and sparse categorical cross-entropy loss.
    """
    layers = tf.keras.layers
    net = tf.keras.Sequential()
    net.add(layers.Dense(128, activation='relu', input_shape=(input_dim,)))
    net.add(layers.Dropout(0.3))
    net.add(layers.Dense(64, activation='relu'))
    net.add(layers.Dropout(0.3))
    net.add(layers.Dense(num_classes, activation='softmax'))

    net.compile(
        optimizer='adam',
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy'],
    )

    return net


def save_model(model, path='./models', name=None):
    """Persist a trained Keras model to disk as an HDF5 file.

    Args:
        model: Trained model exposing ``.save(filepath)``.
        path: Directory to write into; created if it does not exist.
        name: Filename stem (without extension). Defaults to the
            module-level ``model_name``, preserving the original behavior.

    Returns:
        The full path of the saved ``.h5`` file.
    """
    if name is None:
        # Fall back to the globally configured model name.
        name = model_name
    os.makedirs(path, exist_ok=True)
    model_path = os.path.join(path, f'{name}.h5')
    model.save(model_path)
    print(f"Model saved to {model_path}")
    return model_path


@Containerize(
    package={
        'name': model_name,
        'repository': repository,
        'publish': True,
        'dockerfile': 'Dockerfile.ml'  # custom Dockerfile that bakes in the ML deps
    },
    runtime={
        'executor': 'docker',
        'gpu': True,  # request GPU passthrough (needs NVIDIA Docker)
        'memory': '4Gi',  # 4GB RAM for the training container
        'cpu': '2',  # two CPU cores
    }
)
def train_model():
    """Generate a synthetic dataset, train the classifier, and save it.

    Runs end to end inside the container configured by ``@Containerize``:
    logs the environment, builds a 3-class synthetic dataset, trains
    ``create_model`` on an 80/20 split, evaluates, and saves the model.

    Returns:
        Tuple of (trained model, Keras ``History`` from ``model.fit``).
    """
    print("Starting ML training job...")
    print(f"TensorFlow version: {tf.__version__}")
    print(f"GPU Available: {tf.config.list_physical_devices('GPU')}")

    # Build a reproducible synthetic multi-class dataset for the demo.
    print("\nGenerating dataset...")
    dataset_spec = dict(
        n_samples=10000,
        n_features=20,
        n_informative=15,
        n_redundant=5,
        n_classes=3,
        random_state=42,
    )
    X, y = make_classification(**dataset_spec)

    # Hold out 20% of the samples for validation (fixed seed for repeatability).
    X_train, X_val, y_train, y_val = train_test_split(
        X, y, test_size=0.2, random_state=42)

    print(f"Training samples: {len(X_train)}")
    print(f"Validation samples: {len(X_val)}")

    print("\nBuilding model...")
    model = create_model(input_dim=X_train.shape[1], num_classes=3)
    model.summary()

    print("\nTraining model...")
    history = model.fit(X_train, y_train,
                        validation_data=(X_val, y_val),
                        epochs=epochs,
                        batch_size=batch_size,
                        verbose=1)

    print("\nEvaluating model...")
    val_loss, val_accuracy = model.evaluate(X_val, y_val, verbose=0)
    print(f"Validation Loss: {val_loss:.4f}")
    print(f"Validation Accuracy: {val_accuracy:.4f}")

    # Persist the trained model to disk.
    model_path = save_model(model)

    print("\nTraining completed successfully!")
    return model, history


# Script entry point: invoking the decorated function kicks off the
# containerized training run (see the @Containerize config on train_model).
if __name__ == '__main__':
    model, history = train_model()

Step-by-Step Breakdown

1

Import ML dependencies

import numpy as np
import tensorflow as tf
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from metaparticle_pkg import Containerize
Import your ML frameworks and Buildr. The containerized environment will include all these dependencies.
2

Define model architecture

def create_model(input_dim, num_classes):
    model = tf.keras.Sequential([
        tf.keras.layers.Dense(128, activation='relu', input_shape=(input_dim,)),
        tf.keras.layers.Dropout(0.3),
        tf.keras.layers.Dense(64, activation='relu'),
        tf.keras.layers.Dropout(0.3),
        tf.keras.layers.Dense(num_classes, activation='softmax')
    ])
    
    model.compile(
        optimizer='adam',
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy']
    )
    
    return model
Create a simple neural network with dropout for regularization.
This architecture is just an example. Replace it with your actual model (CNN, transformer, etc.).
3

Configure containerization with GPU support

@Containerize(
    package={
        'name': model_name,
        'repository': repository,
        'publish': True,
        'dockerfile': 'Dockerfile.ml'
    },
    runtime={
        'executor': 'docker',
        'gpu': True,
        'memory': '4Gi',
        'cpu': '2',
    }
)
def train_model():
    # ... training code
Package Configuration:
  • name: Model/job name for tracking
  • repository: Docker registry for the training image
  • publish: Push image to registry for reuse
  • dockerfile: Custom Dockerfile with ML dependencies (see below)
Runtime Configuration:
  • executor: 'docker' for local or 'metaparticle' for cluster
  • gpu: True enables GPU access (requires NVIDIA Docker)
  • memory: '4Gi' requests 4GB RAM for training
  • cpu: '2' requests 2 CPU cores
Adjust memory and CPU based on your model size and dataset. Large models may need 8GB+ memory.
4

Implement training logic

def train_model():
    print("Starting ML training job...")
    
    # Generate or load dataset
    X, y = make_classification(n_samples=10000, n_features=20, n_classes=3)
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2)
    
    # Create and train model
    model = create_model(input_dim=X_train.shape[1], num_classes=3)
    history = model.fit(
        X_train, y_train,
        validation_data=(X_val, y_val),
        epochs=epochs,
        batch_size=batch_size
    )
    
    # Evaluate and save
    val_loss, val_accuracy = model.evaluate(X_val, y_val)
    save_model(model)
    
    return model, history
Standard ML training workflow that runs inside the container.

Custom Dockerfile for ML

Create a Dockerfile.ml with ML-specific dependencies:
FROM tensorflow/tensorflow:2.13.0-gpu

# Install additional dependencies
RUN pip install --no-cache-dir \
    scikit-learn \
    pandas \
    matplotlib \
    seaborn \
    jupyter

# Copy application code
COPY ./ /app/
WORKDIR /app

# Install project dependencies
RUN pip install --no-cache-dir -r requirements.txt

# Run training script
CMD ["python", "train.py"]
Using a pre-built TensorFlow GPU image ensures CUDA and cuDNN are properly configured.

Configuration Options Explained

Package Options for ML

OptionTypeDescriptionExample
namestringModel/job identifier'sentiment-classifier'
repositorystringDocker registry'docker.io/mlteam'
publishbooleanPush image to registryTrue
dockerfilestringCustom Dockerfile path'Dockerfile.ml'
py_versionstringPython version (if using auto-generated Dockerfile)'3.9'

Runtime Options for ML

OptionTypeDescriptionExample
executorstringRuntime environment'docker', 'metaparticle'
gpubooleanEnable GPU accessTrue
memorystringMemory allocation'4Gi', '8Gi', '16Gi'
cpustringCPU cores'2', '4', '8'
envdictEnvironment variables{'CUDA_VISIBLE_DEVICES': '0'}

Running the Example

1

Create requirements.txt

metaparticle-pkg
tensorflow>=2.13.0
scikit-learn>=1.3.0
numpy>=1.24.0
pandas>=2.0.0
2

Create Dockerfile.ml

Use the Dockerfile example shown above or create your own with your ML framework.
3

Run training

python train.py
Buildr will:
  1. Build a Docker image with GPU support
  2. Mount your code into the container
  3. Allocate requested resources (GPU, memory, CPU)
  4. Run the training job
  5. Stream logs and metrics to your terminal
4

Access trained model

The model is saved inside the container at ./models/sentiment-classifier.h5. To persist it, mount a volume:
runtime={
    'gpu': True,
    'volumes': [{'host': './models', 'container': '/app/models'}]
}

GPU Support

To use GPU acceleration:
1

Install NVIDIA Docker

# Install NVIDIA Container Toolkit
distribution=$(. /etc/os-release;echo $ID$VERSION_ID)
curl -s -L https://nvidia.github.io/nvidia-docker/gpgkey | sudo apt-key add -
curl -s -L https://nvidia.github.io/nvidia-docker/$distribution/nvidia-docker.list | \
  sudo tee /etc/apt/sources.list.d/nvidia-docker.list

sudo apt-get update
sudo apt-get install -y nvidia-docker2
sudo systemctl restart docker
2

Enable GPU in configuration

runtime={'gpu': True}
3

Verify GPU access

Your training script will log:
GPU Available: [PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]
Test GPU access with a simple script first: tf.config.list_physical_devices('GPU')

Advanced ML Patterns

Hyperparameter Tuning

Run multiple training jobs with different hyperparameters:
configurations = [
    {'learning_rate': 0.001, 'batch_size': 32},
    {'learning_rate': 0.0001, 'batch_size': 64},
    {'learning_rate': 0.01, 'batch_size': 16},
]

for i, config in enumerate(configurations):
    @Containerize(
        package={'name': f'{model_name}-{i}', 'repository': repository},
        runtime={'gpu': True, 'memory': '4Gi'}
    )
    def train_with_config():
        return train_model(**config)
    
    train_with_config()

Distributed Training

Scale training across multiple nodes (note: the TF_CONFIG example below also requires `import json`):
@Containerize(
    package={'name': 'distributed-training', 'repository': repository},
    runtime={
        'executor': 'metaparticle',
        'replicas': 4,  # 4 training workers
        'gpu': True,
        'memory': '8Gi',
        'env': {
            'TF_CONFIG': json.dumps({
                'cluster': {'worker': ['worker-0:2222', 'worker-1:2222', 
                                       'worker-2:2222', 'worker-3:2222']},
                'task': {'type': 'worker', 'index': 0}
            })
        }
    }
)
def distributed_train():
    strategy = tf.distribute.MultiWorkerMirroredStrategy()
    with strategy.scope():
        model = create_model()
    # ... training code

Model Inference Service

Deploy trained model as a web service:
from flask import Flask, request, jsonify

app = Flask(__name__)
model = tf.keras.models.load_model('./models/sentiment-classifier.h5')

@app.route('/predict', methods=['POST'])
def predict():
    data = request.json['data']
    predictions = model.predict(data)
    return jsonify({'predictions': predictions.tolist()})

@Containerize(
    package={'name': 'ml-inference', 'repository': repository, 'publish': True},
    runtime={'ports': [5000], 'public': True, 'replicas': 3, 'memory': '2Gi'}
)
def serve():
    app.run(host='0.0.0.0', port=5000)

if __name__ == '__main__':
    serve()

Best Practices for ML Workloads

1

Version your datasets and models

package={
    'name': f'{model_name}-v{version}',
    'repository': repository
}
2

Use appropriate resource limits

  • Small models: 2GB memory, 1-2 CPUs
  • Medium models: 4-8GB memory, 2-4 CPUs, 1 GPU
  • Large models (BERT, GPT): 16GB+ memory, 4+ CPUs, 2+ GPUs
3

Implement checkpointing

callbacks = [
    tf.keras.callbacks.ModelCheckpoint(
        'checkpoints/model-{epoch:02d}.h5',
        save_best_only=True
    )
]
model.fit(..., callbacks=callbacks)
4

Log metrics and experiments

Use tools like TensorBoard, MLflow, or Weights & Biases:
callbacks.append(tf.keras.callbacks.TensorBoard(log_dir='./logs'))

Build docs developers (and LLMs) love