What is an Azure Machine Learning Component?

An Azure Machine Learning component is a self-contained piece of code that performs one step in a machine learning pipeline. Components are the building blocks of machine learning workflows, analogous to functions in programming.
Components enable reusability, versioning, and collaboration across machine learning pipelines and teams.

Component Structure

A component consists of three parts:

1. Metadata: name, display name, version, type, description, and tags
2. Interface: input and output specifications with name, type, description, and default values
3. Execution: the command, code, and environment needed to run the component

Why Use Components?

  • Well-defined interface: clear inputs and outputs hide implementation complexity
  • Reusability: share components across pipelines, workspaces, and teams
  • Version control: track component versions for compatibility and reproducibility
  • Unit testable: self-contained code is easy to test independently

Component Example

Define a Component

from azure.ai.ml import command, Input, Output
from azure.ai.ml.constants import AssetTypes

# Define component
train_component = command(
    name="train_model",
    display_name="Train Model Component",
    description="Train a classification model",
    version="1.0.0",
    inputs={
        "training_data": Input(
            type=AssetTypes.URI_FOLDER,
            description="Path to training data"
        ),
        "learning_rate": Input(
            type="number",
            default=0.01,
            description="Learning rate for optimizer"
        ),
        "epochs": Input(
            type="integer",
            default=10,
            description="Number of training epochs"
        )
    },
    outputs={
        "model": Output(
            type=AssetTypes.URI_FOLDER,
            description="Trained model files"
        ),
        "metrics": Output(
            type=AssetTypes.URI_FILE,
            description="Training metrics JSON"
        )
    },
    code="./src",
    command="python train.py --data ${{inputs.training_data}} --lr ${{inputs.learning_rate}} --epochs ${{inputs.epochs}} --output ${{outputs.model}} --metrics ${{outputs.metrics}}",
    environment="azureml://registries/azureml/environments/sklearn-1.5/versions/1",
)

# Register the component (ml_client is an authenticated MLClient;
# .component extracts the reusable component from the command job)
train_component = ml_client.components.create_or_update(train_component.component)
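The same component can equivalently be defined in a YAML specification and loaded with load_component; a minimal sketch, assuming the spec is saved as train.yml:

from azure.ai.ml import load_component

# Load a component defined in YAML (hypothetical file name)
train_component = load_component(source="./train.yml")

# Register it exactly as above
train_component = ml_client.components.create_or_update(train_component)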

Component Implementation

# src/train.py
import argparse
import json
import os
from pathlib import Path
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split
import joblib

def main():
    # Parse arguments
    parser = argparse.ArgumentParser()
    parser.add_argument("--data", type=str, help="Training data path")
    parser.add_argument("--lr", type=float, default=0.01, help="Learning rate")
    parser.add_argument("--epochs", type=int, default=10, help="Training epochs")
    parser.add_argument("--output", type=str, help="Model output path")
    parser.add_argument("--metrics", type=str, help="Metrics output path")
    args = parser.parse_args()
    
    print(f"Training with lr={args.lr}, epochs={args.epochs}")
    
    # Load data
    data_path = Path(args.data)
    df = pd.read_csv(data_path / "train.csv")
    
    X = df.drop("target", axis=1)
    y = df["target"]
    
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )
    
    # Train model (RandomForestClassifier has no learning rate; lr is
    # recorded in the metrics for lineage, epochs scales n_estimators)
    model = RandomForestClassifier(n_estimators=args.epochs * 10)
    model.fit(X_train, y_train)
    
    # Evaluate
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average="weighted")
    
    print(f"Accuracy: {accuracy:.4f}")
    print(f"F1 Score: {f1:.4f}")
    
    # Save model
    output_path = Path(args.output)
    output_path.mkdir(parents=True, exist_ok=True)
    joblib.dump(model, output_path / "model.pkl")
    
    # Save metrics
    metrics = {
        "accuracy": accuracy,
        "f1_score": f1,
        "learning_rate": args.lr,
        "epochs": args.epochs
    }
    
    metrics_path = Path(args.metrics)
    metrics_path.parent.mkdir(parents=True, exist_ok=True)
    with open(metrics_path, "w") as f:
        json.dump(metrics, f, indent=2)
    
    print("Training complete!")

if __name__ == "__main__":
    main()
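Because the script is a plain CLI program, it can be smoke-tested without Azure ML at all. A sketch, assuming a local Python environment with pandas and scikit-learn installed (all paths are hypothetical):

# smoke_test.py: run train.py against a tiny synthetic dataset
import subprocess
from pathlib import Path
import pandas as pd

Path("test_data").mkdir(exist_ok=True)
pd.DataFrame({
    "feature1": [0.1, 0.5, 0.9, 0.3] * 10,
    "feature2": [1.0, 0.0, 1.0, 0.0] * 10,
    "target": [0, 1, 1, 0] * 10,
}).to_csv("test_data/train.csv", index=False)

subprocess.run([
    "python", "src/train.py",
    "--data", "test_data",
    "--lr", "0.01",
    "--epochs", "2",
    "--output", "outputs/model",
    "--metrics", "outputs/metrics.json",
], check=True)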

Build a Pipeline with Components

Connect components to create end-to-end workflows:
from azure.ai.ml import dsl, Input

# prep_data_component, feature_engineering_component, evaluate_component,
# and register_model_component are assumed to be components loaded earlier;
# train_component comes from the example above

@dsl.pipeline(
    name="training_pipeline",
    description="Complete ML training pipeline",
    default_compute="cpu-cluster"
)
def ml_training_pipeline(
    pipeline_data: Input,
    learning_rate: float = 0.01,
    epochs: int = 10
):
    # Step 1: Data preprocessing
    prep_step = prep_data_component(
        raw_data=pipeline_data
    )
    
    # Step 2: Feature engineering
    feature_step = feature_engineering_component(
        processed_data=prep_step.outputs.processed_data
    )
    
    # Step 3: Train model
    train_step = train_component(
        training_data=feature_step.outputs.features,
        learning_rate=learning_rate,
        epochs=epochs
    )
    
    # Step 4: Evaluate model
    eval_step = evaluate_component(
        model=train_step.outputs.model,
        test_data=feature_step.outputs.test_features
    )
    
    # Step 5: Register model if metrics are good
    register_step = register_model_component(
        model=train_step.outputs.model,
        metrics=eval_step.outputs.metrics,
        min_accuracy=0.85
    )
    
    # Return pipeline outputs
    return {
        "model": train_step.outputs.model,
        "metrics": eval_step.outputs.metrics,
        "registered_model": register_step.outputs.model_name
    }

# Create pipeline instance
pipeline_job = ml_training_pipeline(
    pipeline_data=Input(
        type="uri_folder",
        path="azureml://datastores/workspaceblobstore/paths/data/"
    ),
    learning_rate=0.001,
    epochs=20
)

# Submit pipeline
returned_job = ml_client.jobs.create_or_update(pipeline_job)
print(f"Pipeline submitted: {returned_job.studio_url}")
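To block until the pipeline finishes and stream its logs into the console (using the same ml_client):

# Streams logs and waits for the job to reach a terminal state
ml_client.jobs.stream(returned_job.name)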

Component Input/Output Types

Supported Types

Type          Description                 Example
uri_file      Single file reference       CSV, JSON, model file
uri_folder    Directory reference         Dataset folder, model directory
mltable       Tabular data abstraction    Structured datasets

Parameter inputs additionally use the primitive types number, integer, string, and boolean, as shown in the definitions below.
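For the mltable type in the table above, the component receives a path that the mltable package (assumed to be present in the component environment) can load directly:

import argparse
import mltable

parser = argparse.ArgumentParser()
parser.add_argument("--data", type=str)  # bound to an mltable input
args = parser.parse_args()

# Resolve the table definition and materialize it as a DataFrame
tbl = mltable.load(args.data)
df = tbl.to_pandas_dataframe()
print(df.shape)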

Input Definition

from azure.ai.ml import Input
from azure.ai.ml.constants import AssetTypes

inputs = {
    # Data inputs
    "training_data": Input(
        type=AssetTypes.URI_FOLDER,
        description="Training dataset folder"
    ),
    
    # Parameter inputs with defaults
    "batch_size": Input(
        type="integer",
        default=32,
        description="Training batch size",
        min=1,
        max=512
    ),
    
    "learning_rate": Input(
        type="number",
        default=0.001,
        description="Optimizer learning rate",
        min=0.0001,
        max=0.1
    ),
    
    "model_name": Input(
        type="string",
        default="my-model",
        description="Name for registered model"
    ),
    
    "enable_early_stopping": Input(
        type="boolean",
        default=True,
        description="Enable early stopping"
    )
}
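Inputs can also be marked optional; in the component command, the corresponding argument is wrapped in $[[...]] so it disappears when no value is supplied. A sketch with a hypothetical checkpoint input, extending the dict above:

inputs["resume_checkpoint"] = Input(
    type=AssetTypes.URI_FOLDER,
    optional=True,
    description="Checkpoint folder to resume training from"
)

# In the component command, $[[...]] marks an optional argument:
#   python train.py --data ${{inputs.training_data}} $[[--resume ${{inputs.resume_checkpoint}}]]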

Output Definition

from azure.ai.ml import Output
from azure.ai.ml.constants import AssetTypes

outputs = {
    "model": Output(
        type=AssetTypes.URI_FOLDER,
        description="Trained model directory",
        mode="rw_mount"  # Read-write mount
    ),
    
    "metrics": Output(
        type=AssetTypes.URI_FILE,
        description="Metrics JSON file"
    ),
    
    "visualizations": Output(
        type=AssetTypes.URI_FOLDER,
        description="Training plots and charts"
    )
}

Component Versioning

Manage component versions for reproducibility:
# Create version 1.0.0
train_component_v1 = command(
    name="train_model",
    version="1.0.0",
    inputs={...},
    outputs={...},
    code="./src",
    command="python train.py ...",
    environment="azureml://environments/sklearn-1.5/versions/1"
)

ml_client.components.create_or_update(train_component_v1)

# Create improved version 2.0.0
train_component_v2 = command(
    name="train_model",
    version="2.0.0",  # New version
    inputs={...},  # Added new input parameters
    outputs={...},
    code="./src",
    command="python train_v2.py ...",  # Improved implementation
    environment="azureml://environments/sklearn-1.5/versions/2"
)

ml_client.components.create_or_update(train_component_v2)

# Use specific version in pipeline
train_step = ml_client.components.get(
    name="train_model",
    version="1.0.0"  # Pin to specific version
)
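To resolve the newest registered version instead of pinning, components can also be fetched by label:

# label="latest" resolves to the highest registered version
latest_train = ml_client.components.get(name="train_model", label="latest")
print(latest_train.version)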

Share Components Across Workspaces

Using Registries

from azure.ai.ml import MLClient
from azure.identity import DefaultAzureCredential

# Connect to registry
registry_client = MLClient(
    DefaultAzureCredential(),
    registry_name="my-registry"
)

# Publish component to registry
registry_client.components.create_or_update(train_component)

# Use component from registry in any workspace
train_step = registry_client.components.get(
    name="train_model",
    version="1.0.0"
)

# Or reference the component by its registry URI in pipeline YAML:
#   component: azureml://registries/my-registry/components/train_model/versions/1.0.0
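A component fetched from a registry is callable in a pipeline exactly like a workspace component; a sketch assuming a cpu-cluster compute target:

from azure.ai.ml import dsl, Input

@dsl.pipeline(default_compute="cpu-cluster")
def shared_training_pipeline(data: Input):
    # train_step is the component retrieved from the registry above
    train = train_step(training_data=data, learning_rate=0.01, epochs=10)
    return {"model": train.outputs.model}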

Built-in Components

Azure ML provides pre-built components for common tasks:

Data Processing

  • Select Columns
  • Clean Missing Data
  • Normalize Data
  • Split Data

Feature Engineering

  • Feature Hashing
  • N-Gram Features
  • Filter-Based Selection
  • PCA Transformation

Training

  • Train Classifier
  • Train Regressor
  • Train Clustering Model
  • Train Recommender

Evaluation

  • Evaluate Model
  • Cross Validate Model
  • Tune Hyperparameters
  • Score Model

Using Built-in Components

# Fetch a built-in component; label="latest" resolves the newest version
normalize_data = ml_client.components.get(
    name="normalize_data",
    label="latest"
)

# Use in pipeline
@dsl.pipeline()
def my_pipeline(input_data):
    normalized = normalize_data(
        dataset=input_data,
        transformation_method="ZScore"
    )
    return {"transformed_data": normalized.outputs.transformed_data}

Component Best Practices

Each component should do one thing well:
  • Data preprocessing
  • Feature engineering
  • Model training
  • Model evaluation
Don’t combine multiple responsibilities in one component.
Always specify version numbers:
  • Use semantic versioning (1.0.0, 1.1.0, 2.0.0)
  • Increment major version for breaking changes
  • Increment minor version for new features
  • Increment patch version for bug fixes
Provide clear descriptions:
inputs={
    "data": Input(
        type=AssetTypes.URI_FOLDER,
        description="Training data in CSV format with columns: feature1, feature2, target"
    )
}
Specify input constraints:
inputs={
    "batch_size": Input(
        type="integer",
        default=32,
        min=1,
        max=512,
        description="Must be power of 2 for optimal performance"
    )
}
Handle errors so failures are logged and propagate to fail the pipeline step:
# In component code
try:
    # Component logic
    pass
except Exception as e:
    print(f"Error in component: {str(e)}")
    raise  # Re-raise to fail pipeline
Test components locally before registering them:
# Bind test inputs and run the command job on local compute
test_job = train_component(
    training_data=Input(
        type=AssetTypes.URI_FOLDER,
        path="./test_data"
    ),
    learning_rate=0.01
)
test_job.compute = "local"

ml_client.jobs.create_or_update(test_job)

Component vs Python Function

Aspect          Component                              Python Function
Reusability     Across pipelines, workspaces, teams    Within same codebase
Versioning      Built-in version management            Manual tracking
Environment     Isolated, reproducible                 Depends on notebook/script env
Distribution    Can be shared via registry             Requires code sharing
Execution       Remote compute                         Local or remote
Testing         Can test independently                 Function testing

Parallel Components

Process data in parallel with a parallel run step. A sketch using parallel_run_function; note the entry script implements init() and run(mini_batch) rather than a plain main():
from azure.ai.ml.parallel import parallel_run_function, RunFunction

# The input folder is split into mini-batches scored concurrently
parallel_component = parallel_run_function(
    name="batch_inference",
    inputs={
        "input_data": Input(type=AssetTypes.URI_FOLDER),
        "model": Input(type=AssetTypes.URI_FOLDER)
    },
    outputs={
        "predictions": Output(type=AssetTypes.URI_FOLDER)
    },
    input_data="${{inputs.input_data}}",  # the input to partition into mini-batches
    mini_batch_size="10",                 # items per mini-batch
    instance_count=10,                    # parallel instances
    max_concurrency_per_instance=2,       # worker processes per instance
    task=RunFunction(
        code="./src",
        entry_script="score.py",          # must define init() and run(mini_batch)
        program_arguments="--model ${{inputs.model}} --output ${{outputs.predictions}}",
        environment="azureml://environments/pytorch/versions/1"
    )
)
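The parallel step then drops into a pipeline like any other component; a sketch assuming a cpu-cluster compute target:

from azure.ai.ml import dsl, Input

@dsl.pipeline(default_compute="cpu-cluster")
def batch_scoring_pipeline(data: Input, model: Input):
    scoring = parallel_component(input_data=data, model=model)
    return {"predictions": scoring.outputs.predictions}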

Next Steps

  • Build Pipelines: create ML pipelines with components
  • Component Gallery: browse pre-built components
  • Share Components: use registries for team collaboration
  • CI/CD Integration: automate component deployment
