Skip to main content
The H2O R package exposes H2O-3’s distributed machine learning capabilities through a REST-backed R API. All operations run on the H2O server; R acts as a thin client that sends commands and receives results.

Installation

1

Install the h2o package

Install directly from CRAN or from the H2O release repository.
install.packages("h2o")
2

Load the package

library(h2o)
The h2o package requires Java 8 or later. H2O will attempt to start a local JVM automatically if one is not already running.

Connecting to H2O

h2o.init()

h2o.init() connects to a running H2O instance or starts a new local one. It checks that the R package version matches the server version.
# Start or connect to a local instance with defaults (localhost:54321)
h2o.init()

# Allocate more memory for larger datasets
h2o.init(max_mem_size = "8g")

# Connect to a remote H2O cluster
h2o.init(ip = "192.168.1.100", port = 54321, startH2O = FALSE)

# Disable automatic startup (fail if no instance is found)
h2o.init(startH2O = FALSE)
ip
string
default:"localhost"
IP address of the H2O server.
port
number
default:"54321"
Port number of the H2O server.
startH2O
boolean
default:"TRUE"
Whether to start a local H2O instance if no connection is found. Only possible when ip is "localhost".
max_mem_size
string
Maximum JVM heap size (e.g., "4g", "512m"). Only used when starting H2O from R.
nthreads
number
default:"-1"
Number of threads. -1 uses all available CPUs.
strict_version_check
boolean
default:"TRUE"
Whether to require the R package version to match the server version.

h2o.connect()

Use h2o.connect() to connect to an existing H2O cluster without attempting to start one.
conn <- h2o.connect(ip = "my-h2o-server", port = 54321)

Shutting down

h2o.shutdown(prompt = FALSE)

Importing data

H2O keeps all data server-side. Two primary functions import data into the cluster (a third option, as.h2o(), converts in-memory R data frames — see below):

h2o.importFile()

Reads data from a path on the H2O server’s filesystem. This is the recommended method for large datasets because data is read in parallel without passing through the R client.
# Import a CSV from the server filesystem
train <- h2o.importFile("/data/train.csv")

# Import with explicit column types
train <- h2o.importFile(
  path   = "/data/train.csv",
  header = TRUE,
  sep    = ",",
  col.types = list(by.col.name = c("label"), types = c("Enum"))
)

# Import all CSV files in a directory
train <- h2o.importFile("/data/train_parts/")

h2o.uploadFile()

Pushes a file from the R client’s local filesystem to the H2O cluster. Suitable for smaller files; not recommended for large datasets.
path <- system.file("extdata", "prostate.csv", package = "h2o")
prostate <- h2o.uploadFile(path = path)

Converting from R data frames

# Convert an in-memory R data frame to an H2OFrame
iris_hf <- as.h2o(iris)

# Convert back to R
iris_r <- as.data.frame(iris_hf)
Use h2o.importFile() for production workloads. It parallelizes reads across the H2O cluster and avoids the R client becoming a bottleneck.

H2OFrame operations

An H2OFrame is a distributed data frame stored in the H2O cluster. It supports many familiar R data frame operations.

Inspection

h2o.nrow(train)       # Number of rows
h2o.ncol(train)       # Number of columns
h2o.dim(train)        # c(nrow, ncol)
h2o.colnames(train)   # Column names
h2o.summary(train)    # Summary statistics per column
h2o.describe(train)   # Detailed type and missing-value info
head(train, n = 10)   # First 10 rows as an R data frame

Subsetting with bracket notation

# Select a single column (returns H2OFrame)
age_col <- train["AGE"]

# Select multiple columns
subset_cols <- train[, c("AGE", "RACE", "PSA")]

# Select rows by condition
young <- train[train["AGE"] < 40, ]

# Select rows and columns
young_age <- train[train["AGE"] < 40, "AGE"]

Column operations

# Create a new column
train["log_psa"] <- log(train["PSA"])

# Rename columns
names(train)[1] <- "id"

# Check and set column types
h2o.getTypes(train)
train["CAPSULE"] <- as.factor(train["CAPSULE"])

Splitting data

splits <- h2o.splitFrame(
  data         = train,
  ratios       = c(0.7, 0.15),
  destination_frames = c("train_split", "valid_split", "test_split"),
  seed         = 42
)
train_df <- splits[[1]]
valid_df <- splits[[2]]
test_df  <- splits[[3]]

Training models

All estimator functions share a common signature: x (predictor columns), y (response column), and training_frame.
library(h2o)
h2o.init()

prostate_path <- system.file("extdata", "prostate.csv", package = "h2o")
prostate <- h2o.importFile(path = prostate_path)
prostate["CAPSULE"] <- as.factor(prostate["CAPSULE"])

splits <- h2o.splitFrame(prostate, ratios = 0.8, seed = 1)
train  <- splits[[1]]
test   <- splits[[2]]

predictors <- c("AGE", "RACE", "PSA", "VOL", "GLEASON")
response   <- "CAPSULE"

Gradient Boosting Machine (GBM)

gbm_model <- h2o.gbm(
  x              = predictors,
  y              = response,
  training_frame = train,
  validation_frame = test,
  ntrees         = 100,
  max_depth      = 5,
  learn_rate     = 0.05,
  seed           = 1
)

Random Forest

rf_model <- h2o.randomForest(
  x              = predictors,
  y              = response,
  training_frame = train,
  ntrees         = 100,
  max_depth      = 20,
  seed           = 1
)

Deep Learning

dl_model <- h2o.deeplearning(
  x              = predictors,
  y              = response,
  training_frame = train,
  hidden         = c(200, 200),
  epochs         = 10,
  activation     = "Rectifier",
  seed           = 1
)

Generalized Linear Model (GLM)

glm_model <- h2o.glm(
  x              = predictors,
  y              = response,
  training_frame = train,
  family         = "binomial",
  alpha          = 0.5,
  lambda_search  = TRUE
)

AutoML

aml <- h2o.automl(
  x                  = predictors,
  y                  = response,
  training_frame     = train,
  max_models         = 20,
  seed               = 1
)

# View the leaderboard
lb <- h2o.get_leaderboard(aml, extra_columns = "ALL")
print(lb, n = 10)

# Best model
best_model <- aml@leader

Predictions and performance

h2o.predict()

Generates predictions on new data. Returns an H2OFrame with prediction columns.
preds <- h2o.predict(gbm_model, newdata = test)
head(preds)
# For binary classification: predict, p0, p1 columns
# For multinomial classification: predict plus one probability column per class
# For regression: predict column

h2o.performance()

Computes model metrics on any labeled dataset.
# Performance on training data
train_perf <- h2o.performance(gbm_model, train = TRUE)

# Performance on a test set
test_perf <- h2o.performance(gbm_model, newdata = test)

# Key metrics
h2o.auc(test_perf)
h2o.logloss(test_perf)
h2o.mse(test_perf)
h2o.rmse(test_perf)
h2o.r2(test_perf)

# Confusion matrix (classification)
h2o.confusionMatrix(test_perf)

Variable importance

h2o.varimp(gbm_model)
h2o.varimp_plot(gbm_model)

Saving and loading models

Save a binary model

# Save to a directory on the H2O server filesystem
model_path <- h2o.saveModel(
  object = gbm_model,
  path   = "/models/",
  force  = TRUE
)
print(model_path)
# e.g., "/models/GBM_model_R_1234567890"

Load a binary model

loaded_model <- h2o.loadModel("/models/GBM_model_R_1234567890")

Download a MOJO

MOJOs (Model Object, Optimized) are portable binary representations that can be scored without an H2O cluster.
mojo_path <- h2o.download_mojo(
  model      = gbm_model,
  path       = "~/mojo_exports/",
  get_genmodel_jar = TRUE
)

Download a POJO

h2o.download_pojo(
  model = gbm_model,
  path  = "~/pojo_exports/"
)
Use MOJOs for production deployment. They are more compact than POJOs and support a wider range of algorithm types.

Cross-validation

gbm_cv <- h2o.gbm(
  x              = predictors,
  y              = response,
  training_frame = train,
  ntrees         = 100,
  nfolds         = 5,
  fold_assignment = "Stratified",
  keep_cross_validation_predictions = TRUE,
  seed           = 1
)

# Cross-validated AUC
h2o.auc(gbm_cv, xval = TRUE)

# Cross-validated confusion matrix
h2o.confusionMatrix(gbm_cv, xval = TRUE)

Grid search

gbm_grid <- h2o.grid(
  algorithm = "gbm",
  x = predictors,
  y = response,
  training_frame = train,
  hyper_params = list(
    max_depth  = c(3, 5, 7),
    learn_rate = c(0.01, 0.05, 0.1),
    ntrees     = c(50, 100)
  ),
  search_criteria = list(strategy = "RandomDiscrete", max_models = 10),
  seed = 1
)

# Sort grid by AUC
sorted_grid <- h2o.getGrid(gbm_grid@grid_id, sort_by = "auc", decreasing = TRUE)
best_grid_model <- h2o.getModel(sorted_grid@model_ids[[1]])

Build docs developers (and LLMs) love