Skip to main content
Expressions in Apache Arrow provide a declarative way to describe data transformations and filtering operations. They can be optimized and pushed down to data sources.

Expression Basics

Creating Expressions

Expressions are built from three fundamental types:
  • Literals: Constant values
  • Field references: References to columns in the data
  • Function calls: Operations on literals and field references
#include <arrow/compute/expression.h>

using namespace arrow::compute;

// Literal expressions
auto lit_int = literal(42);
auto lit_str = literal("hello");
auto lit_bool = literal(true);

// Field reference expressions
auto field_a = field_ref("column_a");
auto nested_field = field_ref(FieldRef("parent", "child"));

// Function call expressions
auto comparison = equal(field_ref("age"), literal(25));
auto arithmetic = add(field_ref("x"), field_ref("y"));

Comparison Expressions

#include <arrow/compute/expression.h>

using namespace arrow::compute;

// Equality
auto eq_expr = equal(field_ref("status"), literal("active"));

// Inequality
auto ne_expr = not_equal(field_ref("type"), literal("deleted"));

// Ordering comparisons
auto gt_expr = greater(field_ref("price"), literal(100.0));
auto gte_expr = greater_equal(field_ref("quantity"), literal(10));
auto lt_expr = less(field_ref("age"), literal(65));
auto lte_expr = less_equal(field_ref("score"), literal(100));

Logical Expressions

using namespace arrow::compute;

// AND - all conditions must be true
auto and_expr = and_(
    greater(field_ref("age"), literal(18)),
    less(field_ref("age"), literal(65))
);

// OR - at least one condition must be true
auto or_expr = or_(
    equal(field_ref("country"), literal("US")),
    equal(field_ref("country"), literal("CA"))
);

// NOT - negate a condition
auto not_expr = not_(is_null(field_ref("email")));

// Complex combinations
auto complex = and_({
    or_(equal(field_ref("status"), literal("active")),
        equal(field_ref("status"), literal("pending"))),
    greater(field_ref("amount"), literal(0)),
    not_(is_null(field_ref("user_id")))
});

Null Handling

using namespace arrow::compute;

// Check for null values
auto null_check = is_null(field_ref("optional_field"));

// Check for valid (non-null) values
auto valid_check = is_valid(field_ref("required_field"));

// Null-aware comparisons
auto with_nulls = and_(
    is_valid(field_ref("price")),
    greater(field_ref("price"), literal(100.0))
);

String Operations

using namespace arrow::compute;

// String matching
MatchSubstringOptions match_opts("pattern");
auto match_expr = call("match_substring", 
                      {field_ref("text")},
                      std::make_shared<MatchSubstringOptions>(match_opts));

// String starts with
auto starts_expr = call("starts_with",
                       {field_ref("url")},
                       std::make_shared<MatchSubstringOptions>("https://"));

// String length
auto length_expr = call("utf8_length", {field_ref("name")});

Set Membership

using namespace arrow::compute;

// Check if value is in a set
auto value_set = ArrayFromJSON(arrow::utf8(), 
                              R"(["US", "CA", "UK", "AU"])");
SetLookupOptions lookup_opts(value_set);

auto isin_expr = call("is_in",
                     {field_ref("country")},
                     std::make_shared<SetLookupOptions>(lookup_opts));

// Using is_in helper
auto countries = ArrayFromJSON(arrow::utf8(), 
                              R"(["US", "CA"])");
auto expr = call("is_in", {field_ref("country")},
                std::make_shared<SetLookupOptions>(countries));

Arithmetic Expressions

using namespace arrow::compute;

// Basic arithmetic
auto add_expr = call("add", {field_ref("a"), field_ref("b")});
auto sub_expr = call("subtract", {field_ref("total"), field_ref("discount")});
auto mul_expr = call("multiply", {field_ref("price"), field_ref("quantity")});
auto div_expr = call("divide", {field_ref("sum"), field_ref("count")});

// Complex calculations
auto total_price = call("multiply", {
    field_ref("unit_price"),
    call("add", {
        field_ref("quantity"),
        literal(1)
    })
});

Temporal Expressions

using namespace arrow::compute;

// Extract year from timestamp
auto year_expr = call("year", {field_ref("timestamp")});

// Extract month
auto month_expr = call("month", {field_ref("timestamp")});

// Date comparison
auto date_filter = and_(
    greater_equal(field_ref("date"), literal("2023-01-01")),
    less(field_ref("date"), literal("2024-01-01"))
);

Binding and Execution

using namespace arrow::compute;

// Create an expression
auto expr = greater(field_ref("price"), literal(100.0));

// Bind expression to a schema
auto schema = arrow::schema({
    arrow::field("price", arrow::float64())
});

auto bound_expr = expr.Bind(*schema).ValueOrDie();

// Execute expression on data
auto batch = arrow::RecordBatchFromJSON(
    schema,
    R"([{"price": 150.0}, {"price": 75.0}])"
);

auto result = ExecuteScalarExpression(bound_expr, *batch).ValueOrDie();
// Result: [true, false]

Expression Simplification

using namespace arrow::compute;

auto expr = and_(
    equal(field_ref("x"), literal(5)),
    equal(field_ref("x"), literal(5))  // Redundant
);

// Simplify expression
auto simplified = Canonicalize(expr).ValueOrDie();

// Fold constants
auto constant_expr = equal(literal(1), literal(1));
auto folded = FoldConstants(constant_expr).ValueOrDie();
// Result: literal(true)

Using Expressions with Datasets

#include <arrow/dataset/api.h>

using namespace arrow::dataset;
using namespace arrow::compute;

// Create filter expression
auto filter = and_(
    greater(field_ref("year"), literal(2020)),
    equal(field_ref("status"), literal("completed"))
);

// Apply to scanner
auto scanner_builder = dataset->NewScan().ValueOrDie();
scanner_builder->Filter(filter);
scanner_builder->Project({"id", "amount", "date"});

auto scanner = scanner_builder->Finish().ValueOrDie();
auto table = scanner->ToTable().ValueOrDie();

Best Practices

  1. Build expressions declaratively: Let Arrow optimize the execution
  2. Push filters early: Apply filters at the dataset level for better performance
  3. Use strongly-typed expressions: Ensure type compatibility
  4. Leverage partition pruning: Structure filters to utilize partitions
  5. Combine expressions efficiently: Use and_/or_ helpers for multiple conditions

Next Steps

Build docs developers (and LLMs) love