Arrow Dataset API
The Arrow dataset API provides efficient access to large, multi-file datasets with automatic partitioning, filtering, and schema evolution.

Dataset Basics

Creating Datasets

Datasets can be created from various sources:
#include <arrow/dataset/api.h>
#include <arrow/filesystem/api.h>

using namespace arrow::dataset;

// Create a dataset from a directory of Parquet files.
// LocalFileSystem has an ordinary constructor (it does not return an
// arrow::Result), so build it directly as a shared_ptr instead of
// calling ValueOrDie() on it.
auto fs = std::make_shared<arrow::fs::LocalFileSystem>();

auto format = std::make_shared<ParquetFileFormat>();

// Select all files under the base directory; recurse so files inside
// nested (e.g. partition) directories are discovered too.
arrow::fs::FileSelector selector;
selector.base_dir = "/path/to/data";
selector.recursive = true;

auto factory = FileSystemDatasetFactory::Make(
    fs,
    selector,
    format,
    FileSystemFactoryOptions{}
).ValueOrDie();

auto dataset = factory->Finish().ValueOrDie();

// The unified schema across all discovered files.
auto schema = dataset->schema();

Partitioned Datasets

Datasets support partition discovery and pruning:
#include <arrow/dataset/partition.h>

// Hive-style partitioning (paths like year=2023/month=01/).
// HivePartitioning::MakeFactory() takes factory options and infers the
// partition schema from the paths; to pin the schema explicitly,
// construct the partitioning directly from a schema. Both a
// Partitioning and a PartitioningFactory are accepted by
// FileSystemFactoryOptions::partitioning.
auto partitioning = std::make_shared<arrow::dataset::HivePartitioning>(
    arrow::schema({
        arrow::field("year", arrow::int32()),
        arrow::field("month", arrow::int32())
    })
);

FileSystemFactoryOptions options;
options.partitioning = partitioning;

auto factory = FileSystemDatasetFactory::Make(
    fs, selector, format, options
).ValueOrDie();

auto dataset = factory->Finish().ValueOrDie();

Scanning Datasets

Basic Scanning

#include <arrow/dataset/scanner.h>

// Start a new scan over the dataset.
auto builder = dataset->NewScan().ValueOrDie();

// Scan configuration: column projection, batch sizing, parallelism.
builder->Project({"col_a", "col_b", "col_c"});
builder->BatchSize(128 * 1024); // 128Ki rows
builder->UseThreads(true);

// Finalize the configured scanner.
auto scanner = builder->Finish().ValueOrDie();

// Materialize the entire scan as one table...
auto table = scanner->ToTable().ValueOrDie();

// ...or stream it incrementally, batch by batch.
auto batches = scanner->ScanBatches().ValueOrDie();
for (auto maybe_batch : batches) {
    auto tagged = maybe_batch.ValueOrDie();
    auto record_batch = tagged.record_batch;
    // Process record_batch
}

Filtering During Scan

#include <arrow/compute/expression.h>

namespace cp = arrow::compute;

// Filter: price > 100.0 AND status == "active"
auto price_pred = cp::greater(cp::field_ref("price"), cp::literal(100.0));
auto status_pred = cp::equal(cp::field_ref("status"), cp::literal("active"));
auto filter = cp::and_(price_pred, status_pred);

// Attach the predicate to a fresh scan; matching rows only are read.
auto scanner_builder = dataset->NewScan().ValueOrDie();
scanner_builder->Filter(filter);

auto scanner = scanner_builder->Finish().ValueOrDie();
auto filtered_table = scanner->ToTable().ValueOrDie();

Multiple Formats

Parquet

#include <arrow/dataset/file_parquet.h>

auto format = std::make_shared<ParquetFileFormat>();

// Configure Parquet-specific options
// ParquetFragmentScanOptions exposes the Arrow reader properties that
// are applied to every scanned Parquet fragment.
auto parquet_opts = std::make_shared<ParquetFragmentScanOptions>();
parquet_opts->arrow_reader_properties->set_batch_size(256 * 1024);
// Pre-buffering coalesces column-chunk reads — presumably most useful
// on high-latency filesystems (e.g. object stores); verify for your
// storage backend.
parquet_opts->arrow_reader_properties->set_pre_buffer(true);

auto scanner_builder = dataset->NewScan().ValueOrDie();
// Attach the format-specific options to this scan.
scanner_builder->FragmentScanOptions(parquet_opts);

CSV

#include <arrow/dataset/file_csv.h>
#include <arrow/csv/options.h>

auto format = std::make_shared<CsvFileFormat>();

// Configure CSV options
// convert_options controls how CSV columns are parsed and typed.
auto csv_opts = std::make_shared<CsvFragmentScanOptions>();
// Only materialize these columns from each file.
csv_opts->convert_options.include_columns = {"col_a", "col_b"};
// Allow string columns to yield nulls (for values spelled as null).
csv_opts->convert_options.strings_can_be_null = true;

// NOTE(review): csv_opts is built but never attached in this snippet —
// pass it to the scanner (FragmentScanOptions) for it to take effect.
auto factory = FileSystemDatasetFactory::Make(
    fs, selector, format, factory_options
).ValueOrDie();

IPC/Feather

#include <arrow/dataset/file_ipc.h>

// Arrow IPC (Feather v2) files need no format-specific configuration.
auto format = std::make_shared<IpcFileFormat>();

auto factory =
    FileSystemDatasetFactory::Make(fs, selector, format, factory_options)
        .ValueOrDie();

auto dataset = factory->Finish().ValueOrDie();

Schema Evolution

Datasets handle schema evolution automatically:
// Dataset unifies schemas from multiple files
// Missing columns are filled with nulls
// Extra columns can be projected out

auto scanner_builder = dataset->NewScan().ValueOrDie();

// Project only common columns
scanner_builder->Project({"col_a", "col_b"});

auto scanner = scanner_builder->Finish().ValueOrDie();
auto table = scanner->ToTable().ValueOrDie();

Writing Datasets

#include <arrow/dataset/dataset_writer.h>

// Write a partitioned dataset (Hive-style directory layout).
// Note: arrow::schema() takes fields built with arrow::field(); a bare
// brace pair {"year", arrow::int32()} does not convert to
// std::shared_ptr<arrow::Field>.
auto partitioning = std::make_shared<arrow::dataset::HivePartitioning>(
    arrow::schema({
        arrow::field("year", arrow::int32()),
        arrow::field("month", arrow::int32())
    })
);

FileSystemDatasetWriteOptions write_opts;
write_opts.file_write_options = format->DefaultWriteOptions();
write_opts.filesystem = fs;
write_opts.base_dir = "/output/path";
write_opts.partitioning = partitioning;
// {i} is replaced with a counter to keep filenames unique per directory.
write_opts.basename_template = "part-{i}.parquet";

auto scanner = /* ... */;
// FileSystemDataset::Write returns an arrow::Status (not a Result),
// so check it instead of calling ValueOrDie().
auto status = arrow::dataset::FileSystemDataset::Write(write_opts, scanner);
if (!status.ok()) {
    // Handle the error, e.g. log status.ToString().
}

Fragment-Level Operations

// Get fragments from dataset
// A fragment is one scannable unit of the dataset (typically one file).
auto fragments = dataset->GetFragments().ValueOrDie();

for (auto fragment_result : fragments) {
    auto fragment = fragment_result.ValueOrDie();
    
    // Get fragment schema
    // The physical schema is what the file itself stores, which may
    // differ from the dataset's unified schema.
    auto schema = fragment->ReadPhysicalSchema().ValueOrDie();
    
    // Count rows (metadata only)
    // literal(true) means "no filter"; scan_options must be supplied by
    // the caller (not defined in this snippet).
    // NOTE(review): in recent Arrow versions Fragment::CountRows is
    // asynchronous (returns a Future) — confirm the return type for the
    // Arrow version in use before calling ValueOrDie() directly.
    auto count = fragment->CountRows(
        arrow::compute::literal(true),
        scan_options
    ).ValueOrDie();
}

Performance Tips

  1. Use partition pruning: Structure data with meaningful partitions
  2. Push down filters: Filter as early as possible to reduce I/O
  3. Project only needed columns: Avoid reading unnecessary data
  4. Configure batch size: Balance memory usage and processing efficiency
  5. Enable readahead: Use batch_readahead and fragment_readahead
  6. Use parallel scanning: Set use_threads=true for better performance

Next Steps

See the Arrow Compute and Acero (streaming execution engine) documentation for running analytical queries on top of datasets, and the filesystem API documentation for scanning data on S3, GCS, or HDFS.