Arrow IPC Streaming Format
The Arrow IPC (Inter-Process Communication) streaming format provides a standardized way to serialize Arrow data for transmission over networks or storage on disk. It enables zero-copy reads and efficient columnar data exchange.

Overview

The IPC format features:
  • Zero-copy reads: Data can be memory-mapped or read without copying
  • Schema preservation: Type information is included in the stream
  • Streaming support: Process data incrementally as it arrives
  • Compression: Optional per-buffer compression (LZ4, ZSTD, etc.)
  • Dictionary encoding: Efficient encoding for low-cardinality data
  • Random access: File format supports seeking to specific batches

Format Types

Arrow provides two IPC format variants:
  1. Stream Format: Sequential access, ideal for pipes and network streams
  2. File Format: Random access with footer metadata, ideal for disk storage

Writing Streams

#include <arrow/io/file.h>
#include <arrow/ipc/writer.h>
#include <arrow/record_batch.h>

// NOTE(review): fine for a short example, but avoid `using namespace`
// at file scope in production code.
using namespace arrow;

// Create the schema describing each record batch's columns.
auto schema = arrow::schema({
    field("id", int64()),
    field("name", utf8()),
    field("value", float64())
});

// Open the destination file for writing.
// ARROW_ASSIGN_OR_RAISE propagates a non-OK Status to the caller, so
// this snippet must live inside a function returning arrow::Status
// (or arrow::Result<T>).
ARROW_ASSIGN_OR_RAISE(auto output,
    io::FileOutputStream::Open("data.arrow"));

// Create an IPC stream writer bound to the output and schema; the
// schema message is emitted at the head of the stream.
ARROW_ASSIGN_OR_RAISE(auto writer,
    ipc::MakeStreamWriter(output, schema));

// Write record batches; every batch must match `schema`.
std::shared_ptr<RecordBatch> batch;
// ... create batch ...

ARROW_RETURN_NOT_OK(writer->WriteRecordBatch(*batch));

// Write more batches as needed
// ...

// Close writer (writes EOS marker). The writer does not own the
// underlying stream, so close the file separately afterwards.
ARROW_RETURN_NOT_OK(writer->Close());
ARROW_RETURN_NOT_OK(output->Close());

Reading Streams

#include <arrow/io/file.h>
#include <arrow/ipc/reader.h>

// Open the stream-format file written above for reading.
ARROW_ASSIGN_OR_RAISE(auto input,
    io::ReadableFile::Open("data.arrow"));

// Create the IPC stream reader; opening it reads and validates the
// schema message at the head of the stream.
ARROW_ASSIGN_OR_RAISE(auto reader,
    ipc::RecordBatchStreamReader::Open(input));

// The schema is available immediately, before any batch is read.
auto schema = reader->schema();
std::cout << "Schema: " << schema->ToString() << std::endl;

// Read batches one by one; Next() yields nullptr once the
// end-of-stream marker is reached.
std::shared_ptr<RecordBatch> batch;
while (true) {
  ARROW_ASSIGN_OR_RAISE(batch, reader->Next());
  if (batch == nullptr) break;  // End of stream
  
  std::cout << "Batch with " << batch->num_rows() 
            << " rows" << std::endl;
}

// Or read all batches into a table. The first reader has already
// consumed the stream, so the file is re-opened here.
ARROW_ASSIGN_OR_RAISE(auto input2,
    io::ReadableFile::Open("data.arrow"));
ARROW_ASSIGN_OR_RAISE(auto reader2,
    ipc::RecordBatchStreamReader::Open(input2));

// ToTable() concatenates every remaining batch into one arrow::Table.
ARROW_ASSIGN_OR_RAISE(auto table, reader2->ToTable());
std::cout << "Total rows: " << table->num_rows() << std::endl;

File Format (Random Access)

#include <arrow/io/file.h>
#include <arrow/ipc/writer.h>
#include <arrow/ipc/reader.h>

// Writing file format: like the stream format, but Close() appends a
// footer with batch offsets that enables random access on read.
ARROW_ASSIGN_OR_RAISE(auto output,
    io::FileOutputStream::Open("data.feather"));

ARROW_ASSIGN_OR_RAISE(auto writer,
    ipc::MakeFileWriter(output, schema));

// This example writes exactly one batch.
ARROW_RETURN_NOT_OK(writer->WriteRecordBatch(*batch));
ARROW_RETURN_NOT_OK(writer->Close());

// Reading file format (random access)
ARROW_ASSIGN_OR_RAISE(auto input,
    io::ReadableFile::Open("data.feather"));

ARROW_ASSIGN_OR_RAISE(auto reader,
    ipc::RecordBatchFileReader::Open(input));

// Get metadata: batch count comes from the footer, no scan needed.
int num_batches = reader->num_record_batches();
std::cout << "File contains " << num_batches
          << " record batches" << std::endl;

// Read a specific batch by index. The index must lie in
// [0, num_record_batches()); the file written above holds a single
// batch, so only index 0 is valid here.
ARROW_ASSIGN_OR_RAISE(auto first_batch,
    reader->ReadRecordBatch(0));

// Read all batches into one arrow::Table.
ARROW_ASSIGN_OR_RAISE(auto table,
    reader->ReadTable());

Compression

#include <arrow/ipc/options.h>

// Configure per-buffer compression on the write options.
ipc::IpcWriteOptions options = ipc::IpcWriteOptions::Defaults();
// Propagate a codec-creation failure (e.g. LZ4 support not compiled
// in) as a Status, consistent with the other examples, instead of
// aborting the whole process via ValueOrDie().
ARROW_ASSIGN_OR_RAISE(options.codec,
    util::Codec::Create(Compression::LZ4_FRAME));

// Create writer with compression enabled.
ARROW_ASSIGN_OR_RAISE(auto writer,
    ipc::MakeStreamWriter(output, schema, options));

ARROW_RETURN_NOT_OK(writer->WriteRecordBatch(*batch));
ARROW_RETURN_NOT_OK(writer->Close());

// Reader automatically detects compression from each message's
// metadata -- no read-side configuration is required.
ARROW_ASSIGN_OR_RAISE(auto reader,
    ipc::RecordBatchStreamReader::Open(input));

Memory-Mapped Reading

#include <arrow/io/file.h>

// Open file with memory mapping; subsequent reads reference the
// mapped pages directly rather than copying into heap buffers.
ARROW_ASSIGN_OR_RAISE(auto input,
    io::MemoryMappedFile::Open("data.feather", io::FileMode::READ));

ARROW_ASSIGN_OR_RAISE(auto reader,
    ipc::RecordBatchFileReader::Open(input));

// Read batches - data is memory-mapped, not copied
ARROW_ASSIGN_OR_RAISE(auto batch,
    reader->ReadRecordBatch(0));

// Access data directly from memory-mapped region.
// NOTE(review): the batch's buffers presumably remain valid only
// while `input` (the mapping) is alive -- keep it in scope while the
// data is in use; confirm against the Arrow lifetime guarantees.
auto column = batch->column(0);

Streaming Over Networks

#include <arrow/io/memory.h>

// Sender: serialize record batches into an in-memory buffer.
ARROW_ASSIGN_OR_RAISE(auto buffer_stream,
    io::BufferOutputStream::Create());

ARROW_ASSIGN_OR_RAISE(auto writer,
    ipc::MakeStreamWriter(buffer_stream, schema));

ARROW_RETURN_NOT_OK(writer->WriteRecordBatch(*batch));
ARROW_RETURN_NOT_OK(writer->Close());

// Finish() detaches the accumulated arrow::Buffer from the stream.
ARROW_ASSIGN_OR_RAISE(auto buffer,
    buffer_stream->Finish());

// Send buffer->data() over network...

// Receiver: wrap the received bytes in a BufferReader.
// BufferReader::FromBuffer returns std::unique_ptr<BufferReader>
// directly -- it is not a Result, so ARROW_ASSIGN_OR_RAISE does not
// apply; plain assignment is correct here.
auto buffer_reader = io::BufferReader::FromBuffer(buffer);

// Move the unique_ptr into Open(), which takes ownership of the
// input stream via shared_ptr.
ARROW_ASSIGN_OR_RAISE(auto reader,
    ipc::RecordBatchStreamReader::Open(std::move(buffer_reader)));

ARROW_ASSIGN_OR_RAISE(auto received_batch,
    reader->Next());

Dictionary Encoding

// Dictionary encoded arrays are automatically handled by the IPC
// writer: the dictionary values are written once, and each batch
// references them by index.
auto dict_type = dictionary(int32(), utf8());
auto field = arrow::field("category", dict_type);

// Create dictionary array: indices point into the dictionary values.
// NOTE(review): ArrayFromJSON is declared in Arrow's testing headers
// (arrow/testing/gtest_util.h) -- confirm it is available in this
// build, or construct the arrays with builders instead.
auto indices = ArrayFromJSON(int32(), "[0, 1, 0, 2, 1]");
auto dictionary = ArrayFromJSON(utf8(), "[\"A\", \"B\", \"C\"]");

// Combine indices and dictionary into a single DictionaryArray.
ARROW_ASSIGN_OR_RAISE(auto dict_array,
    DictionaryArray::FromArrays(dict_type, indices, dictionary));

// Write - dictionary is written once, then referenced
auto schema = arrow::schema({field});
auto batch = RecordBatch::Make(schema, 5, {dict_array});

ARROW_RETURN_NOT_OK(writer->WriteRecordBatch(*batch));

Custom Metadata

// Add schema metadata
auto metadata = key_value_metadata({
    {"source", "sensor_data"},
    {"version", "1.0"},
    {"timestamp", "2024-03-15T10:00:00Z"}
});

auto schema = arrow::schema(
    {field("value", float64())},
    metadata
);

// Add batch metadata
auto batch_metadata = key_value_metadata({
    {"batch_id", "123"},
    {"partition", "2024-03"}
});

ARROW_RETURN_NOT_OK(
    writer->WriteRecordBatch(*batch, batch_metadata));

// Read metadata
auto read_schema = reader->schema();
if (read_schema->metadata()) {
  std::cout << read_schema->metadata()->ToString() << std::endl;
}

Performance Tips

  1. Batch Size: Use 64K-1M rows per batch for optimal performance
  2. Memory Mapping: Use memory-mapped files for large datasets
  3. Compression: Enable compression for network I/O or storage
  4. Dictionary Encoding: Use for low-cardinality string columns
  5. Zero-Copy: Prefer file format for zero-copy reads
  6. Parallel I/O: Read multiple batches in parallel when possible

Build docs developers (and LLMs) love