Memory Pools
Memory pools manage allocation and deallocation of memory buffers in Arrow. All allocations go through a memory pool, which allows tracking and controlling memory usage.
- Python
- C++
import pyarrow as pa

# Get the default memory pool (process-wide singleton used by all
# allocations that do not specify a pool explicitly).
pool = pa.default_memory_pool()
print(f"Backend: {pool.backend_name}")
print(f"Bytes allocated: {pool.bytes_allocated()}")
print(f"Max memory: {pool.max_memory()}")

# Create a custom memory pool with tracking
# Available backends: 'system', 'jemalloc', 'mimalloc'
# (which ones are present depends on how pyarrow was built).
if 'jemalloc' in pa.supported_memory_backends():
    pool = pa.jemalloc_memory_pool()
else:
    pool = pa.system_memory_pool()

# Allocate a buffer with a specific pool; the pool's counters track it.
buf = pa.allocate_buffer(1024 * 1024, memory_pool=pool) # 1 MB
print(f"Allocated: {pool.bytes_allocated()} bytes")

# Check available backends
print("Supported backends:", pa.supported_memory_backends())
#include <cstdint>
#include <iostream>

#include <arrow/buffer.h>
#include <arrow/memory_pool.h>
// Demonstrates querying and allocating from the global default pool:
// prints the pool's statistics, allocates 1 MB, then frees it.
// Fix vs. original: an allocation failure is now reported instead of
// being silently ignored.
void MemoryPoolExample() {
  // The default pool is a process-wide singleton; never delete it.
  arrow::MemoryPool* pool = arrow::default_memory_pool();
  std::cout << "Bytes allocated: "
            << pool->bytes_allocated() << std::endl;
  std::cout << "Max memory: "
            << pool->max_memory() << std::endl;

  // Raw allocation; must be paired with pool->Free() below.
  uint8_t* buffer = nullptr;
  const int64_t size = 1024 * 1024; // 1 MB
  arrow::Status status = pool->Allocate(size, &buffer);
  if (status.ok()) {
    std::cout << "Allocated " << size << " bytes" << std::endl;
    // Use the buffer...
    // Free with the same size that was allocated.
    pool->Free(buffer, size);
  } else {
    // Surface the failure instead of swallowing the Status.
    std::cerr << "Allocation failed: " << status.ToString() << std::endl;
  }
}
// Create an owning instance of the default memory-pool backend.
// NOTE(review): MemoryPool::CreateDefault() creates the *default*
// backend (jemalloc/mimalloc/system, depending on the build), not
// necessarily the system allocator. For the global system pool use
// arrow::system_memory_pool() instead -- confirm which was intended.
std::unique_ptr<arrow::MemoryPool> CreateSystemPool() {
  return arrow::MemoryPool::CreateDefault();
}
Always allocate buffers with alignment to 64 bytes for optimal SIMD performance. Arrow’s memory pools handle this automatically.
Buffers
Buffers are the fundamental memory container in Arrow, representing contiguous memory regions.
- Python
- C++
import pyarrow as pa
import numpy as np

# Create a buffer from Python bytes (zero-copy; bytes are immutable).
data = b"Hello, Arrow!"
buf = pa.py_buffer(data)
print(f"Buffer size: {buf.size} bytes")
print(f"Is mutable: {buf.is_mutable}")

# Create from NumPy array (zero-copy, via the buffer protocol)
arr = np.array([1, 2, 3, 4, 5], dtype=np.int32)
buf = pa.py_buffer(arr)
print(f"Buffer size: {buf.size} bytes")  # 5 elements * 4 bytes

# Allocate a new mutable buffer from the default memory pool.
buf = pa.allocate_buffer(1024) # 1 KB
print(f"Allocated {buf.size} bytes")

# Create a resizable buffer by writing to an in-memory output stream.
buf = pa.BufferOutputStream()
buf.write(b"data")
buf.write(b" more data")
result = buf.getvalue() # Get the final buffer
print(f"Buffer contents: {result.to_pybytes()}")

# Slice a buffer (zero-copy view that keeps the parent alive)
original = pa.py_buffer(b"0123456789")
sliced = original.slice(2, 5) # offset=2, length=5
print(f"Sliced: {sliced.to_pybytes()}") # b"23456"
#include <arrow/buffer.h>
#include <arrow/memory_pool.h>
void BufferExample() {
// Create immutable buffer from string
std::string data = "Hello, Arrow!";
auto buf = arrow::Buffer::FromString(data);
std::cout << "Size: " << buf->size() << std::endl;
std::cout << "Is mutable: " << buf->is_mutable() << std::endl;
// Allocate a mutable buffer
int64_t size = 1024;
arrow::Result<std::unique_ptr<arrow::Buffer>> result =
arrow::AllocateBuffer(size, arrow::default_memory_pool());
if (result.ok()) {
std::unique_ptr<arrow::Buffer> buffer = std::move(result).ValueOrDie();
// Get mutable data pointer
uint8_t* mutable_data = buffer->mutable_data();
// Write data
std::memcpy(mutable_data, "data", 4);
}
// Slice a buffer (zero-copy view)
auto parent = arrow::Buffer::FromString("0123456789");
auto sliced = arrow::SliceBuffer(parent, 2, 5); // offset, length
// sliced contains "23456"
// Create buffer from existing memory (non-owning)
uint8_t existing_data[100];
auto view = std::make_shared<arrow::Buffer>(
existing_data, sizeof(existing_data));
}
Buffer slicing in Arrow is a zero-copy operation. The sliced buffer maintains a reference to the parent buffer, ensuring the underlying memory remains valid.
Zero-Copy Buffer Sharing
One of Arrow’s key features is the ability to share buffers between processes and language runtimes without copying data.
- Python
- C++
import pyarrow as pa
import numpy as np

# NumPy to Arrow (zero-copy when possible)
np_array = np.array([1, 2, 3, 4, 5], dtype=np.int64)
arrow_array = pa.array(np_array)
# No data copy - Arrow wraps the NumPy buffer

# Arrow to NumPy (zero-copy)
arrow_array = pa.array([1, 2, 3, 4, 5])
np_view = arrow_array.to_numpy(zero_copy_only=True)
# Raises error if zero-copy is not possible

# Pandas to Arrow
import pandas as pd
df = pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6]})
table = pa.Table.from_pandas(df, preserve_index=False)

# Arrow to Pandas (zero-copy for some types)
df_back = table.to_pandas(zero_copy_only=False,
                          self_destruct=True)
# self_destruct transfers ownership when possible

# For inter-process zero-copy sharing, write Arrow IPC data to a
# memory-mapped file and open it with pa.memory_map() in the reader.
# (pyarrow.plasma was deprecated in Arrow 10.0 and removed in 12.0;
# do not import it in new code.)
#include <arrow/api.h>
#include <arrow/buffer.h>
// Create a second Array that shares the source's buffers (zero-copy).
// Both arrays point at the same memory; Arrow's shared_ptr-based
// reference counting keeps the buffers alive.
//
// Fixes vs. original: all buffers are shared (not just the first two,
// which only covered primitive layouts), and the source's offset is
// preserved so a sliced input is not misread from position 0.
std::shared_ptr<arrow::Array> ShareBuffer(
    const std::shared_ptr<arrow::Array>& source) {
  // ArrayData holds the type, length, buffers and slice offset.
  const auto& data = source->data();
  auto new_data = arrow::ArrayData::Make(
      source->type(),
      source->length(),
      data->buffers,       // Share every buffer, for any type layout
      source->null_count(),
      data->offset         // Keep the source's view window
  );
  return arrow::MakeArray(new_data);
}
// Slice array (zero-copy)
// Returns a view of `array` covering [offset, offset + length).
// No data is copied: the slice shares the parent's buffers and keeps
// them alive via reference counting.
std::shared_ptr<arrow::Array> SliceArray(
    const std::shared_ptr<arrow::Array>& array,
    int64_t offset, int64_t length) {
  // Creates a view without copying data
  return array->Slice(offset, length);
}
Memory Management Best Practices
1. Reuse Buffers
Reusing buffers reduces allocation overhead:
- Python
- C++
import pyarrow as pa

# NOTE(review): typed builders such as pa.Int64Builder (and their
# reset()/finish() methods) are part of the C++ API only; they are not
# exposed in pyarrow's public Python API. The idiomatic Python
# equivalent is to accumulate plain values per chunk and convert once
# with pa.array(), which does a single allocation per chunk.
for chunk in data_chunks:
    values = list(chunk)
    array = pa.array(values, type=pa.int64())
    process(array)
#include <arrow/array/builder_primitive.h>
// Builds one Int64 array per input chunk while reusing the builder's
// internal buffers between iterations: Reset() clears the length but
// keeps the allocated capacity, avoiding reallocation per chunk.
// NOTE(review): ARROW_CHECK_OK aborts on failure and needs
// arrow/util/logging.h; `data_chunks` and ProcessArray() are assumed
// to be supplied by the surrounding program -- confirm.
void ReuseBuilder() {
  arrow::Int64Builder builder;
  for (const auto& chunk : data_chunks) {
    // Reset reuses internal buffers
    builder.Reset();
    for (int64_t value : chunk) {
      ARROW_CHECK_OK(builder.Append(value));
    }
    std::shared_ptr<arrow::Array> array;
    ARROW_CHECK_OK(builder.Finish(&array));
    ProcessArray(array);
  }
}
2. Monitor Memory Usage
- Python
- C++
import pyarrow as pa

pool = pa.default_memory_pool()

# Snapshot the pool's live allocation count before the operation.
before = pool.bytes_allocated()

# Perform memory-intensive operation
large_array = pa.array(range(10_000_000))

# Snapshot again; the delta is what the operation still holds live.
after = pool.bytes_allocated()
print(f"Operation used {after - before} bytes")
# max_memory() is the pool's lifetime high-water mark.
print(f"Peak memory: {pool.max_memory()} bytes")
#include <arrow/memory_pool.h>
void MonitorMemory() {
auto* pool = arrow::default_memory_pool();
int64_t before = pool->bytes_allocated();
// Memory-intensive operation
PerformOperation();
int64_t after = pool->bytes_allocated();
std::cout << "Used: " << (after - before) << " bytes" << std::endl;
std::cout << "Peak: " << pool->max_memory() << " bytes" << std::endl;
}
3. Avoid Unnecessary Copies
Be careful with operations that might copy data:
- Converting between incompatible types
- Modifying immutable arrays (creates new array)
- Concatenating many small arrays (prefer builders)
# Bad: builds 1000 one-element arrays, then copies them all into the
# concatenated result.
result = pa.concat_arrays([pa.array([i]) for i in range(1000)])

# Good: materialize the values once and convert in a single call.
# NOTE(review): pa.Int64Builder is C++-only and not exposed in
# pyarrow's public Python API; pa.array() over a sequence is the
# Python equivalent of "build once".
result = pa.array(list(range(1000)), type=pa.int64())
4. Use Appropriate Pool for Use Case
- Python
- C++
import pyarrow as pa

# For long-running applications, jemalloc can be more efficient
if 'jemalloc' in pa.supported_memory_backends():
    pa.set_memory_pool(pa.jemalloc_memory_pool())

# For debugging memory issues: wrap the default pool in a logging
# proxy that records every allocation and deallocation.
pool = pa.logging_memory_pool(pa.default_memory_pool())
pa.set_memory_pool(pool)
// System allocator (default)
auto system_pool = arrow::system_memory_pool();
// For debugging
auto logging_pool = arrow::LoggingMemoryPool::Make(
arrow::default_memory_pool(),
arrow::LoggingMemoryPool::LogType::REALTIME
);
5. Handle Large Datasets
For datasets larger than available memory, process the data in streaming batches:
import pyarrow as pa
import pyarrow.parquet as pq

# Stream the file in bounded-size record batches rather than loading
# the whole table into memory at once.
with pq.ParquetFile('large_file.parquet') as pf:
    for batch in pf.iter_batches(batch_size=10000):
        # Process batch
        result = process(batch)
        # Batch is freed after this iteration
        del batch
Arrow uses reference counting for memory management. Buffers are automatically freed when no more references exist. In Python, this integrates with Python’s garbage collector.