Apache Arrow provides efficient I/O interfaces for reading and writing data in multiple formats including Parquet, CSV, IPC (Arrow format), and Feather.
Reading and Writing Parquet Files
Parquet is a columnar storage format that provides efficient compression and encoding schemes.
import pyarrow as pa
import pyarrow.parquet as pq
# Build a small in-memory table from column lists
records = {
    'id': [1, 2, 3, 4],
    'name': ['Alice', 'Bob', 'Charlie', 'David'],
    'score': [95.5, 87.2, 92.8, 78.9],
}
table = pa.table(records)
# Persist the table in Parquet format
pq.write_table(table, 'data.parquet')
# Load it back into memory
table = pq.read_table('data.parquet')
print(table)
# Projection: materialize only the requested columns
table = pq.read_table('data.parquet', columns=['name', 'score'])
# Row filtering goes through the dataset API
import pyarrow.dataset as ds
dataset = ds.dataset('data.parquet', format='parquet')
filtered = dataset.to_table(filter=ds.field('score') > 85)
#include <arrow/api.h>
#include <arrow/io/file.h>
#include <parquet/arrow/reader.h>
#include <parquet/arrow/writer.h>
// Write Parquet file
// Serialize `table` as a Parquet file at `path`.
// Returns a non-OK status if the file cannot be opened or written.
arrow::Status WriteParquet(const std::shared_ptr<arrow::Table>& table,
                           const std::string& path) {
  ARROW_ASSIGN_OR_RAISE(auto outfile,
                        arrow::io::FileOutputStream::Open(path));
  // Row groups hold 1024 rows each; tune chunk_size for your workload.
  ARROW_RETURN_NOT_OK(parquet::arrow::WriteTable(
      *table, arrow::default_memory_pool(), outfile, /*chunk_size=*/1024));
  return arrow::Status::OK();
}
// Read Parquet file
// Deserialize the Parquet file at `path` into an in-memory Arrow table.
arrow::Result<std::shared_ptr<arrow::Table>> ReadParquet(
    const std::string& path) {
  ARROW_ASSIGN_OR_RAISE(
      auto input,
      arrow::io::ReadableFile::Open(path, arrow::default_memory_pool()));
  // Wrap the raw file in a Parquet-aware reader.
  std::unique_ptr<parquet::arrow::FileReader> reader;
  ARROW_RETURN_NOT_OK(parquet::arrow::OpenFile(
      input, arrow::default_memory_pool(), &reader));
  std::shared_ptr<arrow::Table> result;
  ARROW_RETURN_NOT_OK(reader->ReadTable(&result));
  return result;
}
The Arrow IPC format provides zero-copy reads and is optimized for inter-process communication.
import pyarrow as pa
import pyarrow.ipc as ipc
# Declare the schema up front so every writer below can share it
schema = pa.schema([
    ('id', pa.int32()),
    ('value', pa.float64()),
])
batch = pa.record_batch(
    [[1, 2, 3, 4], [1.5, 2.5, 3.5, 4.5]],
    schema=schema,
)
# File format (a.k.a. Feather V2): supports random access
with pa.OSFile('data.arrow', 'wb') as sink:
    with ipc.new_file(sink, schema) as writer:
        writer.write_batch(batch)
# Read it back through a zero-copy memory map
with pa.memory_map('data.arrow', 'rb') as source:
    reader = ipc.open_file(source)
    table = reader.read_all()
    print(f"Read {reader.num_record_batches} batches")
# Stream format: sequential-only, no footer, no random access
with pa.OSFile('data.arrows', 'wb') as sink:
    with ipc.new_stream(sink, schema) as writer:
        writer.write_batch(batch)
        writer.write_batch(batch)  # a stream may carry any number of batches
#include <arrow/api.h>
#include <arrow/io/file.h>
#include <arrow/ipc/reader.h>
#include <arrow/ipc/writer.h>
// Write IPC file
// Write one record batch to an IPC (Feather V2) file at `path`.
arrow::Status WriteIPC(const std::shared_ptr<arrow::RecordBatch>& batch,
                       const std::string& path) {
  ARROW_ASSIGN_OR_RAISE(auto sink, arrow::io::FileOutputStream::Open(path));
  // The writer is bound to a schema; every batch written must match it.
  ARROW_ASSIGN_OR_RAISE(auto writer,
                        arrow::ipc::MakeFileWriter(sink, batch->schema()));
  ARROW_RETURN_NOT_OK(writer->WriteRecordBatch(*batch));
  // Close() emits the footer that makes random-access reads possible.
  ARROW_RETURN_NOT_OK(writer->Close());
  return arrow::Status::OK();
}
// Read IPC file
// Read all record batches of the IPC file at `path` into a single table.
arrow::Result<std::shared_ptr<arrow::Table>> ReadIPC(
    const std::string& path) {
  ARROW_ASSIGN_OR_RAISE(auto input, arrow::io::ReadableFile::Open(path));
  ARROW_ASSIGN_OR_RAISE(auto reader,
                        arrow::ipc::RecordBatchFileReader::Open(input));
  std::shared_ptr<arrow::Table> result;
  ARROW_RETURN_NOT_OK(reader->ReadAll(&result));
  return result;
}
The IPC file format supports random access, while the stream format is designed for sequential reading. Use files when you need to access specific batches, and streams when processing data sequentially.
Reading CSV Files
Arrow provides high-performance CSV reading with type inference and streaming capabilities.
import pyarrow.csv as csv
import pyarrow as pa
# One-shot read of the whole file with type inference
table = csv.read_csv('data.csv')
# Parsing, conversion, and reading are configured independently
parse_options = csv.ParseOptions(
    delimiter='|',
    quote_char='"',
    escape_char='\\',
)
convert_options = csv.ConvertOptions(
    column_types={'id': pa.int64(), 'timestamp': pa.timestamp('s')},
    strings_can_be_null=True,
)
read_options = csv.ReadOptions(
    skip_rows=1,
    column_names=['id', 'name', 'timestamp', 'value'],
)
table = csv.read_csv(
    'data.csv',
    parse_options=parse_options,
    convert_options=convert_options,
    read_options=read_options,
)
# Incremental reading for files larger than memory
with csv.open_csv('large_file.csv') as reader:
    for batch in reader:
        # Each iteration yields one RecordBatch to process
        print(f"Batch with {batch.num_rows} rows")
#include <arrow/api.h>
#include <arrow/csv/api.h>
#include <arrow/io/file.h>
// Parse the CSV file at `path` into a table using default options.
arrow::Result<std::shared_ptr<arrow::Table>> ReadCSV(
    const std::string& path) {
  ARROW_ASSIGN_OR_RAISE(auto input, arrow::io::ReadableFile::Open(path));
  // Defaults: ',' delimiter, first row treated as header, types inferred.
  const auto read_options = arrow::csv::ReadOptions::Defaults();
  const auto parse_options = arrow::csv::ParseOptions::Defaults();
  const auto convert_options = arrow::csv::ConvertOptions::Defaults();
  ARROW_ASSIGN_OR_RAISE(
      auto reader,
      arrow::csv::TableReader::Make(arrow::io::default_io_context(), input,
                                    read_options, parse_options,
                                    convert_options));
  // Reads and decodes the whole file.
  return reader->Read();
}
Memory-Mapped Files
Memory-mapped files provide efficient access to large files without loading them entirely into memory.
import pyarrow as pa
import pyarrow.ipc as ipc
# Memory-map an Arrow IPC file: the OS pages data in on demand, so record
# batches are accessed zero-copy without loading the whole file into memory.
with pa.memory_map('large_data.arrow', 'rb') as source:
    reader = ipc.open_file(source)
    # Random access: fetch individual batches without reading the rest
    batch_0 = reader.get_batch(0)
    batch_5 = reader.get_batch(5)
    # Or materialize everything as a table if needed
    table = reader.read_all()
# Write the table back out.  Reuse the schema carried by the table itself
# (the previous version referenced an undefined `schema` variable).
# Note: a *new* memory-mapped file cannot be created via pa.memory_map —
# a writable map needs a fixed size up front (pa.create_memory_map(path,
# size)) — so a plain OSFile is the right sink for growing output.
with pa.OSFile('output.arrow', 'wb') as sink:
    with ipc.new_file(sink, table.schema) as writer:
        writer.write_table(table)
#include <arrow/api.h>
#include <arrow/io/memory.h>
// Map an existing file at `path` into memory for read-only access.
arrow::Result<std::shared_ptr<arrow::io::MemoryMappedFile>>
OpenMemoryMapped(const std::string& path) {
  // Propagate the Result directly; no intermediate binding needed.
  return arrow::io::MemoryMappedFile::Open(path, arrow::io::FileMode::READ);
}
arrow::Result<std::shared_ptr<arrow::io::MemoryMappedFile>>
CreateMemoryMapped(const std::string& path, int64_t size) {
// Create for writing
ARROW_ASSIGN_OR_RAISE(auto mmap,
arrow::io::MemoryMappedFile::Create(path, size));
return mmap;
}
Memory-mapped files are limited by the process's virtual address space: on 32-bit systems, a file larger than the addressable range cannot be fully mapped. Use 64-bit systems when working with large files.
Buffered I/O
For better performance with many small reads or writes, use buffered streams.
import pyarrow as pa
# Buffered output stream.  Note: pa.BufferOutputStream() is an *in-memory*
# sink and does not wrap another stream, so the previous
# pa.BufferOutputStream(raw_sink) call fails.  To buffer writes to a file,
# pass buffer_size to pa.output_stream, which wraps the file in a
# BufferedOutputStream.
with pa.output_stream('data.bin', buffer_size=64 * 1024) as sink:
    # Many tiny writes are coalesced in the 64 KiB buffer
    for i in range(1000):
        sink.write(bytes([i % 256]))
    # Closing the `with` block flushes the buffer and closes the file
# Buffered input: slurp the file once, then do cheap in-memory random access
with pa.OSFile('data.bin', 'rb') as raw_source:
    buffered = pa.BufferReader(raw_source.read())
    buffered.seek(100)
    data = buffered.read(50)
#include <arrow/io/buffered.h>
#include <arrow/io/file.h>
// Create buffered input stream
arrow::Result<std::shared_ptr<arrow::io::BufferedInputStream>>
CreateBufferedInput(const std::string& path) {
ARROW_ASSIGN_OR_RAISE(auto file,
arrow::io::ReadableFile::Open(path));
// Wrap in buffered stream with 1MB buffer
return arrow::io::BufferedInputStream::Create(
1024 * 1024, // buffer size
arrow::default_memory_pool(),
file);
}
// Create buffered output stream
arrow::Result<std::shared_ptr<arrow::io::BufferedOutputStream>>
CreateBufferedOutput(const std::string& path) {
ARROW_ASSIGN_OR_RAISE(auto file,
arrow::io::FileOutputStream::Open(path));
// Wrap in buffered stream
return arrow::io::BufferedOutputStream::Create(
1024 * 1024, // buffer size
arrow::default_memory_pool(),
file);
}
Best Practices
-
Choose the right format: Use Parquet for long-term storage and analytics, IPC for inter-process communication, and CSV for compatibility.
-
Use compression: Enable compression for Parquet files to reduce storage size:
pq.write_table(table, 'data.parquet', compression='snappy')
-
Stream large files: For files that don’t fit in memory, use streaming readers:
with csv.open_csv('huge_file.csv') as reader:
for batch in reader:
process(batch) # Process in chunks
-
Use column selection: Only read columns you need to reduce I/O:
table = pq.read_table('data.parquet', columns=['col1', 'col2'])
-
Memory-map when appropriate: Use memory-mapped files for read-heavy workloads on local storage.