CSV Reading and Writing with Apache Arrow (C++)
CSV (Comma-Separated Values) is a widely-used text format for tabular data. Apache Arrow provides efficient CSV readers with automatic type inference and customizable parsing options.

Reading CSV Files

#include <arrow/api.h>
#include <arrow/csv/api.h>
#include <arrow/io/api.h>

// Read an entire CSV file into a single arrow::Table.
//
// All reader options are left at their defaults (comma delimiter,
// header row, automatic type inference).
arrow::Result<std::shared_ptr<arrow::Table>> ReadCSVFile(
    const std::string& filename) {
  // Open the source file for reading.
  ARROW_ASSIGN_OR_RAISE(auto file, arrow::io::ReadableFile::Open(filename));

  // Build a TableReader with default read/parse/convert options.
  ARROW_ASSIGN_OR_RAISE(
      auto table_reader,
      arrow::csv::TableReader::Make(arrow::io::default_io_context(), file,
                                    arrow::csv::ReadOptions::Defaults(),
                                    arrow::csv::ParseOptions::Defaults(),
                                    arrow::csv::ConvertOptions::Defaults()));

  // Materialize the whole file as one Table.
  return table_reader->Read();
}

// Read a tab-separated file into an arrow::Table.
//
// Identical to a default CSV read except the field delimiter is '\t'.
arrow::Result<std::shared_ptr<arrow::Table>> ReadTSVFile(
    const std::string& filename) {
  ARROW_ASSIGN_OR_RAISE(auto file, arrow::io::ReadableFile::Open(filename));

  // Only the delimiter differs from the defaults.
  auto tsv_parse_options = arrow::csv::ParseOptions::Defaults();
  tsv_parse_options.delimiter = '\t';  // Tab delimiter

  ARROW_ASSIGN_OR_RAISE(
      auto table_reader,
      arrow::csv::TableReader::Make(arrow::io::default_io_context(), file,
                                    arrow::csv::ReadOptions::Defaults(),
                                    tsv_parse_options,
                                    arrow::csv::ConvertOptions::Defaults()));

  return table_reader->Read();
}

// Stream a CSV file batch-by-batch without loading it all into memory.
//
// Fix: the previous loop condition `reader->ReadNext(&batch).ok()` silently
// swallowed read errors — a failing ReadNext simply ended the loop and the
// function still returned OK. Errors are now propagated to the caller.
arrow::Status StreamCSVFile(const std::string& filename) {
  ARROW_ASSIGN_OR_RAISE(
      auto input,
      arrow::io::ReadableFile::Open(filename));

  auto read_options = arrow::csv::ReadOptions::Defaults();
  auto parse_options = arrow::csv::ParseOptions::Defaults();
  auto convert_options = arrow::csv::ConvertOptions::Defaults();

  // Create a streaming reader
  ARROW_ASSIGN_OR_RAISE(
      auto reader,
      arrow::csv::StreamingReader::Make(
          arrow::io::default_io_context(),
          input,
          read_options,
          parse_options,
          convert_options));

  // Read batches until end-of-stream (ReadNext sets batch to nullptr at EOF).
  std::shared_ptr<arrow::RecordBatch> batch;
  while (true) {
    ARROW_RETURN_NOT_OK(reader->ReadNext(&batch));  // Propagate read errors
    if (batch == nullptr) break;                    // End of stream
    std::cout << "Read batch with " << batch->num_rows() << " rows" << std::endl;
  }

  return arrow::Status::OK();
}

Writing CSV Files

#include <arrow/api.h>
#include <arrow/csv/api.h>
#include <arrow/io/api.h>

// Write an arrow::Table to `filename` as CSV using default write options.
arrow::Status WriteCSVFile(
    const std::string& filename,
    const std::shared_ptr<arrow::Table>& table) {
  // Create (or truncate) the destination file.
  ARROW_ASSIGN_OR_RAISE(auto sink,
                        arrow::io::FileOutputStream::Open(filename));

  // Serialize the whole table in one call with default options.
  ARROW_RETURN_NOT_OK(arrow::csv::WriteCSV(
      *table, arrow::csv::WriteOptions::Defaults(), sink.get()));

  // Flush and close the underlying file, propagating any I/O failure.
  return sink->Close();
}

// Write a Table to CSV with explicitly configured write options.
arrow::Status WriteCSVWithOptions(
    const std::string& filename,
    const std::shared_ptr<arrow::Table>& table) {
  // Open (or create/truncate) the destination file for writing.
  ARROW_ASSIGN_OR_RAISE(
      auto output,
      arrow::io::FileOutputStream::Open(filename));

  // Start from the defaults and override only what differs.
  auto write_options = arrow::csv::WriteOptions::Defaults();
  write_options.include_header = true;  // Emit column names as the first row
  write_options.delimiter = ',';        // Field separator

  // Serialize the entire table to the open stream.
  ARROW_RETURN_NOT_OK(
      arrow::csv::WriteCSV(*table, write_options, output.get()));

  // Close flushes buffered bytes; propagate any I/O failure.
  ARROW_RETURN_NOT_OK(output->Close());
  return arrow::Status::OK();
}

// Incrementally write record batches to a CSV file, one batch at a time,
// without materializing the full table in memory.
arrow::Status WriteCSVIncremental(
    const std::string& filename,
    const std::shared_ptr<arrow::Schema>& schema) {
  ARROW_ASSIGN_OR_RAISE(
      auto output,
      arrow::io::FileOutputStream::Open(filename));

  auto write_options = arrow::csv::WriteOptions::Defaults();

  // Create a CSV writer that streams batches to `output`; the header
  // (when enabled by the options) is derived from `schema`.
  ARROW_ASSIGN_OR_RAISE(
      auto writer,
      arrow::csv::MakeCSVWriter(output, schema, write_options));

  // Write batches in order; rows are appended as they are written.
  // NOTE(review): CreateRecordBatch is assumed to be a user-supplied helper
  // (defined elsewhere) producing batches matching `schema` — confirm.
  for (int i = 0; i < 10; ++i) {
    ARROW_ASSIGN_OR_RAISE(auto batch, CreateRecordBatch(schema));
    ARROW_RETURN_NOT_OK(writer->WriteRecordBatch(*batch));
  }

  // Close the writer to flush any buffered output.
  ARROW_RETURN_NOT_OK(writer->Close());
  return arrow::Status::OK();
}

Parse Options

Delimiters and Quoting

// Start from the defaults and override individual parsing knobs.
auto parse_options = arrow::csv::ParseOptions::Defaults();

// Field separator (default is ',')
parse_options.delimiter = ';';

// Recognize fields wrapped in quote_char
parse_options.quoting = true;
parse_options.quote_char = '"';
parse_options.double_quote = true;  // A doubled quote ("") inside a quoted field is a literal quote

// Treat escape_char as an escape character inside fields
parse_options.escaping = true;
parse_options.escape_char = '\\';

// Allow embedded newlines inside quoted values
parse_options.newlines_in_values = true;

// Skip lines that contain no data at all
parse_options.ignore_empty_lines = true;

Null Value Handling

auto convert_options = arrow::csv::ConvertOptions::Defaults();

// Strings that are converted to null when encountered in the data
convert_options.null_values = {"NA", "NULL", "", "N/A"};

// Also apply the null spellings to columns converted as strings
convert_options.strings_can_be_null = true;

// Apply the null spellings even when the value was quoted in the CSV
convert_options.quoted_strings_can_be_null = true;

Type Inference and Conversion

Automatic Type Inference

By default, Arrow infers column types from the CSV data:
# Arrow automatically infers types
table = csv.read_csv('data.csv')
print(table.schema)
# Output: id: int64, name: string, value: double
Type Inference: Arrow samples the first block of data to infer types. For large files with heterogeneous data at the start, you may want to increase block_size or specify types explicitly.

Explicit Type Specification

auto convert_options = arrow::csv::ConvertOptions::Defaults();

// Pin the type of specific columns by name; columns not listed here
// still go through automatic type inference.
convert_options.column_types = {
    {"id", arrow::int64()},
    {"name", arrow::utf8()},
    {"date", arrow::timestamp(arrow::TimeUnit::SECOND)},  // second resolution
    {"value", arrow::float64()}
};

Boolean Values

auto convert_options = arrow::csv::ConvertOptions::Defaults();

// Spellings accepted as boolean true/false during conversion
convert_options.true_values = {"true", "True", "TRUE", "1", "yes"};
convert_options.false_values = {"false", "False", "FALSE", "0", "no"};

Read Options

Multi-threading

auto read_options = arrow::csv::ReadOptions::Defaults();

// Parse blocks of input on multiple threads
read_options.use_threads = true;

// Size of each block read from the file (granularity of parallel work)
read_options.block_size = 1 << 20;  // 1 MB
Block Size: Larger block sizes reduce overhead but require more memory. Smaller blocks enable better parallelism for multi-core systems. Default is 1 MB.

Column Names

auto read_options = arrow::csv::ReadOptions::Defaults();

// Read CSV without a header row by supplying column names explicitly
read_options.column_names = {"id", "name", "value"};

// Or let Arrow generate names for a headerless file
// NOTE(review): column_names and autogenerate_column_names are alternatives —
// set one or the other, not both; verify against the Arrow docs.
read_options.autogenerate_column_names = true;  // f0, f1, f2...

Skipping Rows

# Skip rows both before and after the header line.
read_options = csv.ReadOptions(
    skip_rows=10,                # Skip first 10 rows (before column names are read)
    skip_rows_after_names=5     # Skip 5 data rows after the header
)

table = csv.read_csv('data.csv', read_options=read_options)

Performance Tips

Multi-threaded Parsing: Enable use_threads for faster parsing on multi-core systems:
read_options = csv.ReadOptions(use_threads=True)
table = csv.read_csv('large_file.csv', read_options=read_options)
Streaming for Large Files: Use streaming readers to process large files without loading everything into memory:
reader = csv.open_csv('huge_file.csv')
for batch in reader:
    process_batch(batch)
Column Selection: Read only needed columns to reduce memory usage:
convert_options = csv.ConvertOptions(
    include_columns=['col1', 'col2']
)
table = csv.read_csv('data.csv', convert_options=convert_options)
Block Size Tuning: Adjust block_size based on your file size and system memory:
  • Smaller files: Use default 1 MB
  • Large files with many cores: Increase to 4-8 MB
  • Memory-constrained systems: Decrease to 256-512 KB

Common Use Cases

Reading Files with Different Encodings

# UTF-8 input is handled natively (the default)
table = csv.read_csv('data.csv')

# For other encodings, decode with Python first, then re-encode as UTF-8
import codecs
with codecs.open('latin1.csv', 'r', encoding='latin-1') as f:
    content = f.read()

# Then read from an in-memory UTF-8 buffer instead of the file
import io
table = csv.read_csv(io.BytesIO(content.encode('utf-8')))

Handling Invalid Rows

auto parse_options = arrow::csv::ParseOptions::Defaults();

// Set a custom invalid row handler, invoked for each structurally invalid
// row (e.g. wrong number of columns).
//
// Fix: Arrow's ParseOptions::invalid_row_handler returns
// arrow::csv::InvalidRowResult, not arrow::Status. Return
// InvalidRowResult::Skip to skip the row and continue, or
// InvalidRowResult::Error to abort the read with an error.
parse_options.invalid_row_handler =
    [](const arrow::csv::InvalidRow& row) -> arrow::csv::InvalidRowResult {
  std::cerr << "Invalid row at line " << row.number
            << ": " << row.text << std::endl;
  return arrow::csv::InvalidRowResult::Skip;  // Continue processing
};

Dictionary Encoding for String Columns

# Automatically dictionary-encode low-cardinality string columns
convert_options = csv.ConvertOptions(
    auto_dict_encode=True,
    auto_dict_max_cardinality=1000  # Dict-encode columns with up to 1000 unique values
)

table = csv.read_csv('data.csv', convert_options=convert_options)
Dictionary Encoding: Automatically converts string columns with low cardinality to dictionary type, saving memory and improving performance for repetitive string data.

Build docs developers (and LLMs) love