Arrays
Arrays are the fundamental one-dimensional data structure in PyArrow. They represent a contiguous sequence of values with a single data type.
Creating Arrays
import pyarrow as pa
# From Python list (the element type is inferred when none is given)
array = pa.array([1, 2, 3, 4, 5])
print(array.type) # int64
# With explicit type
array = pa.array([1, 2, 3], type=pa.int32())
array = pa.array(['a', 'b', 'c'], type=pa.string())
# With null values (each None becomes a null slot; the count is tracked)
array = pa.array([1, None, 3, None, 5])
print(f"Null count: {array.null_count}") # 2
# From NumPy array (the NumPy dtype determines the Arrow type)
import numpy as np
np_array = np.array([1.0, 2.0, 3.0])
arrow_array = pa.array(np_array)
Array Types
- Numeric Arrays
- String and Binary
- Temporal Arrays
- Nested Arrays
import pyarrow as pa
from decimal import Decimal
# Integer arrays (signed and unsigned fixed-width types)
int_array = pa.array([1, 2, 3], type=pa.int64())
uint_array = pa.array([1, 2, 3], type=pa.uint32())
# Floating point arrays
float_array = pa.array([1.5, 2.7, 3.9], type=pa.float64())
# Boolean arrays
bool_array = pa.array([True, False, True], type=pa.bool_())
# Decimal arrays for precise arithmetic: build from decimal.Decimal values,
# not binary floats, so the stored values are exact (10 digits, 2 after the point)
decimal_array = pa.array([Decimal('1.23'), Decimal('4.56')], type=pa.decimal128(10, 2))
import pyarrow as pa
# String arrays (UTF-8)
string_array = pa.array(['hello', 'world', 'arrow'])
print(string_array.type) # string
# Binary arrays (arbitrary bytes; no encoding is enforced)
binary_array = pa.array([b'bytes', b'data'], type=pa.binary())
# Large strings use 64-bit offsets, for columns holding > 2GB of data
large_string_array = pa.array(
['large', 'strings'],
type=pa.large_string()
)
import pyarrow as pa
from datetime import datetime, date, timedelta
# Timestamp arrays ('us' = microsecond resolution)
timestamps = pa.array(
[datetime(2024, 1, 1), datetime(2024, 1, 2)],
type=pa.timestamp('us')
)
# With timezone (timezone-aware timestamp type)
timestamps_tz = pa.array(
[datetime(2024, 1, 1), datetime(2024, 1, 2)],
type=pa.timestamp('us', tz='UTC')
)
# Date arrays (date32 stores dates with day resolution)
dates = pa.array(
[date(2024, 1, 1), date(2024, 1, 2)],
type=pa.date32()
)
# Duration arrays ('s' = second resolution)
durations = pa.array(
[timedelta(days=1), timedelta(hours=2)],
type=pa.duration('s')
)
import pyarrow as pa
# List arrays: variable-length lists sharing one element type
list_array = pa.array(
[[1, 2], [3, 4, 5], [6]],
type=pa.list_(pa.int64())
)
# Struct arrays (like records): a fixed set of named, typed fields per row
struct_array = pa.array(
[{'x': 1, 'y': 2.0}, {'x': 3, 'y': 4.0}],
type=pa.struct([('x', pa.int64()), ('y', pa.float64())])
)
# Map arrays: per-row key/value pairs with uniform key and value types
map_array = pa.array(
[{'a': 1, 'b': 2}, {'c': 3}],
type=pa.map_(pa.string(), pa.int64())
)
Array Operations
import pyarrow as pa
array = pa.array([1, 2, 3, 4, 5])
# Access elements: indexing returns an Arrow scalar, not a Python value
print(array[0]) # <pyarrow.Int64Scalar: 1>
print(array[0].as_py()) # 1 (convert to Python)
# Slicing (produces a view over the same data, not a copy)
subarray = array[1:4] # Elements 1, 2, 3
print(subarray.to_pylist()) # [2, 3, 4]
# Length and properties
print(len(array)) # 5
print(array.type) # int64
print(array.null_count) # 0
# Convert to Python list
python_list = array.to_pylist()
# Convert to NumPy array
import numpy as np
np_array = array.to_numpy()
Chunked Arrays
Chunked arrays are sequences of arrays with the same type:
import pyarrow as pa
# Create chunked array from multiple arrays (each list becomes one chunk)
chunked = pa.chunked_array([
[1, 2, 3],
[4, 5, 6],
[7, 8, 9]
])
print(f"Number of chunks: {len(chunked.chunks)}")
print(f"Total length: {len(chunked)}")
# Access individual chunks
for i, chunk in enumerate(chunked.chunks):
print(f"Chunk {i}: {chunk.to_pylist()}")
# Combine chunks into single array (concatenates the chunk data)
combined = chunked.combine_chunks()
print(combined.to_pylist()) # [1, 2, 3, 4, 5, 6, 7, 8, 9]
Chunked arrays are useful when data comes from multiple sources or when processing data in batches.
Tables
Tables are two-dimensional datasets with named columns, similar to pandas DataFrames or SQL tables.
Creating Tables
- From Dictionary
- From Arrays
- From Pandas
- From Record Batches
import pyarrow as pa
# Create table from Python dictionary; column types are inferred from the values
table = pa.table({
'id': [1, 2, 3],
'name': ['Alice', 'Bob', 'Charlie'],
'score': [95.5, 87.3, 92.1]
})
print(table)
# pyarrow.Table
# id: int64
# name: string
# score: double
import pyarrow as pa
# Create table from arrays with an explicit schema
schema = pa.schema([
('id', pa.int32()),
('name', pa.string()),
('score', pa.float64())
])
# Arrays are matched to schema fields by position
table = pa.table([
pa.array([1, 2, 3], type=pa.int32()),
pa.array(['Alice', 'Bob', 'Charlie']),
pa.array([95.5, 87.3, 92.1])
], schema=schema)
import pyarrow as pa
import pandas as pd
# Create pandas DataFrame
df = pd.DataFrame({
'id': [1, 2, 3],
'name': ['Alice', 'Bob', 'Charlie'],
'score': [95.5, 87.3, 92.1]
})
# Convert to Arrow table
table = pa.Table.from_pandas(df)
# Without pandas index (drops the index instead of carrying it along)
table = pa.Table.from_pandas(df, preserve_index=False)
import pyarrow as pa
# Create record batch (a single contiguous chunk of columnar data)
batch = pa.record_batch([
pa.array([1, 2, 3]),
pa.array(['a', 'b', 'c'])
], names=['id', 'value'])
# Create table from batches
table = pa.Table.from_batches([batch])
Table Operations
import pyarrow as pa
table = pa.table({
'id': [1, 2, 3, 4],
'name': ['Alice', 'Bob', 'Charlie', 'David'],
'score': [95.5, 87.3, 92.1, 88.9]
})
# Get column names
print(table.column_names) # ['id', 'name', 'score']
# Get schema
print(table.schema)
# Number of rows and columns
print(f"Rows: {table.num_rows}, Columns: {table.num_columns}")
# Access columns (returns the column as a chunked array)
id_column = table['id']
name_column = table.column('name') # Alternative syntax
# Access by index
first_column = table.column(0)
# Get multiple columns (select returns a new table with just those columns)
subtable = table.select(['name', 'score'])
Modifying Tables
import pyarrow as pa
table = pa.table({
'id': [1, 2, 3],
'name': ['Alice', 'Bob', 'Charlie']
})
# Add a column (appended after the existing columns)
new_table = table.append_column(
'score',
pa.array([95, 87, 92])
)
# Insert a column at a specific position
new_table = table.add_column(
0, # Insert at position 0
'index',
pa.array([0, 1, 2])
)
# Remove column
smaller_table = table.remove_column(1) # Remove column at index 1
# Rename columns (one new name per existing column, in order)
renamed_table = table.rename_columns(['user_id', 'user_name'])
# Replace column (by index, providing the new name and values)
updated_table = table.set_column(
1,
'name',
pa.array(['Alice Smith', 'Bob Jones', 'Charlie Brown'])
)
Tables are immutable. Operations like append_column return a new table rather than modifying the existing one.
Filtering and Slicing
import pyarrow as pa
import pyarrow.compute as pc

table = pa.table({
    'id': [1, 2, 3, 4, 5],
    'score': [95, 87, 92, 88, 91]
})
# Slice rows
sliced = table.slice(0, 3) # First 3 rows
sliced = table.slice(2) # From row 2 to end
# Filter using a boolean mask produced by a compute function
mask = pc.greater(table['score'], 90)
filtered = table.filter(mask)
# Filter with a combined expression (pc.and_ joins two boolean masks);
# pc is already imported above, so no second import is needed
filtered = table.filter(
    pc.and_(
        pc.greater_equal(table['score'], 85),
        pc.less(table['score'], 95)
    )
)
Sorting Tables
import pyarrow as pa
import pyarrow.compute as pc

table = pa.table({
    'name': ['Charlie', 'Alice', 'Bob'],
    'age': [30, 25, 35],
    'score': [92, 95, 87]
})
# Sort by single column
sorted_table = table.sort_by([('score', 'descending')])
# Sort by multiple columns
sorted_table = table.sort_by([
    ('age', 'ascending'),
    ('score', 'descending')
])
# Using take and sort_indices for more control:
# sort_keys is a sequence of (column_name, order) pairs, not a bare string
indices = pc.sort_indices(table, sort_keys=[('score', 'descending')])
sorted_table = table.take(indices)
Schemas
Schemas define the structure and metadata of tables:
import pyarrow as pa
# Create schema; pa.field also controls per-field nullability
schema = pa.schema([
pa.field('id', pa.int32(), nullable=False),
pa.field('name', pa.string()),
pa.field('timestamp', pa.timestamp('ms', tz='UTC')),
pa.field('tags', pa.list_(pa.string())),
])
print(schema)
# Add metadata to schema (returns a new schema; the original is unchanged)
schema_with_meta = schema.with_metadata({
'source': 'user_database',
'version': '1.0'
})
# Access field information
for field in schema:
print(f"{field.name}: {field.type}, nullable={field.nullable}")
# Get specific field
id_field = schema.field('id')
print(id_field.metadata) # None - no field-level metadata was attached
# Compare schemas
schema2 = pa.schema([('id', pa.int32()), ('name', pa.string())])
print(schema.equals(schema2)) # False
Schema Evolution
import pyarrow as pa
schema = pa.schema([
('id', pa.int32()),
('name', pa.string())
])
# Add field (returns a new schema with the field appended)
new_schema = schema.append(pa.field('age', pa.int32()))
# Insert field at position
new_schema = schema.insert(1, pa.field('email', pa.string()))
# Remove field
smaller_schema = schema.remove(1)
# Unify multiple schemas (the result contains every field seen in any input)
schemas = [
pa.schema([('a', pa.int32()), ('b', pa.string())]),
pa.schema([('a', pa.int32()), ('c', pa.float64())])
]
unified = pa.unify_schemas(schemas)
print(unified) # Contains fields a, b, c
Record Batches
Record batches are like tables but represent a single contiguous block of data:
import pyarrow as pa
# Create record batch
batch = pa.record_batch([
pa.array([1, 2, 3]),
pa.array(['a', 'b', 'c'])
], names=['id', 'value'])
print(f"Rows: {batch.num_rows}")
print(f"Schema: {batch.schema}")
# Access columns
id_col = batch['id']
# Convert to table
table = pa.Table.from_batches([batch])
# Convert table to batches, capping each batch at 2 rows
table = pa.table({'x': [1, 2, 3], 'y': ['a', 'b', 'c']})
batches = table.to_batches(max_chunksize=2)
for i, batch in enumerate(batches):
print(f"Batch {i}: {batch.num_rows} rows")
Concatenation
- Concatenate Arrays
- Concatenate Tables
- Concatenate Batches
import pyarrow as pa
arr1 = pa.array([1, 2, 3])
arr2 = pa.array([4, 5, 6])
# Concatenate arrays (the inputs must share the same type)
result = pa.concat_arrays([arr1, arr2])
print(result.to_pylist()) # [1, 2, 3, 4, 5, 6]
import pyarrow as pa
table1 = pa.table({'a': [1, 2], 'b': ['x', 'y']})
table2 = pa.table({'a': [3, 4], 'b': ['z', 'w']})
# Concatenate tables (must have same schema);
# the result's columns are chunked, referencing the inputs' data
result = pa.concat_tables([table1, table2])
print(result.num_rows) # 4
import pyarrow as pa
# Plain Python lists are converted to arrays automatically
batch1 = pa.record_batch([[1, 2]], names=['a'])
batch2 = pa.record_batch([[3, 4]], names=['a'])
# Concatenate record batches into table
table = pa.Table.from_batches([batch1, batch2])
print(table.num_rows) # 4
Memory and Performance
Zero-Copy Operations
import pyarrow as pa
import numpy as np
# Zero-copy from NumPy (when possible)
np_array = np.array([1, 2, 3, 4, 5], dtype='int64')
arrow_array = pa.array(np_array, type=pa.int64())
# Zero-copy to NumPy (when contiguous)
arrow_array = pa.array([1, 2, 3, 4, 5])
# zero_copy_only=True raises instead of silently copying when a copy would be required
np_array = arrow_array.to_numpy(zero_copy_only=True)
Memory Usage
import pyarrow as pa
# One million rows to make the sizes meaningful
table = pa.table({
'a': range(1000000),
'b': ['text'] * 1000000
})
# Get memory usage
print(f"Table size: {table.nbytes} bytes")
print(f"Total buffer size: {table.get_total_buffer_size()} bytes")
# Per-column memory
for name in table.column_names:
col = table[name]
print(f"{name}: {col.nbytes} bytes")
Efficient Data Transfer
import pyarrow as pa
table = pa.table({'a': [1, 2, 3]})
# Serialize to bytes (for IPC/network transfer) via the Arrow IPC stream format
sink = pa.BufferOutputStream()
with pa.ipc.RecordBatchStreamWriter(sink, table.schema) as writer:
writer.write_table(table)
buf = sink.getvalue()
# Deserialize the buffer back into a table
reader = pa.ipc.RecordBatchStreamReader(buf)
reconstructed = reader.read_all()
Next Steps
- Compute Functions - Perform operations on arrays and tables
- Dataset API - Work with multi-file datasets
- Parquet Files - Read and write Parquet format