Skip to main content
Apache Arrow provides a comprehensive type system that supports primitive types, nested types, temporal types, and custom extension types.

Primitive Types

Arrow supports all common primitive data types with explicit bit widths.
import pyarrow as pa

# Signed integers at every supported bit width.
i8 = pa.int8()
i16 = pa.int16()
i32 = pa.int32()
i64 = pa.int64()

# Unsigned counterparts.
u8 = pa.uint8()
u16 = pa.uint16()
u32 = pa.uint32()
u64 = pa.uint64()

# IEEE-754 floating point variants.
f16 = pa.float16()  # half precision
f32 = pa.float32()
f64 = pa.float64()

# One-bit boolean (trailing underscore avoids shadowing Python's `bool`).
flag_type = pa.bool_()

# Build arrays with an explicit type instead of relying on inference.
ints = pa.array([1, 2, 3, 4], type=pa.int32())
floats = pa.array([1.5, 2.5, 3.5], type=pa.float64())

# Inspect the resulting type via the array's `.type` attribute.
print(f"Type: {ints.type}")
print(f"Is integer: {pa.types.is_integer(ints.type)}")
print(f"Bit width: {ints.type.bit_width}")

String and Binary Types

Arrow distinguishes between binary data and UTF-8 encoded strings.
import pyarrow as pa

# UTF-8 text, variable length (pa.utf8() is an alias for pa.string()).
text = pa.array(['hello', 'world', 'arrow'], type=pa.string())

# Arbitrary byte sequences, variable length.
blobs = pa.array([b'\x00\x01', b'\xff\xfe'], type=pa.binary())

# 64-bit-offset variants for columns exceeding 2GB of data.
big_text_type = pa.large_string()
big_blob_type = pa.large_binary()

# Passing a length to pa.binary() produces a fixed-size binary type.
five_byte_type = pa.binary(5)  # every value is exactly 5 bytes
fixed = pa.array([b'12345', b'abcde'], type=five_byte_type)

# View types: newer, more efficient layouts for strings and binary.
text_view_type = pa.string_view()
blob_view_type = pa.binary_view()

print(f"String type: {text.type}")
print(f"Is string: {pa.types.is_string(text.type)}")
print(f"Is binary: {pa.types.is_binary(blobs.type)}")
Arrow’s string type always expects UTF-8 encoding. For binary data that may not be valid UTF-8, use the binary type instead.

Temporal Types

Arrow provides rich support for dates, times, and timestamps with timezone information.
import pyarrow as pa
from datetime import datetime, date, timedelta

# Dates: date32 counts days, date64 counts milliseconds — both from the UNIX epoch.
day_type = pa.date32()
ms_day_type = pa.date64()

day_values = pa.array(
    [date(2024, 1, 1), date(2024, 12, 31)],
    type=day_type,
)

# Time-of-day types at each supported resolution (counted from midnight).
t_seconds = pa.time32('s')
t_millis = pa.time32('ms')
t_micros = pa.time64('us')
t_nanos = pa.time64('ns')

# Timestamps (instants) at each resolution.
ts_seconds = pa.timestamp('s')
ts_millis = pa.timestamp('ms')
ts_micros = pa.timestamp('us')
ts_nanos = pa.timestamp('ns')

# A timestamp type may carry an IANA timezone name.
ny_type = pa.timestamp('ms', tz='America/New_York')
instants = pa.array(
    [datetime(2024, 1, 1, 12, 0, 0), datetime(2024, 6, 15, 18, 30, 0)],
    type=ny_type,
)

# Elapsed-time (duration) values.
elapsed_type = pa.duration('ms')
elapsed = pa.array(
    [timedelta(days=1), timedelta(hours=2, minutes=30)],
    type=elapsed_type,
)

# Calendar interval combining months, days, and nanoseconds.
calendar_interval_type = pa.month_day_nano_interval()

print(f"Timestamp type: {instants.type}")
print(f"Timezone: {instants.type.tz}")
print(f"Unit: {instants.type.unit}")
When working with timestamps, always specify the unit and timezone explicitly to avoid ambiguity. Arrow stores timestamps as integers since the UNIX epoch.

Decimal Types

Arrow supports high-precision decimal numbers with configurable precision and scale.
import pyarrow as pa
from decimal import Decimal

# decimal128 stores up to 38 significant digits; `precision` is the
# total digit count, `scale` the digits after the decimal point.
money_type = pa.decimal128(precision=10, scale=2)

money = pa.array(
    [Decimal('123.45'), Decimal('678.90'), Decimal('1000.00')],
    type=money_type,
)

# decimal256 extends the limit to 76 digits.
wide_type = pa.decimal256(precision=38, scale=6)

wide_values = pa.array(
    [Decimal('123456789.123456'), Decimal('987654321.654321')],
    type=wide_type,
)

print(f"Type: {money.type}")
print(f"Precision: {money.type.precision}")
print(f"Scale: {money.type.scale}")

Nested Types

Arrow supports complex nested data structures including lists, structs, and maps.
import pyarrow as pa

# Variable-length lists of int32 values.
int_list_type = pa.list_(pa.int32())
ragged = pa.array(
    [[1, 2, 3], [4, 5], [6, 7, 8, 9]],
    type=int_list_type,
)

# Same layout with 64-bit offsets, for very large columns.
big_list_type = pa.large_list(pa.string())

# Supplying a size to pa.list_ makes every list exactly that long.
triple_type = pa.list_(pa.float64(), 3)
triples = pa.array(
    [[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]],
    type=triple_type,
)

# A struct bundles named, typed fields into one value per row.
person_type = pa.struct([
    ('name', pa.string()),
    ('age', pa.int32()),
    ('score', pa.float64()),
])

people = pa.array(
    [
        {'name': 'Alice', 'age': 30, 'score': 95.5},
        {'name': 'Bob', 'age': 25, 'score': 87.2},
    ],
    type=person_type,
)

# A map holds per-row sequences of key/value pairs.
counts_type = pa.map_(pa.string(), pa.int32())
counts = pa.array(
    [
        [('a', 1), ('b', 2)],
        [('x', 10), ('y', 20), ('z', 30)],
    ],
    type=counts_type,
)

# Nested types compose freely: lists and maps inside a struct.
record_type = pa.struct([
    ('id', pa.int32()),
    ('tags', pa.list_(pa.string())),
    ('metrics', pa.map_(pa.string(), pa.float64())),
])

print(f"List type: {ragged.type}")
print(f"Struct fields: {people.type}")

Dictionary Types

Dictionary encoding is efficient for data with many repeated values.
import pyarrow as pa

# Encode repeated values as small indices into a dictionary of uniques.
colors = ['red', 'green', 'blue', 'red', 'green', 'red']
encoded = pa.array(colors).dictionary_encode()

print(f"Type: {encoded.type}")
print(f"Indices: {encoded.indices}")
print(f"Dictionary: {encoded.dictionary}")

# The index width and value type can also be chosen up front.
color_dict_type = pa.dictionary(pa.int8(), pa.string())
explicit = pa.array(colors, type=color_dict_type)

# Dictionary-typed arrays accept null entries like any other array.
sparse = pa.array(['a', 'b', None, 'a'], type=color_dict_type)
Dictionary encoding can significantly reduce memory usage and improve performance for categorical data. The dictionary is shared across all chunks in a ChunkedArray.

Type Checking and Conversion

import pyarrow as pa
import pyarrow.compute as pc

values = pa.array([1, 2, 3])

# Predicates in pa.types classify a DataType.
print(pa.types.is_integer(values.type))
print(pa.types.is_floating(values.type))
print(pa.types.is_string(values.type))
print(pa.types.is_temporal(pa.timestamp('ms')))

# pc.cast converts an array to a different type.
ints = pa.array([1, 2, 3], type=pa.int32())
as_floats = pc.cast(ints, pa.float64())

digits = pa.array(['1', '2', '3'])
parsed = pc.cast(digits, pa.int32())

# With safe=True an impossible cast raises ArrowInvalid rather than
# silently producing garbage.
try:
    result = pc.cast(pa.array(['abc']), pa.int32(), safe=True)
except pa.ArrowInvalid as e:
    print(f"Cast failed: {e}")

Build docs developers (and LLMs) love