Skip to main content

Schema

A Schema defines the structure of tabular data with named fields and types.
import pyarrow as pa

schema = pa.schema([
    ('name', pa.string()),
    ('age', pa.int64()),
    ('income', pa.float64())
])
print(schema)

schema()

Create a Schema from field definitions.
pa.schema(fields, metadata=None)
fields
list
List of Field objects or (name, type) tuples.
metadata
dict
default:"None"
Optional metadata as key-value pairs.
schema
pyarrow.Schema
A Schema object.

Properties

names

List of field names in the schema.
schema = pa.schema([('a', pa.int64()), ('b', pa.string())])
print(schema.names)  # ['a', 'b']
names
list of str
The field names.

types

List of field types in the schema.
schema = pa.schema([('a', pa.int64()), ('b', pa.string())])
print(schema.types)  # [DataType(int64), DataType(string)]
types
list of DataType
The field types.

metadata

Custom metadata attached to the schema.
schema = pa.schema([('a', pa.int64())], metadata={'key': 'value'})
print(schema.metadata)  # {b'key': b'value'}
metadata
dict
The schema metadata as a dict with bytes keys and bytes values, or None if no metadata is set.

Methods

field()

Select a field by name or index.
schema.field(i)
i
int or str
Field index or name.
field
pyarrow.Field
The selected field.

append()

Add a field to the schema.
schema.append(field)
field
pyarrow.Field
Field to append.
schema
pyarrow.Schema
A new schema with the appended field.

insert()

Insert a field at a specific position.
schema.insert(i, field)
i
int
Position to insert the field.
field
pyarrow.Field
Field to insert.
schema
pyarrow.Schema
A new schema with the inserted field.

remove()

Remove a field by index.
schema.remove(i)
i
int
Index of field to remove.
schema
pyarrow.Schema
A new schema without the removed field.

equals()

Check if two schemas are equal.
schema.equals(other, check_metadata=False)
other
pyarrow.Schema
Schema to compare with.
check_metadata
bool
default:"False"
Whether to compare metadata.
equal
bool
True if schemas are equal.

Field

A Field represents a named column with a data type.
import pyarrow as pa

field = pa.field('age', pa.int64(), nullable=True)
print(field)

field()

Create a Field.
pa.field(name, type, nullable=True, metadata=None)
name
str
Name of the field.
type
pyarrow.DataType
Data type of the field.
nullable
bool
default:"True"
Whether the field can contain null values.
metadata
dict
default:"None"
Optional metadata.
field
pyarrow.Field
A Field object.

Properties

name

The field name.
field = pa.field('age', pa.int64())
print(field.name)  # 'age'
name
str
The field name.

type

The field data type.
field = pa.field('age', pa.int64())
print(field.type)  # int64
type
pyarrow.DataType
The field’s data type.

nullable

Whether the field can contain nulls.
field = pa.field('age', pa.int64(), nullable=False)
print(field.nullable)  # False
nullable
bool
True if the field can be null.

DataType

Base class for all Arrow data types.

Primitive Types

Numeric Types

import pyarrow as pa

# Integer types
pa.int8()    # 8-bit signed integer
pa.int16()   # 16-bit signed integer
pa.int32()   # 32-bit signed integer
pa.int64()   # 64-bit signed integer

pa.uint8()   # 8-bit unsigned integer
pa.uint16()  # 16-bit unsigned integer
pa.uint32()  # 32-bit unsigned integer
pa.uint64()  # 64-bit unsigned integer

# Floating point types
pa.float16()  # 16-bit float (half precision)
pa.float32()  # 32-bit float
pa.float64()  # 64-bit float (double precision)

Boolean Type

pa.bool_()  # Boolean (true/false)

String and Binary Types

pa.string()        # Variable-length UTF-8 string
pa.utf8()          # Alias for string()
pa.binary()        # Variable-length binary
pa.large_string()  # Large (64-bit offset) string
pa.large_binary()  # Large (64-bit offset) binary

Null Type

pa.null()  # Null type (only contains nulls)

Temporal Types

Date Types

pa.date32()  # Days since UNIX epoch (32-bit)
pa.date64()  # Milliseconds since UNIX epoch (64-bit)

Time Types

pa.time32('s')   # Time with second unit
pa.time32('ms')  # Time with millisecond unit
pa.time64('us')  # Time with microsecond unit
pa.time64('ns')  # Time with nanosecond unit

Timestamp Types

pa.timestamp('s')    # Timestamp with second unit
pa.timestamp('ms')   # Timestamp with millisecond unit
pa.timestamp('us')   # Timestamp with microsecond unit
pa.timestamp('ns')   # Timestamp with nanosecond unit

# With timezone
pa.timestamp('ns', tz='America/New_York')
pa.timestamp('ms', tz='UTC')
unit
str
Time unit: 's', 'ms', 'us', or 'ns'.
tz
str
default:"None"
Timezone name (IANA timezone database).
type
pyarrow.TimestampType
A timestamp type.

Duration Types

pa.duration('s')   # Duration in seconds
pa.duration('ms')  # Duration in milliseconds
pa.duration('us')  # Duration in microseconds
pa.duration('ns')  # Duration in nanoseconds

Nested Types

List Types

# Variable-length list
pa.list_(pa.int64())  # List of int64

# Fixed-size list
# Passing list_size as the second argument to pa.list_() makes the list fixed-size
pa.list_(pa.float32(), 3)  # List of exactly 3 float32 values

# Large list (64-bit offsets)
pa.large_list(pa.string())
value_type
pyarrow.DataType
Type of list elements.
list_size
int
default:"-1"
Fixed size for list (only for fixed-size lists).
type
pyarrow.ListType (or pyarrow.FixedSizeListType when list_size is given)
A list type.

Struct Types

pa.struct([
    ('x', pa.int64()),
    ('y', pa.float64()),
    ('name', pa.string())
])

# Or using Field objects
pa.struct([
    pa.field('x', pa.int64(), nullable=False),
    pa.field('y', pa.float64())
])
fields
list
List of Field objects or (name, type) tuples.
type
pyarrow.StructType
A struct type.

Map Types

pa.map_(pa.string(), pa.int64())  # Map from string to int64
key_type
pyarrow.DataType
Type of map keys.
item_type
pyarrow.DataType
Type of map values.
type
pyarrow.MapType
A map type.

Dictionary Type

pa.dictionary(pa.int32(), pa.string())  # Dictionary encoded strings
index_type
pyarrow.DataType
Type for dictionary indices.
value_type
pyarrow.DataType
Type of dictionary values.
type
pyarrow.DictionaryType
A dictionary type.

Decimal Types

pa.decimal128(precision=10, scale=2)  # 128-bit decimal
pa.decimal256(precision=38, scale=10) # 256-bit decimal
precision
int
Total number of digits.
scale
int
Number of digits after decimal point.
type
pyarrow.Decimal128Type or pyarrow.Decimal256Type
A decimal type.

Utility Functions

unify_schemas()

Unify multiple schemas into one.
pa.unify_schemas(schemas)
schemas
list of Schema
Schemas to unify.
schema
pyarrow.Schema
Unified schema with all fields.

infer_type()

Infer Arrow type from Python values.
pa.infer_type([1, 2, 3])       # int64
pa.infer_type(['a', 'b', 'c']) # string
values
list
Python values to infer type from.
type
pyarrow.DataType
The inferred data type.

from_numpy_dtype()

Convert NumPy dtype to Arrow type.
import numpy as np
import pyarrow as pa

pa.from_numpy_dtype(np.dtype('int32'))  # int32
pa.from_numpy_dtype(np.dtype('float64')) # float64
dtype
numpy.dtype
NumPy data type.
type
pyarrow.DataType
Corresponding Arrow type.

Build docs developers (and LLMs) love