Skip to main content

Structured Arrays

Structured arrays are ndarrays with a composite datatype consisting of multiple named fields. They allow you to work with heterogeneous data (different data types) in a single array, similar to structs in C or database tables.

Introduction

A structured array has a datatype composed of simpler datatypes organized as named fields.
import numpy as np

# One record = a name of up to 10 characters, a 32-bit int, a 32-bit float
person_fields = [('name', 'U10'), ('age', 'i4'), ('weight', 'f4')]

# Each tuple is one record; its items fill the fields in declaration order
people = [
    ('Alice', 25, 65.5),
    ('Bob', 30, 75.0),
    ('Charlie', 35, 68.2),
]
data = np.array(people, dtype=person_fields)

print(data)
# [('Alice', 25, 65.5) ('Bob', 30, 75. ) ('Charlie', 35, 68.2)]

print(data.dtype)
# [('name', '<U10'), ('age', '<i4'), ('weight', '<f4')]

Accessing Fields

# Indexing with a field name selects that column across all records
names = data['name']
print(names)
# ['Alice' 'Bob' 'Charlie']

ages = data['age']
print(ages)
# [25 30 35]

# Indexing with an integer selects one whole record
first_record = data[0]
print(first_record)
# ('Alice', 25, 65.5)

# Combine both to read one field of one record
second_record = data[1]
print(second_record['name'])
# Bob

# Assigning to a field name overwrites that column for every record
data['age'] = data['age'] + 1
print(data['age'])
# [26 31 36]

Creating Structured Datatypes

There are multiple ways to define structured dtypes.

Method 1: List of Tuples

import numpy as np

# Simplest form: every field is a (field_name, data_type) pair
xy_fields = [('x', 'f4'), ('y', 'f4')]
dt = np.dtype(xy_fields)

# A third tuple item gives the field a subarray shape:
# (field_name, data_type, shape)
dt = np.dtype([(field, 'f4', (3,)) for field in ('position', 'velocity')])

# Allocate five zero-filled records of that dtype
particles = np.zeros(5, dtype=dt)
print(particles.dtype)
# [('position', '<f4', (3,)), ('velocity', '<f4', (3,))]

# Fill the 3-vector of the first record
particles['position'][0] = [1.0, 2.0, 3.0]
print(particles[0])
# ([1., 2., 3.], [0., 0., 0.])

Method 2: Comma-Separated String

import numpy as np

# A comma-separated format string is shorthand for a structured dtype;
# the fields receive the automatic names f0, f1, f2, ...
basic_spec = 'i4, f4, S10'
dt = np.dtype(basic_spec)
print(dt)
# [('f0', '<i4'), ('f1', '<f4'), ('f2', 'S10')]

# A leading count or shape turns a field into a subarray
subarray_spec = '3i4, f4, (2,3)f8'
dt = np.dtype(subarray_spec)
print(dt)
# [('f0', '<i4', (3,)), ('f1', '<f4'), ('f2', '<f8', (2, 3))]

Method 3: Dictionary with Names and Formats

import numpy as np

# Dictionary form: parallel lists of field names and field formats
field_names = ['id', 'name', 'score']
field_formats = ['i4', 'U20', 'f8']
dt = np.dtype({'names': field_names, 'formats': field_formats})

# The same form also accepts explicit byte offsets and a total itemsize
padded_layout = {
    'names': ['x', 'y', 'z'],
    'formats': ['f4', 'f4', 'f4'],
    'offsets': [0, 4, 8],
    'itemsize': 16,  # 12 bytes of fields + 4 bytes of extra padding
}
dt = np.dtype(padded_layout)

print(dt)
# {'names':['x','y','z'], 'formats':['<f4','<f4','<f4'], 'offsets':[0,4,8], 'itemsize':16}

Method 4: Dictionary of Field Names

import numpy as np

# Each key is a field name; each value is a (type, byte_offset) tuple
layout = {
    'x': ('i4', 0),
    'y': ('f4', 4),
    'z': ('f8', 8),
}
dt = np.dtype(layout)

# Note: Field order preserved in Python 3.6+

Field Titles

Fields can have both a name and a title (alternate name).
import numpy as np

# Using tuple (title, name)
dt = np.dtype([(('Full Name', 'name'), 'U20'), 
               (('Age in Years', 'age'), 'i4')])

data = np.array([('Alice', 25), ('Bob', 30)], dtype=dt)

# Access by name or title
print(data['name'])
# ['Alice' 'Bob']

print(data['Full Name'])
# ['Alice' 'Bob']

# dt.names lists only the field names, never the titles
print(dt.names)
# ('name', 'age')

# dt.fields has one entry per name AND one per title. Because these fields
# carry titles, every value is a 3-tuple (dtype, offset, title):
print(dt.fields)
#   'name' and 'Full Name'   -> (dtype('<U20'), 0, 'Full Name')
#   'age' and 'Age in Years' -> (dtype('int32'), 80, 'Age in Years')

Alignment and Padding

Structured dtypes can be aligned to match C struct layouts.

Without Alignment

import numpy as np

# Packed layout: each field starts exactly where the previous one ends
packed_fields = [('a', 'u1'), ('b', 'u1'), ('c', 'i4'), ('d', 'u1'), ('e', 'i8')]
dt = np.dtype(packed_fields)
print(f"Itemsize: {dt.itemsize}")  # 15 bytes

# The second item of each dt.fields entry is the field's byte offset
offsets = [dt.fields[name][1] for name in dt.names]
print(f"Offsets: {offsets}")
# [0, 1, 2, 6, 7]

With Alignment

import numpy as np

# align=True pads the fields the way a C compiler would lay out the struct
aligned_fields = [('a', 'u1'), ('b', 'u1'), ('c', 'i4'), ('d', 'u1'), ('e', 'i8')]
dt = np.dtype(aligned_fields, align=True)
print(f"Itemsize: {dt.itemsize}")  # 24 bytes (with padding)

# The second item of each dt.fields entry is the field's byte offset
offsets = [dt.fields[name][1] for name in dt.names]
print(f"Offsets: {offsets}")
# [0, 1, 4, 8, 16]

# The dtype remembers that it was built with alignment
print(f"Is aligned: {dt.isalignedstruct}")  # True

Nested Structures

Structured dtypes can be nested for hierarchical data.
import numpy as np

# Inner dtype: a postal address
address_fields = [('street', 'U30'), ('city', 'U20'), ('zip', 'U10')]
address_dt = np.dtype(address_fields)

# Outer dtype: uses the address dtype as the type of one of its fields
person_dt = np.dtype([('name', 'U20'), ('age', 'i4'), ('address', address_dt)])

# A nested field is initialised from a nested tuple
alice = ('Alice', 25, ('123 Main St', 'Boston', '02101'))
bob = ('Bob', 30, ('456 Elm St', 'Cambridge', '02139'))
people = np.array([alice, bob], dtype=person_dt)

print(people['name'])
# ['Alice' 'Bob']

# Chain field names to reach into the nested structure
cities = people['address']['city']
print(cities)
# ['Boston' 'Cambridge']

first_person = people[0]
print(first_person['address'])
# ('123 Main St', 'Boston', '02101')

Indexing Structured Arrays

Single Field Access

import numpy as np

record_dtype = [('id', 'i4'), ('value', 'f4'), ('label', 'U1')]
rows = [
    (1, 2.5, 'A'),
    (2, 3.7, 'B'),
    (3, 1.2, 'C'),
]
data = np.array(rows, dtype=record_dtype)

# Indexing one field yields a view, so writing through it changes `data`
values = data['value']
values[0] = 99.0
print(data['value'])
# [99.   3.7  1.2]

Multiple Field Access

import numpy as np

data = np.zeros(3, dtype=[('a', 'i4'), ('b', 'f4'), ('c', 'i4')])

# Indexing with a list of field names returns a view (NumPy >= 1.16) that
# keeps the original offsets and itemsize, so the omitted field 'b' remains
# as a 4-byte gap and the dtype prints in its dictionary form
subset = data[['a', 'c']]
print(subset.dtype)
# {'names': ['a', 'c'], 'formats': ['<i4', '<i4'], 'offsets': [0, 8], 'itemsize': 12}
# (exact spacing of the printed dtype depends on the NumPy version)

# Assignment to multiple fields: the tuple is matched to the listed fields
# by position and broadcast to every record
data[['a', 'c']] = (10, 20)
print(data)
# [(10, 0., 20) (10, 0., 20) (10, 0., 20)]

Boolean Indexing

import numpy as np

person_fields = [('name', 'U10'), ('age', 'i4'), ('weight', 'f4')]
rows = [
    ('Alice', 25, 65.5),
    ('Bob', 30, 75.0),
    ('Charlie', 35, 68.2),
    ('David', 28, 72.3),
]
data = np.array(rows, dtype=person_fields)

# A comparison on one field gives a boolean mask that selects whole records
is_young = data['age'] < 30
young = data[is_young]
print(young)
# [('Alice', 25, 65.5) ('David', 28, 72.3)]

# Masks from different fields can be combined element-wise
is_old = data['age'] > 27
is_heavy = data['weight'] > 70
heavy_and_old = data[is_old & is_heavy]
print(heavy_and_old)
# [('Bob', 30, 75.) ('David', 28, 72.3)]

Record Arrays

Record arrays (np.recarray) are structured arrays with field access via attributes.
import numpy as np

# Build a record array from a list of row tuples
rows = [
    (1, 'Alice', 25),
    (2, 'Bob', 30),
    (3, 'Charlie', 35),
]
recdata = np.rec.array(rows, dtype=[('id', 'i4'), ('name', 'U10'), ('age', 'i4')])

# Fields are exposed as attributes ...
print(recdata.name)
# ['Alice' 'Bob' 'Charlie']

print(recdata.age)
# [25 30 35]

# ... while name-based indexing keeps working
print(recdata['name'])
# ['Alice' 'Bob' 'Charlie']

# A single record offers the same attribute access
first_record = recdata[0]
print(first_record.name)
# Alice

print(first_record.age)
# 25

Converting to Record Array

import numpy as np

# Start from an ordinary structured array
point_dtype = np.dtype([('x', 'i4'), ('y', 'f4')])
structured = np.array([(1, 2.5), (2, 3.7)], dtype=point_dtype)

# Reinterpret the same array as a recarray
rec = structured.view(np.recarray)

# The fields are now reachable as attributes
print(rec.x)
# [1 2]

print(rec.y)
# [2.5 3.7]

Record Arrays vs Structured Arrays

Record arrays allow field access via attributes (e.g., arr.field) but are slightly slower due to attribute lookup overhead. Structured arrays require indexing (e.g., arr['field']) but are faster. Use record arrays for convenience, structured arrays for performance.

Practical Examples

Example 1: CSV-like Data

import numpy as np

# One row per sale, with a column layout like a CSV file
sale_fields = [
    ('date', 'U10'),
    ('product', 'U20'),
    ('quantity', 'i4'),
    ('price', 'f4'),
]
dt = np.dtype(sale_fields)

rows = [
    ('2024-01-01', 'Widget', 10, 29.99),
    ('2024-01-02', 'Gadget', 5, 49.99),
    ('2024-01-03', 'Widget', 15, 29.99),
    ('2024-01-04', 'Doohickey', 8, 19.99),
]
sales = np.array(rows, dtype=dt)

# Revenue per row is quantity times unit price; the sum is the grand total
sales_total = np.multiply(sales['quantity'], sales['price'])
print(f"Total sales: ${sales_total.sum():.2f}")

# Keep only the rows whose product is 'Widget'
is_widget = sales['product'] == 'Widget'
widget_sales = sales[is_widget]
print(f"Widget quantity: {widget_sales['quantity'].sum()}")

Example 2: Scientific Data

import numpy as np

# Record layout for one particle of a physics experiment
particle_fields = [
    ('event_id', 'i8'),
    ('position', 'f8', (3,)),  # x, y, z
    ('momentum', 'f8', (3,)),  # px, py, pz
    ('energy', 'f8'),
    ('charge', 'i1'),
]
dt = np.dtype(particle_fields)

n_particles = 1000
particles = np.zeros(n_particles, dtype=dt)

# Populate every field with simulated values
particles['event_id'] = np.arange(n_particles)
particles['position'] = np.random.randn(n_particles, 3)
particles['momentum'] = np.random.randn(n_particles, 3)
particles['energy'] = np.abs(np.random.randn(n_particles)) * 100
particles['charge'] = np.random.choice([-1, 0, 1], size=n_particles)

# Keep only the particles that carry a charge
is_charged = particles['charge'] != 0
charged = particles[is_charged]
print(f"Charged particles: {len(charged)}")

# Length of each momentum vector: root of the summed squared components
squared_components = charged['momentum'] ** 2
momentum_mag = np.sqrt(squared_components.sum(axis=1))
print(f"Average momentum: {momentum_mag.mean():.2f}")

Example 3: C Struct Interfacing

import numpy as np
import struct

# Define dtype matching C struct
# struct Packet {
#     uint32_t id;
#     float x, y, z;
#     uint8_t flags;
# };
#
# align=True makes NumPy pad the record the way a C compiler pads the struct,
# so Packet.itemsize includes the trailing padding after `flags`.
Packet = np.dtype([
    ('id', 'u4'),
    ('x', 'f4'),
    ('y', 'f4'),
    ('z', 'f4'),
    ('flags', 'u1')
], align=True)

# Build the bytes of one record, as a C program would have written them.
#   'B' (unsigned char) matches uint8_t; the signed code 'b' rejects 255.
#   struct.pack adds no padding after the last field, but np.frombuffer needs
#   the buffer length to be a multiple of Packet.itemsize, so the trailing pad
#   bytes are appended explicitly with the 'x' code.
fields_format = 'IfffB'
padding = Packet.itemsize - struct.calcsize(fields_format)
binary_data = struct.pack(f'{fields_format}{padding}x', 1, 1.5, 2.5, 3.5, 255)

# Reinterpret the raw bytes as an array of Packet records (no copy is made)
packet = np.frombuffer(binary_data, dtype=Packet)

print(f"ID: {packet['id'][0]}")
print(f"Position: ({packet['x'][0]}, {packet['y'][0]}, {packet['z'][0]})")
print(f"Flags: {packet['flags'][0]}")

Sorting Structured Arrays

import numpy as np

person_fields = [('name', 'U10'), ('age', 'i4'), ('weight', 'f4')]
rows = [
    ('Charlie', 35, 68.2),
    ('Alice', 25, 65.5),
    ('David', 28, 72.3),
    ('Bob', 30, 75.0),
]
data = np.array(rows, dtype=person_fields)

# Sort a copy by one field; `data` itself is left untouched
sorted_by_age = data.copy()
sorted_by_age.sort(order='age')
print(sorted_by_age['name'])
# ['Alice' 'David' 'Bob' 'Charlie']

# Several fields: later fields break ties in earlier ones
sorted_multi = data.copy()
sorted_multi.sort(order=['age', 'name'])

# argsort returns the ordering as indices, which can then reorder `data`
indices = data.argsort(order='weight')
sorted_by_weight = data[indices]
print(sorted_by_weight['name'])
# ['Alice' 'Charlie' 'David' 'Bob']

Limitations and Considerations

Performance Trade-offs

Structured arrays can be slower than separate arrays for numerical operations due to memory layout. For intensive computations, consider using separate arrays and combining results.
import numpy as np
import time

n_points = 1_000_000

# Structured array: x and y are interleaved in memory, record by record
struct_data = np.zeros(n_points, dtype=[('x', 'f8'), ('y', 'f8')])
struct_data['x'] = np.random.rand(n_points)
struct_data['y'] = np.random.rand(n_points)

# time.perf_counter() is the high-resolution clock meant for measuring short
# durations; time.time() can tick so coarsely that a millisecond-scale
# operation is reported as taking 0.0 seconds.
start = time.perf_counter()
result = struct_data['x'] + struct_data['y']
struct_time = time.perf_counter() - start

# Separate arrays: each column is one contiguous block of memory
x = np.random.rand(n_points)
y = np.random.rand(n_points)

start = time.perf_counter()
result = x + y
separate_time = time.perf_counter() - start

print(f"Structured: {struct_time:.4f}s")
print(f"Separate: {separate_time:.4f}s")
# Guard the ratio: a zero measurement would raise ZeroDivisionError
if separate_time > 0:
    print(f"Separate is {struct_time/separate_time:.2f}x faster")
else:
    print("Separate-array run was too fast to measure")

Helper Functions

import numpy as np
from numpy.lib import recfunctions as rfn

# Original arrays
x = np.array([1, 2, 3])
y = np.array([4.0, 5.0, 6.0])
z = np.array(['a', 'b', 'c'])

# Combine into structured array.
# column_stack promotes x and y to a single float64 matrix, so the target
# dtype is given explicitly to turn the first column back into integers
# (passing only names=... would make both fields '<f8').
structured = rfn.unstructured_to_structured(
    np.column_stack([x, y]),
    dtype=np.dtype([('x', 'i8'), ('y', 'f8')])
)
print(structured.dtype)
# [('x', '<i8'), ('y', '<f8')]

# Add field (usemask=False returns a plain ndarray, not a masked array)
structured = rfn.append_fields(structured, 'z', z, usemask=False)
print(structured.dtype.names)
# ('x', 'y', 'z')

# Drop field (drop_fields also defaults to usemask=True, so ask for a plain
# ndarray here as well)
structured = rfn.drop_fields(structured, 'y', usemask=False)
print(structured.dtype.names)
# ('x', 'z')

# Rename field
structured = rfn.rename_fields(structured, {'x': 'id'})
print(structured.dtype.names)
# ('id', 'z')

Summary

Heterogeneous Data

Store multiple data types in a single array with named fields

Field Access

Access data by field name, like database columns or struct members

C Interop

Match C struct layouts for binary data interfacing

Record Arrays

Convenient attribute access for structured arrays

See Also

Build docs developers (and LLMs) love