Skip to main content

Overview

The compute module provides functions for performing computations on Arrow data structures. These functions support both scalar operations and aggregations.
import pyarrow as pa
import pyarrow.compute as pc

arr = pa.array([1, 2, 3, 4, 5])
result = pc.sum(arr)
print(result)  # 15

Arithmetic Functions

add()

Add two arrays or scalars element-wise.
pc.add(x, y, memory_pool=None)
x
Array-like or scalar-like
First argument.
y
Array-like or scalar-like
Second argument.
memory_pool
pyarrow.MemoryPool
default:"None"
Memory pool for allocation.
result
Array or Scalar
Element-wise sum of x and y.

subtract()

Subtract two arrays or scalars element-wise.
pc.subtract(x, y, memory_pool=None)

multiply()

Multiply two arrays or scalars element-wise.
pc.multiply(x, y, memory_pool=None)

divide()

Divide two arrays or scalars element-wise.
pc.divide(x, y, memory_pool=None)

power()

Raise each base value to the corresponding exponent, element-wise.
pc.power(base, exponent, memory_pool=None)
base
Array-like or scalar-like
Base value.
exponent
Array-like or scalar-like
Exponent value.
result
Array or Scalar
Element-wise power computation.

negate()

Negate values element-wise.
arr = pa.array([1, -2, 3])
result = pc.negate(arr)
print(result)  # [-1, 2, -3]

abs()

Compute absolute value element-wise.
arr = pa.array([-1, -2, 3])
result = pc.abs(arr)
print(result)  # [1, 2, 3]

Aggregation Functions

sum()

Compute sum of array values.
pc.sum(array, skip_nulls=True, min_count=1, memory_pool=None)
array
Array-like
Array to aggregate.
skip_nulls
bool
default:"True"
Whether to skip (ignore) null values. If False, any null in the input makes the result null.
min_count
int
default:"1"
Minimum number of non-null values required. If fewer non-null values are present, the result is null.
result
Scalar
Sum of array values.

mean()

Compute arithmetic mean.
arr = pa.array([1, 2, 3, 4, 5])
result = pc.mean(arr)
print(result)  # 3.0

min()

Find minimum value.
pc.min(array, skip_nulls=True, min_count=1, memory_pool=None)

max()

Find maximum value.
pc.max(array, skip_nulls=True, min_count=1, memory_pool=None)

count()

Count values in an array. By default only non-null values are counted; use mode to count nulls or all values instead.
pc.count(array, mode='only_valid', memory_pool=None)
array
Array-like
Array to count.
mode
str
default:"'only_valid'"
Count mode: 'only_valid', 'only_null', or 'all'.
result
Scalar
Count of values.

variance()

Compute variance.
pc.variance(array, ddof=0, skip_nulls=True, min_count=0, memory_pool=None)
array
Array-like
Array to compute variance for.
ddof
int
default:"0"
Delta degrees of freedom (0 for population variance, 1 for sample variance).
result
Scalar
Variance of array values.

stddev()

Compute standard deviation.
pc.stddev(array, ddof=0, skip_nulls=True, min_count=0, memory_pool=None)

String Functions

utf8_length()

Compute string length.
arr = pa.array(['hello', 'world', 'arrow'])
result = pc.utf8_length(arr)
print(result)  # [5, 5, 5]

utf8_upper()

Convert strings to uppercase.
arr = pa.array(['hello', 'world'])
result = pc.utf8_upper(arr)
print(result)  # ['HELLO', 'WORLD']

utf8_lower()

Convert strings to lowercase.
arr = pa.array(['HELLO', 'WORLD'])
result = pc.utf8_lower(arr)
print(result)  # ['hello', 'world']

match_substring()

Test, for each string, whether it contains the given literal substring.
pc.match_substring(strings, pattern, ignore_case=False, memory_pool=None)
strings
Array-like
String array to search in.
pattern
str
Substring pattern to match.
ignore_case
bool
default:"False"
Whether to ignore case.
result
BooleanArray
Boolean array indicating matches.

replace_substring()

Replace substring occurrences.
pc.replace_substring(strings, pattern, replacement, max_replacements=None, memory_pool=None)
strings
Array-like
String array.
pattern
str
Substring to replace.
replacement
str
Replacement string.
max_replacements
int
default:"None"
Maximum number of replacements per string. If None (the default), all occurrences are replaced.
result
StringArray
Array with replacements made.

Filter and Selection

filter()

Filter values based on a boolean mask.
pc.filter(data, mask, null_selection_behavior='drop', memory_pool=None)
data
Array, ChunkedArray, RecordBatch, or Table
Data to filter.
mask
Array or Expression
Boolean mask for filtering.
null_selection_behavior
str
default:"'drop'"
How to handle null entries in the mask: 'drop' omits the corresponding values from the output, 'emit_null' emits a null in their place.
result
Same as input type
Filtered data.

take()

Select values by indices.
pc.take(data, indices, boundscheck=True, memory_pool=None)
data
Array, ChunkedArray, RecordBatch, or Table
Data to select from.
indices
Array, ChunkedArray
Indices of the values to select. Must be of an integer type.
boundscheck
bool
default:"True"
Whether to check that indices are within bounds. If False and an index is out of bounds, behavior is undefined (the process may crash).
result
Same as input type
Selected values.

index()

Find index of first occurrence of a value.
pc.index(data, value, start=None, end=None, memory_pool=None)
data
Array-like
Array to search in.
value
Scalar-like
Value to search for.
start
int
default:"None"
Start index for search.
end
int
default:"None"
End index for search.
index
int
Index of first occurrence, or -1 if not found.

Sorting and Ranking

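array_sort_indices()

Return the indices that would sort an array. For tables, record batches, or multiple sort keys, use pc.sort_indices(input, sort_keys=...) instead.
pc.array_sort_indices(array, order='ascending', null_placement='at_end', memory_pool=None)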
array
Array-like
Array to get sort indices for.
order
str
default:"'ascending'"
Sort order: 'ascending' or 'descending'.
null_placement
str
default:"'at_end'"
Where to place nulls: 'at_start' or 'at_end'.
indices
UInt64Array
Indices that would sort the array.

top_k_unstable()

Select indices of top k elements.
pc.top_k_unstable(values, k, sort_keys=None, memory_pool=None)
values
Array, ChunkedArray, RecordBatch, or Table
Data to sort and get top indices from.
k
int
Number of top elements to select.
sort_keys
list
default:"None"
Column key names to order by (for table-like data).
indices
UInt64Array
Indices of top k elements.

bottom_k_unstable()

Select indices of bottom k elements.
pc.bottom_k_unstable(values, k, sort_keys=None, memory_pool=None)

Type Conversions

cast()

Cast array values to another data type.
pc.cast(arr, target_type=None, safe=True, options=None, memory_pool=None)
arr
Array-like
Array to cast.
target_type
DataType or str
Type to cast to.
safe
bool
default:"True"
Check for overflows or unsafe conversions.
options
CastOptions
default:"None"
Additional casting options.
result
Array
Array with values cast to target type.

Null Handling

is_null()

Check which values are null.
arr = pa.array([1, None, 3])
result = pc.is_null(arr)
print(result)  # [False, True, False]
result
BooleanArray
Boolean array indicating null values.

is_valid()

Check which values are not null.
arr = pa.array([1, None, 3])
result = pc.is_valid(arr)
print(result)  # [True, False, True]

fill_null()

Replace null values.
pc.fill_null(values, fill_value)
values
Array, ChunkedArray, or Scalar-like
Values to fill nulls in.
fill_value
Array, ChunkedArray, or Scalar-like
Value to replace nulls with.
result
Same as input type
Array with nulls replaced.

Comparison Functions

equal()

Element-wise equality.
pc.equal(x, y, memory_pool=None)
result
BooleanArray
Boolean array of equality results.

not_equal()

Element-wise inequality.
pc.not_equal(x, y, memory_pool=None)

greater()

Element-wise greater than.
pc.greater(x, y, memory_pool=None)

greater_equal()

Element-wise greater than or equal.
pc.greater_equal(x, y, memory_pool=None)

less()

Element-wise less than.
pc.less(x, y, memory_pool=None)

less_equal()

Element-wise less than or equal.
pc.less_equal(x, y, memory_pool=None)

Logical Functions

and_()

Element-wise logical AND.
pc.and_(x, y, memory_pool=None)

or_()

Element-wise logical OR.
pc.or_(x, y, memory_pool=None)

invert()

Element-wise logical NOT.
arr = pa.array([True, False, True])
result = pc.invert(arr)
print(result)  # [False, True, False]

Expression API

The Expression API provides a way to build complex compute expressions.

field()

Reference a field in a dataset.
pc.field('column_name')
pc.field(('nested', 'field'))
pc.field(0)  # By index
name_or_index
str, tuple, or int
Field name, nested field tuple, or column index.
expr
Expression
Field reference expression.

scalar()

Create a scalar expression.
expr = pc.scalar(42)
expr = pc.scalar('hello')
value
bool, int, float, or str
Python value to convert to expression.
expr
Expression
Scalar value expression.

Example: Complex Expressions

import pyarrow as pa
import pyarrow.compute as pc
import pyarrow.dataset as ds

# Create a filter expression
filter_expr = (pc.field('age') > 18) & (pc.field('income') < 100000)

# Apply the expression to an in-memory table (the same expression can also be used to filter a dataset)
table = pa.table({
    'age': [25, 15, 35, 45],
    'income': [50000, 0, 80000, 120000]
})

filtered = table.filter(filter_expr)
print(filtered)

Function Registry

list_functions()

List all available compute functions.
functions = pc.list_functions()
print(functions[:10])  # Show first 10
functions
list of str
Names of all available functions.

get_function()

Get a function by name.
func = pc.get_function('add')
print(func)
name
str
Function name.
function
Function
The compute function object.

call_function()

Call a function by name.
pc.call_function(name, args, options=None, memory_pool=None)
name
str
Function name to call.
args
list
Function arguments.
options
FunctionOptions
default:"None"
Function-specific options.
result
Array or Scalar
Function result.

Build docs developers (and LLMs) love