Skip to main content

Overview

The filesystem API provides a unified interface for interacting with various storage systems, including local filesystems, cloud storage (S3, Azure, GCS), and HDFS.
import pyarrow.fs as fs

# Local filesystem
local = fs.LocalFileSystem()

# S3 filesystem
s3 = fs.S3FileSystem(region='us-east-1')

# List files
file_infos = local.get_file_info(fs.FileSelector('/path/to/dir', recursive=True))
for info in file_infos:
    print(info.path, info.type, info.size)

FileSystem

Base class for all filesystem implementations.

from_uri()

Create a filesystem from a URI.
FileSystem.from_uri(uri)
uri
str
URI string (e.g., ‘s3://bucket/path’, ‘file:///local/path’).
result
tuple of (FileSystem, str)
Tuple of (filesystem, path) where path is the path component of the URI.

Example

import pyarrow.fs as fs

# Parse S3 URI
filesystem, path = fs.FileSystem.from_uri('s3://my-bucket/data/')
print(type(filesystem))  # S3FileSystem
print(path)              # 'my-bucket/data/'

# Parse local URI
filesystem, path = fs.FileSystem.from_uri('file:///home/user/data')
print(type(filesystem))  # LocalFileSystem

Methods

get_file_info()

Get information about files.
filesystem.get_file_info(paths_or_selector)
paths_or_selector
str, list of str, or FileSelector
Path(s) to get info for, or a FileSelector for recursive discovery.
info
FileInfo or list of FileInfo
File information object(s).

create_dir()

Create a directory.
filesystem.create_dir(path, recursive=True)
path
str
Directory path to create.
recursive
bool
default:"True"
Create parent directories if needed.

delete_dir()

Delete a directory and its contents.
filesystem.delete_dir(path)
path
str
Directory path to delete.

delete_file()

Delete a file.
filesystem.delete_file(path)
path
str
File path to delete.

move()

Move or rename a file/directory.
filesystem.move(src, dest)
src
str
Source path.
dest
str
Destination path.

copy_file()

Copy a file.
filesystem.copy_file(src, dest)
src
str
Source file path.
dest
str
Destination file path.

open_input_stream()

Open a file for reading.
stream = filesystem.open_input_stream('data.parquet')
data = stream.read()
stream.close()
path
str
File path to open.
stream
NativeFile
Input stream for reading.

open_output_stream()

Open a file for writing.
stream = filesystem.open_output_stream('output.parquet')
stream.write(data)
stream.close()
path
str
File path to write to.
stream
NativeFile
Output stream for writing.

LocalFileSystem

Filesystem interface for local storage.
import pyarrow.fs as fs

local_fs = fs.LocalFileSystem()

# List files
for file_info in local_fs.get_file_info(fs.FileSelector('/path', recursive=True)):
    print(file_info.path)

# Read file
with local_fs.open_input_stream('/path/to/file.txt') as f:
    content = f.read()

Constructor

fs.LocalFileSystem(use_mmap=False)
use_mmap
bool
default:"False"
Use memory mapping when opening files for reading.
filesystem
LocalFileSystem
A local filesystem object.

S3FileSystem

Filesystem interface for Amazon S3.
import pyarrow.fs as fs

s3 = fs.S3FileSystem(
    region='us-east-1',
    access_key='YOUR_ACCESS_KEY',
    secret_key='YOUR_SECRET_KEY'
)

# List files in bucket
for file_info in s3.get_file_info(fs.FileSelector('my-bucket/', recursive=True)):
    print(file_info.path)

# Read file from S3
with s3.open_input_stream('my-bucket/data.parquet') as f:
    data = f.read()

Constructor

fs.S3FileSystem(access_key=None, secret_key=None, session_token=None, 
                region=None, scheme=None, endpoint_override=None,
                background_writes=True, default_metadata=None,
                role_arn=None, session_name=None, external_id=None,
                load_frequency=900, proxy_options=None)
access_key
str
default:"None"
AWS access key ID. If not provided, uses AWS credentials from environment or config.
secret_key
str
default:"None"
AWS secret access key.
session_token
str
default:"None"
AWS session token for temporary credentials.
region
str
default:"None"
AWS region (e.g., ‘us-east-1’). If not set, uses AWS_DEFAULT_REGION environment variable.
scheme
str
default:"None"
URL scheme (‘http’ or ‘https’). Defaults to ‘https’.
endpoint_override
str
default:"None"
Custom S3 endpoint URL (for S3-compatible storage).
background_writes
bool
default:"True"
Use background threads for writes.
role_arn
str
default:"None"
AWS IAM role ARN to assume.
filesystem
S3FileSystem
An S3 filesystem object.

GcsFileSystem

Filesystem interface for Google Cloud Storage.
import pyarrow.fs as fs

gcs = fs.GcsFileSystem(
    project_id='my-project',
    access_token='YOUR_ACCESS_TOKEN'
)

# List files in bucket
for file_info in gcs.get_file_info(fs.FileSelector('my-bucket/', recursive=True)):
    print(file_info.path)

Constructor

fs.GcsFileSystem(project_id=None, access_token=None, credential_token_expiration=None,
                default_bucket_location='US', scheme=None, endpoint_override=None,
                default_metadata=None, retry_time_limit=None)
project_id
str
default:"None"
GCS project ID.
access_token
str
default:"None"
OAuth2 access token. If not provided, uses default credentials.
default_bucket_location
str
default:"'US'"
Default location for new buckets.
filesystem
GcsFileSystem
A GCS filesystem object.

AzureFileSystem

Filesystem interface for Azure Blob Storage.
import pyarrow.fs as fs

azure = fs.AzureFileSystem(
    account_name='myaccount',
    account_key='YOUR_ACCOUNT_KEY'
)

# List files in container
for file_info in azure.get_file_info(fs.FileSelector('container/', recursive=True)):
    print(file_info.path)

Constructor

fs.AzureFileSystem(account_name, account_key=None, blob_storage_authority=None,
                   dfs_storage_authority=None, blob_storage_scheme='https',
                   dfs_storage_scheme='https')
account_name
str
Azure storage account name.
account_key
str
default:"None"
Azure storage account key.
filesystem
AzureFileSystem
An Azure filesystem object.

HadoopFileSystem

Filesystem interface for HDFS.
import pyarrow.fs as fs

hdfs = fs.HadoopFileSystem(
    host='namenode.example.com',
    port=8020,
    user='hdfs_user'
)

# List files
for file_info in hdfs.get_file_info(fs.FileSelector('/data/', recursive=True)):
    print(file_info.path)

Constructor

fs.HadoopFileSystem(host, port=8020, user=None, kerb_ticket=None,
                    replication=3, buffer_size=0,
                    default_block_size=None, extra_conf=None)
host
str
HDFS namenode hostname.
port
int
default:"8020"
HDFS namenode port.
user
str
default:"None"
Username for HDFS.
filesystem
HadoopFileSystem
An HDFS filesystem object.

SubTreeFileSystem

Wrap a filesystem with a base path prefix.
import pyarrow.fs as fs

# Create a SubTreeFileSystem rooted at /data/subset
base_fs = fs.LocalFileSystem()
subtree = fs.SubTreeFileSystem('/data/subset', base_fs)

# All paths are relative to /data/subset
subtree.get_file_info('file.txt')  # Actually accesses /data/subset/file.txt

Constructor

fs.SubTreeFileSystem(base_path, base_fs)
base_path
str
Base path prefix for all operations.
base_fs
FileSystem
Underlying filesystem.
filesystem
SubTreeFileSystem
A subtree filesystem.

FileInfo

Information about a file or directory.
import pyarrow.fs as fs

local_fs = fs.LocalFileSystem()
info = local_fs.get_file_info('data.parquet')

print(info.path)       # Path to file
print(info.type)       # FileType.File
print(info.size)       # File size in bytes
print(info.mtime)      # Modification time

Properties

path

The file path.
path
str
File or directory path.

type

The file type.
type
FileType
One of FileType.File, FileType.Directory, FileType.NotFound, or FileType.Unknown.

size

File size in bytes.
size
int
Size in bytes (None for directories).

mtime

Modification time.
mtime
datetime.datetime
Last modification time (None if not available).

FileSelector

Selector for recursive file discovery.
import pyarrow.fs as fs

# Select all files recursively
selector = fs.FileSelector('/data', recursive=True)

# Select only files in directory (non-recursive)
selector = fs.FileSelector('/data', recursive=False)

# Use with filesystem
local_fs = fs.LocalFileSystem()
file_infos = local_fs.get_file_info(selector)

Constructor

fs.FileSelector(base_dir, allow_not_found=False, recursive=False)
base_dir
str
Base directory to select from.
allow_not_found
bool
default:"False"
If False, an error is raised when base_dir does not exist.
recursive
bool
default:"False"
Recursively traverse subdirectories.
selector
FileSelector
A file selector object.

FileType

Enumeration of file types.
import pyarrow.fs as fs

fs.FileType.File        # Regular file
fs.FileType.Directory   # Directory
fs.FileType.NotFound    # File doesn't exist
fs.FileType.Unknown     # Unknown/special file type

Utility Functions

copy_files()

Copy files between filesystems.
fs.copy_files(source, destination, source_filesystem=None, 
             destination_filesystem=None, chunk_size=1048576, use_threads=True)
source
str
Source file or directory path.
destination
str
Destination file or directory path.
source_filesystem
FileSystem
default:"None"
Source filesystem (inferred if None).
destination_filesystem
FileSystem
default:"None"
Destination filesystem (inferred if None).
chunk_size
int
default:"1048576"
Chunk size for copying (in bytes).
use_threads
bool
default:"True"
Use multiple threads for copying.

Example

import pyarrow.fs as fs

# Copy from local to S3
fs.copy_files(
    'local/data.parquet',
    's3://bucket/data.parquet'
)

# Copy directory recursively
fs.copy_files(
    '/local/data/',
    's3://bucket/data/'
)

# Copy between specific filesystems
local = fs.LocalFileSystem()
s3 = fs.S3FileSystem(region='us-east-1')

fs.copy_files(
    '/data/file.parquet',
    'bucket/file.parquet',
    source_filesystem=local,
    destination_filesystem=s3
)

Build docs developers (and LLMs) love