Overview
The filesystem API provides a unified interface for interacting with various storage systems, including local filesystems, cloud storage (S3, Azure, GCS), and HDFS.
import pyarrow.fs as fs
# Local filesystem
local = fs.LocalFileSystem()
# S3 filesystem
s3 = fs.S3FileSystem(region='us-east-1')
# List files
file_infos = local.get_file_info(fs.FileSelector('/path/to/dir', recursive=True))
for info in file_infos:
print(info.path, info.type, info.size)
FileSystem
Base class for all filesystem implementations.
from_uri()
Create a filesystem from a URI.
URI string (e.g., 's3://bucket/path', 'file:///local/path').
result
tuple of (FileSystem, str)
Tuple of (filesystem, path) where path is the path component of the URI.
Example
import pyarrow.fs as fs
# Parse S3 URI
filesystem, path = fs.FileSystem.from_uri('s3://my-bucket/data/')
print(type(filesystem)) # S3FileSystem
print(path) # 'my-bucket/data/'
# Parse local URI
filesystem, path = fs.FileSystem.from_uri('file:///home/user/data')
print(type(filesystem)) # LocalFileSystem
Methods
get_file_info()
Get information about files.
filesystem.get_file_info(paths_or_selector)
paths_or_selector
str, list of str, or FileSelector
Path(s) to get info for, or a FileSelector for recursive discovery.
info
FileInfo or list of FileInfo
File information object(s).
create_dir()
Create a directory.
filesystem.create_dir(path, recursive=True)
Directory path to create.
Create parent directories if needed.
delete_dir()
Delete a directory and its contents.
filesystem.delete_dir(path)
Directory path to delete.
delete_file()
Delete a file.
filesystem.delete_file(path)
move()
Move or rename a file/directory.
filesystem.move(src, dest)
copy_file()
Copy a file.
filesystem.copy_file(src, dest)
open_input_stream()
Open a file for reading.
stream = filesystem.open_input_stream('data.parquet')
data = stream.read()
stream.close()
Input stream for reading.
open_output_stream()
Open a file for writing.
stream = filesystem.open_output_stream('output.parquet')
stream.write(data)
stream.close()
Output stream for writing.
LocalFileSystem
Filesystem interface for local storage.
import pyarrow.fs as fs
local_fs = fs.LocalFileSystem()
# List files
for file_info in local_fs.get_file_info(fs.FileSelector('/path', recursive=True)):
print(file_info.path)
# Read file
with local_fs.open_input_stream('/path/to/file.txt') as f:
content = f.read()
Constructor
fs.LocalFileSystem(use_mmap=False)
Use memory mapping when opening files for reading.
A local filesystem object.
S3FileSystem
Filesystem interface for Amazon S3.
import pyarrow.fs as fs
s3 = fs.S3FileSystem(
region='us-east-1',
access_key='YOUR_ACCESS_KEY',
secret_key='YOUR_SECRET_KEY'
)
# List files in bucket
for file_info in s3.get_file_info(fs.FileSelector('my-bucket/', recursive=True)):
print(file_info.path)
# Read file from S3
with s3.open_input_stream('my-bucket/data.parquet') as f:
data = f.read()
Constructor
fs.S3FileSystem(access_key=None, secret_key=None, session_token=None,
region=None, scheme=None, endpoint_override=None,
background_writes=True, default_metadata=None,
role_arn=None, session_name=None, external_id=None,
load_frequency=900, proxy_options=None)
AWS access key ID. If not provided, uses AWS credentials from environment or config.
AWS session token for temporary credentials.
AWS region (e.g., 'us-east-1'). If not set, uses the AWS_DEFAULT_REGION environment variable.
URL scheme ('http' or 'https'). Defaults to 'https'.
Custom S3 endpoint URL (for S3-compatible storage).
Use background threads for writes.
AWS IAM role ARN to assume.
GcsFileSystem
Filesystem interface for Google Cloud Storage.
import pyarrow.fs as fs
gcs = fs.GcsFileSystem(
project_id='my-project',
access_token='YOUR_ACCESS_TOKEN'
)
# List files in bucket
for file_info in gcs.get_file_info(fs.FileSelector('my-bucket/', recursive=True)):
print(file_info.path)
Constructor
fs.GcsFileSystem(project_id=None, access_token=None, credential_token_expiration=None,
default_bucket_location='US', scheme=None, endpoint_override=None,
default_metadata=None, retry_time_limit=None)
OAuth2 access token. If not provided, uses default credentials.
Default location for new buckets.
AzureFileSystem
Filesystem interface for Azure Blob Storage.
import pyarrow.fs as fs
azure = fs.AzureFileSystem(
account_name='myaccount',
account_key='YOUR_ACCOUNT_KEY'
)
# List files in container
for file_info in azure.get_file_info(fs.FileSelector('container/', recursive=True)):
print(file_info.path)
Constructor
fs.AzureFileSystem(account_name, account_key=None, blob_storage_authority=None,
dfs_storage_authority=None, blob_storage_scheme='https',
dfs_storage_scheme='https')
Azure storage account name.
Azure storage account key.
An Azure filesystem object.
HadoopFileSystem
Filesystem interface for HDFS.
import pyarrow.fs as fs
hdfs = fs.HadoopFileSystem(
host='namenode.example.com',
port=8020,
user='hdfs_user'
)
# List files
for file_info in hdfs.get_file_info(fs.FileSelector('/data/', recursive=True)):
print(file_info.path)
Constructor
fs.HadoopFileSystem(host, port=8020, user=None, kerb_ticket=None,
driver='libhdfs', replication=3, buffer_size=0,
default_block_size=0)
An HDFS filesystem object.
SubTreeFileSystem
Wrap a filesystem with a base path prefix.
import pyarrow.fs as fs
# Create a SubTreeFileSystem rooted at /data/subset
base_fs = fs.LocalFileSystem()
subtree = fs.SubTreeFileSystem('/data/subset', base_fs)
# All paths are relative to /data/subset
subtree.get_file_info('file.txt') # Actually accesses /data/subset/file.txt
Constructor
fs.SubTreeFileSystem(base_path, base_fs)
Base path prefix for all operations.
FileInfo
Information about a file or directory.
import pyarrow.fs as fs
local_fs = fs.LocalFileSystem()
info = local_fs.get_file_info('data.parquet')
print(info.path) # Path to file
print(info.type) # FileType.File
print(info.size) # File size in bytes
print(info.mtime) # Modification time
Properties
path
The file path.
type
The file type.
One of FileType.File, FileType.Directory, FileType.NotFound, or FileType.Unknown.
size
File size in bytes.
Size in bytes (None for directories).
mtime
Modification time.
Last modification time (None if not available).
FileSelector
Selector for recursive file discovery.
import pyarrow.fs as fs
# Select all files recursively
selector = fs.FileSelector('/data', recursive=True)
# Select only files in directory (non-recursive)
selector = fs.FileSelector('/data', recursive=False)
# Use with filesystem
local_fs = fs.LocalFileSystem()
file_infos = local_fs.get_file_info(selector)
Constructor
fs.FileSelector(base_dir, allow_not_found=False, recursive=False)
Base directory to select from.
If False, raise an error if base_dir doesn't exist.
Recursively traverse subdirectories.
FileType
Enumeration of file types.
import pyarrow.fs as fs
fs.FileType.File # Regular file
fs.FileType.Directory # Directory
fs.FileType.NotFound # File doesn't exist
fs.FileType.Unknown # Unknown/special file type
Utility Functions
copy_files()
Copy files between filesystems.
fs.copy_files(source, destination, source_filesystem=None,
destination_filesystem=None, chunk_size=1048576, use_threads=True)
Source file or directory path.
Destination file or directory path.
Source filesystem (inferred if None).
Destination filesystem (inferred if None).
Chunk size for copying (in bytes).
Use multiple threads for copying.
Example
import pyarrow.fs as fs
# Copy from local to S3
fs.copy_files(
'local/data.parquet',
's3://bucket/data.parquet'
)
# Copy directory recursively
fs.copy_files(
'/local/data/',
's3://bucket/data/'
)
# Copy between specific filesystems
local = fs.LocalFileSystem()
s3 = fs.S3FileSystem(region='us-east-1')
fs.copy_files(
'/data/file.parquet',
'bucket/file.parquet',
source_filesystem=local,
destination_filesystem=s3
)