What is SyftBox?
SyftBox is a file-based communication layer that enables:
- Offline-capable FL: Asynchronous model training across participants
- Privacy-first: Data never leaves the owner’s machine
- Zero servers: Communication happens through synced folders
- End-to-end encryption: Secure message passing with RPC and crypto
Architecture
A local SyftBox network consists of:
You can also install the SyftBox UI app from syftbox.net for a graphical interface.
~/SyftBox)
Your email serves as your unique identifier across the SyftBox network. Use a valid email that you control.
~/SyftBox/
├── .syftbox/ # Configuration and keys
│ ├── config.json # Client configuration
│ └── client_config.json # Connection settings
├── datasites/ # Peer datasites appear here
├── public/ # Your public folder (readable by all)
├── private/ # Your private data (only you can read)
├── api_data/ # API and app data
└── sync/ # Sync metadata
import syft_rds as sy
# Initialize as admin on your own datasite
do_email = "[email protected]" # Your email
do_client = sy.init_session(host=do_email, email=do_email)
# Verify admin access
print(f"Admin access: {do_client.is_admin}")
from pathlib import Path
# Prepare your dataset
DATASET_DIR = Path("./my-dataset").absolute()
# Ensure it has the required structure:
# my-dataset/
# ├── private/
# │ ├── train.csv
# │ └── test.csv
# └── mock/
# ├── train.csv
# └── test.csv
# Create the dataset in SyftBox
do_client.dataset.create(
name="pima-indians-diabetes-database",
asset_path=DATASET_DIR,
description="Diabetes prediction dataset",
)
print("✅ Dataset created successfully")
Private data (`private/`) stays on your machine and is never shared. Only mock data (`mock/`) is accessible to data scientists for development.
# List all datasets
datasets = do_client.dataset.get_all()
for ds in datasets:
print(f"Dataset: {ds.name}")
print(f" Private path: {ds.get_private_path()}")
print(f" Mock path: {ds.get_mock_path()}")
import syft_rds as sy
# Initialize your session
ds_email = "[email protected]"
ds = sy.init_session(host=ds_email, email=ds_email)
print(f"Data scientist logged in: {ds_email}")
# Connect to first data owner
do1_email = "[email protected]"
do1_client = sy.init_session(
host=do1_email,
email=ds_email, # Login as yourself (guest)
start_syft_event_server=False,
)
print(f"Connected to {do1_email}")
print(f"Admin access: {do1_client.is_admin}") # Should be False
# Connect to second data owner
do2_email = "[email protected]"
do2_client = sy.init_session(
host=do2_email,
email=ds_email,
start_syft_event_server=False,
)
print(f"Connected to {do2_email}")
# Get DO1's datasets
do1_datasets = do1_client.dataset.get_all()
for ds in do1_datasets:
print(f"\nDataset: {ds.name}")
print(f" Description: {ds.description}")
# You can access mock data
mock_path = ds.get_mock_path()
print(f" Mock data: {mock_path}")
# But NOT private data (will raise error)
try:
private_path = ds.get_private_path()
except Exception as e:
print(f" Private data: ❌ Access denied")
# On first machine
syftbox client
# Enter: [email protected]
# On second machine
syftbox client
# Enter: [email protected]
# On third machine
syftbox client
# Enter: [email protected]
import os
from pathlib import Path
# Point to your SyftBox config
ds = sy.init_session(host="[email protected]", email="[email protected]")
os.environ["SYFTBOX_CLIENT_CONFIG_PATH"] = str(ds.syftbox_client.config_path)
# Configure logging
os.environ["LOGURU_LEVEL"] = "DEBUG"
# Set message timeout (in seconds)
os.environ["SYFT_FLWR_MSG_TIMEOUT"] = "30"
print("✅ Environment configured")
import syft_flwr
from pathlib import Path
# Your FL project directory
PROJECT_PATH = Path("./fl-diabetes-prediction")
# Mock dataset paths from data owners
mock_paths = [
do1_client.dataset.get(name="pima-indians-diabetes-database").get_mock_path(),
do2_client.dataset.get(name="pima-indians-diabetes-database").get_mock_path(),
]
print(f"Mock paths: {mock_paths}")
# Run simulation
syft_flwr.run(PROJECT_PATH, mock_paths)
import os
# Encryption is enabled by default
# To explicitly control:
os.environ["SYFT_FLWR_ENCRYPTION_ENABLED"] = "true" # Enable
# os.environ["SYFT_FLWR_ENCRYPTION_ENABLED"] = "false" # Disable (dev only)
from syft_crypto.x3dh_bootstrap import ensure_bootstrap
# This runs automatically in syft_flwr
client = ensure_bootstrap(syftbox_client)
print("🔐 End-to-end encryption enabled")
import syft_flwr
PROJECT_PATH = Path("./fl-diabetes-prediction")
do_emails = ["[email protected]", "[email protected]"]
ds_email = "[email protected]"
syft_flwr.bootstrap(
PROJECT_PATH,
aggregator=ds_email,
datasites=do_emails,
transport="syftbox", # Use local SyftBox
)
# Clean before submitting
!rm -rf {PROJECT_PATH / "fl_diabetes_prediction" / "__pycache__"}
!rm -rf {PROJECT_PATH / "simulation_logs"}
# Submit to DO1
do1_client.job.submit(
name="fl-diabetes-prediction",
user_code_path=PROJECT_PATH,
dataset_name="pima-indians-diabetes-database",
entrypoint="main.py",
)
# Submit to DO2
do2_client.job.submit(
name="fl-diabetes-prediction",
user_code_path=PROJECT_PATH,
dataset_name="pima-indians-diabetes-database",
entrypoint="main.py",
)
print("✅ Jobs submitted, waiting for approval...")
# Data owner reviews and approves
jobs = do1_client.job.get_all()
pending_job = jobs[0]
print(f"Job from: {pending_job.created_by}")
print(f"Code path: {pending_job.user_code_path}")
# Approve the job
do1_client.job.approve(pending_job)
print("✅ Job approved")
# Data scientist submits server job to themselves
server_job = ds.job.submit(
name="fl-diabetes-prediction-server",
user_code_path=PROJECT_PATH,
entrypoint="main.py",
)
# Auto-approve own job
ds.job.approve(server_job)
# Run the server (blocking)
ds.run_private(server_job, blocking=True)
# View logs
ds.job.show_logs(server_job)
# Check output directory
output_dir = ds.job.get_output_dir(server_job)
print(f"Results saved to: {output_dir}")
# List model checkpoints
weights_dir = output_dir / "weights"
for weight_file in weights_dir.glob("*.safetensors"):
print(f" - {weight_file.name}")
# View sync status
syftbox status
# Check recent sync activity
tail -f ~/SyftBox/.syftbox/logs/sync.log
SyftBox vs Google Drive Transport
| Feature | SyftBox (Local) | Google Drive (P2P) |
|---|---|---|
| Latency | Low (less than 1s) | High (30-60s) |
| Encryption | ✅ End-to-end | ❌ No encryption |
| Setup | Install client | Just browser |
| Offline | ✅ Full support | Limited |
| Networking | None required | None required |
| Best for | Production FL | Quick experiments |
Folder Permissions
SyftBox enforces strict folder permissions:
Advanced Configuration
Customize your SyftBox setup:
Change Sync Directory
Adjust Sync Frequency
Edit `~/SyftBox/.syftbox/config.json`:
Use Custom Network
For private networks:
Troubleshooting
SyftBox client won't start
SyftBox client won't start
Check that port 8080 isn’t in use:
Kill any conflicting process or configure a different port.
Datasites not syncing
Datasites not syncing
Verify:
- SyftBox client is running (`syftbox status`)
- You’re connected to the network
- Check sync logs: `tail -f ~/SyftBox/.syftbox/logs/sync.log`
Permission denied on dataset
Permission denied on dataset
Ensure:
- You’re using the correct client (admin for private, guest for mock)
- Dataset is properly registered
- Paths exist and have correct permissions
FL messages not arriving
FL messages not arriving
- Increase timeout: `os.environ["SYFT_FLWR_MSG_TIMEOUT"] = "60"`
- Check `app_name` matches in all participants
- Verify encryption keys are bootstrapped
What’s Next?
- Try the first FL project tutorial
- Learn about custom strategies
- Explore example notebooks