Description
DVCFileSystem provides a unified file system interface to access both DVC-tracked and Git-tracked files in a repository. It implements the fsspec protocol, making it compatible with many data science libraries like pandas, PyTorch, and TensorFlow.
This is a lower-level API compared to dvc.api.open() and dvc.api.read(), offering more control and better performance when working with multiple files from the same repository.
DVCFileSystem is also available as dvc.api.DVCFileSystem for convenience.
Signature
from dvc.api import DVCFileSystem
fs = DVCFileSystem(
repo: Union[Repo, os.PathLike[str], str, None] = None,
rev: Optional[str] = None,
subrepos: bool = False,
config: Optional[dict[str, Any]] = None,
remote: Optional[str] = None,
remote_config: Optional[dict[str, Any]] = None,
)
Parameters
repo
Union[Repo, os.PathLike, str, None]
default: None
A URL, path to a DVC/Git repository, or a Repo instance.
Defaults to the current working directory DVC project
Can be a local path or remote URL
Both HTTP and SSH protocols are supported
repo = None # Current directory
repo = "/path/to/local/repo"
repo = "https://github.com/user/repo"
repo = "git@github.com:user/repo.git"
rev
Optional[str]
default: None
Any Git revision such as a branch name, tag name, commit hash, or DVC experiment name.
Defaults to the default branch for remote repos
For local repos without rev, uses the working directory
Ignored if repo is not a Git repository
rev = "main"
rev = "v1.0.0"
rev = "abc123"
rev = "exp-random-forest"
subrepos
bool
default: False
Whether to traverse into subrepos (nested DVC repositories). subrepos = True # Include subrepos
subrepos = False # Ignore subrepos (default)
config
Optional[dict[str, Any]]
default: None
DVC config dictionary to be passed to the repository. config = {"cache": {"type": "symlink"}}
remote
Optional[str]
default: None
Name of the DVC remote to use. remote = "myremote"
remote = "s3-storage"
remote_config
Optional[dict[str, Any]]
default: None
Remote configuration dictionary. remote_config = {"url": "s3://bucket/path"}
Methods
File Operations
open(path, mode='rb', **kwargs)
Open a file for reading. with fs.open( 'data/file.csv' , mode = 'r' ) as f:
data = f.read()
get(rpath, lpath, recursive=False, **kwargs)
Download file(s) from the repository to local path. fs.get( 'data/dataset.csv' , 'local_dataset.csv' )
fs.get( 'data/' , 'local_data/' , recursive = True )
get_file(rpath, lpath, **kwargs)
Download a single file. fs.get_file( 'model.pkl' , 'local_model.pkl' )
Directory Operations
ls(path, detail=True, **kwargs)
List directory contents. # Get detailed info
files = fs.ls( 'data/' , detail = True )
# Get just paths
paths = fs.ls( 'data/' , detail = False )
walk(path, maxdepth=None, **kwargs)
Walk through directory tree. for root, dirs, files in fs.walk( 'data/' ):
for file in files:
print (fs.join(root, file ))
File Info
info(path, **kwargs)
Get file or directory information. info = fs.info( 'data/file.csv' )
print (info[ 'size' ])
print (info[ 'type' ]) # 'file' or 'directory'
exists(path, **kwargs)
Check if a path exists. if fs.exists( 'data/file.csv' ):
print ( "File exists" )
isfile(path)
Check if path is a file. if fs.isfile( 'data/file.csv' ):
print ( "Is a file" )
isdir(path)
Check if path is a directory. if fs.isdir( 'data/' ):
print ( "Is a directory" )
isdvc(path)
Check if path is DVC-tracked. if fs.isdvc( 'data/large_file.bin' ):
print ( "Tracked by DVC" )
Utilities
du(path, total=True, maxdepth=None, **kwargs)
Get disk usage for a path. size = fs.du( 'data/' , total = True )
print ( f "Total size: { size } bytes" )
getcwd()
Get current working directory.
join(*parts)
Join path components. path = fs.join( 'data' , 'subfolder' , 'file.csv' )
Examples
Basic File Reading
from dvc.api import DVCFileSystem
# Create filesystem instance
fs = DVCFileSystem(
repo = 'https://github.com/iterative/example-get-started' ,
rev = 'main'
)
# Read a file
with fs.open( 'data/data.xml' , mode = 'r' ) as f:
content = f.read()
print (content)
List Directory Contents
from dvc.api import DVCFileSystem
fs = DVCFileSystem()
# List files with details
files = fs.ls( 'data/' , detail = True )
for file_info in files:
print ( f "Name: { file_info[ 'name' ] } " )
print ( f "Size: { file_info[ 'size' ] } bytes" )
print ( f "Type: { file_info[ 'type' ] } " )
print ()
Download Files
from dvc.api import DVCFileSystem
fs = DVCFileSystem(
repo = 'https://github.com/user/ml-project' ,
rev = 'production'
)
# Download single file
fs.get_file( 'models/classifier.pkl' , 'local_classifier.pkl' )
# Download directory recursively
fs.get( 'data/' , 'local_data/' , recursive = True )
print ( "Data downloaded successfully" )
Walk Directory Tree
from dvc.api import DVCFileSystem
fs = DVCFileSystem()
# Walk through all files
for root, dirs, files in fs.walk( 'data/' ):
print ( f "Directory: { root } " )
for file in files:
file_path = fs.join(root, file )
size = fs.info(file_path)[ 'size' ]
print ( f " { file } : { size } bytes" )
Check DVC Tracking Status
from dvc.api import DVCFileSystem
fs = DVCFileSystem()
files = [ 'data/small.csv' , 'data/large.bin' , 'src/train.py' ]
for file in files:
if fs.exists( file ):
is_dvc = fs.isdvc( file )
status = "DVC-tracked" if is_dvc else "Git-tracked"
print ( f " { file } : { status } " )
Use with Pandas
from dvc.api import DVCFileSystem
import pandas as pd
fs = DVCFileSystem(
repo = 'https://github.com/user/data-repo' ,
rev = 'v1.0.0'
)
# Read CSV directly
with fs.open( 'data/dataset.csv' , mode = 'r' ) as f:
df = pd.read_csv(f)
print (df.head())
print ( f "Shape: { df.shape } " )
Use with NumPy
from dvc.api import DVCFileSystem
import numpy as np
fs = DVCFileSystem()
# Read NumPy array
with fs.open( 'data/features.npy' , mode = 'rb' ) as f:
array = np.load(f)
print ( f "Shape: { array.shape } " )
print ( f "Dtype: { array.dtype } " )
Multiple File Operations
from dvc.api import DVCFileSystem
# Create filesystem once for multiple operations
fs = DVCFileSystem(
repo = 'https://github.com/user/repo' ,
rev = 'main'
)
# Perform multiple operations efficiently
files = fs.ls( 'data/' , detail = False )
for file in files:
if file .endswith( '.csv' ):
with fs.open( file , mode = 'r' ) as f:
lines = f.readlines()
print ( f " { file } : { len (lines) } lines" )
Compare File Sizes
from dvc.api import DVCFileSystem
fs_main = DVCFileSystem( rev = 'main' )
fs_dev = DVCFileSystem( rev = 'development' )
file_path = 'data/dataset.csv'
if fs_main.exists(file_path) and fs_dev.exists(file_path):
size_main = fs_main.info(file_path)[ 'size' ]
size_dev = fs_dev.info(file_path)[ 'size' ]
print ( f "Main branch: { size_main :,} bytes" )
print ( f "Dev branch: { size_dev :,} bytes" )
print ( f "Difference: { abs (size_main - size_dev) :,} bytes" )
Get Directory Size
from dvc.api import DVCFileSystem
fs = DVCFileSystem()
# Get total size
total_size = fs.du( 'data/' , total = True )
print ( f "Total size: { total_size :,} bytes" )
print ( f "Total size: { total_size / ( 1024 ** 3 ) :.2f} GB" )
# Get size breakdown
sizes = fs.du( 'data/' , total = False , withdirs = True )
for path, size in sizes.items():
print ( f " { path } : { size :,} bytes" )
Filter DVC-Tracked Files
from dvc.api import DVCFileSystem
fs = DVCFileSystem()
# Find all DVC-tracked files
dvc_files = []
for root, dirs, files in fs.walk( './' ):
for file in files:
file_path = fs.join(root, file )
if fs.isdvc(file_path):
dvc_files.append(file_path)
print ( f "Found { len (dvc_files) } DVC-tracked files:" )
for file in dvc_files:
print ( f " { file } " )
Access Private Repository
from dvc.api import DVCFileSystem
# Access private repo via SSH (requires SSH keys configured)
fs = DVCFileSystem(
repo = 'git@github.com:company/private-ml-repo.git' ,
rev = 'production'
)
with fs.open( 'models/model.pkl' , mode = 'rb' ) as f:
model_data = f.read()
print ( f "Loaded { len (model_data) } bytes" )
Use Cases
Batch Processing Process multiple files from a repository efficiently.
Library Integration Use with pandas, PyTorch, TensorFlow via fsspec protocol.
Directory Operations List, walk, and analyze directory structures.
Performance Better performance than open()/read() for multiple operations.
fsspec Compatibility
DVCFileSystem implements the fsspec protocol, making it compatible with many libraries:
Pandas
import pandas as pd
from dvc.api import DVCFileSystem
fs = DVCFileSystem( repo = 'https://github.com/user/repo' )
# Method 1: Direct open
with fs.open( 'data/dataset.csv' ) as f:
df = pd.read_csv(f)
# Method 2: Using fsspec protocol
df = pd.read_csv(
'data/dataset.csv' ,
storage_options = { 'fs' : fs}
)
PyTorch
import torch
from dvc.api import DVCFileSystem
fs = DVCFileSystem()
with fs.open( 'models/model.pt' , mode = 'rb' ) as f:
model = torch.load(f)
Dask
import dask.dataframe as dd
from dvc.api import DVCFileSystem
fs = DVCFileSystem()
ddf = dd.read_csv(
'data/*.csv' ,
storage_options = { 'fs' : fs}
)
Comparison with dvc.api Functions
Feature      | DVCFileSystem       | dvc.api.open()/read()
Use Case     | Multiple operations | Single file access
Performance  | Better for batch    | Good for one-off
API Style    | Object-oriented     | Functional
Features     | Full fs operations  | Read-only
Context      | Reusable instance   | One-time operation
# DVCFileSystem - Better for multiple operations
fs = DVCFileSystem( repo = '...' )
for file in fs.ls( 'data/' ):
with fs.open( file ) as f:
process(f.read())
# dvc.api.open() - Better for single file
import dvc.api
with dvc.api.open( 'data/file.csv' , repo = '...' ) as f:
data = f.read()
Best Practices
Reuse filesystem instances
Create one instance and reuse it for multiple operations: # ✅ Good - Reuse instance
fs = DVCFileSystem( repo = '...' )
for file in files:
with fs.open( file ) as f:
process(f)
# ❌ Bad - Create new instance each time
for file in files:
fs = DVCFileSystem( repo = '...' )
with fs.open( file ) as f:
process(f)
Choose the right method for your use case: fs = DVCFileSystem()
# Check existence before accessing
if fs.exists( 'data/file.csv' ):
with fs.open( 'data/file.csv' ) as f:
data = f.read()
# Use get() for downloads
fs.get( 'data/' , 'local_data/' , recursive = True )
# Use isdvc() to check DVC tracking
if fs.isdvc( 'large_file.bin' ):
print ( "File is DVC-tracked" )
Close the filesystem when finished (or use context manager if available): fs = DVCFileSystem()
try :
# Do work
with fs.open( 'file.csv' ) as f:
data = f.read()
finally :
fs.close()
Leverage fsspec compatibility
Take advantage of fsspec protocol support in libraries: import pandas as pd
from dvc.api import DVCFileSystem
fs = DVCFileSystem()
# Many libraries can use the filesystem directly
df = pd.read_csv(
'data/dataset.csv' ,
storage_options = { 'fs' : fs}
)
open() Simple file opening
read() Simple file reading
Advanced Usage : See the fsspec documentation for more details on the file system protocol and advanced features.