Skip to main content
The Schema class manages table definitions, column types, and data validation rules for your data pipeline. It handles schema evolution, normalization, and ensures data integrity.

Creating a Schema

Schemas are typically created automatically by dlt, but can also be created explicitly:
from dlt.common.schema import Schema

schema = Schema("my_schema")

Properties

name

The name of the schema.
name = schema.name
Returns: str - Schema name Source: ~/workspace/source/dlt/common/schema/schema.py:654

tables

Dictionary of all tables in the schema.
for table_name, table_def in schema.tables.items():
    print(f"Table: {table_name}")
Returns: TSchemaTables - Dictionary of table schemas Source: ~/workspace/source/dlt/common/schema/schema.py:658

version

Current version of the schema content, incremented when modified.
version = schema.version
Returns: int - Current schema version Source: ~/workspace/source/dlt/common/schema/schema.py:606

stored_version

Version of the schema from the time it was loaded/created.
stored = schema.stored_version
Returns: int - Stored schema version Source: ~/workspace/source/dlt/common/schema/schema.py:617

version_hash

Current version hash computed from schema content.
hash_value = schema.version_hash
Returns: str - Schema version hash Source: ~/workspace/source/dlt/common/schema/schema.py:626

is_modified

Checks whether the schema has been modified since it was last saved.
if schema.is_modified:
    print("Schema has unsaved changes")
Returns: bool - True if schema is modified Source: ~/workspace/source/dlt/common/schema/schema.py:640

is_new

Checks whether the schema is new, i.e. has never been saved.
if schema.is_new:
    print("Schema has not been saved yet")
Returns: bool - True if schema is new Source: ~/workspace/source/dlt/common/schema/schema.py:648

settings

Schema settings including default hints and preferred types.
settings = schema.settings
Returns: TSchemaSettings - Schema settings dictionary Source: ~/workspace/source/dlt/common/schema/schema.py:715

naming

Naming convention used by the schema to normalize identifiers.
naming = schema.naming
Returns: NamingConvention - Naming convention instance

Methods

get_table()

Gets a table schema by name.
table = schema.get_table("users")
table_name
str
required
Name of the table to retrieve.
Returns: TTableSchema - Table schema dictionary Raises: TableNotFound - If the table doesn't exist Source: ~/workspace/source/dlt/common/schema/schema.py:537

get_table_columns()

Gets columns of a table, optionally including incomplete columns.
columns = schema.get_table_columns("users", include_incomplete=False)
table_name
str
required
Name of the table.
include_incomplete
bool
default:"False"
Whether to include columns without data type.
Returns: TTableSchemaColumns - Dictionary of column schemas Source: ~/workspace/source/dlt/common/schema/schema.py:543

update_table()

Adds or merges a partial table schema into the schema.
partial_table = {
    "name": "users",
    "columns": {
        "id": {"data_type": "bigint", "nullable": False},
        "name": {"data_type": "text"}
    }
}

schema.update_table(partial_table)
partial_table
TPartialTableSchema
required
Table schema to add or merge.
normalize_identifiers
bool
default:"True"
Whether to normalize identifiers using naming convention.
from_diff
bool
default:"False"
If True, partial_table is treated as a diff to apply directly.
merge_compound_props
bool
default:"True"
If False, compound properties replace rather than merge.
Returns: TPartialTableSchema - The applied partial table Source: ~/workspace/source/dlt/common/schema/schema.py:350

update_schema()

Updates this schema from an incoming schema.
schema.update_schema(other_schema)
schema
Schema
required
Schema to merge from.
Source: ~/workspace/source/dlt/common/schema/schema.py:405

drop_tables()

Drops tables from the schema.
dropped = schema.drop_tables(["table1", "table2"])
table_names
Sequence[str]
required
List of table names to drop. Must include all nested tables.
Returns: List[TTableSchema] - List of dropped table schemas Source: ~/workspace/source/dlt/common/schema/schema.py:420

data_tables()

Gets list of all data tables (excludes dlt internal tables).
tables = schema.data_tables(seen_data_only=True)
seen_data_only
bool
default:"False"
Only include tables that have seen data.
include_incomplete
bool
default:"False"
Include tables without columns.
Returns: List[TTableSchema] - List of data table schemas Source: ~/workspace/source/dlt/common/schema/schema.py:553

data_table_names()

Returns list of data table names.
names = schema.data_table_names(seen_data_only=True)
seen_data_only
bool
default:"False"
Only include tables that have seen data.
include_incomplete
bool
default:"False"
Include incomplete tables.
Returns: List[str] - List of table names Source: ~/workspace/source/dlt/common/schema/schema.py:570

dlt_tables()

Gets dlt internal tables.
internal_tables = schema.dlt_tables()
Returns: List[TTableSchema] - List of dlt table schemas Source: ~/workspace/source/dlt/common/schema/schema.py:581

clone()

Creates a deep copy of the schema.
cloned = schema.clone(
    with_name="new_schema",
    remove_processing_hints=True
)
with_name
str
default:"None"
New name for the cloned schema.
remove_processing_hints
bool
default:"False"
Remove processing markers (x-normalizer, x-loader hints).
update_normalizers
bool
default:"False"
Update normalizers and identifiers in cloned schema.
Returns: Schema - Cloned schema Source: ~/workspace/source/dlt/common/schema/schema.py:898

set_schema_contract()

Sets schema contract settings.
schema.set_schema_contract({
    "tables": "freeze",
    "columns": "evolve",
    "data_type": "discard_value"
})
settings
TSchemaContract
required
Contract settings to apply, or None to remove the existing contract settings.
Source: ~/workspace/source/dlt/common/schema/schema.py:943

to_dict()

Converts schema to dictionary representation.
schema_dict = schema.to_dict(
    remove_defaults=True,
    remove_processing_hints=True
)
remove_defaults
bool
default:"False"
Remove default values from output.
remove_processing_hints
bool
default:"False"
Remove processing hints.
bump_version
bool
default:"True"
Increment version if modified.
Returns: TStoredSchema - Schema as dictionary Source: ~/workspace/source/dlt/common/schema/schema.py:739

to_pretty_json()

Converts schema to formatted JSON string.
json_str = schema.to_pretty_json(remove_defaults=True)
remove_defaults
bool
default:"True"
Remove default values.
remove_processing_hints
bool
default:"False"
Remove processing hints.
Returns: str - Pretty-printed JSON Source: ~/workspace/source/dlt/common/schema/schema.py:774

to_pretty_yaml()

Converts schema to formatted YAML string.
yaml_str = schema.to_pretty_yaml(remove_defaults=True)
remove_defaults
bool
default:"True"
Remove default values.
remove_processing_hints
bool
default:"False"
Remove processing hints.
Returns: str - Pretty-printed YAML Source: ~/workspace/source/dlt/common/schema/schema.py:782

Example Usage

Accessing Schema Information

import dlt

pipeline = dlt.pipeline(
    pipeline_name="my_pipeline",
    destination="duckdb"
)

# Get schema from pipeline
schema = pipeline.default_schema

print(f"Schema name: {schema.name}")
print(f"Version: {schema.version}")
print(f"Tables: {schema.data_table_names()}")

# Get specific table
users_table = schema.get_table("users")
print(f"Users columns: {list(users_table['columns'].keys())}")

Setting Schema Contract

import dlt

schema = dlt.Schema("strict_schema")

# Freeze schema - no new tables or columns allowed
schema.set_schema_contract({
    "tables": "freeze",
    "columns": "freeze",
    "data_type": "freeze"
})

# Or use shorthand
schema.set_schema_contract("freeze")

Modifying Schema

from dlt.common.schema import Schema

schema = Schema("my_schema")

# Add a new table
table_schema = {
    "name": "users",
    "columns": {
        "id": {"data_type": "bigint", "nullable": False},
        "name": {"data_type": "text"},
        "email": {"data_type": "text", "unique": True},
        "created_at": {"data_type": "timestamp"}
    },
    "write_disposition": "append"
}

schema.update_table(table_schema)

# Get table columns
columns = schema.get_table_columns("users")
for col_name, col_def in columns.items():
    print(f"{col_name}: {col_def['data_type']}")

Schema Versioning

import dlt

pipeline = dlt.pipeline(
    pipeline_name="versioned",
    destination="postgres"
)

schema = pipeline.default_schema

print(f"Current version: {schema.version}")
print(f"Version hash: {schema.version_hash}")
print(f"Is modified: {schema.is_modified}")
print(f"Is new: {schema.is_new}")

Exporting Schema

from dlt.common.schema import Schema

schema = Schema("export_example")

# Export as JSON
json_schema = schema.to_pretty_json(remove_defaults=True)
with open("schema.json", "w") as f:
    f.write(json_schema)

# Export as YAML
yaml_schema = schema.to_pretty_yaml(remove_defaults=True)
with open("schema.yaml", "w") as f:
    f.write(yaml_schema)

# Export as dictionary
schema_dict = schema.to_dict()

Cloning and Modifying Schema

import dlt

pipeline = dlt.pipeline(
    pipeline_name="source",
    destination="duckdb"
)

original_schema = pipeline.default_schema

# Clone with new name
cloned_schema = original_schema.clone(
    with_name="modified_schema",
    remove_processing_hints=True
)

# Modify cloned schema
cloned_schema.set_schema_contract("freeze")

Build docs developers (and LLMs) love