Skip to main content

Overview

The MSA (Multiple Sequence Alignment) module provides tools for generating, processing, and featurising multiple sequence alignments. MSAs capture evolutionary information that AlphaFold 3 uses to predict protein structure.

Classes

Msa

Container for multiple sequence alignments with manipulation methods.
class Msa:
    def __init__(
        self,
        query_sequence: str,
        chain_poly_type: str,
        sequences: Sequence[str],
        descriptions: Sequence[str],
        deduplicate: bool = True,
    )
query_sequence
str
required
The sequence used to search for the MSA.
chain_poly_type
str
required
Polymer type of the query sequence (see mmcif_names for valid types: PROTEIN_CHAIN, RNA_CHAIN, DNA_CHAIN).
sequences
Sequence[str]
required
MSA sequences returned by the search tool. The first sequence must match the query in featurised form. If no sequences are given, the MSA is initialised with the query sequence only.
descriptions
Sequence[str]
required
Metadata for each MSA sequence. Must match length of sequences.
deduplicate
bool
default:"True"
Whether to deduplicate MSA sequences in input order. Lowercase letters (insertions) are ignored during deduplication.
Properties:
depth
int
Number of sequences in the MSA.
query_sequence
str
The original query sequence.
chain_poly_type
str
The polymer type of the sequences.
sequences
list[str]
List of MSA sequences.
descriptions
list[str]
List of sequence descriptions.
Example:
from alphafold3.data import msa
from alphafold3.constants import mmcif_names

# Create MSA from sequences
query_seq = "MKTAYIAKQRQISFVKSHFSRQLEERLGLIEVQAPILSRVGDGTQDNLSGAEKAVQVKVKALPDAQFEVVHSLAKWKRQTLGQHDFSAGEGLYTHMKALRPDEDRLSPLHSVYVDQWDWERVMGDGERQFSTLKSTVEAIWAGIKATEAAVSEEFGLAPFLPDQIHFVHSQELLSRYPDLDAKGRERAIAKDLGAVFLVGIGGKLSDGHRHDVRAPDYDDWSTPSELGHAGLNGDILVWNPVLEDAFELSSMGIRVDADTLKHQLALTGDEDRLELEWHQALLRGEMPQTIGGGIGQSRLTMLLLQLPHIGQVQAGVWPAAVRESVPSLL"

sequences = [
    query_seq,
    "MKTAYIAKQRQISFVKSHFSRQLEERLGLIEVQAPILSRVGDGTQD-LSGAEKAVQVKVKALPDAQFEVVHSLAKWKRQTLGQHDFSAGEGLYTHMKALRPDEDRLSPLHSVYVDQWDWERVMGDGERQFSTLKSTVEAIWAGIKATEAAVSEEFGLAPFLPDQIHFVHSQELLSRYPDLDAKGRERAIAKDLGAVFLVGIGGKLSDGHRHDVRAPDYDDWSTPSELGHAGLNGDILVWNPVLEDAFELSSMGIRVDADTLKHQLALTGDEDRLELEWHQALLRGEMPQTIGGGIGQSRLTMLLLQLPHIGQVQAGVWPAAVRESVPSLL",
    "MKTAYIAKQRQISFVKSHFSRQLEERLGLIEVQAPILSRVGDGTQDNLSGAEKAVQVKVKALPDAQFEVVHSLAKWKRQTLGQHDFSAGEGLYTHMKALRPDEDRLSPLHSVYVDQWDWERVMGDGERQFSTLKSTVEAIWAGIKATEAAVSEEFGLAPFLPDQIHFVHSQELLSRYPDLDAKGRERaIAKDLGAVFLVGIGGKLSDGHRHDVRAPDYDDWSTPSELGHAGLNGDILVWNPVLEDAFELSSMGIRVDADTLKHQLALTGDEDRLELEWHQALLRGEMPQTIGGGIGQSRLTMLLLQLPHIGQVQAGVWPAAVRESVPSLL"
]

descriptions = [
    "Original query",
    "UniRef90_A0A123ABC1",
    "UniRef90_B1B234DEF2"
]

msa_obj = msa.Msa(
    query_sequence=query_seq,
    chain_poly_type=mmcif_names.PROTEIN_CHAIN,
    sequences=sequences,
    descriptions=descriptions,
    deduplicate=True
)

print(f"MSA depth: {msa_obj.depth}")
print(f"Polymer type: {msa_obj.chain_poly_type}")

Class Methods

from_a3m
Parse a single A3M format string and build an MSA object.
@classmethod
def from_a3m(
    cls,
    query_sequence: str,
    chain_poly_type: str,
    a3m: str,
    max_depth: int | None = None,
    deduplicate: bool = True,
) -> Self
query_sequence
str
required
The query sequence used for MSA search.
chain_poly_type
str
required
Polymer type of the sequence.
a3m
str
required
MSA in A3M format.
max_depth
int | None
Maximum number of sequences to keep. If specified and positive, crops MSA to this depth.
deduplicate
bool
default:"True"
Whether to deduplicate sequences.
Example:
a3m_string = """>query
MKTAYIAKQRQISFVKSHFSRQLEERLGLIEVQAPILSRVGDGTQDNLSGAEKAVQVKVKALPDAQ
>hit1
MKTAYIAKQRQISFVKSHFSRQLEERLGLIEVQAPILSRVGDGTQD-LSGAEKAVQVKVKALPDAQ
"""

msa_obj = msa.Msa.from_a3m(
    query_sequence="MKTAYIAKQRQISFVKSHFSRQLEERLGLIEVQAPILSRVGDGTQDNLSGAEKAVQVKVKALPDAQ",
    chain_poly_type=mmcif_names.PROTEIN_CHAIN,
    a3m=a3m_string,
    max_depth=5000
)
from_multiple_a3ms
Merge multiple A3M strings into a single MSA.
@classmethod
def from_multiple_a3ms(
    cls,
    a3ms: Sequence[str],
    chain_poly_type: str,
    deduplicate: bool = True,
) -> Self
a3ms
Sequence[str]
required
Multiple A3M strings from different tools/databases. Query sequences must match across all A3Ms.
chain_poly_type
str
required
Polymer type of the sequences.
deduplicate
bool
default:"True"
Whether to deduplicate merged sequences.
Example:
# Combine results from multiple databases
uniref_a3m = "..."
unclustered_a3m = "..."

msa_obj = msa.Msa.from_multiple_a3ms(
    a3ms=[uniref_a3m, unclustered_a3m],
    chain_poly_type=mmcif_names.PROTEIN_CHAIN,
    deduplicate=True
)
from_multiple_msas
Merge multiple MSA objects into one.
@classmethod
def from_multiple_msas(
    cls,
    msas: Sequence[Self],
    deduplicate: bool = True
) -> Self
msas
Sequence[Msa]
required
Multiple MSA objects. All must have matching query sequences and polymer types.
deduplicate
bool
default:"True"
Whether to deduplicate merged sequences.
from_empty
Create an empty MSA containing only the query sequence.
@classmethod
def from_empty(cls, query_sequence: str, chain_poly_type: str) -> Self
Example:
# Useful when MSA search returns no results
empty_msa = msa.Msa.from_empty(
    query_sequence="MKTAYIAKQRQISFVK",
    chain_poly_type=mmcif_names.PROTEIN_CHAIN
)
print(f"Empty MSA depth: {empty_msa.depth}")  # Output: 1

Instance Methods

to_a3m
Convert the MSA to A3M format string.
def to_a3m(self) -> str
Example:
a3m_output = msa_obj.to_a3m()
with open("output.a3m", "w") as f:
    f.write(a3m_output)
featurize
Convert MSA to numerical features for model input.
def featurize(self) -> MutableMapping[str, np.ndarray]
return
MutableMapping[str, np.ndarray]
Dictionary with keys:
  • msa: Encoded MSA sequences as integer array
  • deletion_matrix: Deletion counts at each position
  • msa_species_identifiers: Species IDs extracted from descriptions
  • num_alignments: Total number of sequences
Raises:
  • msa.Error: If the sequences have different lengths after lowercase letters (insertions) are removed, if the MSA contains an unknown residue code, or if the MSA is empty after alignment
Example:
try:
    features = msa_obj.featurize()
    print(f"MSA shape: {features['msa'].shape}")
    print(f"Deletion matrix shape: {features['deletion_matrix'].shape}")
    print(f"Number of alignments: {features['num_alignments']}")
except msa.Error as e:
    print(f"Featurization failed: {e}")

Functions

get_msa

Run MSA search tool and return MSA object.
def get_msa(
    target_sequence: str,
    run_config: msa_config.RunConfig,
    chain_poly_type: str,
    deduplicate: bool = False,
) -> Msa
target_sequence
str
required
The amino acid or nucleotide sequence to search.
run_config
msa_config.RunConfig
required
MSA run configuration specifying tool and parameters.
chain_poly_type
str
required
Type of chain for MSA search (protein, RNA, DNA).
deduplicate
bool
default:"False"
Whether to deduplicate the MSA sequences. Lowercase letters (insertions) are ignored when comparing sequences.
return
Msa
MSA object containing aligned sequences.
Example:
from alphafold3.data import msa_config

# Configure Jackhmmer search
config = msa_config.RunConfig(
    config=msa_config.JackhmmerConfig(
        binary_path="/usr/bin/jackhmmer",
        database_config=msa_config.DatabaseConfig(path="/data/uniref90.fasta"),
        n_cpu=8,
        n_iter=1,
        e_value=0.0001,
        max_sequences=10000
    ),
    crop_size=5000
)

# Run search
msa_result = msa.get_msa(
    target_sequence="MKTAYIAKQRQISFVKSHFSRQLEERLGLIEVQAPILSRVGDGTQDNLSGAEKAVQVKVKALPDAQFEVVHSLAKWKRQTL",
    run_config=config,
    chain_poly_type=mmcif_names.PROTEIN_CHAIN,
    deduplicate=True
)

print(f"Found {msa_result.depth} sequences")

get_msa_tool

Get an MSA search tool instance from configuration.
def get_msa_tool(
    msa_tool_config: msa_config.JackhmmerConfig | msa_config.NhmmerConfig,
) -> msa_tool.MsaTool
msa_tool_config
msa_config.JackhmmerConfig | msa_config.NhmmerConfig
required
Configuration for Jackhmmer (protein) or Nhmmer (RNA/DNA) tool.
return
msa_tool.MsaTool
Configured MSA search tool instance.
Example:
# Configure tool for protein search
jackhmmer_config = msa_config.JackhmmerConfig(
    binary_path="/usr/bin/jackhmmer",
    database_config=msa_config.DatabaseConfig(path="/data/uniref90.fasta"),
    n_cpu=8,
    n_iter=1,
    e_value=0.0001,
    max_sequences=10000
)

tool = msa.get_msa_tool(jackhmmer_config)
result = tool.query("MKTAYIAKQRQISFVK")
print(result.a3m)

sequences_are_feature_equivalent

Check if two sequences produce identical features.
def sequences_are_feature_equivalent(
    sequence1: str,
    sequence2: str,
    chain_poly_type: str,
) -> bool
sequence1
str
required
First sequence to compare.
sequence2
str
required
Second sequence to compare.
chain_poly_type
str
required
Polymer type for featurisation.
return
bool
True if sequences produce identical features, False otherwise.
Example:
# Check if sequences are equivalent for modeling
seq1 = "MKTAYIAKQRQISFVK"
seq2 = "MKTAYIAKQRQISFVK"  # Identical
seq3 = "MKTAYIAKQRQISFVX"  # Different (X vs K)

print(msa.sequences_are_feature_equivalent(seq1, seq2, mmcif_names.PROTEIN_CHAIN))  # True
print(msa.sequences_are_feature_equivalent(seq1, seq3, mmcif_names.PROTEIN_CHAIN))  # False

MSA Search Tools

Jackhmmer (Protein)

Iterative sequence search using HMM profiles. Best for protein sequences. Configuration:
jackhmmer_config = msa_config.JackhmmerConfig(
    binary_path="/usr/bin/jackhmmer",
    database_config=msa_config.DatabaseConfig(path="/data/uniref90.fasta"),
    n_cpu=8,              # Number of CPUs
    n_iter=1,             # Number of iterations
    e_value=0.0001,       # E-value threshold
    z_value=None,         # Database size (-Z) used for E-value calculation
    max_sequences=10000   # Maximum sequences to return
)

Nhmmer (RNA/DNA)

HMM-based search for nucleotide sequences. Used for RNA and DNA. Configuration:
nhmmer_config = msa_config.NhmmerConfig(
    binary_path="/usr/bin/nhmmer",
    hmmalign_binary_path="/usr/bin/hmmalign",
    hmmbuild_binary_path="/usr/bin/hmmbuild",
    database_config=msa_config.DatabaseConfig(path="/data/rfam.fasta"),
    n_cpu=8,
    e_value=0.001,
    max_sequences=5000,
    alphabet="rna"  # or "dna"
)

Error Handling

from alphafold3.data import msa

try:
    msa_obj = msa.Msa.from_a3m(
        query_sequence=query_seq,
        chain_poly_type=mmcif_names.PROTEIN_CHAIN,
        a3m=a3m_string
    )
    features = msa_obj.featurize()
except ValueError as e:
    # Raised for invalid inputs (mismatched sequences, etc.)
    print(f"Validation error: {e}")
except msa.Error as e:
    # Raised for MSA-specific errors (empty MSA, unknown residues, etc.)
    print(f"MSA processing error: {e}")

Build docs developers (and LLMs) love