Qwen Tokenizer

Overview

The Qwen tokenizer converts text to token IDs for model input and decodes token IDs back to text. It includes special tokens for the ChatML format and provides various encoding/decoding options.

Loading Tokenizer

from transformers import AutoTokenizer

# Load the Qwen tokenizer. trust_remote_code is required because the
# tokenizer implementation ships inside the model repository rather
# than in the transformers library itself.
# NOTE: the deprecated `resume_download=True` flag has been removed --
# recent huggingface_hub releases always resume interrupted downloads.
tokenizer = AutoTokenizer.from_pretrained(
    "Qwen/Qwen-7B-Chat",
    trust_remote_code=True,
)

Encoding Text

encode() Method

Convert text to token IDs:
# Encode a plain string into a list of token IDs.
text = "Hello, world!"
token_ids = tokenizer.encode(text)
# NOTE(review): 151644/151645 below are the <|im_start|>/<|im_end|>
# special-token IDs; a plain encode() call would not normally wrap the
# text in them -- verify this sample output against the real tokenizer.
print(token_ids)  # [151644, 13225, 11, 1879, 0, 151645]

Parameters

text
str
required
Input text to encode
add_special_tokens
bool
default:"True"
Whether to add special tokens (BOS/EOS)
max_length
int
default:"None"
Maximum sequence length (truncate if exceeded)
truncation
bool
default:"False"
Whether to truncate sequences exceeding max_length
padding
str | bool
default:"False"
Padding strategy: "max_length", "longest", or False

Returns

token_ids
list[int]
List of token IDs

Decoding Token IDs

decode() Method

Convert token IDs back to text:
# Turn a sequence of token IDs back into the original text.
ids = [151644, 13225, 11, 1879, 0, 151645]
decoded = tokenizer.decode(ids)
print(decoded)  # "Hello, world!"

Parameters

token_ids
list[int]
required
List of token IDs to decode
skip_special_tokens
bool
default:"False"
Whether to remove special tokens from output
clean_up_tokenization_spaces
bool
default:"True"
Whether to clean up extra spaces
errors
str
default:"replace"
How to handle decoding errors: "ignore", "replace", or "strict"

Returns

text
str
Decoded text string

Batch Encoding

Encode multiple texts at once:
# Encode several short inputs in one tokenizer call.
texts = ["First example", "Second example", "Third example"]

encoded = tokenizer(
    texts,
    max_length=512,
    padding=True,
    truncation=True,
    return_tensors="pt",
)

# input_ids and attention_mask share the shape (batch, padded_length).
print(encoded.input_ids.shape)  # torch.Size([3, max_seq_len])
print(encoded.attention_mask.shape)  # torch.Size([3, max_seq_len])

Parameters

text
str | list[str]
required
Single text or list of texts to encode
padding
bool | str
default:"False"
Padding strategy:
  • True or "longest": Pad to longest sequence in batch
  • "max_length": Pad to max_length parameter
  • False: No padding
truncation
bool
default:"False"
Whether to truncate sequences exceeding max_length
max_length
int
default:"None"
Maximum sequence length
return_tensors
str
default:"None"
Return format:
  • "pt": PyTorch tensors
  • "tf": TensorFlow tensors
  • "np": NumPy arrays
  • None: Python lists
return_attention_mask
bool
default:"True"
Whether to return attention mask

Returns

input_ids
torch.Tensor | list
Encoded token IDs
attention_mask
torch.Tensor | list
Attention mask (1 for real tokens, 0 for padding)

Special Tokens

ChatML Format Tokens

# Get special token IDs exposed as attributes on the Qwen tokenizer.
im_start_id = tokenizer.im_start_id  # 151644
im_end_id = tokenizer.im_end_id      # 151645
eod_id = tokenizer.eod_id            # End of document token

# Encode special tokens
# NOTE(review): Qwen's tiktoken-based tokenizer may refuse to encode
# special-token strings unless they are explicitly permitted (e.g. via
# an allowed_special argument) -- confirm against the tokenizer
# implementation before relying on these calls.
im_start = tokenizer.encode("<|im_start|>")
im_end = tokenizer.encode("<|im_end|>")

Common Special Tokens

| Token | ID | Purpose |
| --- | --- | --- |
| `<|im_start|>` | 151644 | Start of message |
| `<|im_end|>` | 151645 | End of message |
| `<|endoftext|>` | 151643 | End of text/document |

Token Information

# Vocabulary size reported by the tokenizer.
vocab_size = tokenizer.vocab_size
print(f"Vocabulary size: {vocab_size}")  # 151936

# Map a token ID back to its surface token string.
token = tokenizer.convert_ids_to_tokens(100)
print(token)

# Map a token string to its ID.
token_id = tokenizer.convert_tokens_to_ids("hello")
print(token_id)

# Request character offsets for each token alongside the IDs.
result = tokenizer("Hello world", return_offsets_mapping=True)
print(result.offset_mapping)

Advanced Encoding

Manual ChatML Formatting

im_start = "<|im_start|>"
im_end = "<|im_end|>"

# Format conversation
system = "You are a helpful assistant."
user = "What is AI?"

prompt = f"""{im_start}system
{system}{im_end}
{im_start}user
{user}{im_end}
{im_start}assistant
"""

input_ids = tokenizer.encode(prompt)

Truncation Strategies

# Truncate from right
encoded = tokenizer(
    long_text,
    max_length=512,
    truncation=True,
    truncation_strategy="longest_first"
)

# Keep only recent tokens
encoded = tokenizer(
    long_text,
    max_length=512,
    truncation=True,
    stride=128  # Overlapping tokens
)

Padding Configuration

# Pad on right (for training)
tokenizer.padding_side = "right"

# Pad on left (for generation)
tokenizer.padding_side = "left"

# Set pad token
tokenizer.pad_token_id = tokenizer.eod_id

Complete Example

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# Load tokenizer AND model. (The original example imported
# AutoModelForCausalLM but never created `model`, so the generate()
# calls below would raise NameError.)
tokenizer = AutoTokenizer.from_pretrained(
    "Qwen/Qwen-7B-Chat",
    trust_remote_code=True
)
model = AutoModelForCausalLM.from_pretrained(
    "Qwen/Qwen-7B-Chat",
    trust_remote_code=True
)

# Prepare inputs
messages = [
    "Explain quantum computing",
    "What is machine learning?"
]

# Batch encode: pad to the longest sequence, truncate anything that
# exceeds the 2048-token budget, and return PyTorch tensors.
inputs = tokenizer(
    messages,
    padding=True,
    truncation=True,
    max_length=2048,
    return_tensors="pt"
)

print(f"Input shape: {inputs.input_ids.shape}")
print(f"Attention mask shape: {inputs.attention_mask.shape}")

# Decode single sequence
output_ids = model.generate(inputs.input_ids[0:1])
decoded = tokenizer.decode(output_ids[0], skip_special_tokens=True)
print(decoded)

# Batch decode
output_ids = model.generate(inputs.input_ids)
decoded_batch = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
for i, text in enumerate(decoded_batch):
    print(f"Output {i}: {text}")

Error Handling

# Handle invalid token IDs
try:
    text = tokenizer.decode([99999999], errors="strict")
except Exception as e:
    print(f"Error: {e}")

# Ignore errors
text = tokenizer.decode([99999999], errors="ignore")

# Replace invalid characters
text = tokenizer.decode(invalid_ids, errors="replace")

Build docs developers (and LLMs) love