Skip to main content

Overview

Streaming allows you to receive model responses incrementally as they’re generated, rather than waiting for the complete response. This is implemented using Server-Sent Events (SSE) and is useful for real-time user interfaces.

Enabling Streaming

Set stream: true in your chat completions request:
import requests
import json

url = "http://localhost:8000/v1/chat/completions"
data = {
    "model": "gpt-3.5-turbo",
    "messages": [
        {"role": "user", "content": "Count from 1 to 10"}
    ],
    "stream": True
}

response = requests.post(url, json=data, stream=True)

# Each SSE event arrives on its own line as "data: <json>"; the
# sentinel "data: [DONE]" marks the end of the stream.
for raw_line in response.iter_lines():
    if not raw_line:
        continue
    decoded = raw_line.decode('utf-8')
    if not decoded.startswith('data: '):
        continue
    payload = decoded[6:]
    if payload == '[DONE]':
        break
    delta = json.loads(payload)['choices'][0]['delta']
    # Role-only and empty (final) chunks carry no 'content' key.
    if 'content' in delta:
        print(delta['content'], end='', flush=True)

Stream Response Format

Initial Chunk

The first chunk contains the role:
data: {
  "model": "gpt-3.5-turbo",
  "object": "chat.completion.chunk",
  "choices": [
    {
      "index": 0,
      "delta": {
        "role": "assistant"
      },
      "finish_reason": null
    }
  ],
  "created": 1677652288
}

Content Chunks

Subsequent chunks contain content deltas:
data: {
  "model": "gpt-3.5-turbo",
  "object": "chat.completion.chunk",
  "choices": [
    {
      "index": 0,
      "delta": {
        "content": "Hello"
      },
      "finish_reason": null
    }
  ]
}

Final Chunk

The last chunk has an empty delta and a finish_reason:
data: {
  "model": "gpt-3.5-turbo",
  "object": "chat.completion.chunk",
  "choices": [
    {
      "index": 0,
      "delta": {},
      "finish_reason": "stop"
    }
  ]
}

Stream Termination

data: [DONE]

Response Fields

object
string
Always "chat.completion.chunk" for streaming responses
choices[].delta
object
Incremental content update:
  • role: Present in first chunk only
  • content: Text content delta (not cumulative)
choices[].finish_reason
string | null
Null during generation, then one of:
  • "stop": Natural completion
  • "length": Generation stopped because the max_length token limit was reached

Python Client Examples

Basic Streaming

import requests
import json

def stream_chat(messages):
    """Stream a chat completion and print content deltas as they arrive."""
    url = "http://localhost:8000/v1/chat/completions"
    payload = {
        "model": "gpt-3.5-turbo",
        "messages": messages,
        "stream": True,
    }
    response = requests.post(url, json=payload, stream=True)

    for raw in response.iter_lines():
        if not raw:
            continue
        decoded = raw.decode('utf-8')
        if not decoded.startswith('data: '):
            continue

        data = decoded[6:]
        if data == '[DONE]':
            print()  # New line at end
            break

        delta = json.loads(data)['choices'][0]['delta']
        if 'content' in delta:
            print(delta['content'], end='', flush=True)

messages = [
    {"role": "user", "content": "Write a haiku about programming"}
]

stream_chat(messages)

With OpenAI SDK

import openai

# Point the SDK at the local server; the key value is not checked.
openai.api_base = "http://localhost:8000/v1"
openai.api_key = "none"

response = openai.ChatCompletion.create(
    model="gpt-3.5-turbo",
    messages=[
        {"role": "user", "content": "Tell me a story"}
    ],
    stream=True
)

for chunk in response:
    delta = chunk.choices[0].delta
    # Role-only and empty (final) chunks have no content attribute.
    try:
        print(delta.content, end='', flush=True)
    except AttributeError:
        pass

print()  # New line

Collecting Full Response

import requests
import json

def stream_and_collect(messages):
    """Stream a completion, echo it to stdout, and return the full text."""
    response = requests.post(
        "http://localhost:8000/v1/chat/completions",
        json={
            "model": "gpt-3.5-turbo",
            "messages": messages,
            "stream": True
        },
        stream=True
    )

    # Accumulate deltas in a list and join once at the end.
    pieces = []

    for raw in response.iter_lines():
        if not raw:
            continue
        decoded = raw.decode('utf-8')
        if not decoded.startswith('data: '):
            continue

        data = decoded[6:]
        if data == '[DONE]':
            break

        delta = json.loads(data)['choices'][0]['delta']
        if 'content' in delta:
            content = delta['content']
            pieces.append(content)
            print(content, end='', flush=True)

    print()  # New line
    return ''.join(pieces)

response = stream_and_collect([
    {"role": "user", "content": "Explain async programming"}
])
print(f"\nFull response length: {len(response)} characters")

JavaScript Example

// Stream a chat completion from the local server and write content
// deltas to stdout as they arrive.
//
// Fixes over the naive version: checks the HTTP status, flushes the
// TextDecoder at end-of-stream, and processes a final SSE line that
// arrives without a trailing newline (previously silently dropped).
async function streamChat(messages) {
  const response = await fetch('http://localhost:8000/v1/chat/completions', {
    method: 'POST',
    headers: {
      'Content-Type': 'application/json',
    },
    body: JSON.stringify({
      model: 'gpt-3.5-turbo',
      messages: messages,
      stream: true,
    }),
  });

  if (!response.ok) {
    throw new Error(`HTTP ${response.status}: ${await response.text()}`);
  }

  const reader = response.body.getReader();
  const decoder = new TextDecoder('utf-8');
  let buffer = '';

  // Handle one complete SSE line; returns true when the stream is done.
  const handleLine = (line) => {
    if (!line.startsWith('data: ')) return false;
    const data = line.slice(6);
    if (data === '[DONE]') return true;

    const chunk = JSON.parse(data);
    const delta = chunk.choices[0].delta;
    if (delta.content) {
      process.stdout.write(delta.content);
    }
    return false;
  };

  while (true) {
    const { done, value } = await reader.read();
    if (done) break;

    buffer += decoder.decode(value, { stream: true });
    const lines = buffer.split('\n');
    buffer = lines.pop(); // keep the trailing partial line for next read

    for (const line of lines) {
      if (handleLine(line)) return;
    }
  }

  // Flush any bytes buffered in the decoder, then process a last line
  // that ended without a newline.
  buffer += decoder.decode();
  if (buffer) handleLine(buffer);
}

streamChat([
  { role: 'user', content: 'Write a poem about AI' }
]);

Limitations

Function Calling Not Supported

Streaming does not support function calling:
# This will return HTTP 400
# This will return HTTP 400
payload = {
    "model": "gpt-3.5-turbo",
    "messages": [{"role": "user", "content": "Hello"}],
    "stream": True,
    "functions": [{"name": "test", "parameters": {}}]  # Not allowed
}
response = requests.post(
    "http://localhost:8000/v1/chat/completions",
    json=payload
)
Error response:
{
  "detail": "Invalid request: Function calling is not yet implemented for stream mode."
}

Best Practices

Buffer Management

Handle partial lines in streams:
def stream_with_buffer(response):
    """Feed complete SSE lines from a streaming response to process_line.

    Raw network chunks can split a line anywhere, so chunks accumulate in
    a string buffer and only complete (newline-terminated) lines are
    dispatched.
    """
    buffer = ""

    # iter_content defaults to chunk_size=1 (byte-at-a-time reads);
    # request larger chunks explicitly.
    for chunk in response.iter_content(chunk_size=1024, decode_unicode=True):
        buffer += chunk

        while '\n' in buffer:
            line, buffer = buffer.split('\n', 1)
            if line.startswith('data: '):
                process_line(line)

    # Bug fix: a final line without a trailing newline was previously
    # left in the buffer and dropped.
    if buffer.startswith('data: '):
        process_line(buffer)

Error Handling

import requests
import json

def safe_stream(messages):
    """Yield content deltas; swallow network errors and malformed JSON lines."""
    try:
        response = requests.post(
            "http://localhost:8000/v1/chat/completions",
            json={
                "model": "gpt-3.5-turbo",
                "messages": messages,
                "stream": True
            },
            stream=True,
            timeout=30
        )
        response.raise_for_status()

        for raw in response.iter_lines():
            if not raw:
                continue
            decoded = raw.decode('utf-8')
            if not decoded.startswith('data: '):
                continue

            data = decoded[6:]
            if data == '[DONE]':
                break

            # A malformed line is logged and skipped, not fatal.
            try:
                delta = json.loads(data)['choices'][0]['delta']
            except json.JSONDecodeError:
                print(f"Warning: Invalid JSON: {data}")
                continue

            if 'content' in delta:
                yield delta['content']

    except requests.exceptions.RequestException as e:
        print(f"Error: {e}")
        return

for content in safe_stream([{"role": "user", "content": "Hello"}]):
    print(content, end='', flush=True)

Stop Word Handling

The streaming implementation includes a delay buffer to properly handle stop words. The last few tokens may be held back temporarily to check for stop sequences before being yielded.

Build docs developers (and LLMs) love